diff options
Diffstat (limited to 'arch/x86')
94 files changed, 4398 insertions, 2334 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index dfabfefc21c4..299fbc86f570 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -347,6 +347,7 @@ endif config X86_VSMP bool "ScaleMP vSMP" + select PARAVIRT_GUEST select PARAVIRT depends on X86_64 && PCI depends on X86_EXTENDED_PLATFORM diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index e5bb96b10f1a..b59ee765414e 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -125,16 +125,6 @@ config DEBUG_NX_TEST and the software setup of this feature. If in doubt, say "N" -config 4KSTACKS - bool "Use 4Kb for kernel stacks instead of 8Kb" - depends on X86_32 - ---help--- - If you say Y here the kernel will use a 4Kb stacksize for the - kernel stack attached to each process/thread. This facilitates - running more threads on a system and also reduces the pressure - on the VM subsystem for higher order allocations. This option - will also use IRQ stacks to compensate for the reduced stackspace. - config DOUBLEFAULT default y bool "Enable doublefault exception handler" if EMBEDDED diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index 8f7bef8e9fff..23f315c9f215 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -229,18 +229,35 @@ void *memset(void *s, int c, size_t n) ss[i] = c; return s; } - +#ifdef CONFIG_X86_32 void *memcpy(void *dest, const void *src, size_t n) { - int i; - const char *s = src; - char *d = dest; + int d0, d1, d2; + asm volatile( + "rep ; movsl\n\t" + "movl %4,%%ecx\n\t" + "rep ; movsb\n\t" + : "=&c" (d0), "=&D" (d1), "=&S" (d2) + : "0" (n >> 2), "g" (n & 3), "1" (dest), "2" (src) + : "memory"); - for (i = 0; i < n; i++) - d[i] = s[i]; return dest; } +#else +void *memcpy(void *dest, const void *src, size_t n) +{ + long d0, d1, d2; + asm volatile( + "rep ; movsq\n\t" + "movq %4,%%rcx\n\t" + "rep ; movsb\n\t" + : "=&c" (d0), "=&D" (d1), "=&S" (d2) + : "0" (n >> 3), "g" (n & 7), "1" (dest), "2" (src) + : 
"memory"); + return dest; +} +#endif static void error(char *x) { diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h index bafd80defa43..903683b07e42 100644 --- a/arch/x86/include/asm/bitops.h +++ b/arch/x86/include/asm/bitops.h @@ -440,6 +440,8 @@ static inline int fls(int x) #ifdef __KERNEL__ +#include <asm-generic/bitops/find.h> + #include <asm-generic/bitops/sched.h> #define ARCH_HAS_FAST_MULTIPLIER 1 diff --git a/arch/x86/include/asm/calling.h b/arch/x86/include/asm/calling.h index 0e63c9a2a8d0..30af5a832163 100644 --- a/arch/x86/include/asm/calling.h +++ b/arch/x86/include/asm/calling.h @@ -48,36 +48,38 @@ For 32-bit we have the following conventions - kernel is built with /* - * 64-bit system call stack frame layout defines and helpers, - * for assembly code: + * 64-bit system call stack frame layout defines and helpers, for + * assembly code (note that the seemingly unnecessary parentheses + * are to prevent cpp from inserting spaces in expressions that get + * passed to macros): */ -#define R15 0 -#define R14 8 -#define R13 16 -#define R12 24 -#define RBP 32 -#define RBX 40 +#define R15 (0) +#define R14 (8) +#define R13 (16) +#define R12 (24) +#define RBP (32) +#define RBX (40) /* arguments: interrupts/non tracing syscalls only save up to here: */ -#define R11 48 -#define R10 56 -#define R9 64 -#define R8 72 -#define RAX 80 -#define RCX 88 -#define RDX 96 -#define RSI 104 -#define RDI 112 -#define ORIG_RAX 120 /* + error_code */ +#define R11 (48) +#define R10 (56) +#define R9 (64) +#define R8 (72) +#define RAX (80) +#define RCX (88) +#define RDX (96) +#define RSI (104) +#define RDI (112) +#define ORIG_RAX (120) /* + error_code */ /* end of arguments */ /* cpu exception frame or undefined in case of fast syscall: */ -#define RIP 128 -#define CS 136 -#define EFLAGS 144 -#define RSP 152 -#define SS 160 +#define RIP (128) +#define CS (136) +#define EFLAGS (144) +#define RSP (152) +#define SS (160) #define ARGOFFSET R11 #define SWFRAME 
ORIG_RAX @@ -111,7 +113,7 @@ For 32-bit we have the following conventions - kernel is built with .endif .endm -#define ARG_SKIP 9*8 +#define ARG_SKIP (9*8) .macro RESTORE_ARGS skiprax=0, addskip=0, skiprcx=0, skipr11=0, \ skipr8910=0, skiprdx=0 @@ -169,7 +171,7 @@ For 32-bit we have the following conventions - kernel is built with .endif .endm -#define REST_SKIP 6*8 +#define REST_SKIP (6*8) .macro SAVE_REST subq $REST_SKIP, %rsp diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h index b8e96a18676b..57650ab4a5f5 100644 --- a/arch/x86/include/asm/entry_arch.h +++ b/arch/x86/include/asm/entry_arch.h @@ -16,22 +16,11 @@ BUILD_INTERRUPT(call_function_single_interrupt,CALL_FUNCTION_SINGLE_VECTOR) BUILD_INTERRUPT(irq_move_cleanup_interrupt,IRQ_MOVE_CLEANUP_VECTOR) BUILD_INTERRUPT(reboot_interrupt,REBOOT_VECTOR) -BUILD_INTERRUPT3(invalidate_interrupt0,INVALIDATE_TLB_VECTOR_START+0, - smp_invalidate_interrupt) -BUILD_INTERRUPT3(invalidate_interrupt1,INVALIDATE_TLB_VECTOR_START+1, - smp_invalidate_interrupt) -BUILD_INTERRUPT3(invalidate_interrupt2,INVALIDATE_TLB_VECTOR_START+2, - smp_invalidate_interrupt) -BUILD_INTERRUPT3(invalidate_interrupt3,INVALIDATE_TLB_VECTOR_START+3, - smp_invalidate_interrupt) -BUILD_INTERRUPT3(invalidate_interrupt4,INVALIDATE_TLB_VECTOR_START+4, - smp_invalidate_interrupt) -BUILD_INTERRUPT3(invalidate_interrupt5,INVALIDATE_TLB_VECTOR_START+5, - smp_invalidate_interrupt) -BUILD_INTERRUPT3(invalidate_interrupt6,INVALIDATE_TLB_VECTOR_START+6, - smp_invalidate_interrupt) -BUILD_INTERRUPT3(invalidate_interrupt7,INVALIDATE_TLB_VECTOR_START+7, +.irpc idx, "01234567" +BUILD_INTERRUPT3(invalidate_interrupt\idx, + (INVALIDATE_TLB_VECTOR_START)+\idx, smp_invalidate_interrupt) +.endr #endif BUILD_INTERRUPT(x86_platform_ipi, X86_PLATFORM_IPI_VECTOR) diff --git a/arch/x86/include/asm/highmem.h b/arch/x86/include/asm/highmem.h index 8caac76ac324..3bd04022fd0c 100644 --- a/arch/x86/include/asm/highmem.h +++ 
b/arch/x86/include/asm/highmem.h @@ -59,11 +59,12 @@ extern void kunmap_high(struct page *page); void *kmap(struct page *page); void kunmap(struct page *page); -void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot); -void *kmap_atomic(struct page *page, enum km_type type); -void kunmap_atomic_notypecheck(void *kvaddr, enum km_type type); -void *kmap_atomic_pfn(unsigned long pfn, enum km_type type); -void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot); + +void *kmap_atomic_prot(struct page *page, pgprot_t prot); +void *__kmap_atomic(struct page *page); +void __kunmap_atomic(void *kvaddr); +void *kmap_atomic_pfn(unsigned long pfn); +void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot); struct page *kmap_atomic_to_page(void *ptr); #define flush_cache_kmaps() do { } while (0) diff --git a/arch/x86/include/asm/iomap.h b/arch/x86/include/asm/iomap.h index c4191b3b7056..363e33eb6ec1 100644 --- a/arch/x86/include/asm/iomap.h +++ b/arch/x86/include/asm/iomap.h @@ -27,10 +27,10 @@ #include <asm/tlbflush.h> void __iomem * -iomap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot); +iomap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot); void -iounmap_atomic(void __iomem *kvaddr, enum km_type type); +iounmap_atomic(void __iomem *kvaddr); int iomap_create_wc(resource_size_t base, unsigned long size, pgprot_t *prot); diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h index 5458380b6ef8..13b0ebaa512f 100644 --- a/arch/x86/include/asm/irq.h +++ b/arch/x86/include/asm/irq.h @@ -19,18 +19,14 @@ static inline int irq_canonicalize(int irq) # define ARCH_HAS_NMI_WATCHDOG #endif -#ifdef CONFIG_4KSTACKS - extern void irq_ctx_init(int cpu); - extern void irq_ctx_exit(int cpu); -# define __ARCH_HAS_DO_SOFTIRQ +#ifdef CONFIG_X86_32 +extern void irq_ctx_init(int cpu); #else # define irq_ctx_init(cpu) do { } while (0) -# define irq_ctx_exit(cpu) do { } while (0) -# ifdef CONFIG_X86_64 -# define 
__ARCH_HAS_DO_SOFTIRQ -# endif #endif +#define __ARCH_HAS_DO_SOFTIRQ + #ifdef CONFIG_HOTPLUG_CPU #include <linux/cpumask.h> extern void fixup_irqs(void); diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 1f99ecfc48e1..b36c6b3fe144 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -139,6 +139,7 @@ struct x86_emulate_ops { void (*set_segment_selector)(u16 sel, int seg, struct kvm_vcpu *vcpu); unsigned long (*get_cached_segment_base)(int seg, struct kvm_vcpu *vcpu); void (*get_gdt)(struct desc_ptr *dt, struct kvm_vcpu *vcpu); + void (*get_idt)(struct desc_ptr *dt, struct kvm_vcpu *vcpu); ulong (*get_cr)(int cr, struct kvm_vcpu *vcpu); int (*set_cr)(int cr, ulong val, struct kvm_vcpu *vcpu); int (*cpl)(struct kvm_vcpu *vcpu); @@ -156,7 +157,10 @@ struct operand { unsigned long orig_val; u64 orig_val64; }; - unsigned long *ptr; + union { + unsigned long *reg; + unsigned long mem; + } addr; union { unsigned long val; u64 val64; @@ -190,6 +194,7 @@ struct decode_cache { bool has_seg_override; u8 seg_override; unsigned int d; + int (*execute)(struct x86_emulate_ctxt *ctxt); unsigned long regs[NR_VCPU_REGS]; unsigned long eip; /* modrm */ @@ -197,17 +202,16 @@ struct decode_cache { u8 modrm_mod; u8 modrm_reg; u8 modrm_rm; - u8 use_modrm_ea; + u8 modrm_seg; bool rip_relative; - unsigned long modrm_ea; - void *modrm_ptr; - unsigned long modrm_val; struct fetch_cache fetch; struct read_cache io_read; struct read_cache mem_read; }; struct x86_emulate_ctxt { + struct x86_emulate_ops *ops; + /* Register state before/after emulation. 
*/ struct kvm_vcpu *vcpu; @@ -220,12 +224,11 @@ struct x86_emulate_ctxt { /* interruptibility state, as a result of execution of STI or MOV SS */ int interruptibility; - bool restart; /* restart string instruction after writeback */ + bool perm_ok; /* do not check permissions if true */ int exception; /* exception that happens during emulation or -1 */ u32 error_code; /* error code for exception */ bool error_code_valid; - unsigned long cr2; /* faulted address in case of #PF */ /* decode cache */ struct decode_cache decode; @@ -249,13 +252,14 @@ struct x86_emulate_ctxt { #define X86EMUL_MODE_HOST X86EMUL_MODE_PROT64 #endif -int x86_decode_insn(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops); -int x86_emulate_insn(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops); +int x86_decode_insn(struct x86_emulate_ctxt *ctxt); +#define EMULATION_FAILED -1 +#define EMULATION_OK 0 +#define EMULATION_RESTART 1 +int x86_emulate_insn(struct x86_emulate_ctxt *ctxt); int emulator_task_switch(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, u16 tss_selector, int reason, bool has_error_code, u32 error_code); - +int emulate_int_real(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, int irq); #endif /* _ASM_X86_KVM_X86_EMULATE_H */ diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index c52e2eb40a1e..9e6fe391094e 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -236,10 +236,14 @@ struct kvm_pio_request { */ struct kvm_mmu { void (*new_cr3)(struct kvm_vcpu *vcpu); + void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long root); + unsigned long (*get_cr3)(struct kvm_vcpu *vcpu); int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err); + void (*inject_page_fault)(struct kvm_vcpu *vcpu); void (*free)(struct kvm_vcpu *vcpu); gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva, u32 access, u32 *error); + gpa_t (*translate_gpa)(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access); 
void (*prefetch_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page); int (*sync_page)(struct kvm_vcpu *vcpu, @@ -249,13 +253,18 @@ struct kvm_mmu { int root_level; int shadow_root_level; union kvm_mmu_page_role base_role; + bool direct_map; u64 *pae_root; + u64 *lm_root; u64 rsvd_bits_mask[2][4]; + + bool nx; + + u64 pdptrs[4]; /* pae */ }; struct kvm_vcpu_arch { - u64 host_tsc; /* * rip and regs accesses must go through * kvm_{register,rip}_{read,write} functions. @@ -272,7 +281,6 @@ struct kvm_vcpu_arch { unsigned long cr4_guest_owned_bits; unsigned long cr8; u32 hflags; - u64 pdptrs[4]; /* pae */ u64 efer; u64 apic_base; struct kvm_lapic *apic; /* kernel irqchip context */ @@ -282,7 +290,41 @@ struct kvm_vcpu_arch { u64 ia32_misc_enable_msr; bool tpr_access_reporting; + /* + * Paging state of the vcpu + * + * If the vcpu runs in guest mode with two level paging this still saves + * the paging mode of the l1 guest. This context is always used to + * handle faults. + */ struct kvm_mmu mmu; + + /* + * Paging state of an L2 guest (used for nested npt) + * + * This context will save all necessary information to walk page tables + * of the an L2 guest. This context is only initialized for page table + * walking and not for faulting since we never handle l2 page faults on + * the host. + */ + struct kvm_mmu nested_mmu; + + /* + * Pointer to the mmu context currently used for + * gva_to_gpa translations. 
+ */ + struct kvm_mmu *walk_mmu; + + /* + * This struct is filled with the necessary information to propagate a + * page fault into the guest + */ + struct { + u64 address; + unsigned error_code; + bool nested; + } fault; + /* only needed in kvm_pv_mmu_op() path, but it's hot so * put it here to avoid allocation */ struct kvm_pv_mmu_op_buffer mmu_op_buffer; @@ -336,9 +378,15 @@ struct kvm_vcpu_arch { gpa_t time; struct pvclock_vcpu_time_info hv_clock; - unsigned int hv_clock_tsc_khz; + unsigned int hw_tsc_khz; unsigned int time_offset; struct page *time_page; + u64 last_host_tsc; + u64 last_guest_tsc; + u64 last_kernel_ns; + u64 last_tsc_nsec; + u64 last_tsc_write; + bool tsc_catchup; bool nmi_pending; bool nmi_injected; @@ -367,9 +415,9 @@ struct kvm_vcpu_arch { }; struct kvm_arch { - unsigned int n_free_mmu_pages; + unsigned int n_used_mmu_pages; unsigned int n_requested_mmu_pages; - unsigned int n_alloc_mmu_pages; + unsigned int n_max_mmu_pages; atomic_t invlpg_counter; struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES]; /* @@ -394,8 +442,14 @@ struct kvm_arch { gpa_t ept_identity_map_addr; unsigned long irq_sources_bitmap; - u64 vm_init_tsc; s64 kvmclock_offset; + spinlock_t tsc_write_lock; + u64 last_tsc_nsec; + u64 last_tsc_offset; + u64 last_tsc_write; + u32 virtual_tsc_khz; + u32 virtual_tsc_mult; + s8 virtual_tsc_shift; struct kvm_xen_hvm_config xen_hvm_config; @@ -505,6 +559,7 @@ struct kvm_x86_ops { void (*queue_exception)(struct kvm_vcpu *vcpu, unsigned nr, bool has_error_code, u32 error_code, bool reinject); + void (*cancel_injection)(struct kvm_vcpu *vcpu); int (*interrupt_allowed)(struct kvm_vcpu *vcpu); int (*nmi_allowed)(struct kvm_vcpu *vcpu); bool (*get_nmi_mask)(struct kvm_vcpu *vcpu); @@ -517,11 +572,16 @@ struct kvm_x86_ops { u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio); int (*get_lpage_level)(void); bool (*rdtscp_supported)(void); + void (*adjust_tsc_offset)(struct kvm_vcpu *vcpu, s64 adjustment); + + void 
(*set_tdp_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3); void (*set_supported_cpuid)(u32 func, struct kvm_cpuid_entry2 *entry); bool (*has_wbinvd_exit)(void); + void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset); + const struct trace_print_flags *exit_reasons_str; }; @@ -544,7 +604,7 @@ void kvm_mmu_zap_all(struct kvm *kvm); unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm); void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages); -int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3); +int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3); int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, const void *val, int bytes); @@ -608,8 +668,11 @@ void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr); void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code); void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr); void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code); -void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long cr2, - u32 error_code); +void kvm_inject_page_fault(struct kvm_vcpu *vcpu); +int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, + gfn_t gfn, void *data, int offset, int len, + u32 access); +void kvm_propagate_fault(struct kvm_vcpu *vcpu); bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl); int kvm_pic_set_irq(void *opaque, int irq, int level); diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h index 05eba5e9a8e8..7b562b6184bc 100644 --- a/arch/x86/include/asm/kvm_para.h +++ b/arch/x86/include/asm/kvm_para.h @@ -158,6 +158,12 @@ static inline unsigned int kvm_arch_para_features(void) return cpuid_eax(KVM_CPUID_FEATURES); } +#ifdef CONFIG_KVM_GUEST +void __init kvm_guest_init(void); +#else +#define kvm_guest_init() do { } while (0) #endif +#endif /* __KERNEL__ */ + #endif /* _ASM_X86_KVM_PARA_H */ diff --git a/arch/x86/include/asm/module.h 
b/arch/x86/include/asm/module.h index 3e2ce58a31a3..67763c5d8b4e 100644 --- a/arch/x86/include/asm/module.h +++ b/arch/x86/include/asm/module.h @@ -60,12 +60,7 @@ #endif #ifdef CONFIG_X86_32 -# ifdef CONFIG_4KSTACKS -# define MODULE_STACKSIZE "4KSTACKS " -# else -# define MODULE_STACKSIZE "" -# endif -# define MODULE_ARCH_VERMAGIC MODULE_PROC_FAMILY MODULE_STACKSIZE +# define MODULE_ARCH_VERMAGIC MODULE_PROC_FAMILY #endif #endif /* _ASM_X86_MODULE_H */ diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 91ba8e6b630a..3ea3dc487047 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -199,6 +199,7 @@ #define MSR_IA32_TSC 0x00000010 #define MSR_IA32_PLATFORM_ID 0x00000017 #define MSR_IA32_EBL_CR_POWERON 0x0000002a +#define MSR_EBC_FREQUENCY_ID 0x0000002c #define MSR_IA32_FEATURE_CONTROL 0x0000003a #define FEATURE_CONTROL_LOCKED (1<<0) diff --git a/arch/x86/include/asm/olpc.h b/arch/x86/include/asm/olpc.h index 101229b0d8ed..42a978c0c1b3 100644 --- a/arch/x86/include/asm/olpc.h +++ b/arch/x86/include/asm/olpc.h @@ -89,6 +89,8 @@ extern int olpc_ec_mask_unset(uint8_t bits); /* EC commands */ #define EC_FIRMWARE_REV 0x08 +#define EC_WLAN_ENTER_RESET 0x35 +#define EC_WLAN_LEAVE_RESET 0x25 /* SCI source values */ diff --git a/arch/x86/include/asm/page_32_types.h b/arch/x86/include/asm/page_32_types.h index 6f1b7331313f..ade619ff9e2a 100644 --- a/arch/x86/include/asm/page_32_types.h +++ b/arch/x86/include/asm/page_32_types.h @@ -15,11 +15,7 @@ */ #define __PAGE_OFFSET _AC(CONFIG_PAGE_OFFSET, UL) -#ifdef CONFIG_4KSTACKS -#define THREAD_ORDER 0 -#else #define THREAD_ORDER 1 -#endif #define THREAD_SIZE (PAGE_SIZE << THREAD_ORDER) #define STACKFAULT_STACK 0 diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index cd28f9ad910d..f899e01a8ac9 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -47,6 +47,20 @@ #ifdef CONFIG_SMP #define __percpu_arg(x) 
"%%"__stringify(__percpu_seg)":%P" #x #define __my_cpu_offset percpu_read(this_cpu_off) + +/* + * Compared to the generic __my_cpu_offset version, the following + * saves one instruction and avoids clobbering a temp register. + */ +#define __this_cpu_ptr(ptr) \ +({ \ + unsigned long tcp_ptr__; \ + __verify_pcpu_ptr(ptr); \ + asm volatile("add " __percpu_arg(1) ", %0" \ + : "=r" (tcp_ptr__) \ + : "m" (this_cpu_off), "0" (ptr)); \ + (typeof(*(ptr)) __kernel __force *)tcp_ptr__; \ +}) #else #define __percpu_arg(x) "%P" #x #endif diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h index f686f49e8b7b..0c92113c4cb6 100644 --- a/arch/x86/include/asm/pgtable_32.h +++ b/arch/x86/include/asm/pgtable_32.h @@ -26,7 +26,7 @@ struct mm_struct; struct vm_area_struct; extern pgd_t swapper_pg_dir[1024]; -extern pgd_t trampoline_pg_dir[1024]; +extern pgd_t initial_page_table[1024]; static inline void pgtable_cache_init(void) { } static inline void check_pgt_cache(void) { } @@ -49,24 +49,14 @@ extern void set_pmd_pfn(unsigned long, unsigned long, pgprot_t); #endif #if defined(CONFIG_HIGHPTE) -#define __KM_PTE \ - (in_nmi() ? KM_NMI_PTE : \ - in_irq() ? 
KM_IRQ_PTE : \ - KM_PTE0) #define pte_offset_map(dir, address) \ - ((pte_t *)kmap_atomic(pmd_page(*(dir)), __KM_PTE) + \ + ((pte_t *)kmap_atomic(pmd_page(*(dir))) + \ pte_index((address))) -#define pte_offset_map_nested(dir, address) \ - ((pte_t *)kmap_atomic(pmd_page(*(dir)), KM_PTE1) + \ - pte_index((address))) -#define pte_unmap(pte) kunmap_atomic((pte), __KM_PTE) -#define pte_unmap_nested(pte) kunmap_atomic((pte), KM_PTE1) +#define pte_unmap(pte) kunmap_atomic((pte)) #else #define pte_offset_map(dir, address) \ ((pte_t *)page_address(pmd_page(*(dir))) + pte_index((address))) -#define pte_offset_map_nested(dir, address) pte_offset_map((dir), (address)) #define pte_unmap(pte) do { } while (0) -#define pte_unmap_nested(pte) do { } while (0) #endif /* Clear a kernel PTE and flush it from the TLB */ diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index f96ac9bedf75..f86da20347f2 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -127,9 +127,7 @@ static inline int pgd_large(pgd_t pgd) { return 0; } /* x86-64 always has all page tables mapped. */ #define pte_offset_map(dir, address) pte_offset_kernel((dir), (address)) -#define pte_offset_map_nested(dir, address) pte_offset_kernel((dir), (address)) #define pte_unmap(pte) ((void)(pte))/* NOP */ -#define pte_unmap_nested(pte) ((void)(pte)) /* NOP */ #define update_mmu_cache(vma, address, ptep) do { } while (0) diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h index cd02f324aa6b..7f7e577a0e39 100644 --- a/arch/x86/include/asm/pvclock.h +++ b/arch/x86/include/asm/pvclock.h @@ -12,4 +12,42 @@ void pvclock_read_wallclock(struct pvclock_wall_clock *wall, struct pvclock_vcpu_time_info *vcpu, struct timespec *ts); +/* + * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction, + * yielding a 64-bit result. 
+ */ +static inline u64 pvclock_scale_delta(u64 delta, u32 mul_frac, int shift) +{ + u64 product; +#ifdef __i386__ + u32 tmp1, tmp2; +#endif + + if (shift < 0) + delta >>= -shift; + else + delta <<= shift; + +#ifdef __i386__ + __asm__ ( + "mul %5 ; " + "mov %4,%%eax ; " + "mov %%edx,%4 ; " + "mul %5 ; " + "xor %5,%5 ; " + "add %4,%%eax ; " + "adc %5,%%edx ; " + : "=A" (product), "=r" (tmp1), "=r" (tmp2) + : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) ); +#elif defined(__x86_64__) + __asm__ ( + "mul %%rdx ; shrd $32,%%rdx,%%rax" + : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) ); +#else +#error implement me! +#endif + + return product; +} + #endif /* _ASM_X86_PVCLOCK_H */ diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h index 14e0ed86a6f9..231f1c1d6607 100644 --- a/arch/x86/include/asm/segment.h +++ b/arch/x86/include/asm/segment.h @@ -73,31 +73,31 @@ #define GDT_ENTRY_DEFAULT_USER_DS 15 -#define GDT_ENTRY_KERNEL_BASE 12 +#define GDT_ENTRY_KERNEL_BASE (12) -#define GDT_ENTRY_KERNEL_CS (GDT_ENTRY_KERNEL_BASE + 0) +#define GDT_ENTRY_KERNEL_CS (GDT_ENTRY_KERNEL_BASE+0) -#define GDT_ENTRY_KERNEL_DS (GDT_ENTRY_KERNEL_BASE + 1) +#define GDT_ENTRY_KERNEL_DS (GDT_ENTRY_KERNEL_BASE+1) -#define GDT_ENTRY_TSS (GDT_ENTRY_KERNEL_BASE + 4) -#define GDT_ENTRY_LDT (GDT_ENTRY_KERNEL_BASE + 5) +#define GDT_ENTRY_TSS (GDT_ENTRY_KERNEL_BASE+4) +#define GDT_ENTRY_LDT (GDT_ENTRY_KERNEL_BASE+5) -#define GDT_ENTRY_PNPBIOS_BASE (GDT_ENTRY_KERNEL_BASE + 6) -#define GDT_ENTRY_APMBIOS_BASE (GDT_ENTRY_KERNEL_BASE + 11) +#define GDT_ENTRY_PNPBIOS_BASE (GDT_ENTRY_KERNEL_BASE+6) +#define GDT_ENTRY_APMBIOS_BASE (GDT_ENTRY_KERNEL_BASE+11) -#define GDT_ENTRY_ESPFIX_SS (GDT_ENTRY_KERNEL_BASE + 14) -#define __ESPFIX_SS (GDT_ENTRY_ESPFIX_SS * 8) +#define GDT_ENTRY_ESPFIX_SS (GDT_ENTRY_KERNEL_BASE+14) +#define __ESPFIX_SS (GDT_ENTRY_ESPFIX_SS*8) -#define GDT_ENTRY_PERCPU (GDT_ENTRY_KERNEL_BASE + 15) +#define GDT_ENTRY_PERCPU (GDT_ENTRY_KERNEL_BASE+15) 
#ifdef CONFIG_SMP #define __KERNEL_PERCPU (GDT_ENTRY_PERCPU * 8) #else #define __KERNEL_PERCPU 0 #endif -#define GDT_ENTRY_STACK_CANARY (GDT_ENTRY_KERNEL_BASE + 16) +#define GDT_ENTRY_STACK_CANARY (GDT_ENTRY_KERNEL_BASE+16) #ifdef CONFIG_CC_STACKPROTECTOR -#define __KERNEL_STACK_CANARY (GDT_ENTRY_STACK_CANARY * 8) +#define __KERNEL_STACK_CANARY (GDT_ENTRY_STACK_CANARY*8) #else #define __KERNEL_STACK_CANARY 0 #endif @@ -182,10 +182,10 @@ #endif -#define __KERNEL_CS (GDT_ENTRY_KERNEL_CS * 8) -#define __KERNEL_DS (GDT_ENTRY_KERNEL_DS * 8) -#define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS* 8 + 3) -#define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS* 8 + 3) +#define __KERNEL_CS (GDT_ENTRY_KERNEL_CS*8) +#define __KERNEL_DS (GDT_ENTRY_KERNEL_DS*8) +#define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS*8+3) +#define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS*8+3) #ifndef CONFIG_PARAVIRT #define get_kernel_rpl() 0 #endif diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index 4cfc90824068..4c2f63c7fc1b 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -50,7 +50,7 @@ struct smp_ops { void (*smp_prepare_cpus)(unsigned max_cpus); void (*smp_cpus_done)(unsigned max_cpus); - void (*smp_send_stop)(void); + void (*stop_other_cpus)(int wait); void (*smp_send_reschedule)(int cpu); int (*cpu_up)(unsigned cpu); @@ -73,7 +73,12 @@ extern struct smp_ops smp_ops; static inline void smp_send_stop(void) { - smp_ops.smp_send_stop(); + smp_ops.stop_other_cpus(0); +} + +static inline void stop_other_cpus(void) +{ + smp_ops.stop_other_cpus(1); } static inline void smp_prepare_boot_cpu(void) diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 7f3eba08e7de..169be8938b96 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -172,6 +172,4 @@ static inline void flush_tlb_kernel_range(unsigned long start, flush_tlb_all(); } -extern void zap_low_mappings(bool early); - #endif /* _ASM_X86_TLBFLUSH_H */ diff --git 
a/arch/x86/include/asm/trampoline.h b/arch/x86/include/asm/trampoline.h index 4dde797c0578..f4500fb3b485 100644 --- a/arch/x86/include/asm/trampoline.h +++ b/arch/x86/include/asm/trampoline.h @@ -13,16 +13,13 @@ extern unsigned char *trampoline_base; extern unsigned long init_rsp; extern unsigned long initial_code; -extern unsigned long initial_page_table; extern unsigned long initial_gs; #define TRAMPOLINE_SIZE roundup(trampoline_end - trampoline_data, PAGE_SIZE) extern unsigned long setup_trampoline(void); -extern void __init setup_trampoline_page_table(void); extern void __init reserve_trampoline_memory(void); #else -static inline void setup_trampoline_page_table(void) {} static inline void reserve_trampoline_memory(void) {} #endif /* CONFIG_X86_TRAMPOLINE */ diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h index 7fda040a76cd..a3c28ae4025b 100644 --- a/arch/x86/include/asm/xen/hypercall.h +++ b/arch/x86/include/asm/xen/hypercall.h @@ -200,6 +200,23 @@ extern struct { char _entry[32]; } hypercall_page[]; (type)__res; \ }) +static inline long +privcmd_call(unsigned call, + unsigned long a1, unsigned long a2, + unsigned long a3, unsigned long a4, + unsigned long a5) +{ + __HYPERCALL_DECLS; + __HYPERCALL_5ARG(a1, a2, a3, a4, a5); + + asm volatile("call *%[call]" + : __HYPERCALL_5PARAM + : [call] "a" (&hypercall_page[call]) + : __HYPERCALL_CLOBBER5); + + return (long)__res; +} + static inline int HYPERVISOR_set_trap_table(struct trap_info *table) { diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h index bf5f7d32bd08..dd8c1414b3d5 100644 --- a/arch/x86/include/asm/xen/page.h +++ b/arch/x86/include/asm/xen/page.h @@ -37,14 +37,21 @@ typedef struct xpaddr { extern unsigned long get_phys_to_machine(unsigned long pfn); -extern void set_phys_to_machine(unsigned long pfn, unsigned long mfn); +extern bool set_phys_to_machine(unsigned long pfn, unsigned long mfn); static inline unsigned long 
pfn_to_mfn(unsigned long pfn) { + unsigned long mfn; + if (xen_feature(XENFEAT_auto_translated_physmap)) return pfn; - return get_phys_to_machine(pfn) & ~FOREIGN_FRAME_BIT; + mfn = get_phys_to_machine(pfn); + + if (mfn != INVALID_P2M_ENTRY) + mfn &= ~FOREIGN_FRAME_BIT; + + return mfn; } static inline int phys_to_machine_mapping_valid(unsigned long pfn) @@ -159,6 +166,7 @@ static inline pte_t __pte_ma(pteval_t x) #define pgd_val_ma(x) ((x).pgd) +void xen_set_domain_pte(pte_t *ptep, pte_t pteval, unsigned domid); xmaddr_t arbitrary_virt_to_machine(void *address); unsigned long arbitrary_virt_to_mfn(void *vaddr); diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index e1252074ea40..69fd72aa5594 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -13,6 +13,10 @@ #include <asm/segment.h> #include <asm/desc.h> +#ifdef CONFIG_X86_32 +#include <asm/pgtable.h> +#endif + #include "realmode/wakeup.h" #include "sleep.h" @@ -91,7 +95,7 @@ int acpi_save_state_mem(void) #ifndef CONFIG_64BIT header->pmode_entry = (u32)&wakeup_pmode_return; - header->pmode_cr3 = (u32)(swsusp_pg_dir - __PAGE_OFFSET); + header->pmode_cr3 = (u32)__pa(&initial_page_table); saved_magic = 0x12345678; #else /* CONFIG_64BIT */ header->trampoline_segment = setup_trampoline() >> 4; diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 4c9c67bf09b7..0e4f24c2a746 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -189,8 +189,8 @@ * Intel Order Number 241704-001. Microsoft Part Number 781-110-X01. * * [This document is available free from Intel by calling 800.628.8686 (fax - * 916.356.6100) or 800.548.4725; or via anonymous ftp from - * ftp://ftp.intel.com/pub/IAL/software_specs/apmv11.doc. It is also + * 916.356.6100) or 800.548.4725; or from + * http://www.microsoft.com/whdc/archive/amp_12.mspx It is also * available from Microsoft by calling 206.882.8080.] 
* * APM 1.2 Reference: @@ -1926,6 +1926,7 @@ static const struct file_operations apm_bios_fops = { .unlocked_ioctl = do_ioctl, .open = do_open, .release = do_release, + .llseek = noop_llseek, }; static struct miscdevice apm_device = { diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index dfdbf6403895..1a4088dda37a 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -99,9 +99,7 @@ void foo(void) DEFINE(PAGE_SIZE_asm, PAGE_SIZE); DEFINE(PAGE_SHIFT_asm, PAGE_SHIFT); - DEFINE(PTRS_PER_PTE, PTRS_PER_PTE); - DEFINE(PTRS_PER_PMD, PTRS_PER_PMD); - DEFINE(PTRS_PER_PGD, PTRS_PER_PGD); + DEFINE(THREAD_SIZE_asm, THREAD_SIZE); OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx); diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c index cd8da247dda1..a2baafb2fe6d 100644 --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c @@ -701,6 +701,7 @@ static int acpi_cpufreq_cpu_exit(struct cpufreq_policy *policy) per_cpu(acfreq_data, policy->cpu) = NULL; acpi_processor_unregister_performance(data->acpi_data, policy->cpu); + kfree(data->freq_table); kfree(data); } diff --git a/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c b/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c index 733093d60436..141abebc4516 100644 --- a/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c +++ b/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c @@ -393,7 +393,7 @@ static struct cpufreq_driver nforce2_driver = { * Detects nForce2 A2 and C1 stepping * */ -static unsigned int nforce2_detect_chipset(void) +static int nforce2_detect_chipset(void) { nforce2_dev = pci_get_subsys(PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_NFORCE2, diff --git a/arch/x86/kernel/cpu/cpufreq/longrun.c b/arch/x86/kernel/cpu/cpufreq/longrun.c index fc09f142d94d..d9f51367666b 100644 --- a/arch/x86/kernel/cpu/cpufreq/longrun.c +++ b/arch/x86/kernel/cpu/cpufreq/longrun.c @@ -35,7 +35,7 @@ static 
unsigned int longrun_low_freq, longrun_high_freq; * Reads the current LongRun policy by access to MSR_TMTA_LONGRUN_FLAGS * and MSR_TMTA_LONGRUN_CTRL */ -static void __init longrun_get_policy(struct cpufreq_policy *policy) +static void __cpuinit longrun_get_policy(struct cpufreq_policy *policy) { u32 msr_lo, msr_hi; @@ -165,7 +165,7 @@ static unsigned int longrun_get(unsigned int cpu) * TMTA rules: * performance_pctg = (target_freq - low_freq)/(high_freq - low_freq) */ -static unsigned int __cpuinit longrun_determine_freqs(unsigned int *low_freq, +static int __cpuinit longrun_determine_freqs(unsigned int *low_freq, unsigned int *high_freq) { u32 msr_lo, msr_hi; diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 695f17731e23..d16c2c53d6bf 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -284,9 +284,7 @@ static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c) /* Don't do the funky fallback heuristics the AMD version employs for now. 
*/ node = apicid_to_node[apicid]; - if (node == NUMA_NO_NODE) - node = first_node(node_online_map); - else if (!node_online(node)) { + if (node == NUMA_NO_NODE || !node_online(node)) { /* reuse the value from init_cpu_to_node() */ node = cpu_to_node(cpu); } diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 12cd823c8d03..17ad03366211 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -327,6 +327,7 @@ static void __cpuinit amd_calc_l3_indices(struct amd_l3_cache *l3) l3->subcaches[3] = sc3 = !(val & BIT(12)) + !(val & BIT(13)); l3->indices = (max(max(max(sc0, sc1), sc2), sc3) << 10) - 1; + l3->indices = (max(max3(sc0, sc1, sc2), sc3) << 10) - 1; } static struct amd_l3_cache * __cpuinit amd_init_l3_cache(int node) diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c index 8a85dd1b1aa1..1e8d66c1336a 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-severity.c +++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c @@ -192,6 +192,7 @@ static const struct file_operations severities_coverage_fops = { .release = seq_release, .read = seq_read, .write = severities_coverage_write, + .llseek = seq_lseek, }; static int __init severities_debugfs_init(void) diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index ed41562909fe..7a35b72d7c03 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1665,6 +1665,7 @@ struct file_operations mce_chrdev_ops = { .read = mce_read, .poll = mce_poll, .unlocked_ioctl = mce_ioctl, + .llseek = no_llseek, }; EXPORT_SYMBOL_GPL(mce_chrdev_ops); diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index a333bf9189f6..ed6310183efb 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -49,7 +49,6 @@ static unsigned long copy_from_user_nmi(void *to, const void __user *from, unsigned long n) { unsigned 
long offset, addr = (unsigned long)from; - int type = in_nmi() ? KM_NMI : KM_IRQ0; unsigned long size, len = 0; struct page *page; void *map; @@ -63,9 +62,9 @@ copy_from_user_nmi(void *to, const void __user *from, unsigned long n) offset = addr & (PAGE_SIZE - 1); size = min(PAGE_SIZE - offset, n - len); - map = kmap_atomic(page, type); + map = kmap_atomic(page); memcpy(to, map+offset, size); - kunmap_atomic(map, type); + kunmap_atomic(map); put_page(page); len += size; diff --git a/arch/x86/kernel/crash_dump_32.c b/arch/x86/kernel/crash_dump_32.c index 67414550c3cc..d5cd13945d5a 100644 --- a/arch/x86/kernel/crash_dump_32.c +++ b/arch/x86/kernel/crash_dump_32.c @@ -61,7 +61,7 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf, if (!is_crashed_pfn_valid(pfn)) return -EFAULT; - vaddr = kmap_atomic_pfn(pfn, KM_PTE0); + vaddr = kmap_atomic_pfn(pfn); if (!userbuf) { memcpy(buf, (vaddr + offset), csize); diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index 0f6376ffa2d9..1bc7f75a5bda 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -82,11 +82,11 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, if (kstack_end(stack)) break; if (i && ((i % STACKSLOTS_PER_LINE) == 0)) - printk("\n%s", log_lvl); - printk(" %08lx", *stack++); + printk(KERN_CONT "\n"); + printk(KERN_CONT " %08lx", *stack++); touch_nmi_watchdog(); } - printk("\n"); + printk(KERN_CONT "\n"); show_trace_log_lvl(task, regs, sp, bp, log_lvl); } diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 57a21f11c791..6a340485249a 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -265,20 +265,20 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, if (stack >= irq_stack && stack <= irq_stack_end) { if (stack == irq_stack_end) { stack = (unsigned long *) (irq_stack_end[-1]); - printk(" <EOI> "); + printk(KERN_CONT " <EOI> "); } } else { if (((long) stack & 
(THREAD_SIZE-1)) == 0) break; } if (i && ((i % STACKSLOTS_PER_LINE) == 0)) - printk("\n%s", log_lvl); - printk(" %016lx", *stack++); + printk(KERN_CONT "\n"); + printk(KERN_CONT " %016lx", *stack++); touch_nmi_watchdog(); } preempt_enable(); - printk("\n"); + printk(KERN_CONT "\n"); show_trace_log_lvl(task, regs, sp, bp, log_lvl); } diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 9fb188d7bc76..59e175e89599 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -382,20 +382,20 @@ sysenter_past_esp: * enough kernel state to call TRACE_IRQS_OFF can be called - but * we immediately enable interrupts at that point anyway. */ - pushl_cfi $(__USER_DS) + pushl_cfi $__USER_DS /*CFI_REL_OFFSET ss, 0*/ pushl_cfi %ebp CFI_REL_OFFSET esp, 0 pushfl_cfi orl $X86_EFLAGS_IF, (%esp) - pushl_cfi $(__USER_CS) + pushl_cfi $__USER_CS /*CFI_REL_OFFSET cs, 0*/ /* * Push current_thread_info()->sysenter_return to the stack. * A tiny bit of offset fixup is necessary - 4*4 means the 4 words * pushed above; +8 corresponds to copy_thread's esp0 setting. 
*/ - pushl_cfi (TI_sysenter_return-THREAD_SIZE+8+4*4)(%esp) + pushl_cfi (TI_sysenter_return-THREAD_SIZE_asm+8+4*4)(%esp) CFI_REL_OFFSET eip, 0 pushl_cfi %eax diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index a7ae7fd1010f..fe2690d71c0c 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -963,22 +963,10 @@ apicinterrupt X86_PLATFORM_IPI_VECTOR \ x86_platform_ipi smp_x86_platform_ipi #ifdef CONFIG_SMP -apicinterrupt INVALIDATE_TLB_VECTOR_START+0 \ - invalidate_interrupt0 smp_invalidate_interrupt -apicinterrupt INVALIDATE_TLB_VECTOR_START+1 \ - invalidate_interrupt1 smp_invalidate_interrupt -apicinterrupt INVALIDATE_TLB_VECTOR_START+2 \ - invalidate_interrupt2 smp_invalidate_interrupt -apicinterrupt INVALIDATE_TLB_VECTOR_START+3 \ - invalidate_interrupt3 smp_invalidate_interrupt -apicinterrupt INVALIDATE_TLB_VECTOR_START+4 \ - invalidate_interrupt4 smp_invalidate_interrupt -apicinterrupt INVALIDATE_TLB_VECTOR_START+5 \ - invalidate_interrupt5 smp_invalidate_interrupt -apicinterrupt INVALIDATE_TLB_VECTOR_START+6 \ - invalidate_interrupt6 smp_invalidate_interrupt -apicinterrupt INVALIDATE_TLB_VECTOR_START+7 \ - invalidate_interrupt7 smp_invalidate_interrupt +.irpc idx, "01234567" +apicinterrupt (INVALIDATE_TLB_VECTOR_START)+\idx \ + invalidate_interrupt\idx smp_invalidate_interrupt +.endr #endif apicinterrupt THRESHOLD_APIC_VECTOR \ diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index 9a6ca2392170..763310165fa0 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -18,6 +18,7 @@ #include <asm/apic.h> #include <asm/io_apic.h> #include <asm/bios_ebda.h> +#include <asm/tlbflush.h> static void __init i386_default_early_setup(void) { diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index fa8c1b8e09fb..bcece91dd311 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -183,13 +183,12 @@ default_entry: #ifdef CONFIG_X86_PAE /* - * In PAE mode swapper_pg_dir 
is statically defined to contain enough - * entries to cover the VMSPLIT option (that is the top 1, 2 or 3 - * entries). The identity mapping is handled by pointing two PGD - * entries to the first kernel PMD. + * In PAE mode initial_page_table is statically defined to contain + * enough entries to cover the VMSPLIT option (that is the top 1, 2 or 3 + * entries). The identity mapping is handled by pointing two PGD entries + * to the first kernel PMD. * - * Note the upper half of each PMD or PTE are always zero at - * this stage. + * Note the upper half of each PMD or PTE are always zero at this stage. */ #define KPMDS (((-__PAGE_OFFSET) >> 30) & 3) /* Number of kernel PMDs */ @@ -197,7 +196,7 @@ default_entry: xorl %ebx,%ebx /* %ebx is kept at zero */ movl $pa(__brk_base), %edi - movl $pa(swapper_pg_pmd), %edx + movl $pa(initial_pg_pmd), %edx movl $PTE_IDENT_ATTR, %eax 10: leal PDE_IDENT_ATTR(%edi),%ecx /* Create PMD entry */ @@ -226,14 +225,14 @@ default_entry: movl %eax, pa(max_pfn_mapped) /* Do early initialization of the fixmap area */ - movl $pa(swapper_pg_fixmap)+PDE_IDENT_ATTR,%eax - movl %eax,pa(swapper_pg_pmd+0x1000*KPMDS-8) + movl $pa(initial_pg_fixmap)+PDE_IDENT_ATTR,%eax + movl %eax,pa(initial_pg_pmd+0x1000*KPMDS-8) #else /* Not PAE */ page_pde_offset = (__PAGE_OFFSET >> 20); movl $pa(__brk_base), %edi - movl $pa(swapper_pg_dir), %edx + movl $pa(initial_page_table), %edx movl $PTE_IDENT_ATTR, %eax 10: leal PDE_IDENT_ATTR(%edi),%ecx /* Create PDE entry */ @@ -257,8 +256,8 @@ page_pde_offset = (__PAGE_OFFSET >> 20); movl %eax, pa(max_pfn_mapped) /* Do early initialization of the fixmap area */ - movl $pa(swapper_pg_fixmap)+PDE_IDENT_ATTR,%eax - movl %eax,pa(swapper_pg_dir+0xffc) + movl $pa(initial_pg_fixmap)+PDE_IDENT_ATTR,%eax + movl %eax,pa(initial_page_table+0xffc) #endif jmp 3f /* @@ -334,7 +333,7 @@ ENTRY(startup_32_smp) /* * Enable paging */ - movl pa(initial_page_table), %eax + movl $pa(initial_page_table), %eax movl %eax,%cr3 /* set the page table 
pointer.. */ movl %cr0,%eax orl $X86_CR0_PG,%eax @@ -614,8 +613,6 @@ ignore_int: .align 4 ENTRY(initial_code) .long i386_start_kernel -ENTRY(initial_page_table) - .long pa(swapper_pg_dir) /* * BSS section @@ -623,20 +620,18 @@ ENTRY(initial_page_table) __PAGE_ALIGNED_BSS .align PAGE_SIZE_asm #ifdef CONFIG_X86_PAE -swapper_pg_pmd: +initial_pg_pmd: .fill 1024*KPMDS,4,0 #else -ENTRY(swapper_pg_dir) +ENTRY(initial_page_table) .fill 1024,4,0 #endif -swapper_pg_fixmap: +initial_pg_fixmap: .fill 1024,4,0 -#ifdef CONFIG_X86_TRAMPOLINE -ENTRY(trampoline_pg_dir) - .fill 1024,4,0 -#endif ENTRY(empty_zero_page) .fill 4096,1,0 +ENTRY(swapper_pg_dir) + .fill 1024,4,0 /* * This starts the data section. @@ -645,20 +640,20 @@ ENTRY(empty_zero_page) __PAGE_ALIGNED_DATA /* Page-aligned for the benefit of paravirt? */ .align PAGE_SIZE_asm -ENTRY(swapper_pg_dir) - .long pa(swapper_pg_pmd+PGD_IDENT_ATTR),0 /* low identity map */ +ENTRY(initial_page_table) + .long pa(initial_pg_pmd+PGD_IDENT_ATTR),0 /* low identity map */ # if KPMDS == 3 - .long pa(swapper_pg_pmd+PGD_IDENT_ATTR),0 - .long pa(swapper_pg_pmd+PGD_IDENT_ATTR+0x1000),0 - .long pa(swapper_pg_pmd+PGD_IDENT_ATTR+0x2000),0 + .long pa(initial_pg_pmd+PGD_IDENT_ATTR),0 + .long pa(initial_pg_pmd+PGD_IDENT_ATTR+0x1000),0 + .long pa(initial_pg_pmd+PGD_IDENT_ATTR+0x2000),0 # elif KPMDS == 2 .long 0,0 - .long pa(swapper_pg_pmd+PGD_IDENT_ATTR),0 - .long pa(swapper_pg_pmd+PGD_IDENT_ATTR+0x1000),0 + .long pa(initial_pg_pmd+PGD_IDENT_ATTR),0 + .long pa(initial_pg_pmd+PGD_IDENT_ATTR+0x1000),0 # elif KPMDS == 1 .long 0,0 .long 0,0 - .long pa(swapper_pg_pmd+PGD_IDENT_ATTR),0 + .long pa(initial_pg_pmd+PGD_IDENT_ATTR),0 # else # error "Kernel PMDs should be 1, 2 or 3" # endif diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index efaf906daf93..ae03cab4352e 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -380,44 +380,35 @@ static int hpet_next_event(unsigned long delta, struct clock_event_device *evt, int timer) { 
u32 cnt; + s32 res; cnt = hpet_readl(HPET_COUNTER); cnt += (u32) delta; hpet_writel(cnt, HPET_Tn_CMP(timer)); /* - * We need to read back the CMP register on certain HPET - * implementations (ATI chipsets) which seem to delay the - * transfer of the compare register into the internal compare - * logic. With small deltas this might actually be too late as - * the counter could already be higher than the compare value - * at that point and we would wait for the next hpet interrupt - * forever. We found out that reading the CMP register back - * forces the transfer so we can rely on the comparison with - * the counter register below. If the read back from the - * compare register does not match the value we programmed - * then we might have a real hardware problem. We can not do - * much about it here, but at least alert the user/admin with - * a prominent warning. - * - * An erratum on some chipsets (ICH9,..), results in - * comparator read immediately following a write returning old - * value. Workaround for this is to read this value second - * time, when first read returns old value. - * - * In fact the write to the comparator register is delayed up - * to two HPET cycles so the workaround we tried to restrict - * the readback to those known to be borked ATI chipsets - * failed miserably. So we give up on optimizations forever - * and penalize all HPET incarnations unconditionally. + * HPETs are a complete disaster. The compare register is + * based on a equal comparison and neither provides a less + * than or equal functionality (which would require to take + * the wraparound into account) nor a simple count down event + * mode. Further the write to the comparator register is + * delayed internally up to two HPET clock cycles in certain + * chipsets (ATI, ICH9,10). We worked around that by reading + * back the compare register, but that required another + * workaround for ICH9,10 chips where the first readout after + * write can return the old stale value. 
We already have a + * minimum delta of 5us enforced, but a NMI or SMI hitting + * between the counter readout and the comparator write can + * move us behind that point easily. Now instead of reading + * the compare register back several times, we make the ETIME + * decision based on the following: Return ETIME if the + * counter value after the write is less than 8 HPET cycles + * away from the event or if the counter is already ahead of + * the event. */ - if (unlikely((u32)hpet_readl(HPET_Tn_CMP(timer)) != cnt)) { - if (hpet_readl(HPET_Tn_CMP(timer)) != cnt) - printk_once(KERN_WARNING - "hpet: compare register read back failed.\n"); - } + res = (s32)(cnt - hpet_readl(HPET_COUNTER)); - return (s32)(hpet_readl(HPET_COUNTER) - cnt) >= 0 ? -ETIME : 0; + return res < 8 ? -ETIME : 0; } static void hpet_legacy_set_mode(enum clock_event_mode mode, @@ -722,7 +713,7 @@ static int hpet_cpuhp_notify(struct notifier_block *n, switch (action & 0xf) { case CPU_ONLINE: - INIT_DELAYED_WORK_ON_STACK(&work.work, hpet_work); + INIT_DELAYED_WORK_ONSTACK(&work.work, hpet_work); init_completion(&work.complete); /* FIXME: add schedule_work_on() */ schedule_delayed_work_on(cpu, &work.work, 0); diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 10709f29d166..64668dbf00a4 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -49,21 +49,17 @@ static inline int check_stack_overflow(void) { return 0; } static inline void print_stack_overflow(void) { } #endif -#ifdef CONFIG_4KSTACKS /* * per-CPU IRQ handling contexts (thread information and stack) */ union irq_ctx { struct thread_info tinfo; u32 stack[THREAD_SIZE/sizeof(u32)]; -} __attribute__((aligned(PAGE_SIZE))); +} __attribute__((aligned(THREAD_SIZE))); static DEFINE_PER_CPU(union irq_ctx *, hardirq_ctx); static DEFINE_PER_CPU(union irq_ctx *, softirq_ctx); -static DEFINE_PER_CPU_PAGE_ALIGNED(union irq_ctx, hardirq_stack); -static DEFINE_PER_CPU_PAGE_ALIGNED(union irq_ctx, softirq_stack); - static 
void call_on_stack(void *func, void *stack) { asm volatile("xchgl %%ebx,%%esp \n" @@ -129,7 +125,7 @@ void __cpuinit irq_ctx_init(int cpu) if (per_cpu(hardirq_ctx, cpu)) return; - irqctx = &per_cpu(hardirq_stack, cpu); + irqctx = (union irq_ctx *)__get_free_pages(THREAD_FLAGS, THREAD_ORDER); irqctx->tinfo.task = NULL; irqctx->tinfo.exec_domain = NULL; irqctx->tinfo.cpu = cpu; @@ -138,7 +134,7 @@ void __cpuinit irq_ctx_init(int cpu) per_cpu(hardirq_ctx, cpu) = irqctx; - irqctx = &per_cpu(softirq_stack, cpu); + irqctx = (union irq_ctx *)__get_free_pages(THREAD_FLAGS, THREAD_ORDER); irqctx->tinfo.task = NULL; irqctx->tinfo.exec_domain = NULL; irqctx->tinfo.cpu = cpu; @@ -151,11 +147,6 @@ void __cpuinit irq_ctx_init(int cpu) cpu, per_cpu(hardirq_ctx, cpu), per_cpu(softirq_ctx, cpu)); } -void irq_ctx_exit(int cpu) -{ - per_cpu(hardirq_ctx, cpu) = NULL; -} - asmlinkage void do_softirq(void) { unsigned long flags; @@ -187,11 +178,6 @@ asmlinkage void do_softirq(void) local_irq_restore(flags); } -#else -static inline int -execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq) { return 0; } -#endif - bool handle_irq(unsigned irq, struct pt_regs *regs) { struct irq_desc *desc; diff --git a/arch/x86/kernel/kdebugfs.c b/arch/x86/kernel/kdebugfs.c index 8afd9f321f10..90fcf62854bb 100644 --- a/arch/x86/kernel/kdebugfs.c +++ b/arch/x86/kernel/kdebugfs.c @@ -78,6 +78,7 @@ static int setup_data_open(struct inode *inode, struct file *file) static const struct file_operations fops_setup_data = { .read = setup_data_read, .open = setup_data_open, + .llseek = default_llseek, }; static int __init diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index 852b81967a37..d81cfebb848f 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -477,8 +477,6 @@ int kgdb_arch_handle_exception(int e_vector, int signo, int err_code, raw_smp_processor_id()); } - kgdb_correct_hw_break(); - return 0; } @@ -621,7 +619,12 @@ int kgdb_arch_init(void) static void 
kgdb_hw_overflow_handler(struct perf_event *event, int nmi, struct perf_sample_data *data, struct pt_regs *regs) { - kgdb_ll_trap(DIE_DEBUG, "debug", regs, 0, 0, SIGTRAP); + struct task_struct *tsk = current; + int i; + + for (i = 0; i < 4; i++) + if (breakinfo[i].enabled) + tsk->thread.debugreg6 |= (DR_TRAP0 << i); } void kgdb_arch_late(void) @@ -644,7 +647,7 @@ void kgdb_arch_late(void) if (breakinfo[i].pev) continue; breakinfo[i].pev = register_wide_hw_breakpoint(&attr, NULL); - if (IS_ERR(breakinfo[i].pev)) { + if (IS_ERR((void * __force)breakinfo[i].pev)) { printk(KERN_ERR "kgdb: Could not allocate hw" "breakpoints\nDisabling the kernel debugger\n"); breakinfo[i].pev = NULL; diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index eb9b76c716c2..ca43ce31a19c 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -128,13 +128,15 @@ static struct clocksource kvm_clock = { static int kvm_register_clock(char *txt) { int cpu = smp_processor_id(); - int low, high; + int low, high, ret; + low = (int)__pa(&per_cpu(hv_clock, cpu)) | 1; high = ((u64)__pa(&per_cpu(hv_clock, cpu)) >> 32); + ret = native_write_msr_safe(msr_kvm_system_time, low, high); printk(KERN_INFO "kvm-clock: cpu %d, msr %x:%x, %s\n", cpu, high, low, txt); - return native_write_msr_safe(msr_kvm_system_time, low, high); + return ret; } #ifdef CONFIG_X86_LOCAL_APIC diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c index fa6551d36c10..1cca374a2bac 100644 --- a/arch/x86/kernel/microcode_core.c +++ b/arch/x86/kernel/microcode_core.c @@ -12,7 +12,7 @@ * Software Developer's Manual * Order Number 253668 or free download from: * - * http://developer.intel.com/design/pentium4/manuals/253668.htm + * http://developer.intel.com/Assets/PDF/manual/253668.pdf * * For more information, go to http://www.urbanmyth.org/microcode * @@ -232,6 +232,7 @@ static const struct file_operations microcode_fops = { .owner = THIS_MODULE, .write = microcode_write, 
.open = microcode_open, + .llseek = no_llseek, }; static struct miscdevice microcode_dev = { diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c index 356170262a93..dcb65cc0a053 100644 --- a/arch/x86/kernel/microcode_intel.c +++ b/arch/x86/kernel/microcode_intel.c @@ -12,7 +12,7 @@ * Software Developer's Manual * Order Number 253668 or free download from: * - * http://developer.intel.com/design/pentium4/manuals/253668.htm + * http://developer.intel.com/Assets/PDF/manual/253668.pdf * * For more information, go to http://www.urbanmyth.org/microcode * diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 70c4872cd8aa..45892dc4b72a 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -801,7 +801,8 @@ void ptrace_disable(struct task_struct *child) static const struct user_regset_view user_x86_32_view; /* Initialized below. */ #endif -long arch_ptrace(struct task_struct *child, long request, long addr, long data) +long arch_ptrace(struct task_struct *child, long request, + unsigned long addr, unsigned long data) { int ret; unsigned long __user *datap = (unsigned long __user *)data; @@ -812,8 +813,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data) unsigned long tmp; ret = -EIO; - if ((addr & (sizeof(data) - 1)) || addr < 0 || - addr >= sizeof(struct user)) + if ((addr & (sizeof(data) - 1)) || addr >= sizeof(struct user)) break; tmp = 0; /* Default return condition */ @@ -830,8 +830,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data) case PTRACE_POKEUSR: /* write the word at location addr in the USER area */ ret = -EIO; - if ((addr & (sizeof(data) - 1)) || addr < 0 || - addr >= sizeof(struct user)) + if ((addr & (sizeof(data) - 1)) || addr >= sizeof(struct user)) break; if (addr < sizeof(struct user_regs_struct)) @@ -888,17 +887,17 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data) #if defined CONFIG_X86_32 || 
defined CONFIG_IA32_EMULATION case PTRACE_GET_THREAD_AREA: - if (addr < 0) + if ((int) addr < 0) return -EIO; ret = do_get_thread_area(child, addr, - (struct user_desc __user *) data); + (struct user_desc __user *)data); break; case PTRACE_SET_THREAD_AREA: - if (addr < 0) + if ((int) addr < 0) return -EIO; ret = do_set_thread_area(child, addr, - (struct user_desc __user *) data, 0); + (struct user_desc __user *)data, 0); break; #endif diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c index 239427ca02af..bab3b9e6f66d 100644 --- a/arch/x86/kernel/pvclock.c +++ b/arch/x86/kernel/pvclock.c @@ -82,7 +82,8 @@ static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift) static u64 pvclock_get_nsec_offset(struct pvclock_shadow_time *shadow) { u64 delta = native_read_tsc() - shadow->tsc_timestamp; - return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift); + return pvclock_scale_delta(delta, shadow->tsc_to_nsec_mul, + shadow->tsc_shift); } /* diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c index 939b9e98245f..8bbe8c56916d 100644 --- a/arch/x86/kernel/quirks.c +++ b/arch/x86/kernel/quirks.c @@ -344,6 +344,8 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8235, vt8237_force_enable_hpet); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8237, vt8237_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_CX700, + vt8237_force_enable_hpet); static void ati_force_hpet_resume(void) { diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 7a4cf14223ba..c495aa8d4815 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -371,16 +371,10 @@ void machine_real_restart(const unsigned char *code, int length) CMOS_WRITE(0x00, 0x8f); spin_unlock(&rtc_lock); - /* Remap the kernel at virtual address zero, as well as offset zero - from the kernel segment. This assumes the kernel segment starts at - virtual address PAGE_OFFSET. 
*/ - memcpy(swapper_pg_dir, swapper_pg_dir + KERNEL_PGD_BOUNDARY, - sizeof(swapper_pg_dir [0]) * KERNEL_PGD_PTRS); - /* - * Use `swapper_pg_dir' as our page directory. + * Switch back to the initial page table. */ - load_cr3(swapper_pg_dir); + load_cr3(initial_page_table); /* Write 0x1234 to absolute memory location 0x472. The BIOS reads this on booting to tell it to "Bypass memory test (also warm @@ -641,7 +635,7 @@ void native_machine_shutdown(void) /* O.K Now that I'm on the appropriate processor, * stop all of the others. */ - smp_send_stop(); + stop_other_cpus(); #endif lapic_shutdown(); diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 420e64197850..95a32746fbf9 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -700,6 +700,17 @@ void __init setup_arch(char **cmdline_p) #ifdef CONFIG_X86_32 memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data)); visws_early_detect(); + + /* + * copy kernel address range established so far and switch + * to the proper swapper page table + */ + clone_pgd_range(swapper_pg_dir + KERNEL_PGD_BOUNDARY, + initial_page_table + KERNEL_PGD_BOUNDARY, + KERNEL_PGD_PTRS); + + load_cr3(swapper_pg_dir); + __flush_tlb_all(); #else printk(KERN_INFO "Command line: %s\n", boot_command_line); #endif @@ -985,7 +996,12 @@ void __init setup_arch(char **cmdline_p) paging_init(); x86_init.paging.pagetable_setup_done(swapper_pg_dir); - setup_trampoline_page_table(); +#ifdef CONFIG_X86_32 + /* sync back kernel address range */ + clone_pgd_range(initial_page_table + KERNEL_PGD_BOUNDARY, + swapper_pg_dir + KERNEL_PGD_BOUNDARY, + KERNEL_PGD_PTRS); +#endif tboot_probe(); diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index d801210945d6..513deac7228d 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -159,10 +159,10 @@ asmlinkage void smp_reboot_interrupt(void) irq_exit(); } -static void native_smp_send_stop(void) +static void native_stop_other_cpus(int wait) { unsigned long flags; - 
unsigned long wait; + unsigned long timeout; if (reboot_force) return; @@ -179,9 +179,12 @@ static void native_smp_send_stop(void) if (num_online_cpus() > 1) { apic->send_IPI_allbutself(REBOOT_VECTOR); - /* Don't wait longer than a second */ - wait = USEC_PER_SEC; - while (num_online_cpus() > 1 && wait--) + /* + * Don't wait longer than a second if the caller + * didn't ask us to wait. + */ + timeout = USEC_PER_SEC; + while (num_online_cpus() > 1 && (wait || timeout--)) udelay(1); } @@ -227,7 +230,7 @@ struct smp_ops smp_ops = { .smp_prepare_cpus = native_smp_prepare_cpus, .smp_cpus_done = native_smp_cpus_done, - .smp_send_stop = native_smp_send_stop, + .stop_other_cpus = native_stop_other_cpus, .smp_send_reschedule = native_smp_send_reschedule, .cpu_up = native_cpu_up, diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index dfb50890b5b7..083e99d1b7df 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -299,22 +299,16 @@ notrace static void __cpuinit start_secondary(void *unused) * fragile that we want to limit the things done here to the * most necessary things. */ + cpu_init(); + preempt_disable(); + smp_callin(); #ifdef CONFIG_X86_32 - /* - * Switch away from the trampoline page-table - * - * Do this before cpu_init() because it needs to access per-cpu - * data which may not be mapped in the trampoline page-table. 
- */ + /* switch away from the initial page table */ load_cr3(swapper_pg_dir); __flush_tlb_all(); #endif - cpu_init(); - preempt_disable(); - smp_callin(); - /* otherwise gcc will move up smp_processor_id before the cpu_init */ barrier(); /* @@ -753,7 +747,7 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu) .done = COMPLETION_INITIALIZER_ONSTACK(c_idle.done), }; - INIT_WORK_ON_STACK(&c_idle.work, do_fork_idle); + INIT_WORK_ONSTACK(&c_idle.work, do_fork_idle); alternatives_smp_switch(1); @@ -785,7 +779,6 @@ do_rest: #ifdef CONFIG_X86_32 /* Stack for startup_32 can be just as for start_secondary onwards */ irq_ctx_init(cpu); - initial_page_table = __pa(&trampoline_pg_dir); #else clear_tsk_thread_flag(c_idle.idle, TIF_FORK); initial_gs = per_cpu_offset(cpu); @@ -934,7 +927,6 @@ int __cpuinit native_cpu_up(unsigned int cpu) per_cpu(cpu_state, cpu) = CPU_UP_PREPARE; err = do_boot_cpu(apicid, cpu); - if (err) { pr_debug("do_boot_cpu failed %d\n", err); return -EIO; @@ -1381,7 +1373,6 @@ void play_dead_common(void) { idle_task_exit(); reset_lazy_tlbstate(); - irq_ctx_exit(raw_smp_processor_id()); c1e_remove_cpu(raw_smp_processor_id()); mb(); diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c index 312ef0292815..20ea20a39e2a 100644 --- a/arch/x86/kernel/tlb_uv.c +++ b/arch/x86/kernel/tlb_uv.c @@ -1001,10 +1001,10 @@ static int uv_ptc_seq_show(struct seq_file *file, void *data) static ssize_t tunables_read(struct file *file, char __user *userbuf, size_t count, loff_t *ppos) { - char buf[300]; + char *buf; int ret; - ret = snprintf(buf, 300, "%s %s %s\n%d %d %d %d %d %d %d %d %d\n", + buf = kasprintf(GFP_KERNEL, "%s %s %s\n%d %d %d %d %d %d %d %d %d\n", "max_bau_concurrent plugged_delay plugsb4reset", "timeoutsb4reset ipi_reset_limit complete_threshold", "congested_response_us congested_reps congested_period", @@ -1012,7 +1012,12 @@ static ssize_t tunables_read(struct file *file, char __user *userbuf, timeoutsb4reset, ipi_reset_limit, complete_threshold, 
congested_response_us, congested_reps, congested_period); - return simple_read_from_buffer(userbuf, count, ppos, buf, ret); + if (!buf) + return -ENOMEM; + + ret = simple_read_from_buffer(userbuf, count, ppos, buf, strlen(buf)); + kfree(buf); + return ret; } /* @@ -1285,6 +1290,7 @@ static const struct file_operations tunables_fops = { .open = tunables_open, .read = tunables_read, .write = tunables_write, + .llseek = default_llseek, }; static int __init uv_ptc_init(void) diff --git a/arch/x86/kernel/trampoline.c b/arch/x86/kernel/trampoline.c index 4c3da5674e67..a375616d77f7 100644 --- a/arch/x86/kernel/trampoline.c +++ b/arch/x86/kernel/trampoline.c @@ -38,19 +38,3 @@ unsigned long __trampinit setup_trampoline(void) memcpy(trampoline_base, trampoline_data, TRAMPOLINE_SIZE); return virt_to_phys(trampoline_base); } - -void __init setup_trampoline_page_table(void) -{ -#ifdef CONFIG_X86_32 - /* Copy kernel address range */ - clone_pgd_range(trampoline_pg_dir + KERNEL_PGD_BOUNDARY, - swapper_pg_dir + KERNEL_PGD_BOUNDARY, - KERNEL_PGD_PTRS); - - /* Initialize low mappings */ - clone_pgd_range(trampoline_pg_dir, - swapper_pg_dir + KERNEL_PGD_BOUNDARY, - min_t(unsigned long, KERNEL_PGD_PTRS, - KERNEL_PGD_BOUNDARY)); -#endif -} diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index d43968503dd2..cb838ca42c96 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -575,6 +575,7 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) if (regs->flags & X86_VM_MASK) { handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1); + preempt_conditional_cli(regs); return; } diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index 5ffb5622f793..61fb98519622 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -551,8 +551,14 @@ cannot_handle: int handle_vm86_trap(struct kernel_vm86_regs *regs, long error_code, int trapno) { if (VMPI.is_vm86pus) { - if ((trapno == 3) || (trapno == 1)) - 
return_to_32bit(regs, VM86_TRAP + (trapno << 8)); + if ((trapno == 3) || (trapno == 1)) { + KVM86->regs32->ax = VM86_TRAP + (trapno << 8); + /* setting this flag forces the code in entry_32.S to + call save_v86_state() and change the stack pointer + to KVM86->regs32 */ + set_thread_flag(TIF_IRET); + return 0; + } do_int(regs, trapno, (unsigned char __user *) (regs->pt.ss << 4), SP(regs)); return 0; } diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 38e2b67807e1..e03530aebfd0 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -301,7 +301,7 @@ SECTIONS } #if !defined(CONFIG_X86_64) || !defined(CONFIG_SMP) - PERCPU(PAGE_SIZE) + PERCPU(THREAD_SIZE) #endif . = ALIGN(PAGE_SIZE); diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 970bbd479516..ddc131ff438f 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -64,6 +64,13 @@ config KVM_AMD To compile this as a module, choose M here: the module will be called kvm-amd. +config KVM_MMU_AUDIT + bool "Audit KVM MMU" + depends on KVM && TRACEPOINTS + ---help--- + This option adds a R/W kVM module parameter 'mmu_audit', which allows + audit KVM MMU at runtime. + # OK, it's a little counter-intuitive to do this, but it puts it neatly under # the virtualization menu. source drivers/vhost/Kconfig diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 66ca98aafdd6..38b6e8dafaff 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -9,7 +9,7 @@ * privileged instructions: * * Copyright (C) 2006 Qumranet - * Copyright 2010 Red Hat, Inc. and/or its affilates. + * Copyright 2010 Red Hat, Inc. and/or its affiliates. * * Avi Kivity <avi@qumranet.com> * Yaniv Kamay <yaniv@qumranet.com> @@ -51,13 +51,13 @@ #define ImplicitOps (1<<1) /* Implicit in opcode. No generic decode. */ #define DstReg (2<<1) /* Register operand. */ #define DstMem (3<<1) /* Memory operand. 
*/ -#define DstAcc (4<<1) /* Destination Accumulator */ +#define DstAcc (4<<1) /* Destination Accumulator */ #define DstDI (5<<1) /* Destination is in ES:(E)DI */ #define DstMem64 (6<<1) /* 64bit memory operand */ +#define DstImmUByte (7<<1) /* 8-bit unsigned immediate operand */ #define DstMask (7<<1) /* Source operand type. */ #define SrcNone (0<<4) /* No source operand. */ -#define SrcImplicit (0<<4) /* Source operand is implicit in the opcode. */ #define SrcReg (1<<4) /* Register operand. */ #define SrcMem (2<<4) /* Memory operand. */ #define SrcMem16 (3<<4) /* Memory operand (16-bit). */ @@ -71,6 +71,7 @@ #define SrcImmFAddr (0xb<<4) /* Source is immediate far address */ #define SrcMemFAddr (0xc<<4) /* Source is far address in memory */ #define SrcAcc (0xd<<4) /* Source Accumulator */ +#define SrcImmU16 (0xe<<4) /* Immediate operand, unsigned, 16 bits */ #define SrcMask (0xf<<4) /* Generic ModRM decode. */ #define ModRM (1<<8) @@ -82,8 +83,10 @@ #define Stack (1<<13) /* Stack instruction (push/pop) */ #define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */ #define GroupDual (1<<15) /* Alternate decoding of mod == 3 */ -#define GroupMask 0xff /* Group number stored in bits 0:7 */ /* Misc flags */ +#define NoAccess (1<<23) /* Don't access memory (lea/invlpg/verr etc) */ +#define Op3264 (1<<24) /* Operand is 64b in long mode, 32b otherwise */ +#define Undefined (1<<25) /* No Such Instruction */ #define Lock (1<<26) /* lock prefix is allowed for the instruction */ #define Priv (1<<27) /* instruction generates #GP if current CPL != 0 */ #define No64 (1<<28) @@ -92,285 +95,30 @@ #define Src2CL (1<<29) #define Src2ImmByte (2<<29) #define Src2One (3<<29) +#define Src2Imm (4<<29) #define Src2Mask (7<<29) -enum { - Group1_80, Group1_81, Group1_82, Group1_83, - Group1A, Group3_Byte, Group3, Group4, Group5, Group7, - Group8, Group9, +#define X2(x...) x, x +#define X3(x...) X2(x), x +#define X4(x...) X2(x), X2(x) +#define X5(x...) X4(x), x +#define X6(x...) 
X4(x), X2(x) +#define X7(x...) X4(x), X3(x) +#define X8(x...) X4(x), X4(x) +#define X16(x...) X8(x), X8(x) + +struct opcode { + u32 flags; + union { + int (*execute)(struct x86_emulate_ctxt *ctxt); + struct opcode *group; + struct group_dual *gdual; + } u; }; -static u32 opcode_table[256] = { - /* 0x00 - 0x07 */ - ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, - ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, - ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, - ImplicitOps | Stack | No64, ImplicitOps | Stack | No64, - /* 0x08 - 0x0F */ - ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, - ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, - ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, - ImplicitOps | Stack | No64, 0, - /* 0x10 - 0x17 */ - ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, - ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, - ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, - ImplicitOps | Stack | No64, ImplicitOps | Stack | No64, - /* 0x18 - 0x1F */ - ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, - ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, - ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, - ImplicitOps | Stack | No64, ImplicitOps | Stack | No64, - /* 0x20 - 0x27 */ - ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, - ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, - ByteOp | DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0, - /* 0x28 - 0x2F */ - ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, - ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, - ByteOp | DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0, - /* 0x30 - 0x37 */ - ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, - ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, - ByteOp | DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0, - /* 0x38 - 0x3F */ - ByteOp | DstMem | SrcReg | ModRM, 
DstMem | SrcReg | ModRM, - ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, - ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, - 0, 0, - /* 0x40 - 0x47 */ - DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, - /* 0x48 - 0x4F */ - DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, - /* 0x50 - 0x57 */ - SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, - SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, - /* 0x58 - 0x5F */ - DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack, - DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack, - /* 0x60 - 0x67 */ - ImplicitOps | Stack | No64, ImplicitOps | Stack | No64, - 0, DstReg | SrcMem32 | ModRM | Mov /* movsxd (x86/64) */ , - 0, 0, 0, 0, - /* 0x68 - 0x6F */ - SrcImm | Mov | Stack, 0, SrcImmByte | Mov | Stack, 0, - DstDI | ByteOp | Mov | String, DstDI | Mov | String, /* insb, insw/insd */ - SrcSI | ByteOp | ImplicitOps | String, SrcSI | ImplicitOps | String, /* outsb, outsw/outsd */ - /* 0x70 - 0x77 */ - SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte, - SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte, - /* 0x78 - 0x7F */ - SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte, - SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte, - /* 0x80 - 0x87 */ - Group | Group1_80, Group | Group1_81, - Group | Group1_82, Group | Group1_83, - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, - ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, - /* 0x88 - 0x8F */ - ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov, - ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, - DstMem | SrcNone | ModRM | Mov, ModRM | DstReg, - ImplicitOps | SrcMem16 | ModRM, Group | Group1A, - /* 0x90 - 0x97 */ - DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, - /* 0x98 - 0x9F */ - 0, 0, SrcImmFAddr | No64, 0, - ImplicitOps | Stack, ImplicitOps | Stack, 0, 0, - /* 0xA0 - 0xA7 */ - ByteOp | DstAcc | SrcMem | Mov 
| MemAbs, DstAcc | SrcMem | Mov | MemAbs, - ByteOp | DstMem | SrcAcc | Mov | MemAbs, DstMem | SrcAcc | Mov | MemAbs, - ByteOp | SrcSI | DstDI | Mov | String, SrcSI | DstDI | Mov | String, - ByteOp | SrcSI | DstDI | String, SrcSI | DstDI | String, - /* 0xA8 - 0xAF */ - DstAcc | SrcImmByte | ByteOp, DstAcc | SrcImm, ByteOp | DstDI | Mov | String, DstDI | Mov | String, - ByteOp | SrcSI | DstAcc | Mov | String, SrcSI | DstAcc | Mov | String, - ByteOp | DstDI | String, DstDI | String, - /* 0xB0 - 0xB7 */ - ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov, - ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov, - ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov, - ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov, - /* 0xB8 - 0xBF */ - DstReg | SrcImm | Mov, DstReg | SrcImm | Mov, - DstReg | SrcImm | Mov, DstReg | SrcImm | Mov, - DstReg | SrcImm | Mov, DstReg | SrcImm | Mov, - DstReg | SrcImm | Mov, DstReg | SrcImm | Mov, - /* 0xC0 - 0xC7 */ - ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM, - 0, ImplicitOps | Stack, 0, 0, - ByteOp | DstMem | SrcImm | ModRM | Mov, DstMem | SrcImm | ModRM | Mov, - /* 0xC8 - 0xCF */ - 0, 0, 0, ImplicitOps | Stack, - ImplicitOps, SrcImmByte, ImplicitOps | No64, ImplicitOps, - /* 0xD0 - 0xD7 */ - ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM, - ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM, - 0, 0, 0, 0, - /* 0xD8 - 0xDF */ - 0, 0, 0, 0, 0, 0, 0, 0, - /* 0xE0 - 0xE7 */ - 0, 0, 0, 0, - ByteOp | SrcImmUByte | DstAcc, SrcImmUByte | DstAcc, - ByteOp | SrcImmUByte | DstAcc, SrcImmUByte | DstAcc, - /* 0xE8 - 0xEF */ - SrcImm | Stack, SrcImm | ImplicitOps, - SrcImmFAddr | No64, SrcImmByte | ImplicitOps, - SrcNone | ByteOp | DstAcc, SrcNone | DstAcc, - SrcNone | ByteOp | DstAcc, SrcNone | DstAcc, - /* 0xF0 - 0xF7 */ - 0, 0, 0, 0, - ImplicitOps | Priv, ImplicitOps, Group | Group3_Byte, Group | Group3, - /* 0xF8 - 0xFF */ - ImplicitOps, 0, 
ImplicitOps, ImplicitOps, - ImplicitOps, ImplicitOps, Group | Group4, Group | Group5, -}; - -static u32 twobyte_table[256] = { - /* 0x00 - 0x0F */ - 0, Group | GroupDual | Group7, 0, 0, - 0, ImplicitOps, ImplicitOps | Priv, 0, - ImplicitOps | Priv, ImplicitOps | Priv, 0, 0, - 0, ImplicitOps | ModRM, 0, 0, - /* 0x10 - 0x1F */ - 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0, - /* 0x20 - 0x2F */ - ModRM | ImplicitOps | Priv, ModRM | Priv, - ModRM | ImplicitOps | Priv, ModRM | Priv, - 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - /* 0x30 - 0x3F */ - ImplicitOps | Priv, 0, ImplicitOps | Priv, 0, - ImplicitOps, ImplicitOps | Priv, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - /* 0x40 - 0x47 */ - DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, - DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, - DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, - DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, - /* 0x48 - 0x4F */ - DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, - DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, - DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, - DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, - /* 0x50 - 0x5F */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - /* 0x60 - 0x6F */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - /* 0x70 - 0x7F */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - /* 0x80 - 0x8F */ - SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, - SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, - /* 0x90 - 0x9F */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - /* 0xA0 - 0xA7 */ - ImplicitOps | Stack, ImplicitOps | Stack, - 0, DstMem | SrcReg | ModRM | BitOp, - DstMem | SrcReg | Src2ImmByte | ModRM, - DstMem | SrcReg | Src2CL | ModRM, 0, 0, - /* 0xA8 - 0xAF */ - ImplicitOps | Stack, ImplicitOps | Stack, - 0, DstMem | SrcReg | ModRM | BitOp | Lock, - DstMem | SrcReg | Src2ImmByte | ModRM, - 
DstMem | SrcReg | Src2CL | ModRM, - ModRM, 0, - /* 0xB0 - 0xB7 */ - ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, - 0, DstMem | SrcReg | ModRM | BitOp | Lock, - 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov, - DstReg | SrcMem16 | ModRM | Mov, - /* 0xB8 - 0xBF */ - 0, 0, - Group | Group8, DstMem | SrcReg | ModRM | BitOp | Lock, - 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov, - DstReg | SrcMem16 | ModRM | Mov, - /* 0xC0 - 0xCF */ - 0, 0, 0, DstMem | SrcReg | ModRM | Mov, - 0, 0, 0, Group | GroupDual | Group9, - 0, 0, 0, 0, 0, 0, 0, 0, - /* 0xD0 - 0xDF */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - /* 0xE0 - 0xEF */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - /* 0xF0 - 0xFF */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 -}; - -static u32 group_table[] = { - [Group1_80*8] = - ByteOp | DstMem | SrcImm | ModRM | Lock, - ByteOp | DstMem | SrcImm | ModRM | Lock, - ByteOp | DstMem | SrcImm | ModRM | Lock, - ByteOp | DstMem | SrcImm | ModRM | Lock, - ByteOp | DstMem | SrcImm | ModRM | Lock, - ByteOp | DstMem | SrcImm | ModRM | Lock, - ByteOp | DstMem | SrcImm | ModRM | Lock, - ByteOp | DstMem | SrcImm | ModRM, - [Group1_81*8] = - DstMem | SrcImm | ModRM | Lock, - DstMem | SrcImm | ModRM | Lock, - DstMem | SrcImm | ModRM | Lock, - DstMem | SrcImm | ModRM | Lock, - DstMem | SrcImm | ModRM | Lock, - DstMem | SrcImm | ModRM | Lock, - DstMem | SrcImm | ModRM | Lock, - DstMem | SrcImm | ModRM, - [Group1_82*8] = - ByteOp | DstMem | SrcImm | ModRM | No64 | Lock, - ByteOp | DstMem | SrcImm | ModRM | No64 | Lock, - ByteOp | DstMem | SrcImm | ModRM | No64 | Lock, - ByteOp | DstMem | SrcImm | ModRM | No64 | Lock, - ByteOp | DstMem | SrcImm | ModRM | No64 | Lock, - ByteOp | DstMem | SrcImm | ModRM | No64 | Lock, - ByteOp | DstMem | SrcImm | ModRM | No64 | Lock, - ByteOp | DstMem | SrcImm | ModRM | No64, - [Group1_83*8] = - DstMem | SrcImmByte | ModRM | Lock, - DstMem | SrcImmByte | ModRM | Lock, - DstMem | SrcImmByte | ModRM | Lock, - DstMem 
| SrcImmByte | ModRM | Lock, - DstMem | SrcImmByte | ModRM | Lock, - DstMem | SrcImmByte | ModRM | Lock, - DstMem | SrcImmByte | ModRM | Lock, - DstMem | SrcImmByte | ModRM, - [Group1A*8] = - DstMem | SrcNone | ModRM | Mov | Stack, 0, 0, 0, 0, 0, 0, 0, - [Group3_Byte*8] = - ByteOp | SrcImm | DstMem | ModRM, ByteOp | SrcImm | DstMem | ModRM, - ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM, - 0, 0, 0, 0, - [Group3*8] = - DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, - DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, - 0, 0, 0, 0, - [Group4*8] = - ByteOp | DstMem | SrcNone | ModRM | Lock, ByteOp | DstMem | SrcNone | ModRM | Lock, - 0, 0, 0, 0, 0, 0, - [Group5*8] = - DstMem | SrcNone | ModRM | Lock, DstMem | SrcNone | ModRM | Lock, - SrcMem | ModRM | Stack, 0, - SrcMem | ModRM | Stack, SrcMemFAddr | ModRM | ImplicitOps, - SrcMem | ModRM | Stack, 0, - [Group7*8] = - 0, 0, ModRM | SrcMem | Priv, ModRM | SrcMem | Priv, - SrcNone | ModRM | DstMem | Mov, 0, - SrcMem16 | ModRM | Mov | Priv, SrcMem | ModRM | ByteOp | Priv, - [Group8*8] = - 0, 0, 0, 0, - DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM | Lock, - DstMem | SrcImmByte | ModRM | Lock, DstMem | SrcImmByte | ModRM | Lock, - [Group9*8] = - 0, DstMem64 | ModRM | Lock, 0, 0, 0, 0, 0, 0, -}; - -static u32 group2_table[] = { - [Group7*8] = - SrcNone | ModRM | Priv, 0, 0, SrcNone | ModRM | Priv, - SrcNone | ModRM | DstMem | Mov, 0, - SrcMem16 | ModRM | Mov | Priv, 0, - [Group9*8] = - 0, 0, 0, 0, 0, 0, 0, 0, +struct group_dual { + struct opcode mod012[8]; + struct opcode mod3[8]; }; /* EFLAGS bit definitions. 
*/ @@ -392,6 +140,9 @@ static u32 group2_table[] = { #define EFLG_PF (1<<2) #define EFLG_CF (1<<0) +#define EFLG_RESERVED_ZEROS_MASK 0xffc0802a +#define EFLG_RESERVED_ONE_MASK 2 + /* * Instruction emulation: * Most instructions are emulated directly via a fragment of inline assembly @@ -444,13 +195,13 @@ static u32 group2_table[] = { #define ON64(x) #endif -#define ____emulate_2op(_op, _src, _dst, _eflags, _x, _y, _suffix) \ +#define ____emulate_2op(_op, _src, _dst, _eflags, _x, _y, _suffix, _dsttype) \ do { \ __asm__ __volatile__ ( \ _PRE_EFLAGS("0", "4", "2") \ _op _suffix " %"_x"3,%1; " \ _POST_EFLAGS("0", "4", "2") \ - : "=m" (_eflags), "=m" ((_dst).val), \ + : "=m" (_eflags), "+q" (*(_dsttype*)&(_dst).val),\ "=&r" (_tmp) \ : _y ((_src).val), "i" (EFLAGS_MASK)); \ } while (0) @@ -463,13 +214,13 @@ static u32 group2_table[] = { \ switch ((_dst).bytes) { \ case 2: \ - ____emulate_2op(_op,_src,_dst,_eflags,_wx,_wy,"w"); \ + ____emulate_2op(_op,_src,_dst,_eflags,_wx,_wy,"w",u16);\ break; \ case 4: \ - ____emulate_2op(_op,_src,_dst,_eflags,_lx,_ly,"l"); \ + ____emulate_2op(_op,_src,_dst,_eflags,_lx,_ly,"l",u32);\ break; \ case 8: \ - ON64(____emulate_2op(_op,_src,_dst,_eflags,_qx,_qy,"q")); \ + ON64(____emulate_2op(_op,_src,_dst,_eflags,_qx,_qy,"q",u64)); \ break; \ } \ } while (0) @@ -479,7 +230,7 @@ static u32 group2_table[] = { unsigned long _tmp; \ switch ((_dst).bytes) { \ case 1: \ - ____emulate_2op(_op,_src,_dst,_eflags,_bx,_by,"b"); \ + ____emulate_2op(_op,_src,_dst,_eflags,_bx,_by,"b",u8); \ break; \ default: \ __emulate_2op_nobyte(_op, _src, _dst, _eflags, \ @@ -566,6 +317,74 @@ static u32 group2_table[] = { } \ } while (0) +#define __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, _suffix) \ + do { \ + unsigned long _tmp; \ + \ + __asm__ __volatile__ ( \ + _PRE_EFLAGS("0", "4", "1") \ + _op _suffix " %5; " \ + _POST_EFLAGS("0", "4", "1") \ + : "=m" (_eflags), "=&r" (_tmp), \ + "+a" (_rax), "+d" (_rdx) \ + : "i" (EFLAGS_MASK), "m" ((_src).val), \ + "a" 
(_rax), "d" (_rdx)); \ + } while (0) + +#define __emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, _eflags, _suffix, _ex) \ + do { \ + unsigned long _tmp; \ + \ + __asm__ __volatile__ ( \ + _PRE_EFLAGS("0", "5", "1") \ + "1: \n\t" \ + _op _suffix " %6; " \ + "2: \n\t" \ + _POST_EFLAGS("0", "5", "1") \ + ".pushsection .fixup,\"ax\" \n\t" \ + "3: movb $1, %4 \n\t" \ + "jmp 2b \n\t" \ + ".popsection \n\t" \ + _ASM_EXTABLE(1b, 3b) \ + : "=m" (_eflags), "=&r" (_tmp), \ + "+a" (_rax), "+d" (_rdx), "+qm"(_ex) \ + : "i" (EFLAGS_MASK), "m" ((_src).val), \ + "a" (_rax), "d" (_rdx)); \ + } while (0) + +/* instruction has only one source operand, destination is implicit (e.g. mul, div, imul, idiv) */ +#define emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags) \ + do { \ + switch((_src).bytes) { \ + case 1: __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, "b"); break; \ + case 2: __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, "w"); break; \ + case 4: __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, "l"); break; \ + case 8: ON64(__emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, "q")); break; \ + } \ + } while (0) + +#define emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, _eflags, _ex) \ + do { \ + switch((_src).bytes) { \ + case 1: \ + __emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, \ + _eflags, "b", _ex); \ + break; \ + case 2: \ + __emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, \ + _eflags, "w", _ex); \ + break; \ + case 4: \ + __emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, \ + _eflags, "l", _ex); \ + break; \ + case 8: ON64( \ + __emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, \ + _eflags, "q", _ex)); \ + break; \ + } \ + } while (0) + /* Fetch next part of the instruction being emulated. 
*/ #define insn_fetch(_type, _size, _eip) \ ({ unsigned long _x; \ @@ -661,7 +480,6 @@ static void emulate_exception(struct x86_emulate_ctxt *ctxt, int vec, ctxt->exception = vec; ctxt->error_code = error; ctxt->error_code_valid = valid; - ctxt->restart = false; } static void emulate_gp(struct x86_emulate_ctxt *ctxt, int err) @@ -669,11 +487,9 @@ static void emulate_gp(struct x86_emulate_ctxt *ctxt, int err) emulate_exception(ctxt, GP_VECTOR, err, true); } -static void emulate_pf(struct x86_emulate_ctxt *ctxt, unsigned long addr, - int err) +static void emulate_pf(struct x86_emulate_ctxt *ctxt) { - ctxt->cr2 = addr; - emulate_exception(ctxt, PF_VECTOR, err, true); + emulate_exception(ctxt, PF_VECTOR, 0, true); } static void emulate_ud(struct x86_emulate_ctxt *ctxt) @@ -686,6 +502,12 @@ static void emulate_ts(struct x86_emulate_ctxt *ctxt, int err) emulate_exception(ctxt, TS_VECTOR, err, true); } +static int emulate_de(struct x86_emulate_ctxt *ctxt) +{ + emulate_exception(ctxt, DE_VECTOR, 0, false); + return X86EMUL_PROPAGATE_FAULT; +} + static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops, unsigned long eip, u8 *dest) @@ -742,7 +564,7 @@ static void *decode_register(u8 modrm_reg, unsigned long *regs, static int read_descriptor(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops, - void *ptr, + ulong addr, u16 *size, unsigned long *address, int op_bytes) { int rc; @@ -750,12 +572,10 @@ static int read_descriptor(struct x86_emulate_ctxt *ctxt, if (op_bytes == 2) op_bytes = 3; *address = 0; - rc = ops->read_std((unsigned long)ptr, (unsigned long *)size, 2, - ctxt->vcpu, NULL); + rc = ops->read_std(addr, (unsigned long *)size, 2, ctxt->vcpu, NULL); if (rc != X86EMUL_CONTINUE) return rc; - rc = ops->read_std((unsigned long)ptr + 2, address, op_bytes, - ctxt->vcpu, NULL); + rc = ops->read_std(addr + 2, address, op_bytes, ctxt->vcpu, NULL); return rc; } @@ -794,6 +614,24 @@ static int test_cc(unsigned int condition, unsigned int 
flags) return (!!rc ^ (condition & 1)); } +static void fetch_register_operand(struct operand *op) +{ + switch (op->bytes) { + case 1: + op->val = *(u8 *)op->addr.reg; + break; + case 2: + op->val = *(u16 *)op->addr.reg; + break; + case 4: + op->val = *(u32 *)op->addr.reg; + break; + case 8: + op->val = *(u64 *)op->addr.reg; + break; + } +} + static void decode_register_operand(struct operand *op, struct decode_cache *c, int inhibit_bytereg) @@ -805,34 +643,25 @@ static void decode_register_operand(struct operand *op, reg = (c->b & 7) | ((c->rex_prefix & 1) << 3); op->type = OP_REG; if ((c->d & ByteOp) && !inhibit_bytereg) { - op->ptr = decode_register(reg, c->regs, highbyte_regs); - op->val = *(u8 *)op->ptr; + op->addr.reg = decode_register(reg, c->regs, highbyte_regs); op->bytes = 1; } else { - op->ptr = decode_register(reg, c->regs, 0); + op->addr.reg = decode_register(reg, c->regs, 0); op->bytes = c->op_bytes; - switch (op->bytes) { - case 2: - op->val = *(u16 *)op->ptr; - break; - case 4: - op->val = *(u32 *)op->ptr; - break; - case 8: - op->val = *(u64 *) op->ptr; - break; - } } + fetch_register_operand(op); op->orig_val = op->val; } static int decode_modrm(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops) + struct x86_emulate_ops *ops, + struct operand *op) { struct decode_cache *c = &ctxt->decode; u8 sib; int index_reg = 0, base_reg = 0, scale; int rc = X86EMUL_CONTINUE; + ulong modrm_ea = 0; if (c->rex_prefix) { c->modrm_reg = (c->rex_prefix & 4) << 1; /* REX.R */ @@ -844,16 +673,19 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, c->modrm_mod |= (c->modrm & 0xc0) >> 6; c->modrm_reg |= (c->modrm & 0x38) >> 3; c->modrm_rm |= (c->modrm & 0x07); - c->modrm_ea = 0; - c->use_modrm_ea = 1; + c->modrm_seg = VCPU_SREG_DS; if (c->modrm_mod == 3) { - c->modrm_ptr = decode_register(c->modrm_rm, + op->type = OP_REG; + op->bytes = (c->d & ByteOp) ? 
1 : c->op_bytes; + op->addr.reg = decode_register(c->modrm_rm, c->regs, c->d & ByteOp); - c->modrm_val = *(unsigned long *)c->modrm_ptr; + fetch_register_operand(op); return rc; } + op->type = OP_MEM; + if (c->ad_bytes == 2) { unsigned bx = c->regs[VCPU_REGS_RBX]; unsigned bp = c->regs[VCPU_REGS_RBP]; @@ -864,47 +696,46 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, switch (c->modrm_mod) { case 0: if (c->modrm_rm == 6) - c->modrm_ea += insn_fetch(u16, 2, c->eip); + modrm_ea += insn_fetch(u16, 2, c->eip); break; case 1: - c->modrm_ea += insn_fetch(s8, 1, c->eip); + modrm_ea += insn_fetch(s8, 1, c->eip); break; case 2: - c->modrm_ea += insn_fetch(u16, 2, c->eip); + modrm_ea += insn_fetch(u16, 2, c->eip); break; } switch (c->modrm_rm) { case 0: - c->modrm_ea += bx + si; + modrm_ea += bx + si; break; case 1: - c->modrm_ea += bx + di; + modrm_ea += bx + di; break; case 2: - c->modrm_ea += bp + si; + modrm_ea += bp + si; break; case 3: - c->modrm_ea += bp + di; + modrm_ea += bp + di; break; case 4: - c->modrm_ea += si; + modrm_ea += si; break; case 5: - c->modrm_ea += di; + modrm_ea += di; break; case 6: if (c->modrm_mod != 0) - c->modrm_ea += bp; + modrm_ea += bp; break; case 7: - c->modrm_ea += bx; + modrm_ea += bx; break; } if (c->modrm_rm == 2 || c->modrm_rm == 3 || (c->modrm_rm == 6 && c->modrm_mod != 0)) - if (!c->has_seg_override) - set_seg_override(c, VCPU_SREG_SS); - c->modrm_ea = (u16)c->modrm_ea; + c->modrm_seg = VCPU_SREG_SS; + modrm_ea = (u16)modrm_ea; } else { /* 32/64-bit ModR/M decode. 
*/ if ((c->modrm_rm & 7) == 4) { @@ -914,410 +745,74 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, scale = sib >> 6; if ((base_reg & 7) == 5 && c->modrm_mod == 0) - c->modrm_ea += insn_fetch(s32, 4, c->eip); + modrm_ea += insn_fetch(s32, 4, c->eip); else - c->modrm_ea += c->regs[base_reg]; + modrm_ea += c->regs[base_reg]; if (index_reg != 4) - c->modrm_ea += c->regs[index_reg] << scale; + modrm_ea += c->regs[index_reg] << scale; } else if ((c->modrm_rm & 7) == 5 && c->modrm_mod == 0) { if (ctxt->mode == X86EMUL_MODE_PROT64) c->rip_relative = 1; } else - c->modrm_ea += c->regs[c->modrm_rm]; + modrm_ea += c->regs[c->modrm_rm]; switch (c->modrm_mod) { case 0: if (c->modrm_rm == 5) - c->modrm_ea += insn_fetch(s32, 4, c->eip); + modrm_ea += insn_fetch(s32, 4, c->eip); break; case 1: - c->modrm_ea += insn_fetch(s8, 1, c->eip); + modrm_ea += insn_fetch(s8, 1, c->eip); break; case 2: - c->modrm_ea += insn_fetch(s32, 4, c->eip); + modrm_ea += insn_fetch(s32, 4, c->eip); break; } } + op->addr.mem = modrm_ea; done: return rc; } static int decode_abs(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops) + struct x86_emulate_ops *ops, + struct operand *op) { struct decode_cache *c = &ctxt->decode; int rc = X86EMUL_CONTINUE; + op->type = OP_MEM; switch (c->ad_bytes) { case 2: - c->modrm_ea = insn_fetch(u16, 2, c->eip); + op->addr.mem = insn_fetch(u16, 2, c->eip); break; case 4: - c->modrm_ea = insn_fetch(u32, 4, c->eip); + op->addr.mem = insn_fetch(u32, 4, c->eip); break; case 8: - c->modrm_ea = insn_fetch(u64, 8, c->eip); + op->addr.mem = insn_fetch(u64, 8, c->eip); break; } done: return rc; } -int -x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) +static void fetch_bit_operand(struct decode_cache *c) { - struct decode_cache *c = &ctxt->decode; - int rc = X86EMUL_CONTINUE; - int mode = ctxt->mode; - int def_op_bytes, def_ad_bytes, group; - - - /* we cannot decode insn before we complete previous rep insn */ - 
WARN_ON(ctxt->restart); - - c->eip = ctxt->eip; - c->fetch.start = c->fetch.end = c->eip; - ctxt->cs_base = seg_base(ctxt, ops, VCPU_SREG_CS); - - switch (mode) { - case X86EMUL_MODE_REAL: - case X86EMUL_MODE_VM86: - case X86EMUL_MODE_PROT16: - def_op_bytes = def_ad_bytes = 2; - break; - case X86EMUL_MODE_PROT32: - def_op_bytes = def_ad_bytes = 4; - break; -#ifdef CONFIG_X86_64 - case X86EMUL_MODE_PROT64: - def_op_bytes = 4; - def_ad_bytes = 8; - break; -#endif - default: - return -1; - } - - c->op_bytes = def_op_bytes; - c->ad_bytes = def_ad_bytes; - - /* Legacy prefixes. */ - for (;;) { - switch (c->b = insn_fetch(u8, 1, c->eip)) { - case 0x66: /* operand-size override */ - /* switch between 2/4 bytes */ - c->op_bytes = def_op_bytes ^ 6; - break; - case 0x67: /* address-size override */ - if (mode == X86EMUL_MODE_PROT64) - /* switch between 4/8 bytes */ - c->ad_bytes = def_ad_bytes ^ 12; - else - /* switch between 2/4 bytes */ - c->ad_bytes = def_ad_bytes ^ 6; - break; - case 0x26: /* ES override */ - case 0x2e: /* CS override */ - case 0x36: /* SS override */ - case 0x3e: /* DS override */ - set_seg_override(c, (c->b >> 3) & 3); - break; - case 0x64: /* FS override */ - case 0x65: /* GS override */ - set_seg_override(c, c->b & 7); - break; - case 0x40 ... 0x4f: /* REX */ - if (mode != X86EMUL_MODE_PROT64) - goto done_prefixes; - c->rex_prefix = c->b; - continue; - case 0xf0: /* LOCK */ - c->lock_prefix = 1; - break; - case 0xf2: /* REPNE/REPNZ */ - c->rep_prefix = REPNE_PREFIX; - break; - case 0xf3: /* REP/REPE/REPZ */ - c->rep_prefix = REPE_PREFIX; - break; - default: - goto done_prefixes; - } - - /* Any legacy prefix after a REX prefix nullifies its effect. */ - - c->rex_prefix = 0; - } - -done_prefixes: - - /* REX prefix. */ - if (c->rex_prefix) - if (c->rex_prefix & 8) - c->op_bytes = 8; /* REX.W */ - - /* Opcode byte(s). */ - c->d = opcode_table[c->b]; - if (c->d == 0) { - /* Two-byte opcode? 
*/ - if (c->b == 0x0f) { - c->twobyte = 1; - c->b = insn_fetch(u8, 1, c->eip); - c->d = twobyte_table[c->b]; - } - } - - if (c->d & Group) { - group = c->d & GroupMask; - c->modrm = insn_fetch(u8, 1, c->eip); - --c->eip; - - group = (group << 3) + ((c->modrm >> 3) & 7); - if ((c->d & GroupDual) && (c->modrm >> 6) == 3) - c->d = group2_table[group]; - else - c->d = group_table[group]; - } - - /* Unrecognised? */ - if (c->d == 0) { - DPRINTF("Cannot emulate %02x\n", c->b); - return -1; - } - - if (mode == X86EMUL_MODE_PROT64 && (c->d & Stack)) - c->op_bytes = 8; - - /* ModRM and SIB bytes. */ - if (c->d & ModRM) - rc = decode_modrm(ctxt, ops); - else if (c->d & MemAbs) - rc = decode_abs(ctxt, ops); - if (rc != X86EMUL_CONTINUE) - goto done; - - if (!c->has_seg_override) - set_seg_override(c, VCPU_SREG_DS); - - if (!(!c->twobyte && c->b == 0x8d)) - c->modrm_ea += seg_override_base(ctxt, ops, c); - - if (c->ad_bytes != 8) - c->modrm_ea = (u32)c->modrm_ea; - - if (c->rip_relative) - c->modrm_ea += c->eip; - - /* - * Decode and fetch the source operand: register, memory - * or immediate. - */ - switch (c->d & SrcMask) { - case SrcNone: - break; - case SrcReg: - decode_register_operand(&c->src, c, 0); - break; - case SrcMem16: - c->src.bytes = 2; - goto srcmem_common; - case SrcMem32: - c->src.bytes = 4; - goto srcmem_common; - case SrcMem: - c->src.bytes = (c->d & ByteOp) ? 1 : - c->op_bytes; - /* Don't fetch the address for invlpg: it could be unmapped. */ - if (c->twobyte && c->b == 0x01 && c->modrm_reg == 7) - break; - srcmem_common: - /* - * For instructions with a ModR/M byte, switch to register - * access if Mod = 3. 
- */ - if ((c->d & ModRM) && c->modrm_mod == 3) { - c->src.type = OP_REG; - c->src.val = c->modrm_val; - c->src.ptr = c->modrm_ptr; - break; - } - c->src.type = OP_MEM; - c->src.ptr = (unsigned long *)c->modrm_ea; - c->src.val = 0; - break; - case SrcImm: - case SrcImmU: - c->src.type = OP_IMM; - c->src.ptr = (unsigned long *)c->eip; - c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; - if (c->src.bytes == 8) - c->src.bytes = 4; - /* NB. Immediates are sign-extended as necessary. */ - switch (c->src.bytes) { - case 1: - c->src.val = insn_fetch(s8, 1, c->eip); - break; - case 2: - c->src.val = insn_fetch(s16, 2, c->eip); - break; - case 4: - c->src.val = insn_fetch(s32, 4, c->eip); - break; - } - if ((c->d & SrcMask) == SrcImmU) { - switch (c->src.bytes) { - case 1: - c->src.val &= 0xff; - break; - case 2: - c->src.val &= 0xffff; - break; - case 4: - c->src.val &= 0xffffffff; - break; - } - } - break; - case SrcImmByte: - case SrcImmUByte: - c->src.type = OP_IMM; - c->src.ptr = (unsigned long *)c->eip; - c->src.bytes = 1; - if ((c->d & SrcMask) == SrcImmByte) - c->src.val = insn_fetch(s8, 1, c->eip); - else - c->src.val = insn_fetch(u8, 1, c->eip); - break; - case SrcAcc: - c->src.type = OP_REG; - c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; - c->src.ptr = &c->regs[VCPU_REGS_RAX]; - switch (c->src.bytes) { - case 1: - c->src.val = *(u8 *)c->src.ptr; - break; - case 2: - c->src.val = *(u16 *)c->src.ptr; - break; - case 4: - c->src.val = *(u32 *)c->src.ptr; - break; - case 8: - c->src.val = *(u64 *)c->src.ptr; - break; - } - break; - case SrcOne: - c->src.bytes = 1; - c->src.val = 1; - break; - case SrcSI: - c->src.type = OP_MEM; - c->src.bytes = (c->d & ByteOp) ? 
1 : c->op_bytes; - c->src.ptr = (unsigned long *) - register_address(c, seg_override_base(ctxt, ops, c), - c->regs[VCPU_REGS_RSI]); - c->src.val = 0; - break; - case SrcImmFAddr: - c->src.type = OP_IMM; - c->src.ptr = (unsigned long *)c->eip; - c->src.bytes = c->op_bytes + 2; - insn_fetch_arr(c->src.valptr, c->src.bytes, c->eip); - break; - case SrcMemFAddr: - c->src.type = OP_MEM; - c->src.ptr = (unsigned long *)c->modrm_ea; - c->src.bytes = c->op_bytes + 2; - break; - } + long sv = 0, mask; - /* - * Decode and fetch the second source operand: register, memory - * or immediate. - */ - switch (c->d & Src2Mask) { - case Src2None: - break; - case Src2CL: - c->src2.bytes = 1; - c->src2.val = c->regs[VCPU_REGS_RCX] & 0x8; - break; - case Src2ImmByte: - c->src2.type = OP_IMM; - c->src2.ptr = (unsigned long *)c->eip; - c->src2.bytes = 1; - c->src2.val = insn_fetch(u8, 1, c->eip); - break; - case Src2One: - c->src2.bytes = 1; - c->src2.val = 1; - break; - } + if (c->dst.type == OP_MEM && c->src.type == OP_REG) { + mask = ~(c->dst.bytes * 8 - 1); - /* Decode and fetch the destination operand: register or memory. */ - switch (c->d & DstMask) { - case ImplicitOps: - /* Special instructions do their own operand decoding. */ - return 0; - case DstReg: - decode_register_operand(&c->dst, c, - c->twobyte && (c->b == 0xb6 || c->b == 0xb7)); - break; - case DstMem: - case DstMem64: - if ((c->d & ModRM) && c->modrm_mod == 3) { - c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; - c->dst.type = OP_REG; - c->dst.val = c->dst.orig_val = c->modrm_val; - c->dst.ptr = c->modrm_ptr; - break; - } - c->dst.type = OP_MEM; - c->dst.ptr = (unsigned long *)c->modrm_ea; - if ((c->d & DstMask) == DstMem64) - c->dst.bytes = 8; - else - c->dst.bytes = (c->d & ByteOp) ? 
1 : c->op_bytes; - c->dst.val = 0; - if (c->d & BitOp) { - unsigned long mask = ~(c->dst.bytes * 8 - 1); + if (c->src.bytes == 2) + sv = (s16)c->src.val & (s16)mask; + else if (c->src.bytes == 4) + sv = (s32)c->src.val & (s32)mask; - c->dst.ptr = (void *)c->dst.ptr + - (c->src.val & mask) / 8; - } - break; - case DstAcc: - c->dst.type = OP_REG; - c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; - c->dst.ptr = &c->regs[VCPU_REGS_RAX]; - switch (c->dst.bytes) { - case 1: - c->dst.val = *(u8 *)c->dst.ptr; - break; - case 2: - c->dst.val = *(u16 *)c->dst.ptr; - break; - case 4: - c->dst.val = *(u32 *)c->dst.ptr; - break; - case 8: - c->dst.val = *(u64 *)c->dst.ptr; - break; - } - c->dst.orig_val = c->dst.val; - break; - case DstDI: - c->dst.type = OP_MEM; - c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; - c->dst.ptr = (unsigned long *) - register_address(c, es_base(ctxt, ops), - c->regs[VCPU_REGS_RDI]); - c->dst.val = 0; - break; + c->dst.addr.mem += (sv >> 3); } -done: - return (rc == X86EMUL_UNHANDLEABLE) ? 
-1 : 0; + /* only subword offset */ + c->src.val &= (c->dst.bytes << 3) - 1; } static int read_emulated(struct x86_emulate_ctxt *ctxt, @@ -1337,7 +832,7 @@ static int read_emulated(struct x86_emulate_ctxt *ctxt, rc = ops->read_emulated(addr, mc->data + mc->end, n, &err, ctxt->vcpu); if (rc == X86EMUL_PROPAGATE_FAULT) - emulate_pf(ctxt, addr, err); + emulate_pf(ctxt); if (rc != X86EMUL_CONTINUE) return rc; mc->end += n; @@ -1424,7 +919,7 @@ static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt, addr = dt.address + index * 8; ret = ops->read_std(addr, desc, sizeof *desc, ctxt->vcpu, &err); if (ret == X86EMUL_PROPAGATE_FAULT) - emulate_pf(ctxt, addr, err); + emulate_pf(ctxt); return ret; } @@ -1450,7 +945,7 @@ static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt, addr = dt.address + index * 8; ret = ops->write_std(addr, desc, sizeof *desc, ctxt->vcpu, &err); if (ret == X86EMUL_PROPAGATE_FAULT) - emulate_pf(ctxt, addr, err); + emulate_pf(ctxt); return ret; } @@ -1573,6 +1068,25 @@ exception: return X86EMUL_PROPAGATE_FAULT; } +static void write_register_operand(struct operand *op) +{ + /* The 4-byte case *is* correct: in 64-bit mode we zero-extend. */ + switch (op->bytes) { + case 1: + *(u8 *)op->addr.reg = (u8)op->val; + break; + case 2: + *(u16 *)op->addr.reg = (u16)op->val; + break; + case 4: + *op->addr.reg = (u32)op->val; + break; /* 64b: zero-extend */ + case 8: + *op->addr.reg = op->val; + break; + } +} + static inline int writeback(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) { @@ -1582,28 +1096,12 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt, switch (c->dst.type) { case OP_REG: - /* The 4-byte case *is* correct: - * in 64-bit mode we zero-extend. 
- */ - switch (c->dst.bytes) { - case 1: - *(u8 *)c->dst.ptr = (u8)c->dst.val; - break; - case 2: - *(u16 *)c->dst.ptr = (u16)c->dst.val; - break; - case 4: - *c->dst.ptr = (u32)c->dst.val; - break; /* 64b: zero-ext */ - case 8: - *c->dst.ptr = c->dst.val; - break; - } + write_register_operand(&c->dst); break; case OP_MEM: if (c->lock_prefix) rc = ops->cmpxchg_emulated( - (unsigned long)c->dst.ptr, + c->dst.addr.mem, &c->dst.orig_val, &c->dst.val, c->dst.bytes, @@ -1611,14 +1109,13 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt, ctxt->vcpu); else rc = ops->write_emulated( - (unsigned long)c->dst.ptr, + c->dst.addr.mem, &c->dst.val, c->dst.bytes, &err, ctxt->vcpu); if (rc == X86EMUL_PROPAGATE_FAULT) - emulate_pf(ctxt, - (unsigned long)c->dst.ptr, err); + emulate_pf(ctxt); if (rc != X86EMUL_CONTINUE) return rc; break; @@ -1640,8 +1137,8 @@ static inline void emulate_push(struct x86_emulate_ctxt *ctxt, c->dst.bytes = c->op_bytes; c->dst.val = c->src.val; register_address_increment(c, &c->regs[VCPU_REGS_RSP], -c->op_bytes); - c->dst.ptr = (void *) register_address(c, ss_base(ctxt, ops), - c->regs[VCPU_REGS_RSP]); + c->dst.addr.mem = register_address(c, ss_base(ctxt, ops), + c->regs[VCPU_REGS_RSP]); } static int emulate_pop(struct x86_emulate_ctxt *ctxt, @@ -1701,6 +1198,9 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt, *(unsigned long *)dest = (ctxt->eflags & ~change_mask) | (val & change_mask); + if (rc == X86EMUL_PROPAGATE_FAULT) + emulate_pf(ctxt); + return rc; } @@ -1778,6 +1278,150 @@ static int emulate_popa(struct x86_emulate_ctxt *ctxt, return rc; } +int emulate_int_real(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, int irq) +{ + struct decode_cache *c = &ctxt->decode; + int rc; + struct desc_ptr dt; + gva_t cs_addr; + gva_t eip_addr; + u16 cs, eip; + u32 err; + + /* TODO: Add limit checks */ + c->src.val = ctxt->eflags; + emulate_push(ctxt, ops); + rc = writeback(ctxt, ops); + if (rc != X86EMUL_CONTINUE) + return rc; + + 
ctxt->eflags &= ~(EFLG_IF | EFLG_TF | EFLG_AC); + + c->src.val = ops->get_segment_selector(VCPU_SREG_CS, ctxt->vcpu); + emulate_push(ctxt, ops); + rc = writeback(ctxt, ops); + if (rc != X86EMUL_CONTINUE) + return rc; + + c->src.val = c->eip; + emulate_push(ctxt, ops); + rc = writeback(ctxt, ops); + if (rc != X86EMUL_CONTINUE) + return rc; + + c->dst.type = OP_NONE; + + ops->get_idt(&dt, ctxt->vcpu); + + eip_addr = dt.address + (irq << 2); + cs_addr = dt.address + (irq << 2) + 2; + + rc = ops->read_std(cs_addr, &cs, 2, ctxt->vcpu, &err); + if (rc != X86EMUL_CONTINUE) + return rc; + + rc = ops->read_std(eip_addr, &eip, 2, ctxt->vcpu, &err); + if (rc != X86EMUL_CONTINUE) + return rc; + + rc = load_segment_descriptor(ctxt, ops, cs, VCPU_SREG_CS); + if (rc != X86EMUL_CONTINUE) + return rc; + + c->eip = eip; + + return rc; +} + +static int emulate_int(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, int irq) +{ + switch(ctxt->mode) { + case X86EMUL_MODE_REAL: + return emulate_int_real(ctxt, ops, irq); + case X86EMUL_MODE_VM86: + case X86EMUL_MODE_PROT16: + case X86EMUL_MODE_PROT32: + case X86EMUL_MODE_PROT64: + default: + /* Protected mode interrupts unimplemented yet */ + return X86EMUL_UNHANDLEABLE; + } +} + +static int emulate_iret_real(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops) +{ + struct decode_cache *c = &ctxt->decode; + int rc = X86EMUL_CONTINUE; + unsigned long temp_eip = 0; + unsigned long temp_eflags = 0; + unsigned long cs = 0; + unsigned long mask = EFLG_CF | EFLG_PF | EFLG_AF | EFLG_ZF | EFLG_SF | EFLG_TF | + EFLG_IF | EFLG_DF | EFLG_OF | EFLG_IOPL | EFLG_NT | EFLG_RF | + EFLG_AC | EFLG_ID | (1 << 1); /* Last one is the reserved bit */ + unsigned long vm86_mask = EFLG_VM | EFLG_VIF | EFLG_VIP; + + /* TODO: Add stack limit check */ + + rc = emulate_pop(ctxt, ops, &temp_eip, c->op_bytes); + + if (rc != X86EMUL_CONTINUE) + return rc; + + if (temp_eip & ~0xffff) { + emulate_gp(ctxt, 0); + return X86EMUL_PROPAGATE_FAULT; + } + + 
rc = emulate_pop(ctxt, ops, &cs, c->op_bytes); + + if (rc != X86EMUL_CONTINUE) + return rc; + + rc = emulate_pop(ctxt, ops, &temp_eflags, c->op_bytes); + + if (rc != X86EMUL_CONTINUE) + return rc; + + rc = load_segment_descriptor(ctxt, ops, (u16)cs, VCPU_SREG_CS); + + if (rc != X86EMUL_CONTINUE) + return rc; + + c->eip = temp_eip; + + + if (c->op_bytes == 4) + ctxt->eflags = ((temp_eflags & mask) | (ctxt->eflags & vm86_mask)); + else if (c->op_bytes == 2) { + ctxt->eflags &= ~0xffff; + ctxt->eflags |= temp_eflags; + } + + ctxt->eflags &= ~EFLG_RESERVED_ZEROS_MASK; /* Clear reserved zeros */ + ctxt->eflags |= EFLG_RESERVED_ONE_MASK; + + return rc; +} + +static inline int emulate_iret(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops* ops) +{ + switch(ctxt->mode) { + case X86EMUL_MODE_REAL: + return emulate_iret_real(ctxt, ops); + case X86EMUL_MODE_VM86: + case X86EMUL_MODE_PROT16: + case X86EMUL_MODE_PROT32: + case X86EMUL_MODE_PROT64: + default: + /* iret from protected mode unimplemented yet */ + return X86EMUL_UNHANDLEABLE; + } +} + static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) { @@ -1819,6 +1463,9 @@ static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) { struct decode_cache *c = &ctxt->decode; + unsigned long *rax = &c->regs[VCPU_REGS_RAX]; + unsigned long *rdx = &c->regs[VCPU_REGS_RDX]; + u8 de = 0; switch (c->modrm_reg) { case 0 ... 
1: /* test */ @@ -1830,10 +1477,26 @@ static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt, case 3: /* neg */ emulate_1op("neg", c->dst, ctxt->eflags); break; + case 4: /* mul */ + emulate_1op_rax_rdx("mul", c->src, *rax, *rdx, ctxt->eflags); + break; + case 5: /* imul */ + emulate_1op_rax_rdx("imul", c->src, *rax, *rdx, ctxt->eflags); + break; + case 6: /* div */ + emulate_1op_rax_rdx_ex("div", c->src, *rax, *rdx, + ctxt->eflags, de); + break; + case 7: /* idiv */ + emulate_1op_rax_rdx_ex("idiv", c->src, *rax, *rdx, + ctxt->eflags, de); + break; default: - return 0; + return X86EMUL_UNHANDLEABLE; } - return 1; + if (de) + return emulate_de(ctxt); + return X86EMUL_CONTINUE; } static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt, @@ -1905,6 +1568,23 @@ static int emulate_ret_far(struct x86_emulate_ctxt *ctxt, return rc; } +static int emulate_load_segment(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, int seg) +{ + struct decode_cache *c = &ctxt->decode; + unsigned short sel; + int rc; + + memcpy(&sel, c->src.valptr + c->op_bytes, 2); + + rc = load_segment_descriptor(ctxt, ops, sel, seg); + if (rc != X86EMUL_CONTINUE) + return rc; + + c->dst.val = c->src.val; + return rc; +} + static inline void setup_syscalls_segments(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops, struct desc_struct *cs, @@ -2160,9 +1840,15 @@ static bool emulator_io_permited(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops, u16 port, u16 len) { + if (ctxt->perm_ok) + return true; + if (emulator_bad_iopl(ctxt, ops)) if (!emulator_io_port_access_allowed(ctxt, ops, port, len)) return false; + + ctxt->perm_ok = true; + return true; } @@ -2254,7 +1940,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt, &err); if (ret == X86EMUL_PROPAGATE_FAULT) { /* FIXME: need to provide precise fault address */ - emulate_pf(ctxt, old_tss_base, err); + emulate_pf(ctxt); return ret; } @@ -2264,7 +1950,7 @@ static int task_switch_16(struct x86_emulate_ctxt 
*ctxt, &err); if (ret == X86EMUL_PROPAGATE_FAULT) { /* FIXME: need to provide precise fault address */ - emulate_pf(ctxt, old_tss_base, err); + emulate_pf(ctxt); return ret; } @@ -2272,7 +1958,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt, &err); if (ret == X86EMUL_PROPAGATE_FAULT) { /* FIXME: need to provide precise fault address */ - emulate_pf(ctxt, new_tss_base, err); + emulate_pf(ctxt); return ret; } @@ -2285,7 +1971,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt, ctxt->vcpu, &err); if (ret == X86EMUL_PROPAGATE_FAULT) { /* FIXME: need to provide precise fault address */ - emulate_pf(ctxt, new_tss_base, err); + emulate_pf(ctxt); return ret; } } @@ -2396,7 +2082,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt, &err); if (ret == X86EMUL_PROPAGATE_FAULT) { /* FIXME: need to provide precise fault address */ - emulate_pf(ctxt, old_tss_base, err); + emulate_pf(ctxt); return ret; } @@ -2406,7 +2092,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt, &err); if (ret == X86EMUL_PROPAGATE_FAULT) { /* FIXME: need to provide precise fault address */ - emulate_pf(ctxt, old_tss_base, err); + emulate_pf(ctxt); return ret; } @@ -2414,7 +2100,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt, &err); if (ret == X86EMUL_PROPAGATE_FAULT) { /* FIXME: need to provide precise fault address */ - emulate_pf(ctxt, new_tss_base, err); + emulate_pf(ctxt); return ret; } @@ -2427,7 +2113,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt, ctxt->vcpu, &err); if (ret == X86EMUL_PROPAGATE_FAULT) { /* FIXME: need to provide precise fault address */ - emulate_pf(ctxt, new_tss_base, err); + emulate_pf(ctxt); return ret; } } @@ -2523,10 +2209,10 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, } int emulator_task_switch(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, u16 tss_selector, int reason, bool has_error_code, u32 error_code) { + struct x86_emulate_ops *ops = ctxt->ops; struct 
decode_cache *c = &ctxt->decode; int rc; @@ -2552,16 +2238,784 @@ static void string_addr_inc(struct x86_emulate_ctxt *ctxt, unsigned long base, int df = (ctxt->eflags & EFLG_DF) ? -1 : 1; register_address_increment(c, &c->regs[reg], df * op->bytes); - op->ptr = (unsigned long *)register_address(c, base, c->regs[reg]); + op->addr.mem = register_address(c, base, c->regs[reg]); +} + +static int em_push(struct x86_emulate_ctxt *ctxt) +{ + emulate_push(ctxt, ctxt->ops); + return X86EMUL_CONTINUE; +} + +static int em_das(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + u8 al, old_al; + bool af, cf, old_cf; + + cf = ctxt->eflags & X86_EFLAGS_CF; + al = c->dst.val; + + old_al = al; + old_cf = cf; + cf = false; + af = ctxt->eflags & X86_EFLAGS_AF; + if ((al & 0x0f) > 9 || af) { + al -= 6; + cf = old_cf | (al >= 250); + af = true; + } else { + af = false; + } + if (old_al > 0x99 || old_cf) { + al -= 0x60; + cf = true; + } + + c->dst.val = al; + /* Set PF, ZF, SF */ + c->src.type = OP_IMM; + c->src.val = 0; + c->src.bytes = 1; + emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags); + ctxt->eflags &= ~(X86_EFLAGS_AF | X86_EFLAGS_CF); + if (cf) + ctxt->eflags |= X86_EFLAGS_CF; + if (af) + ctxt->eflags |= X86_EFLAGS_AF; + return X86EMUL_CONTINUE; +} + +static int em_call_far(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + u16 sel, old_cs; + ulong old_eip; + int rc; + + old_cs = ctxt->ops->get_segment_selector(VCPU_SREG_CS, ctxt->vcpu); + old_eip = c->eip; + + memcpy(&sel, c->src.valptr + c->op_bytes, 2); + if (load_segment_descriptor(ctxt, ctxt->ops, sel, VCPU_SREG_CS)) + return X86EMUL_CONTINUE; + + c->eip = 0; + memcpy(&c->eip, c->src.valptr, c->op_bytes); + + c->src.val = old_cs; + emulate_push(ctxt, ctxt->ops); + rc = writeback(ctxt, ctxt->ops); + if (rc != X86EMUL_CONTINUE) + return rc; + + c->src.val = old_eip; + emulate_push(ctxt, ctxt->ops); + rc = writeback(ctxt, ctxt->ops); + if (rc != X86EMUL_CONTINUE) + 
return rc; + + c->dst.type = OP_NONE; + + return X86EMUL_CONTINUE; +} + +static int em_ret_near_imm(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + int rc; + + c->dst.type = OP_REG; + c->dst.addr.reg = &c->eip; + c->dst.bytes = c->op_bytes; + rc = emulate_pop(ctxt, ctxt->ops, &c->dst.val, c->op_bytes); + if (rc != X86EMUL_CONTINUE) + return rc; + register_address_increment(c, &c->regs[VCPU_REGS_RSP], c->src.val); + return X86EMUL_CONTINUE; +} + +static int em_imul(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + + emulate_2op_SrcV_nobyte("imul", c->src, c->dst, ctxt->eflags); + return X86EMUL_CONTINUE; +} + +static int em_imul_3op(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + + c->dst.val = c->src2.val; + return em_imul(ctxt); +} + +static int em_cwd(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + + c->dst.type = OP_REG; + c->dst.bytes = c->src.bytes; + c->dst.addr.reg = &c->regs[VCPU_REGS_RDX]; + c->dst.val = ~((c->src.val >> (c->src.bytes * 8 - 1)) - 1); + + return X86EMUL_CONTINUE; +} + +static int em_rdtsc(struct x86_emulate_ctxt *ctxt) +{ + unsigned cpl = ctxt->ops->cpl(ctxt->vcpu); + struct decode_cache *c = &ctxt->decode; + u64 tsc = 0; + + if (cpl > 0 && (ctxt->ops->get_cr(4, ctxt->vcpu) & X86_CR4_TSD)) { + emulate_gp(ctxt, 0); + return X86EMUL_PROPAGATE_FAULT; + } + ctxt->ops->get_msr(ctxt->vcpu, MSR_IA32_TSC, &tsc); + c->regs[VCPU_REGS_RAX] = (u32)tsc; + c->regs[VCPU_REGS_RDX] = tsc >> 32; + return X86EMUL_CONTINUE; +} + +static int em_mov(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + c->dst.val = c->src.val; + return X86EMUL_CONTINUE; +} + +#define D(_y) { .flags = (_y) } +#define N D(0) +#define G(_f, _g) { .flags = ((_f) | Group), .u.group = (_g) } +#define GD(_f, _g) { .flags = ((_f) | Group | GroupDual), .u.gdual = (_g) } +#define I(_f, _e) { .flags = (_f), .u.execute = (_e) } + +#define D2bv(_f) 
D((_f) | ByteOp), D(_f) +#define I2bv(_f, _e) I((_f) | ByteOp, _e), I(_f, _e) + +#define D6ALU(_f) D2bv((_f) | DstMem | SrcReg | ModRM), \ + D2bv(((_f) | DstReg | SrcMem | ModRM) & ~Lock), \ + D2bv(((_f) & ~Lock) | DstAcc | SrcImm) + + +static struct opcode group1[] = { + X7(D(Lock)), N +}; + +static struct opcode group1A[] = { + D(DstMem | SrcNone | ModRM | Mov | Stack), N, N, N, N, N, N, N, +}; + +static struct opcode group3[] = { + D(DstMem | SrcImm | ModRM), D(DstMem | SrcImm | ModRM), + D(DstMem | SrcNone | ModRM | Lock), D(DstMem | SrcNone | ModRM | Lock), + X4(D(SrcMem | ModRM)), +}; + +static struct opcode group4[] = { + D(ByteOp | DstMem | SrcNone | ModRM | Lock), D(ByteOp | DstMem | SrcNone | ModRM | Lock), + N, N, N, N, N, N, +}; + +static struct opcode group5[] = { + D(DstMem | SrcNone | ModRM | Lock), D(DstMem | SrcNone | ModRM | Lock), + D(SrcMem | ModRM | Stack), + I(SrcMemFAddr | ModRM | ImplicitOps | Stack, em_call_far), + D(SrcMem | ModRM | Stack), D(SrcMemFAddr | ModRM | ImplicitOps), + D(SrcMem | ModRM | Stack), N, +}; + +static struct group_dual group7 = { { + N, N, D(ModRM | SrcMem | Priv), D(ModRM | SrcMem | Priv), + D(SrcNone | ModRM | DstMem | Mov), N, + D(SrcMem16 | ModRM | Mov | Priv), + D(SrcMem | ModRM | ByteOp | Priv | NoAccess), +}, { + D(SrcNone | ModRM | Priv), N, N, D(SrcNone | ModRM | Priv), + D(SrcNone | ModRM | DstMem | Mov), N, + D(SrcMem16 | ModRM | Mov | Priv), N, +} }; + +static struct opcode group8[] = { + N, N, N, N, + D(DstMem | SrcImmByte | ModRM), D(DstMem | SrcImmByte | ModRM | Lock), + D(DstMem | SrcImmByte | ModRM | Lock), D(DstMem | SrcImmByte | ModRM | Lock), +}; + +static struct group_dual group9 = { { + N, D(DstMem64 | ModRM | Lock), N, N, N, N, N, N, +}, { + N, N, N, N, N, N, N, N, +} }; + +static struct opcode group11[] = { + I(DstMem | SrcImm | ModRM | Mov, em_mov), X7(D(Undefined)), +}; + +static struct opcode opcode_table[256] = { + /* 0x00 - 0x07 */ + D6ALU(Lock), + D(ImplicitOps | Stack | No64), 
D(ImplicitOps | Stack | No64), + /* 0x08 - 0x0F */ + D6ALU(Lock), + D(ImplicitOps | Stack | No64), N, + /* 0x10 - 0x17 */ + D6ALU(Lock), + D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64), + /* 0x18 - 0x1F */ + D6ALU(Lock), + D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64), + /* 0x20 - 0x27 */ + D6ALU(Lock), N, N, + /* 0x28 - 0x2F */ + D6ALU(Lock), N, I(ByteOp | DstAcc | No64, em_das), + /* 0x30 - 0x37 */ + D6ALU(Lock), N, N, + /* 0x38 - 0x3F */ + D6ALU(0), N, N, + /* 0x40 - 0x4F */ + X16(D(DstReg)), + /* 0x50 - 0x57 */ + X8(I(SrcReg | Stack, em_push)), + /* 0x58 - 0x5F */ + X8(D(DstReg | Stack)), + /* 0x60 - 0x67 */ + D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64), + N, D(DstReg | SrcMem32 | ModRM | Mov) /* movsxd (x86/64) */ , + N, N, N, N, + /* 0x68 - 0x6F */ + I(SrcImm | Mov | Stack, em_push), + I(DstReg | SrcMem | ModRM | Src2Imm, em_imul_3op), + I(SrcImmByte | Mov | Stack, em_push), + I(DstReg | SrcMem | ModRM | Src2ImmByte, em_imul_3op), + D2bv(DstDI | Mov | String), /* insb, insw/insd */ + D2bv(SrcSI | ImplicitOps | String), /* outsb, outsw/outsd */ + /* 0x70 - 0x7F */ + X16(D(SrcImmByte)), + /* 0x80 - 0x87 */ + G(ByteOp | DstMem | SrcImm | ModRM | Group, group1), + G(DstMem | SrcImm | ModRM | Group, group1), + G(ByteOp | DstMem | SrcImm | ModRM | No64 | Group, group1), + G(DstMem | SrcImmByte | ModRM | Group, group1), + D2bv(DstMem | SrcReg | ModRM), D2bv(DstMem | SrcReg | ModRM | Lock), + /* 0x88 - 0x8F */ + I2bv(DstMem | SrcReg | ModRM | Mov, em_mov), + I2bv(DstReg | SrcMem | ModRM | Mov, em_mov), + D(DstMem | SrcNone | ModRM | Mov), D(ModRM | SrcMem | NoAccess | DstReg), + D(ImplicitOps | SrcMem16 | ModRM), G(0, group1A), + /* 0x90 - 0x97 */ + X8(D(SrcAcc | DstReg)), + /* 0x98 - 0x9F */ + D(DstAcc | SrcNone), I(ImplicitOps | SrcAcc, em_cwd), + I(SrcImmFAddr | No64, em_call_far), N, + D(ImplicitOps | Stack), D(ImplicitOps | Stack), N, N, + /* 0xA0 - 0xA7 */ + I2bv(DstAcc | SrcMem | Mov | MemAbs, em_mov), + 
I2bv(DstMem | SrcAcc | Mov | MemAbs, em_mov), + I2bv(SrcSI | DstDI | Mov | String, em_mov), + D2bv(SrcSI | DstDI | String), + /* 0xA8 - 0xAF */ + D2bv(DstAcc | SrcImm), + I2bv(SrcAcc | DstDI | Mov | String, em_mov), + I2bv(SrcSI | DstAcc | Mov | String, em_mov), + D2bv(SrcAcc | DstDI | String), + /* 0xB0 - 0xB7 */ + X8(I(ByteOp | DstReg | SrcImm | Mov, em_mov)), + /* 0xB8 - 0xBF */ + X8(I(DstReg | SrcImm | Mov, em_mov)), + /* 0xC0 - 0xC7 */ + D2bv(DstMem | SrcImmByte | ModRM), + I(ImplicitOps | Stack | SrcImmU16, em_ret_near_imm), + D(ImplicitOps | Stack), + D(DstReg | SrcMemFAddr | ModRM | No64), D(DstReg | SrcMemFAddr | ModRM | No64), + G(ByteOp, group11), G(0, group11), + /* 0xC8 - 0xCF */ + N, N, N, D(ImplicitOps | Stack), + D(ImplicitOps), D(SrcImmByte), D(ImplicitOps | No64), D(ImplicitOps), + /* 0xD0 - 0xD7 */ + D2bv(DstMem | SrcOne | ModRM), D2bv(DstMem | ModRM), + N, N, N, N, + /* 0xD8 - 0xDF */ + N, N, N, N, N, N, N, N, + /* 0xE0 - 0xE7 */ + X4(D(SrcImmByte)), + D2bv(SrcImmUByte | DstAcc), D2bv(SrcAcc | DstImmUByte), + /* 0xE8 - 0xEF */ + D(SrcImm | Stack), D(SrcImm | ImplicitOps), + D(SrcImmFAddr | No64), D(SrcImmByte | ImplicitOps), + D2bv(SrcNone | DstAcc), D2bv(SrcAcc | ImplicitOps), + /* 0xF0 - 0xF7 */ + N, N, N, N, + D(ImplicitOps | Priv), D(ImplicitOps), G(ByteOp, group3), G(0, group3), + /* 0xF8 - 0xFF */ + D(ImplicitOps), D(ImplicitOps), D(ImplicitOps), D(ImplicitOps), + D(ImplicitOps), D(ImplicitOps), G(0, group4), G(0, group5), +}; + +static struct opcode twobyte_table[256] = { + /* 0x00 - 0x0F */ + N, GD(0, &group7), N, N, + N, D(ImplicitOps), D(ImplicitOps | Priv), N, + D(ImplicitOps | Priv), D(ImplicitOps | Priv), N, N, + N, D(ImplicitOps | ModRM), N, N, + /* 0x10 - 0x1F */ + N, N, N, N, N, N, N, N, D(ImplicitOps | ModRM), N, N, N, N, N, N, N, + /* 0x20 - 0x2F */ + D(ModRM | DstMem | Priv | Op3264), D(ModRM | DstMem | Priv | Op3264), + D(ModRM | SrcMem | Priv | Op3264), D(ModRM | SrcMem | Priv | Op3264), + N, N, N, N, + N, N, N, N, N, N, N, 
N, + /* 0x30 - 0x3F */ + D(ImplicitOps | Priv), I(ImplicitOps, em_rdtsc), + D(ImplicitOps | Priv), N, + D(ImplicitOps), D(ImplicitOps | Priv), N, N, + N, N, N, N, N, N, N, N, + /* 0x40 - 0x4F */ + X16(D(DstReg | SrcMem | ModRM | Mov)), + /* 0x50 - 0x5F */ + N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, + /* 0x60 - 0x6F */ + N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, + /* 0x70 - 0x7F */ + N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, + /* 0x80 - 0x8F */ + X16(D(SrcImm)), + /* 0x90 - 0x9F */ + X16(D(ByteOp | DstMem | SrcNone | ModRM| Mov)), + /* 0xA0 - 0xA7 */ + D(ImplicitOps | Stack), D(ImplicitOps | Stack), + N, D(DstMem | SrcReg | ModRM | BitOp), + D(DstMem | SrcReg | Src2ImmByte | ModRM), + D(DstMem | SrcReg | Src2CL | ModRM), N, N, + /* 0xA8 - 0xAF */ + D(ImplicitOps | Stack), D(ImplicitOps | Stack), + N, D(DstMem | SrcReg | ModRM | BitOp | Lock), + D(DstMem | SrcReg | Src2ImmByte | ModRM), + D(DstMem | SrcReg | Src2CL | ModRM), + D(ModRM), I(DstReg | SrcMem | ModRM, em_imul), + /* 0xB0 - 0xB7 */ + D2bv(DstMem | SrcReg | ModRM | Lock), + D(DstReg | SrcMemFAddr | ModRM), D(DstMem | SrcReg | ModRM | BitOp | Lock), + D(DstReg | SrcMemFAddr | ModRM), D(DstReg | SrcMemFAddr | ModRM), + D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov), + /* 0xB8 - 0xBF */ + N, N, + G(BitOp, group8), D(DstMem | SrcReg | ModRM | BitOp | Lock), + D(DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM), + D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov), + /* 0xC0 - 0xCF */ + D2bv(DstMem | SrcReg | ModRM | Lock), + N, D(DstMem | SrcReg | ModRM | Mov), + N, N, N, GD(0, &group9), + N, N, N, N, N, N, N, N, + /* 0xD0 - 0xDF */ + N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, + /* 0xE0 - 0xEF */ + N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, + /* 0xF0 - 0xFF */ + N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N +}; + +#undef D +#undef N +#undef G +#undef GD +#undef I + +#undef D2bv +#undef I2bv +#undef D6ALU + +static unsigned 
imm_size(struct decode_cache *c) +{ + unsigned size; + + size = (c->d & ByteOp) ? 1 : c->op_bytes; + if (size == 8) + size = 4; + return size; +} + +static int decode_imm(struct x86_emulate_ctxt *ctxt, struct operand *op, + unsigned size, bool sign_extension) +{ + struct decode_cache *c = &ctxt->decode; + struct x86_emulate_ops *ops = ctxt->ops; + int rc = X86EMUL_CONTINUE; + + op->type = OP_IMM; + op->bytes = size; + op->addr.mem = c->eip; + /* NB. Immediates are sign-extended as necessary. */ + switch (op->bytes) { + case 1: + op->val = insn_fetch(s8, 1, c->eip); + break; + case 2: + op->val = insn_fetch(s16, 2, c->eip); + break; + case 4: + op->val = insn_fetch(s32, 4, c->eip); + break; + } + if (!sign_extension) { + switch (op->bytes) { + case 1: + op->val &= 0xff; + break; + case 2: + op->val &= 0xffff; + break; + case 4: + op->val &= 0xffffffff; + break; + } + } +done: + return rc; } int -x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) +x86_decode_insn(struct x86_emulate_ctxt *ctxt) { + struct x86_emulate_ops *ops = ctxt->ops; + struct decode_cache *c = &ctxt->decode; + int rc = X86EMUL_CONTINUE; + int mode = ctxt->mode; + int def_op_bytes, def_ad_bytes, dual, goffset; + struct opcode opcode, *g_mod012, *g_mod3; + struct operand memop = { .type = OP_NONE }; + + c->eip = ctxt->eip; + c->fetch.start = c->fetch.end = c->eip; + ctxt->cs_base = seg_base(ctxt, ops, VCPU_SREG_CS); + + switch (mode) { + case X86EMUL_MODE_REAL: + case X86EMUL_MODE_VM86: + case X86EMUL_MODE_PROT16: + def_op_bytes = def_ad_bytes = 2; + break; + case X86EMUL_MODE_PROT32: + def_op_bytes = def_ad_bytes = 4; + break; +#ifdef CONFIG_X86_64 + case X86EMUL_MODE_PROT64: + def_op_bytes = 4; + def_ad_bytes = 8; + break; +#endif + default: + return -1; + } + + c->op_bytes = def_op_bytes; + c->ad_bytes = def_ad_bytes; + + /* Legacy prefixes. 
*/ + for (;;) { + switch (c->b = insn_fetch(u8, 1, c->eip)) { + case 0x66: /* operand-size override */ + /* switch between 2/4 bytes */ + c->op_bytes = def_op_bytes ^ 6; + break; + case 0x67: /* address-size override */ + if (mode == X86EMUL_MODE_PROT64) + /* switch between 4/8 bytes */ + c->ad_bytes = def_ad_bytes ^ 12; + else + /* switch between 2/4 bytes */ + c->ad_bytes = def_ad_bytes ^ 6; + break; + case 0x26: /* ES override */ + case 0x2e: /* CS override */ + case 0x36: /* SS override */ + case 0x3e: /* DS override */ + set_seg_override(c, (c->b >> 3) & 3); + break; + case 0x64: /* FS override */ + case 0x65: /* GS override */ + set_seg_override(c, c->b & 7); + break; + case 0x40 ... 0x4f: /* REX */ + if (mode != X86EMUL_MODE_PROT64) + goto done_prefixes; + c->rex_prefix = c->b; + continue; + case 0xf0: /* LOCK */ + c->lock_prefix = 1; + break; + case 0xf2: /* REPNE/REPNZ */ + c->rep_prefix = REPNE_PREFIX; + break; + case 0xf3: /* REP/REPE/REPZ */ + c->rep_prefix = REPE_PREFIX; + break; + default: + goto done_prefixes; + } + + /* Any legacy prefix after a REX prefix nullifies its effect. */ + + c->rex_prefix = 0; + } + +done_prefixes: + + /* REX prefix. */ + if (c->rex_prefix & 8) + c->op_bytes = 8; /* REX.W */ + + /* Opcode byte(s). */ + opcode = opcode_table[c->b]; + /* Two-byte opcode? */ + if (c->b == 0x0f) { + c->twobyte = 1; + c->b = insn_fetch(u8, 1, c->eip); + opcode = twobyte_table[c->b]; + } + c->d = opcode.flags; + + if (c->d & Group) { + dual = c->d & GroupDual; + c->modrm = insn_fetch(u8, 1, c->eip); + --c->eip; + + if (c->d & GroupDual) { + g_mod012 = opcode.u.gdual->mod012; + g_mod3 = opcode.u.gdual->mod3; + } else + g_mod012 = g_mod3 = opcode.u.group; + + c->d &= ~(Group | GroupDual); + + goffset = (c->modrm >> 3) & 7; + + if ((c->modrm >> 6) == 3) + opcode = g_mod3[goffset]; + else + opcode = g_mod012[goffset]; + c->d |= opcode.flags; + } + + c->execute = opcode.u.execute; + + /* Unrecognised? 
*/ + if (c->d == 0 || (c->d & Undefined)) { + DPRINTF("Cannot emulate %02x\n", c->b); + return -1; + } + + if (mode == X86EMUL_MODE_PROT64 && (c->d & Stack)) + c->op_bytes = 8; + + if (c->d & Op3264) { + if (mode == X86EMUL_MODE_PROT64) + c->op_bytes = 8; + else + c->op_bytes = 4; + } + + /* ModRM and SIB bytes. */ + if (c->d & ModRM) { + rc = decode_modrm(ctxt, ops, &memop); + if (!c->has_seg_override) + set_seg_override(c, c->modrm_seg); + } else if (c->d & MemAbs) + rc = decode_abs(ctxt, ops, &memop); + if (rc != X86EMUL_CONTINUE) + goto done; + + if (!c->has_seg_override) + set_seg_override(c, VCPU_SREG_DS); + + if (memop.type == OP_MEM && !(!c->twobyte && c->b == 0x8d)) + memop.addr.mem += seg_override_base(ctxt, ops, c); + + if (memop.type == OP_MEM && c->ad_bytes != 8) + memop.addr.mem = (u32)memop.addr.mem; + + if (memop.type == OP_MEM && c->rip_relative) + memop.addr.mem += c->eip; + + /* + * Decode and fetch the source operand: register, memory + * or immediate. + */ + switch (c->d & SrcMask) { + case SrcNone: + break; + case SrcReg: + decode_register_operand(&c->src, c, 0); + break; + case SrcMem16: + memop.bytes = 2; + goto srcmem_common; + case SrcMem32: + memop.bytes = 4; + goto srcmem_common; + case SrcMem: + memop.bytes = (c->d & ByteOp) ? 1 : + c->op_bytes; + srcmem_common: + c->src = memop; + break; + case SrcImmU16: + rc = decode_imm(ctxt, &c->src, 2, false); + break; + case SrcImm: + rc = decode_imm(ctxt, &c->src, imm_size(c), true); + break; + case SrcImmU: + rc = decode_imm(ctxt, &c->src, imm_size(c), false); + break; + case SrcImmByte: + rc = decode_imm(ctxt, &c->src, 1, true); + break; + case SrcImmUByte: + rc = decode_imm(ctxt, &c->src, 1, false); + break; + case SrcAcc: + c->src.type = OP_REG; + c->src.bytes = (c->d & ByteOp) ? 
1 : c->op_bytes; + c->src.addr.reg = &c->regs[VCPU_REGS_RAX]; + fetch_register_operand(&c->src); + break; + case SrcOne: + c->src.bytes = 1; + c->src.val = 1; + break; + case SrcSI: + c->src.type = OP_MEM; + c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; + c->src.addr.mem = + register_address(c, seg_override_base(ctxt, ops, c), + c->regs[VCPU_REGS_RSI]); + c->src.val = 0; + break; + case SrcImmFAddr: + c->src.type = OP_IMM; + c->src.addr.mem = c->eip; + c->src.bytes = c->op_bytes + 2; + insn_fetch_arr(c->src.valptr, c->src.bytes, c->eip); + break; + case SrcMemFAddr: + memop.bytes = c->op_bytes + 2; + goto srcmem_common; + break; + } + + if (rc != X86EMUL_CONTINUE) + goto done; + + /* + * Decode and fetch the second source operand: register, memory + * or immediate. + */ + switch (c->d & Src2Mask) { + case Src2None: + break; + case Src2CL: + c->src2.bytes = 1; + c->src2.val = c->regs[VCPU_REGS_RCX] & 0x8; + break; + case Src2ImmByte: + rc = decode_imm(ctxt, &c->src2, 1, true); + break; + case Src2One: + c->src2.bytes = 1; + c->src2.val = 1; + break; + case Src2Imm: + rc = decode_imm(ctxt, &c->src2, imm_size(c), true); + break; + } + + if (rc != X86EMUL_CONTINUE) + goto done; + + /* Decode and fetch the destination operand: register or memory. */ + switch (c->d & DstMask) { + case DstReg: + decode_register_operand(&c->dst, c, + c->twobyte && (c->b == 0xb6 || c->b == 0xb7)); + break; + case DstImmUByte: + c->dst.type = OP_IMM; + c->dst.addr.mem = c->eip; + c->dst.bytes = 1; + c->dst.val = insn_fetch(u8, 1, c->eip); + break; + case DstMem: + case DstMem64: + c->dst = memop; + if ((c->d & DstMask) == DstMem64) + c->dst.bytes = 8; + else + c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; + if (c->d & BitOp) + fetch_bit_operand(c); + c->dst.orig_val = c->dst.val; + break; + case DstAcc: + c->dst.type = OP_REG; + c->dst.bytes = (c->d & ByteOp) ? 
1 : c->op_bytes; + c->dst.addr.reg = &c->regs[VCPU_REGS_RAX]; + fetch_register_operand(&c->dst); + c->dst.orig_val = c->dst.val; + break; + case DstDI: + c->dst.type = OP_MEM; + c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; + c->dst.addr.mem = + register_address(c, es_base(ctxt, ops), + c->regs[VCPU_REGS_RDI]); + c->dst.val = 0; + break; + case ImplicitOps: + /* Special instructions do their own operand decoding. */ + default: + c->dst.type = OP_NONE; /* Disable writeback. */ + return 0; + } + +done: + return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0; +} + +static bool string_insn_completed(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + + /* The second termination condition only applies for REPE + * and REPNE. Test if the repeat string operation prefix is + * REPE/REPZ or REPNE/REPNZ and if it's the case it tests the + * corresponding termination condition according to: + * - if REPE/REPZ and ZF = 0 then done + * - if REPNE/REPNZ and ZF = 1 then done + */ + if (((c->b == 0xa6) || (c->b == 0xa7) || + (c->b == 0xae) || (c->b == 0xaf)) + && (((c->rep_prefix == REPE_PREFIX) && + ((ctxt->eflags & EFLG_ZF) == 0)) + || ((c->rep_prefix == REPNE_PREFIX) && + ((ctxt->eflags & EFLG_ZF) == EFLG_ZF)))) + return true; + + return false; +} + +int +x86_emulate_insn(struct x86_emulate_ctxt *ctxt) +{ + struct x86_emulate_ops *ops = ctxt->ops; u64 msr_data; struct decode_cache *c = &ctxt->decode; int rc = X86EMUL_CONTINUE; int saved_dst_type = c->dst.type; + int irq; /* Used for int 3, int, and into */ ctxt->decode.mem_read.pos = 0; @@ -2576,6 +3030,11 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) goto done; } + if ((c->d & SrcMask) == SrcMemFAddr && c->src.type != OP_MEM) { + emulate_ud(ctxt); + goto done; + } + /* Privileged instruction can be executed only in CPL=0 */ if ((c->d & Priv) && ops->cpl(ctxt->vcpu)) { emulate_gp(ctxt, 0); @@ -2583,35 +3042,15 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct 
x86_emulate_ops *ops) } if (c->rep_prefix && (c->d & String)) { - ctxt->restart = true; /* All REP prefixes have the same first termination condition */ if (address_mask(c, c->regs[VCPU_REGS_RCX]) == 0) { - string_done: - ctxt->restart = false; ctxt->eip = c->eip; goto done; } - /* The second termination condition only applies for REPE - * and REPNE. Test if the repeat string operation prefix is - * REPE/REPZ or REPNE/REPNZ and if it's the case it tests the - * corresponding termination condition according to: - * - if REPE/REPZ and ZF = 0 then done - * - if REPNE/REPNZ and ZF = 1 then done - */ - if ((c->b == 0xa6) || (c->b == 0xa7) || - (c->b == 0xae) || (c->b == 0xaf)) { - if ((c->rep_prefix == REPE_PREFIX) && - ((ctxt->eflags & EFLG_ZF) == 0)) - goto string_done; - if ((c->rep_prefix == REPNE_PREFIX) && - ((ctxt->eflags & EFLG_ZF) == EFLG_ZF)) - goto string_done; - } - c->eip = ctxt->eip; } - if (c->src.type == OP_MEM) { - rc = read_emulated(ctxt, ops, (unsigned long)c->src.ptr, + if ((c->src.type == OP_MEM) && !(c->d & NoAccess)) { + rc = read_emulated(ctxt, ops, c->src.addr.mem, c->src.valptr, c->src.bytes); if (rc != X86EMUL_CONTINUE) goto done; @@ -2619,7 +3058,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) } if (c->src2.type == OP_MEM) { - rc = read_emulated(ctxt, ops, (unsigned long)c->src2.ptr, + rc = read_emulated(ctxt, ops, c->src2.addr.mem, &c->src2.val, c->src2.bytes); if (rc != X86EMUL_CONTINUE) goto done; @@ -2631,7 +3070,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) if ((c->dst.type == OP_MEM) && !(c->d & Mov)) { /* optimisation - avoid slow emulated read if Mov */ - rc = read_emulated(ctxt, ops, (unsigned long)c->dst.ptr, + rc = read_emulated(ctxt, ops, c->dst.addr.mem, &c->dst.val, c->dst.bytes); if (rc != X86EMUL_CONTINUE) goto done; @@ -2640,6 +3079,13 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) special_insn: + if (c->execute) { + rc = 
c->execute(ctxt); + if (rc != X86EMUL_CONTINUE) + goto done; + goto writeback; + } + if (c->twobyte) goto twobyte_insn; @@ -2653,8 +3099,6 @@ special_insn: break; case 0x07: /* pop es */ rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_ES); - if (rc != X86EMUL_CONTINUE) - goto done; break; case 0x08 ... 0x0d: or: /* or */ @@ -2672,8 +3116,6 @@ special_insn: break; case 0x17: /* pop ss */ rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_SS); - if (rc != X86EMUL_CONTINUE) - goto done; break; case 0x18 ... 0x1d: sbb: /* sbb */ @@ -2684,8 +3126,6 @@ special_insn: break; case 0x1f: /* pop ds */ rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_DS); - if (rc != X86EMUL_CONTINUE) - goto done; break; case 0x20 ... 0x25: and: /* and */ @@ -2709,58 +3149,29 @@ special_insn: case 0x48 ... 0x4f: /* dec r16/r32 */ emulate_1op("dec", c->dst, ctxt->eflags); break; - case 0x50 ... 0x57: /* push reg */ - emulate_push(ctxt, ops); - break; case 0x58 ... 0x5f: /* pop reg */ pop_instruction: rc = emulate_pop(ctxt, ops, &c->dst.val, c->op_bytes); - if (rc != X86EMUL_CONTINUE) - goto done; break; case 0x60: /* pusha */ rc = emulate_pusha(ctxt, ops); - if (rc != X86EMUL_CONTINUE) - goto done; break; case 0x61: /* popa */ rc = emulate_popa(ctxt, ops); - if (rc != X86EMUL_CONTINUE) - goto done; break; case 0x63: /* movsxd */ if (ctxt->mode != X86EMUL_MODE_PROT64) goto cannot_emulate; c->dst.val = (s32) c->src.val; break; - case 0x68: /* push imm */ - case 0x6a: /* push imm8 */ - emulate_push(ctxt, ops); - break; case 0x6c: /* insb */ case 0x6d: /* insw/insd */ - c->dst.bytes = min(c->dst.bytes, 4u); - if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX], - c->dst.bytes)) { - emulate_gp(ctxt, 0); - goto done; - } - if (!pio_in_emulated(ctxt, ops, c->dst.bytes, - c->regs[VCPU_REGS_RDX], &c->dst.val)) - goto done; /* IO is needed, skip writeback */ - break; + c->src.val = c->regs[VCPU_REGS_RDX]; + goto do_io_in; case 0x6e: /* outsb */ case 0x6f: /* outsw/outsd */ - c->src.bytes = min(c->src.bytes, 4u); - 
if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX], - c->src.bytes)) { - emulate_gp(ctxt, 0); - goto done; - } - ops->pio_out_emulated(c->src.bytes, c->regs[VCPU_REGS_RDX], - &c->src.val, 1, ctxt->vcpu); - - c->dst.type = OP_NONE; /* nothing to writeback */ + c->dst.val = c->regs[VCPU_REGS_RDX]; + goto do_io_out; break; case 0x70 ... 0x7f: /* jcc (short) */ if (test_cc(c->b, ctxt->eflags)) @@ -2793,29 +3204,15 @@ special_insn: case 0x86 ... 0x87: /* xchg */ xchg: /* Write back the register source. */ - switch (c->dst.bytes) { - case 1: - *(u8 *) c->src.ptr = (u8) c->dst.val; - break; - case 2: - *(u16 *) c->src.ptr = (u16) c->dst.val; - break; - case 4: - *c->src.ptr = (u32) c->dst.val; - break; /* 64b reg: zero-extend */ - case 8: - *c->src.ptr = c->dst.val; - break; - } + c->src.val = c->dst.val; + write_register_operand(&c->src); /* * Write back the memory destination with implicit LOCK * prefix. */ - c->dst.val = c->src.val; + c->dst.val = c->src.orig_val; c->lock_prefix = 1; break; - case 0x88 ... 0x8b: /* mov */ - goto mov; case 0x8c: /* mov r/m, sreg */ if (c->modrm_reg > VCPU_SREG_GS) { emulate_ud(ctxt); @@ -2824,7 +3221,7 @@ special_insn: c->dst.val = ops->get_segment_selector(c->modrm_reg, ctxt->vcpu); break; case 0x8d: /* lea r16/r32, m */ - c->dst.val = c->modrm_ea; + c->dst.val = c->src.addr.mem; break; case 0x8e: { /* mov seg, r/m16 */ uint16_t sel; @@ -2847,76 +3244,87 @@ special_insn: } case 0x8f: /* pop (sole member of Grp1a) */ rc = emulate_grp1a(ctxt, ops); - if (rc != X86EMUL_CONTINUE) - goto done; break; - case 0x90: /* nop / xchg r8,rax */ - if (c->dst.ptr == (unsigned long *)&c->regs[VCPU_REGS_RAX]) { - c->dst.type = OP_NONE; /* nop */ + case 0x90 ... 0x97: /* nop / xchg reg, rax */ + if (c->dst.addr.reg == &c->regs[VCPU_REGS_RAX]) break; - } - case 0x91 ... 
0x97: /* xchg reg,rax */ - c->src.type = OP_REG; - c->src.bytes = c->op_bytes; - c->src.ptr = (unsigned long *) &c->regs[VCPU_REGS_RAX]; - c->src.val = *(c->src.ptr); goto xchg; + case 0x98: /* cbw/cwde/cdqe */ + switch (c->op_bytes) { + case 2: c->dst.val = (s8)c->dst.val; break; + case 4: c->dst.val = (s16)c->dst.val; break; + case 8: c->dst.val = (s32)c->dst.val; break; + } + break; case 0x9c: /* pushf */ c->src.val = (unsigned long) ctxt->eflags; emulate_push(ctxt, ops); break; case 0x9d: /* popf */ c->dst.type = OP_REG; - c->dst.ptr = (unsigned long *) &ctxt->eflags; + c->dst.addr.reg = &ctxt->eflags; c->dst.bytes = c->op_bytes; rc = emulate_popf(ctxt, ops, &c->dst.val, c->op_bytes); - if (rc != X86EMUL_CONTINUE) - goto done; break; - case 0xa0 ... 0xa3: /* mov */ - case 0xa4 ... 0xa5: /* movs */ - goto mov; case 0xa6 ... 0xa7: /* cmps */ c->dst.type = OP_NONE; /* Disable writeback. */ - DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.ptr, c->dst.ptr); + DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.addr.mem, c->dst.addr.mem); goto cmp; case 0xa8 ... 0xa9: /* test ax, imm */ goto test; - case 0xaa ... 0xab: /* stos */ - c->dst.val = c->regs[VCPU_REGS_RAX]; - break; - case 0xac ... 0xad: /* lods */ - goto mov; case 0xae ... 0xaf: /* scas */ - DPRINTF("Urk! I don't handle SCAS.\n"); - goto cannot_emulate; - case 0xb0 ... 0xbf: /* mov r, imm */ - goto mov; + goto cmp; case 0xc0 ... 0xc1: emulate_grp2(ctxt); break; case 0xc3: /* ret */ c->dst.type = OP_REG; - c->dst.ptr = &c->eip; + c->dst.addr.reg = &c->eip; c->dst.bytes = c->op_bytes; goto pop_instruction; - case 0xc6 ... 
0xc7: /* mov (sole member of Grp11) */ - mov: - c->dst.val = c->src.val; + case 0xc4: /* les */ + rc = emulate_load_segment(ctxt, ops, VCPU_SREG_ES); + break; + case 0xc5: /* lds */ + rc = emulate_load_segment(ctxt, ops, VCPU_SREG_DS); break; case 0xcb: /* ret far */ rc = emulate_ret_far(ctxt, ops); - if (rc != X86EMUL_CONTINUE) - goto done; + break; + case 0xcc: /* int3 */ + irq = 3; + goto do_interrupt; + case 0xcd: /* int n */ + irq = c->src.val; + do_interrupt: + rc = emulate_int(ctxt, ops, irq); + break; + case 0xce: /* into */ + if (ctxt->eflags & EFLG_OF) { + irq = 4; + goto do_interrupt; + } + break; + case 0xcf: /* iret */ + rc = emulate_iret(ctxt, ops); break; case 0xd0 ... 0xd1: /* Grp2 */ - c->src.val = 1; emulate_grp2(ctxt); break; case 0xd2 ... 0xd3: /* Grp2 */ c->src.val = c->regs[VCPU_REGS_RCX]; emulate_grp2(ctxt); break; + case 0xe0 ... 0xe2: /* loop/loopz/loopnz */ + register_address_increment(c, &c->regs[VCPU_REGS_RCX], -1); + if (address_mask(c, c->regs[VCPU_REGS_RCX]) != 0 && + (c->b == 0xe2 || test_cc(c->b ^ 0x5, ctxt->eflags))) + jmp_rel(c, c->src.val); + break; + case 0xe3: /* jcxz/jecxz/jrcxz */ + if (address_mask(c, c->regs[VCPU_REGS_RCX]) == 0) + jmp_rel(c, c->src.val); + break; case 0xe4: /* inb */ case 0xe5: /* in */ goto do_io_in; @@ -2964,15 +3372,16 @@ special_insn: break; case 0xee: /* out dx,al */ case 0xef: /* out dx,(e/r)ax */ - c->src.val = c->regs[VCPU_REGS_RDX]; + c->dst.val = c->regs[VCPU_REGS_RDX]; do_io_out: - c->dst.bytes = min(c->dst.bytes, 4u); - if (!emulator_io_permited(ctxt, ops, c->src.val, c->dst.bytes)) { + c->src.bytes = min(c->src.bytes, 4u); + if (!emulator_io_permited(ctxt, ops, c->dst.val, + c->src.bytes)) { emulate_gp(ctxt, 0); goto done; } - ops->pio_out_emulated(c->dst.bytes, c->src.val, &c->dst.val, 1, - ctxt->vcpu); + ops->pio_out_emulated(c->src.bytes, c->dst.val, + &c->src.val, 1, ctxt->vcpu); c->dst.type = OP_NONE; /* Disable writeback. 
*/ break; case 0xf4: /* hlt */ @@ -2981,24 +3390,22 @@ special_insn: case 0xf5: /* cmc */ /* complement carry flag from eflags reg */ ctxt->eflags ^= EFLG_CF; - c->dst.type = OP_NONE; /* Disable writeback. */ break; case 0xf6 ... 0xf7: /* Grp3 */ - if (!emulate_grp3(ctxt, ops)) - goto cannot_emulate; + rc = emulate_grp3(ctxt, ops); break; case 0xf8: /* clc */ ctxt->eflags &= ~EFLG_CF; - c->dst.type = OP_NONE; /* Disable writeback. */ + break; + case 0xf9: /* stc */ + ctxt->eflags |= EFLG_CF; break; case 0xfa: /* cli */ if (emulator_bad_iopl(ctxt, ops)) { emulate_gp(ctxt, 0); goto done; - } else { + } else ctxt->eflags &= ~X86_EFLAGS_IF; - c->dst.type = OP_NONE; /* Disable writeback. */ - } break; case 0xfb: /* sti */ if (emulator_bad_iopl(ctxt, ops)) { @@ -3007,29 +3414,29 @@ special_insn: } else { ctxt->interruptibility = KVM_X86_SHADOW_INT_STI; ctxt->eflags |= X86_EFLAGS_IF; - c->dst.type = OP_NONE; /* Disable writeback. */ } break; case 0xfc: /* cld */ ctxt->eflags &= ~EFLG_DF; - c->dst.type = OP_NONE; /* Disable writeback. */ break; case 0xfd: /* std */ ctxt->eflags |= EFLG_DF; - c->dst.type = OP_NONE; /* Disable writeback. */ break; case 0xfe: /* Grp4 */ grp45: rc = emulate_grp45(ctxt, ops); - if (rc != X86EMUL_CONTINUE) - goto done; break; case 0xff: /* Grp5 */ if (c->modrm_reg == 5) goto jump_far; goto grp45; + default: + goto cannot_emulate; } + if (rc != X86EMUL_CONTINUE) + goto done; + writeback: rc = writeback(ctxt, ops); if (rc != X86EMUL_CONTINUE) @@ -3050,25 +3457,32 @@ writeback: &c->dst); if (c->rep_prefix && (c->d & String)) { - struct read_cache *rc = &ctxt->decode.io_read; + struct read_cache *r = &ctxt->decode.io_read; register_address_increment(c, &c->regs[VCPU_REGS_RCX], -1); - /* - * Re-enter guest when pio read ahead buffer is empty or, - * if it is not used, after each 1024 iteration. 
- */ - if ((rc->end == 0 && !(c->regs[VCPU_REGS_RCX] & 0x3ff)) || - (rc->end != 0 && rc->end == rc->pos)) - ctxt->restart = false; + + if (!string_insn_completed(ctxt)) { + /* + * Re-enter guest when pio read ahead buffer is empty + * or, if it is not used, after each 1024 iteration. + */ + if ((r->end != 0 || c->regs[VCPU_REGS_RCX] & 0x3ff) && + (r->end == 0 || r->end != r->pos)) { + /* + * Reset read cache. Usually happens before + * decode, but since instruction is restarted + * we have to do it here. + */ + ctxt->decode.mem_read.end = 0; + return EMULATION_RESTART; + } + goto done; /* skip rip writeback */ + } } - /* - * reset read cache here in case string instruction is restared - * without decoding - */ - ctxt->decode.mem_read.end = 0; + ctxt->eip = c->eip; done: - return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0; + return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK; twobyte_insn: switch (c->b) { @@ -3091,7 +3505,7 @@ twobyte_insn: c->dst.type = OP_NONE; break; case 2: /* lgdt */ - rc = read_descriptor(ctxt, ops, c->src.ptr, + rc = read_descriptor(ctxt, ops, c->src.addr.mem, &size, &address, c->op_bytes); if (rc != X86EMUL_CONTINUE) goto done; @@ -3104,14 +3518,12 @@ twobyte_insn: switch (c->modrm_rm) { case 1: rc = kvm_fix_hypercall(ctxt->vcpu); - if (rc != X86EMUL_CONTINUE) - goto done; break; default: goto cannot_emulate; } } else { - rc = read_descriptor(ctxt, ops, c->src.ptr, + rc = read_descriptor(ctxt, ops, c->src.addr.mem, &size, &address, c->op_bytes); if (rc != X86EMUL_CONTINUE) @@ -3126,7 +3538,7 @@ twobyte_insn: c->dst.val = ops->get_cr(0, ctxt->vcpu); break; case 6: /* lmsw */ - ops->set_cr(0, (ops->get_cr(0, ctxt->vcpu) & ~0x0ful) | + ops->set_cr(0, (ops->get_cr(0, ctxt->vcpu) & ~0x0eul) | (c->src.val & 0x0f), ctxt->vcpu); c->dst.type = OP_NONE; break; @@ -3134,7 +3546,7 @@ twobyte_insn: emulate_ud(ctxt); goto done; case 7: /* invlpg*/ - emulate_invlpg(ctxt->vcpu, c->modrm_ea); + emulate_invlpg(ctxt->vcpu, c->src.addr.mem); /* 
Disable writeback. */ c->dst.type = OP_NONE; break; @@ -3144,23 +3556,16 @@ twobyte_insn: break; case 0x05: /* syscall */ rc = emulate_syscall(ctxt, ops); - if (rc != X86EMUL_CONTINUE) - goto done; - else - goto writeback; break; case 0x06: emulate_clts(ctxt->vcpu); - c->dst.type = OP_NONE; break; case 0x09: /* wbinvd */ kvm_emulate_wbinvd(ctxt->vcpu); - c->dst.type = OP_NONE; break; case 0x08: /* invd */ case 0x0d: /* GrpP (prefetch) */ case 0x18: /* Grp16 (prefetch/nop) */ - c->dst.type = OP_NONE; break; case 0x20: /* mov cr, reg */ switch (c->modrm_reg) { @@ -3170,8 +3575,7 @@ twobyte_insn: emulate_ud(ctxt); goto done; } - c->regs[c->modrm_rm] = ops->get_cr(c->modrm_reg, ctxt->vcpu); - c->dst.type = OP_NONE; /* no writeback */ + c->dst.val = ops->get_cr(c->modrm_reg, ctxt->vcpu); break; case 0x21: /* mov from dr to reg */ if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) && @@ -3179,11 +3583,10 @@ twobyte_insn: emulate_ud(ctxt); goto done; } - ops->get_dr(c->modrm_reg, &c->regs[c->modrm_rm], ctxt->vcpu); - c->dst.type = OP_NONE; /* no writeback */ + ops->get_dr(c->modrm_reg, &c->dst.val, ctxt->vcpu); break; case 0x22: /* mov reg, cr */ - if (ops->set_cr(c->modrm_reg, c->modrm_val, ctxt->vcpu)) { + if (ops->set_cr(c->modrm_reg, c->src.val, ctxt->vcpu)) { emulate_gp(ctxt, 0); goto done; } @@ -3196,7 +3599,7 @@ twobyte_insn: goto done; } - if (ops->set_dr(c->modrm_reg, c->regs[c->modrm_rm] & + if (ops->set_dr(c->modrm_reg, c->src.val & ((ctxt->mode == X86EMUL_MODE_PROT64) ? 
~0ULL : ~0U), ctxt->vcpu) < 0) { /* #UD condition is already handled by the code above */ @@ -3215,7 +3618,6 @@ twobyte_insn: goto done; } rc = X86EMUL_CONTINUE; - c->dst.type = OP_NONE; break; case 0x32: /* rdmsr */ @@ -3227,21 +3629,12 @@ twobyte_insn: c->regs[VCPU_REGS_RDX] = msr_data >> 32; } rc = X86EMUL_CONTINUE; - c->dst.type = OP_NONE; break; case 0x34: /* sysenter */ rc = emulate_sysenter(ctxt, ops); - if (rc != X86EMUL_CONTINUE) - goto done; - else - goto writeback; break; case 0x35: /* sysexit */ rc = emulate_sysexit(ctxt, ops); - if (rc != X86EMUL_CONTINUE) - goto done; - else - goto writeback; break; case 0x40 ... 0x4f: /* cmov */ c->dst.val = c->dst.orig_val = c->src.val; @@ -3251,15 +3644,15 @@ twobyte_insn: case 0x80 ... 0x8f: /* jnz rel, etc*/ if (test_cc(c->b, ctxt->eflags)) jmp_rel(c, c->src.val); - c->dst.type = OP_NONE; + break; + case 0x90 ... 0x9f: /* setcc r/m8 */ + c->dst.val = test_cc(c->b, ctxt->eflags); break; case 0xa0: /* push fs */ emulate_push_sreg(ctxt, ops, VCPU_SREG_FS); break; case 0xa1: /* pop fs */ rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_FS); - if (rc != X86EMUL_CONTINUE) - goto done; break; case 0xa3: bt: /* bt */ @@ -3277,13 +3670,9 @@ twobyte_insn: break; case 0xa9: /* pop gs */ rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_GS); - if (rc != X86EMUL_CONTINUE) - goto done; break; case 0xab: bts: /* bts */ - /* only subword offset */ - c->src.val &= (c->dst.bytes << 3) - 1; emulate_2op_SrcV_nobyte("bts", c->src, c->dst, ctxt->eflags); break; case 0xac: /* shrd imm8, r, r/m */ @@ -3306,15 +3695,22 @@ twobyte_insn: } else { /* Failure: write the value we saw to EAX. 
*/ c->dst.type = OP_REG; - c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX]; + c->dst.addr.reg = (unsigned long *)&c->regs[VCPU_REGS_RAX]; } break; + case 0xb2: /* lss */ + rc = emulate_load_segment(ctxt, ops, VCPU_SREG_SS); + break; case 0xb3: btr: /* btr */ - /* only subword offset */ - c->src.val &= (c->dst.bytes << 3) - 1; emulate_2op_SrcV_nobyte("btr", c->src, c->dst, ctxt->eflags); break; + case 0xb4: /* lfs */ + rc = emulate_load_segment(ctxt, ops, VCPU_SREG_FS); + break; + case 0xb5: /* lgs */ + rc = emulate_load_segment(ctxt, ops, VCPU_SREG_GS); + break; case 0xb6 ... 0xb7: /* movzx */ c->dst.bytes = c->op_bytes; c->dst.val = (c->d & ByteOp) ? (u8) c->src.val @@ -3334,15 +3730,43 @@ twobyte_insn: break; case 0xbb: btc: /* btc */ - /* only subword offset */ - c->src.val &= (c->dst.bytes << 3) - 1; emulate_2op_SrcV_nobyte("btc", c->src, c->dst, ctxt->eflags); break; + case 0xbc: { /* bsf */ + u8 zf; + __asm__ ("bsf %2, %0; setz %1" + : "=r"(c->dst.val), "=q"(zf) + : "r"(c->src.val)); + ctxt->eflags &= ~X86_EFLAGS_ZF; + if (zf) { + ctxt->eflags |= X86_EFLAGS_ZF; + c->dst.type = OP_NONE; /* Disable writeback. */ + } + break; + } + case 0xbd: { /* bsr */ + u8 zf; + __asm__ ("bsr %2, %0; setz %1" + : "=r"(c->dst.val), "=q"(zf) + : "r"(c->src.val)); + ctxt->eflags &= ~X86_EFLAGS_ZF; + if (zf) { + ctxt->eflags |= X86_EFLAGS_ZF; + c->dst.type = OP_NONE; /* Disable writeback. */ + } + break; + } case 0xbe ... 0xbf: /* movsx */ c->dst.bytes = c->op_bytes; c->dst.val = (c->d & ByteOp) ? (s8) c->src.val : (s16) c->src.val; break; + case 0xc0 ... 0xc1: /* xadd */ + emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags); + /* Write back the register source. */ + c->src.val = c->dst.orig_val; + write_register_operand(&c->src); + break; case 0xc3: /* movnti */ c->dst.bytes = c->op_bytes; c->dst.val = (c->op_bytes == 4) ? 
(u32) c->src.val : @@ -3350,10 +3774,14 @@ twobyte_insn: break; case 0xc7: /* Grp9 (cmpxchg8b) */ rc = emulate_grp9(ctxt, ops); - if (rc != X86EMUL_CONTINUE) - goto done; break; + default: + goto cannot_emulate; } + + if (rc != X86EMUL_CONTINUE) + goto done; + goto writeback; cannot_emulate: diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index ddeb2314b522..efad72385058 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -5,7 +5,7 @@ * Copyright (c) 2006 Intel Corporation * Copyright (c) 2007 Keir Fraser, XenSource Inc * Copyright (c) 2008 Intel Corporation - * Copyright 2009 Red Hat, Inc. and/or its affilates. + * Copyright 2009 Red Hat, Inc. and/or its affiliates. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -232,15 +232,6 @@ static void pit_latch_status(struct kvm *kvm, int channel) } } -int pit_has_pending_timer(struct kvm_vcpu *vcpu) -{ - struct kvm_pit *pit = vcpu->kvm->arch.vpit; - - if (pit && kvm_vcpu_is_bsp(vcpu) && pit->pit_state.irq_ack) - return atomic_read(&pit->pit_state.pit_timer.pending); - return 0; -} - static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian) { struct kvm_kpit_state *ps = container_of(kian, struct kvm_kpit_state, diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 4b7b73ce2098..f628234fbeca 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -3,7 +3,7 @@ * * Copyright (c) 2003-2004 Fabrice Bellard * Copyright (c) 2007 Intel Corporation - * Copyright 2009 Red Hat, Inc. and/or its affilates. + * Copyright 2009 Red Hat, Inc. and/or its affiliates. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -39,7 +39,7 @@ static void pic_irq_request(struct kvm *kvm, int level); static void pic_lock(struct kvm_pic *s) __acquires(&s->lock) { - raw_spin_lock(&s->lock); + spin_lock(&s->lock); } static void pic_unlock(struct kvm_pic *s) @@ -51,7 +51,7 @@ static void pic_unlock(struct kvm_pic *s) s->wakeup_needed = false; - raw_spin_unlock(&s->lock); + spin_unlock(&s->lock); if (wakeup) { kvm_for_each_vcpu(i, vcpu, s->kvm) { @@ -67,6 +67,7 @@ static void pic_unlock(struct kvm_pic *s) if (!found) return; + kvm_make_request(KVM_REQ_EVENT, found); kvm_vcpu_kick(found); } } @@ -308,13 +309,17 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val) addr &= 1; if (addr == 0) { if (val & 0x10) { - kvm_pic_reset(s); /* init */ - /* - * deassert a pending interrupt - */ - pic_irq_request(s->pics_state->kvm, 0); - s->init_state = 1; s->init4 = val & 1; + s->last_irr = 0; + s->imr = 0; + s->priority_add = 0; + s->special_mask = 0; + s->read_reg_select = 0; + if (!s->init4) { + s->special_fully_nested_mode = 0; + s->auto_eoi = 0; + } + s->init_state = 1; if (val & 0x02) printk(KERN_ERR "single mode not supported"); if (val & 0x08) @@ -564,7 +569,7 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm) s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL); if (!s) return NULL; - raw_spin_lock_init(&s->lock); + spin_lock_init(&s->lock); s->kvm = kvm; s->pics[0].elcr_mask = 0xf8; s->pics[1].elcr_mask = 0xde; diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index 2095a049835e..7e06ba1618bd 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -1,7 +1,7 @@ /* * irq.c: API for in kernel interrupt controller * Copyright (c) 2007, Intel Corporation. - * Copyright 2009 Red Hat, Inc. and/or its affilates. + * Copyright 2009 Red Hat, Inc. and/or its affiliates. 
* * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, @@ -33,12 +33,7 @@ */ int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) { - int ret; - - ret = pit_has_pending_timer(vcpu); - ret |= apic_has_pending_timer(vcpu); - - return ret; + return apic_has_pending_timer(vcpu); } EXPORT_SYMBOL(kvm_cpu_has_pending_timer); diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index 63c314502993..ba910d149410 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -60,7 +60,7 @@ struct kvm_kpic_state { }; struct kvm_pic { - raw_spinlock_t lock; + spinlock_t lock; bool wakeup_needed; unsigned pending_acks; struct kvm *kvm; diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h index 6491ac8e755b..975bb45329a1 100644 --- a/arch/x86/kvm/kvm_cache_regs.h +++ b/arch/x86/kvm/kvm_cache_regs.h @@ -42,7 +42,14 @@ static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index) (unsigned long *)&vcpu->arch.regs_avail)) kvm_x86_ops->cache_reg(vcpu, VCPU_EXREG_PDPTR); - return vcpu->arch.pdptrs[index]; + return vcpu->arch.walk_mmu->pdptrs[index]; +} + +static inline u64 kvm_pdptr_read_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, int index) +{ + load_pdptrs(vcpu, mmu, mmu->get_cr3(vcpu)); + + return mmu->pdptrs[index]; } static inline ulong kvm_read_cr0_bits(struct kvm_vcpu *vcpu, ulong mask) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 22b06f7660f4..413f8973a855 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -5,7 +5,7 @@ * Copyright (C) 2006 Qumranet, Inc. * Copyright (C) 2007 Novell * Copyright (C) 2007 Intel - * Copyright 2009 Red Hat, Inc. and/or its affilates. + * Copyright 2009 Red Hat, Inc. and/or its affiliates. 
* * Authors: * Dor Laor <dor.laor@qumranet.com> @@ -259,9 +259,10 @@ static inline int apic_find_highest_isr(struct kvm_lapic *apic) static void apic_update_ppr(struct kvm_lapic *apic) { - u32 tpr, isrv, ppr; + u32 tpr, isrv, ppr, old_ppr; int isr; + old_ppr = apic_get_reg(apic, APIC_PROCPRI); tpr = apic_get_reg(apic, APIC_TASKPRI); isr = apic_find_highest_isr(apic); isrv = (isr != -1) ? isr : 0; @@ -274,7 +275,10 @@ static void apic_update_ppr(struct kvm_lapic *apic) apic_debug("vlapic %p, ppr 0x%x, isr 0x%x, isrv 0x%x", apic, ppr, isr, isrv); - apic_set_reg(apic, APIC_PROCPRI, ppr); + if (old_ppr != ppr) { + apic_set_reg(apic, APIC_PROCPRI, ppr); + kvm_make_request(KVM_REQ_EVENT, apic->vcpu); + } } static void apic_set_tpr(struct kvm_lapic *apic, u32 tpr) @@ -391,6 +395,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, break; } + kvm_make_request(KVM_REQ_EVENT, vcpu); kvm_vcpu_kick(vcpu); break; @@ -416,6 +421,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, "INIT on a runnable vcpu %d\n", vcpu->vcpu_id); vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; + kvm_make_request(KVM_REQ_EVENT, vcpu); kvm_vcpu_kick(vcpu); } else { apic_debug("Ignoring de-assert INIT to vcpu %d\n", @@ -430,6 +436,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, result = 1; vcpu->arch.sipi_vector = vector; vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED; + kvm_make_request(KVM_REQ_EVENT, vcpu); kvm_vcpu_kick(vcpu); } break; @@ -475,6 +482,7 @@ static void apic_set_eoi(struct kvm_lapic *apic) trigger_mode = IOAPIC_EDGE_TRIG; if (!(apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI)) kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode); + kvm_make_request(KVM_REQ_EVENT, apic->vcpu); } static void apic_send_ipi(struct kvm_lapic *apic) @@ -1151,6 +1159,7 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu) update_divide_count(apic); start_apic_timer(apic); apic->irr_pending = true; + 
kvm_make_request(KVM_REQ_EVENT, vcpu); } void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 311f6dad8951..908ea5464a51 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -7,7 +7,7 @@ * MMU support * * Copyright (C) 2006 Qumranet, Inc. - * Copyright 2010 Red Hat, Inc. and/or its affilates. + * Copyright 2010 Red Hat, Inc. and/or its affiliates. * * Authors: * Yaniv Kamay <yaniv@qumranet.com> @@ -49,15 +49,25 @@ */ bool tdp_enabled = false; -#undef MMU_DEBUG +enum { + AUDIT_PRE_PAGE_FAULT, + AUDIT_POST_PAGE_FAULT, + AUDIT_PRE_PTE_WRITE, + AUDIT_POST_PTE_WRITE, + AUDIT_PRE_SYNC, + AUDIT_POST_SYNC +}; -#undef AUDIT +char *audit_point_name[] = { + "pre page fault", + "post page fault", + "pre pte write", + "post pte write", + "pre sync", + "post sync" +}; -#ifdef AUDIT -static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg); -#else -static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {} -#endif +#undef MMU_DEBUG #ifdef MMU_DEBUG @@ -71,7 +81,7 @@ static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {} #endif -#if defined(MMU_DEBUG) || defined(AUDIT) +#ifdef MMU_DEBUG static int dbg = 0; module_param(dbg, bool, 0644); #endif @@ -89,6 +99,8 @@ module_param(oos_shadow, bool, 0644); } #endif +#define PTE_PREFETCH_NUM 8 + #define PT_FIRST_AVAIL_BITS_SHIFT 9 #define PT64_SECOND_AVAIL_BITS_SHIFT 52 @@ -178,6 +190,7 @@ typedef void (*mmu_parent_walk_fn) (struct kvm_mmu_page *sp, u64 *spte); static struct kmem_cache *pte_chain_cache; static struct kmem_cache *rmap_desc_cache; static struct kmem_cache *mmu_page_header_cache; +static struct percpu_counter kvm_total_used_mmu_pages; static u64 __read_mostly shadow_trap_nonpresent_pte; static u64 __read_mostly shadow_notrap_nonpresent_pte; @@ -299,18 +312,50 @@ static u64 __xchg_spte(u64 *sptep, u64 new_spte) #endif } +static bool spte_has_volatile_bits(u64 spte) +{ + if (!shadow_accessed_mask) + return false; + + if 
(!is_shadow_present_pte(spte)) + return false; + + if ((spte & shadow_accessed_mask) && + (!is_writable_pte(spte) || (spte & shadow_dirty_mask))) + return false; + + return true; +} + +static bool spte_is_bit_cleared(u64 old_spte, u64 new_spte, u64 bit_mask) +{ + return (old_spte & bit_mask) && !(new_spte & bit_mask); +} + static void update_spte(u64 *sptep, u64 new_spte) { - u64 old_spte; + u64 mask, old_spte = *sptep; + + WARN_ON(!is_rmap_spte(new_spte)); + + new_spte |= old_spte & shadow_dirty_mask; - if (!shadow_accessed_mask || (new_spte & shadow_accessed_mask) || - !is_rmap_spte(*sptep)) + mask = shadow_accessed_mask; + if (is_writable_pte(old_spte)) + mask |= shadow_dirty_mask; + + if (!spte_has_volatile_bits(old_spte) || (new_spte & mask) == mask) __set_spte(sptep, new_spte); - else { + else old_spte = __xchg_spte(sptep, new_spte); - if (old_spte & shadow_accessed_mask) - mark_page_accessed(pfn_to_page(spte_to_pfn(old_spte))); - } + + if (!shadow_accessed_mask) + return; + + if (spte_is_bit_cleared(old_spte, new_spte, shadow_accessed_mask)) + kvm_set_pfn_accessed(spte_to_pfn(old_spte)); + if (spte_is_bit_cleared(old_spte, new_spte, shadow_dirty_mask)) + kvm_set_pfn_dirty(spte_to_pfn(old_spte)); } static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, @@ -367,7 +412,7 @@ static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu) if (r) goto out; r = mmu_topup_memory_cache(&vcpu->arch.mmu_rmap_desc_cache, - rmap_desc_cache, 4); + rmap_desc_cache, 4 + PTE_PREFETCH_NUM); if (r) goto out; r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8); @@ -591,6 +636,7 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) desc->sptes[0] = (u64 *)*rmapp; desc->sptes[1] = spte; *rmapp = (unsigned long)desc | 1; + ++count; } else { rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte); desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); @@ -603,7 +649,7 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) desc = 
desc->more; } for (i = 0; desc->sptes[i]; ++i) - ; + ++count; desc->sptes[i] = spte; } return count; @@ -645,18 +691,17 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt); rmapp = gfn_to_rmap(kvm, gfn, sp->role.level); if (!*rmapp) { - printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte); + printk(KERN_ERR "rmap_remove: %p 0->BUG\n", spte); BUG(); } else if (!(*rmapp & 1)) { - rmap_printk("rmap_remove: %p %llx 1->0\n", spte, *spte); + rmap_printk("rmap_remove: %p 1->0\n", spte); if ((u64 *)*rmapp != spte) { - printk(KERN_ERR "rmap_remove: %p %llx 1->BUG\n", - spte, *spte); + printk(KERN_ERR "rmap_remove: %p 1->BUG\n", spte); BUG(); } *rmapp = 0; } else { - rmap_printk("rmap_remove: %p %llx many->many\n", spte, *spte); + rmap_printk("rmap_remove: %p many->many\n", spte); desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); prev_desc = NULL; while (desc) { @@ -670,7 +715,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) prev_desc = desc; desc = desc->more; } - pr_err("rmap_remove: %p %llx many->many\n", spte, *spte); + pr_err("rmap_remove: %p many->many\n", spte); BUG(); } } @@ -680,18 +725,18 @@ static void set_spte_track_bits(u64 *sptep, u64 new_spte) pfn_t pfn; u64 old_spte = *sptep; - if (!shadow_accessed_mask || !is_shadow_present_pte(old_spte) || - old_spte & shadow_accessed_mask) { + if (!spte_has_volatile_bits(old_spte)) __set_spte(sptep, new_spte); - } else + else old_spte = __xchg_spte(sptep, new_spte); if (!is_rmap_spte(old_spte)) return; + pfn = spte_to_pfn(old_spte); if (!shadow_accessed_mask || old_spte & shadow_accessed_mask) kvm_set_pfn_accessed(pfn); - if (is_writable_pte(old_spte)) + if (!shadow_dirty_mask || (old_spte & shadow_dirty_mask)) kvm_set_pfn_dirty(pfn); } @@ -746,13 +791,6 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn) } spte = rmap_next(kvm, rmapp, spte); } - if (write_protected) { - pfn_t pfn; - - spte = rmap_next(kvm, rmapp, NULL); - pfn = spte_to_pfn(*spte); - 
kvm_set_pfn_dirty(pfn); - } /* check for huge page mappings */ for (i = PT_DIRECTORY_LEVEL; @@ -947,6 +985,18 @@ static int is_empty_shadow_page(u64 *spt) } #endif +/* + * This value is the sum of all of the kvm instances's + * kvm->arch.n_used_mmu_pages values. We need a global, + * aggregate version in order to make the slab shrinker + * faster + */ +static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, int nr) +{ + kvm->arch.n_used_mmu_pages += nr; + percpu_counter_add(&kvm_total_used_mmu_pages, nr); +} + static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp) { ASSERT(is_empty_shadow_page(sp->spt)); @@ -956,7 +1006,7 @@ static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp) if (!sp->role.direct) __free_page(virt_to_page(sp->gfns)); kmem_cache_free(mmu_page_header_cache, sp); - ++kvm->arch.n_free_mmu_pages; + kvm_mod_used_mmu_pages(kvm, -1); } static unsigned kvm_page_table_hashfn(gfn_t gfn) @@ -979,7 +1029,7 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS); sp->multimapped = 0; sp->parent_pte = parent_pte; - --vcpu->kvm->arch.n_free_mmu_pages; + kvm_mod_used_mmu_pages(vcpu->kvm, +1); return sp; } @@ -1403,7 +1453,8 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, if (role.direct) role.cr4_pae = 0; role.access = access; - if (!tdp_enabled && vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) { + if (!vcpu->arch.mmu.direct_map + && vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) { quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level)); quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1; role.quadrant = quadrant; @@ -1458,6 +1509,12 @@ static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator, iterator->addr = addr; iterator->shadow_addr = vcpu->arch.mmu.root_hpa; iterator->level = vcpu->arch.mmu.shadow_root_level; + + if (iterator->level == PT64_ROOT_LEVEL && + vcpu->arch.mmu.root_level < 
PT64_ROOT_LEVEL && + !vcpu->arch.mmu.direct_map) + --iterator->level; + if (iterator->level == PT32E_ROOT_LEVEL) { iterator->shadow_addr = vcpu->arch.mmu.pae_root[(addr >> 30) & 3]; @@ -1665,41 +1722,31 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm, /* * Changing the number of mmu pages allocated to the vm - * Note: if kvm_nr_mmu_pages is too small, you will get dead lock + * Note: if goal_nr_mmu_pages is too small, you will get dead lock */ -void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages) +void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int goal_nr_mmu_pages) { - int used_pages; LIST_HEAD(invalid_list); - - used_pages = kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages; - used_pages = max(0, used_pages); - /* * If we set the number of mmu pages to be smaller be than the * number of actived pages , we must to free some mmu pages before we * change the value */ - if (used_pages > kvm_nr_mmu_pages) { - while (used_pages > kvm_nr_mmu_pages && + if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) { + while (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages && !list_empty(&kvm->arch.active_mmu_pages)) { struct kvm_mmu_page *page; page = container_of(kvm->arch.active_mmu_pages.prev, struct kvm_mmu_page, link); - used_pages -= kvm_mmu_prepare_zap_page(kvm, page, - &invalid_list); + kvm_mmu_prepare_zap_page(kvm, page, &invalid_list); + kvm_mmu_commit_zap_page(kvm, &invalid_list); } - kvm_mmu_commit_zap_page(kvm, &invalid_list); - kvm_nr_mmu_pages = used_pages; - kvm->arch.n_free_mmu_pages = 0; + goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages; } - else - kvm->arch.n_free_mmu_pages += kvm_nr_mmu_pages - - kvm->arch.n_alloc_mmu_pages; - kvm->arch.n_alloc_mmu_pages = kvm_nr_mmu_pages; + kvm->arch.n_max_mmu_pages = goal_nr_mmu_pages; } static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) @@ -1709,11 +1756,11 @@ static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) LIST_HEAD(invalid_list); int r; - pgprintk("%s: 
looking for gfn %lx\n", __func__, gfn); + pgprintk("%s: looking for gfn %llx\n", __func__, gfn); r = 0; for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) { - pgprintk("%s: gfn %lx role %x\n", __func__, gfn, + pgprintk("%s: gfn %llx role %x\n", __func__, gfn, sp->role.word); r = 1; kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); @@ -1729,7 +1776,7 @@ static void mmu_unshadow(struct kvm *kvm, gfn_t gfn) LIST_HEAD(invalid_list); for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) { - pgprintk("%s: zap %lx %x\n", + pgprintk("%s: zap %llx %x\n", __func__, gfn, sp->role.word); kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); } @@ -1925,7 +1972,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, * whether the guest actually used the pte (in order to detect * demand paging). */ - spte = shadow_base_present_pte | shadow_dirty_mask; + spte = shadow_base_present_pte; if (!speculative) spte |= shadow_accessed_mask; if (!dirty) @@ -1948,8 +1995,8 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, spte |= (u64)pfn << PAGE_SHIFT; if ((pte_access & ACC_WRITE_MASK) - || (!tdp_enabled && write_fault && !is_write_protection(vcpu) - && !user_fault)) { + || (!vcpu->arch.mmu.direct_map && write_fault + && !is_write_protection(vcpu) && !user_fault)) { if (level > PT_PAGE_TABLE_LEVEL && has_wrprotected_page(vcpu->kvm, gfn, level)) { @@ -1960,7 +2007,8 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, spte |= PT_WRITABLE_MASK; - if (!tdp_enabled && !(pte_access & ACC_WRITE_MASK)) + if (!vcpu->arch.mmu.direct_map + && !(pte_access & ACC_WRITE_MASK)) spte &= ~PT_USER_MASK; /* @@ -1973,7 +2021,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, goto set_pte; if (mmu_need_write_protect(vcpu, gfn, can_unsync)) { - pgprintk("%s: found shadow page for %lx, marking ro\n", + pgprintk("%s: found shadow page for %llx, marking ro\n", __func__, gfn); ret = 1; pte_access &= ~ACC_WRITE_MASK; @@ -1986,8 +2034,6 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, 
mark_page_dirty(vcpu->kvm, gfn); set_pte: - if (is_writable_pte(*sptep) && !is_writable_pte(spte)) - kvm_set_pfn_dirty(pfn); update_spte(sptep, spte); done: return ret; @@ -2004,7 +2050,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, int rmap_count; pgprintk("%s: spte %llx access %x write_fault %d" - " user_fault %d gfn %lx\n", + " user_fault %d gfn %llx\n", __func__, *sptep, pt_access, write_fault, user_fault, gfn); @@ -2023,7 +2069,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, __set_spte(sptep, shadow_trap_nonpresent_pte); kvm_flush_remote_tlbs(vcpu->kvm); } else if (pfn != spte_to_pfn(*sptep)) { - pgprintk("hfn old %lx new %lx\n", + pgprintk("hfn old %llx new %llx\n", spte_to_pfn(*sptep), pfn); drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte); kvm_flush_remote_tlbs(vcpu->kvm); @@ -2040,7 +2086,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, } pgprintk("%s: setting spte %llx\n", __func__, *sptep); - pgprintk("instantiating %s PTE (%s) at %ld (%llx) addr %p\n", + pgprintk("instantiating %s PTE (%s) at %llx (%llx) addr %p\n", is_large_pte(*sptep)? 
"2MB" : "4kB", *sptep & PT_PRESENT_MASK ?"RW":"R", gfn, *sptep, sptep); @@ -2064,6 +2110,105 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) { } +static struct kvm_memory_slot * +pte_prefetch_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn, bool no_dirty_log) +{ + struct kvm_memory_slot *slot; + + slot = gfn_to_memslot(vcpu->kvm, gfn); + if (!slot || slot->flags & KVM_MEMSLOT_INVALID || + (no_dirty_log && slot->dirty_bitmap)) + slot = NULL; + + return slot; +} + +static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, + bool no_dirty_log) +{ + struct kvm_memory_slot *slot; + unsigned long hva; + + slot = pte_prefetch_gfn_to_memslot(vcpu, gfn, no_dirty_log); + if (!slot) { + get_page(bad_page); + return page_to_pfn(bad_page); + } + + hva = gfn_to_hva_memslot(slot, gfn); + + return hva_to_pfn_atomic(vcpu->kvm, hva); +} + +static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu, + struct kvm_mmu_page *sp, + u64 *start, u64 *end) +{ + struct page *pages[PTE_PREFETCH_NUM]; + unsigned access = sp->role.access; + int i, ret; + gfn_t gfn; + + gfn = kvm_mmu_page_get_gfn(sp, start - sp->spt); + if (!pte_prefetch_gfn_to_memslot(vcpu, gfn, access & ACC_WRITE_MASK)) + return -1; + + ret = gfn_to_page_many_atomic(vcpu->kvm, gfn, pages, end - start); + if (ret <= 0) + return -1; + + for (i = 0; i < ret; i++, gfn++, start++) + mmu_set_spte(vcpu, start, ACC_ALL, + access, 0, 0, 1, NULL, + sp->role.level, gfn, + page_to_pfn(pages[i]), true, true); + + return 0; +} + +static void __direct_pte_prefetch(struct kvm_vcpu *vcpu, + struct kvm_mmu_page *sp, u64 *sptep) +{ + u64 *spte, *start = NULL; + int i; + + WARN_ON(!sp->role.direct); + + i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1); + spte = sp->spt + i; + + for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) { + if (*spte != shadow_trap_nonpresent_pte || spte == sptep) { + if (!start) + continue; + if (direct_pte_prefetch_many(vcpu, sp, start, spte) < 0) + break; + start = NULL; + } else if (!start) + start = 
spte; + } +} + +static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep) +{ + struct kvm_mmu_page *sp; + + /* + * Since it's no accessed bit on EPT, it's no way to + * distinguish between actually accessed translations + * and prefetched, so disable pte prefetch if EPT is + * enabled. + */ + if (!shadow_accessed_mask) + return; + + sp = page_header(__pa(sptep)); + if (sp->role.level > PT_PAGE_TABLE_LEVEL) + return; + + __direct_pte_prefetch(vcpu, sp, sptep); +} + static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, int level, gfn_t gfn, pfn_t pfn) { @@ -2077,6 +2222,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, ACC_ALL, 0, write, 1, &pt_write, level, gfn, pfn, false, true); + direct_pte_prefetch(vcpu, iterator.sptep); ++vcpu->stat.pf_fixed; break; } @@ -2098,28 +2244,31 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, __set_spte(iterator.sptep, __pa(sp->spt) | PT_PRESENT_MASK | PT_WRITABLE_MASK - | shadow_user_mask | shadow_x_mask); + | shadow_user_mask | shadow_x_mask + | shadow_accessed_mask); } } return pt_write; } -static void kvm_send_hwpoison_signal(struct kvm *kvm, gfn_t gfn) +static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk) { - char buf[1]; - void __user *hva; - int r; + siginfo_t info; + + info.si_signo = SIGBUS; + info.si_errno = 0; + info.si_code = BUS_MCEERR_AR; + info.si_addr = (void __user *)address; + info.si_addr_lsb = PAGE_SHIFT; - /* Touch the page, so send SIGBUS */ - hva = (void __user *)gfn_to_hva(kvm, gfn); - r = copy_from_user(buf, hva, 1); + send_sig_info(SIGBUS, &info, tsk); } static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn) { kvm_release_pfn_clean(pfn); if (is_hwpoison_pfn(pfn)) { - kvm_send_hwpoison_signal(kvm, gfn); + kvm_send_hwpoison_signal(gfn_to_hva(kvm, gfn), current); return 0; } else if (is_fault_pfn(pfn)) return -EFAULT; @@ -2179,7 +2328,9 @@ static void 
mmu_free_roots(struct kvm_vcpu *vcpu) if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) return; spin_lock(&vcpu->kvm->mmu_lock); - if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { + if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL && + (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL || + vcpu->arch.mmu.direct_map)) { hpa_t root = vcpu->arch.mmu.root_hpa; sp = page_header(root); @@ -2222,80 +2373,158 @@ static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn) return ret; } -static int mmu_alloc_roots(struct kvm_vcpu *vcpu) +static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) { - int i; - gfn_t root_gfn; struct kvm_mmu_page *sp; - int direct = 0; - u64 pdptr; - - root_gfn = vcpu->arch.cr3 >> PAGE_SHIFT; + unsigned i; if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { + spin_lock(&vcpu->kvm->mmu_lock); + kvm_mmu_free_some_pages(vcpu); + sp = kvm_mmu_get_page(vcpu, 0, 0, PT64_ROOT_LEVEL, + 1, ACC_ALL, NULL); + ++sp->root_count; + spin_unlock(&vcpu->kvm->mmu_lock); + vcpu->arch.mmu.root_hpa = __pa(sp->spt); + } else if (vcpu->arch.mmu.shadow_root_level == PT32E_ROOT_LEVEL) { + for (i = 0; i < 4; ++i) { + hpa_t root = vcpu->arch.mmu.pae_root[i]; + + ASSERT(!VALID_PAGE(root)); + spin_lock(&vcpu->kvm->mmu_lock); + kvm_mmu_free_some_pages(vcpu); + sp = kvm_mmu_get_page(vcpu, i << 30, i << 30, + PT32_ROOT_LEVEL, 1, ACC_ALL, + NULL); + root = __pa(sp->spt); + ++sp->root_count; + spin_unlock(&vcpu->kvm->mmu_lock); + vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK; + } + vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root); + } else + BUG(); + + return 0; +} + +static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) +{ + struct kvm_mmu_page *sp; + u64 pdptr, pm_mask; + gfn_t root_gfn; + int i; + + root_gfn = vcpu->arch.mmu.get_cr3(vcpu) >> PAGE_SHIFT; + + if (mmu_check_root(vcpu, root_gfn)) + return 1; + + /* + * Do we shadow a long mode page table? If so we need to + * write-protect the guests page table root. 
+ */ + if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) { hpa_t root = vcpu->arch.mmu.root_hpa; ASSERT(!VALID_PAGE(root)); - if (mmu_check_root(vcpu, root_gfn)) - return 1; - if (tdp_enabled) { - direct = 1; - root_gfn = 0; - } + spin_lock(&vcpu->kvm->mmu_lock); kvm_mmu_free_some_pages(vcpu); - sp = kvm_mmu_get_page(vcpu, root_gfn, 0, - PT64_ROOT_LEVEL, direct, - ACC_ALL, NULL); + sp = kvm_mmu_get_page(vcpu, root_gfn, 0, PT64_ROOT_LEVEL, + 0, ACC_ALL, NULL); root = __pa(sp->spt); ++sp->root_count; spin_unlock(&vcpu->kvm->mmu_lock); vcpu->arch.mmu.root_hpa = root; return 0; } - direct = !is_paging(vcpu); + + /* + * We shadow a 32 bit page table. This may be a legacy 2-level + * or a PAE 3-level page table. In either case we need to be aware that + * the shadow page table may be a PAE or a long mode page table. + */ + pm_mask = PT_PRESENT_MASK; + if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) + pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK; + for (i = 0; i < 4; ++i) { hpa_t root = vcpu->arch.mmu.pae_root[i]; ASSERT(!VALID_PAGE(root)); if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) { - pdptr = kvm_pdptr_read(vcpu, i); + pdptr = kvm_pdptr_read_mmu(vcpu, &vcpu->arch.mmu, i); if (!is_present_gpte(pdptr)) { vcpu->arch.mmu.pae_root[i] = 0; continue; } root_gfn = pdptr >> PAGE_SHIFT; - } else if (vcpu->arch.mmu.root_level == 0) - root_gfn = 0; - if (mmu_check_root(vcpu, root_gfn)) - return 1; - if (tdp_enabled) { - direct = 1; - root_gfn = i << 30; + if (mmu_check_root(vcpu, root_gfn)) + return 1; } spin_lock(&vcpu->kvm->mmu_lock); kvm_mmu_free_some_pages(vcpu); sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, - PT32_ROOT_LEVEL, direct, + PT32_ROOT_LEVEL, 0, ACC_ALL, NULL); root = __pa(sp->spt); ++sp->root_count; spin_unlock(&vcpu->kvm->mmu_lock); - vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK; + vcpu->arch.mmu.pae_root[i] = root | pm_mask; } vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root); + + /* + * If we shadow a 32 bit page 
table with a long mode page + * table we enter this path. + */ + if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { + if (vcpu->arch.mmu.lm_root == NULL) { + /* + * The additional page necessary for this is only + * allocated on demand. + */ + + u64 *lm_root; + + lm_root = (void*)get_zeroed_page(GFP_KERNEL); + if (lm_root == NULL) + return 1; + + lm_root[0] = __pa(vcpu->arch.mmu.pae_root) | pm_mask; + + vcpu->arch.mmu.lm_root = lm_root; + } + + vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.lm_root); + } + return 0; } +static int mmu_alloc_roots(struct kvm_vcpu *vcpu) +{ + if (vcpu->arch.mmu.direct_map) + return mmu_alloc_direct_roots(vcpu); + else + return mmu_alloc_shadow_roots(vcpu); +} + static void mmu_sync_roots(struct kvm_vcpu *vcpu) { int i; struct kvm_mmu_page *sp; + if (vcpu->arch.mmu.direct_map) + return; + if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) return; - if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { + + trace_kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC); + if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) { hpa_t root = vcpu->arch.mmu.root_hpa; sp = page_header(root); mmu_sync_children(vcpu, sp); @@ -2310,6 +2539,7 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu) mmu_sync_children(vcpu, sp); } } + trace_kvm_mmu_audit(vcpu, AUDIT_POST_SYNC); } void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) @@ -2327,6 +2557,14 @@ static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr, return vaddr; } +static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gva_t vaddr, + u32 access, u32 *error) +{ + if (error) + *error = 0; + return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access); +} + static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code) { @@ -2393,10 +2631,9 @@ static void nonpaging_free(struct kvm_vcpu *vcpu) mmu_free_roots(vcpu); } -static int nonpaging_init_context(struct kvm_vcpu *vcpu) +static int nonpaging_init_context(struct kvm_vcpu *vcpu, + struct kvm_mmu *context) { - struct kvm_mmu 
*context = &vcpu->arch.mmu; - context->new_cr3 = nonpaging_new_cr3; context->page_fault = nonpaging_page_fault; context->gva_to_gpa = nonpaging_gva_to_gpa; @@ -2407,6 +2644,8 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu) context->root_level = 0; context->shadow_root_level = PT32E_ROOT_LEVEL; context->root_hpa = INVALID_PAGE; + context->direct_map = true; + context->nx = false; return 0; } @@ -2422,11 +2661,14 @@ static void paging_new_cr3(struct kvm_vcpu *vcpu) mmu_free_roots(vcpu); } -static void inject_page_fault(struct kvm_vcpu *vcpu, - u64 addr, - u32 err_code) +static unsigned long get_cr3(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.cr3; +} + +static void inject_page_fault(struct kvm_vcpu *vcpu) { - kvm_inject_page_fault(vcpu, addr, err_code); + vcpu->arch.mmu.inject_page_fault(vcpu); } static void paging_free(struct kvm_vcpu *vcpu) @@ -2434,12 +2676,12 @@ static void paging_free(struct kvm_vcpu *vcpu) nonpaging_free(vcpu); } -static bool is_rsvd_bits_set(struct kvm_vcpu *vcpu, u64 gpte, int level) +static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level) { int bit7; bit7 = (gpte >> 7) & 1; - return (gpte & vcpu->arch.mmu.rsvd_bits_mask[bit7][level-1]) != 0; + return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0; } #define PTTYPE 64 @@ -2450,13 +2692,14 @@ static bool is_rsvd_bits_set(struct kvm_vcpu *vcpu, u64 gpte, int level) #include "paging_tmpl.h" #undef PTTYPE -static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level) +static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, + struct kvm_mmu *context, + int level) { - struct kvm_mmu *context = &vcpu->arch.mmu; int maxphyaddr = cpuid_maxphyaddr(vcpu); u64 exb_bit_rsvd = 0; - if (!is_nx(vcpu)) + if (!context->nx) exb_bit_rsvd = rsvd_bits(63, 63); switch (level) { case PT32_ROOT_LEVEL: @@ -2511,9 +2754,13 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level) } } -static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level) +static int 
paging64_init_context_common(struct kvm_vcpu *vcpu, + struct kvm_mmu *context, + int level) { - struct kvm_mmu *context = &vcpu->arch.mmu; + context->nx = is_nx(vcpu); + + reset_rsvds_bits_mask(vcpu, context, level); ASSERT(is_pae(vcpu)); context->new_cr3 = paging_new_cr3; @@ -2526,20 +2773,23 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level) context->root_level = level; context->shadow_root_level = level; context->root_hpa = INVALID_PAGE; + context->direct_map = false; return 0; } -static int paging64_init_context(struct kvm_vcpu *vcpu) +static int paging64_init_context(struct kvm_vcpu *vcpu, + struct kvm_mmu *context) { - reset_rsvds_bits_mask(vcpu, PT64_ROOT_LEVEL); - return paging64_init_context_common(vcpu, PT64_ROOT_LEVEL); + return paging64_init_context_common(vcpu, context, PT64_ROOT_LEVEL); } -static int paging32_init_context(struct kvm_vcpu *vcpu) +static int paging32_init_context(struct kvm_vcpu *vcpu, + struct kvm_mmu *context) { - struct kvm_mmu *context = &vcpu->arch.mmu; + context->nx = false; + + reset_rsvds_bits_mask(vcpu, context, PT32_ROOT_LEVEL); - reset_rsvds_bits_mask(vcpu, PT32_ROOT_LEVEL); context->new_cr3 = paging_new_cr3; context->page_fault = paging32_page_fault; context->gva_to_gpa = paging32_gva_to_gpa; @@ -2550,18 +2800,19 @@ static int paging32_init_context(struct kvm_vcpu *vcpu) context->root_level = PT32_ROOT_LEVEL; context->shadow_root_level = PT32E_ROOT_LEVEL; context->root_hpa = INVALID_PAGE; + context->direct_map = false; return 0; } -static int paging32E_init_context(struct kvm_vcpu *vcpu) +static int paging32E_init_context(struct kvm_vcpu *vcpu, + struct kvm_mmu *context) { - reset_rsvds_bits_mask(vcpu, PT32E_ROOT_LEVEL); - return paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL); + return paging64_init_context_common(vcpu, context, PT32E_ROOT_LEVEL); } static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) { - struct kvm_mmu *context = &vcpu->arch.mmu; + struct kvm_mmu *context = vcpu->arch.walk_mmu; 
context->new_cr3 = nonpaging_new_cr3; context->page_fault = tdp_page_fault; @@ -2571,20 +2822,29 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) context->invlpg = nonpaging_invlpg; context->shadow_root_level = kvm_x86_ops->get_tdp_level(); context->root_hpa = INVALID_PAGE; + context->direct_map = true; + context->set_cr3 = kvm_x86_ops->set_tdp_cr3; + context->get_cr3 = get_cr3; + context->inject_page_fault = kvm_inject_page_fault; + context->nx = is_nx(vcpu); if (!is_paging(vcpu)) { + context->nx = false; context->gva_to_gpa = nonpaging_gva_to_gpa; context->root_level = 0; } else if (is_long_mode(vcpu)) { - reset_rsvds_bits_mask(vcpu, PT64_ROOT_LEVEL); + context->nx = is_nx(vcpu); + reset_rsvds_bits_mask(vcpu, context, PT64_ROOT_LEVEL); context->gva_to_gpa = paging64_gva_to_gpa; context->root_level = PT64_ROOT_LEVEL; } else if (is_pae(vcpu)) { - reset_rsvds_bits_mask(vcpu, PT32E_ROOT_LEVEL); + context->nx = is_nx(vcpu); + reset_rsvds_bits_mask(vcpu, context, PT32E_ROOT_LEVEL); context->gva_to_gpa = paging64_gva_to_gpa; context->root_level = PT32E_ROOT_LEVEL; } else { - reset_rsvds_bits_mask(vcpu, PT32_ROOT_LEVEL); + context->nx = false; + reset_rsvds_bits_mask(vcpu, context, PT32_ROOT_LEVEL); context->gva_to_gpa = paging32_gva_to_gpa; context->root_level = PT32_ROOT_LEVEL; } @@ -2592,33 +2852,83 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) return 0; } -static int init_kvm_softmmu(struct kvm_vcpu *vcpu) +int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context) { int r; - ASSERT(vcpu); ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); if (!is_paging(vcpu)) - r = nonpaging_init_context(vcpu); + r = nonpaging_init_context(vcpu, context); else if (is_long_mode(vcpu)) - r = paging64_init_context(vcpu); + r = paging64_init_context(vcpu, context); else if (is_pae(vcpu)) - r = paging32E_init_context(vcpu); + r = paging32E_init_context(vcpu, context); else - r = paging32_init_context(vcpu); + r = paging32_init_context(vcpu, context); 
vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu); - vcpu->arch.mmu.base_role.cr0_wp = is_write_protection(vcpu); + vcpu->arch.mmu.base_role.cr0_wp = is_write_protection(vcpu); return r; } +EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu); + +static int init_kvm_softmmu(struct kvm_vcpu *vcpu) +{ + int r = kvm_init_shadow_mmu(vcpu, vcpu->arch.walk_mmu); + + vcpu->arch.walk_mmu->set_cr3 = kvm_x86_ops->set_cr3; + vcpu->arch.walk_mmu->get_cr3 = get_cr3; + vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault; + + return r; +} + +static int init_kvm_nested_mmu(struct kvm_vcpu *vcpu) +{ + struct kvm_mmu *g_context = &vcpu->arch.nested_mmu; + + g_context->get_cr3 = get_cr3; + g_context->inject_page_fault = kvm_inject_page_fault; + + /* + * Note that arch.mmu.gva_to_gpa translates l2_gva to l1_gpa. The + * translation of l2_gpa to l1_gpa addresses is done using the + * arch.nested_mmu.gva_to_gpa function. Basically the gva_to_gpa + * functions between mmu and nested_mmu are swapped. + */ + if (!is_paging(vcpu)) { + g_context->nx = false; + g_context->root_level = 0; + g_context->gva_to_gpa = nonpaging_gva_to_gpa_nested; + } else if (is_long_mode(vcpu)) { + g_context->nx = is_nx(vcpu); + reset_rsvds_bits_mask(vcpu, g_context, PT64_ROOT_LEVEL); + g_context->root_level = PT64_ROOT_LEVEL; + g_context->gva_to_gpa = paging64_gva_to_gpa_nested; + } else if (is_pae(vcpu)) { + g_context->nx = is_nx(vcpu); + reset_rsvds_bits_mask(vcpu, g_context, PT32E_ROOT_LEVEL); + g_context->root_level = PT32E_ROOT_LEVEL; + g_context->gva_to_gpa = paging64_gva_to_gpa_nested; + } else { + g_context->nx = false; + reset_rsvds_bits_mask(vcpu, g_context, PT32_ROOT_LEVEL); + g_context->root_level = PT32_ROOT_LEVEL; + g_context->gva_to_gpa = paging32_gva_to_gpa_nested; + } + + return 0; +} static int init_kvm_mmu(struct kvm_vcpu *vcpu) { vcpu->arch.update_pte.pfn = bad_pfn; - if (tdp_enabled) + if (mmu_is_nested(vcpu)) + return init_kvm_nested_mmu(vcpu); + else if (tdp_enabled) return 
init_kvm_tdp_mmu(vcpu); else return init_kvm_softmmu(vcpu); @@ -2653,7 +2963,7 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu) if (r) goto out; /* set_cr3() should ensure TLB has been flushed */ - kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa); + vcpu->arch.mmu.set_cr3(vcpu, vcpu->arch.mmu.root_hpa); out: return r; } @@ -2663,6 +2973,7 @@ void kvm_mmu_unload(struct kvm_vcpu *vcpu) { mmu_free_roots(vcpu); } +EXPORT_SYMBOL_GPL(kvm_mmu_unload); static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, @@ -2695,7 +3006,7 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu, return; } - if (is_rsvd_bits_set(vcpu, *(u64 *)new, PT_PAGE_TABLE_LEVEL)) + if (is_rsvd_bits_set(&vcpu->arch.mmu, *(u64 *)new, PT_PAGE_TABLE_LEVEL)) return; ++vcpu->kvm->stat.mmu_pte_updated; @@ -2837,7 +3148,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, kvm_mmu_access_page(vcpu, gfn); kvm_mmu_free_some_pages(vcpu); ++vcpu->kvm->stat.mmu_pte_write; - kvm_mmu_audit(vcpu, "pre pte write"); + trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE); if (guest_initiated) { if (gfn == vcpu->arch.last_pt_write_gfn && !last_updated_pte_accessed(vcpu)) { @@ -2910,7 +3221,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, } mmu_pte_write_flush_tlb(vcpu, zap_page, remote_flush, local_flush); kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); - kvm_mmu_audit(vcpu, "post pte write"); + trace_kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE); spin_unlock(&vcpu->kvm->mmu_lock); if (!is_error_pfn(vcpu->arch.update_pte.pfn)) { kvm_release_pfn_clean(vcpu->arch.update_pte.pfn); @@ -2923,7 +3234,7 @@ int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) gpa_t gpa; int r; - if (tdp_enabled) + if (vcpu->arch.mmu.direct_map) return 0; gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL); @@ -2937,21 +3248,18 @@ EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt); void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) { - int free_pages; LIST_HEAD(invalid_list); - free_pages = 
vcpu->kvm->arch.n_free_mmu_pages; - while (free_pages < KVM_REFILL_PAGES && + while (kvm_mmu_available_pages(vcpu->kvm) < KVM_REFILL_PAGES && !list_empty(&vcpu->kvm->arch.active_mmu_pages)) { struct kvm_mmu_page *sp; sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev, struct kvm_mmu_page, link); - free_pages += kvm_mmu_prepare_zap_page(vcpu->kvm, sp, - &invalid_list); + kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list); + kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); ++vcpu->kvm->stat.mmu_recycled; } - kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); } int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code) @@ -3013,6 +3321,8 @@ EXPORT_SYMBOL_GPL(kvm_disable_tdp); static void free_mmu_pages(struct kvm_vcpu *vcpu) { free_page((unsigned long)vcpu->arch.mmu.pae_root); + if (vcpu->arch.mmu.lm_root != NULL) + free_page((unsigned long)vcpu->arch.mmu.lm_root); } static int alloc_mmu_pages(struct kvm_vcpu *vcpu) @@ -3054,15 +3364,6 @@ int kvm_mmu_setup(struct kvm_vcpu *vcpu) return init_kvm_mmu(vcpu); } -void kvm_mmu_destroy(struct kvm_vcpu *vcpu) -{ - ASSERT(vcpu); - - destroy_kvm_mmu(vcpu); - free_mmu_pages(vcpu); - mmu_free_memory_caches(vcpu); -} - void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) { struct kvm_mmu_page *sp; @@ -3112,23 +3413,22 @@ static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask) { struct kvm *kvm; struct kvm *kvm_freed = NULL; - int cache_count = 0; + + if (nr_to_scan == 0) + goto out; spin_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) { - int npages, idx, freed_pages; + int idx, freed_pages; LIST_HEAD(invalid_list); idx = srcu_read_lock(&kvm->srcu); spin_lock(&kvm->mmu_lock); - npages = kvm->arch.n_alloc_mmu_pages - - kvm->arch.n_free_mmu_pages; - cache_count += npages; - if (!kvm_freed && nr_to_scan > 0 && npages > 0) { + if (!kvm_freed && nr_to_scan > 0 && + kvm->arch.n_used_mmu_pages > 0) { freed_pages = kvm_mmu_remove_some_alloc_mmu_pages(kvm, 
&invalid_list); - cache_count -= freed_pages; kvm_freed = kvm; } nr_to_scan--; @@ -3142,7 +3442,8 @@ static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask) spin_unlock(&kvm_lock); - return cache_count; +out: + return percpu_counter_read_positive(&kvm_total_used_mmu_pages); } static struct shrinker mmu_shrinker = { @@ -3163,6 +3464,7 @@ static void mmu_destroy_caches(void) void kvm_mmu_module_exit(void) { mmu_destroy_caches(); + percpu_counter_destroy(&kvm_total_used_mmu_pages); unregister_shrinker(&mmu_shrinker); } @@ -3185,6 +3487,9 @@ int kvm_mmu_module_init(void) if (!mmu_page_header_cache) goto nomem; + if (percpu_counter_init(&kvm_total_used_mmu_pages, 0)) + goto nomem; + register_shrinker(&mmu_shrinker); return 0; @@ -3355,271 +3660,18 @@ int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]) } EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy); -#ifdef AUDIT - -static const char *audit_msg; - -static gva_t canonicalize(gva_t gva) -{ -#ifdef CONFIG_X86_64 - gva = (long long)(gva << 16) >> 16; +#ifdef CONFIG_KVM_MMU_AUDIT +#include "mmu_audit.c" +#else +static void mmu_audit_disable(void) { } #endif - return gva; -} - - -typedef void (*inspect_spte_fn) (struct kvm *kvm, u64 *sptep); - -static void __mmu_spte_walk(struct kvm *kvm, struct kvm_mmu_page *sp, - inspect_spte_fn fn) -{ - int i; - - for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { - u64 ent = sp->spt[i]; - - if (is_shadow_present_pte(ent)) { - if (!is_last_spte(ent, sp->role.level)) { - struct kvm_mmu_page *child; - child = page_header(ent & PT64_BASE_ADDR_MASK); - __mmu_spte_walk(kvm, child, fn); - } else - fn(kvm, &sp->spt[i]); - } - } -} - -static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn) -{ - int i; - struct kvm_mmu_page *sp; - - if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) - return; - if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { - hpa_t root = vcpu->arch.mmu.root_hpa; - sp = page_header(root); - __mmu_spte_walk(vcpu->kvm, sp, fn); 
- return; - } - for (i = 0; i < 4; ++i) { - hpa_t root = vcpu->arch.mmu.pae_root[i]; - - if (root && VALID_PAGE(root)) { - root &= PT64_BASE_ADDR_MASK; - sp = page_header(root); - __mmu_spte_walk(vcpu->kvm, sp, fn); - } - } - return; -} - -static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte, - gva_t va, int level) -{ - u64 *pt = __va(page_pte & PT64_BASE_ADDR_MASK); - int i; - gva_t va_delta = 1ul << (PAGE_SHIFT + 9 * (level - 1)); - - for (i = 0; i < PT64_ENT_PER_PAGE; ++i, va += va_delta) { - u64 ent = pt[i]; - - if (ent == shadow_trap_nonpresent_pte) - continue; - - va = canonicalize(va); - if (is_shadow_present_pte(ent) && !is_last_spte(ent, level)) - audit_mappings_page(vcpu, ent, va, level - 1); - else { - gpa_t gpa = kvm_mmu_gva_to_gpa_read(vcpu, va, NULL); - gfn_t gfn = gpa >> PAGE_SHIFT; - pfn_t pfn = gfn_to_pfn(vcpu->kvm, gfn); - hpa_t hpa = (hpa_t)pfn << PAGE_SHIFT; - if (is_error_pfn(pfn)) { - kvm_release_pfn_clean(pfn); - continue; - } - - if (is_shadow_present_pte(ent) - && (ent & PT64_BASE_ADDR_MASK) != hpa) - printk(KERN_ERR "xx audit error: (%s) levels %d" - " gva %lx gpa %llx hpa %llx ent %llx %d\n", - audit_msg, vcpu->arch.mmu.root_level, - va, gpa, hpa, ent, - is_shadow_present_pte(ent)); - else if (ent == shadow_notrap_nonpresent_pte - && !is_error_hpa(hpa)) - printk(KERN_ERR "audit: (%s) notrap shadow," - " valid guest gva %lx\n", audit_msg, va); - kvm_release_pfn_clean(pfn); - - } - } -} - -static void audit_mappings(struct kvm_vcpu *vcpu) -{ - unsigned i; - - if (vcpu->arch.mmu.root_level == 4) - audit_mappings_page(vcpu, vcpu->arch.mmu.root_hpa, 0, 4); - else - for (i = 0; i < 4; ++i) - if (vcpu->arch.mmu.pae_root[i] & PT_PRESENT_MASK) - audit_mappings_page(vcpu, - vcpu->arch.mmu.pae_root[i], - i << 30, - 2); -} - -static int count_rmaps(struct kvm_vcpu *vcpu) -{ - struct kvm *kvm = vcpu->kvm; - struct kvm_memslots *slots; - int nmaps = 0; - int i, j, k, idx; - - idx = srcu_read_lock(&kvm->srcu); - slots = kvm_memslots(kvm); 
- for (i = 0; i < KVM_MEMORY_SLOTS; ++i) { - struct kvm_memory_slot *m = &slots->memslots[i]; - struct kvm_rmap_desc *d; - - for (j = 0; j < m->npages; ++j) { - unsigned long *rmapp = &m->rmap[j]; - - if (!*rmapp) - continue; - if (!(*rmapp & 1)) { - ++nmaps; - continue; - } - d = (struct kvm_rmap_desc *)(*rmapp & ~1ul); - while (d) { - for (k = 0; k < RMAP_EXT; ++k) - if (d->sptes[k]) - ++nmaps; - else - break; - d = d->more; - } - } - } - srcu_read_unlock(&kvm->srcu, idx); - return nmaps; -} - -void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep) -{ - unsigned long *rmapp; - struct kvm_mmu_page *rev_sp; - gfn_t gfn; - - if (is_writable_pte(*sptep)) { - rev_sp = page_header(__pa(sptep)); - gfn = kvm_mmu_page_get_gfn(rev_sp, sptep - rev_sp->spt); - - if (!gfn_to_memslot(kvm, gfn)) { - if (!printk_ratelimit()) - return; - printk(KERN_ERR "%s: no memslot for gfn %ld\n", - audit_msg, gfn); - printk(KERN_ERR "%s: index %ld of sp (gfn=%lx)\n", - audit_msg, (long int)(sptep - rev_sp->spt), - rev_sp->gfn); - dump_stack(); - return; - } - - rmapp = gfn_to_rmap(kvm, gfn, rev_sp->role.level); - if (!*rmapp) { - if (!printk_ratelimit()) - return; - printk(KERN_ERR "%s: no rmap for writable spte %llx\n", - audit_msg, *sptep); - dump_stack(); - } - } - -} - -void audit_writable_sptes_have_rmaps(struct kvm_vcpu *vcpu) -{ - mmu_spte_walk(vcpu, inspect_spte_has_rmap); -} - -static void check_writable_mappings_rmap(struct kvm_vcpu *vcpu) -{ - struct kvm_mmu_page *sp; - int i; - - list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) { - u64 *pt = sp->spt; - - if (sp->role.level != PT_PAGE_TABLE_LEVEL) - continue; - - for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { - u64 ent = pt[i]; - - if (!(ent & PT_PRESENT_MASK)) - continue; - if (!is_writable_pte(ent)) - continue; - inspect_spte_has_rmap(vcpu->kvm, &pt[i]); - } - } - return; -} - -static void audit_rmap(struct kvm_vcpu *vcpu) -{ - check_writable_mappings_rmap(vcpu); - count_rmaps(vcpu); -} - -static void 
audit_write_protection(struct kvm_vcpu *vcpu) -{ - struct kvm_mmu_page *sp; - struct kvm_memory_slot *slot; - unsigned long *rmapp; - u64 *spte; - gfn_t gfn; - - list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) { - if (sp->role.direct) - continue; - if (sp->unsync) - continue; - - slot = gfn_to_memslot(vcpu->kvm, sp->gfn); - rmapp = &slot->rmap[gfn - slot->base_gfn]; - - spte = rmap_next(vcpu->kvm, rmapp, NULL); - while (spte) { - if (is_writable_pte(*spte)) - printk(KERN_ERR "%s: (%s) shadow page has " - "writable mappings: gfn %lx role %x\n", - __func__, audit_msg, sp->gfn, - sp->role.word); - spte = rmap_next(vcpu->kvm, rmapp, spte); - } - } -} - -static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) +void kvm_mmu_destroy(struct kvm_vcpu *vcpu) { - int olddbg = dbg; + ASSERT(vcpu); - dbg = 0; - audit_msg = msg; - audit_rmap(vcpu); - audit_write_protection(vcpu); - if (strcmp("pre pte write", audit_msg) != 0) - audit_mappings(vcpu); - audit_writable_sptes_have_rmaps(vcpu); - dbg = olddbg; + destroy_kvm_mmu(vcpu); + free_mmu_pages(vcpu); + mmu_free_memory_caches(vcpu); + mmu_audit_disable(); } - -#endif diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index be66759321a5..7086ca85d3e7 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -49,10 +49,17 @@ #define PFERR_FETCH_MASK (1U << 4) int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]); +int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context); + +static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm) +{ + return kvm->arch.n_max_mmu_pages - + kvm->arch.n_used_mmu_pages; +} static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) { - if (unlikely(vcpu->kvm->arch.n_free_mmu_pages < KVM_MIN_FREE_MMU_PAGES)) + if (unlikely(kvm_mmu_available_pages(vcpu->kvm)< KVM_MIN_FREE_MMU_PAGES)) __kvm_mmu_free_some_pages(vcpu); } diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c new file mode 100644 index 
000000000000..ba2bcdde6221 --- /dev/null +++ b/arch/x86/kvm/mmu_audit.c @@ -0,0 +1,299 @@ +/* + * mmu_audit.c: + * + * Audit code for KVM MMU + * + * Copyright (C) 2006 Qumranet, Inc. + * Copyright 2010 Red Hat, Inc. and/or its affiliates. + * + * Authors: + * Yaniv Kamay <yaniv@qumranet.com> + * Avi Kivity <avi@qumranet.com> + * Marcelo Tosatti <mtosatti@redhat.com> + * Xiao Guangrong <xiaoguangrong@cn.fujitsu.com> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + */ + +#include <linux/ratelimit.h> + +static int audit_point; + +#define audit_printk(fmt, args...) \ + printk(KERN_ERR "audit: (%s) error: " \ + fmt, audit_point_name[audit_point], ##args) + +typedef void (*inspect_spte_fn) (struct kvm_vcpu *vcpu, u64 *sptep, int level); + +static void __mmu_spte_walk(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, + inspect_spte_fn fn, int level) +{ + int i; + + for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { + u64 *ent = sp->spt; + + fn(vcpu, ent + i, level); + + if (is_shadow_present_pte(ent[i]) && + !is_last_spte(ent[i], level)) { + struct kvm_mmu_page *child; + + child = page_header(ent[i] & PT64_BASE_ADDR_MASK); + __mmu_spte_walk(vcpu, child, fn, level - 1); + } + } +} + +static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn) +{ + int i; + struct kvm_mmu_page *sp; + + if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) + return; + + if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) { + hpa_t root = vcpu->arch.mmu.root_hpa; + + sp = page_header(root); + __mmu_spte_walk(vcpu, sp, fn, PT64_ROOT_LEVEL); + return; + } + + for (i = 0; i < 4; ++i) { + hpa_t root = vcpu->arch.mmu.pae_root[i]; + + if (root && VALID_PAGE(root)) { + root &= PT64_BASE_ADDR_MASK; + sp = page_header(root); + __mmu_spte_walk(vcpu, sp, fn, 2); + } + } + + return; +} + +typedef void (*sp_handler) (struct kvm *kvm, struct kvm_mmu_page *sp); + +static void walk_all_active_sps(struct kvm *kvm, sp_handler fn) +{ + struct 
kvm_mmu_page *sp; + + list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) + fn(kvm, sp); +} + +static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level) +{ + struct kvm_mmu_page *sp; + gfn_t gfn; + pfn_t pfn; + hpa_t hpa; + + sp = page_header(__pa(sptep)); + + if (sp->unsync) { + if (level != PT_PAGE_TABLE_LEVEL) { + audit_printk("unsync sp: %p level = %d\n", sp, level); + return; + } + + if (*sptep == shadow_notrap_nonpresent_pte) { + audit_printk("notrap spte in unsync sp: %p\n", sp); + return; + } + } + + if (sp->role.direct && *sptep == shadow_notrap_nonpresent_pte) { + audit_printk("notrap spte in direct sp: %p\n", sp); + return; + } + + if (!is_shadow_present_pte(*sptep) || !is_last_spte(*sptep, level)) + return; + + gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt); + pfn = gfn_to_pfn_atomic(vcpu->kvm, gfn); + + if (is_error_pfn(pfn)) { + kvm_release_pfn_clean(pfn); + return; + } + + hpa = pfn << PAGE_SHIFT; + if ((*sptep & PT64_BASE_ADDR_MASK) != hpa) + audit_printk("levels %d pfn %llx hpa %llx ent %llxn", + vcpu->arch.mmu.root_level, pfn, hpa, *sptep); +} + +static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep) +{ + unsigned long *rmapp; + struct kvm_mmu_page *rev_sp; + gfn_t gfn; + + + rev_sp = page_header(__pa(sptep)); + gfn = kvm_mmu_page_get_gfn(rev_sp, sptep - rev_sp->spt); + + if (!gfn_to_memslot(kvm, gfn)) { + if (!printk_ratelimit()) + return; + audit_printk("no memslot for gfn %llx\n", gfn); + audit_printk("index %ld of sp (gfn=%llx)\n", + (long int)(sptep - rev_sp->spt), rev_sp->gfn); + dump_stack(); + return; + } + + rmapp = gfn_to_rmap(kvm, gfn, rev_sp->role.level); + if (!*rmapp) { + if (!printk_ratelimit()) + return; + audit_printk("no rmap for writable spte %llx\n", *sptep); + dump_stack(); + } +} + +static void audit_sptes_have_rmaps(struct kvm_vcpu *vcpu, u64 *sptep, int level) +{ + if (is_shadow_present_pte(*sptep) && is_last_spte(*sptep, level)) + inspect_spte_has_rmap(vcpu->kvm, sptep); +} + +static void 
audit_spte_after_sync(struct kvm_vcpu *vcpu, u64 *sptep, int level) +{ + struct kvm_mmu_page *sp = page_header(__pa(sptep)); + + if (audit_point == AUDIT_POST_SYNC && sp->unsync) + audit_printk("meet unsync sp(%p) after sync root.\n", sp); +} + +static void check_mappings_rmap(struct kvm *kvm, struct kvm_mmu_page *sp) +{ + int i; + + if (sp->role.level != PT_PAGE_TABLE_LEVEL) + return; + + for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { + if (!is_rmap_spte(sp->spt[i])) + continue; + + inspect_spte_has_rmap(kvm, sp->spt + i); + } +} + +static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp) +{ + struct kvm_memory_slot *slot; + unsigned long *rmapp; + u64 *spte; + + if (sp->role.direct || sp->unsync || sp->role.invalid) + return; + + slot = gfn_to_memslot(kvm, sp->gfn); + rmapp = &slot->rmap[sp->gfn - slot->base_gfn]; + + spte = rmap_next(kvm, rmapp, NULL); + while (spte) { + if (is_writable_pte(*spte)) + audit_printk("shadow page has writable mappings: gfn " + "%llx role %x\n", sp->gfn, sp->role.word); + spte = rmap_next(kvm, rmapp, spte); + } +} + +static void audit_sp(struct kvm *kvm, struct kvm_mmu_page *sp) +{ + check_mappings_rmap(kvm, sp); + audit_write_protection(kvm, sp); +} + +static void audit_all_active_sps(struct kvm *kvm) +{ + walk_all_active_sps(kvm, audit_sp); +} + +static void audit_spte(struct kvm_vcpu *vcpu, u64 *sptep, int level) +{ + audit_sptes_have_rmaps(vcpu, sptep, level); + audit_mappings(vcpu, sptep, level); + audit_spte_after_sync(vcpu, sptep, level); +} + +static void audit_vcpu_spte(struct kvm_vcpu *vcpu) +{ + mmu_spte_walk(vcpu, audit_spte); +} + +static void kvm_mmu_audit(void *ignore, struct kvm_vcpu *vcpu, int point) +{ + static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10); + + if (!__ratelimit(&ratelimit_state)) + return; + + audit_point = point; + audit_all_active_sps(vcpu->kvm); + audit_vcpu_spte(vcpu); +} + +static bool mmu_audit; + +static void mmu_audit_enable(void) +{ + int ret; + + if (mmu_audit) + 
return; + + ret = register_trace_kvm_mmu_audit(kvm_mmu_audit, NULL); + WARN_ON(ret); + + mmu_audit = true; +} + +static void mmu_audit_disable(void) +{ + if (!mmu_audit) + return; + + unregister_trace_kvm_mmu_audit(kvm_mmu_audit, NULL); + tracepoint_synchronize_unregister(); + mmu_audit = false; +} + +static int mmu_audit_set(const char *val, const struct kernel_param *kp) +{ + int ret; + unsigned long enable; + + ret = strict_strtoul(val, 10, &enable); + if (ret < 0) + return -EINVAL; + + switch (enable) { + case 0: + mmu_audit_disable(); + break; + case 1: + mmu_audit_enable(); + break; + default: + return -EINVAL; + } + + return 0; +} + +static struct kernel_param_ops audit_param_ops = { + .set = mmu_audit_set, + .get = param_get_bool, +}; + +module_param_cb(mmu_audit, &audit_param_ops, &mmu_audit, 0644); diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h index 3aab0f0930ef..b60b4fdb3eda 100644 --- a/arch/x86/kvm/mmutrace.h +++ b/arch/x86/kvm/mmutrace.h @@ -195,6 +195,25 @@ DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_prepare_zap_page, TP_ARGS(sp) ); + +TRACE_EVENT( + kvm_mmu_audit, + TP_PROTO(struct kvm_vcpu *vcpu, int audit_point), + TP_ARGS(vcpu, audit_point), + + TP_STRUCT__entry( + __field(struct kvm_vcpu *, vcpu) + __field(int, audit_point) + ), + + TP_fast_assign( + __entry->vcpu = vcpu; + __entry->audit_point = audit_point; + ), + + TP_printk("vcpu:%d %s", __entry->vcpu->cpu, + audit_point_name[__entry->audit_point]) +); #endif /* _TRACE_KVMMMU_H */ #undef TRACE_INCLUDE_PATH diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 51ef9097960d..cd7a833a3b52 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -7,7 +7,7 @@ * MMU support * * Copyright (C) 2006 Qumranet, Inc. - * Copyright 2010 Red Hat, Inc. and/or its affilates. + * Copyright 2010 Red Hat, Inc. and/or its affiliates. 
* * Authors: * Yaniv Kamay <yaniv@qumranet.com> @@ -67,6 +67,7 @@ struct guest_walker { int level; gfn_t table_gfn[PT_MAX_FULL_LEVELS]; pt_element_t ptes[PT_MAX_FULL_LEVELS]; + pt_element_t prefetch_ptes[PTE_PREFETCH_NUM]; gpa_t pte_gpa[PT_MAX_FULL_LEVELS]; unsigned pt_access; unsigned pte_access; @@ -104,7 +105,7 @@ static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte) access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK; #if PTTYPE == 64 - if (is_nx(vcpu)) + if (vcpu->arch.mmu.nx) access &= ~(gpte >> PT64_NX_SHIFT); #endif return access; @@ -113,26 +114,32 @@ static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte) /* * Fetch a guest pte for a guest virtual address */ -static int FNAME(walk_addr)(struct guest_walker *walker, - struct kvm_vcpu *vcpu, gva_t addr, - int write_fault, int user_fault, int fetch_fault) +static int FNAME(walk_addr_generic)(struct guest_walker *walker, + struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, + gva_t addr, u32 access) { pt_element_t pte; gfn_t table_gfn; unsigned index, pt_access, uninitialized_var(pte_access); gpa_t pte_gpa; bool eperm, present, rsvd_fault; + int offset, write_fault, user_fault, fetch_fault; + + write_fault = access & PFERR_WRITE_MASK; + user_fault = access & PFERR_USER_MASK; + fetch_fault = access & PFERR_FETCH_MASK; trace_kvm_mmu_pagetable_walk(addr, write_fault, user_fault, fetch_fault); walk: present = true; eperm = rsvd_fault = false; - walker->level = vcpu->arch.mmu.root_level; - pte = vcpu->arch.cr3; + walker->level = mmu->root_level; + pte = mmu->get_cr3(vcpu); + #if PTTYPE == 64 - if (!is_long_mode(vcpu)) { - pte = kvm_pdptr_read(vcpu, (addr >> 30) & 3); + if (walker->level == PT32E_ROOT_LEVEL) { + pte = kvm_pdptr_read_mmu(vcpu, mmu, (addr >> 30) & 3); trace_kvm_mmu_paging_element(pte, walker->level); if (!is_present_gpte(pte)) { present = false; @@ -142,7 +149,7 @@ walk: } #endif ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) || - (vcpu->arch.cr3 & 
CR3_NONPAE_RESERVED_BITS) == 0); + (mmu->get_cr3(vcpu) & CR3_NONPAE_RESERVED_BITS) == 0); pt_access = ACC_ALL; @@ -150,12 +157,14 @@ walk: index = PT_INDEX(addr, walker->level); table_gfn = gpte_to_gfn(pte); - pte_gpa = gfn_to_gpa(table_gfn); - pte_gpa += index * sizeof(pt_element_t); + offset = index * sizeof(pt_element_t); + pte_gpa = gfn_to_gpa(table_gfn) + offset; walker->table_gfn[walker->level - 1] = table_gfn; walker->pte_gpa[walker->level - 1] = pte_gpa; - if (kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte))) { + if (kvm_read_guest_page_mmu(vcpu, mmu, table_gfn, &pte, + offset, sizeof(pte), + PFERR_USER_MASK|PFERR_WRITE_MASK)) { present = false; break; } @@ -167,7 +176,7 @@ walk: break; } - if (is_rsvd_bits_set(vcpu, pte, walker->level)) { + if (is_rsvd_bits_set(&vcpu->arch.mmu, pte, walker->level)) { rsvd_fault = true; break; } @@ -204,17 +213,28 @@ walk: (PTTYPE == 64 || is_pse(vcpu))) || ((walker->level == PT_PDPE_LEVEL) && is_large_pte(pte) && - is_long_mode(vcpu))) { + mmu->root_level == PT64_ROOT_LEVEL)) { int lvl = walker->level; + gpa_t real_gpa; + gfn_t gfn; + u32 ac; - walker->gfn = gpte_to_gfn_lvl(pte, lvl); - walker->gfn += (addr & PT_LVL_OFFSET_MASK(lvl)) - >> PAGE_SHIFT; + gfn = gpte_to_gfn_lvl(pte, lvl); + gfn += (addr & PT_LVL_OFFSET_MASK(lvl)) >> PAGE_SHIFT; if (PTTYPE == 32 && walker->level == PT_DIRECTORY_LEVEL && is_cpuid_PSE36()) - walker->gfn += pse36_gfn_delta(pte); + gfn += pse36_gfn_delta(pte); + + ac = write_fault | fetch_fault | user_fault; + + real_gpa = mmu->translate_gpa(vcpu, gfn_to_gpa(gfn), + ac); + if (real_gpa == UNMAPPED_GVA) + return 0; + + walker->gfn = real_gpa >> PAGE_SHIFT; break; } @@ -249,18 +269,36 @@ error: walker->error_code = 0; if (present) walker->error_code |= PFERR_PRESENT_MASK; - if (write_fault) - walker->error_code |= PFERR_WRITE_MASK; - if (user_fault) - walker->error_code |= PFERR_USER_MASK; - if (fetch_fault && is_nx(vcpu)) + + walker->error_code |= write_fault | user_fault; + + if (fetch_fault 
&& mmu->nx) walker->error_code |= PFERR_FETCH_MASK; if (rsvd_fault) walker->error_code |= PFERR_RSVD_MASK; + + vcpu->arch.fault.address = addr; + vcpu->arch.fault.error_code = walker->error_code; + trace_kvm_mmu_walker_error(walker->error_code); return 0; } +static int FNAME(walk_addr)(struct guest_walker *walker, + struct kvm_vcpu *vcpu, gva_t addr, u32 access) +{ + return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.mmu, addr, + access); +} + +static int FNAME(walk_addr_nested)(struct guest_walker *walker, + struct kvm_vcpu *vcpu, gva_t addr, + u32 access) +{ + return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.nested_mmu, + addr, access); +} + static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, u64 *spte, const void *pte) { @@ -302,14 +340,87 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu, struct guest_walker *gw, int level) { - int r; pt_element_t curr_pte; - - r = kvm_read_guest_atomic(vcpu->kvm, gw->pte_gpa[level - 1], + gpa_t base_gpa, pte_gpa = gw->pte_gpa[level - 1]; + u64 mask; + int r, index; + + if (level == PT_PAGE_TABLE_LEVEL) { + mask = PTE_PREFETCH_NUM * sizeof(pt_element_t) - 1; + base_gpa = pte_gpa & ~mask; + index = (pte_gpa - base_gpa) / sizeof(pt_element_t); + + r = kvm_read_guest_atomic(vcpu->kvm, base_gpa, + gw->prefetch_ptes, sizeof(gw->prefetch_ptes)); + curr_pte = gw->prefetch_ptes[index]; + } else + r = kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &curr_pte, sizeof(curr_pte)); + return r || curr_pte != gw->ptes[level - 1]; } +static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw, + u64 *sptep) +{ + struct kvm_mmu_page *sp; + struct kvm_mmu *mmu = &vcpu->arch.mmu; + pt_element_t *gptep = gw->prefetch_ptes; + u64 *spte; + int i; + + sp = page_header(__pa(sptep)); + + if (sp->role.level > PT_PAGE_TABLE_LEVEL) + return; + + if (sp->role.direct) + return __direct_pte_prefetch(vcpu, sp, sptep); + + 
i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1); + spte = sp->spt + i; + + for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) { + pt_element_t gpte; + unsigned pte_access; + gfn_t gfn; + pfn_t pfn; + bool dirty; + + if (spte == sptep) + continue; + + if (*spte != shadow_trap_nonpresent_pte) + continue; + + gpte = gptep[i]; + + if (!is_present_gpte(gpte) || + is_rsvd_bits_set(mmu, gpte, PT_PAGE_TABLE_LEVEL)) { + if (!sp->unsync) + __set_spte(spte, shadow_notrap_nonpresent_pte); + continue; + } + + if (!(gpte & PT_ACCESSED_MASK)) + continue; + + pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); + gfn = gpte_to_gfn(gpte); + dirty = is_dirty_gpte(gpte); + pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn, + (pte_access & ACC_WRITE_MASK) && dirty); + if (is_error_pfn(pfn)) { + kvm_release_pfn_clean(pfn); + break; + } + + mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0, + dirty, NULL, PT_PAGE_TABLE_LEVEL, gfn, + pfn, true, true); + } +} + /* * Fetch a shadow pte for a specific level in the paging hierarchy. */ @@ -391,6 +502,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, mmu_set_spte(vcpu, it.sptep, access, gw->pte_access & access, user_fault, write_fault, dirty, ptwrite, it.level, gw->gfn, pfn, false, true); + FNAME(pte_prefetch)(vcpu, gw, it.sptep); return it.sptep; @@ -420,7 +532,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, { int write_fault = error_code & PFERR_WRITE_MASK; int user_fault = error_code & PFERR_USER_MASK; - int fetch_fault = error_code & PFERR_FETCH_MASK; struct guest_walker walker; u64 *sptep; int write_pt = 0; @@ -430,7 +541,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, unsigned long mmu_seq; pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code); - kvm_mmu_audit(vcpu, "pre page fault"); r = mmu_topup_memory_caches(vcpu); if (r) @@ -439,15 +549,14 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, /* * Look up the guest pte for the faulting address. 
*/ - r = FNAME(walk_addr)(&walker, vcpu, addr, write_fault, user_fault, - fetch_fault); + r = FNAME(walk_addr)(&walker, vcpu, addr, error_code); /* * The page is not mapped by the guest. Let the guest handle it. */ if (!r) { pgprintk("%s: guest page fault\n", __func__); - inject_page_fault(vcpu, addr, walker.error_code); + inject_page_fault(vcpu); vcpu->arch.last_pt_write_count = 0; /* reset fork detector */ return 0; } @@ -468,6 +577,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, spin_lock(&vcpu->kvm->mmu_lock); if (mmu_notifier_retry(vcpu, mmu_seq)) goto out_unlock; + + trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT); kvm_mmu_free_some_pages(vcpu); sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, level, &write_pt, pfn); @@ -479,7 +590,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, vcpu->arch.last_pt_write_count = 0; /* reset fork detector */ ++vcpu->stat.pf_fixed; - kvm_mmu_audit(vcpu, "post page fault (fixed)"); + trace_kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT); spin_unlock(&vcpu->kvm->mmu_lock); return write_pt; @@ -556,10 +667,25 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access, gpa_t gpa = UNMAPPED_GVA; int r; - r = FNAME(walk_addr)(&walker, vcpu, vaddr, - !!(access & PFERR_WRITE_MASK), - !!(access & PFERR_USER_MASK), - !!(access & PFERR_FETCH_MASK)); + r = FNAME(walk_addr)(&walker, vcpu, vaddr, access); + + if (r) { + gpa = gfn_to_gpa(walker.gfn); + gpa |= vaddr & ~PAGE_MASK; + } else if (error) + *error = walker.error_code; + + return gpa; +} + +static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr, + u32 access, u32 *error) +{ + struct guest_walker walker; + gpa_t gpa = UNMAPPED_GVA; + int r; + + r = FNAME(walk_addr_nested)(&walker, vcpu, vaddr, access); if (r) { gpa = gfn_to_gpa(walker.gfn); @@ -638,7 +764,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, return -EINVAL; gfn = gpte_to_gfn(gpte); - if 
(is_rsvd_bits_set(vcpu, gpte, PT_PAGE_TABLE_LEVEL) + if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL) || gfn != sp->gfns[i] || !is_present_gpte(gpte) || !(gpte & PT_ACCESSED_MASK)) { u64 nonpresent; diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 8a3f9f64f86f..82e144a4e514 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -4,7 +4,7 @@ * AMD SVM support * * Copyright (C) 2006 Qumranet, Inc. - * Copyright 2010 Red Hat, Inc. and/or its affilates. + * Copyright 2010 Red Hat, Inc. and/or its affiliates. * * Authors: * Yaniv Kamay <yaniv@qumranet.com> @@ -88,6 +88,14 @@ struct nested_state { /* A VMEXIT is required but not yet emulated */ bool exit_required; + /* + * If we vmexit during an instruction emulation we need this to restore + * the l1 guest rip after the emulation + */ + unsigned long vmexit_rip; + unsigned long vmexit_rsp; + unsigned long vmexit_rax; + /* cache for intercepts of the guest */ u16 intercept_cr_read; u16 intercept_cr_write; @@ -96,6 +104,8 @@ struct nested_state { u32 intercept_exceptions; u64 intercept; + /* Nested Paging related state */ + u64 nested_cr3; }; #define MSRPM_OFFSETS 16 @@ -284,6 +294,15 @@ static inline void flush_guest_tlb(struct kvm_vcpu *vcpu) force_new_asid(vcpu); } +static int get_npt_level(void) +{ +#ifdef CONFIG_X86_64 + return PT64_ROOT_LEVEL; +#else + return PT32E_ROOT_LEVEL; +#endif +} + static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) { vcpu->arch.efer = efer; @@ -701,6 +720,29 @@ static void init_sys_seg(struct vmcb_seg *seg, uint32_t type) seg->base = 0; } +static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) +{ + struct vcpu_svm *svm = to_svm(vcpu); + u64 g_tsc_offset = 0; + + if (is_nested(svm)) { + g_tsc_offset = svm->vmcb->control.tsc_offset - + svm->nested.hsave->control.tsc_offset; + svm->nested.hsave->control.tsc_offset = offset; + } + + svm->vmcb->control.tsc_offset = offset + g_tsc_offset; +} + +static void svm_adjust_tsc_offset(struct kvm_vcpu 
*vcpu, s64 adjustment) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + svm->vmcb->control.tsc_offset += adjustment; + if (is_nested(svm)) + svm->nested.hsave->control.tsc_offset += adjustment; +} + static void init_vmcb(struct vcpu_svm *svm) { struct vmcb_control_area *control = &svm->vmcb->control; @@ -793,7 +835,7 @@ static void init_vmcb(struct vcpu_svm *svm) init_sys_seg(&save->ldtr, SEG_TYPE_LDT); init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16); - save->efer = EFER_SVME; + svm_set_efer(&svm->vcpu, 0); save->dr6 = 0xffff0ff0; save->dr7 = 0x400; save->rflags = 2; @@ -804,8 +846,8 @@ static void init_vmcb(struct vcpu_svm *svm) * This is the guest-visible cr0 value. * svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0. */ - svm->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET; - (void)kvm_set_cr0(&svm->vcpu, svm->vcpu.arch.cr0); + svm->vcpu.arch.cr0 = 0; + (void)kvm_set_cr0(&svm->vcpu, X86_CR0_NW | X86_CR0_CD | X86_CR0_ET); save->cr4 = X86_CR4_PAE; /* rdx = ?? */ @@ -901,7 +943,7 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT; svm->asid_generation = 0; init_vmcb(svm); - svm->vmcb->control.tsc_offset = 0-native_read_tsc(); + kvm_write_tsc(&svm->vcpu, 0); err = fx_init(&svm->vcpu); if (err) @@ -947,20 +989,6 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) int i; if (unlikely(cpu != vcpu->cpu)) { - u64 delta; - - if (check_tsc_unstable()) { - /* - * Make sure that the guest sees a monotonically - * increasing TSC. 
- */ - delta = vcpu->arch.host_tsc - native_read_tsc(); - svm->vmcb->control.tsc_offset += delta; - if (is_nested(svm)) - svm->nested.hsave->control.tsc_offset += delta; - } - vcpu->cpu = cpu; - kvm_migrate_timers(vcpu); svm->asid_generation = 0; } @@ -976,8 +1004,6 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu) ++vcpu->stat.host_state_reload; for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]); - - vcpu->arch.host_tsc = native_read_tsc(); } static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu) @@ -995,7 +1021,7 @@ static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) switch (reg) { case VCPU_EXREG_PDPTR: BUG_ON(!npt_enabled); - load_pdptrs(vcpu, vcpu->arch.cr3); + load_pdptrs(vcpu, vcpu->arch.walk_mmu, vcpu->arch.cr3); break; default: BUG(); @@ -1206,8 +1232,12 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) if (old == new) { /* cr0 write with ts and mp unchanged */ svm->vmcb->control.exit_code = SVM_EXIT_CR0_SEL_WRITE; - if (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE) + if (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE) { + svm->nested.vmexit_rip = kvm_rip_read(vcpu); + svm->nested.vmexit_rsp = kvm_register_read(vcpu, VCPU_REGS_RSP); + svm->nested.vmexit_rax = kvm_register_read(vcpu, VCPU_REGS_RAX); return; + } } } @@ -1581,6 +1611,54 @@ static int vmmcall_interception(struct vcpu_svm *svm) return 1; } +static unsigned long nested_svm_get_tdp_cr3(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + return svm->nested.nested_cr3; +} + +static void nested_svm_set_tdp_cr3(struct kvm_vcpu *vcpu, + unsigned long root) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + svm->vmcb->control.nested_cr3 = root; + force_new_asid(vcpu); +} + +static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + svm->vmcb->control.exit_code = SVM_EXIT_NPF; + svm->vmcb->control.exit_code_hi = 0; + svm->vmcb->control.exit_info_1 = 
vcpu->arch.fault.error_code; + svm->vmcb->control.exit_info_2 = vcpu->arch.fault.address; + + nested_svm_vmexit(svm); +} + +static int nested_svm_init_mmu_context(struct kvm_vcpu *vcpu) +{ + int r; + + r = kvm_init_shadow_mmu(vcpu, &vcpu->arch.mmu); + + vcpu->arch.mmu.set_cr3 = nested_svm_set_tdp_cr3; + vcpu->arch.mmu.get_cr3 = nested_svm_get_tdp_cr3; + vcpu->arch.mmu.inject_page_fault = nested_svm_inject_npf_exit; + vcpu->arch.mmu.shadow_root_level = get_npt_level(); + vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu; + + return r; +} + +static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu) +{ + vcpu->arch.walk_mmu = &vcpu->arch.mmu; +} + static int nested_svm_check_permissions(struct vcpu_svm *svm) { if (!(svm->vcpu.arch.efer & EFER_SVME) @@ -1629,6 +1707,14 @@ static inline bool nested_svm_intr(struct vcpu_svm *svm) if (!(svm->vcpu.arch.hflags & HF_HIF_MASK)) return false; + /* + * if vmexit was already requested (by intercepted exception + * for instance) do not overwrite it with "external interrupt" + * vmexit. 
+ */ + if (svm->nested.exit_required) + return false; + svm->vmcb->control.exit_code = SVM_EXIT_INTR; svm->vmcb->control.exit_info_1 = 0; svm->vmcb->control.exit_info_2 = 0; @@ -1896,6 +1982,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm) nested_vmcb->save.ds = vmcb->save.ds; nested_vmcb->save.gdtr = vmcb->save.gdtr; nested_vmcb->save.idtr = vmcb->save.idtr; + nested_vmcb->save.efer = svm->vcpu.arch.efer; nested_vmcb->save.cr0 = kvm_read_cr0(&svm->vcpu); nested_vmcb->save.cr3 = svm->vcpu.arch.cr3; nested_vmcb->save.cr2 = vmcb->save.cr2; @@ -1917,6 +2004,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm) nested_vmcb->control.exit_info_2 = vmcb->control.exit_info_2; nested_vmcb->control.exit_int_info = vmcb->control.exit_int_info; nested_vmcb->control.exit_int_info_err = vmcb->control.exit_int_info_err; + nested_vmcb->control.next_rip = vmcb->control.next_rip; /* * If we emulate a VMRUN/#VMEXIT in the same host #vmexit cycle we have @@ -1947,6 +2035,8 @@ static int nested_svm_vmexit(struct vcpu_svm *svm) kvm_clear_exception_queue(&svm->vcpu); kvm_clear_interrupt_queue(&svm->vcpu); + svm->nested.nested_cr3 = 0; + /* Restore selected save entries */ svm->vmcb->save.es = hsave->save.es; svm->vmcb->save.cs = hsave->save.cs; @@ -1973,6 +2063,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm) nested_svm_unmap(page); + nested_svm_uninit_mmu_context(&svm->vcpu); kvm_mmu_reset_context(&svm->vcpu); kvm_mmu_load(&svm->vcpu); @@ -2012,6 +2103,20 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm) return true; } +static bool nested_vmcb_checks(struct vmcb *vmcb) +{ + if ((vmcb->control.intercept & (1ULL << INTERCEPT_VMRUN)) == 0) + return false; + + if (vmcb->control.asid == 0) + return false; + + if (vmcb->control.nested_ctl && !npt_enabled) + return false; + + return true; +} + static bool nested_svm_vmrun(struct vcpu_svm *svm) { struct vmcb *nested_vmcb; @@ -2026,7 +2131,18 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm) if (!nested_vmcb) return 
false; - trace_kvm_nested_vmrun(svm->vmcb->save.rip - 3, vmcb_gpa, + if (!nested_vmcb_checks(nested_vmcb)) { + nested_vmcb->control.exit_code = SVM_EXIT_ERR; + nested_vmcb->control.exit_code_hi = 0; + nested_vmcb->control.exit_info_1 = 0; + nested_vmcb->control.exit_info_2 = 0; + + nested_svm_unmap(page); + + return false; + } + + trace_kvm_nested_vmrun(svm->vmcb->save.rip, vmcb_gpa, nested_vmcb->save.rip, nested_vmcb->control.int_ctl, nested_vmcb->control.event_inj, @@ -2055,7 +2171,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm) hsave->save.cr0 = kvm_read_cr0(&svm->vcpu); hsave->save.cr4 = svm->vcpu.arch.cr4; hsave->save.rflags = vmcb->save.rflags; - hsave->save.rip = svm->next_rip; + hsave->save.rip = kvm_rip_read(&svm->vcpu); hsave->save.rsp = vmcb->save.rsp; hsave->save.rax = vmcb->save.rax; if (npt_enabled) @@ -2070,6 +2186,12 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm) else svm->vcpu.arch.hflags &= ~HF_HIF_MASK; + if (nested_vmcb->control.nested_ctl) { + kvm_mmu_unload(&svm->vcpu); + svm->nested.nested_cr3 = nested_vmcb->control.nested_cr3; + nested_svm_init_mmu_context(&svm->vcpu); + } + /* Load the nested guest state */ svm->vmcb->save.es = nested_vmcb->save.es; svm->vmcb->save.cs = nested_vmcb->save.cs; @@ -2227,8 +2349,8 @@ static int vmrun_interception(struct vcpu_svm *svm) if (nested_svm_check_permissions(svm)) return 1; - svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; - skip_emulated_instruction(&svm->vcpu); + /* Save rip after vmrun instruction */ + kvm_rip_write(&svm->vcpu, kvm_rip_read(&svm->vcpu) + 3); if (!nested_svm_vmrun(svm)) return 1; @@ -2257,6 +2379,7 @@ static int stgi_interception(struct vcpu_svm *svm) svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; skip_emulated_instruction(&svm->vcpu); + kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); enable_gif(svm); @@ -2399,6 +2522,23 @@ static int emulate_on_interception(struct vcpu_svm *svm) return emulate_instruction(&svm->vcpu, 0, 0, 0) == EMULATE_DONE; } +static int 
cr0_write_interception(struct vcpu_svm *svm) +{ + struct kvm_vcpu *vcpu = &svm->vcpu; + int r; + + r = emulate_instruction(&svm->vcpu, 0, 0, 0); + + if (svm->nested.vmexit_rip) { + kvm_register_write(vcpu, VCPU_REGS_RIP, svm->nested.vmexit_rip); + kvm_register_write(vcpu, VCPU_REGS_RSP, svm->nested.vmexit_rsp); + kvm_register_write(vcpu, VCPU_REGS_RAX, svm->nested.vmexit_rax); + svm->nested.vmexit_rip = 0; + } + + return r == EMULATE_DONE; +} + static int cr8_write_interception(struct vcpu_svm *svm) { struct kvm_run *kvm_run = svm->vcpu.run; @@ -2542,20 +2682,9 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) struct vcpu_svm *svm = to_svm(vcpu); switch (ecx) { - case MSR_IA32_TSC: { - u64 tsc_offset = data - native_read_tsc(); - u64 g_tsc_offset = 0; - - if (is_nested(svm)) { - g_tsc_offset = svm->vmcb->control.tsc_offset - - svm->nested.hsave->control.tsc_offset; - svm->nested.hsave->control.tsc_offset = tsc_offset; - } - - svm->vmcb->control.tsc_offset = tsc_offset + g_tsc_offset; - + case MSR_IA32_TSC: + kvm_write_tsc(vcpu, data); break; - } case MSR_STAR: svm->vmcb->save.star = data; break; @@ -2643,6 +2772,7 @@ static int interrupt_window_interception(struct vcpu_svm *svm) { struct kvm_run *kvm_run = svm->vcpu.run; + kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); svm_clear_vintr(svm); svm->vmcb->control.int_ctl &= ~V_IRQ_MASK; /* @@ -2672,7 +2802,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = { [SVM_EXIT_READ_CR4] = emulate_on_interception, [SVM_EXIT_READ_CR8] = emulate_on_interception, [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception, - [SVM_EXIT_WRITE_CR0] = emulate_on_interception, + [SVM_EXIT_WRITE_CR0] = cr0_write_interception, [SVM_EXIT_WRITE_CR3] = emulate_on_interception, [SVM_EXIT_WRITE_CR4] = emulate_on_interception, [SVM_EXIT_WRITE_CR8] = cr8_write_interception, @@ -2871,7 +3001,8 @@ static int handle_exit(struct kvm_vcpu *vcpu) if (is_external_interrupt(svm->vmcb->control.exit_int_info) && exit_code != 
SVM_EXIT_EXCP_BASE + PF_VECTOR && - exit_code != SVM_EXIT_NPF && exit_code != SVM_EXIT_TASK_SWITCH) + exit_code != SVM_EXIT_NPF && exit_code != SVM_EXIT_TASK_SWITCH && + exit_code != SVM_EXIT_INTR && exit_code != SVM_EXIT_NMI) printk(KERN_ERR "%s: unexpected exit_ini_info 0x%x " "exit_code 0x%x\n", __func__, svm->vmcb->control.exit_int_info, @@ -3088,8 +3219,10 @@ static void svm_complete_interrupts(struct vcpu_svm *svm) svm->int3_injected = 0; - if (svm->vcpu.arch.hflags & HF_IRET_MASK) + if (svm->vcpu.arch.hflags & HF_IRET_MASK) { svm->vcpu.arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK); + kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); + } svm->vcpu.arch.nmi_injected = false; kvm_clear_exception_queue(&svm->vcpu); @@ -3098,6 +3231,8 @@ static void svm_complete_interrupts(struct vcpu_svm *svm) if (!(exitintinfo & SVM_EXITINTINFO_VALID)) return; + kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); + vector = exitintinfo & SVM_EXITINTINFO_VEC_MASK; type = exitintinfo & SVM_EXITINTINFO_TYPE_MASK; @@ -3134,6 +3269,17 @@ static void svm_complete_interrupts(struct vcpu_svm *svm) } } +static void svm_cancel_injection(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + struct vmcb_control_area *control = &svm->vmcb->control; + + control->exit_int_info = control->event_inj; + control->exit_int_info_err = control->event_inj_err; + control->event_inj = 0; + svm_complete_interrupts(svm); +} + #ifdef CONFIG_X86_64 #define R "r" #else @@ -3167,9 +3313,6 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) savesegment(gs, gs_selector); ldt_selector = kvm_read_ldt(); svm->vmcb->save.cr2 = vcpu->arch.cr2; - /* required for live migration with NPT */ - if (npt_enabled) - svm->vmcb->save.cr3 = vcpu->arch.cr3; clgi(); @@ -3291,16 +3434,22 @@ static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root) { struct vcpu_svm *svm = to_svm(vcpu); - if (npt_enabled) { - svm->vmcb->control.nested_cr3 = root; - force_new_asid(vcpu); - return; - } - svm->vmcb->save.cr3 = root; 
force_new_asid(vcpu); } +static void set_tdp_cr3(struct kvm_vcpu *vcpu, unsigned long root) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + svm->vmcb->control.nested_cr3 = root; + + /* Also sync guest cr3 here in case we live migrate */ + svm->vmcb->save.cr3 = vcpu->arch.cr3; + + force_new_asid(vcpu); +} + static int is_disabled(void) { u64 vm_cr; @@ -3333,15 +3482,6 @@ static bool svm_cpu_has_accelerated_tpr(void) return false; } -static int get_npt_level(void) -{ -#ifdef CONFIG_X86_64 - return PT64_ROOT_LEVEL; -#else - return PT32E_ROOT_LEVEL; -#endif -} - static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) { return 0; @@ -3354,12 +3494,25 @@ static void svm_cpuid_update(struct kvm_vcpu *vcpu) static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) { switch (func) { + case 0x80000001: + if (nested) + entry->ecx |= (1 << 2); /* Set SVM bit */ + break; case 0x8000000A: entry->eax = 1; /* SVM revision 1 */ entry->ebx = 8; /* Lets support 8 ASIDs in case we add proper ASID emulation to nested SVM */ entry->ecx = 0; /* Reserved */ - entry->edx = 0; /* Do not support any additional features */ + entry->edx = 0; /* Per default do not support any + additional features */ + + /* Support next_rip if host supports it */ + if (svm_has(SVM_FEATURE_NRIP)) + entry->edx |= SVM_FEATURE_NRIP; + + /* Support NPT for the guest if enabled */ + if (npt_enabled) + entry->edx |= SVM_FEATURE_NPT; break; } @@ -3497,6 +3650,7 @@ static struct kvm_x86_ops svm_x86_ops = { .set_irq = svm_set_irq, .set_nmi = svm_inject_nmi, .queue_exception = svm_queue_exception, + .cancel_injection = svm_cancel_injection, .interrupt_allowed = svm_interrupt_allowed, .nmi_allowed = svm_nmi_allowed, .get_nmi_mask = svm_get_nmi_mask, @@ -3519,6 +3673,11 @@ static struct kvm_x86_ops svm_x86_ops = { .set_supported_cpuid = svm_set_supported_cpuid, .has_wbinvd_exit = svm_has_wbinvd_exit, + + .write_tsc_offset = svm_write_tsc_offset, + .adjust_tsc_offset = 
svm_adjust_tsc_offset, + + .set_tdp_cr3 = set_tdp_cr3, }; static int __init svm_init(void) diff --git a/arch/x86/kvm/timer.c b/arch/x86/kvm/timer.c index e16a0dbe74d8..fc7a101c4a35 100644 --- a/arch/x86/kvm/timer.c +++ b/arch/x86/kvm/timer.c @@ -6,7 +6,7 @@ * * timer support * - * Copyright 2010 Red Hat, Inc. and/or its affilates. + * Copyright 2010 Red Hat, Inc. and/or its affiliates. * * This work is licensed under the terms of the GNU GPL, version 2. See * the COPYING file in the top-level directory. diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 7bddfab12013..8da0e45ff7c9 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -5,7 +5,7 @@ * machines without emulation or binary translation. * * Copyright (C) 2006 Qumranet, Inc. - * Copyright 2010 Red Hat, Inc. and/or its affilates. + * Copyright 2010 Red Hat, Inc. and/or its affiliates. * * Authors: * Avi Kivity <avi@qumranet.com> @@ -125,6 +125,7 @@ struct vcpu_vmx { unsigned long host_rsp; int launched; u8 fail; + u32 exit_intr_info; u32 idt_vectoring_info; struct shared_msr_entry *guest_msrs; int nmsrs; @@ -154,11 +155,6 @@ struct vcpu_vmx { u32 limit; u32 ar; } tr, es, ds, fs, gs; - struct { - bool pending; - u8 vector; - unsigned rip; - } irq; } rmode; int vpid; bool emulation_required; @@ -505,7 +501,6 @@ static void __vcpu_clear(void *arg) vmcs_clear(vmx->vmcs); if (per_cpu(current_vmcs, cpu) == vmx->vmcs) per_cpu(current_vmcs, cpu) = NULL; - rdtscll(vmx->vcpu.arch.host_tsc); list_del(&vmx->local_vcpus_link); vmx->vcpu.cpu = -1; vmx->launched = 0; @@ -706,11 +701,10 @@ static void reload_tss(void) /* * VT restores TR but not its size. Useless. 
*/ - struct desc_ptr gdt; + struct desc_ptr *gdt = &__get_cpu_var(host_gdt); struct desc_struct *descs; - native_store_gdt(&gdt); - descs = (void *)gdt.address; + descs = (void *)gdt->address; descs[GDT_ENTRY_TSS].type = 9; /* available TSS */ load_TR_desc(); } @@ -753,7 +747,7 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset) static unsigned long segment_base(u16 selector) { - struct desc_ptr gdt; + struct desc_ptr *gdt = &__get_cpu_var(host_gdt); struct desc_struct *d; unsigned long table_base; unsigned long v; @@ -761,8 +755,7 @@ static unsigned long segment_base(u16 selector) if (!(selector & ~3)) return 0; - native_store_gdt(&gdt); - table_base = gdt.address; + table_base = gdt->address; if (selector & 4) { /* from ldt */ u16 ldt_selector = kvm_read_ldt(); @@ -883,7 +876,6 @@ static void vmx_load_host_state(struct vcpu_vmx *vmx) static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - u64 tsc_this, delta, new_offset; u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); if (!vmm_exclusive) @@ -897,37 +889,24 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) } if (vcpu->cpu != cpu) { - struct desc_ptr dt; + struct desc_ptr *gdt = &__get_cpu_var(host_gdt); unsigned long sysenter_esp; - kvm_migrate_timers(vcpu); kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); local_irq_disable(); list_add(&vmx->local_vcpus_link, &per_cpu(vcpus_on_cpu, cpu)); local_irq_enable(); - vcpu->cpu = cpu; /* * Linux uses per-cpu TSS and GDT, so set these when switching * processors. */ vmcs_writel(HOST_TR_BASE, kvm_read_tr_base()); /* 22.2.4 */ - native_store_gdt(&dt); - vmcs_writel(HOST_GDTR_BASE, dt.address); /* 22.2.4 */ + vmcs_writel(HOST_GDTR_BASE, gdt->address); /* 22.2.4 */ rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp); vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */ - - /* - * Make sure the time stamp counter is monotonous. 
- */ - rdtscll(tsc_this); - if (tsc_this < vcpu->arch.host_tsc) { - delta = vcpu->arch.host_tsc - tsc_this; - new_offset = vmcs_read64(TSC_OFFSET) + delta; - vmcs_write64(TSC_OFFSET, new_offset); - } } } @@ -1044,16 +1023,8 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, } if (vmx->rmode.vm86_active) { - vmx->rmode.irq.pending = true; - vmx->rmode.irq.vector = nr; - vmx->rmode.irq.rip = kvm_rip_read(vcpu); - if (kvm_exception_is_soft(nr)) - vmx->rmode.irq.rip += - vmx->vcpu.arch.event_exit_inst_len; - intr_info |= INTR_TYPE_SOFT_INTR; - vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info); - vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1); - kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1); + if (kvm_inject_realmode_interrupt(vcpu, nr) != EMULATE_DONE) + kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); return; } @@ -1149,12 +1120,17 @@ static u64 guest_read_tsc(void) } /* - * writes 'guest_tsc' into guest's timestamp counter "register" - * guest_tsc = host_tsc + tsc_offset ==> tsc_offset = guest_tsc - host_tsc + * writes 'offset' into guest's timestamp counter offset register */ -static void guest_write_tsc(u64 guest_tsc, u64 host_tsc) +static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) +{ + vmcs_write64(TSC_OFFSET, offset); +} + +static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment) { - vmcs_write64(TSC_OFFSET, guest_tsc - host_tsc); + u64 offset = vmcs_read64(TSC_OFFSET); + vmcs_write64(TSC_OFFSET, offset + adjustment); } /* @@ -1227,7 +1203,6 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) { struct vcpu_vmx *vmx = to_vmx(vcpu); struct shared_msr_entry *msr; - u64 host_tsc; int ret = 0; switch (msr_index) { @@ -1257,8 +1232,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) vmcs_writel(GUEST_SYSENTER_ESP, data); break; case MSR_IA32_TSC: - rdtscll(host_tsc); - guest_write_tsc(data, host_tsc); + kvm_write_tsc(vcpu, data); break; case MSR_IA32_CR_PAT: if 
(vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { @@ -1856,20 +1830,20 @@ static void ept_load_pdptrs(struct kvm_vcpu *vcpu) return; if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) { - vmcs_write64(GUEST_PDPTR0, vcpu->arch.pdptrs[0]); - vmcs_write64(GUEST_PDPTR1, vcpu->arch.pdptrs[1]); - vmcs_write64(GUEST_PDPTR2, vcpu->arch.pdptrs[2]); - vmcs_write64(GUEST_PDPTR3, vcpu->arch.pdptrs[3]); + vmcs_write64(GUEST_PDPTR0, vcpu->arch.mmu.pdptrs[0]); + vmcs_write64(GUEST_PDPTR1, vcpu->arch.mmu.pdptrs[1]); + vmcs_write64(GUEST_PDPTR2, vcpu->arch.mmu.pdptrs[2]); + vmcs_write64(GUEST_PDPTR3, vcpu->arch.mmu.pdptrs[3]); } } static void ept_save_pdptrs(struct kvm_vcpu *vcpu) { if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) { - vcpu->arch.pdptrs[0] = vmcs_read64(GUEST_PDPTR0); - vcpu->arch.pdptrs[1] = vmcs_read64(GUEST_PDPTR1); - vcpu->arch.pdptrs[2] = vmcs_read64(GUEST_PDPTR2); - vcpu->arch.pdptrs[3] = vmcs_read64(GUEST_PDPTR3); + vcpu->arch.mmu.pdptrs[0] = vmcs_read64(GUEST_PDPTR0); + vcpu->arch.mmu.pdptrs[1] = vmcs_read64(GUEST_PDPTR1); + vcpu->arch.mmu.pdptrs[2] = vmcs_read64(GUEST_PDPTR2); + vcpu->arch.mmu.pdptrs[3] = vmcs_read64(GUEST_PDPTR3); } __set_bit(VCPU_EXREG_PDPTR, @@ -2515,7 +2489,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) { u32 host_sysenter_cs, msr_low, msr_high; u32 junk; - u64 host_pat, tsc_this, tsc_base; + u64 host_pat; unsigned long a; struct desc_ptr dt; int i; @@ -2656,12 +2630,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE; vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits); - tsc_base = vmx->vcpu.kvm->arch.vm_init_tsc; - rdtscll(tsc_this); - if (tsc_this < vmx->vcpu.kvm->arch.vm_init_tsc) - tsc_base = tsc_this; - - guest_write_tsc(0, tsc_base); + kvm_write_tsc(&vmx->vcpu, 0); return 0; } @@ -2834,16 +2803,8 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu) ++vcpu->stat.irq_injections; if (vmx->rmode.vm86_active) { - 
vmx->rmode.irq.pending = true; - vmx->rmode.irq.vector = irq; - vmx->rmode.irq.rip = kvm_rip_read(vcpu); - if (vcpu->arch.interrupt.soft) - vmx->rmode.irq.rip += - vmx->vcpu.arch.event_exit_inst_len; - vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, - irq | INTR_TYPE_SOFT_INTR | INTR_INFO_VALID_MASK); - vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1); - kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1); + if (kvm_inject_realmode_interrupt(vcpu, irq) != EMULATE_DONE) + kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); return; } intr = irq | INTR_INFO_VALID_MASK; @@ -2875,14 +2836,8 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu) ++vcpu->stat.nmi_injections; if (vmx->rmode.vm86_active) { - vmx->rmode.irq.pending = true; - vmx->rmode.irq.vector = NMI_VECTOR; - vmx->rmode.irq.rip = kvm_rip_read(vcpu); - vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, - NMI_VECTOR | INTR_TYPE_SOFT_INTR | - INTR_INFO_VALID_MASK); - vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1); - kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1); + if (kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR) != EMULATE_DONE) + kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); return; } vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, @@ -3346,6 +3301,7 @@ static int handle_wrmsr(struct kvm_vcpu *vcpu) static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu) { + kvm_make_request(KVM_REQ_EVENT, vcpu); return 1; } @@ -3358,6 +3314,8 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu) cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING; vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); + kvm_make_request(KVM_REQ_EVENT, vcpu); + ++vcpu->stat.irq_window_exits; /* @@ -3614,6 +3572,7 @@ static int handle_nmi_window(struct kvm_vcpu *vcpu) cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING; vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); ++vcpu->stat.nmi_window_exits; + kvm_make_request(KVM_REQ_EVENT, vcpu); return 1; } @@ -3623,8 +3582,17 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) 
struct vcpu_vmx *vmx = to_vmx(vcpu); enum emulation_result err = EMULATE_DONE; int ret = 1; + u32 cpu_exec_ctrl; + bool intr_window_requested; + + cpu_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); + intr_window_requested = cpu_exec_ctrl & CPU_BASED_VIRTUAL_INTR_PENDING; while (!guest_state_valid(vcpu)) { + if (intr_window_requested + && (kvm_get_rflags(&vmx->vcpu) & X86_EFLAGS_IF)) + return handle_interrupt_window(&vmx->vcpu); + err = emulate_instruction(vcpu, 0, 0, 0); if (err == EMULATE_DO_MMIO) { @@ -3790,18 +3758,9 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) vmcs_write32(TPR_THRESHOLD, irr); } -static void vmx_complete_interrupts(struct vcpu_vmx *vmx) +static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx) { - u32 exit_intr_info; - u32 idt_vectoring_info = vmx->idt_vectoring_info; - bool unblock_nmi; - u8 vector; - int type; - bool idtv_info_valid; - - exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); - - vmx->exit_reason = vmcs_read32(VM_EXIT_REASON); + u32 exit_intr_info = vmx->exit_intr_info; /* Handle machine checks before interrupts are enabled */ if ((vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY) @@ -3816,8 +3775,16 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx) asm("int $2"); kvm_after_handle_nmi(&vmx->vcpu); } +} - idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK; +static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx) +{ + u32 exit_intr_info = vmx->exit_intr_info; + bool unblock_nmi; + u8 vector; + bool idtv_info_valid; + + idtv_info_valid = vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK; if (cpu_has_virtual_nmis()) { unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0; @@ -3839,6 +3806,18 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx) } else if (unlikely(vmx->soft_vnmi_blocked)) vmx->vnmi_blocked_time += ktime_to_ns(ktime_sub(ktime_get(), vmx->entry_time)); +} + +static void __vmx_complete_interrupts(struct vcpu_vmx *vmx, + u32 
idt_vectoring_info, + int instr_len_field, + int error_code_field) +{ + u8 vector; + int type; + bool idtv_info_valid; + + idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK; vmx->vcpu.arch.nmi_injected = false; kvm_clear_exception_queue(&vmx->vcpu); @@ -3847,6 +3826,8 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx) if (!idtv_info_valid) return; + kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu); + vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK; type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK; @@ -3863,18 +3844,18 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx) break; case INTR_TYPE_SOFT_EXCEPTION: vmx->vcpu.arch.event_exit_inst_len = - vmcs_read32(VM_EXIT_INSTRUCTION_LEN); + vmcs_read32(instr_len_field); /* fall through */ case INTR_TYPE_HARD_EXCEPTION: if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) { - u32 err = vmcs_read32(IDT_VECTORING_ERROR_CODE); + u32 err = vmcs_read32(error_code_field); kvm_queue_exception_e(&vmx->vcpu, vector, err); } else kvm_queue_exception(&vmx->vcpu, vector); break; case INTR_TYPE_SOFT_INTR: vmx->vcpu.arch.event_exit_inst_len = - vmcs_read32(VM_EXIT_INSTRUCTION_LEN); + vmcs_read32(instr_len_field); /* fall through */ case INTR_TYPE_EXT_INTR: kvm_queue_interrupt(&vmx->vcpu, vector, @@ -3885,27 +3866,21 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx) } } -/* - * Failure to inject an interrupt should give us the information - * in IDT_VECTORING_INFO_FIELD. However, if the failure occurs - * when fetching the interrupt redirection bitmap in the real-mode - * tss, this doesn't happen. So we do it ourselves. 
- */ -static void fixup_rmode_irq(struct vcpu_vmx *vmx) +static void vmx_complete_interrupts(struct vcpu_vmx *vmx) { - vmx->rmode.irq.pending = 0; - if (kvm_rip_read(&vmx->vcpu) + 1 != vmx->rmode.irq.rip) - return; - kvm_rip_write(&vmx->vcpu, vmx->rmode.irq.rip); - if (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) { - vmx->idt_vectoring_info &= ~VECTORING_INFO_TYPE_MASK; - vmx->idt_vectoring_info |= INTR_TYPE_EXT_INTR; - return; - } - vmx->idt_vectoring_info = - VECTORING_INFO_VALID_MASK - | INTR_TYPE_EXT_INTR - | vmx->rmode.irq.vector; + __vmx_complete_interrupts(vmx, vmx->idt_vectoring_info, + VM_EXIT_INSTRUCTION_LEN, + IDT_VECTORING_ERROR_CODE); +} + +static void vmx_cancel_injection(struct kvm_vcpu *vcpu) +{ + __vmx_complete_interrupts(to_vmx(vcpu), + vmcs_read32(VM_ENTRY_INTR_INFO_FIELD), + VM_ENTRY_INSTRUCTION_LEN, + VM_ENTRY_EXCEPTION_ERROR_CODE); + + vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); } #ifdef CONFIG_X86_64 @@ -4032,7 +4007,7 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) #endif [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)) : "cc", "memory" - , R"bx", R"di", R"si" + , R"ax", R"bx", R"di", R"si" #ifdef CONFIG_X86_64 , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15" #endif @@ -4043,12 +4018,15 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) vcpu->arch.regs_dirty = 0; vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); - if (vmx->rmode.irq.pending) - fixup_rmode_irq(vmx); asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS)); vmx->launched = 1; + vmx->exit_reason = vmcs_read32(VM_EXIT_REASON); + vmx->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); + + vmx_complete_atomic_exit(vmx); + vmx_recover_nmi_blocking(vmx); vmx_complete_interrupts(vmx); } @@ -4119,6 +4097,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) cpu = get_cpu(); vmx_vcpu_load(&vmx->vcpu, cpu); + vmx->vcpu.cpu = cpu; err = vmx_vcpu_setup(vmx); vmx_vcpu_put(&vmx->vcpu); put_cpu(); @@ -4334,6 +4313,7 @@ static struct 
kvm_x86_ops vmx_x86_ops = { .set_irq = vmx_inject_irq, .set_nmi = vmx_inject_nmi, .queue_exception = vmx_queue_exception, + .cancel_injection = vmx_cancel_injection, .interrupt_allowed = vmx_interrupt_allowed, .nmi_allowed = vmx_nmi_allowed, .get_nmi_mask = vmx_get_nmi_mask, @@ -4356,6 +4336,11 @@ static struct kvm_x86_ops vmx_x86_ops = { .set_supported_cpuid = vmx_set_supported_cpuid, .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit, + + .write_tsc_offset = vmx_write_tsc_offset, + .adjust_tsc_offset = vmx_adjust_tsc_offset, + + .set_tdp_cr3 = vmx_set_cr3, }; static int __init vmx_init(void) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6c2ecf0a806d..2288ad829b32 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6,7 +6,7 @@ * Copyright (C) 2006 Qumranet, Inc. * Copyright (C) 2008 Qumranet, Inc. * Copyright IBM Corporation, 2008 - * Copyright 2010 Red Hat, Inc. and/or its affilates. + * Copyright 2010 Red Hat, Inc. and/or its affiliates. * * Authors: * Avi Kivity <avi@qumranet.com> @@ -55,6 +55,8 @@ #include <asm/mce.h> #include <asm/i387.h> #include <asm/xcr.h> +#include <asm/pvclock.h> +#include <asm/div64.h> #define MAX_IO_MSRS 256 #define CR0_RESERVED_BITS \ @@ -71,7 +73,7 @@ #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR) #define KVM_MAX_MCE_BANKS 32 -#define KVM_MCE_CAP_SUPPORTED MCG_CTL_P +#define KVM_MCE_CAP_SUPPORTED (MCG_CTL_P | MCG_SER_P) /* EFER defaults: * - enable syscall per default because its emulated by KVM @@ -282,6 +284,8 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu, u32 prev_nr; int class1, class2; + kvm_make_request(KVM_REQ_EVENT, vcpu); + if (!vcpu->arch.exception.pending) { queue: vcpu->arch.exception.pending = true; @@ -327,16 +331,28 @@ void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr) } EXPORT_SYMBOL_GPL(kvm_requeue_exception); -void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr, - u32 error_code) +void kvm_inject_page_fault(struct kvm_vcpu *vcpu) { + unsigned 
error_code = vcpu->arch.fault.error_code; + ++vcpu->stat.pf_guest; - vcpu->arch.cr2 = addr; + vcpu->arch.cr2 = vcpu->arch.fault.address; kvm_queue_exception_e(vcpu, PF_VECTOR, error_code); } +void kvm_propagate_fault(struct kvm_vcpu *vcpu) +{ + if (mmu_is_nested(vcpu) && !vcpu->arch.fault.nested) + vcpu->arch.nested_mmu.inject_page_fault(vcpu); + else + vcpu->arch.mmu.inject_page_fault(vcpu); + + vcpu->arch.fault.nested = false; +} + void kvm_inject_nmi(struct kvm_vcpu *vcpu) { + kvm_make_request(KVM_REQ_EVENT, vcpu); vcpu->arch.nmi_pending = 1; } EXPORT_SYMBOL_GPL(kvm_inject_nmi); @@ -367,18 +383,49 @@ bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl) EXPORT_SYMBOL_GPL(kvm_require_cpl); /* + * This function will be used to read from the physical memory of the currently + * running guest. The difference to kvm_read_guest_page is that this function + * can read from guest physical or from the guest's guest physical memory. + */ +int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, + gfn_t ngfn, void *data, int offset, int len, + u32 access) +{ + gfn_t real_gfn; + gpa_t ngpa; + + ngpa = gfn_to_gpa(ngfn); + real_gfn = mmu->translate_gpa(vcpu, ngpa, access); + if (real_gfn == UNMAPPED_GVA) + return -EFAULT; + + real_gfn = gpa_to_gfn(real_gfn); + + return kvm_read_guest_page(vcpu->kvm, real_gfn, data, offset, len); +} +EXPORT_SYMBOL_GPL(kvm_read_guest_page_mmu); + +int kvm_read_nested_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, + void *data, int offset, int len, u32 access) +{ + return kvm_read_guest_page_mmu(vcpu, vcpu->arch.walk_mmu, gfn, + data, offset, len, access); +} + +/* * Load the pae pdptrs. Return true is they are all valid. 
*/ -int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3) +int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3) { gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT; unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2; int i; int ret; - u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)]; + u64 pdpte[ARRAY_SIZE(mmu->pdptrs)]; - ret = kvm_read_guest_page(vcpu->kvm, pdpt_gfn, pdpte, - offset * sizeof(u64), sizeof(pdpte)); + ret = kvm_read_guest_page_mmu(vcpu, mmu, pdpt_gfn, pdpte, + offset * sizeof(u64), sizeof(pdpte), + PFERR_USER_MASK|PFERR_WRITE_MASK); if (ret < 0) { ret = 0; goto out; @@ -392,7 +439,7 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3) } ret = 1; - memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs)); + memcpy(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs)); __set_bit(VCPU_EXREG_PDPTR, (unsigned long *)&vcpu->arch.regs_avail); __set_bit(VCPU_EXREG_PDPTR, @@ -405,8 +452,10 @@ EXPORT_SYMBOL_GPL(load_pdptrs); static bool pdptrs_changed(struct kvm_vcpu *vcpu) { - u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)]; + u64 pdpte[ARRAY_SIZE(vcpu->arch.walk_mmu->pdptrs)]; bool changed = true; + int offset; + gfn_t gfn; int r; if (is_long_mode(vcpu) || !is_pae(vcpu)) @@ -416,10 +465,13 @@ static bool pdptrs_changed(struct kvm_vcpu *vcpu) (unsigned long *)&vcpu->arch.regs_avail)) return true; - r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte)); + gfn = (vcpu->arch.cr3 & ~31u) >> PAGE_SHIFT; + offset = (vcpu->arch.cr3 & ~31u) & (PAGE_SIZE - 1); + r = kvm_read_nested_guest_page(vcpu, gfn, pdpte, offset, sizeof(pdpte), + PFERR_USER_MASK | PFERR_WRITE_MASK); if (r < 0) goto out; - changed = memcmp(pdpte, vcpu->arch.pdptrs, sizeof(pdpte)) != 0; + changed = memcmp(pdpte, vcpu->arch.walk_mmu->pdptrs, sizeof(pdpte)) != 0; out: return changed; @@ -458,7 +510,8 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) return 1; } else #endif - if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) + if (is_pae(vcpu) && 
!load_pdptrs(vcpu, vcpu->arch.walk_mmu, + vcpu->arch.cr3)) return 1; } @@ -547,7 +600,7 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) return 1; } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE) && ((cr4 ^ old_cr4) & pdptr_bits) - && !load_pdptrs(vcpu, vcpu->arch.cr3)) + && !load_pdptrs(vcpu, vcpu->arch.walk_mmu, vcpu->arch.cr3)) return 1; if (cr4 & X86_CR4_VMXE) @@ -580,7 +633,8 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) if (is_pae(vcpu)) { if (cr3 & CR3_PAE_RESERVED_BITS) return 1; - if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) + if (is_paging(vcpu) && + !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)) return 1; } /* @@ -737,7 +791,7 @@ static u32 msrs_to_save[] = { #ifdef CONFIG_X86_64 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, #endif - MSR_IA32_TSC, MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA + MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA }; static unsigned num_msrs_to_save; @@ -838,7 +892,7 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) /* * The guest calculates current wall clock time by adding - * system time (updated by kvm_write_guest_time below) to the + * system time (updated by kvm_guest_time_update below) to the * wall clock specified here. guest system time equals host * system time for us, thus we must fill in host boot time here. 
*/ @@ -866,65 +920,229 @@ static uint32_t div_frac(uint32_t dividend, uint32_t divisor) return quotient; } -static void kvm_set_time_scale(uint32_t tsc_khz, struct pvclock_vcpu_time_info *hv_clock) +static void kvm_get_time_scale(uint32_t scaled_khz, uint32_t base_khz, + s8 *pshift, u32 *pmultiplier) { - uint64_t nsecs = 1000000000LL; + uint64_t scaled64; int32_t shift = 0; uint64_t tps64; uint32_t tps32; - tps64 = tsc_khz * 1000LL; - while (tps64 > nsecs*2) { + tps64 = base_khz * 1000LL; + scaled64 = scaled_khz * 1000LL; + while (tps64 > scaled64*2 || tps64 & 0xffffffff00000000ULL) { tps64 >>= 1; shift--; } tps32 = (uint32_t)tps64; - while (tps32 <= (uint32_t)nsecs) { - tps32 <<= 1; + while (tps32 <= scaled64 || scaled64 & 0xffffffff00000000ULL) { + if (scaled64 & 0xffffffff00000000ULL || tps32 & 0x80000000) + scaled64 >>= 1; + else + tps32 <<= 1; shift++; } - hv_clock->tsc_shift = shift; - hv_clock->tsc_to_system_mul = div_frac(nsecs, tps32); + *pshift = shift; + *pmultiplier = div_frac(scaled64, tps32); - pr_debug("%s: tsc_khz %u, tsc_shift %d, tsc_mul %u\n", - __func__, tsc_khz, hv_clock->tsc_shift, - hv_clock->tsc_to_system_mul); + pr_debug("%s: base_khz %u => %u, shift %d, mul %u\n", + __func__, base_khz, scaled_khz, shift, *pmultiplier); +} + +static inline u64 get_kernel_ns(void) +{ + struct timespec ts; + + WARN_ON(preemptible()); + ktime_get_ts(&ts); + monotonic_to_bootbased(&ts); + return timespec_to_ns(&ts); } static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz); +unsigned long max_tsc_khz; -static void kvm_write_guest_time(struct kvm_vcpu *v) +static inline int kvm_tsc_changes_freq(void) +{ + int cpu = get_cpu(); + int ret = !boot_cpu_has(X86_FEATURE_CONSTANT_TSC) && + cpufreq_quick_get(cpu) != 0; + put_cpu(); + return ret; +} + +static inline u64 nsec_to_cycles(u64 nsec) +{ + u64 ret; + + WARN_ON(preemptible()); + if (kvm_tsc_changes_freq()) + printk_once(KERN_WARNING + "kvm: unreliable cycle conversion on adjustable rate TSC\n"); + ret = nsec * 
__get_cpu_var(cpu_tsc_khz); + do_div(ret, USEC_PER_SEC); + return ret; +} + +static void kvm_arch_set_tsc_khz(struct kvm *kvm, u32 this_tsc_khz) +{ + /* Compute a scale to convert nanoseconds in TSC cycles */ + kvm_get_time_scale(this_tsc_khz, NSEC_PER_SEC / 1000, + &kvm->arch.virtual_tsc_shift, + &kvm->arch.virtual_tsc_mult); + kvm->arch.virtual_tsc_khz = this_tsc_khz; +} + +static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns) +{ + u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.last_tsc_nsec, + vcpu->kvm->arch.virtual_tsc_mult, + vcpu->kvm->arch.virtual_tsc_shift); + tsc += vcpu->arch.last_tsc_write; + return tsc; +} + +void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data) +{ + struct kvm *kvm = vcpu->kvm; + u64 offset, ns, elapsed; + unsigned long flags; + s64 sdiff; + + spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); + offset = data - native_read_tsc(); + ns = get_kernel_ns(); + elapsed = ns - kvm->arch.last_tsc_nsec; + sdiff = data - kvm->arch.last_tsc_write; + if (sdiff < 0) + sdiff = -sdiff; + + /* + * Special case: close write to TSC within 5 seconds of + * another CPU is interpreted as an attempt to synchronize + * The 5 seconds is to accomodate host load / swapping as + * well as any reset of TSC during the boot process. + * + * In that case, for a reliable TSC, we can match TSC offsets, + * or make a best guest using elapsed value. 
+ */ + if (sdiff < nsec_to_cycles(5ULL * NSEC_PER_SEC) && + elapsed < 5ULL * NSEC_PER_SEC) { + if (!check_tsc_unstable()) { + offset = kvm->arch.last_tsc_offset; + pr_debug("kvm: matched tsc offset for %llu\n", data); + } else { + u64 delta = nsec_to_cycles(elapsed); + offset += delta; + pr_debug("kvm: adjusted tsc offset by %llu\n", delta); + } + ns = kvm->arch.last_tsc_nsec; + } + kvm->arch.last_tsc_nsec = ns; + kvm->arch.last_tsc_write = data; + kvm->arch.last_tsc_offset = offset; + kvm_x86_ops->write_tsc_offset(vcpu, offset); + spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); + + /* Reset of TSC must disable overshoot protection below */ + vcpu->arch.hv_clock.tsc_timestamp = 0; + vcpu->arch.last_tsc_write = data; + vcpu->arch.last_tsc_nsec = ns; +} +EXPORT_SYMBOL_GPL(kvm_write_tsc); + +static int kvm_guest_time_update(struct kvm_vcpu *v) { - struct timespec ts; unsigned long flags; struct kvm_vcpu_arch *vcpu = &v->arch; void *shared_kaddr; unsigned long this_tsc_khz; + s64 kernel_ns, max_kernel_ns; + u64 tsc_timestamp; - if ((!vcpu->time_page)) - return; + /* Keep irq disabled to prevent changes to the clock */ + local_irq_save(flags); + kvm_get_msr(v, MSR_IA32_TSC, &tsc_timestamp); + kernel_ns = get_kernel_ns(); + this_tsc_khz = __get_cpu_var(cpu_tsc_khz); - this_tsc_khz = get_cpu_var(cpu_tsc_khz); - if (unlikely(vcpu->hv_clock_tsc_khz != this_tsc_khz)) { - kvm_set_time_scale(this_tsc_khz, &vcpu->hv_clock); - vcpu->hv_clock_tsc_khz = this_tsc_khz; + if (unlikely(this_tsc_khz == 0)) { + local_irq_restore(flags); + kvm_make_request(KVM_REQ_CLOCK_UPDATE, v); + return 1; + } + + /* + * We may have to catch up the TSC to match elapsed wall clock + * time for two reasons, even if kvmclock is used. + * 1) CPU could have been running below the maximum TSC rate + * 2) Broken TSC compensation resets the base at each VCPU + * entry to avoid unknown leaps of TSC even when running + * again on the same CPU. 
This may cause apparent elapsed + * time to disappear, and the guest to stand still or run + * very slowly. + */ + if (vcpu->tsc_catchup) { + u64 tsc = compute_guest_tsc(v, kernel_ns); + if (tsc > tsc_timestamp) { + kvm_x86_ops->adjust_tsc_offset(v, tsc - tsc_timestamp); + tsc_timestamp = tsc; + } } - put_cpu_var(cpu_tsc_khz); - /* Keep irq disabled to prevent changes to the clock */ - local_irq_save(flags); - kvm_get_msr(v, MSR_IA32_TSC, &vcpu->hv_clock.tsc_timestamp); - ktime_get_ts(&ts); - monotonic_to_bootbased(&ts); local_irq_restore(flags); - /* With all the info we got, fill in the values */ + if (!vcpu->time_page) + return 0; - vcpu->hv_clock.system_time = ts.tv_nsec + - (NSEC_PER_SEC * (u64)ts.tv_sec) + v->kvm->arch.kvmclock_offset; + /* + * Time as measured by the TSC may go backwards when resetting the base + * tsc_timestamp. The reason for this is that the TSC resolution is + * higher than the resolution of the other clock scales. Thus, many + * possible measurments of the TSC correspond to one measurement of any + * other clock, and so a spread of values is possible. This is not a + * problem for the computation of the nanosecond clock; with TSC rates + * around 1GHZ, there can only be a few cycles which correspond to one + * nanosecond value, and any path through this code will inevitably + * take longer than that. However, with the kernel_ns value itself, + * the precision may be much lower, down to HZ granularity. If the + * first sampling of TSC against kernel_ns ends in the low part of the + * range, and the second in the high end of the range, we can get: + * + * (TSC - offset_low) * S + kns_old > (TSC - offset_high) * S + kns_new + * + * As the sampling errors potentially range in the thousands of cycles, + * it is possible such a time value has already been observed by the + * guest. To protect against this, we must compute the system time as + * observed by the guest and ensure the new system time is greater. 
+ */ + max_kernel_ns = 0; + if (vcpu->hv_clock.tsc_timestamp && vcpu->last_guest_tsc) { + max_kernel_ns = vcpu->last_guest_tsc - + vcpu->hv_clock.tsc_timestamp; + max_kernel_ns = pvclock_scale_delta(max_kernel_ns, + vcpu->hv_clock.tsc_to_system_mul, + vcpu->hv_clock.tsc_shift); + max_kernel_ns += vcpu->last_kernel_ns; + } + if (unlikely(vcpu->hw_tsc_khz != this_tsc_khz)) { + kvm_get_time_scale(NSEC_PER_SEC / 1000, this_tsc_khz, + &vcpu->hv_clock.tsc_shift, + &vcpu->hv_clock.tsc_to_system_mul); + vcpu->hw_tsc_khz = this_tsc_khz; + } + + if (max_kernel_ns > kernel_ns) + kernel_ns = max_kernel_ns; + + /* With all the info we got, fill in the values */ + vcpu->hv_clock.tsc_timestamp = tsc_timestamp; + vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset; + vcpu->last_kernel_ns = kernel_ns; + vcpu->last_guest_tsc = tsc_timestamp; vcpu->hv_clock.flags = 0; /* @@ -942,16 +1160,7 @@ static void kvm_write_guest_time(struct kvm_vcpu *v) kunmap_atomic(shared_kaddr, KM_USER0); mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT); -} - -static int kvm_request_guest_time_update(struct kvm_vcpu *v) -{ - struct kvm_vcpu_arch *vcpu = &v->arch; - - if (!vcpu->time_page) - return 0; - kvm_make_request(KVM_REQ_KVMCLOCK_UPDATE, v); - return 1; + return 0; } static bool msr_mtrr_valid(unsigned msr) @@ -1277,6 +1486,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) } vcpu->arch.time = data; + kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); /* we verify if the enable bit is set... 
*/ if (!(data & 1)) @@ -1292,8 +1502,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) kvm_release_page_clean(vcpu->arch.time_page); vcpu->arch.time_page = NULL; } - - kvm_request_guest_time_update(vcpu); break; } case MSR_IA32_MCG_CTL: @@ -1330,6 +1538,16 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) pr_unimpl(vcpu, "unimplemented perfctr wrmsr: " "0x%x data 0x%llx\n", msr, data); break; + case MSR_K7_CLK_CTL: + /* + * Ignore all writes to this no longer documented MSR. + * Writes are only relevant for old K7 processors, + * all pre-dating SVM, but a recommended workaround from + * AMD for these chips. It is possible to speicify the + * affected processor models on the command line, hence + * the need to ignore the workaround. + */ + break; case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15: if (kvm_hv_msr_partition_wide(msr)) { int r; @@ -1522,6 +1740,20 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case 0xcd: /* fsb frequency */ data = 3; break; + /* + * MSR_EBC_FREQUENCY_ID + * Conservative value valid for even the basic CPU models. + * Models 0,1: 000 in bits 23:21 indicating a bus speed of + * 100MHz, model 2 000 in bits 18:16 indicating 100MHz, + * and 266MHz for model 3, or 4. Set Core Clock + * Frequency to System Bus Frequency Ratio to 1 (bits + * 31:24) even though these are only valid for CPU + * models > 2, however guests may end up dividing or + * multiplying by zero otherwise. + */ + case MSR_EBC_FREQUENCY_ID: + data = 1 << 24; + break; case MSR_IA32_APICBASE: data = kvm_get_apic_base(vcpu); break; @@ -1555,6 +1787,18 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case MSR_IA32_MCG_STATUS: case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1: return get_msr_mce(vcpu, msr, pdata); + case MSR_K7_CLK_CTL: + /* + * Provide expected ramp-up count for K7. All other + * are set to zero, indicating minimum divisors for + * every field. 
+ * + * This prevents guest kernels on AMD host with CPU + * type 6, model 8 and higher from exploding due to + * the rdmsr failing. + */ + data = 0x20000000; + break; case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15: if (kvm_hv_msr_partition_wide(msr)) { int r; @@ -1808,19 +2052,28 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) } kvm_x86_ops->vcpu_load(vcpu, cpu); - if (unlikely(per_cpu(cpu_tsc_khz, cpu) == 0)) { - unsigned long khz = cpufreq_quick_get(cpu); - if (!khz) - khz = tsc_khz; - per_cpu(cpu_tsc_khz, cpu) = khz; + if (unlikely(vcpu->cpu != cpu) || check_tsc_unstable()) { + /* Make sure TSC doesn't go backwards */ + s64 tsc_delta = !vcpu->arch.last_host_tsc ? 0 : + native_read_tsc() - vcpu->arch.last_host_tsc; + if (tsc_delta < 0) + mark_tsc_unstable("KVM discovered backwards TSC"); + if (check_tsc_unstable()) { + kvm_x86_ops->adjust_tsc_offset(vcpu, -tsc_delta); + vcpu->arch.tsc_catchup = 1; + kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); + } + if (vcpu->cpu != cpu) + kvm_migrate_timers(vcpu); + vcpu->cpu = cpu; } - kvm_request_guest_time_update(vcpu); } void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) { kvm_x86_ops->vcpu_put(vcpu); kvm_put_guest_fpu(vcpu); + vcpu->arch.last_host_tsc = native_read_tsc(); } static int is_efer_nx(void) @@ -1995,7 +2248,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, F(F16C); /* cpuid 0x80000001.ecx */ const u32 kvm_supported_word6_x86_features = - F(LAHF_LM) | F(CMP_LEGACY) | F(SVM) | 0 /* ExtApicSpace */ | + F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ | F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) | F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(XOP) | 0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM); @@ -2204,6 +2457,7 @@ static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, return -ENXIO; kvm_queue_interrupt(vcpu, irq->irq, false); + kvm_make_request(KVM_REQ_EVENT, vcpu); return 0; } @@ -2357,6 +2611,8 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct 
kvm_vcpu *vcpu, if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR) vcpu->arch.sipi_vector = events->sipi_vector; + kvm_make_request(KVM_REQ_EVENT, vcpu); + return 0; } @@ -2760,7 +3016,7 @@ static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm, static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm) { - return kvm->arch.n_alloc_mmu_pages; + return kvm->arch.n_max_mmu_pages; } static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) @@ -2796,18 +3052,18 @@ static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) r = 0; switch (chip->chip_id) { case KVM_IRQCHIP_PIC_MASTER: - raw_spin_lock(&pic_irqchip(kvm)->lock); + spin_lock(&pic_irqchip(kvm)->lock); memcpy(&pic_irqchip(kvm)->pics[0], &chip->chip.pic, sizeof(struct kvm_pic_state)); - raw_spin_unlock(&pic_irqchip(kvm)->lock); + spin_unlock(&pic_irqchip(kvm)->lock); break; case KVM_IRQCHIP_PIC_SLAVE: - raw_spin_lock(&pic_irqchip(kvm)->lock); + spin_lock(&pic_irqchip(kvm)->lock); memcpy(&pic_irqchip(kvm)->pics[1], &chip->chip.pic, sizeof(struct kvm_pic_state)); - raw_spin_unlock(&pic_irqchip(kvm)->lock); + spin_unlock(&pic_irqchip(kvm)->lock); break; case KVM_IRQCHIP_IOAPIC: r = kvm_set_ioapic(kvm, &chip->chip.ioapic); @@ -3201,7 +3457,6 @@ long kvm_arch_vm_ioctl(struct file *filp, break; } case KVM_SET_CLOCK: { - struct timespec now; struct kvm_clock_data user_ns; u64 now_ns; s64 delta; @@ -3215,20 +3470,21 @@ long kvm_arch_vm_ioctl(struct file *filp, goto out; r = 0; - ktime_get_ts(&now); - now_ns = timespec_to_ns(&now); + local_irq_disable(); + now_ns = get_kernel_ns(); delta = user_ns.clock - now_ns; + local_irq_enable(); kvm->arch.kvmclock_offset = delta; break; } case KVM_GET_CLOCK: { - struct timespec now; struct kvm_clock_data user_ns; u64 now_ns; - ktime_get_ts(&now); - now_ns = timespec_to_ns(&now); + local_irq_disable(); + now_ns = get_kernel_ns(); user_ns.clock = kvm->arch.kvmclock_offset + now_ns; + local_irq_enable(); user_ns.flags = 0; r = -EFAULT; @@ 
-3292,30 +3548,51 @@ void kvm_get_segment(struct kvm_vcpu *vcpu, kvm_x86_ops->get_segment(vcpu, var, seg); } +static gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access) +{ + return gpa; +} + +static gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access) +{ + gpa_t t_gpa; + u32 error; + + BUG_ON(!mmu_is_nested(vcpu)); + + /* NPT walks are always user-walks */ + access |= PFERR_USER_MASK; + t_gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gpa, access, &error); + if (t_gpa == UNMAPPED_GVA) + vcpu->arch.fault.nested = true; + + return t_gpa; +} + gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) { u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; - return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error); + return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, error); } gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) { u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; access |= PFERR_FETCH_MASK; - return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error); + return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, error); } gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) { u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? 
PFERR_USER_MASK : 0; access |= PFERR_WRITE_MASK; - return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error); + return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, error); } /* uses this to access any guest's mapped memory without checking CPL */ gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) { - return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, 0, error); + return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, 0, error); } static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes, @@ -3326,7 +3603,8 @@ static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes, int r = X86EMUL_CONTINUE; while (bytes) { - gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr, access, error); + gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, access, + error); unsigned offset = addr & (PAGE_SIZE-1); unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset); int ret; @@ -3381,8 +3659,9 @@ static int kvm_write_guest_virt_system(gva_t addr, void *val, int r = X86EMUL_CONTINUE; while (bytes) { - gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr, - PFERR_WRITE_MASK, error); + gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, + PFERR_WRITE_MASK, + error); unsigned offset = addr & (PAGE_SIZE-1); unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset); int ret; @@ -3624,7 +3903,7 @@ static int emulator_pio_in_emulated(int size, unsigned short port, void *val, if (vcpu->arch.pio.count) goto data_avail; - trace_kvm_pio(1, port, size, 1); + trace_kvm_pio(0, port, size, 1); vcpu->arch.pio.port = port; vcpu->arch.pio.in = 1; @@ -3652,7 +3931,7 @@ static int emulator_pio_out_emulated(int size, unsigned short port, const void *val, unsigned int count, struct kvm_vcpu *vcpu) { - trace_kvm_pio(0, port, size, 1); + trace_kvm_pio(1, port, size, 1); vcpu->arch.pio.port = port; vcpu->arch.pio.in = 0; @@ -3791,6 +4070,11 @@ static void emulator_get_gdt(struct desc_ptr *dt, struct kvm_vcpu *vcpu) kvm_x86_ops->get_gdt(vcpu, 
dt); } +static void emulator_get_idt(struct desc_ptr *dt, struct kvm_vcpu *vcpu) +{ + kvm_x86_ops->get_idt(vcpu, dt); +} + static unsigned long emulator_get_cached_segment_base(int seg, struct kvm_vcpu *vcpu) { @@ -3884,6 +4168,7 @@ static struct x86_emulate_ops emulate_ops = { .set_segment_selector = emulator_set_segment_selector, .get_cached_segment_base = emulator_get_cached_segment_base, .get_gdt = emulator_get_gdt, + .get_idt = emulator_get_idt, .get_cr = emulator_get_cr, .set_cr = emulator_set_cr, .cpl = emulator_get_cpl, @@ -3919,13 +4204,64 @@ static void inject_emulated_exception(struct kvm_vcpu *vcpu) { struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; if (ctxt->exception == PF_VECTOR) - kvm_inject_page_fault(vcpu, ctxt->cr2, ctxt->error_code); + kvm_propagate_fault(vcpu); else if (ctxt->error_code_valid) kvm_queue_exception_e(vcpu, ctxt->exception, ctxt->error_code); else kvm_queue_exception(vcpu, ctxt->exception); } +static void init_emulate_ctxt(struct kvm_vcpu *vcpu) +{ + struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; + int cs_db, cs_l; + + cache_all_regs(vcpu); + + kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); + + vcpu->arch.emulate_ctxt.vcpu = vcpu; + vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu); + vcpu->arch.emulate_ctxt.eip = kvm_rip_read(vcpu); + vcpu->arch.emulate_ctxt.mode = + (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL : + (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM) + ? X86EMUL_MODE_VM86 : cs_l + ? X86EMUL_MODE_PROT64 : cs_db + ? 
X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; + memset(c, 0, sizeof(struct decode_cache)); + memcpy(c->regs, vcpu->arch.regs, sizeof c->regs); +} + +int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq) +{ + struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; + int ret; + + init_emulate_ctxt(vcpu); + + vcpu->arch.emulate_ctxt.decode.op_bytes = 2; + vcpu->arch.emulate_ctxt.decode.ad_bytes = 2; + vcpu->arch.emulate_ctxt.decode.eip = vcpu->arch.emulate_ctxt.eip; + ret = emulate_int_real(&vcpu->arch.emulate_ctxt, &emulate_ops, irq); + + if (ret != X86EMUL_CONTINUE) + return EMULATE_FAIL; + + vcpu->arch.emulate_ctxt.eip = c->eip; + memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); + kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip); + kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); + + if (irq == NMI_VECTOR) + vcpu->arch.nmi_pending = false; + else + vcpu->arch.interrupt.pending = false; + + return EMULATE_DONE; +} +EXPORT_SYMBOL_GPL(kvm_inject_realmode_interrupt); + static int handle_emulation_failure(struct kvm_vcpu *vcpu) { ++vcpu->stat.insn_emulation_fail; @@ -3982,24 +4318,15 @@ int emulate_instruction(struct kvm_vcpu *vcpu, cache_all_regs(vcpu); if (!(emulation_type & EMULTYPE_NO_DECODE)) { - int cs_db, cs_l; - kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); - - vcpu->arch.emulate_ctxt.vcpu = vcpu; - vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu); - vcpu->arch.emulate_ctxt.eip = kvm_rip_read(vcpu); - vcpu->arch.emulate_ctxt.mode = - (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL : - (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM) - ? X86EMUL_MODE_VM86 : cs_l - ? X86EMUL_MODE_PROT64 : cs_db - ? 
X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; - memset(c, 0, sizeof(struct decode_cache)); - memcpy(c->regs, vcpu->arch.regs, sizeof c->regs); + init_emulate_ctxt(vcpu); vcpu->arch.emulate_ctxt.interruptibility = 0; vcpu->arch.emulate_ctxt.exception = -1; + vcpu->arch.emulate_ctxt.perm_ok = false; + + r = x86_decode_insn(&vcpu->arch.emulate_ctxt); + if (r == X86EMUL_PROPAGATE_FAULT) + goto done; - r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); trace_kvm_emulate_insn_start(vcpu); /* Only allow emulation of specific instructions on #UD @@ -4049,41 +4376,39 @@ int emulate_instruction(struct kvm_vcpu *vcpu, memcpy(c->regs, vcpu->arch.regs, sizeof c->regs); restart: - r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); + r = x86_emulate_insn(&vcpu->arch.emulate_ctxt); - if (r) { /* emulation failed */ + if (r == EMULATION_FAILED) { if (reexecute_instruction(vcpu, cr2)) return EMULATE_DONE; return handle_emulation_failure(vcpu); } - toggle_interruptibility(vcpu, vcpu->arch.emulate_ctxt.interruptibility); - kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); - memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); - kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip); - +done: if (vcpu->arch.emulate_ctxt.exception >= 0) { inject_emulated_exception(vcpu); - return EMULATE_DONE; - } - - if (vcpu->arch.pio.count) { + r = EMULATE_DONE; + } else if (vcpu->arch.pio.count) { if (!vcpu->arch.pio.in) vcpu->arch.pio.count = 0; - return EMULATE_DO_MMIO; - } - - if (vcpu->mmio_needed) { + r = EMULATE_DO_MMIO; + } else if (vcpu->mmio_needed) { if (vcpu->mmio_is_write) vcpu->mmio_needed = 0; - return EMULATE_DO_MMIO; - } - - if (vcpu->arch.emulate_ctxt.restart) + r = EMULATE_DO_MMIO; + } else if (r == EMULATION_RESTART) goto restart; + else + r = EMULATE_DONE; - return EMULATE_DONE; + toggle_interruptibility(vcpu, vcpu->arch.emulate_ctxt.interruptibility); + kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); + kvm_make_request(KVM_REQ_EVENT, vcpu); + 
memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); + kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip); + + return r; } EXPORT_SYMBOL_GPL(emulate_instruction); @@ -4097,9 +4422,23 @@ int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port) } EXPORT_SYMBOL_GPL(kvm_fast_pio_out); -static void bounce_off(void *info) +static void tsc_bad(void *info) +{ + __get_cpu_var(cpu_tsc_khz) = 0; +} + +static void tsc_khz_changed(void *data) { - /* nothing */ + struct cpufreq_freqs *freq = data; + unsigned long khz = 0; + + if (data) + khz = freq->new; + else if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) + khz = cpufreq_quick_get(raw_smp_processor_id()); + if (!khz) + khz = tsc_khz; + __get_cpu_var(cpu_tsc_khz) = khz; } static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val, @@ -4110,21 +4449,60 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va struct kvm_vcpu *vcpu; int i, send_ipi = 0; + /* + * We allow guests to temporarily run on slowing clocks, + * provided we notify them after, or to run on accelerating + * clocks, provided we notify them before. Thus time never + * goes backwards. + * + * However, we have a problem. We can't atomically update + * the frequency of a given CPU from this function; it is + * merely a notifier, which can be called from any CPU. + * Changing the TSC frequency at arbitrary points in time + * requires a recomputation of local variables related to + * the TSC for each VCPU. We must flag these local variables + * to be updated and be sure the update takes place with the + * new frequency before any guests proceed. + * + * Unfortunately, the combination of hotplug CPU and frequency + * change creates an intractable locking scenario; the order + * of when these callouts happen is undefined with respect to + * CPU hotplug, and they can race with each other. 
As such, + * merely setting per_cpu(cpu_tsc_khz) = X during a hotadd is + * undefined; you can actually have a CPU frequency change take + * place in between the computation of X and the setting of the + * variable. To protect against this problem, all updates of + * the per_cpu tsc_khz variable are done in an interrupt + * protected IPI, and all callers wishing to update the value + * must wait for a synchronous IPI to complete (which is trivial + * if the caller is on the CPU already). This establishes the + * necessary total order on variable updates. + * + * Note that because a guest time update may take place + * anytime after the setting of the VCPU's request bit, the + * correct TSC value must be set before the request. However, + * to ensure the update actually makes it to any guest which + * starts running in hardware virtualization between the set + * and the acquisition of the spinlock, we must also ping the + * CPU after setting the request bit. + * + */ + if (val == CPUFREQ_PRECHANGE && freq->old > freq->new) return 0; if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new) return 0; - per_cpu(cpu_tsc_khz, freq->cpu) = freq->new; + + smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1); spin_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) { kvm_for_each_vcpu(i, vcpu, kvm) { if (vcpu->cpu != freq->cpu) continue; - if (!kvm_request_guest_time_update(vcpu)) - continue; + kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); if (vcpu->cpu != smp_processor_id()) - send_ipi++; + send_ipi = 1; } } spin_unlock(&kvm_lock); @@ -4142,32 +4520,57 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va * guest context is entered kvmclock will be updated, * so the guest will not see stale values. 
*/ - smp_call_function_single(freq->cpu, bounce_off, NULL, 1); + smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1); } return 0; } static struct notifier_block kvmclock_cpufreq_notifier_block = { - .notifier_call = kvmclock_cpufreq_notifier + .notifier_call = kvmclock_cpufreq_notifier +}; + +static int kvmclock_cpu_notifier(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + unsigned int cpu = (unsigned long)hcpu; + + switch (action) { + case CPU_ONLINE: + case CPU_DOWN_FAILED: + smp_call_function_single(cpu, tsc_khz_changed, NULL, 1); + break; + case CPU_DOWN_PREPARE: + smp_call_function_single(cpu, tsc_bad, NULL, 1); + break; + } + return NOTIFY_OK; +} + +static struct notifier_block kvmclock_cpu_notifier_block = { + .notifier_call = kvmclock_cpu_notifier, + .priority = -INT_MAX }; static void kvm_timer_init(void) { int cpu; + max_tsc_khz = tsc_khz; + register_hotcpu_notifier(&kvmclock_cpu_notifier_block); if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) { +#ifdef CONFIG_CPU_FREQ + struct cpufreq_policy policy; + memset(&policy, 0, sizeof(policy)); + cpufreq_get_policy(&policy, get_cpu()); + if (policy.cpuinfo.max_freq) + max_tsc_khz = policy.cpuinfo.max_freq; +#endif cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block, CPUFREQ_TRANSITION_NOTIFIER); - for_each_online_cpu(cpu) { - unsigned long khz = cpufreq_get(cpu); - if (!khz) - khz = tsc_khz; - per_cpu(cpu_tsc_khz, cpu) = khz; - } - } else { - for_each_possible_cpu(cpu) - per_cpu(cpu_tsc_khz, cpu) = tsc_khz; } + pr_debug("kvm: max_tsc_khz = %ld\n", max_tsc_khz); + for_each_online_cpu(cpu) + smp_call_function_single(cpu, tsc_khz_changed, NULL, 1); } static DEFINE_PER_CPU(struct kvm_vcpu *, current_vcpu); @@ -4269,6 +4672,7 @@ void kvm_arch_exit(void) if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block, CPUFREQ_TRANSITION_NOTIFIER); + unregister_hotcpu_notifier(&kvmclock_cpu_notifier_block); kvm_x86_ops = NULL; 
kvm_mmu_module_exit(); } @@ -4684,8 +5088,11 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_mmu_unload(vcpu); if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu)) __kvm_migrate_timers(vcpu); - if (kvm_check_request(KVM_REQ_KVMCLOCK_UPDATE, vcpu)) - kvm_write_guest_time(vcpu); + if (kvm_check_request(KVM_REQ_CLOCK_UPDATE, vcpu)) { + r = kvm_guest_time_update(vcpu); + if (unlikely(r)) + goto out; + } if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu)) kvm_mmu_sync_roots(vcpu); if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) @@ -4710,6 +5117,21 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (unlikely(r)) goto out; + if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) { + inject_pending_event(vcpu); + + /* enable NMI/IRQ window open exits if needed */ + if (vcpu->arch.nmi_pending) + kvm_x86_ops->enable_nmi_window(vcpu); + else if (kvm_cpu_has_interrupt(vcpu) || req_int_win) + kvm_x86_ops->enable_irq_window(vcpu); + + if (kvm_lapic_enabled(vcpu)) { + update_cr8_intercept(vcpu); + kvm_lapic_sync_to_vapic(vcpu); + } + } + preempt_disable(); kvm_x86_ops->prepare_guest_switch(vcpu); @@ -4728,23 +5150,11 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) smp_wmb(); local_irq_enable(); preempt_enable(); + kvm_x86_ops->cancel_injection(vcpu); r = 1; goto out; } - inject_pending_event(vcpu); - - /* enable NMI/IRQ window open exits if needed */ - if (vcpu->arch.nmi_pending) - kvm_x86_ops->enable_nmi_window(vcpu); - else if (kvm_cpu_has_interrupt(vcpu) || req_int_win) - kvm_x86_ops->enable_irq_window(vcpu); - - if (kvm_lapic_enabled(vcpu)) { - update_cr8_intercept(vcpu); - kvm_lapic_sync_to_vapic(vcpu); - } - srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); kvm_guest_enter(); @@ -4770,6 +5180,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (hw_breakpoint_active()) hw_breakpoint_restore(); + kvm_get_msr(vcpu, MSR_IA32_TSC, &vcpu->arch.last_guest_tsc); + atomic_set(&vcpu->guest_mode, 0); smp_wmb(); local_irq_enable(); @@ -4899,8 +5311,7 @@ int 
kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) if (!irqchip_in_kernel(vcpu->kvm)) kvm_set_cr8(vcpu, kvm_run->cr8); - if (vcpu->arch.pio.count || vcpu->mmio_needed || - vcpu->arch.emulate_ctxt.restart) { + if (vcpu->arch.pio.count || vcpu->mmio_needed) { if (vcpu->mmio_needed) { memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8); vcpu->mmio_read_completed = 1; @@ -4981,6 +5392,8 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) vcpu->arch.exception.pending = false; + kvm_make_request(KVM_REQ_EVENT, vcpu); + return 0; } @@ -5044,6 +5457,7 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, struct kvm_mp_state *mp_state) { vcpu->arch.mp_state = mp_state->mp_state; + kvm_make_request(KVM_REQ_EVENT, vcpu); return 0; } @@ -5051,24 +5465,11 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason, bool has_error_code, u32 error_code) { struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; - int cs_db, cs_l, ret; - cache_all_regs(vcpu); - - kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); + int ret; - vcpu->arch.emulate_ctxt.vcpu = vcpu; - vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu); - vcpu->arch.emulate_ctxt.eip = kvm_rip_read(vcpu); - vcpu->arch.emulate_ctxt.mode = - (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL : - (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM) - ? X86EMUL_MODE_VM86 : cs_l - ? X86EMUL_MODE_PROT64 : cs_db - ? 
X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; - memset(c, 0, sizeof(struct decode_cache)); - memcpy(c->regs, vcpu->arch.regs, sizeof c->regs); + init_emulate_ctxt(vcpu); - ret = emulator_task_switch(&vcpu->arch.emulate_ctxt, &emulate_ops, + ret = emulator_task_switch(&vcpu->arch.emulate_ctxt, tss_selector, reason, has_error_code, error_code); @@ -5078,6 +5479,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason, memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip); kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); + kvm_make_request(KVM_REQ_EVENT, vcpu); return EMULATE_DONE; } EXPORT_SYMBOL_GPL(kvm_task_switch); @@ -5113,7 +5515,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4; kvm_x86_ops->set_cr4(vcpu, sregs->cr4); if (!is_long_mode(vcpu) && is_pae(vcpu)) { - load_pdptrs(vcpu, vcpu->arch.cr3); + load_pdptrs(vcpu, vcpu->arch.walk_mmu, vcpu->arch.cr3); mmu_reset_needed = 1; } @@ -5148,6 +5550,8 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, !is_protmode(vcpu)) vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; + kvm_make_request(KVM_REQ_EVENT, vcpu); + return 0; } @@ -5334,6 +5738,10 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id) { + if (check_tsc_unstable() && atomic_read(&kvm->online_vcpus) != 0) + printk_once(KERN_WARNING + "kvm: SMP vm created on host with unstable TSC; " + "guest TSC will not be reliable\n"); return kvm_x86_ops->vcpu_create(kvm, id); } @@ -5376,22 +5784,22 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu) vcpu->arch.dr6 = DR6_FIXED_1; vcpu->arch.dr7 = DR7_FIXED_1; + kvm_make_request(KVM_REQ_EVENT, vcpu); + return kvm_x86_ops->vcpu_reset(vcpu); } int kvm_arch_hardware_enable(void *garbage) { - /* - * Since this may be called from a hotplug notifcation, - * we can't get the CPU frequency directly. 
- */ - if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) { - int cpu = raw_smp_processor_id(); - per_cpu(cpu_tsc_khz, cpu) = 0; - } + struct kvm *kvm; + struct kvm_vcpu *vcpu; + int i; kvm_shared_msr_cpu_online(); - + list_for_each_entry(kvm, &vm_list, vm_list) + kvm_for_each_vcpu(i, vcpu, kvm) + if (vcpu->cpu == smp_processor_id()) + kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); return kvm_x86_ops->hardware_enable(garbage); } @@ -5425,7 +5833,11 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) BUG_ON(vcpu->kvm == NULL); kvm = vcpu->kvm; + vcpu->arch.emulate_ctxt.ops = &emulate_ops; + vcpu->arch.walk_mmu = &vcpu->arch.mmu; vcpu->arch.mmu.root_hpa = INVALID_PAGE; + vcpu->arch.mmu.translate_gpa = translate_gpa; + vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa; if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu)) vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; else @@ -5438,6 +5850,9 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) } vcpu->arch.pio_data = page_address(page); + if (!kvm->arch.virtual_tsc_khz) + kvm_arch_set_tsc_khz(kvm, max_tsc_khz); + r = kvm_mmu_create(vcpu); if (r < 0) goto fail_free_pio_data; @@ -5497,7 +5912,7 @@ struct kvm *kvm_arch_create_vm(void) /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */ set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap); - rdtscll(kvm->arch.vm_init_tsc); + spin_lock_init(&kvm->arch.tsc_write_lock); return kvm; } @@ -5684,6 +6099,7 @@ void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) kvm_is_linear_rip(vcpu, vcpu->arch.singlestep_rip)) rflags |= X86_EFLAGS_TF; kvm_x86_ops->set_rflags(vcpu, rflags); + kvm_make_request(KVM_REQ_EVENT, vcpu); } EXPORT_SYMBOL_GPL(kvm_set_rflags); diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index b7a404722d2b..2cea414489f3 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -50,6 +50,11 @@ static inline int is_long_mode(struct kvm_vcpu *vcpu) #endif } +static inline bool mmu_is_nested(struct kvm_vcpu *vcpu) +{ + return 
vcpu->arch.walk_mmu == &vcpu->arch.nested_mmu; +} + static inline int is_pae(struct kvm_vcpu *vcpu) { return kvm_read_cr4_bits(vcpu, X86_CR4_PAE); @@ -67,5 +72,8 @@ static inline int is_paging(struct kvm_vcpu *vcpu) void kvm_before_handle_nmi(struct kvm_vcpu *vcpu); void kvm_after_handle_nmi(struct kvm_vcpu *vcpu); +int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq); + +void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data); #endif diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 79b0b372d2d0..7d90ceb882a4 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -11,6 +11,7 @@ #include <linux/kprobes.h> /* __kprobes, ... */ #include <linux/mmiotrace.h> /* kmmio_handler, ... */ #include <linux/perf_event.h> /* perf_sw_event */ +#include <linux/hugetlb.h> /* hstate_index_to_shift */ #include <asm/traps.h> /* dotraplinkage, ... */ #include <asm/pgalloc.h> /* pgd_*(), ... */ @@ -160,15 +161,20 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr) static void force_sig_info_fault(int si_signo, int si_code, unsigned long address, - struct task_struct *tsk) + struct task_struct *tsk, int fault) { + unsigned lsb = 0; siginfo_t info; info.si_signo = si_signo; info.si_errno = 0; info.si_code = si_code; info.si_addr = (void __user *)address; - info.si_addr_lsb = si_code == BUS_MCEERR_AR ? 
PAGE_SHIFT : 0; + if (fault & VM_FAULT_HWPOISON_LARGE) + lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault)); + if (fault & VM_FAULT_HWPOISON) + lsb = PAGE_SHIFT; + info.si_addr_lsb = lsb; force_sig_info(si_signo, &info, tsk); } @@ -722,7 +728,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, tsk->thread.error_code = error_code | (address >= TASK_SIZE); tsk->thread.trap_no = 14; - force_sig_info_fault(SIGSEGV, si_code, address, tsk); + force_sig_info_fault(SIGSEGV, si_code, address, tsk, 0); return; } @@ -807,14 +813,14 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, tsk->thread.trap_no = 14; #ifdef CONFIG_MEMORY_FAILURE - if (fault & VM_FAULT_HWPOISON) { + if (fault & (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) { printk(KERN_ERR "MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n", tsk->comm, tsk->pid, address); code = BUS_MCEERR_AR; } #endif - force_sig_info_fault(SIGBUS, code, address, tsk); + force_sig_info_fault(SIGBUS, code, address, tsk, fault); } static noinline void @@ -824,7 +830,8 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code, if (fault & VM_FAULT_OOM) { out_of_memory(regs, error_code, address); } else { - if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON)) + if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON| + VM_FAULT_HWPOISON_LARGE)) do_sigbus(regs, error_code, address, fault); else BUG(); @@ -912,9 +919,9 @@ spurious_fault(unsigned long error_code, unsigned long address) int show_unhandled_signals = 1; static inline int -access_error(unsigned long error_code, int write, struct vm_area_struct *vma) +access_error(unsigned long error_code, struct vm_area_struct *vma) { - if (write) { + if (error_code & PF_WRITE) { /* write, present and write, not present: */ if (unlikely(!(vma->vm_flags & VM_WRITE))) return 1; @@ -949,8 +956,10 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code) struct task_struct *tsk; unsigned long address; struct mm_struct 
*mm; - int write; int fault; + int write = error_code & PF_WRITE; + unsigned int flags = FAULT_FLAG_ALLOW_RETRY | + (write ? FAULT_FLAG_WRITE : 0); tsk = current; mm = tsk->mm; @@ -1061,6 +1070,7 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code) bad_area_nosemaphore(regs, error_code, address); return; } +retry: down_read(&mm->mmap_sem); } else { /* @@ -1104,9 +1114,7 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code) * we can handle it.. */ good_area: - write = error_code & PF_WRITE; - - if (unlikely(access_error(error_code, write, vma))) { + if (unlikely(access_error(error_code, vma))) { bad_area_access_error(regs, error_code, address); return; } @@ -1116,21 +1124,34 @@ good_area: * make sure we exit gracefully rather than endlessly redo * the fault: */ - fault = handle_mm_fault(mm, vma, address, write ? FAULT_FLAG_WRITE : 0); + fault = handle_mm_fault(mm, vma, address, flags); if (unlikely(fault & VM_FAULT_ERROR)) { mm_fault_error(regs, error_code, address, fault); return; } - if (fault & VM_FAULT_MAJOR) { - tsk->maj_flt++; - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0, - regs, address); - } else { - tsk->min_flt++; - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0, - regs, address); + /* + * Major/minor page fault accounting is only done on the + * initial attempt. If we go through a retry, it is extremely + * likely that the page will be found in page cache at that point. + */ + if (flags & FAULT_FLAG_ALLOW_RETRY) { + if (fault & VM_FAULT_MAJOR) { + tsk->maj_flt++; + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0, + regs, address); + } else { + tsk->min_flt++; + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0, + regs, address); + } + if (fault & VM_FAULT_RETRY) { + /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk + * of starvation. 
*/ + flags &= ~FAULT_FLAG_ALLOW_RETRY; + goto retry; + } } check_v8086_mode(regs, address, tsk); diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c index 5e8fa12ef861..b49962662101 100644 --- a/arch/x86/mm/highmem_32.c +++ b/arch/x86/mm/highmem_32.c @@ -9,6 +9,7 @@ void *kmap(struct page *page) return page_address(page); return kmap_high(page); } +EXPORT_SYMBOL(kmap); void kunmap(struct page *page) { @@ -18,6 +19,7 @@ void kunmap(struct page *page) return; kunmap_high(page); } +EXPORT_SYMBOL(kunmap); /* * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because @@ -27,10 +29,10 @@ void kunmap(struct page *page) * However when holding an atomic kmap it is not legal to sleep, so atomic * kmaps are appropriate for short, tight code paths only. */ -void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot) +void *kmap_atomic_prot(struct page *page, pgprot_t prot) { - enum fixed_addresses idx; unsigned long vaddr; + int idx, type; /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */ pagefault_disable(); @@ -38,8 +40,7 @@ void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot) if (!PageHighMem(page)) return page_address(page); - debug_kmap_atomic(type); - + type = kmap_atomic_idx_push(); idx = type + KM_TYPE_NR*smp_processor_id(); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); BUG_ON(!pte_none(*(kmap_pte-idx))); @@ -47,44 +48,57 @@ void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot) return (void *)vaddr; } +EXPORT_SYMBOL(kmap_atomic_prot); + +void *__kmap_atomic(struct page *page) +{ + return kmap_atomic_prot(page, kmap_prot); +} +EXPORT_SYMBOL(__kmap_atomic); -void *kmap_atomic(struct page *page, enum km_type type) +/* + * This is the same as kmap_atomic() but can map memory that doesn't + * have a struct page associated with it. 
+ */ +void *kmap_atomic_pfn(unsigned long pfn) { - return kmap_atomic_prot(page, type, kmap_prot); + return kmap_atomic_prot_pfn(pfn, kmap_prot); } +EXPORT_SYMBOL_GPL(kmap_atomic_pfn); -void kunmap_atomic_notypecheck(void *kvaddr, enum km_type type) +void __kunmap_atomic(void *kvaddr) { unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; - enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id(); - - /* - * Force other mappings to Oops if they'll try to access this pte - * without first remap it. Keeping stale mappings around is a bad idea - * also, in case the page changes cacheability attributes or becomes - * a protected page in a hypervisor. - */ - if (vaddr == __fix_to_virt(FIX_KMAP_BEGIN+idx)) + + if (vaddr >= __fix_to_virt(FIX_KMAP_END) && + vaddr <= __fix_to_virt(FIX_KMAP_BEGIN)) { + int idx, type; + + type = kmap_atomic_idx(); + idx = type + KM_TYPE_NR * smp_processor_id(); + +#ifdef CONFIG_DEBUG_HIGHMEM + WARN_ON_ONCE(vaddr != __fix_to_virt(FIX_KMAP_BEGIN + idx)); +#endif + /* + * Force other mappings to Oops if they'll try to access this + * pte without first remap it. Keeping stale mappings around + * is a bad idea also, in case the page changes cacheability + * attributes or becomes a protected page in a hypervisor. + */ kpte_clear_flush(kmap_pte-idx, vaddr); - else { + kmap_atomic_idx_pop(); + } #ifdef CONFIG_DEBUG_HIGHMEM + else { BUG_ON(vaddr < PAGE_OFFSET); BUG_ON(vaddr >= (unsigned long)high_memory); -#endif } +#endif pagefault_enable(); } - -/* - * This is the same as kmap_atomic() but can map memory that doesn't - * have a struct page associated with it. 
- */ -void *kmap_atomic_pfn(unsigned long pfn, enum km_type type) -{ - return kmap_atomic_prot_pfn(pfn, type, kmap_prot); -} -EXPORT_SYMBOL_GPL(kmap_atomic_pfn); /* temporarily in use by i915 GEM until vmap */ +EXPORT_SYMBOL(__kunmap_atomic); struct page *kmap_atomic_to_page(void *ptr) { @@ -98,12 +112,6 @@ struct page *kmap_atomic_to_page(void *ptr) pte = kmap_pte - (idx - FIX_KMAP_BEGIN); return pte_page(*pte); } - -EXPORT_SYMBOL(kmap); -EXPORT_SYMBOL(kunmap); -EXPORT_SYMBOL(kmap_atomic); -EXPORT_SYMBOL(kunmap_atomic_notypecheck); -EXPORT_SYMBOL(kmap_atomic_prot); EXPORT_SYMBOL(kmap_atomic_to_page); void __init set_highmem_pages_init(void) diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 5d0a6711c282..0e969f9f401b 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -528,48 +528,6 @@ static void __init pagetable_init(void) permanent_kmaps_init(pgd_base); } -#ifdef CONFIG_ACPI_SLEEP -/* - * ACPI suspend needs this for resume, because things like the intel-agp - * driver might have split up a kernel 4MB mapping. - */ -char swsusp_pg_dir[PAGE_SIZE] - __attribute__ ((aligned(PAGE_SIZE))); - -static inline void save_pg_dir(void) -{ - copy_page(swsusp_pg_dir, swapper_pg_dir); -} -#else /* !CONFIG_ACPI_SLEEP */ -static inline void save_pg_dir(void) -{ -} -#endif /* !CONFIG_ACPI_SLEEP */ - -void zap_low_mappings(bool early) -{ - int i; - - /* - * Zap initial low-memory mappings. - * - * Note that "pgd_clear()" doesn't do it for - * us, because pgd_clear() is a no-op on i386. 
- */ - for (i = 0; i < KERNEL_PGD_BOUNDARY; i++) { -#ifdef CONFIG_X86_PAE - set_pgd(swapper_pg_dir+i, __pgd(1 + __pa(empty_zero_page))); -#else - set_pgd(swapper_pg_dir+i, __pgd(0)); -#endif - } - - if (early) - __flush_tlb(); - else - flush_tlb_all(); -} - pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL | _PAGE_IOMAP); EXPORT_SYMBOL_GPL(__supported_pte_mask); @@ -882,9 +840,6 @@ void __init mem_init(void) if (boot_cpu_data.wp_works_ok < 0) test_wp_bit(); - - save_pg_dir(); - zap_low_mappings(true); } #ifdef CONFIG_MEMORY_HOTPLUG diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 84346200e783..71a59296af80 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -51,7 +51,6 @@ #include <asm/numa.h> #include <asm/cacheflush.h> #include <asm/init.h> -#include <linux/bootmem.h> static int __init parse_direct_gbpages_off(char *arg) { diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c index 72fc70cf6184..7b179b499fa3 100644 --- a/arch/x86/mm/iomap_32.c +++ b/arch/x86/mm/iomap_32.c @@ -48,21 +48,20 @@ int iomap_create_wc(resource_size_t base, unsigned long size, pgprot_t *prot) } EXPORT_SYMBOL_GPL(iomap_create_wc); -void -iomap_free(resource_size_t base, unsigned long size) +void iomap_free(resource_size_t base, unsigned long size) { io_free_memtype(base, base + size); } EXPORT_SYMBOL_GPL(iomap_free); -void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot) +void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot) { - enum fixed_addresses idx; unsigned long vaddr; + int idx, type; pagefault_disable(); - debug_kmap_atomic(type); + type = kmap_atomic_idx_push(); idx = type + KM_TYPE_NR * smp_processor_id(); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); set_pte(kmap_pte - idx, pfn_pte(pfn, prot)); @@ -72,10 +71,10 @@ void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot) } /* - * Map 'pfn' using fixed map 'type' and protections 'prot' + * Map 'pfn' using protections 
'prot' */ void __iomem * -iomap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot) +iomap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot) { /* * For non-PAT systems, promote PAGE_KERNEL_WC to PAGE_KERNEL_UC_MINUS. @@ -86,24 +85,34 @@ iomap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot) if (!pat_enabled && pgprot_val(prot) == pgprot_val(PAGE_KERNEL_WC)) prot = PAGE_KERNEL_UC_MINUS; - return (void __force __iomem *) kmap_atomic_prot_pfn(pfn, type, prot); + return (void __force __iomem *) kmap_atomic_prot_pfn(pfn, prot); } EXPORT_SYMBOL_GPL(iomap_atomic_prot_pfn); void -iounmap_atomic(void __iomem *kvaddr, enum km_type type) +iounmap_atomic(void __iomem *kvaddr) { unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; - enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id(); - /* - * Force other mappings to Oops if they'll try to access this pte - * without first remap it. Keeping stale mappings around is a bad idea - * also, in case the page changes cacheability attributes or becomes - * a protected page in a hypervisor. - */ - if (vaddr == __fix_to_virt(FIX_KMAP_BEGIN+idx)) + if (vaddr >= __fix_to_virt(FIX_KMAP_END) && + vaddr <= __fix_to_virt(FIX_KMAP_BEGIN)) { + int idx, type; + + type = kmap_atomic_idx(); + idx = type + KM_TYPE_NR * smp_processor_id(); + +#ifdef CONFIG_DEBUG_HIGHMEM + WARN_ON_ONCE(vaddr != __fix_to_virt(FIX_KMAP_BEGIN + idx)); +#endif + /* + * Force other mappings to Oops if they'll try to access this + * pte without first remap it. Keeping stale mappings around + * is a bad idea also, in case the page changes cacheability + * attributes or becomes a protected page in a hypervisor. 
+ */ kpte_clear_flush(kmap_pte-idx, vaddr); + kmap_atomic_idx_pop(); + } pagefault_enable(); } diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index 68128a1b401a..90a7f5ad6916 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig @@ -19,15 +19,12 @@ config XEN_PVHVM depends on X86_LOCAL_APIC config XEN_MAX_DOMAIN_MEMORY - int "Maximum allowed size of a domain in gigabytes" - default 8 if X86_32 - default 32 if X86_64 + int + default 128 depends on XEN help - The pseudo-physical to machine address array is sized - according to the maximum possible memory size of a Xen - domain. This array uses 1 page per gigabyte, so there's no - need to be too stingy here. + This only affects the sizing of some bss arrays, the unused + portions of which are freed. config XEN_SAVE_RESTORE bool diff --git a/arch/x86/xen/debugfs.c b/arch/x86/xen/debugfs.c index 1304bcec8ee5..7c0fedd98ea0 100644 --- a/arch/x86/xen/debugfs.c +++ b/arch/x86/xen/debugfs.c @@ -106,6 +106,7 @@ static const struct file_operations u32_array_fops = { .open = u32_array_open, .release= xen_array_release, .read = u32_array_read, + .llseek = no_llseek, }; struct dentry *xen_debugfs_create_u32_array(const char *name, mode_t mode, diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 63b83ceebd1a..70ddeaeb1ef3 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -59,7 +59,6 @@ #include <asm/pgtable.h> #include <asm/tlbflush.h> #include <asm/reboot.h> -#include <asm/setup.h> #include <asm/stackprotector.h> #include <asm/hypervisor.h> @@ -136,9 +135,6 @@ static void xen_vcpu_setup(int cpu) info.mfn = arbitrary_virt_to_mfn(vcpup); info.offset = offset_in_page(vcpup); - printk(KERN_DEBUG "trying to map vcpu_info %d at %p, mfn %llx, offset %d\n", - cpu, vcpup, info.mfn, info.offset); - /* Check to see if the hypervisor will put the vcpu_info structure where we want it, which allows direct access via a percpu-variable. 
*/ @@ -152,9 +148,6 @@ static void xen_vcpu_setup(int cpu) /* This cpu is using the registered vcpu info, even if later ones fail to. */ per_cpu(xen_vcpu, cpu) = vcpup; - - printk(KERN_DEBUG "cpu %d using vcpu_info at %p\n", - cpu, vcpup); } } @@ -836,6 +829,11 @@ static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high) Xen console noise. */ break; + case MSR_IA32_CR_PAT: + if (smp_processor_id() == 0) + xen_set_pat(((u64)high << 32) | low); + break; + default: ret = native_write_msr_safe(msr, low, high); } @@ -874,8 +872,6 @@ void xen_setup_vcpu_info_placement(void) /* xen_vcpu_setup managed to place the vcpu_info within the percpu area for all cpus, so make use of it */ if (have_vcpu_info_placement) { - printk(KERN_INFO "Xen: using vcpu_info placement\n"); - pv_irq_ops.save_fl = __PV_IS_CALLEE_SAVE(xen_save_fl_direct); pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(xen_restore_fl_direct); pv_irq_ops.irq_disable = __PV_IS_CALLEE_SAVE(xen_irq_disable_direct); @@ -1019,7 +1015,7 @@ static void xen_reboot(int reason) struct sched_shutdown r = { .reason = reason }; #ifdef CONFIG_SMP - smp_send_stop(); + stop_other_cpus(); #endif if (HYPERVISOR_sched_op(SCHEDOP_shutdown, &r)) @@ -1189,6 +1185,9 @@ asmlinkage void __init xen_start_kernel(void) xen_raw_console_write("mapping kernel into physical memory\n"); pgd = xen_setup_kernel_pagetable(pgd, xen_start_info->nr_pages); + /* Allocate and initialize top and mid mfn levels for p2m structure */ + xen_build_mfn_list_list(); + init_mm.pgd = pgd; /* keep using Xen gdt for now; no urgent need to change it */ diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index f72d18c69221..9631c90907eb 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -57,6 +57,7 @@ #include <asm/linkage.h> #include <asm/page.h> #include <asm/init.h> +#include <asm/pat.h> #include <asm/xen/hypercall.h> #include <asm/xen/hypervisor.h> @@ -140,7 +141,8 @@ static inline void check_zero(void) * large enough to allocate page table 
pages to allocate the rest. * Each page can map 2MB. */ -static pte_t level1_ident_pgt[PTRS_PER_PTE * 4] __page_aligned_bss; +#define LEVEL1_IDENT_ENTRIES (PTRS_PER_PTE * 4) +static RESERVE_BRK_ARRAY(pte_t, level1_ident_pgt, LEVEL1_IDENT_ENTRIES); #ifdef CONFIG_X86_64 /* l3 pud for userspace vsyscall mapping */ @@ -171,49 +173,182 @@ DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */ */ #define USER_LIMIT ((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK) +/* + * Xen leaves the responsibility for maintaining p2m mappings to the + * guests themselves, but it must also access and update the p2m array + * during suspend/resume when all the pages are reallocated. + * + * The p2m table is logically a flat array, but we implement it as a + * three-level tree to allow the address space to be sparse. + * + * Xen + * | + * p2m_top p2m_top_mfn + * / \ / \ + * p2m_mid p2m_mid p2m_mid_mfn p2m_mid_mfn + * / \ / \ / / + * p2m p2m p2m p2m p2m p2m p2m ... + * + * The p2m_mid_mfn pages are mapped by p2m_top_mfn_p. + * + * The p2m_top and p2m_top_mfn levels are limited to 1 page, so the + * maximum representable pseudo-physical address space is: + * P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE pages + * + * P2M_PER_PAGE depends on the architecture, as a mfn is always + * unsigned long (8 bytes on 64-bit, 4 bytes on 32), leading to + * 512 and 1024 entries respectively. + */ + +unsigned long xen_max_p2m_pfn __read_mostly; -#define P2M_ENTRIES_PER_PAGE (PAGE_SIZE / sizeof(unsigned long)) -#define TOP_ENTRIES (MAX_DOMAIN_PAGES / P2M_ENTRIES_PER_PAGE) +#define P2M_PER_PAGE (PAGE_SIZE / sizeof(unsigned long)) +#define P2M_MID_PER_PAGE (PAGE_SIZE / sizeof(unsigned long *)) +#define P2M_TOP_PER_PAGE (PAGE_SIZE / sizeof(unsigned long **)) -/* Placeholder for holes in the address space */ -static unsigned long p2m_missing[P2M_ENTRIES_PER_PAGE] __page_aligned_data = - { [ 0 ... 
P2M_ENTRIES_PER_PAGE-1 ] = ~0UL }; +#define MAX_P2M_PFN (P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE) - /* Array of pointers to pages containing p2m entries */ -static unsigned long *p2m_top[TOP_ENTRIES] __page_aligned_data = - { [ 0 ... TOP_ENTRIES - 1] = &p2m_missing[0] }; +/* Placeholders for holes in the address space */ +static RESERVE_BRK_ARRAY(unsigned long, p2m_missing, P2M_PER_PAGE); +static RESERVE_BRK_ARRAY(unsigned long *, p2m_mid_missing, P2M_MID_PER_PAGE); +static RESERVE_BRK_ARRAY(unsigned long, p2m_mid_missing_mfn, P2M_MID_PER_PAGE); -/* Arrays of p2m arrays expressed in mfns used for save/restore */ -static unsigned long p2m_top_mfn[TOP_ENTRIES] __page_aligned_bss; +static RESERVE_BRK_ARRAY(unsigned long **, p2m_top, P2M_TOP_PER_PAGE); +static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn, P2M_TOP_PER_PAGE); +static RESERVE_BRK_ARRAY(unsigned long *, p2m_top_mfn_p, P2M_TOP_PER_PAGE); -static unsigned long p2m_top_mfn_list[TOP_ENTRIES / P2M_ENTRIES_PER_PAGE] - __page_aligned_bss; +RESERVE_BRK(p2m_mid, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE))); +RESERVE_BRK(p2m_mid_mfn, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE))); static inline unsigned p2m_top_index(unsigned long pfn) { - BUG_ON(pfn >= MAX_DOMAIN_PAGES); - return pfn / P2M_ENTRIES_PER_PAGE; + BUG_ON(pfn >= MAX_P2M_PFN); + return pfn / (P2M_MID_PER_PAGE * P2M_PER_PAGE); +} + +static inline unsigned p2m_mid_index(unsigned long pfn) +{ + return (pfn / P2M_PER_PAGE) % P2M_MID_PER_PAGE; } static inline unsigned p2m_index(unsigned long pfn) { - return pfn % P2M_ENTRIES_PER_PAGE; + return pfn % P2M_PER_PAGE; +} + +static void p2m_top_init(unsigned long ***top) +{ + unsigned i; + + for (i = 0; i < P2M_TOP_PER_PAGE; i++) + top[i] = p2m_mid_missing; +} + +static void p2m_top_mfn_init(unsigned long *top) +{ + unsigned i; + + for (i = 0; i < P2M_TOP_PER_PAGE; i++) + top[i] = virt_to_mfn(p2m_mid_missing_mfn); +} + +static void p2m_top_mfn_p_init(unsigned 
long **top) +{ + unsigned i; + + for (i = 0; i < P2M_TOP_PER_PAGE; i++) + top[i] = p2m_mid_missing_mfn; +} + +static void p2m_mid_init(unsigned long **mid) +{ + unsigned i; + + for (i = 0; i < P2M_MID_PER_PAGE; i++) + mid[i] = p2m_missing; +} + +static void p2m_mid_mfn_init(unsigned long *mid) +{ + unsigned i; + + for (i = 0; i < P2M_MID_PER_PAGE; i++) + mid[i] = virt_to_mfn(p2m_missing); } -/* Build the parallel p2m_top_mfn structures */ +static void p2m_init(unsigned long *p2m) +{ + unsigned i; + + for (i = 0; i < P2M_MID_PER_PAGE; i++) + p2m[i] = INVALID_P2M_ENTRY; +} + +/* + * Build the parallel p2m_top_mfn and p2m_mid_mfn structures + * + * This is called both at boot time, and after resuming from suspend: + * - At boot time we're called very early, and must use extend_brk() + * to allocate memory. + * + * - After resume we're called from within stop_machine, but the mfn + * tree should alreay be completely allocated. + */ void xen_build_mfn_list_list(void) { - unsigned pfn, idx; + unsigned long pfn; - for (pfn = 0; pfn < MAX_DOMAIN_PAGES; pfn += P2M_ENTRIES_PER_PAGE) { - unsigned topidx = p2m_top_index(pfn); + /* Pre-initialize p2m_top_mfn to be completely missing */ + if (p2m_top_mfn == NULL) { + p2m_mid_missing_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE); + p2m_mid_mfn_init(p2m_mid_missing_mfn); + + p2m_top_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE); + p2m_top_mfn_p_init(p2m_top_mfn_p); - p2m_top_mfn[topidx] = virt_to_mfn(p2m_top[topidx]); + p2m_top_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE); + p2m_top_mfn_init(p2m_top_mfn); + } else { + /* Reinitialise, mfn's all change after migration */ + p2m_mid_mfn_init(p2m_mid_missing_mfn); } - for (idx = 0; idx < ARRAY_SIZE(p2m_top_mfn_list); idx++) { - unsigned topidx = idx * P2M_ENTRIES_PER_PAGE; - p2m_top_mfn_list[idx] = virt_to_mfn(&p2m_top_mfn[topidx]); + for (pfn = 0; pfn < xen_max_p2m_pfn; pfn += P2M_PER_PAGE) { + unsigned topidx = p2m_top_index(pfn); + unsigned mididx = p2m_mid_index(pfn); + unsigned long **mid; + 
unsigned long *mid_mfn_p; + + mid = p2m_top[topidx]; + mid_mfn_p = p2m_top_mfn_p[topidx]; + + /* Don't bother allocating any mfn mid levels if + * they're just missing, just update the stored mfn, + * since all could have changed over a migrate. + */ + if (mid == p2m_mid_missing) { + BUG_ON(mididx); + BUG_ON(mid_mfn_p != p2m_mid_missing_mfn); + p2m_top_mfn[topidx] = virt_to_mfn(p2m_mid_missing_mfn); + pfn += (P2M_MID_PER_PAGE - 1) * P2M_PER_PAGE; + continue; + } + + if (mid_mfn_p == p2m_mid_missing_mfn) { + /* + * XXX boot-time only! We should never find + * missing parts of the mfn tree after + * runtime. extend_brk() will BUG if we call + * it too late. + */ + mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE); + p2m_mid_mfn_init(mid_mfn_p); + + p2m_top_mfn_p[topidx] = mid_mfn_p; + } + + p2m_top_mfn[topidx] = virt_to_mfn(mid_mfn_p); + mid_mfn_p[mididx] = virt_to_mfn(mid[mididx]); } } @@ -222,8 +357,8 @@ void xen_setup_mfn_list_list(void) BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info); HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list = - virt_to_mfn(p2m_top_mfn_list); - HYPERVISOR_shared_info->arch.max_pfn = xen_start_info->nr_pages; + virt_to_mfn(p2m_top_mfn); + HYPERVISOR_shared_info->arch.max_pfn = xen_max_p2m_pfn; } /* Set up p2m_top to point to the domain-builder provided p2m pages */ @@ -231,98 +366,176 @@ void __init xen_build_dynamic_phys_to_machine(void) { unsigned long *mfn_list = (unsigned long *)xen_start_info->mfn_list; unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages); - unsigned pfn; + unsigned long pfn; + + xen_max_p2m_pfn = max_pfn; - for (pfn = 0; pfn < max_pfn; pfn += P2M_ENTRIES_PER_PAGE) { + p2m_missing = extend_brk(PAGE_SIZE, PAGE_SIZE); + p2m_init(p2m_missing); + + p2m_mid_missing = extend_brk(PAGE_SIZE, PAGE_SIZE); + p2m_mid_init(p2m_mid_missing); + + p2m_top = extend_brk(PAGE_SIZE, PAGE_SIZE); + p2m_top_init(p2m_top); + + /* + * The domain builder gives us a pre-constructed p2m array in + * mfn_list for all 
the pages initially given to us, so we just + * need to graft that into our tree structure. + */ + for (pfn = 0; pfn < max_pfn; pfn += P2M_PER_PAGE) { unsigned topidx = p2m_top_index(pfn); + unsigned mididx = p2m_mid_index(pfn); - p2m_top[topidx] = &mfn_list[pfn]; - } + if (p2m_top[topidx] == p2m_mid_missing) { + unsigned long **mid = extend_brk(PAGE_SIZE, PAGE_SIZE); + p2m_mid_init(mid); + + p2m_top[topidx] = mid; + } - xen_build_mfn_list_list(); + p2m_top[topidx][mididx] = &mfn_list[pfn]; + } } unsigned long get_phys_to_machine(unsigned long pfn) { - unsigned topidx, idx; + unsigned topidx, mididx, idx; - if (unlikely(pfn >= MAX_DOMAIN_PAGES)) + if (unlikely(pfn >= MAX_P2M_PFN)) return INVALID_P2M_ENTRY; topidx = p2m_top_index(pfn); + mididx = p2m_mid_index(pfn); idx = p2m_index(pfn); - return p2m_top[topidx][idx]; + + return p2m_top[topidx][mididx][idx]; } EXPORT_SYMBOL_GPL(get_phys_to_machine); -/* install a new p2m_top page */ -bool install_p2mtop_page(unsigned long pfn, unsigned long *p) +static void *alloc_p2m_page(void) { - unsigned topidx = p2m_top_index(pfn); - unsigned long **pfnp, *mfnp; - unsigned i; + return (void *)__get_free_page(GFP_KERNEL | __GFP_REPEAT); +} - pfnp = &p2m_top[topidx]; - mfnp = &p2m_top_mfn[topidx]; +static void free_p2m_page(void *p) +{ + free_page((unsigned long)p); +} - for (i = 0; i < P2M_ENTRIES_PER_PAGE; i++) - p[i] = INVALID_P2M_ENTRY; +/* + * Fully allocate the p2m structure for a given pfn. We need to check + * that both the top and mid levels are allocated, and make sure the + * parallel mfn tree is kept in sync. We may race with other cpus, so + * the new pages are installed with cmpxchg; if we lose the race then + * simply free the page we allocated and use the one that's there. 
+ */ +static bool alloc_p2m(unsigned long pfn) +{ + unsigned topidx, mididx; + unsigned long ***top_p, **mid; + unsigned long *top_mfn_p, *mid_mfn; - if (cmpxchg(pfnp, p2m_missing, p) == p2m_missing) { - *mfnp = virt_to_mfn(p); - return true; + topidx = p2m_top_index(pfn); + mididx = p2m_mid_index(pfn); + + top_p = &p2m_top[topidx]; + mid = *top_p; + + if (mid == p2m_mid_missing) { + /* Mid level is missing, allocate a new one */ + mid = alloc_p2m_page(); + if (!mid) + return false; + + p2m_mid_init(mid); + + if (cmpxchg(top_p, p2m_mid_missing, mid) != p2m_mid_missing) + free_p2m_page(mid); } - return false; -} + top_mfn_p = &p2m_top_mfn[topidx]; + mid_mfn = p2m_top_mfn_p[topidx]; -static void alloc_p2m(unsigned long pfn) -{ - unsigned long *p; + BUG_ON(virt_to_mfn(mid_mfn) != *top_mfn_p); + + if (mid_mfn == p2m_mid_missing_mfn) { + /* Separately check the mid mfn level */ + unsigned long missing_mfn; + unsigned long mid_mfn_mfn; + + mid_mfn = alloc_p2m_page(); + if (!mid_mfn) + return false; + + p2m_mid_mfn_init(mid_mfn); + + missing_mfn = virt_to_mfn(p2m_mid_missing_mfn); + mid_mfn_mfn = virt_to_mfn(mid_mfn); + if (cmpxchg(top_mfn_p, missing_mfn, mid_mfn_mfn) != missing_mfn) + free_p2m_page(mid_mfn); + else + p2m_top_mfn_p[topidx] = mid_mfn; + } + + if (p2m_top[topidx][mididx] == p2m_missing) { + /* p2m leaf page is missing */ + unsigned long *p2m; + + p2m = alloc_p2m_page(); + if (!p2m) + return false; - p = (void *)__get_free_page(GFP_KERNEL | __GFP_NOFAIL); - BUG_ON(p == NULL); + p2m_init(p2m); + + if (cmpxchg(&mid[mididx], p2m_missing, p2m) != p2m_missing) + free_p2m_page(p2m); + else + mid_mfn[mididx] = virt_to_mfn(p2m); + } - if (!install_p2mtop_page(pfn, p)) - free_page((unsigned long)p); + return true; } /* Try to install p2m mapping; fail if intermediate bits missing */ bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn) { - unsigned topidx, idx; + unsigned topidx, mididx, idx; - if (unlikely(pfn >= MAX_DOMAIN_PAGES)) { + if (unlikely(pfn >= 
MAX_P2M_PFN)) { BUG_ON(mfn != INVALID_P2M_ENTRY); return true; } topidx = p2m_top_index(pfn); - if (p2m_top[topidx] == p2m_missing) { - if (mfn == INVALID_P2M_ENTRY) - return true; - return false; - } - + mididx = p2m_mid_index(pfn); idx = p2m_index(pfn); - p2m_top[topidx][idx] = mfn; + + if (p2m_top[topidx][mididx] == p2m_missing) + return mfn == INVALID_P2M_ENTRY; + + p2m_top[topidx][mididx][idx] = mfn; return true; } -void set_phys_to_machine(unsigned long pfn, unsigned long mfn) +bool set_phys_to_machine(unsigned long pfn, unsigned long mfn) { if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) { BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY); - return; + return true; } if (unlikely(!__set_phys_to_machine(pfn, mfn))) { - alloc_p2m(pfn); + if (!alloc_p2m(pfn)) + return false; if (!__set_phys_to_machine(pfn, mfn)) - BUG(); + return false; } + + return true; } unsigned long arbitrary_virt_to_mfn(void *vaddr) @@ -399,7 +612,7 @@ static bool xen_iomap_pte(pte_t pte) return pte_flags(pte) & _PAGE_IOMAP; } -static void xen_set_iomap_pte(pte_t *ptep, pte_t pteval) +void xen_set_domain_pte(pte_t *ptep, pte_t pteval, unsigned domid) { struct multicall_space mcs; struct mmu_update *u; @@ -411,10 +624,16 @@ static void xen_set_iomap_pte(pte_t *ptep, pte_t pteval) u->ptr = arbitrary_virt_to_machine(ptep).maddr; u->val = pte_val_ma(pteval); - MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_IO); + MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, domid); xen_mc_issue(PARAVIRT_LAZY_MMU); } +EXPORT_SYMBOL_GPL(xen_set_domain_pte); + +static void xen_set_iomap_pte(pte_t *ptep, pte_t pteval) +{ + xen_set_domain_pte(ptep, pteval, DOMID_IO); +} static void xen_extend_mmu_update(const struct mmu_update *update) { @@ -561,7 +780,20 @@ static pteval_t pte_pfn_to_mfn(pteval_t val) if (val & _PAGE_PRESENT) { unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT; pteval_t flags = val & PTE_FLAGS_MASK; - val = ((pteval_t)pfn_to_mfn(pfn) << PAGE_SHIFT) | flags; + unsigned long mfn = 
pfn_to_mfn(pfn); + + /* + * If there's no mfn for the pfn, then just create an + * empty non-present pte. Unfortunately this loses + * information about the original pfn, so + * pte_mfn_to_pfn is asymmetric. + */ + if (unlikely(mfn == INVALID_P2M_ENTRY)) { + mfn = 0; + flags = 0; + } + + val = ((pteval_t)mfn << PAGE_SHIFT) | flags; } return val; @@ -583,10 +815,18 @@ static pteval_t iomap_pte(pteval_t val) pteval_t xen_pte_val(pte_t pte) { - if (xen_initial_domain() && (pte.pte & _PAGE_IOMAP)) - return pte.pte; + pteval_t pteval = pte.pte; + + /* If this is a WC pte, convert back from Xen WC to Linux WC */ + if ((pteval & (_PAGE_PAT | _PAGE_PCD | _PAGE_PWT)) == _PAGE_PAT) { + WARN_ON(!pat_enabled); + pteval = (pteval & ~_PAGE_PAT) | _PAGE_PWT; + } - return pte_mfn_to_pfn(pte.pte); + if (xen_initial_domain() && (pteval & _PAGE_IOMAP)) + return pteval; + + return pte_mfn_to_pfn(pteval); } PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val); @@ -596,10 +836,48 @@ pgdval_t xen_pgd_val(pgd_t pgd) } PV_CALLEE_SAVE_REGS_THUNK(xen_pgd_val); +/* + * Xen's PAT setup is part of its ABI, though I assume entries 6 & 7 + * are reserved for now, to correspond to the Intel-reserved PAT + * types. + * + * We expect Linux's PAT set as follows: + * + * Idx PTE flags Linux Xen Default + * 0 WB WB WB + * 1 PWT WC WT WT + * 2 PCD UC- UC- UC- + * 3 PCD PWT UC UC UC + * 4 PAT WB WC WB + * 5 PAT PWT WC WP WT + * 6 PAT PCD UC- UC UC- + * 7 PAT PCD PWT UC UC UC + */ + +void xen_set_pat(u64 pat) +{ + /* We expect Linux to use a PAT setting of + * UC UC- WC WB (ignoring the PAT flag) */ + WARN_ON(pat != 0x0007010600070106ull); +} + pte_t xen_make_pte(pteval_t pte) { phys_addr_t addr = (pte & PTE_PFN_MASK); + /* If Linux is trying to set a WC pte, then map to the Xen WC. + * If _PAGE_PAT is set, then it probably means it is really + * _PAGE_PSE, so avoid fiddling with the PAT mapping and hope + * things work out OK... 
+ * + * (We should never see kernel mappings with _PAGE_PSE set, + * but we could see hugetlbfs mappings, I think.). + */ + if (pat_enabled && !WARN_ON(pte & _PAGE_PAT)) { + if ((pte & (_PAGE_PCD | _PAGE_PWT)) == _PAGE_PWT) + pte = (pte & ~(_PAGE_PCD | _PAGE_PWT)) | _PAGE_PAT; + } + /* * Unprivileged domains are allowed to do IOMAPpings for * PCI passthrough, but not map ISA space. The ISA @@ -1712,6 +1990,9 @@ static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn) unsigned ident_pte; unsigned long pfn; + level1_ident_pgt = extend_brk(sizeof(pte_t) * LEVEL1_IDENT_ENTRIES, + PAGE_SIZE); + ident_pte = 0; pfn = 0; for (pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) { @@ -1722,7 +2003,7 @@ static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn) pte_page = m2v(pmd[pmdidx].pmd); else { /* Check for free pte pages */ - if (ident_pte == ARRAY_SIZE(level1_ident_pgt)) + if (ident_pte == LEVEL1_IDENT_ENTRIES) break; pte_page = &level1_ident_pgt[ident_pte]; @@ -1837,13 +2118,15 @@ __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, return pgd; } #else /* !CONFIG_X86_64 */ -static pmd_t level2_kernel_pgt[PTRS_PER_PMD] __page_aligned_bss; +static RESERVE_BRK_ARRAY(pmd_t, level2_kernel_pgt, PTRS_PER_PMD); __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) { pmd_t *kernel_pmd; + level2_kernel_pgt = extend_brk(sizeof(pmd_t *) * PTRS_PER_PMD, PAGE_SIZE); + max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->pt_base) + xen_start_info->nr_pt_frames * PAGE_SIZE + 512*1024); @@ -2269,6 +2552,72 @@ void __init xen_hvm_init_mmu_ops(void) } #endif +#define REMAP_BATCH_SIZE 16 + +struct remap_data { + unsigned long mfn; + pgprot_t prot; + struct mmu_update *mmu_update; +}; + +static int remap_area_mfn_pte_fn(pte_t *ptep, pgtable_t token, + unsigned long addr, void *data) +{ + struct remap_data *rmd = data; + pte_t pte = pte_mkspecial(pfn_pte(rmd->mfn++, rmd->prot)); + + rmd->mmu_update->ptr = 
arbitrary_virt_to_machine(ptep).maddr; + rmd->mmu_update->val = pte_val_ma(pte); + rmd->mmu_update++; + + return 0; +} + +int xen_remap_domain_mfn_range(struct vm_area_struct *vma, + unsigned long addr, + unsigned long mfn, int nr, + pgprot_t prot, unsigned domid) +{ + struct remap_data rmd; + struct mmu_update mmu_update[REMAP_BATCH_SIZE]; + int batch; + unsigned long range; + int err = 0; + + prot = __pgprot(pgprot_val(prot) | _PAGE_IOMAP); + + vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP; + + rmd.mfn = mfn; + rmd.prot = prot; + + while (nr) { + batch = min(REMAP_BATCH_SIZE, nr); + range = (unsigned long)batch << PAGE_SHIFT; + + rmd.mmu_update = mmu_update; + err = apply_to_page_range(vma->vm_mm, addr, range, + remap_area_mfn_pte_fn, &rmd); + if (err) + goto out; + + err = -EFAULT; + if (HYPERVISOR_mmu_update(mmu_update, batch, NULL, domid) < 0) + goto out; + + nr -= batch; + addr += range; + } + + err = 0; +out: + + flush_tlb_all(); + + return err; +} +EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_range); + #ifdef CONFIG_XEN_DEBUG_FS static struct dentry *d_mmu_debug; diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h index fa938c4aa2f7..537bb9aab777 100644 --- a/arch/x86/xen/mmu.h +++ b/arch/x86/xen/mmu.h @@ -12,7 +12,6 @@ enum pt_level { bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn); -bool install_p2mtop_page(unsigned long pfn, unsigned long *p); void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 9729c903404b..105db2501050 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -18,8 +18,10 @@ #include <asm/xen/hypervisor.h> #include <asm/xen/hypercall.h> +#include <xen/xen.h> #include <xen/page.h> #include <xen/interface/callback.h> +#include <xen/interface/memory.h> #include <xen/interface/physdev.h> #include <xen/interface/memory.h> #include <xen/features.h> @@ -34,6 +36,39 @@ extern void xen_sysenter_target(void); extern void 
xen_syscall_target(void); extern void xen_syscall32_target(void); +/* Amount of extra memory space we add to the e820 ranges */ +phys_addr_t xen_extra_mem_start, xen_extra_mem_size; + +/* + * The maximum amount of extra memory compared to the base size. The + * main scaling factor is the size of struct page. At extreme ratios + * of base:extra, all the base memory can be filled with page + * structures for the extra memory, leaving no space for anything + * else. + * + * 10x seems like a reasonable balance between scaling flexibility and + * leaving a practically usable system. + */ +#define EXTRA_MEM_RATIO (10) + +static __init void xen_add_extra_mem(unsigned long pages) +{ + u64 size = (u64)pages * PAGE_SIZE; + u64 extra_start = xen_extra_mem_start + xen_extra_mem_size; + + if (!pages) + return; + + e820_add_region(extra_start, size, E820_RAM); + sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); + + memblock_x86_reserve_range(extra_start, extra_start + size, "XEN EXTRA"); + + xen_extra_mem_size += size; + + xen_max_p2m_pfn = PFN_DOWN(extra_start + size); +} + static unsigned long __init xen_release_chunk(phys_addr_t start_addr, phys_addr_t end_addr) { @@ -105,16 +140,65 @@ static unsigned long __init xen_return_unused_memory(unsigned long max_pfn, /** * machine_specific_memory_setup - Hook for machine specific memory setup. **/ - char * __init xen_memory_setup(void) { + static struct e820entry map[E820MAX] __initdata; + unsigned long max_pfn = xen_start_info->nr_pages; + unsigned long long mem_end; + int rc; + struct xen_memory_map memmap; + unsigned long extra_pages = 0; + unsigned long extra_limit; + int i; + int op; max_pfn = min(MAX_DOMAIN_PAGES, max_pfn); + mem_end = PFN_PHYS(max_pfn); + + memmap.nr_entries = E820MAX; + set_xen_guest_handle(memmap.buffer, map); + + op = xen_initial_domain() ? 
+ XENMEM_machine_memory_map : + XENMEM_memory_map; + rc = HYPERVISOR_memory_op(op, &memmap); + if (rc == -ENOSYS) { + memmap.nr_entries = 1; + map[0].addr = 0ULL; + map[0].size = mem_end; + /* 8MB slack (to balance backend allocations). */ + map[0].size += 8ULL << 20; + map[0].type = E820_RAM; + rc = 0; + } + BUG_ON(rc); e820.nr_map = 0; + xen_extra_mem_start = mem_end; + for (i = 0; i < memmap.nr_entries; i++) { + unsigned long long end = map[i].addr + map[i].size; + + if (map[i].type == E820_RAM) { + if (map[i].addr < mem_end && end > mem_end) { + /* Truncate region to mem_end. */ + u64 delta = end - mem_end; + + map[i].size -= delta; + extra_pages += PFN_DOWN(delta); + + end = mem_end; + } + } - e820_add_region(0, PFN_PHYS((u64)max_pfn), E820_RAM); + if (end > xen_extra_mem_start) + xen_extra_mem_start = end; + + /* If region is non-RAM or below mem_end, add what remains */ + if ((map[i].type != E820_RAM || map[i].addr < mem_end) && + map[i].size > 0) + e820_add_region(map[i].addr, map[i].size, map[i].type); + } /* * Even though this is normal, usable memory under Xen, reserve @@ -136,7 +220,29 @@ char * __init xen_memory_setup(void) sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); - xen_return_unused_memory(xen_start_info->nr_pages, &e820); + extra_pages += xen_return_unused_memory(xen_start_info->nr_pages, &e820); + + /* + * Clamp the amount of extra memory to an EXTRA_MEM_RATIO + * factor of the base size. On non-highmem systems, the base + * size is the full initial memory allocation; on highmem it + * is limited to the max size of lowmem, so that it doesn't + * get completely filled. + * + * In principle there could be a problem in lowmem systems if + * the initial memory is also very large with respect to + * lowmem, but we won't try to deal with that here. 
+ */ + extra_limit = min(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)), + max_pfn + extra_pages); + + if (extra_limit >= max_pfn) + extra_pages = extra_limit - max_pfn; + else + extra_pages = 0; + + if (!xen_initial_domain()) + xen_add_extra_mem(extra_pages); return "Xen"; } diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 25f232b18a82..f4d010031465 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -400,9 +400,9 @@ static void stop_self(void *v) BUG(); } -static void xen_smp_send_stop(void) +static void xen_stop_other_cpus(int wait) { - smp_call_function(stop_self, NULL, 0); + smp_call_function(stop_self, NULL, wait); } static void xen_smp_send_reschedule(int cpu) @@ -470,7 +470,7 @@ static const struct smp_ops xen_smp_ops __initdata = { .cpu_disable = xen_cpu_disable, .play_dead = xen_play_dead, - .smp_send_stop = xen_smp_send_stop, + .stop_other_cpus = xen_stop_other_cpus, .smp_send_reschedule = xen_smp_send_reschedule, .send_call_func_ipi = xen_smp_send_call_function_ipi, diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 7c8ab86163e9..64044747348e 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -30,6 +30,9 @@ void xen_setup_machphys_mapping(void); pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn); void xen_ident_map_ISA(void); void xen_reserve_top(void); +extern unsigned long xen_max_p2m_pfn; + +void xen_set_pat(u64); char * __init xen_memory_setup(void); void __init xen_arch_setup(void); |