diff options
Diffstat (limited to 'arch/x86')
73 files changed, 2259 insertions, 681 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 249d1e0824b5..862adb9bf0d4 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -586,6 +586,16 @@ config AMD_IOMMU your BIOS for an option to enable it or if you have an IVRS ACPI table. +config AMD_IOMMU_STATS + bool "Export AMD IOMMU statistics to debugfs" + depends on AMD_IOMMU + select DEBUG_FS + help + This option enables code in the AMD IOMMU driver to collect various + statistics about whats happening in the driver and exports that + information to userspace via debugfs. + If unsure, say N. + # need this always selected by IOMMU for the VIA workaround config SWIOTLB def_bool y if X86_64 @@ -599,6 +609,9 @@ config SWIOTLB config IOMMU_HELPER def_bool (CALGARY_IOMMU || GART_IOMMU || SWIOTLB || AMD_IOMMU) +config IOMMU_API + def_bool (AMD_IOMMU || DMAR) + config MAXSMP bool "Configure Maximum number of SMP Processors and NUMA Nodes" depends on X86_64 && SMP && DEBUG_KERNEL && EXPERIMENTAL diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index b195f85526e3..9dabd00e9805 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -24,15 +24,14 @@ #include <asm/ucontext.h> #include <asm/uaccess.h> #include <asm/i387.h> -#include <asm/ia32.h> #include <asm/ptrace.h> #include <asm/ia32_unistd.h> #include <asm/user32.h> #include <asm/sigcontext32.h> #include <asm/proto.h> #include <asm/vdso.h> - #include <asm/sigframe.h> +#include <asm/sys_ia32.h> #define DEBUG_SIG 0 diff --git a/arch/x86/ia32/ipc32.c b/arch/x86/ia32/ipc32.c index d21991ce606c..29cdcd02ead3 100644 --- a/arch/x86/ia32/ipc32.c +++ b/arch/x86/ia32/ipc32.c @@ -8,6 +8,7 @@ #include <linux/shm.h> #include <linux/ipc.h> #include <linux/compat.h> +#include <asm/sys_ia32.h> asmlinkage long sys32_ipc(u32 call, int first, int second, int third, compat_uptr_t ptr, u32 fifth) diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c index 2e09dcd3c0a6..6c0d7f6231af 100644 --- a/arch/x86/ia32/sys_ia32.c +++ b/arch/x86/ia32/sys_ia32.c @@ -44,8 +44,8 @@ #include <asm/types.h> #include <asm/uaccess.h> #include <asm/atomic.h> -#include <asm/ia32.h> #include <asm/vgtod.h> +#include <asm/sys_ia32.h> #define AA(__x) ((unsigned long)(__x)) diff --git a/arch/x86/include/asm/amd_iommu_types.h b/arch/x86/include/asm/amd_iommu_types.h index ac302a2fa339..95c8cd9d22b5 100644 --- a/arch/x86/include/asm/amd_iommu_types.h +++ b/arch/x86/include/asm/amd_iommu_types.h @@ -190,16 +190,23 @@ /* FIXME: move this macro to <linux/pci.h> */ #define PCI_BUS(x) (((x) >> 8) & 0xff) +/* Protection domain flags */ +#define PD_DMA_OPS_MASK (1UL << 0) /* domain used for dma_ops */ +#define PD_DEFAULT_MASK (1UL << 1) /* domain is a default dma_ops + domain for an IOMMU */ + /* * This structure contains generic data for IOMMU protection domains * independent of their use. */ struct protection_domain { - spinlock_t lock; /* mostly used to lock the page table*/ - u16 id; /* the domain id written to the device table */ - int mode; /* paging mode (0-6 levels) */ - u64 *pt_root; /* page table root pointer */ - void *priv; /* private data */ + spinlock_t lock; /* mostly used to lock the page table*/ + u16 id; /* the domain id written to the device table */ + int mode; /* paging mode (0-6 levels) */ + u64 *pt_root; /* page table root pointer */ + unsigned long flags; /* flags to find out type of domain */ + unsigned dev_cnt; /* devices assigned to this domain */ + void *priv; /* private data */ }; /* @@ -295,7 +302,7 @@ struct amd_iommu { bool int_enabled; /* if one, we need to send a completion wait command */ - int need_sync; + bool need_sync; /* default dma_ops domain for that IOMMU */ struct dma_ops_domain *default_dom; @@ -374,7 +381,7 @@ extern struct protection_domain **amd_iommu_pd_table; extern unsigned long *amd_iommu_pd_alloc_bitmap; /* will be 1 if device isolation is enabled */ -extern int amd_iommu_isolate; +extern bool amd_iommu_isolate; /* * If true, the addresses will be flushed on unmap time, not when @@ -382,18 +389,6 @@ extern int amd_iommu_isolate; */ extern bool amd_iommu_unmap_flush; -/* takes a PCI device id and prints it out in a readable form */ -static inline void print_devid(u16 devid, int nl) -{ - int bus = devid >> 8; - int dev = devid >> 3 & 0x1f; - int fn = devid & 0x07; - - printk("%02x:%02x.%x", bus, dev, fn); - if (nl) - printk("\n"); -} - /* takes bus and device/function and returns the device id * FIXME: should that be in generic PCI code? */ static inline u16 calc_devid(u8 bus, u8 devfn) @@ -401,4 +396,32 @@ static inline u16 calc_devid(u8 bus, u8 devfn) return (((u16)bus) << 8) | devfn; } +#ifdef CONFIG_AMD_IOMMU_STATS + +struct __iommu_counter { + char *name; + struct dentry *dent; + u64 value; +}; + +#define DECLARE_STATS_COUNTER(nm) \ + static struct __iommu_counter nm = { \ + .name = #nm, \ + } + +#define INC_STATS_COUNTER(name) name.value += 1 +#define ADD_STATS_COUNTER(name, x) name.value += (x) +#define SUB_STATS_COUNTER(name, x) name.value -= (x) + +#else /* CONFIG_AMD_IOMMU_STATS */ + +#define DECLARE_STATS_COUNTER(name) +#define INC_STATS_COUNTER(name) +#define ADD_STATS_COUNTER(name, x) +#define SUB_STATS_COUNTER(name, x) + +static inline void amd_iommu_stats_init(void) { } + +#endif /* CONFIG_AMD_IOMMU_STATS */ + #endif /* _ASM_X86_AMD_IOMMU_TYPES_H */ diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 25caa0738af5..ab1d51a8855e 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -54,7 +54,6 @@ extern int disable_apic; extern int is_vsmp_box(void); extern void xapic_wait_icr_idle(void); extern u32 safe_xapic_wait_icr_idle(void); -extern u64 xapic_icr_read(void); extern void xapic_icr_write(u32, u32); extern int setup_profiling_timer(unsigned int); @@ -93,7 +92,7 @@ static inline u32 native_apic_msr_read(u32 reg) } #ifndef CONFIG_X86_32 -extern int x2apic, x2apic_preenabled; +extern int x2apic; extern void check_x2apic(void); extern void enable_x2apic(void); extern void enable_IR_x2apic(void); diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index a2e545c91c35..ca5ffb2856b6 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -90,6 +90,7 @@ extern void __iomem *efi_ioremap(unsigned long addr, unsigned long size); #endif /* CONFIG_X86_32 */ +extern int add_efi_memmap; extern void efi_reserve_early(void); extern void efi_call_phys_prelog(void); extern void efi_call_phys_epilog(void); diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 8346be87cfa1..730843d1d2fb 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -21,6 +21,7 @@ #include <asm/pvclock-abi.h> #include <asm/desc.h> +#include <asm/mtrr.h> #define KVM_MAX_VCPUS 16 #define KVM_MEMORY_SLOTS 32 @@ -86,6 +87,7 @@ #define KVM_MIN_FREE_MMU_PAGES 5 #define KVM_REFILL_PAGES 25 #define KVM_MAX_CPUID_ENTRIES 40 +#define KVM_NR_FIXED_MTRR_REGION 88 #define KVM_NR_VAR_MTRR 8 extern spinlock_t kvm_lock; @@ -180,6 +182,8 @@ struct kvm_mmu_page { struct list_head link; struct hlist_node hash_link; + struct list_head oos_link; + /* * The following two entries are used to key the shadow page in the * hash table. @@ -190,13 +194,16 @@ struct kvm_mmu_page { u64 *spt; /* hold the gfn of each spte inside spt */ gfn_t *gfns; - unsigned long slot_bitmap; /* One bit set per slot which has memory - * in this shadow page. - */ + /* + * One bit set per slot which has memory + * in this shadow page. + */ + DECLARE_BITMAP(slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS); int multimapped; /* More than one parent_pte? */ int root_count; /* Currently serving as active root */ bool unsync; - bool unsync_children; + bool global; + unsigned int unsync_children; union { u64 *parent_pte; /* !multimapped */ struct hlist_head parent_ptes; /* multimapped, kvm_pte_chain */ @@ -327,8 +334,10 @@ struct kvm_vcpu_arch { bool nmi_pending; bool nmi_injected; + bool nmi_window_open; - u64 mtrr[0x100]; + struct mtrr_state_type mtrr_state; + u32 pat; }; struct kvm_mem_alias { @@ -350,11 +359,13 @@ struct kvm_arch{ */ struct list_head active_mmu_pages; struct list_head assigned_dev_head; - struct dmar_domain *intel_iommu_domain; + struct list_head oos_global_pages; + struct iommu_domain *iommu_domain; struct kvm_pic *vpic; struct kvm_ioapic *vioapic; struct kvm_pit *vpit; struct hlist_head irq_ack_notifier_list; + int vapics_in_nmi_mode; int round_robin_prev_vcpu; unsigned int tss_addr; @@ -378,6 +389,7 @@ struct kvm_vm_stat { u32 mmu_recycled; u32 mmu_cache_miss; u32 mmu_unsync; + u32 mmu_unsync_global; u32 remote_tlb_flush; u32 lpages; }; @@ -397,6 +409,7 @@ struct kvm_vcpu_stat { u32 halt_exits; u32 halt_wakeup; u32 request_irq_exits; + u32 request_nmi_exits; u32 irq_exits; u32 host_state_reload; u32 efer_reload; @@ -405,6 +418,7 @@ struct kvm_vcpu_stat { u32 insn_emulation_fail; u32 hypercalls; u32 irq_injections; + u32 nmi_injections; }; struct descriptor_table { @@ -477,6 +491,7 @@ struct kvm_x86_ops { int (*set_tss_addr)(struct kvm *kvm, unsigned int addr); int (*get_tdp_level)(void); + int (*get_mt_mask_shift)(void); }; extern struct kvm_x86_ops *kvm_x86_ops; @@ -490,7 +505,7 @@ int kvm_mmu_setup(struct kvm_vcpu *vcpu); void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte); void kvm_mmu_set_base_ptes(u64 base_pte); void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, - u64 dirty_mask, u64 nx_mask, u64 x_mask); + u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 mt_mask); int kvm_mmu_reset_context(struct kvm_vcpu *vcpu); void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot); @@ -587,12 +602,14 @@ unsigned long segment_base(u16 selector); void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu); void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, - const u8 *new, int bytes); + const u8 *new, int bytes, + bool guest_initiated); int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva); void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu); int kvm_mmu_load(struct kvm_vcpu *vcpu); void kvm_mmu_unload(struct kvm_vcpu *vcpu); void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu); +void kvm_mmu_sync_global(struct kvm_vcpu *vcpu); int kvm_emulate_hypercall(struct kvm_vcpu *vcpu); @@ -607,6 +624,8 @@ void kvm_disable_tdp(void); int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3); int complete_pio(struct kvm_vcpu *vcpu); +struct kvm_memory_slot *gfn_to_memslot_unaliased(struct kvm *kvm, gfn_t gfn); + static inline struct kvm_mmu_page *page_header(hpa_t shadow_page) { struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT); @@ -702,18 +721,6 @@ static inline void kvm_inject_gp(struct kvm_vcpu *vcpu, u32 error_code) kvm_queue_exception_e(vcpu, GP_VECTOR, error_code); } -#define ASM_VMX_VMCLEAR_RAX ".byte 0x66, 0x0f, 0xc7, 0x30" -#define ASM_VMX_VMLAUNCH ".byte 0x0f, 0x01, 0xc2" -#define ASM_VMX_VMRESUME ".byte 0x0f, 0x01, 0xc3" -#define ASM_VMX_VMPTRLD_RAX ".byte 0x0f, 0xc7, 0x30" -#define ASM_VMX_VMREAD_RDX_RAX ".byte 0x0f, 0x78, 0xd0" -#define ASM_VMX_VMWRITE_RAX_RDX ".byte 0x0f, 0x79, 0xd0" -#define ASM_VMX_VMWRITE_RSP_RDX ".byte 0x0f, 0x79, 0xd4" -#define ASM_VMX_VMXOFF ".byte 0x0f, 0x01, 0xc4" -#define ASM_VMX_VMXON_RAX ".byte 0xf3, 0x0f, 0xc7, 0x30" -#define ASM_VMX_INVEPT ".byte 0x66, 0x0f, 0x38, 0x80, 0x08" -#define ASM_VMX_INVVPID ".byte 0x66, 0x0f, 0x38, 0x81, 0x08" - #define MSR_IA32_TIME_STAMP_COUNTER 0x010 #define TSS_IOPB_BASE_OFFSET 0x66 diff --git a/arch/x86/include/asm/kvm_x86_emulate.h b/arch/x86/include/asm/kvm_x86_emulate.h index 25179a29f208..6a159732881a 100644 --- a/arch/x86/include/asm/kvm_x86_emulate.h +++ b/arch/x86/include/asm/kvm_x86_emulate.h @@ -123,6 +123,7 @@ struct decode_cache { u8 ad_bytes; u8 rex_prefix; struct operand src; + struct operand src2; struct operand dst; bool has_seg_override; u8 seg_override; @@ -146,22 +147,18 @@ struct x86_emulate_ctxt { /* Register state before/after emulation. */ struct kvm_vcpu *vcpu; - /* Linear faulting address (if emulating a page-faulting instruction) */ unsigned long eflags; - /* Emulated execution mode, represented by an X86EMUL_MODE value. */ int mode; - u32 cs_base; /* decode cache */ - struct decode_cache decode; }; /* Repeat String Operation Prefix */ -#define REPE_PREFIX 1 -#define REPNE_PREFIX 2 +#define REPE_PREFIX 1 +#define REPNE_PREFIX 2 /* Execution mode, passed to the emulator. */ #define X86EMUL_MODE_REAL 0 /* Real mode. */ @@ -170,7 +167,7 @@ struct x86_emulate_ctxt { #define X86EMUL_MODE_PROT64 8 /* 64-bit (long) mode. */ /* Host execution mode. */ -#if defined(__i386__) +#if defined(CONFIG_X86_32) #define X86EMUL_MODE_HOST X86EMUL_MODE_PROT32 #elif defined(CONFIG_X86_64) #define X86EMUL_MODE_HOST X86EMUL_MODE_PROT64 diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h index 91885c28f66b..62d14ce3cd00 100644 --- a/arch/x86/include/asm/mpspec.h +++ b/arch/x86/include/asm/mpspec.h @@ -6,13 +6,13 @@ #include <asm/mpspec_def.h> extern int apic_version[MAX_APICS]; +extern int pic_mode; #ifdef CONFIG_X86_32 #include <mach_mpspec.h> extern unsigned int def_to_bigsmp; extern u8 apicid_2_node[]; -extern int pic_mode; #ifdef CONFIG_X86_NUMAQ extern int mp_bus_id_to_node[MAX_MP_BUSSES]; diff --git a/arch/x86/include/asm/mtrr.h b/arch/x86/include/asm/mtrr.h index 7c1e4258b31e..cb988aab716d 100644 --- a/arch/x86/include/asm/mtrr.h +++ b/arch/x86/include/asm/mtrr.h @@ -57,6 +57,31 @@ struct mtrr_gentry { }; #endif /* !__i386__ */ +struct mtrr_var_range { + u32 base_lo; + u32 base_hi; + u32 mask_lo; + u32 mask_hi; +}; + +/* In the Intel processor's MTRR interface, the MTRR type is always held in + an 8 bit field: */ +typedef u8 mtrr_type; + +#define MTRR_NUM_FIXED_RANGES 88 +#define MTRR_MAX_VAR_RANGES 256 + +struct mtrr_state_type { + struct mtrr_var_range var_ranges[MTRR_MAX_VAR_RANGES]; + mtrr_type fixed_ranges[MTRR_NUM_FIXED_RANGES]; + unsigned char enabled; + unsigned char have_fixed; + mtrr_type def_type; +}; + +#define MTRRphysBase_MSR(reg) (0x200 + 2 * (reg)) +#define MTRRphysMask_MSR(reg) (0x200 + 2 * (reg) + 1) + /* These are the various ioctls */ #define MTRRIOC_ADD_ENTRY _IOW(MTRR_IOCTL_BASE, 0, struct mtrr_sentry) #define MTRRIOC_SET_ENTRY _IOW(MTRR_IOCTL_BASE, 1, struct mtrr_sentry) diff --git a/arch/x86/pci/pci.h b/arch/x86/include/asm/pci_x86.h index 1959018aac02..e60fd3e14bdf 100644 --- a/arch/x86/pci/pci.h +++ b/arch/x86/include/asm/pci_x86.h @@ -57,7 +57,8 @@ extern struct pci_ops pci_root_ops; struct irq_info { u8 bus, devfn; /* Bus, device and function */ struct { - u8 link; /* IRQ line ID, chipset dependent, 0=not routed */ + u8 link; /* IRQ line ID, chipset dependent, + 0 = not routed */ u16 bitmap; /* Available IRQs */ } __attribute__((packed)) irq[4]; u8 slot; /* Slot number, 0=onboard */ @@ -69,11 +70,13 @@ struct irq_routing_table { u16 version; /* PIRQ_VERSION */ u16 size; /* Table size in bytes */ u8 rtr_bus, rtr_devfn; /* Where the interrupt router lies */ - u16 exclusive_irqs; /* IRQs devoted exclusively to PCI usage */ - u16 rtr_vendor, rtr_device; /* Vendor and device ID of interrupt router */ + u16 exclusive_irqs; /* IRQs devoted exclusively to + PCI usage */ + u16 rtr_vendor, rtr_device; /* Vendor and device ID of + interrupt router */ u32 miniport_data; /* Crap */ u8 rfu[11]; - u8 checksum; /* Modulo 256 checksum must give zero */ + u8 checksum; /* Modulo 256 checksum must give 0 */ struct irq_info slots[0]; } __attribute__((packed)); @@ -148,15 +151,15 @@ static inline unsigned int mmio_config_readl(void __iomem *pos) static inline void mmio_config_writeb(void __iomem *pos, u8 val) { - asm volatile("movb %%al,(%1)" :: "a" (val), "r" (pos) : "memory"); + asm volatile("movb %%al,(%1)" : : "a" (val), "r" (pos) : "memory"); } static inline void mmio_config_writew(void __iomem *pos, u16 val) { - asm volatile("movw %%ax,(%1)" :: "a" (val), "r" (pos) : "memory"); + asm volatile("movw %%ax,(%1)" : : "a" (val), "r" (pos) : "memory"); } static inline void mmio_config_writel(void __iomem *pos, u32 val) { - asm volatile("movl %%eax,(%1)" :: "a" (val), "r" (pos) : "memory"); + asm volatile("movl %%eax,(%1)" : : "a" (val), "r" (pos) : "memory"); } diff --git a/arch/x86/kvm/svm.h b/arch/x86/include/asm/svm.h index 1b8afa78e869..1b8afa78e869 100644 --- a/arch/x86/kvm/svm.h +++ b/arch/x86/include/asm/svm.h diff --git a/arch/x86/include/asm/sys_ia32.h b/arch/x86/include/asm/sys_ia32.h new file mode 100644 index 000000000000..ffb08be2a530 --- /dev/null +++ b/arch/x86/include/asm/sys_ia32.h @@ -0,0 +1,101 @@ +/* + * sys_ia32.h - Linux ia32 syscall interfaces + * + * Copyright (c) 2008 Jaswinder Singh Rajput + * + * This file is released under the GPLv2. + * See the file COPYING for more details. + */ + +#ifndef _ASM_X86_SYS_IA32_H +#define _ASM_X86_SYS_IA32_H + +#include <linux/compiler.h> +#include <linux/linkage.h> +#include <linux/types.h> +#include <linux/signal.h> +#include <asm/compat.h> +#include <asm/ia32.h> + +/* ia32/sys_ia32.c */ +asmlinkage long sys32_truncate64(char __user *, unsigned long, unsigned long); +asmlinkage long sys32_ftruncate64(unsigned int, unsigned long, unsigned long); + +asmlinkage long sys32_stat64(char __user *, struct stat64 __user *); +asmlinkage long sys32_lstat64(char __user *, struct stat64 __user *); +asmlinkage long sys32_fstat64(unsigned int, struct stat64 __user *); +asmlinkage long sys32_fstatat(unsigned int, char __user *, + struct stat64 __user *, int); +struct mmap_arg_struct; +asmlinkage long sys32_mmap(struct mmap_arg_struct __user *); +asmlinkage long sys32_mprotect(unsigned long, size_t, unsigned long); + +asmlinkage long sys32_pipe(int __user *); +struct sigaction32; +struct old_sigaction32; +asmlinkage long sys32_rt_sigaction(int, struct sigaction32 __user *, + struct sigaction32 __user *, unsigned int); +asmlinkage long sys32_sigaction(int, struct old_sigaction32 __user *, + struct old_sigaction32 __user *); +asmlinkage long sys32_rt_sigprocmask(int, compat_sigset_t __user *, + compat_sigset_t __user *, unsigned int); +asmlinkage long sys32_alarm(unsigned int); + +struct sel_arg_struct; +asmlinkage long sys32_old_select(struct sel_arg_struct __user *); +asmlinkage long sys32_waitpid(compat_pid_t, unsigned int *, int); +asmlinkage long sys32_sysfs(int, u32, u32); + +asmlinkage long sys32_sched_rr_get_interval(compat_pid_t, + struct compat_timespec __user *); +asmlinkage long sys32_rt_sigpending(compat_sigset_t __user *, compat_size_t); +asmlinkage long sys32_rt_sigqueueinfo(int, int, compat_siginfo_t __user *); + +#ifdef CONFIG_SYSCTL_SYSCALL +struct sysctl_ia32; +asmlinkage long sys32_sysctl(struct sysctl_ia32 __user *); +#endif + +asmlinkage long sys32_pread(unsigned int, char __user *, u32, u32, u32); +asmlinkage long sys32_pwrite(unsigned int, char __user *, u32, u32, u32); + +asmlinkage long sys32_personality(unsigned long); +asmlinkage long sys32_sendfile(int, int, compat_off_t __user *, s32); + +asmlinkage long sys32_mmap2(unsigned long, unsigned long, unsigned long, + unsigned long, unsigned long, unsigned long); + +struct oldold_utsname; +struct old_utsname; +asmlinkage long sys32_olduname(struct oldold_utsname __user *); +long sys32_uname(struct old_utsname __user *); + +long sys32_ustat(unsigned, struct ustat32 __user *); + +asmlinkage long sys32_execve(char __user *, compat_uptr_t __user *, + compat_uptr_t __user *, struct pt_regs *); +asmlinkage long sys32_clone(unsigned int, unsigned int, struct pt_regs *); + +long sys32_lseek(unsigned int, int, unsigned int); +long sys32_kill(int, int); +long sys32_fadvise64_64(int, __u32, __u32, __u32, __u32, int); +long sys32_vm86_warning(void); +long sys32_lookup_dcookie(u32, u32, char __user *, size_t); + +asmlinkage ssize_t sys32_readahead(int, unsigned, unsigned, size_t); +asmlinkage long sys32_sync_file_range(int, unsigned, unsigned, + unsigned, unsigned, int); +asmlinkage long sys32_fadvise64(int, unsigned, unsigned, size_t, int); +asmlinkage long sys32_fallocate(int, int, unsigned, + unsigned, unsigned, unsigned); + +/* ia32/ia32_signal.c */ +asmlinkage long sys32_sigsuspend(int, int, old_sigset_t); +asmlinkage long sys32_sigaltstack(const stack_ia32_t __user *, + stack_ia32_t __user *, struct pt_regs *); +asmlinkage long sys32_sigreturn(struct pt_regs *); +asmlinkage long sys32_rt_sigreturn(struct pt_regs *); + +/* ia32/ipc32.c */ +asmlinkage long sys32_ipc(u32, int, int, int, compat_uptr_t, u32); +#endif /* _ASM_X86_SYS_IA32_H */ diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h index e2363253bbbf..50423c7b56b2 100644 --- a/arch/x86/include/asm/uv/uv_bau.h +++ b/arch/x86/include/asm/uv/uv_bau.h @@ -133,61 +133,61 @@ struct bau_msg_payload { * see table 4.2.3.0.1 in broacast_assist spec. */ struct bau_msg_header { - int dest_subnodeid:6; /* must be zero */ + unsigned int dest_subnodeid:6; /* must be zero */ /* bits 5:0 */ - int base_dest_nodeid:15; /* nasid>>1 (pnode) of first bit in node_map */ - /* bits 20:6 */ - int command:8; /* message type */ + unsigned int base_dest_nodeid:15; /* nasid>>1 (pnode) of */ + /* bits 20:6 */ /* first bit in node_map */ + unsigned int command:8; /* message type */ /* bits 28:21 */ /* 0x38: SN3net EndPoint Message */ - int rsvd_1:3; /* must be zero */ + unsigned int rsvd_1:3; /* must be zero */ /* bits 31:29 */ /* int will align on 32 bits */ - int rsvd_2:9; /* must be zero */ + unsigned int rsvd_2:9; /* must be zero */ /* bits 40:32 */ /* Suppl_A is 56-41 */ - int payload_2a:8; /* becomes byte 16 of msg */ + unsigned int payload_2a:8;/* becomes byte 16 of msg */ /* bits 48:41 */ /* not currently using */ - int payload_2b:8; /* becomes byte 17 of msg */ + unsigned int payload_2b:8;/* becomes byte 17 of msg */ /* bits 56:49 */ /* not currently using */ /* Address field (96:57) is never used as an address (these are address bits 42:3) */ - int rsvd_3:1; /* must be zero */ + unsigned int rsvd_3:1; /* must be zero */ /* bit 57 */ /* address bits 27:4 are payload */ /* these 24 bits become bytes 12-14 of msg */ - int replied_to:1; /* sent as 0 by the source to byte 12 */ + unsigned int replied_to:1;/* sent as 0 by the source to byte 12 */ /* bit 58 */ - int payload_1a:5; /* not currently used */ + unsigned int payload_1a:5;/* not currently used */ /* bits 63:59 */ - int payload_1b:8; /* not currently used */ + unsigned int payload_1b:8;/* not currently used */ /* bits 71:64 */ - int payload_1c:8; /* not currently used */ + unsigned int payload_1c:8;/* not currently used */ /* bits 79:72 */ - int payload_1d:2; /* not currently used */ + unsigned int payload_1d:2;/* not currently used */ /* bits 81:80 */ - int rsvd_4:7; /* must be zero */ + unsigned int rsvd_4:7; /* must be zero */ /* bits 88:82 */ - int sw_ack_flag:1; /* software acknowledge flag */ + unsigned int sw_ack_flag:1;/* software acknowledge flag */ /* bit 89 */ /* INTD trasactions at destination are to wait for software acknowledge */ - int rsvd_5:6; /* must be zero */ + unsigned int rsvd_5:6; /* must be zero */ /* bits 95:90 */ - int rsvd_6:5; /* must be zero */ + unsigned int rsvd_6:5; /* must be zero */ /* bits 100:96 */ - int int_both:1; /* if 1, interrupt both sockets on the blade */ + unsigned int int_both:1;/* if 1, interrupt both sockets on the blade */ /* bit 101*/ - int fairness:3; /* usually zero */ + unsigned int fairness:3;/* usually zero */ /* bits 104:102 */ - int multilevel:1; /* multi-level multicast format */ + unsigned int multilevel:1; /* multi-level multicast format */ /* bit 105 */ /* 0 for TLB: endpoint multi-unicast messages */ - int chaining:1; /* next descriptor is part of this activation*/ + unsigned int chaining:1;/* next descriptor is part of this activation*/ /* bit 106 */ - int rsvd_7:21; /* must be zero */ + unsigned int rsvd_7:21; /* must be zero */ /* bits 127:107 */ }; diff --git a/arch/x86/include/asm/virtext.h b/arch/x86/include/asm/virtext.h new file mode 100644 index 000000000000..593636275238 --- /dev/null +++ b/arch/x86/include/asm/virtext.h @@ -0,0 +1,132 @@ +/* CPU virtualization extensions handling + * + * This should carry the code for handling CPU virtualization extensions + * that needs to live in the kernel core. + * + * Author: Eduardo Habkost <ehabkost@redhat.com> + * + * Copyright (C) 2008, Red Hat Inc. + * + * Contains code from KVM, Copyright (C) 2006 Qumranet, Inc. + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + */ +#ifndef _ASM_X86_VIRTEX_H +#define _ASM_X86_VIRTEX_H + +#include <asm/processor.h> +#include <asm/system.h> + +#include <asm/vmx.h> +#include <asm/svm.h> + +/* + * VMX functions: + */ + +static inline int cpu_has_vmx(void) +{ + unsigned long ecx = cpuid_ecx(1); + return test_bit(5, &ecx); /* CPUID.1:ECX.VMX[bit 5] -> VT */ +} + + +/** Disable VMX on the current CPU + * + * vmxoff causes a undefined-opcode exception if vmxon was not run + * on the CPU previously. Only call this function if you know VMX + * is enabled. + */ +static inline void cpu_vmxoff(void) +{ + asm volatile (ASM_VMX_VMXOFF : : : "cc"); + write_cr4(read_cr4() & ~X86_CR4_VMXE); +} + +static inline int cpu_vmx_enabled(void) +{ + return read_cr4() & X86_CR4_VMXE; +} + +/** Disable VMX if it is enabled on the current CPU + * + * You shouldn't call this if cpu_has_vmx() returns 0. + */ +static inline void __cpu_emergency_vmxoff(void) +{ + if (cpu_vmx_enabled()) + cpu_vmxoff(); +} + +/** Disable VMX if it is supported and enabled on the current CPU + */ +static inline void cpu_emergency_vmxoff(void) +{ + if (cpu_has_vmx()) + __cpu_emergency_vmxoff(); +} + + + + +/* + * SVM functions: + */ + +/** Check if the CPU has SVM support + * + * You can use the 'msg' arg to get a message describing the problem, + * if the function returns zero. Simply pass NULL if you are not interested + * on the messages; gcc should take care of not generating code for + * the messages on this case. + */ +static inline int cpu_has_svm(const char **msg) +{ + uint32_t eax, ebx, ecx, edx; + + if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) { + if (msg) + *msg = "not amd"; + return 0; + } + + cpuid(0x80000000, &eax, &ebx, &ecx, &edx); + if (eax < SVM_CPUID_FUNC) { + if (msg) + *msg = "can't execute cpuid_8000000a"; + return 0; + } + + cpuid(0x80000001, &eax, &ebx, &ecx, &edx); + if (!(ecx & (1 << SVM_CPUID_FEATURE_SHIFT))) { + if (msg) + *msg = "svm not available"; + return 0; + } + return 1; +} + + +/** Disable SVM on the current CPU + * + * You should call this only if cpu_has_svm() returned true. + */ +static inline void cpu_svm_disable(void) +{ + uint64_t efer; + + wrmsrl(MSR_VM_HSAVE_PA, 0); + rdmsrl(MSR_EFER, efer); + wrmsrl(MSR_EFER, efer & ~MSR_EFER_SVME_MASK); +} + +/** Makes sure SVM is disabled, if it is supported on the CPU + */ +static inline void cpu_emergency_svm_disable(void) +{ + if (cpu_has_svm(NULL)) + cpu_svm_disable(); +} + +#endif /* _ASM_X86_VIRTEX_H */ diff --git a/arch/x86/kvm/vmx.h b/arch/x86/include/asm/vmx.h index ec5edc339da6..d0238e6151d8 100644 --- a/arch/x86/kvm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -63,10 +63,13 @@ #define VM_EXIT_HOST_ADDR_SPACE_SIZE 0x00000200 #define VM_EXIT_ACK_INTR_ON_EXIT 0x00008000 +#define VM_EXIT_SAVE_IA32_PAT 0x00040000 +#define VM_EXIT_LOAD_IA32_PAT 0x00080000 #define VM_ENTRY_IA32E_MODE 0x00000200 #define VM_ENTRY_SMM 0x00000400 #define VM_ENTRY_DEACT_DUAL_MONITOR 0x00000800 +#define VM_ENTRY_LOAD_IA32_PAT 0x00004000 /* VMCS Encodings */ enum vmcs_field { @@ -112,6 +115,8 @@ enum vmcs_field { VMCS_LINK_POINTER_HIGH = 0x00002801, GUEST_IA32_DEBUGCTL = 0x00002802, GUEST_IA32_DEBUGCTL_HIGH = 0x00002803, + GUEST_IA32_PAT = 0x00002804, + GUEST_IA32_PAT_HIGH = 0x00002805, GUEST_PDPTR0 = 0x0000280a, GUEST_PDPTR0_HIGH = 0x0000280b, GUEST_PDPTR1 = 0x0000280c, @@ -120,6 +125,8 @@ enum vmcs_field { GUEST_PDPTR2_HIGH = 0x0000280f, GUEST_PDPTR3 = 0x00002810, GUEST_PDPTR3_HIGH = 0x00002811, + HOST_IA32_PAT = 0x00002c00, + HOST_IA32_PAT_HIGH = 0x00002c01, PIN_BASED_VM_EXEC_CONTROL = 0x00004000, CPU_BASED_VM_EXEC_CONTROL = 0x00004002, EXCEPTION_BITMAP = 0x00004004, @@ -331,8 +338,9 @@ enum vmcs_field { #define AR_RESERVD_MASK 0xfffe0f00 -#define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT 9 -#define IDENTITY_PAGETABLE_PRIVATE_MEMSLOT 10 +#define TSS_PRIVATE_MEMSLOT (KVM_MEMORY_SLOTS + 0) +#define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT (KVM_MEMORY_SLOTS + 1) +#define IDENTITY_PAGETABLE_PRIVATE_MEMSLOT (KVM_MEMORY_SLOTS + 2) #define VMX_NR_VPIDS (1 << 16) #define VMX_VPID_EXTENT_SINGLE_CONTEXT 1 @@ -356,4 +364,19 @@ enum vmcs_field { #define VMX_EPT_IDENTITY_PAGETABLE_ADDR 0xfffbc000ul + +#define ASM_VMX_VMCLEAR_RAX ".byte 0x66, 0x0f, 0xc7, 0x30" +#define ASM_VMX_VMLAUNCH ".byte 0x0f, 0x01, 0xc2" +#define ASM_VMX_VMRESUME ".byte 0x0f, 0x01, 0xc3" +#define ASM_VMX_VMPTRLD_RAX ".byte 0x0f, 0xc7, 0x30" +#define ASM_VMX_VMREAD_RDX_RAX ".byte 0x0f, 0x78, 0xd0" +#define ASM_VMX_VMWRITE_RAX_RDX ".byte 0x0f, 0x79, 0xd0" +#define ASM_VMX_VMWRITE_RSP_RDX ".byte 0x0f, 0x79, 0xd4" +#define ASM_VMX_VMXOFF ".byte 0x0f, 0x01, 0xc4" +#define ASM_VMX_VMXON_RAX ".byte 0xf3, 0x0f, 0xc7, 0x30" +#define ASM_VMX_INVEPT ".byte 0x66, 0x0f, 0x38, 0x80, 0x08" +#define ASM_VMX_INVVPID ".byte 0x66, 0x0f, 0x38, 0x81, 0x08" + + + #endif diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 2e2da717b350..5113c080f0c4 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -20,8 +20,12 @@ #include <linux/pci.h> #include <linux/gfp.h> #include <linux/bitops.h> +#include <linux/debugfs.h> #include <linux/scatterlist.h> #include <linux/iommu-helper.h> +#ifdef CONFIG_IOMMU_API +#include <linux/iommu.h> +#endif #include <asm/proto.h> #include <asm/iommu.h> #include <asm/gart.h> @@ -38,6 +42,10 @@ static DEFINE_RWLOCK(amd_iommu_devtable_lock); static LIST_HEAD(iommu_pd_list); static DEFINE_SPINLOCK(iommu_pd_list_lock); +#ifdef CONFIG_IOMMU_API +static struct iommu_ops amd_iommu_ops; +#endif + /* * general struct to manage commands send to an IOMMU */ @@ -47,6 +55,68 @@ struct iommu_cmd { static int dma_ops_unity_map(struct dma_ops_domain *dma_dom, struct unity_map_entry *e); +static struct dma_ops_domain *find_protection_domain(u16 devid); + + +#ifdef CONFIG_AMD_IOMMU_STATS + +/* + * Initialization code for statistics collection + */ + +DECLARE_STATS_COUNTER(compl_wait); +DECLARE_STATS_COUNTER(cnt_map_single); +DECLARE_STATS_COUNTER(cnt_unmap_single); +DECLARE_STATS_COUNTER(cnt_map_sg); +DECLARE_STATS_COUNTER(cnt_unmap_sg); +DECLARE_STATS_COUNTER(cnt_alloc_coherent); +DECLARE_STATS_COUNTER(cnt_free_coherent); +DECLARE_STATS_COUNTER(cross_page); +DECLARE_STATS_COUNTER(domain_flush_single); +DECLARE_STATS_COUNTER(domain_flush_all); +DECLARE_STATS_COUNTER(alloced_io_mem); +DECLARE_STATS_COUNTER(total_map_requests); + +static struct dentry *stats_dir; +static struct dentry *de_isolate; +static struct dentry *de_fflush; + +static void amd_iommu_stats_add(struct __iommu_counter *cnt) +{ + if (stats_dir == NULL) + return; + + cnt->dent = debugfs_create_u64(cnt->name, 0444, stats_dir, + &cnt->value); +} + +static void amd_iommu_stats_init(void) +{ + stats_dir = debugfs_create_dir("amd-iommu", NULL); + if (stats_dir == NULL) + return; + + de_isolate = debugfs_create_bool("isolation", 0444, stats_dir, + (u32 *)&amd_iommu_isolate); + + de_fflush = debugfs_create_bool("fullflush", 0444, stats_dir, + (u32 *)&amd_iommu_unmap_flush); + + amd_iommu_stats_add(&compl_wait); + amd_iommu_stats_add(&cnt_map_single); + amd_iommu_stats_add(&cnt_unmap_single); + amd_iommu_stats_add(&cnt_map_sg); + amd_iommu_stats_add(&cnt_unmap_sg); + amd_iommu_stats_add(&cnt_alloc_coherent); + amd_iommu_stats_add(&cnt_free_coherent); + amd_iommu_stats_add(&cross_page); + amd_iommu_stats_add(&domain_flush_single); + amd_iommu_stats_add(&domain_flush_all); + amd_iommu_stats_add(&alloced_io_mem); + amd_iommu_stats_add(&total_map_requests); +} + +#endif /* returns !0 if the IOMMU is caching non-present entries in its TLB */ static int iommu_has_npcache(struct amd_iommu *iommu) @@ -189,13 +259,55 @@ static int iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd) spin_lock_irqsave(&iommu->lock, flags); ret = __iommu_queue_command(iommu, cmd); if (!ret) - iommu->need_sync = 1; + iommu->need_sync = true; spin_unlock_irqrestore(&iommu->lock, flags); return ret; } /* + * This function waits until an IOMMU has completed a completion + * wait command + */ +static void __iommu_wait_for_completion(struct amd_iommu *iommu) +{ + int ready = 0; + unsigned status = 0; + unsigned long i = 0; + + INC_STATS_COUNTER(compl_wait); + + while (!ready && (i < EXIT_LOOP_COUNT)) { + ++i; + /* wait for the bit to become one */ + status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET); + ready = status & MMIO_STATUS_COM_WAIT_INT_MASK; + } + + /* set bit back to zero */ + status &= ~MMIO_STATUS_COM_WAIT_INT_MASK; + writel(status, iommu->mmio_base + MMIO_STATUS_OFFSET); + + if (unlikely(i == EXIT_LOOP_COUNT)) + panic("AMD IOMMU: Completion wait loop failed\n"); +} + +/* + * This function queues a completion wait command into the command + * buffer of an IOMMU + */ +static int __iommu_completion_wait(struct amd_iommu *iommu) +{ + struct iommu_cmd cmd; + + memset(&cmd, 0, sizeof(cmd)); + cmd.data[0] = CMD_COMPL_WAIT_INT_MASK; + CMD_SET_TYPE(&cmd, CMD_COMPL_WAIT); + + return __iommu_queue_command(iommu, &cmd); +} + +/* * This function is called whenever we need to ensure that the IOMMU has * completed execution of all commands we sent. It sends a * COMPLETION_WAIT command and waits for it to finish. The IOMMU informs @@ -204,40 +316,22 @@ static int iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd) */ static int iommu_completion_wait(struct amd_iommu *iommu) { - int ret = 0, ready = 0; - unsigned status = 0; - struct iommu_cmd cmd; - unsigned long flags, i = 0; - - memset(&cmd, 0, sizeof(cmd)); - cmd.data[0] = CMD_COMPL_WAIT_INT_MASK; - CMD_SET_TYPE(&cmd, CMD_COMPL_WAIT); + int ret = 0; + unsigned long flags; spin_lock_irqsave(&iommu->lock, flags); if (!iommu->need_sync) goto out; - iommu->need_sync = 0; + ret = __iommu_completion_wait(iommu); - ret = __iommu_queue_command(iommu, &cmd); + iommu->need_sync = false; if (ret) goto out; - while (!ready && (i < EXIT_LOOP_COUNT)) { - ++i; - /* wait for the bit to become one */ - status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET); - ready = status & MMIO_STATUS_COM_WAIT_INT_MASK; - } - - /* set bit back to zero */ - status &= ~MMIO_STATUS_COM_WAIT_INT_MASK; - writel(status, iommu->mmio_base + MMIO_STATUS_OFFSET); - - if (unlikely(i == EXIT_LOOP_COUNT)) - panic("AMD IOMMU: Completion wait loop failed\n"); + __iommu_wait_for_completion(iommu); out: spin_unlock_irqrestore(&iommu->lock, flags); @@ -264,6 +358,21 @@ static int iommu_queue_inv_dev_entry(struct amd_iommu *iommu, u16 devid) return ret; } +static void __iommu_build_inv_iommu_pages(struct iommu_cmd *cmd, u64 address, + u16 domid, int pde, int s) +{ + memset(cmd, 0, sizeof(*cmd)); + address &= PAGE_MASK; + CMD_SET_TYPE(cmd, CMD_INV_IOMMU_PAGES); + cmd->data[1] |= domid; + cmd->data[2] = lower_32_bits(address); + cmd->data[3] = upper_32_bits(address); + if (s) /* size bit - we flush more than one 4kb page */ + cmd->data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK; + if (pde) /* PDE bit - we wan't flush everything not only the PTEs */ + cmd->data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK; +} + /* * Generic command send function for invalidaing TLB entries */ @@ -273,16 +382,7 @@ static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu, struct iommu_cmd cmd; int ret; - memset(&cmd, 0, sizeof(cmd)); - address &= PAGE_MASK; - CMD_SET_TYPE(&cmd, CMD_INV_IOMMU_PAGES); - cmd.data[1] |= domid; - cmd.data[2] = lower_32_bits(address); - cmd.data[3] = upper_32_bits(address); - if (s) /* size bit - we flush more than one 4kb page */ - cmd.data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK; - if (pde) /* PDE bit - we wan't flush everything not only the PTEs */ - cmd.data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK; + __iommu_build_inv_iommu_pages(&cmd, address, domid, pde, s); ret = iommu_queue_command(iommu, &cmd); @@ -321,9 +421,35 @@ static void iommu_flush_tlb(struct amd_iommu *iommu, u16 domid) { u64 address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS; + INC_STATS_COUNTER(domain_flush_single); + iommu_queue_inv_iommu_pages(iommu, address, domid, 0, 1); } +/* + * This function is used to flush the IO/TLB for a given protection domain + * on every IOMMU in the system + */ +static void iommu_flush_domain(u16 domid) +{ + unsigned long flags; + struct amd_iommu *iommu; + struct iommu_cmd cmd; + + INC_STATS_COUNTER(domain_flush_all); + + __iommu_build_inv_iommu_pages(&cmd, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, + domid, 1, 1); + + list_for_each_entry(iommu, &amd_iommu_list, list) { + spin_lock_irqsave(&iommu->lock, flags); + __iommu_queue_command(iommu, &cmd); + __iommu_completion_wait(iommu); + __iommu_wait_for_completion(iommu); + spin_unlock_irqrestore(&iommu->lock, flags); + } +} + /**************************************************************************** * * The functions below are used the create the page table mappings for @@ -338,10 +464,10 @@ static void iommu_flush_tlb(struct amd_iommu *iommu, u16 domid) * supporting all features of AMD IOMMU page tables like level skipping * and full 64 bit address spaces. */ -static int iommu_map(struct protection_domain *dom, - unsigned long bus_addr, - unsigned long phys_addr, - int prot) +static int iommu_map_page(struct protection_domain *dom, + unsigned long bus_addr, + unsigned long phys_addr, + int prot) { u64 __pte, *pte, *page; @@ -388,6 +514,28 @@ static int iommu_map(struct protection_domain *dom, return 0; } +static void iommu_unmap_page(struct protection_domain *dom, + unsigned long bus_addr) +{ + u64 *pte; + + pte = &dom->pt_root[IOMMU_PTE_L2_INDEX(bus_addr)]; + + if (!IOMMU_PTE_PRESENT(*pte)) + return; + + pte = IOMMU_PTE_PAGE(*pte); + pte = &pte[IOMMU_PTE_L1_INDEX(bus_addr)]; + + if (!IOMMU_PTE_PRESENT(*pte)) + return; + + pte = IOMMU_PTE_PAGE(*pte); + pte = &pte[IOMMU_PTE_L1_INDEX(bus_addr)]; + + *pte = 0; +} + /* * This function checks if a specific unity mapping entry is needed for * this specific IOMMU. @@ -440,7 +588,7 @@ static int dma_ops_unity_map(struct dma_ops_domain *dma_dom, for (addr = e->address_start; addr < e->address_end; addr += PAGE_SIZE) { - ret = iommu_map(&dma_dom->domain, addr, addr, e->prot); + ret = iommu_map_page(&dma_dom->domain, addr, addr, e->prot); if (ret) return ret; /* @@ -571,6 +719,16 @@ static u16 domain_id_alloc(void) return id; } +static void domain_id_free(int id) +{ + unsigned long flags; + + write_lock_irqsave(&amd_iommu_devtable_lock, flags); + if (id > 0 && id < MAX_DOMAIN_ID) + __clear_bit(id, amd_iommu_pd_alloc_bitmap); + write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); +} + /* * Used to reserve address ranges in the aperture (e.g. for exclusion * ranges. @@ -587,12 +745,12 @@ static void dma_ops_reserve_addresses(struct dma_ops_domain *dom, iommu_area_reserve(dom->bitmap, start_page, pages); } -static void dma_ops_free_pagetable(struct dma_ops_domain *dma_dom) +static void free_pagetable(struct protection_domain *domain) { int i, j; u64 *p1, *p2, *p3; - p1 = dma_dom->domain.pt_root; + p1 = domain->pt_root; if (!p1) return; @@ -613,6 +771,8 @@ static void dma_ops_free_pagetable(struct dma_ops_domain *dma_dom) } free_page((unsigned long)p1); + + domain->pt_root = NULL; } /* @@ -624,7 +784,7 @@ static void dma_ops_domain_free(struct dma_ops_domain *dom) if (!dom) return; - dma_ops_free_pagetable(dom); + free_pagetable(&dom->domain); kfree(dom->pte_pages); @@ -663,6 +823,7 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu, goto free_dma_dom; dma_dom->domain.mode = PAGE_MODE_3_LEVEL; dma_dom->domain.pt_root = (void *)get_zeroed_page(GFP_KERNEL); + dma_dom->domain.flags = PD_DMA_OPS_MASK; dma_dom->domain.priv = dma_dom; if (!dma_dom->domain.pt_root) goto free_dma_dom; @@ -725,6 +886,15 @@ free_dma_dom: } /* + * little helper function to check whether a given protection domain is a + * dma_ops domain + */ +static bool dma_ops_domain(struct protection_domain *domain) +{ + return domain->flags & PD_DMA_OPS_MASK; +} + +/* * Find out the protection domain structure for a given PCI device. This * will give us the pointer to the page table root for example. */ @@ -744,14 +914,15 @@ static struct protection_domain *domain_for_device(u16 devid) * If a device is not yet associated with a domain, this function does * assigns it visible for the hardware */ -static void set_device_domain(struct amd_iommu *iommu, - struct protection_domain *domain, - u16 devid) +static void attach_device(struct amd_iommu *iommu, + struct protection_domain *domain, + u16 devid) { unsigned long flags; - u64 pte_root = virt_to_phys(domain->pt_root); + domain->dev_cnt += 1; + pte_root |= (domain->mode & DEV_ENTRY_MODE_MASK) << DEV_ENTRY_MODE_SHIFT; pte_root |= IOMMU_PTE_IR | IOMMU_PTE_IW | IOMMU_PTE_P | IOMMU_PTE_TV; @@ -767,6 +938,116 @@ static void set_device_domain(struct amd_iommu *iommu, iommu_queue_inv_dev_entry(iommu, devid); } +/* + * Removes a device from a protection domain (unlocked) + */ +static void __detach_device(struct protection_domain *domain, u16 devid) +{ + + /* lock domain */ + spin_lock(&domain->lock); + + /* remove domain from the lookup table */ + amd_iommu_pd_table[devid] = NULL; + + /* remove entry from the device table seen by the hardware */ + amd_iommu_dev_table[devid].data[0] = IOMMU_PTE_P | IOMMU_PTE_TV; + amd_iommu_dev_table[devid].data[1] = 0; + amd_iommu_dev_table[devid].data[2] = 0; + + /* decrease reference counter */ + domain->dev_cnt -= 1; + + /* ready */ + spin_unlock(&domain->lock); +} + +/* + * Removes a device from a protection domain (with devtable_lock held) + */ +static void detach_device(struct protection_domain *domain, u16 devid) +{ + unsigned long flags; + + /* lock device table */ + write_lock_irqsave(&amd_iommu_devtable_lock, flags); + __detach_device(domain, devid); + write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); +} + +static int device_change_notifier(struct notifier_block *nb, + unsigned long action, void *data) +{ + struct device *dev = data; + struct pci_dev *pdev = to_pci_dev(dev); + u16 devid = calc_devid(pdev->bus->number, pdev->devfn); + struct protection_domain *domain; + struct dma_ops_domain *dma_domain; + struct amd_iommu *iommu; + int order = amd_iommu_aperture_order; + unsigned long flags; + + if (devid > amd_iommu_last_bdf) + goto out; + + devid = amd_iommu_alias_table[devid]; + + iommu = amd_iommu_rlookup_table[devid]; + if (iommu == NULL) + goto out; + + domain = domain_for_device(devid); + + if (domain && !dma_ops_domain(domain)) + WARN_ONCE(1, "AMD IOMMU WARNING: device %s already bound " + "to a non-dma-ops domain\n", dev_name(dev)); + + switch (action) { + case BUS_NOTIFY_BOUND_DRIVER: + if (domain) + goto out; + dma_domain = find_protection_domain(devid); + if (!dma_domain) + dma_domain = iommu->default_dom; + attach_device(iommu, &dma_domain->domain, devid); + printk(KERN_INFO "AMD IOMMU: Using protection domain %d for " + "device %s\n", dma_domain->domain.id, dev_name(dev)); + break; + case BUS_NOTIFY_UNBIND_DRIVER: + if (!domain) + goto out; + detach_device(domain, devid); + break; + case BUS_NOTIFY_ADD_DEVICE: + /* allocate a protection domain if a device is added */ + dma_domain = find_protection_domain(devid); + if (dma_domain) + goto out; + dma_domain = dma_ops_domain_alloc(iommu, order); + if (!dma_domain) + goto out; + dma_domain->target_dev = devid; + + spin_lock_irqsave(&iommu_pd_list_lock, flags); + list_add_tail(&dma_domain->list, &iommu_pd_list); + spin_unlock_irqrestore(&iommu_pd_list_lock, flags); + + break; + default: + goto out; + } + + iommu_queue_inv_dev_entry(iommu, devid); + iommu_completion_wait(iommu); + +out: + return 0; +} + +struct notifier_block device_nb = { + .notifier_call = device_change_notifier, +}; + /***************************************************************************** * * The next functions belong to the dma_ops mapping/unmapping code. @@ -802,7 +1083,6 @@ static struct dma_ops_domain *find_protection_domain(u16 devid) list_for_each_entry(entry, &iommu_pd_list, list) { if (entry->target_dev == devid) { ret = entry; - list_del(&ret->list); break; } } @@ -853,14 +1133,13 @@ static int get_device_resources(struct device *dev, if (!dma_dom) dma_dom = (*iommu)->default_dom; *domain = &dma_dom->domain; - set_device_domain(*iommu, *domain, *bdf); + attach_device(*iommu, *domain, *bdf); printk(KERN_INFO "AMD IOMMU: Using protection domain %d for " - "device ", (*domain)->id); - print_devid(_bdf, 1); + "device %s\n", (*domain)->id, dev_name(dev)); } if (domain_for_device(_bdf) == NULL) - set_device_domain(*iommu, *domain, _bdf); + attach_device(*iommu, *domain, _bdf); return 1; } @@ -946,6 +1225,11 @@ static dma_addr_t __map_single(struct device *dev, pages = iommu_num_pages(paddr, size, PAGE_SIZE); paddr &= PAGE_MASK; + INC_STATS_COUNTER(total_map_requests); + + if (pages > 1) + INC_STATS_COUNTER(cross_page); + if (align) align_mask = (1UL << get_order(size)) - 1; @@ -962,6 +1246,8 @@ static dma_addr_t __map_single(struct device *dev, } address += offset; + ADD_STATS_COUNTER(alloced_io_mem, size); + if (unlikely(dma_dom->need_flush && !amd_iommu_unmap_flush)) { iommu_flush_tlb(iommu, dma_dom->domain.id); dma_dom->need_flush = false; @@ -998,6 +1284,8 @@ static void __unmap_single(struct amd_iommu *iommu, start += PAGE_SIZE; } + SUB_STATS_COUNTER(alloced_io_mem, size); + dma_ops_free_addresses(dma_dom, dma_addr, pages); if (amd_iommu_unmap_flush || dma_dom->need_flush) { @@ -1019,6 +1307,8 @@ static dma_addr_t map_single(struct device *dev, phys_addr_t paddr, dma_addr_t addr; u64 dma_mask; + INC_STATS_COUNTER(cnt_map_single); + if (!check_device(dev)) return bad_dma_address; @@ -1030,6 +1320,9 @@ static dma_addr_t map_single(struct device *dev, phys_addr_t paddr, /* device not handled by any AMD IOMMU */ return (dma_addr_t)paddr; + if (!dma_ops_domain(domain)) + return bad_dma_address; + spin_lock_irqsave(&domain->lock, flags); addr = __map_single(dev, iommu, domain->priv, paddr, size, dir, false, dma_mask); @@ -1055,11 +1348,16 @@ static void unmap_single(struct device *dev, dma_addr_t dma_addr, struct protection_domain *domain; u16 devid; + INC_STATS_COUNTER(cnt_unmap_single); + if (!check_device(dev) || !get_device_resources(dev, &iommu, &domain, &devid)) /* device not handled by any AMD IOMMU */ return; + if (!dma_ops_domain(domain)) + return; + spin_lock_irqsave(&domain->lock, flags); __unmap_single(iommu, domain->priv, dma_addr, size, dir); @@ -1104,6 +1402,8 @@ static int map_sg(struct device *dev, struct scatterlist *sglist, int mapped_elems = 0; u64 dma_mask; + INC_STATS_COUNTER(cnt_map_sg); + if (!check_device(dev)) return 0; @@ -1114,6 +1414,9 @@ static int map_sg(struct device *dev, struct scatterlist *sglist, if (!iommu || !domain) return map_sg_no_iommu(dev, sglist, nelems, dir); + if (!dma_ops_domain(domain)) + return 0; + spin_lock_irqsave(&domain->lock, flags); for_each_sg(sglist, s, nelems, i) { @@ -1163,10 +1466,15 @@ static void unmap_sg(struct device *dev, struct scatterlist *sglist, u16 devid; int i; + INC_STATS_COUNTER(cnt_unmap_sg); + if (!check_device(dev) || !get_device_resources(dev, &iommu, &domain, &devid)) return; + if (!dma_ops_domain(domain)) + return; + spin_lock_irqsave(&domain->lock, flags); for_each_sg(sglist, s, nelems, i) { @@ -1194,6 +1502,8 @@ static void *alloc_coherent(struct device *dev, size_t size, phys_addr_t paddr; u64 dma_mask = dev->coherent_dma_mask; + INC_STATS_COUNTER(cnt_alloc_coherent); + if (!check_device(dev)) return NULL; @@ -1212,6 +1522,9 @@ static void *alloc_coherent(struct device *dev, size_t size, return virt_addr; } + if (!dma_ops_domain(domain)) + goto out_free; + if (!dma_mask) dma_mask = *dev->dma_mask; @@ -1220,18 +1533,20 @@ static void *alloc_coherent(struct device *dev, size_t size, *dma_addr = __map_single(dev, iommu, domain->priv, paddr, size, DMA_BIDIRECTIONAL, true, dma_mask); - if (*dma_addr == bad_dma_address) { - free_pages((unsigned long)virt_addr, get_order(size)); - virt_addr = NULL; - goto out; - } + if (*dma_addr == bad_dma_address) + goto out_free; iommu_completion_wait(iommu); -out: spin_unlock_irqrestore(&domain->lock, flags); return virt_addr; + +out_free: + + free_pages((unsigned long)virt_addr, get_order(size)); + + return NULL; } /* @@ -1245,6 +1560,8 @@ static void free_coherent(struct device *dev, size_t size, struct protection_domain *domain; u16 devid; + INC_STATS_COUNTER(cnt_free_coherent); + if (!check_device(dev)) return; @@ -1253,6 +1570,9 @@ static void free_coherent(struct device *dev, size_t size, if (!iommu || !domain) goto free_mem; + if (!dma_ops_domain(domain)) + goto free_mem; + spin_lock_irqsave(&domain->lock, flags); __unmap_single(iommu, domain->priv, dma_addr, size, DMA_BIDIRECTIONAL); @@ -1296,7 +1616,7 @@ static int amd_iommu_dma_supported(struct device *dev, u64 mask) * we don't need to preallocate the protection domains anymore. * For now we have to. */ -void prealloc_protection_domains(void) +static void prealloc_protection_domains(void) { struct pci_dev *dev = NULL; struct dma_ops_domain *dma_dom; @@ -1305,7 +1625,7 @@ void prealloc_protection_domains(void) u16 devid; while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { - devid = (dev->bus->number << 8) | dev->devfn; + devid = calc_devid(dev->bus->number, dev->devfn); if (devid > amd_iommu_last_bdf) continue; devid = amd_iommu_alias_table[devid]; @@ -1352,6 +1672,7 @@ int __init amd_iommu_init_dma_ops(void) iommu->default_dom = dma_ops_domain_alloc(iommu, order); if (iommu->default_dom == NULL) return -ENOMEM; + iommu->default_dom->domain.flags |= PD_DEFAULT_MASK; ret = iommu_init_unity_mappings(iommu); if (ret) goto free_domains; @@ -1375,6 +1696,12 @@ int __init amd_iommu_init_dma_ops(void) /* Make the driver finally visible to the drivers */ dma_ops = &amd_iommu_dma_ops; + register_iommu(&amd_iommu_ops); + + bus_register_notifier(&pci_bus_type, &device_nb); + + amd_iommu_stats_init(); + return 0; free_domains: @@ -1386,3 +1713,224 @@ free_domains: return ret; } + +/***************************************************************************** + * + * The following functions belong to the exported interface of AMD IOMMU + * + * This interface allows access to lower level functions of the IOMMU + * like protection domain handling and assignement of devices to domains + * which is not possible with the dma_ops interface. + * + *****************************************************************************/ + +static void cleanup_domain(struct protection_domain *domain) +{ + unsigned long flags; + u16 devid; + + write_lock_irqsave(&amd_iommu_devtable_lock, flags); + + for (devid = 0; devid <= amd_iommu_last_bdf; ++devid) + if (amd_iommu_pd_table[devid] == domain) + __detach_device(domain, devid); + + write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); +} + +static int amd_iommu_domain_init(struct iommu_domain *dom) +{ + struct protection_domain *domain; + + domain = kzalloc(sizeof(*domain), GFP_KERNEL); + if (!domain) + return -ENOMEM; + + spin_lock_init(&domain->lock); + domain->mode = PAGE_MODE_3_LEVEL; + domain->id = domain_id_alloc(); + if (!domain->id) + goto out_free; + domain->pt_root = (void *)get_zeroed_page(GFP_KERNEL); + if (!domain->pt_root) + goto out_free; + + dom->priv = domain; + + return 0; + +out_free: + kfree(domain); + + return -ENOMEM; +} + +static void amd_iommu_domain_destroy(struct iommu_domain *dom) +{ + struct protection_domain *domain = dom->priv; + + if (!domain) + return; + + if (domain->dev_cnt > 0) + cleanup_domain(domain); + + BUG_ON(domain->dev_cnt != 0); + + free_pagetable(domain); + + domain_id_free(domain->id); + + kfree(domain); + + dom->priv = NULL; +} + +static void amd_iommu_detach_device(struct iommu_domain *dom, + struct device *dev) +{ + struct protection_domain *domain = dom->priv; + struct amd_iommu *iommu; + struct pci_dev *pdev; + u16 devid; + + if (dev->bus != &pci_bus_type) + return; + + pdev = to_pci_dev(dev); + + devid = calc_devid(pdev->bus->number, pdev->devfn); + + if (devid > 0) + detach_device(domain, devid); + + iommu = amd_iommu_rlookup_table[devid]; + if (!iommu) + return; + + iommu_queue_inv_dev_entry(iommu, devid); + iommu_completion_wait(iommu); +} + +static int amd_iommu_attach_device(struct iommu_domain *dom, + struct device *dev) +{ + struct protection_domain *domain = dom->priv; + struct protection_domain *old_domain; + struct amd_iommu *iommu; + struct pci_dev *pdev; + u16 devid; + + if (dev->bus != &pci_bus_type) + return -EINVAL; + + pdev = to_pci_dev(dev); + + devid = calc_devid(pdev->bus->number, pdev->devfn); + + if (devid >= amd_iommu_last_bdf || + devid != amd_iommu_alias_table[devid]) + return -EINVAL; + + iommu = amd_iommu_rlookup_table[devid]; + if (!iommu) + return -EINVAL; + + old_domain = domain_for_device(devid); + if (old_domain) + return -EBUSY; + + attach_device(iommu, domain, devid); + + iommu_completion_wait(iommu); + + return 0; +} + +static int amd_iommu_map_range(struct iommu_domain *dom, + unsigned long iova, phys_addr_t paddr, + size_t size, int iommu_prot) +{ + struct protection_domain *domain = dom->priv; + unsigned long i, npages = iommu_num_pages(paddr, size, PAGE_SIZE); + int prot = 0; + int ret; + + if (iommu_prot & IOMMU_READ) + prot |= IOMMU_PROT_IR; + if (iommu_prot & IOMMU_WRITE) + prot |= IOMMU_PROT_IW; + + iova &= PAGE_MASK; + paddr &= PAGE_MASK; + + for (i = 0; i < npages; ++i) { + ret = iommu_map_page(domain, iova, paddr, prot); + if (ret) + return ret; + + iova += PAGE_SIZE; + paddr += PAGE_SIZE; + } + + return 0; +} + +static void amd_iommu_unmap_range(struct iommu_domain *dom, + unsigned long iova, size_t size) +{ + + struct protection_domain *domain = dom->priv; + unsigned long i, npages = iommu_num_pages(iova, size, PAGE_SIZE); + + iova &= PAGE_MASK; + + for (i = 0; i < npages; ++i) { + iommu_unmap_page(domain, iova); + iova += PAGE_SIZE; + } + + iommu_flush_domain(domain->id); +} + +static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom, + unsigned long iova) +{ + struct protection_domain *domain = dom->priv; + unsigned long offset = iova & ~PAGE_MASK; + phys_addr_t paddr; + u64 *pte; + + pte = &domain->pt_root[IOMMU_PTE_L2_INDEX(iova)]; + + if (!IOMMU_PTE_PRESENT(*pte)) + return 0; + + pte = IOMMU_PTE_PAGE(*pte); + pte = &pte[IOMMU_PTE_L1_INDEX(iova)]; + + if (!IOMMU_PTE_PRESENT(*pte)) + return 0; + + pte = IOMMU_PTE_PAGE(*pte); + pte = &pte[IOMMU_PTE_L0_INDEX(iova)]; + + if (!IOMMU_PTE_PRESENT(*pte)) + return 0; + + paddr = *pte & IOMMU_PAGE_MASK; + paddr |= offset; + + return paddr; +} + +static struct iommu_ops amd_iommu_ops = { + .domain_init = amd_iommu_domain_init, + .domain_destroy = amd_iommu_domain_destroy, + .attach_dev = amd_iommu_attach_device, + .detach_dev = amd_iommu_detach_device, + .map = amd_iommu_map_range, + .unmap = amd_iommu_unmap_range, + .iova_to_phys = amd_iommu_iova_to_phys, +}; + diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index c625800c55ca..42c33cebf00f 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -122,7 +122,8 @@ u16 amd_iommu_last_bdf; /* largest PCI device id we have LIST_HEAD(amd_iommu_unity_map); /* a list of required unity mappings we find in ACPI */ unsigned amd_iommu_aperture_order = 26; /* size of aperture in power of 2 */ -int amd_iommu_isolate = 1; /* if 1, device isolation is enabled */ +bool amd_iommu_isolate = true; /* if true, device isolation is + enabled */ bool amd_iommu_unmap_flush; /* if true, flush on every unmap */ LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the @@ -243,20 +244,16 @@ static void __init iommu_feature_disable(struct amd_iommu *iommu, u8 bit) } /* Function to enable the hardware */ -void __init iommu_enable(struct amd_iommu *iommu) +static void __init iommu_enable(struct amd_iommu *iommu) { - printk(KERN_INFO "AMD IOMMU: Enabling IOMMU " - "at %02x:%02x.%x cap 0x%hx\n", - iommu->dev->bus->number, - PCI_SLOT(iommu->dev->devfn), - PCI_FUNC(iommu->dev->devfn), - iommu->cap_ptr); + printk(KERN_INFO "AMD IOMMU: Enabling IOMMU at %s cap 0x%hx\n", + dev_name(&iommu->dev->dev), iommu->cap_ptr); iommu_feature_enable(iommu, CONTROL_IOMMU_EN); } /* Function to enable IOMMU event logging and event interrupts */ -void __init iommu_enable_event_logging(struct amd_iommu *iommu) +static void __init iommu_enable_event_logging(struct amd_iommu *iommu) { iommu_feature_enable(iommu, CONTROL_EVT_LOG_EN); iommu_feature_enable(iommu, CONTROL_EVT_INT_EN); @@ -1218,9 +1215,9 @@ static int __init parse_amd_iommu_options(char *str) { for (; *str; ++str) { if (strncmp(str, "isolate", 7) == 0) - amd_iommu_isolate = 1; + amd_iommu_isolate = true; if (strncmp(str, "share", 5) == 0) - amd_iommu_isolate = 0; + amd_iommu_isolate = false; if (strncmp(str, "fullflush", 9) == 0) amd_iommu_unmap_flush = true; } diff --git a/arch/x86/kernel/apic.c b/arch/x86/kernel/apic.c index 99589245fd8d..b13d3c4dbd42 100644 --- a/arch/x86/kernel/apic.c +++ b/arch/x86/kernel/apic.c @@ -98,8 +98,8 @@ __setup("apicpmtimer", setup_apicpmtimer); #ifdef HAVE_X2APIC int x2apic; /* x2apic enabled before OS handover */ -int x2apic_preenabled; -int disable_x2apic; +static int x2apic_preenabled; +static int disable_x2apic; static __init int setup_nox2apic(char *str) { disable_x2apic = 1; @@ -226,7 +226,7 @@ void xapic_icr_write(u32 low, u32 id) apic_write(APIC_ICR, low); } -u64 xapic_icr_read(void) +static u64 xapic_icr_read(void) { u32 icr1, icr2; @@ -266,7 +266,7 @@ void x2apic_icr_write(u32 low, u32 id) wrmsrl(APIC_BASE_MSR + (APIC_ICR >> 4), ((__u64) id) << 32 | low); } -u64 x2apic_icr_read(void) +static u64 x2apic_icr_read(void) { unsigned long val; diff --git a/arch/x86/kernel/bios_uv.c b/arch/x86/kernel/bios_uv.c index 2a0a2a3cac26..f63882728d91 100644 --- a/arch/x86/kernel/bios_uv.c +++ b/arch/x86/kernel/bios_uv.c @@ -25,7 +25,7 @@ #include <asm/uv/bios.h> #include <asm/uv/uv_hub.h> -struct uv_systab uv_systab; +static struct uv_systab uv_systab; s64 uv_bios_call(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, u64 a4, u64 a5) { diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 4e8d77f01eeb..b59ddcc88cd8 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -14,14 +14,6 @@ #include <asm/pat.h> #include "mtrr.h" -struct mtrr_state { - struct mtrr_var_range var_ranges[MAX_VAR_RANGES]; - mtrr_type fixed_ranges[NUM_FIXED_RANGES]; - unsigned char enabled; - unsigned char have_fixed; - mtrr_type def_type; -}; - struct fixed_range_block { int base_msr; /* start address of an MTRR block */ int ranges; /* number of MTRRs in this block */ @@ -35,10 +27,12 @@ static struct fixed_range_block fixed_range_blocks[] = { }; static unsigned long smp_changes_mask; -static struct mtrr_state mtrr_state = {}; static int mtrr_state_set; u64 mtrr_tom2; +struct mtrr_state_type mtrr_state = {}; +EXPORT_SYMBOL_GPL(mtrr_state); + #undef MODULE_PARAM_PREFIX #define MODULE_PARAM_PREFIX "mtrr." diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 1159e269e596..d259e5d2e054 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -49,7 +49,7 @@ u32 num_var_ranges = 0; -unsigned int mtrr_usage_table[MAX_VAR_RANGES]; +unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES]; static DEFINE_MUTEX(mtrr_mutex); u64 size_or_mask, size_and_mask; @@ -574,7 +574,7 @@ struct mtrr_value { unsigned long lsize; }; -static struct mtrr_value mtrr_state[MAX_VAR_RANGES]; +static struct mtrr_value mtrr_state[MTRR_MAX_VAR_RANGES]; static int mtrr_save(struct sys_device * sysdev, pm_message_t state) { @@ -824,16 +824,14 @@ static int enable_mtrr_cleanup __initdata = static int __init disable_mtrr_cleanup_setup(char *str) { - if (enable_mtrr_cleanup != -1) - enable_mtrr_cleanup = 0; + enable_mtrr_cleanup = 0; return 0; } early_param("disable_mtrr_cleanup", disable_mtrr_cleanup_setup); static int __init enable_mtrr_cleanup_setup(char *str) { - if (enable_mtrr_cleanup != -1) - enable_mtrr_cleanup = 1; + enable_mtrr_cleanup = 1; return 0; } early_param("enable_mtrr_cleanup", enable_mtrr_cleanup_setup); diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h index 2dc4ec656b23..ffd60409cc6d 100644 --- a/arch/x86/kernel/cpu/mtrr/mtrr.h +++ b/arch/x86/kernel/cpu/mtrr/mtrr.h @@ -8,11 +8,6 @@ #define MTRRcap_MSR 0x0fe #define MTRRdefType_MSR 0x2ff -#define MTRRphysBase_MSR(reg) (0x200 + 2 * (reg)) -#define MTRRphysMask_MSR(reg) (0x200 + 2 * (reg) + 1) - -#define NUM_FIXED_RANGES 88 -#define MAX_VAR_RANGES 256 #define MTRRfix64K_00000_MSR 0x250 #define MTRRfix16K_80000_MSR 0x258 #define MTRRfix16K_A0000_MSR 0x259 @@ -29,11 +24,7 @@ #define MTRR_CHANGE_MASK_VARIABLE 0x02 #define MTRR_CHANGE_MASK_DEFTYPE 0x04 -/* In the Intel processor's MTRR interface, the MTRR type is always held in - an 8 bit field: */ -typedef u8 mtrr_type; - -extern unsigned int mtrr_usage_table[MAX_VAR_RANGES]; +extern unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES]; struct mtrr_ops { u32 vendor; @@ -70,13 +61,6 @@ struct set_mtrr_context { u32 ccr3; }; -struct mtrr_var_range { - u32 base_lo; - u32 base_hi; - u32 mask_lo; - u32 mask_hi; -}; - void set_mtrr_done(struct set_mtrr_context *ctxt); void set_mtrr_cache_disable(struct set_mtrr_context *ctxt); void set_mtrr_prepare_save(struct set_mtrr_context *ctxt); diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c index 62a3c23bd703..2ac1f0c2beb3 100644 --- a/arch/x86/kernel/cpuid.c +++ b/arch/x86/kernel/cpuid.c @@ -39,10 +39,10 @@ #include <linux/device.h> #include <linux/cpu.h> #include <linux/notifier.h> +#include <linux/uaccess.h> #include <asm/processor.h> #include <asm/msr.h> -#include <asm/uaccess.h> #include <asm/system.h> static struct class *cpuid_class; @@ -82,7 +82,7 @@ static loff_t cpuid_seek(struct file *file, loff_t offset, int orig) } static ssize_t cpuid_read(struct file *file, char __user *buf, - size_t count, loff_t * ppos) + size_t count, loff_t *ppos) { char __user *tmp = buf; struct cpuid_regs cmd; @@ -117,7 +117,7 @@ static int cpuid_open(struct inode *inode, struct file *file) unsigned int cpu; struct cpuinfo_x86 *c; int ret = 0; - + lock_kernel(); cpu = iminor(file->f_path.dentry->d_inode); diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index d84a852e4cd7..c689d19e35ab 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -26,6 +26,7 @@ #include <linux/kdebug.h> #include <asm/smp.h> #include <asm/reboot.h> +#include <asm/virtext.h> #include <mach_ipi.h> @@ -49,6 +50,15 @@ static void kdump_nmi_callback(int cpu, struct die_args *args) #endif crash_save_cpu(regs, cpu); + /* Disable VMX or SVM if needed. + * + * We need to disable virtualization on all CPUs. + * Having VMX or SVM enabled on any CPU may break rebooting + * after the kdump kernel has finished its task. + */ + cpu_emergency_vmxoff(); + cpu_emergency_svm_disable(); + disable_local_APIC(); } @@ -80,6 +90,14 @@ void native_machine_crash_shutdown(struct pt_regs *regs) local_irq_disable(); kdump_nmi_shootdown_cpus(); + + /* Booting kdump kernel with VMX or SVM enabled won't work, + * because (among other limitations) we can't disable paging + * with the virt flags. + */ + cpu_emergency_vmxoff(); + cpu_emergency_svm_disable(); + lapic_shutdown(); #if defined(CONFIG_X86_IO_APIC) disable_IO_APIC(); diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c index 23b138e31e9c..504ad198e4ad 100644 --- a/arch/x86/kernel/early_printk.c +++ b/arch/x86/kernel/early_printk.c @@ -886,7 +886,7 @@ asmlinkage void early_printk(const char *fmt, ...) va_list ap; va_start(ap, fmt); - n = vscnprintf(buf, 512, fmt, ap); + n = vscnprintf(buf, sizeof(buf), fmt, ap); early_console->write(early_console, buf, n); va_end(ap); } diff --git a/arch/x86/kernel/genx2apic_phys.c b/arch/x86/kernel/genx2apic_phys.c index 62895cf315ff..21bcc0e098ba 100644 --- a/arch/x86/kernel/genx2apic_phys.c +++ b/arch/x86/kernel/genx2apic_phys.c @@ -161,12 +161,12 @@ static unsigned int phys_pkg_id(int index_msb) return current_cpu_data.initial_apicid >> index_msb; } -void x2apic_send_IPI_self(int vector) +static void x2apic_send_IPI_self(int vector) { apic_write(APIC_SELF_IPI, vector); } -void init_x2apic_ldr(void) +static void init_x2apic_ldr(void) { return; } diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 388e05a5fc17..b9a4d8c4b935 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -27,7 +27,7 @@ #include <asm/trampoline.h> /* boot cpu pda */ -static struct x8664_pda _boot_cpu_pda __read_mostly; +static struct x8664_pda _boot_cpu_pda; #ifdef CONFIG_SMP /* diff --git a/arch/x86/kernel/init_task.c b/arch/x86/kernel/init_task.c index d39918076bb4..df3bf269beab 100644 --- a/arch/x86/kernel/init_task.c +++ b/arch/x86/kernel/init_task.c @@ -10,7 +10,6 @@ #include <asm/pgtable.h> #include <asm/desc.h> -static struct fs_struct init_fs = INIT_FS; static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); struct mm_struct init_mm = INIT_MM(init_mm); diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c index a25c3f76b8ac..3639442aa7a4 100644 --- a/arch/x86/kernel/io_apic.c +++ b/arch/x86/kernel/io_apic.c @@ -170,7 +170,7 @@ static struct irq_cfg irq_cfgx[NR_IRQS] = { [15] = { .vector = IRQ15_VECTOR, }, }; -void __init arch_early_irq_init(void) +int __init arch_early_irq_init(void) { struct irq_cfg *cfg; struct irq_desc *desc; @@ -188,6 +188,8 @@ void __init arch_early_irq_init(void) if (i < NR_IRQS_LEGACY) cpumask_setall(cfg[i].domain); } + + return 0; } #ifdef CONFIG_SPARSE_IRQ @@ -230,7 +232,7 @@ static struct irq_cfg *get_one_free_irq_cfg(int cpu) return cfg; } -void arch_init_chip_data(struct irq_desc *desc, int cpu) +int arch_init_chip_data(struct irq_desc *desc, int cpu) { struct irq_cfg *cfg; @@ -242,6 +244,8 @@ void arch_init_chip_data(struct irq_desc *desc, int cpu) BUG_ON(1); } } + + return 0; } #ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC @@ -702,7 +706,7 @@ static void __unmask_IO_APIC_irq(struct irq_cfg *cfg) } #ifdef CONFIG_X86_64 -void io_apic_sync(struct irq_pin_list *entry) +static void io_apic_sync(struct irq_pin_list *entry) { /* * Synchronize the IO-APIC and the CPU by doing @@ -1400,8 +1404,6 @@ void __setup_vector_irq(int cpu) /* Mark the inuse vectors */ for_each_irq_desc(irq, desc) { - if (!desc) - continue; cfg = desc->chip_data; if (!cpumask_test_cpu(cpu, cfg->domain)) continue; @@ -1783,8 +1785,6 @@ __apicdebuginit(void) print_IO_APIC(void) for_each_irq_desc(irq, desc) { struct irq_pin_list *entry; - if (!desc) - continue; cfg = desc->chip_data; entry = cfg->irq_2_pin; if (!entry) @@ -2425,9 +2425,6 @@ static void ir_irq_migration(struct work_struct *work) struct irq_desc *desc; for_each_irq_desc(irq, desc) { - if (!desc) - continue; - if (desc->status & IRQ_MOVE_PENDING) { unsigned long flags; @@ -2713,9 +2710,6 @@ static inline void init_IO_APIC_traps(void) * 0x80, because int 0x80 is hm, kind of importantish. ;) */ for_each_irq_desc(irq, desc) { - if (!desc) - continue; - cfg = desc->chip_data; if (IO_APIC_IRQ(irq) && cfg && !cfg->vector) { /* diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index e169ae9b6a62..652fce6d2cce 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -89,17 +89,17 @@ static cycle_t kvm_clock_read(void) */ static unsigned long kvm_get_tsc_khz(void) { - return preset_lpj; + struct pvclock_vcpu_time_info *src; + src = &per_cpu(hv_clock, 0); + return pvclock_tsc_khz(src); } static void kvm_get_preset_lpj(void) { - struct pvclock_vcpu_time_info *src; unsigned long khz; u64 lpj; - src = &per_cpu(hv_clock, 0); - khz = pvclock_tsc_khz(src); + khz = kvm_get_tsc_khz(); lpj = ((u64)khz * 1000); do_div(lpj, HZ); @@ -194,5 +194,7 @@ void __init kvmclock_init(void) #endif kvm_get_preset_lpj(); clocksource_register(&kvm_clock); + pv_info.paravirt_enabled = 1; + pv_info.name = "KVM"; } } diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index eee32b43fee3..71f1d99a635d 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -12,8 +12,8 @@ #include <linux/mm.h> #include <linux/smp.h> #include <linux/vmalloc.h> +#include <linux/uaccess.h> -#include <asm/uaccess.h> #include <asm/system.h> #include <asm/ldt.h> #include <asm/desc.h> @@ -93,7 +93,7 @@ static inline int copy_ldt(mm_context_t *new, mm_context_t *old) if (err < 0) return err; - for(i = 0; i < old->size; i++) + for (i = 0; i < old->size; i++) write_ldt_entry(new->ldt, i, old->ldt + i * LDT_ENTRY_SIZE); return 0; } diff --git a/arch/x86/kernel/mmconf-fam10h_64.c b/arch/x86/kernel/mmconf-fam10h_64.c index efc2f361fe85..666e43df51f9 100644 --- a/arch/x86/kernel/mmconf-fam10h_64.c +++ b/arch/x86/kernel/mmconf-fam10h_64.c @@ -13,8 +13,7 @@ #include <asm/msr.h> #include <asm/acpi.h> #include <asm/mmconfig.h> - -#include "../pci/pci.h" +#include <asm/pci_x86.h> struct pci_hostbridge_probe { u32 bus; diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 45e3b69808ba..c5c5b8df1dbc 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -16,14 +16,14 @@ #include <linux/bitops.h> #include <linux/acpi.h> #include <linux/module.h> +#include <linux/smp.h> +#include <linux/acpi.h> -#include <asm/smp.h> #include <asm/mtrr.h> #include <asm/mpspec.h> #include <asm/pgalloc.h> #include <asm/io_apic.h> #include <asm/proto.h> -#include <asm/acpi.h> #include <asm/bios_ebda.h> #include <asm/e820.h> #include <asm/trampoline.h> @@ -95,8 +95,8 @@ static void __init MP_bus_info(struct mpc_config_bus *m) #endif if (strncmp(str, BUSTYPE_ISA, sizeof(BUSTYPE_ISA) - 1) == 0) { - set_bit(m->mpc_busid, mp_bus_not_pci); -#if defined(CONFIG_EISA) || defined (CONFIG_MCA) + set_bit(m->mpc_busid, mp_bus_not_pci); +#if defined(CONFIG_EISA) || defined(CONFIG_MCA) mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA; #endif } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI) - 1) == 0) { @@ -104,7 +104,7 @@ static void __init MP_bus_info(struct mpc_config_bus *m) x86_quirks->mpc_oem_pci_bus(m); clear_bit(m->mpc_busid, mp_bus_not_pci); -#if defined(CONFIG_EISA) || defined (CONFIG_MCA) +#if defined(CONFIG_EISA) || defined(CONFIG_MCA) mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI; } else if (strncmp(str, BUSTYPE_EISA, sizeof(BUSTYPE_EISA) - 1) == 0) { mp_bus_id_to_type[m->mpc_busid] = MP_BUS_EISA; diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 8bd1bf9622a7..45a09ccdc214 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -26,11 +26,10 @@ #include <linux/kernel_stat.h> #include <linux/kdebug.h> #include <linux/smp.h> +#include <linux/nmi.h> #include <asm/i8259.h> #include <asm/io_apic.h> -#include <asm/smp.h> -#include <asm/nmi.h> #include <asm/proto.h> #include <asm/timer.h> diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index a35eaa379ff6..00c2bcd41463 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -52,7 +52,7 @@ static u32 *iommu_gatt_base; /* Remapping table */ * to trigger bugs with some popular PCI cards, in particular 3ware (but * has been also also seen with Qlogic at least). */ -int iommu_fullflush = 1; +static int iommu_fullflush = 1; /* Allocation bitmap for the remapping area: */ static DEFINE_SPINLOCK(iommu_bitmap_lock); diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index de4a9d643bee..2b46eb41643b 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -12,6 +12,8 @@ #include <asm/proto.h> #include <asm/reboot_fixups.h> #include <asm/reboot.h> +#include <asm/pci_x86.h> +#include <asm/virtext.h> #ifdef CONFIG_X86_32 # include <linux/dmi.h> @@ -23,7 +25,6 @@ #include <mach_ipi.h> - /* * Power off function, if any */ @@ -39,6 +40,12 @@ int reboot_force; static int reboot_cpu = -1; #endif +/* This is set if we need to go through the 'emergency' path. + * When machine_emergency_restart() is called, we may be on + * an inconsistent state and won't be able to do a clean cleanup + */ +static int reboot_emergency; + /* This is set by the PCI code if either type 1 or type 2 PCI is detected */ bool port_cf9_safe = false; @@ -368,6 +375,48 @@ static inline void kb_wait(void) } } +static void vmxoff_nmi(int cpu, struct die_args *args) +{ + cpu_emergency_vmxoff(); +} + +/* Use NMIs as IPIs to tell all CPUs to disable virtualization + */ +static void emergency_vmx_disable_all(void) +{ + /* Just make sure we won't change CPUs while doing this */ + local_irq_disable(); + + /* We need to disable VMX on all CPUs before rebooting, otherwise + * we risk hanging up the machine, because the CPU ignore INIT + * signals when VMX is enabled. + * + * We can't take any locks and we may be on an inconsistent + * state, so we use NMIs as IPIs to tell the other CPUs to disable + * VMX and halt. + * + * For safety, we will avoid running the nmi_shootdown_cpus() + * stuff unnecessarily, but we don't have a way to check + * if other CPUs have VMX enabled. So we will call it only if the + * CPU we are running on has VMX enabled. + * + * We will miss cases where VMX is not enabled on all CPUs. This + * shouldn't do much harm because KVM always enable VMX on all + * CPUs anyway. But we can miss it on the small window where KVM + * is still enabling VMX. + */ + if (cpu_has_vmx() && cpu_vmx_enabled()) { + /* Disable VMX on this CPU. + */ + cpu_vmxoff(); + + /* Halt and disable VMX on the other CPUs */ + nmi_shootdown_cpus(vmxoff_nmi); + + } +} + + void __attribute__((weak)) mach_reboot_fixups(void) { } @@ -376,6 +425,9 @@ static void native_machine_emergency_restart(void) { int i; + if (reboot_emergency) + emergency_vmx_disable_all(); + /* Tell the BIOS if we want cold or warm reboot */ *((unsigned short *)__va(0x472)) = reboot_mode; @@ -482,13 +534,19 @@ void native_machine_shutdown(void) #endif } +static void __machine_emergency_restart(int emergency) +{ + reboot_emergency = emergency; + machine_ops.emergency_restart(); +} + static void native_machine_restart(char *__unused) { printk("machine restart\n"); if (!reboot_force) machine_shutdown(); - machine_emergency_restart(); + __machine_emergency_restart(0); } static void native_machine_halt(void) @@ -532,7 +590,7 @@ void machine_shutdown(void) void machine_emergency_restart(void) { - machine_ops.emergency_restart(); + __machine_emergency_restart(1); } void machine_restart(char *cmd) diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c index 6a00e5faaa74..f885023167e0 100644 --- a/arch/x86/kernel/tlb_uv.c +++ b/arch/x86/kernel/tlb_uv.c @@ -582,7 +582,6 @@ static int __init uv_ptc_init(void) static struct bau_control * __init uv_table_bases_init(int blade, int node) { int i; - int *ip; struct bau_msg_status *msp; struct bau_control *bau_tabp; @@ -599,13 +598,6 @@ static struct bau_control * __init uv_table_bases_init(int blade, int node) bau_cpubits_clear(&msp->seen_by, (int) uv_blade_nr_possible_cpus(blade)); - bau_tabp->watching = - kmalloc_node(sizeof(int) * DEST_NUM_RESOURCES, GFP_KERNEL, node); - BUG_ON(!bau_tabp->watching); - - for (i = 0, ip = bau_tabp->watching; i < DEST_Q_SIZE; i++, ip++) - *ip = 0; - uv_bau_table_bases[blade] = bau_tabp; return bau_tabp; @@ -628,7 +620,6 @@ uv_table_bases_finish(int blade, int node, int cur_cpu, bcp->bau_msg_head = bau_tablesp->va_queue_first; bcp->va_queue_first = bau_tablesp->va_queue_first; bcp->va_queue_last = bau_tablesp->va_queue_last; - bcp->watching = bau_tablesp->watching; bcp->msg_statuses = bau_tablesp->msg_statuses; bcp->descriptor_base = adp; } diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 2d1f4c7e4052..ce6650eb64e9 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -292,8 +292,10 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) tsk->thread.error_code = error_code; tsk->thread.trap_no = 8; - /* This is always a kernel trap and never fixable (and thus must - never return). */ + /* + * This is always a kernel trap and never fixable (and thus must + * never return). + */ for (;;) die(str, regs, error_code); } @@ -520,9 +522,11 @@ dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code) } #ifdef CONFIG_X86_64 -/* Help handler running on IST stack to switch back to user stack - for scheduling or signal handling. The actual stack switch is done in - entry.S */ +/* + * Help handler running on IST stack to switch back to user stack + * for scheduling or signal handling. The actual stack switch is done in + * entry.S + */ asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs) { struct pt_regs *regs = eregs; @@ -532,8 +536,10 @@ asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs) /* Exception from user space */ else if (user_mode(eregs)) regs = task_pt_regs(current); - /* Exception from kernel and interrupts are enabled. Move to - kernel process stack. */ + /* + * Exception from kernel and interrupts are enabled. Move to + * kernel process stack. + */ else if (eregs->flags & X86_EFLAGS_IF) regs = (struct pt_regs *)(eregs->sp -= sizeof(struct pt_regs)); if (eregs != regs) @@ -685,12 +691,7 @@ void math_error(void __user *ip) cwd = get_fpu_cwd(task); swd = get_fpu_swd(task); - err = swd & ~cwd & 0x3f; - -#ifdef CONFIG_X86_32 - if (!err) - return; -#endif + err = swd & ~cwd; if (err & 0x001) { /* Invalid op */ /* @@ -708,7 +709,11 @@ void math_error(void __user *ip) } else if (err & 0x020) { /* Precision */ info.si_code = FPE_FLTRES; } else { - info.si_code = __SI_FAULT|SI_KERNEL; /* WTF? */ + /* + * If we're using IRQ 13, or supposedly even some trap 16 + * implementations, it's possible we get a spurious trap... + */ + return; /* Spurious trap, no error */ } force_sig_info(SIGFPE, &info, task); } diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index 15c3e6999182..2b54fe002e94 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -159,7 +159,7 @@ int save_i387_xstate(void __user *buf) * Restore the extended state if present. Otherwise, restore the FP/SSE * state. */ -int restore_user_xstate(void __user *buf) +static int restore_user_xstate(void __user *buf) { struct _fpx_sw_bytes fx_sw_user; u64 mask; diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index c02343594b4d..d3ec292f00f2 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -7,8 +7,8 @@ common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \ ifeq ($(CONFIG_KVM_TRACE),y) common-objs += $(addprefix ../../../virt/kvm/, kvm_trace.o) endif -ifeq ($(CONFIG_DMAR),y) -common-objs += $(addprefix ../../../virt/kvm/, vtd.o) +ifeq ($(CONFIG_IOMMU_API),y) +common-objs += $(addprefix ../../../virt/kvm/, iommu.o) endif EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 59ebd37ad79e..e665d1c623ca 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -603,10 +603,29 @@ void kvm_free_pit(struct kvm *kvm) static void __inject_pit_timer_intr(struct kvm *kvm) { + struct kvm_vcpu *vcpu; + int i; + mutex_lock(&kvm->lock); kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1); kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0); mutex_unlock(&kvm->lock); + + /* + * Provides NMI watchdog support via Virtual Wire mode. + * The route is: PIT -> PIC -> LVT0 in NMI mode. + * + * Note: Our Virtual Wire implementation is simplified, only + * propagating PIT interrupts to all VCPUs when they have set + * LVT0 to NMI delivery. Other PIC interrupts are just sent to + * VCPU0, and only if its LVT0 is in EXTINT mode. + */ + if (kvm->arch.vapics_in_nmi_mode > 0) + for (i = 0; i < KVM_MAX_VCPUS; ++i) { + vcpu = kvm->vcpus[i]; + if (vcpu) + kvm_apic_nmi_wd_deliver(vcpu); + } } void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 17e41e165f1a..179dcb0103fd 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -26,10 +26,40 @@ * Port from Qemu. */ #include <linux/mm.h> +#include <linux/bitops.h> #include "irq.h" #include <linux/kvm_host.h> +static void pic_lock(struct kvm_pic *s) +{ + spin_lock(&s->lock); +} + +static void pic_unlock(struct kvm_pic *s) +{ + struct kvm *kvm = s->kvm; + unsigned acks = s->pending_acks; + bool wakeup = s->wakeup_needed; + struct kvm_vcpu *vcpu; + + s->pending_acks = 0; + s->wakeup_needed = false; + + spin_unlock(&s->lock); + + while (acks) { + kvm_notify_acked_irq(kvm, __ffs(acks)); + acks &= acks - 1; + } + + if (wakeup) { + vcpu = s->kvm->vcpus[0]; + if (vcpu) + kvm_vcpu_kick(vcpu); + } +} + static void pic_clear_isr(struct kvm_kpic_state *s, int irq) { s->isr &= ~(1 << irq); @@ -136,17 +166,21 @@ static void pic_update_irq(struct kvm_pic *s) void kvm_pic_update_irq(struct kvm_pic *s) { + pic_lock(s); pic_update_irq(s); + pic_unlock(s); } void kvm_pic_set_irq(void *opaque, int irq, int level) { struct kvm_pic *s = opaque; + pic_lock(s); if (irq >= 0 && irq < PIC_NUM_PINS) { pic_set_irq1(&s->pics[irq >> 3], irq & 7, level); pic_update_irq(s); } + pic_unlock(s); } /* @@ -172,6 +206,7 @@ int kvm_pic_read_irq(struct kvm *kvm) int irq, irq2, intno; struct kvm_pic *s = pic_irqchip(kvm); + pic_lock(s); irq = pic_get_irq(&s->pics[0]); if (irq >= 0) { pic_intack(&s->pics[0], irq); @@ -196,6 +231,7 @@ int kvm_pic_read_irq(struct kvm *kvm) intno = s->pics[0].irq_base + irq; } pic_update_irq(s); + pic_unlock(s); kvm_notify_acked_irq(kvm, irq); return intno; @@ -203,7 +239,7 @@ int kvm_pic_read_irq(struct kvm *kvm) void kvm_pic_reset(struct kvm_kpic_state *s) { - int irq, irqbase; + int irq, irqbase, n; struct kvm *kvm = s->pics_state->irq_request_opaque; struct kvm_vcpu *vcpu0 = kvm->vcpus[0]; @@ -214,8 +250,10 @@ void kvm_pic_reset(struct kvm_kpic_state *s) for (irq = 0; irq < PIC_NUM_PINS/2; irq++) { if (vcpu0 && kvm_apic_accept_pic_intr(vcpu0)) - if (s->irr & (1 << irq) || s->isr & (1 << irq)) - kvm_notify_acked_irq(kvm, irq+irqbase); + if (s->irr & (1 << irq) || s->isr & (1 << irq)) { + n = irq + irqbase; + s->pics_state->pending_acks |= 1 << n; + } } s->last_irr = 0; s->irr = 0; @@ -406,6 +444,7 @@ static void picdev_write(struct kvm_io_device *this, printk(KERN_ERR "PIC: non byte write\n"); return; } + pic_lock(s); switch (addr) { case 0x20: case 0x21: @@ -418,6 +457,7 @@ static void picdev_write(struct kvm_io_device *this, elcr_ioport_write(&s->pics[addr & 1], addr, data); break; } + pic_unlock(s); } static void picdev_read(struct kvm_io_device *this, @@ -431,6 +471,7 @@ static void picdev_read(struct kvm_io_device *this, printk(KERN_ERR "PIC: non byte read\n"); return; } + pic_lock(s); switch (addr) { case 0x20: case 0x21: @@ -444,6 +485,7 @@ static void picdev_read(struct kvm_io_device *this, break; } *(unsigned char *)val = data; + pic_unlock(s); } /* @@ -459,7 +501,7 @@ static void pic_irq_request(void *opaque, int level) s->output = level; if (vcpu && level && (s->pics[0].isr_ack & (1 << irq))) { s->pics[0].isr_ack &= ~(1 << irq); - kvm_vcpu_kick(vcpu); + s->wakeup_needed = true; } } @@ -469,6 +511,8 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm) s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL); if (!s) return NULL; + spin_lock_init(&s->lock); + s->kvm = kvm; s->pics[0].elcr_mask = 0xf8; s->pics[1].elcr_mask = 0xde; s->irq_request = pic_irq_request; diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index f17c8f5bbf31..2bf32a03ceec 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -25,6 +25,7 @@ #include <linux/mm_types.h> #include <linux/hrtimer.h> #include <linux/kvm_host.h> +#include <linux/spinlock.h> #include "iodev.h" #include "ioapic.h" @@ -59,6 +60,10 @@ struct kvm_kpic_state { }; struct kvm_pic { + spinlock_t lock; + bool wakeup_needed; + unsigned pending_acks; + struct kvm *kvm; struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */ irq_request_func *irq_request; void *irq_request_opaque; @@ -87,6 +92,7 @@ void kvm_pic_reset(struct kvm_kpic_state *s); void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec); void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu); void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu); +void kvm_apic_nmi_wd_deliver(struct kvm_vcpu *vcpu); void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu); void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu); void __kvm_migrate_timers(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/kvm_svm.h b/arch/x86/kvm/kvm_svm.h index 65ef0fc2c036..8e5ee99551f6 100644 --- a/arch/x86/kvm/kvm_svm.h +++ b/arch/x86/kvm/kvm_svm.h @@ -7,7 +7,7 @@ #include <linux/kvm_host.h> #include <asm/msr.h> -#include "svm.h" +#include <asm/svm.h> static const u32 host_save_user_msrs[] = { #ifdef CONFIG_X86_64 diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 0fc3cab48943..afac68c0815c 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -130,6 +130,11 @@ static inline int apic_lvtt_period(struct kvm_lapic *apic) return apic_get_reg(apic, APIC_LVTT) & APIC_LVT_TIMER_PERIODIC; } +static inline int apic_lvt_nmi_mode(u32 lvt_val) +{ + return (lvt_val & (APIC_MODE_MASK | APIC_LVT_MASKED)) == APIC_DM_NMI; +} + static unsigned int apic_lvt_mask[APIC_LVT_NUM] = { LVT_MASK | APIC_LVT_TIMER_PERIODIC, /* LVTT */ LVT_MASK | APIC_MODE_MASK, /* LVTTHMR */ @@ -354,6 +359,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, case APIC_DM_NMI: kvm_inject_nmi(vcpu); + kvm_vcpu_kick(vcpu); break; case APIC_DM_INIT: @@ -380,6 +386,14 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, } break; + case APIC_DM_EXTINT: + /* + * Should only be called by kvm_apic_local_deliver() with LVT0, + * before NMI watchdog was enabled. Already handled by + * kvm_apic_accept_pic_intr(). + */ + break; + default: printk(KERN_ERR "TODO: unsupported delivery mode %x\n", delivery_mode); @@ -663,6 +677,20 @@ static void start_apic_timer(struct kvm_lapic *apic) apic->timer.period))); } +static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val) +{ + int nmi_wd_enabled = apic_lvt_nmi_mode(apic_get_reg(apic, APIC_LVT0)); + + if (apic_lvt_nmi_mode(lvt0_val)) { + if (!nmi_wd_enabled) { + apic_debug("Receive NMI setting on APIC_LVT0 " + "for cpu %d\n", apic->vcpu->vcpu_id); + apic->vcpu->kvm->arch.vapics_in_nmi_mode++; + } + } else if (nmi_wd_enabled) + apic->vcpu->kvm->arch.vapics_in_nmi_mode--; +} + static void apic_mmio_write(struct kvm_io_device *this, gpa_t address, int len, const void *data) { @@ -743,10 +771,11 @@ static void apic_mmio_write(struct kvm_io_device *this, apic_set_reg(apic, APIC_ICR2, val & 0xff000000); break; + case APIC_LVT0: + apic_manage_nmi_watchdog(apic, val); case APIC_LVTT: case APIC_LVTTHMR: case APIC_LVTPC: - case APIC_LVT0: case APIC_LVT1: case APIC_LVTERR: /* TODO: Check vector */ @@ -961,12 +990,26 @@ int apic_has_pending_timer(struct kvm_vcpu *vcpu) return 0; } -static int __inject_apic_timer_irq(struct kvm_lapic *apic) +static int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type) +{ + u32 reg = apic_get_reg(apic, lvt_type); + int vector, mode, trig_mode; + + if (apic_hw_enabled(apic) && !(reg & APIC_LVT_MASKED)) { + vector = reg & APIC_VECTOR_MASK; + mode = reg & APIC_MODE_MASK; + trig_mode = reg & APIC_LVT_LEVEL_TRIGGER; + return __apic_accept_irq(apic, mode, vector, 1, trig_mode); + } + return 0; +} + +void kvm_apic_nmi_wd_deliver(struct kvm_vcpu *vcpu) { - int vector; + struct kvm_lapic *apic = vcpu->arch.apic; - vector = apic_lvt_vector(apic, APIC_LVTT); - return __apic_accept_irq(apic, APIC_DM_FIXED, vector, 1, 0); + if (apic) + kvm_apic_local_deliver(apic, APIC_LVT0); } static enum hrtimer_restart apic_timer_fn(struct hrtimer *data) @@ -1061,9 +1104,8 @@ void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; - if (apic && apic_lvt_enabled(apic, APIC_LVTT) && - atomic_read(&apic->timer.pending) > 0) { - if (__inject_apic_timer_irq(apic)) + if (apic && atomic_read(&apic->timer.pending) > 0) { + if (kvm_apic_local_deliver(apic, APIC_LVTT)) atomic_dec(&apic->timer.pending); } } diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 410ddbc1aa2e..83f11c7474a1 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -17,7 +17,6 @@ * */ -#include "vmx.h" #include "mmu.h" #include <linux/kvm_host.h> @@ -33,6 +32,7 @@ #include <asm/page.h> #include <asm/cmpxchg.h> #include <asm/io.h> +#include <asm/vmx.h> /* * When setting this variable to true it enables Two-Dimensional-Paging @@ -168,6 +168,7 @@ static u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */ static u64 __read_mostly shadow_user_mask; static u64 __read_mostly shadow_accessed_mask; static u64 __read_mostly shadow_dirty_mask; +static u64 __read_mostly shadow_mt_mask; void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte) { @@ -183,13 +184,14 @@ void kvm_mmu_set_base_ptes(u64 base_pte) EXPORT_SYMBOL_GPL(kvm_mmu_set_base_ptes); void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, - u64 dirty_mask, u64 nx_mask, u64 x_mask) + u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 mt_mask) { shadow_user_mask = user_mask; shadow_accessed_mask = accessed_mask; shadow_dirty_mask = dirty_mask; shadow_nx_mask = nx_mask; shadow_x_mask = x_mask; + shadow_mt_mask = mt_mask; } EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes); @@ -384,7 +386,9 @@ static void account_shadowed(struct kvm *kvm, gfn_t gfn) { int *write_count; - write_count = slot_largepage_idx(gfn, gfn_to_memslot(kvm, gfn)); + gfn = unalias_gfn(kvm, gfn); + write_count = slot_largepage_idx(gfn, + gfn_to_memslot_unaliased(kvm, gfn)); *write_count += 1; } @@ -392,16 +396,20 @@ static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn) { int *write_count; - write_count = slot_largepage_idx(gfn, gfn_to_memslot(kvm, gfn)); + gfn = unalias_gfn(kvm, gfn); + write_count = slot_largepage_idx(gfn, + gfn_to_memslot_unaliased(kvm, gfn)); *write_count -= 1; WARN_ON(*write_count < 0); } static int has_wrprotected_page(struct kvm *kvm, gfn_t gfn) { - struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn); + struct kvm_memory_slot *slot; int *largepage_idx; + gfn = unalias_gfn(kvm, gfn); + slot = gfn_to_memslot_unaliased(kvm, gfn); if (slot) { largepage_idx = slot_largepage_idx(gfn, slot); return *largepage_idx; @@ -613,7 +621,7 @@ static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte) return NULL; } -static void rmap_write_protect(struct kvm *kvm, u64 gfn) +static int rmap_write_protect(struct kvm *kvm, u64 gfn) { unsigned long *rmapp; u64 *spte; @@ -659,8 +667,7 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn) spte = rmap_next(kvm, rmapp, spte); } - if (write_protected) - kvm_flush_remote_tlbs(kvm); + return write_protected; } static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp) @@ -786,9 +793,11 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE); set_page_private(virt_to_page(sp->spt), (unsigned long)sp); list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages); + INIT_LIST_HEAD(&sp->oos_link); ASSERT(is_empty_shadow_page(sp->spt)); - sp->slot_bitmap = 0; + bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS); sp->multimapped = 0; + sp->global = 1; sp->parent_pte = parent_pte; --vcpu->kvm->arch.n_free_mmu_pages; return sp; @@ -900,8 +909,9 @@ static void kvm_mmu_update_unsync_bitmap(u64 *spte) struct kvm_mmu_page *sp = page_header(__pa(spte)); index = spte - sp->spt; - __set_bit(index, sp->unsync_child_bitmap); - sp->unsync_children = 1; + if (!__test_and_set_bit(index, sp->unsync_child_bitmap)) + sp->unsync_children++; + WARN_ON(!sp->unsync_children); } static void kvm_mmu_update_parents_unsync(struct kvm_mmu_page *sp) @@ -928,7 +938,6 @@ static void kvm_mmu_update_parents_unsync(struct kvm_mmu_page *sp) static int unsync_walk_fn(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) { - sp->unsync_children = 1; kvm_mmu_update_parents_unsync(sp); return 1; } @@ -959,38 +968,66 @@ static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva) { } +#define KVM_PAGE_ARRAY_NR 16 + +struct kvm_mmu_pages { + struct mmu_page_and_offset { + struct kvm_mmu_page *sp; + unsigned int idx; + } page[KVM_PAGE_ARRAY_NR]; + unsigned int nr; +}; + #define for_each_unsync_children(bitmap, idx) \ for (idx = find_first_bit(bitmap, 512); \ idx < 512; \ idx = find_next_bit(bitmap, 512, idx+1)) -static int mmu_unsync_walk(struct kvm_mmu_page *sp, - struct kvm_unsync_walk *walker) +int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp, + int idx) { - int i, ret; + int i; - if (!sp->unsync_children) - return 0; + if (sp->unsync) + for (i=0; i < pvec->nr; i++) + if (pvec->page[i].sp == sp) + return 0; + + pvec->page[pvec->nr].sp = sp; + pvec->page[pvec->nr].idx = idx; + pvec->nr++; + return (pvec->nr == KVM_PAGE_ARRAY_NR); +} + +static int __mmu_unsync_walk(struct kvm_mmu_page *sp, + struct kvm_mmu_pages *pvec) +{ + int i, ret, nr_unsync_leaf = 0; for_each_unsync_children(sp->unsync_child_bitmap, i) { u64 ent = sp->spt[i]; - if (is_shadow_present_pte(ent)) { + if (is_shadow_present_pte(ent) && !is_large_pte(ent)) { struct kvm_mmu_page *child; child = page_header(ent & PT64_BASE_ADDR_MASK); if (child->unsync_children) { - ret = mmu_unsync_walk(child, walker); - if (ret) + if (mmu_pages_add(pvec, child, i)) + return -ENOSPC; + + ret = __mmu_unsync_walk(child, pvec); + if (!ret) + __clear_bit(i, sp->unsync_child_bitmap); + else if (ret > 0) + nr_unsync_leaf += ret; + else return ret; - __clear_bit(i, sp->unsync_child_bitmap); } if (child->unsync) { - ret = walker->entry(child, walker); - __clear_bit(i, sp->unsync_child_bitmap); - if (ret) - return ret; + nr_unsync_leaf++; + if (mmu_pages_add(pvec, child, i)) + return -ENOSPC; } } } @@ -998,7 +1035,17 @@ static int mmu_unsync_walk(struct kvm_mmu_page *sp, if (find_first_bit(sp->unsync_child_bitmap, 512) == 512) sp->unsync_children = 0; - return 0; + return nr_unsync_leaf; +} + +static int mmu_unsync_walk(struct kvm_mmu_page *sp, + struct kvm_mmu_pages *pvec) +{ + if (!sp->unsync_children) + return 0; + + mmu_pages_add(pvec, sp, 0); + return __mmu_unsync_walk(sp, pvec); } static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn) @@ -1021,10 +1068,18 @@ static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn) return NULL; } +static void kvm_unlink_unsync_global(struct kvm *kvm, struct kvm_mmu_page *sp) +{ + list_del(&sp->oos_link); + --kvm->stat.mmu_unsync_global; +} + static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp) { WARN_ON(!sp->unsync); sp->unsync = 0; + if (sp->global) + kvm_unlink_unsync_global(kvm, sp); --kvm->stat.mmu_unsync; } @@ -1037,7 +1092,8 @@ static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) return 1; } - rmap_write_protect(vcpu->kvm, sp->gfn); + if (rmap_write_protect(vcpu->kvm, sp->gfn)) + kvm_flush_remote_tlbs(vcpu->kvm); kvm_unlink_unsync_page(vcpu->kvm, sp); if (vcpu->arch.mmu.sync_page(vcpu, sp)) { kvm_mmu_zap_page(vcpu->kvm, sp); @@ -1048,30 +1104,89 @@ static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) return 0; } -struct sync_walker { - struct kvm_vcpu *vcpu; - struct kvm_unsync_walk walker; +struct mmu_page_path { + struct kvm_mmu_page *parent[PT64_ROOT_LEVEL-1]; + unsigned int idx[PT64_ROOT_LEVEL-1]; }; -static int mmu_sync_fn(struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk) +#define for_each_sp(pvec, sp, parents, i) \ + for (i = mmu_pages_next(&pvec, &parents, -1), \ + sp = pvec.page[i].sp; \ + i < pvec.nr && ({ sp = pvec.page[i].sp; 1;}); \ + i = mmu_pages_next(&pvec, &parents, i)) + +int mmu_pages_next(struct kvm_mmu_pages *pvec, struct mmu_page_path *parents, + int i) { - struct sync_walker *sync_walk = container_of(walk, struct sync_walker, - walker); - struct kvm_vcpu *vcpu = sync_walk->vcpu; + int n; - kvm_sync_page(vcpu, sp); - return (need_resched() || spin_needbreak(&vcpu->kvm->mmu_lock)); + for (n = i+1; n < pvec->nr; n++) { + struct kvm_mmu_page *sp = pvec->page[n].sp; + + if (sp->role.level == PT_PAGE_TABLE_LEVEL) { + parents->idx[0] = pvec->page[n].idx; + return n; + } + + parents->parent[sp->role.level-2] = sp; + parents->idx[sp->role.level-1] = pvec->page[n].idx; + } + + return n; } -static void mmu_sync_children(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) +void mmu_pages_clear_parents(struct mmu_page_path *parents) { - struct sync_walker walker = { - .walker = { .entry = mmu_sync_fn, }, - .vcpu = vcpu, - }; + struct kvm_mmu_page *sp; + unsigned int level = 0; + + do { + unsigned int idx = parents->idx[level]; + + sp = parents->parent[level]; + if (!sp) + return; + + --sp->unsync_children; + WARN_ON((int)sp->unsync_children < 0); + __clear_bit(idx, sp->unsync_child_bitmap); + level++; + } while (level < PT64_ROOT_LEVEL-1 && !sp->unsync_children); +} + +static void kvm_mmu_pages_init(struct kvm_mmu_page *parent, + struct mmu_page_path *parents, + struct kvm_mmu_pages *pvec) +{ + parents->parent[parent->role.level-1] = NULL; + pvec->nr = 0; +} + +static void mmu_sync_children(struct kvm_vcpu *vcpu, + struct kvm_mmu_page *parent) +{ + int i; + struct kvm_mmu_page *sp; + struct mmu_page_path parents; + struct kvm_mmu_pages pages; + + kvm_mmu_pages_init(parent, &parents, &pages); + while (mmu_unsync_walk(parent, &pages)) { + int protected = 0; - while (mmu_unsync_walk(sp, &walker.walker)) + for_each_sp(pages, sp, parents, i) + protected |= rmap_write_protect(vcpu->kvm, sp->gfn); + + if (protected) + kvm_flush_remote_tlbs(vcpu->kvm); + + for_each_sp(pages, sp, parents, i) { + kvm_sync_page(vcpu, sp); + mmu_pages_clear_parents(&parents); + } cond_resched_lock(&vcpu->kvm->mmu_lock); + kvm_mmu_pages_init(parent, &parents, &pages); + } } static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, @@ -1129,7 +1244,8 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, sp->role = role; hlist_add_head(&sp->hash_link, bucket); if (!metaphysical) { - rmap_write_protect(vcpu->kvm, gfn); + if (rmap_write_protect(vcpu->kvm, gfn)) + kvm_flush_remote_tlbs(vcpu->kvm); account_shadowed(vcpu->kvm, gfn); } if (shadow_trap_nonpresent_pte != shadow_notrap_nonpresent_pte) @@ -1153,6 +1269,8 @@ static int walk_shadow(struct kvm_shadow_walk *walker, if (level == PT32E_ROOT_LEVEL) { shadow_addr = vcpu->arch.mmu.pae_root[(addr >> 30) & 3]; shadow_addr &= PT64_BASE_ADDR_MASK; + if (!shadow_addr) + return 1; --level; } @@ -1237,33 +1355,29 @@ static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp) } } -struct zap_walker { - struct kvm_unsync_walk walker; - struct kvm *kvm; - int zapped; -}; - -static int mmu_zap_fn(struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk) +static int mmu_zap_unsync_children(struct kvm *kvm, + struct kvm_mmu_page *parent) { - struct zap_walker *zap_walk = container_of(walk, struct zap_walker, - walker); - kvm_mmu_zap_page(zap_walk->kvm, sp); - zap_walk->zapped = 1; - return 0; -} + int i, zapped = 0; + struct mmu_page_path parents; + struct kvm_mmu_pages pages; -static int mmu_zap_unsync_children(struct kvm *kvm, struct kvm_mmu_page *sp) -{ - struct zap_walker walker = { - .walker = { .entry = mmu_zap_fn, }, - .kvm = kvm, - .zapped = 0, - }; - - if (sp->role.level == PT_PAGE_TABLE_LEVEL) + if (parent->role.level == PT_PAGE_TABLE_LEVEL) return 0; - mmu_unsync_walk(sp, &walker.walker); - return walker.zapped; + + kvm_mmu_pages_init(parent, &parents, &pages); + while (mmu_unsync_walk(parent, &pages)) { + struct kvm_mmu_page *sp; + + for_each_sp(pages, sp, parents, i) { + kvm_mmu_zap_page(kvm, sp); + mmu_pages_clear_parents(&parents); + } + zapped += pages.nr; + kvm_mmu_pages_init(parent, &parents, &pages); + } + + return zapped; } static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp) @@ -1362,7 +1476,7 @@ static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn) int slot = memslot_id(kvm, gfn_to_memslot(kvm, gfn)); struct kvm_mmu_page *sp = page_header(__pa(pte)); - __set_bit(slot, &sp->slot_bitmap); + __set_bit(slot, sp->slot_bitmap); } static void mmu_convert_notrap(struct kvm_mmu_page *sp) @@ -1393,6 +1507,110 @@ struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva) return page; } +/* + * The function is based on mtrr_type_lookup() in + * arch/x86/kernel/cpu/mtrr/generic.c + */ +static int get_mtrr_type(struct mtrr_state_type *mtrr_state, + u64 start, u64 end) +{ + int i; + u64 base, mask; + u8 prev_match, curr_match; + int num_var_ranges = KVM_NR_VAR_MTRR; + + if (!mtrr_state->enabled) + return 0xFF; + + /* Make end inclusive end, instead of exclusive */ + end--; + + /* Look in fixed ranges. Just return the type as per start */ + if (mtrr_state->have_fixed && (start < 0x100000)) { + int idx; + + if (start < 0x80000) { + idx = 0; + idx += (start >> 16); + return mtrr_state->fixed_ranges[idx]; + } else if (start < 0xC0000) { + idx = 1 * 8; + idx += ((start - 0x80000) >> 14); + return mtrr_state->fixed_ranges[idx]; + } else if (start < 0x1000000) { + idx = 3 * 8; + idx += ((start - 0xC0000) >> 12); + return mtrr_state->fixed_ranges[idx]; + } + } + + /* + * Look in variable ranges + * Look of multiple ranges matching this address and pick type + * as per MTRR precedence + */ + if (!(mtrr_state->enabled & 2)) + return mtrr_state->def_type; + + prev_match = 0xFF; + for (i = 0; i < num_var_ranges; ++i) { + unsigned short start_state, end_state; + + if (!(mtrr_state->var_ranges[i].mask_lo & (1 << 11))) + continue; + + base = (((u64)mtrr_state->var_ranges[i].base_hi) << 32) + + (mtrr_state->var_ranges[i].base_lo & PAGE_MASK); + mask = (((u64)mtrr_state->var_ranges[i].mask_hi) << 32) + + (mtrr_state->var_ranges[i].mask_lo & PAGE_MASK); + + start_state = ((start & mask) == (base & mask)); + end_state = ((end & mask) == (base & mask)); + if (start_state != end_state) + return 0xFE; + + if ((start & mask) != (base & mask)) + continue; + + curr_match = mtrr_state->var_ranges[i].base_lo & 0xff; + if (prev_match == 0xFF) { + prev_match = curr_match; + continue; + } + + if (prev_match == MTRR_TYPE_UNCACHABLE || + curr_match == MTRR_TYPE_UNCACHABLE) + return MTRR_TYPE_UNCACHABLE; + + if ((prev_match == MTRR_TYPE_WRBACK && + curr_match == MTRR_TYPE_WRTHROUGH) || + (prev_match == MTRR_TYPE_WRTHROUGH && + curr_match == MTRR_TYPE_WRBACK)) { + prev_match = MTRR_TYPE_WRTHROUGH; + curr_match = MTRR_TYPE_WRTHROUGH; + } + + if (prev_match != curr_match) + return MTRR_TYPE_UNCACHABLE; + } + + if (prev_match != 0xFF) + return prev_match; + + return mtrr_state->def_type; +} + +static u8 get_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn) +{ + u8 mtrr; + + mtrr = get_mtrr_type(&vcpu->arch.mtrr_state, gfn << PAGE_SHIFT, + (gfn << PAGE_SHIFT) + PAGE_SIZE); + if (mtrr == 0xfe || mtrr == 0xff) + mtrr = MTRR_TYPE_WRBACK; + return mtrr; +} + static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) { unsigned index; @@ -1409,9 +1627,15 @@ static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) if (s->role.word != sp->role.word) return 1; } - kvm_mmu_mark_parents_unsync(vcpu, sp); ++vcpu->kvm->stat.mmu_unsync; sp->unsync = 1; + + if (sp->global) { + list_add(&sp->oos_link, &vcpu->kvm->arch.oos_global_pages); + ++vcpu->kvm->stat.mmu_unsync_global; + } else + kvm_mmu_mark_parents_unsync(vcpu, sp); + mmu_convert_notrap(sp); return 0; } @@ -1437,11 +1661,24 @@ static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, unsigned pte_access, int user_fault, int write_fault, int dirty, int largepage, - gfn_t gfn, pfn_t pfn, bool speculative, + int global, gfn_t gfn, pfn_t pfn, bool speculative, bool can_unsync) { u64 spte; int ret = 0; + u64 mt_mask = shadow_mt_mask; + struct kvm_mmu_page *sp = page_header(__pa(shadow_pte)); + + if (!(vcpu->arch.cr4 & X86_CR4_PGE)) + global = 0; + if (!global && sp->global) { + sp->global = 0; + if (sp->unsync) { + kvm_unlink_unsync_global(vcpu->kvm, sp); + kvm_mmu_mark_parents_unsync(vcpu, sp); + } + } + /* * We don't set the accessed bit, since we sometimes want to see * whether the guest actually used the pte (in order to detect @@ -1460,6 +1697,11 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, spte |= shadow_user_mask; if (largepage) spte |= PT_PAGE_SIZE_MASK; + if (mt_mask) { + mt_mask = get_memory_type(vcpu, gfn) << + kvm_x86_ops->get_mt_mask_shift(); + spte |= mt_mask; + } spte |= (u64)pfn << PAGE_SHIFT; @@ -1474,6 +1716,15 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, spte |= PT_WRITABLE_MASK; + /* + * Optimization: for pte sync, if spte was writable the hash + * lookup is unnecessary (and expensive). Write protection + * is responsibility of mmu_get_page / kvm_sync_page. + * Same reasoning can be applied to dirty page accounting. + */ + if (!can_unsync && is_writeble_pte(*shadow_pte)) + goto set_pte; + if (mmu_need_write_protect(vcpu, gfn, can_unsync)) { pgprintk("%s: found shadow page for %lx, marking ro\n", __func__, gfn); @@ -1495,8 +1746,8 @@ set_pte: static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, unsigned pt_access, unsigned pte_access, int user_fault, int write_fault, int dirty, - int *ptwrite, int largepage, gfn_t gfn, - pfn_t pfn, bool speculative) + int *ptwrite, int largepage, int global, + gfn_t gfn, pfn_t pfn, bool speculative) { int was_rmapped = 0; int was_writeble = is_writeble_pte(*shadow_pte); @@ -1529,7 +1780,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, } } if (set_spte(vcpu, shadow_pte, pte_access, user_fault, write_fault, - dirty, largepage, gfn, pfn, speculative, true)) { + dirty, largepage, global, gfn, pfn, speculative, true)) { if (write_fault) *ptwrite = 1; kvm_x86_ops->tlb_flush(vcpu); @@ -1586,7 +1837,7 @@ static int direct_map_entry(struct kvm_shadow_walk *_walk, || (walk->largepage && level == PT_DIRECTORY_LEVEL)) { mmu_set_spte(vcpu, sptep, ACC_ALL, ACC_ALL, 0, walk->write, 1, &walk->pt_write, - walk->largepage, gfn, walk->pfn, false); + walk->largepage, 0, gfn, walk->pfn, false); ++vcpu->stat.pf_fixed; return 1; } @@ -1773,6 +2024,15 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu) } } +static void mmu_sync_global(struct kvm_vcpu *vcpu) +{ + struct kvm *kvm = vcpu->kvm; + struct kvm_mmu_page *sp, *n; + + list_for_each_entry_safe(sp, n, &kvm->arch.oos_global_pages, oos_link) + kvm_sync_page(vcpu, sp); +} + void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) { spin_lock(&vcpu->kvm->mmu_lock); @@ -1780,6 +2040,13 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) spin_unlock(&vcpu->kvm->mmu_lock); } +void kvm_mmu_sync_global(struct kvm_vcpu *vcpu) +{ + spin_lock(&vcpu->kvm->mmu_lock); + mmu_sync_global(vcpu); + spin_unlock(&vcpu->kvm->mmu_lock); +} + static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr) { return vaddr; @@ -2178,7 +2445,8 @@ static void kvm_mmu_access_page(struct kvm_vcpu *vcpu, gfn_t gfn) } void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, - const u8 *new, int bytes) + const u8 *new, int bytes, + bool guest_initiated) { gfn_t gfn = gpa >> PAGE_SHIFT; struct kvm_mmu_page *sp; @@ -2204,15 +2472,17 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, kvm_mmu_free_some_pages(vcpu); ++vcpu->kvm->stat.mmu_pte_write; kvm_mmu_audit(vcpu, "pre pte write"); - if (gfn == vcpu->arch.last_pt_write_gfn - && !last_updated_pte_accessed(vcpu)) { - ++vcpu->arch.last_pt_write_count; - if (vcpu->arch.last_pt_write_count >= 3) - flooded = 1; - } else { - vcpu->arch.last_pt_write_gfn = gfn; - vcpu->arch.last_pt_write_count = 1; - vcpu->arch.last_pte_updated = NULL; + if (guest_initiated) { + if (gfn == vcpu->arch.last_pt_write_gfn + && !last_updated_pte_accessed(vcpu)) { + ++vcpu->arch.last_pt_write_count; + if (vcpu->arch.last_pt_write_count >= 3) + flooded = 1; + } else { + vcpu->arch.last_pt_write_gfn = gfn; + vcpu->arch.last_pt_write_count = 1; + vcpu->arch.last_pte_updated = NULL; + } } index = kvm_page_table_hashfn(gfn); bucket = &vcpu->kvm->arch.mmu_page_hash[index]; @@ -2352,9 +2622,7 @@ EXPORT_SYMBOL_GPL(kvm_mmu_page_fault); void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva) { - spin_lock(&vcpu->kvm->mmu_lock); vcpu->arch.mmu.invlpg(vcpu, gva); - spin_unlock(&vcpu->kvm->mmu_lock); kvm_mmu_flush_tlb(vcpu); ++vcpu->stat.invlpg; } @@ -2451,7 +2719,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) int i; u64 *pt; - if (!test_bit(slot, &sp->slot_bitmap)) + if (!test_bit(slot, sp->slot_bitmap)) continue; pt = sp->spt; @@ -2860,8 +3128,8 @@ static void audit_write_protection(struct kvm_vcpu *vcpu) if (sp->role.metaphysical) continue; - slot = gfn_to_memslot(vcpu->kvm, sp->gfn); gfn = unalias_gfn(vcpu->kvm, sp->gfn); + slot = gfn_to_memslot_unaliased(vcpu->kvm, sp->gfn); rmapp = &slot->rmap[gfn - slot->base_gfn]; if (*rmapp) printk(KERN_ERR "%s: (%s) shadow page has writable" diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 84eee43bbe74..9fd78b6e17ad 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -82,6 +82,7 @@ struct shadow_walker { int *ptwrite; pfn_t pfn; u64 *sptep; + gpa_t pte_gpa; }; static gfn_t gpte_to_gfn(pt_element_t gpte) @@ -222,7 +223,7 @@ walk: if (ret) goto walk; pte |= PT_DIRTY_MASK; - kvm_mmu_pte_write(vcpu, pte_gpa, (u8 *)&pte, sizeof(pte)); + kvm_mmu_pte_write(vcpu, pte_gpa, (u8 *)&pte, sizeof(pte), 0); walker->ptes[walker->level - 1] = pte; } @@ -274,7 +275,8 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, return; kvm_get_pfn(pfn); mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0, - gpte & PT_DIRTY_MASK, NULL, largepage, gpte_to_gfn(gpte), + gpte & PT_DIRTY_MASK, NULL, largepage, + gpte & PT_GLOBAL_MASK, gpte_to_gfn(gpte), pfn, true); } @@ -301,8 +303,9 @@ static int FNAME(shadow_walk_entry)(struct kvm_shadow_walk *_sw, mmu_set_spte(vcpu, sptep, access, gw->pte_access & access, sw->user_fault, sw->write_fault, gw->ptes[gw->level-1] & PT_DIRTY_MASK, - sw->ptwrite, sw->largepage, gw->gfn, sw->pfn, - false); + sw->ptwrite, sw->largepage, + gw->ptes[gw->level-1] & PT_GLOBAL_MASK, + gw->gfn, sw->pfn, false); sw->sptep = sptep; return 1; } @@ -466,10 +469,22 @@ static int FNAME(shadow_invlpg_entry)(struct kvm_shadow_walk *_sw, struct kvm_vcpu *vcpu, u64 addr, u64 *sptep, int level) { + struct shadow_walker *sw = + container_of(_sw, struct shadow_walker, walker); - if (level == PT_PAGE_TABLE_LEVEL) { - if (is_shadow_present_pte(*sptep)) + /* FIXME: properly handle invlpg on large guest pages */ + if (level == PT_PAGE_TABLE_LEVEL || + ((level == PT_DIRECTORY_LEVEL) && is_large_pte(*sptep))) { + struct kvm_mmu_page *sp = page_header(__pa(sptep)); + + sw->pte_gpa = (sp->gfn << PAGE_SHIFT); + sw->pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t); + + if (is_shadow_present_pte(*sptep)) { rmap_remove(vcpu->kvm, sptep); + if (is_large_pte(*sptep)) + --vcpu->kvm->stat.lpages; + } set_shadow_pte(sptep, shadow_trap_nonpresent_pte); return 1; } @@ -480,11 +495,26 @@ static int FNAME(shadow_invlpg_entry)(struct kvm_shadow_walk *_sw, static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) { + pt_element_t gpte; struct shadow_walker walker = { .walker = { .entry = FNAME(shadow_invlpg_entry), }, + .pte_gpa = -1, }; + spin_lock(&vcpu->kvm->mmu_lock); walk_shadow(&walker.walker, vcpu, gva); + spin_unlock(&vcpu->kvm->mmu_lock); + if (walker.pte_gpa == -1) + return; + if (kvm_read_guest_atomic(vcpu->kvm, walker.pte_gpa, &gpte, + sizeof(pt_element_t))) + return; + if (is_present_pte(gpte) && (gpte & PT_ACCESSED_MASK)) { + if (mmu_topup_memory_caches(vcpu)) + return; + kvm_mmu_pte_write(vcpu, walker.pte_gpa, (const u8 *)&gpte, + sizeof(pt_element_t), 0); + } } static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr) @@ -580,7 +610,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) nr_present++; pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); set_spte(vcpu, &sp->spt[i], pte_access, 0, 0, - is_dirty_pte(gpte), 0, gfn, + is_dirty_pte(gpte), 0, gpte & PT_GLOBAL_MASK, gfn, spte_to_pfn(sp->spt[i]), true, false); } diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 9c4ce657d963..1452851ae258 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -28,6 +28,8 @@ #include <asm/desc.h> +#include <asm/virtext.h> + #define __ex(x) __kvm_handle_fault_on_reboot(x) MODULE_AUTHOR("Qumranet"); @@ -245,34 +247,19 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu) static int has_svm(void) { - uint32_t eax, ebx, ecx, edx; - - if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) { - printk(KERN_INFO "has_svm: not amd\n"); - return 0; - } + const char *msg; - cpuid(0x80000000, &eax, &ebx, &ecx, &edx); - if (eax < SVM_CPUID_FUNC) { - printk(KERN_INFO "has_svm: can't execute cpuid_8000000a\n"); + if (!cpu_has_svm(&msg)) { + printk(KERN_INFO "has_svn: %s\n", msg); return 0; } - cpuid(0x80000001, &eax, &ebx, &ecx, &edx); - if (!(ecx & (1 << SVM_CPUID_FEATURE_SHIFT))) { - printk(KERN_DEBUG "has_svm: svm not available\n"); - return 0; - } return 1; } static void svm_hardware_disable(void *garbage) { - uint64_t efer; - - wrmsrl(MSR_VM_HSAVE_PA, 0); - rdmsrl(MSR_EFER, efer); - wrmsrl(MSR_EFER, efer & ~MSR_EFER_SVME_MASK); + cpu_svm_disable(); } static void svm_hardware_enable(void *garbage) @@ -772,6 +759,22 @@ static void svm_get_segment(struct kvm_vcpu *vcpu, var->l = (s->attrib >> SVM_SELECTOR_L_SHIFT) & 1; var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1; var->g = (s->attrib >> SVM_SELECTOR_G_SHIFT) & 1; + + /* + * SVM always stores 0 for the 'G' bit in the CS selector in + * the VMCB on a VMEXIT. This hurts cross-vendor migration: + * Intel's VMENTRY has a check on the 'G' bit. + */ + if (seg == VCPU_SREG_CS) + var->g = s->limit > 0xfffff; + + /* + * Work around a bug where the busy flag in the tr selector + * isn't exposed + */ + if (seg == VCPU_SREG_TR) + var->type |= 0x2; + var->unusable = !var->present; } @@ -1099,6 +1102,7 @@ static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) rep = (io_info & SVM_IOIO_REP_MASK) != 0; down = (svm->vmcb->save.rflags & X86_EFLAGS_DF) != 0; + skip_emulated_instruction(&svm->vcpu); return kvm_emulate_pio(&svm->vcpu, kvm_run, in, size, port); } @@ -1912,6 +1916,11 @@ static int get_npt_level(void) #endif } +static int svm_get_mt_mask_shift(void) +{ + return 0; +} + static struct kvm_x86_ops svm_x86_ops = { .cpu_has_kvm_support = has_svm, .disabled_by_bios = is_disabled, @@ -1967,6 +1976,7 @@ static struct kvm_x86_ops svm_x86_ops = { .set_tss_addr = svm_set_tss_addr, .get_tdp_level = get_npt_level, + .get_mt_mask_shift = svm_get_mt_mask_shift, }; static int __init svm_init(void) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index a4018b01e1f9..6259d7467648 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -16,7 +16,6 @@ */ #include "irq.h" -#include "vmx.h" #include "mmu.h" #include <linux/kvm_host.h> @@ -31,6 +30,8 @@ #include <asm/io.h> #include <asm/desc.h> +#include <asm/vmx.h> +#include <asm/virtext.h> #define __ex(x) __kvm_handle_fault_on_reboot(x) @@ -90,6 +91,11 @@ struct vcpu_vmx { } rmode; int vpid; bool emulation_required; + + /* Support for vnmi-less CPUs */ + int soft_vnmi_blocked; + ktime_t entry_time; + s64 vnmi_blocked_time; }; static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) @@ -122,7 +128,7 @@ static struct vmcs_config { u32 vmentry_ctrl; } vmcs_config; -struct vmx_capability { +static struct vmx_capability { u32 ept; u32 vpid; } vmx_capability; @@ -957,6 +963,13 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) pr_unimpl(vcpu, "unimplemented perfctr wrmsr: 0x%x data 0x%llx\n", msr_index, data); break; + case MSR_IA32_CR_PAT: + if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { + vmcs_write64(GUEST_IA32_PAT, data); + vcpu->arch.pat = data; + break; + } + /* Otherwise falls through to kvm_set_msr_common */ default: vmx_load_host_state(vmx); msr = find_msr_entry(vmx, msr_index); @@ -1032,8 +1045,7 @@ static int vmx_get_irq(struct kvm_vcpu *vcpu) static __init int cpu_has_kvm_support(void) { - unsigned long ecx = cpuid_ecx(1); - return test_bit(5, &ecx); /* CPUID.1:ECX.VMX[bit 5] -> VT */ + return cpu_has_vmx(); } static __init int vmx_disabled_by_bios(void) @@ -1079,13 +1091,22 @@ static void vmclear_local_vcpus(void) __vcpu_clear(vmx); } -static void hardware_disable(void *garbage) + +/* Just like cpu_vmxoff(), but with the __kvm_handle_fault_on_reboot() + * tricks. + */ +static void kvm_cpu_vmxoff(void) { - vmclear_local_vcpus(); asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc"); write_cr4(read_cr4() & ~X86_CR4_VMXE); } +static void hardware_disable(void *garbage) +{ + vmclear_local_vcpus(); + kvm_cpu_vmxoff(); +} + static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, u32 msr, u32 *result) { @@ -1176,12 +1197,13 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) #ifdef CONFIG_X86_64 min |= VM_EXIT_HOST_ADDR_SPACE_SIZE; #endif - opt = 0; + opt = VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT; if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS, &_vmexit_control) < 0) return -EIO; - min = opt = 0; + min = 0; + opt = VM_ENTRY_LOAD_IA32_PAT; if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS, &_vmentry_control) < 0) return -EIO; @@ -2087,8 +2109,9 @@ static void vmx_disable_intercept_for_msr(struct page *msr_bitmap, u32 msr) */ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) { - u32 host_sysenter_cs; + u32 host_sysenter_cs, msr_low, msr_high; u32 junk; + u64 host_pat; unsigned long a; struct descriptor_table dt; int i; @@ -2176,6 +2199,20 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) rdmsrl(MSR_IA32_SYSENTER_EIP, a); vmcs_writel(HOST_IA32_SYSENTER_EIP, a); /* 22.2.3 */ + if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) { + rdmsr(MSR_IA32_CR_PAT, msr_low, msr_high); + host_pat = msr_low | ((u64) msr_high << 32); + vmcs_write64(HOST_IA32_PAT, host_pat); + } + if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { + rdmsr(MSR_IA32_CR_PAT, msr_low, msr_high); + host_pat = msr_low | ((u64) msr_high << 32); + /* Write the default value follow host pat */ + vmcs_write64(GUEST_IA32_PAT, host_pat); + /* Keep arch.pat sync with GUEST_IA32_PAT */ + vmx->vcpu.arch.pat = host_pat; + } + for (i = 0; i < NR_VMX_MSR; ++i) { u32 index = vmx_msr_index[i]; u32 data_low, data_high; @@ -2230,6 +2267,8 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) vmx->vcpu.arch.rmode.active = 0; + vmx->soft_vnmi_blocked = 0; + vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val(); kvm_set_cr8(&vmx->vcpu, 0); msr = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; @@ -2335,6 +2374,29 @@ out: return ret; } +static void enable_irq_window(struct kvm_vcpu *vcpu) +{ + u32 cpu_based_vm_exec_control; + + cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); + cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING; + vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); +} + +static void enable_nmi_window(struct kvm_vcpu *vcpu) +{ + u32 cpu_based_vm_exec_control; + + if (!cpu_has_virtual_nmis()) { + enable_irq_window(vcpu); + return; + } + + cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); + cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_NMI_PENDING; + vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); +} + static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -2358,10 +2420,54 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq) static void vmx_inject_nmi(struct kvm_vcpu *vcpu) { + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (!cpu_has_virtual_nmis()) { + /* + * Tracking the NMI-blocked state in software is built upon + * finding the next open IRQ window. This, in turn, depends on + * well-behaving guests: They have to keep IRQs disabled at + * least as long as the NMI handler runs. Otherwise we may + * cause NMI nesting, maybe breaking the guest. But as this is + * highly unlikely, we can live with the residual risk. + */ + vmx->soft_vnmi_blocked = 1; + vmx->vnmi_blocked_time = 0; + } + + ++vcpu->stat.nmi_injections; + if (vcpu->arch.rmode.active) { + vmx->rmode.irq.pending = true; + vmx->rmode.irq.vector = NMI_VECTOR; + vmx->rmode.irq.rip = kvm_rip_read(vcpu); + vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, + NMI_VECTOR | INTR_TYPE_SOFT_INTR | + INTR_INFO_VALID_MASK); + vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1); + kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1); + return; + } vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR); } +static void vmx_update_window_states(struct kvm_vcpu *vcpu) +{ + u32 guest_intr = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); + + vcpu->arch.nmi_window_open = + !(guest_intr & (GUEST_INTR_STATE_STI | + GUEST_INTR_STATE_MOV_SS | + GUEST_INTR_STATE_NMI)); + if (!cpu_has_virtual_nmis() && to_vmx(vcpu)->soft_vnmi_blocked) + vcpu->arch.nmi_window_open = 0; + + vcpu->arch.interrupt_window_open = + ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) && + !(guest_intr & (GUEST_INTR_STATE_STI | + GUEST_INTR_STATE_MOV_SS))); +} + static void kvm_do_inject_irq(struct kvm_vcpu *vcpu) { int word_index = __ffs(vcpu->arch.irq_summary); @@ -2374,40 +2480,49 @@ static void kvm_do_inject_irq(struct kvm_vcpu *vcpu) kvm_queue_interrupt(vcpu, irq); } - static void do_interrupt_requests(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { - u32 cpu_based_vm_exec_control; - - vcpu->arch.interrupt_window_open = - ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) && - (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0); + vmx_update_window_states(vcpu); - if (vcpu->arch.interrupt_window_open && - vcpu->arch.irq_summary && !vcpu->arch.interrupt.pending) - kvm_do_inject_irq(vcpu); + if (vcpu->arch.nmi_pending && !vcpu->arch.nmi_injected) { + if (vcpu->arch.interrupt.pending) { + enable_nmi_window(vcpu); + } else if (vcpu->arch.nmi_window_open) { + vcpu->arch.nmi_pending = false; + vcpu->arch.nmi_injected = true; + } else { + enable_nmi_window(vcpu); + return; + } + } + if (vcpu->arch.nmi_injected) { + vmx_inject_nmi(vcpu); + if (vcpu->arch.nmi_pending) + enable_nmi_window(vcpu); + else if (vcpu->arch.irq_summary + || kvm_run->request_interrupt_window) + enable_irq_window(vcpu); + return; + } - if (vcpu->arch.interrupt_window_open && vcpu->arch.interrupt.pending) - vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr); + if (vcpu->arch.interrupt_window_open) { + if (vcpu->arch.irq_summary && !vcpu->arch.interrupt.pending) + kvm_do_inject_irq(vcpu); - cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); + if (vcpu->arch.interrupt.pending) + vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr); + } if (!vcpu->arch.interrupt_window_open && (vcpu->arch.irq_summary || kvm_run->request_interrupt_window)) - /* - * Interrupts blocked. Wait for unblock. - */ - cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING; - else - cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING; - vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); + enable_irq_window(vcpu); } static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr) { int ret; struct kvm_userspace_memory_region tss_mem = { - .slot = 8, + .slot = TSS_PRIVATE_MEMSLOT, .guest_phys_addr = addr, .memory_size = PAGE_SIZE * 3, .flags = 0, @@ -2492,7 +2607,7 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) set_bit(irq / BITS_PER_LONG, &vcpu->arch.irq_summary); } - if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) /* nmi */ + if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR) return 1; /* already handled by vmx_vcpu_run() */ if (is_no_device(intr_info)) { @@ -2581,6 +2696,7 @@ static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) rep = (exit_qualification & 32) != 0; port = exit_qualification >> 16; + skip_emulated_instruction(vcpu); return kvm_emulate_pio(vcpu, kvm_run, in, size, port); } @@ -2767,6 +2883,7 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu, vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); KVMTRACE_0D(PEND_INTR, vcpu, handler); + ++vcpu->stat.irq_window_exits; /* * If the user space waits to inject interrupts, exit as soon as @@ -2775,7 +2892,6 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu, if (kvm_run->request_interrupt_window && !vcpu->arch.irq_summary) { kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; - ++vcpu->stat.irq_window_exits; return 0; } return 1; @@ -2832,6 +2948,7 @@ static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) static int handle_task_switch(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { + struct vcpu_vmx *vmx = to_vmx(vcpu); unsigned long exit_qualification; u16 tss_selector; int reason; @@ -2839,6 +2956,15 @@ static int handle_task_switch(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) exit_qualification = vmcs_readl(EXIT_QUALIFICATION); reason = (u32)exit_qualification >> 30; + if (reason == TASK_SWITCH_GATE && vmx->vcpu.arch.nmi_injected && + (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) && + (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK) + == INTR_TYPE_NMI_INTR) { + vcpu->arch.nmi_injected = false; + if (cpu_has_virtual_nmis()) + vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, + GUEST_INTR_STATE_NMI); + } tss_selector = exit_qualification; return kvm_task_switch(vcpu, tss_selector, reason); @@ -2927,16 +3053,12 @@ static void handle_invalid_guest_state(struct kvm_vcpu *vcpu, while (!guest_state_valid(vcpu)) { err = emulate_instruction(vcpu, kvm_run, 0, 0, 0); - switch (err) { - case EMULATE_DONE: - break; - case EMULATE_DO_MMIO: - kvm_report_emulation_failure(vcpu, "mmio"); - /* TODO: Handle MMIO */ - return; - default: - kvm_report_emulation_failure(vcpu, "emulation failure"); - return; + if (err == EMULATE_DO_MMIO) + break; + + if (err != EMULATE_DONE) { + kvm_report_emulation_failure(vcpu, "emulation failure"); + return; } if (signal_pending(current)) @@ -2948,8 +3070,10 @@ static void handle_invalid_guest_state(struct kvm_vcpu *vcpu, local_irq_disable(); preempt_disable(); - /* Guest state should be valid now, no more emulation should be needed */ - vmx->emulation_required = 0; + /* Guest state should be valid now except if we need to + * emulate an MMIO */ + if (guest_state_valid(vcpu)) + vmx->emulation_required = 0; } /* @@ -2996,6 +3120,11 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) KVMTRACE_3D(VMEXIT, vcpu, exit_reason, (u32)kvm_rip_read(vcpu), (u32)((u64)kvm_rip_read(vcpu) >> 32), entryexit); + /* If we need to emulate an MMIO from handle_invalid_guest_state + * we just return 0 */ + if (vmx->emulation_required && emulate_invalid_guest_state) + return 0; + /* Access CR3 don't cause VMExit in paging mode, so we need * to sync with guest real CR3. */ if (vm_need_ept() && is_paging(vcpu)) { @@ -3012,9 +3141,32 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) if ((vectoring_info & VECTORING_INFO_VALID_MASK) && (exit_reason != EXIT_REASON_EXCEPTION_NMI && - exit_reason != EXIT_REASON_EPT_VIOLATION)) - printk(KERN_WARNING "%s: unexpected, valid vectoring info and " - "exit reason is 0x%x\n", __func__, exit_reason); + exit_reason != EXIT_REASON_EPT_VIOLATION && + exit_reason != EXIT_REASON_TASK_SWITCH)) + printk(KERN_WARNING "%s: unexpected, valid vectoring info " + "(0x%x) and exit reason is 0x%x\n", + __func__, vectoring_info, exit_reason); + + if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) { + if (vcpu->arch.interrupt_window_open) { + vmx->soft_vnmi_blocked = 0; + vcpu->arch.nmi_window_open = 1; + } else if (vmx->vnmi_blocked_time > 1000000000LL && + vcpu->arch.nmi_pending) { + /* + * This CPU don't support us in finding the end of an + * NMI-blocked window if the guest runs with IRQs + * disabled. So we pull the trigger after 1 s of + * futile waiting, but inform the user about this. + */ + printk(KERN_WARNING "%s: Breaking out of NMI-blocked " + "state on VCPU %d after 1 s timeout\n", + __func__, vcpu->vcpu_id); + vmx->soft_vnmi_blocked = 0; + vmx->vcpu.arch.nmi_window_open = 1; + } + } + if (exit_reason < kvm_vmx_max_exit_handlers && kvm_vmx_exit_handlers[exit_reason]) return kvm_vmx_exit_handlers[exit_reason](vcpu, kvm_run); @@ -3042,51 +3194,6 @@ static void update_tpr_threshold(struct kvm_vcpu *vcpu) vmcs_write32(TPR_THRESHOLD, (max_irr > tpr) ? tpr >> 4 : max_irr >> 4); } -static void enable_irq_window(struct kvm_vcpu *vcpu) -{ - u32 cpu_based_vm_exec_control; - - cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); - cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING; - vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); -} - -static void enable_nmi_window(struct kvm_vcpu *vcpu) -{ - u32 cpu_based_vm_exec_control; - - if (!cpu_has_virtual_nmis()) - return; - - cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); - cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_NMI_PENDING; - vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); -} - -static int vmx_nmi_enabled(struct kvm_vcpu *vcpu) -{ - u32 guest_intr = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); - return !(guest_intr & (GUEST_INTR_STATE_NMI | - GUEST_INTR_STATE_MOV_SS | - GUEST_INTR_STATE_STI)); -} - -static int vmx_irq_enabled(struct kvm_vcpu *vcpu) -{ - u32 guest_intr = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); - return (!(guest_intr & (GUEST_INTR_STATE_MOV_SS | - GUEST_INTR_STATE_STI)) && - (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF)); -} - -static void enable_intr_window(struct kvm_vcpu *vcpu) -{ - if (vcpu->arch.nmi_pending) - enable_nmi_window(vcpu); - else if (kvm_cpu_has_interrupt(vcpu)) - enable_irq_window(vcpu); -} - static void vmx_complete_interrupts(struct vcpu_vmx *vmx) { u32 exit_intr_info; @@ -3109,7 +3216,9 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx) if (unblock_nmi && vector != DF_VECTOR) vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI); - } + } else if (unlikely(vmx->soft_vnmi_blocked)) + vmx->vnmi_blocked_time += + ktime_to_ns(ktime_sub(ktime_get(), vmx->entry_time)); idt_vectoring_info = vmx->idt_vectoring_info; idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK; @@ -3147,26 +3256,29 @@ static void vmx_intr_assist(struct kvm_vcpu *vcpu) { update_tpr_threshold(vcpu); - if (cpu_has_virtual_nmis()) { - if (vcpu->arch.nmi_pending && !vcpu->arch.nmi_injected) { - if (vcpu->arch.interrupt.pending) { - enable_nmi_window(vcpu); - } else if (vmx_nmi_enabled(vcpu)) { - vcpu->arch.nmi_pending = false; - vcpu->arch.nmi_injected = true; - } else { - enable_intr_window(vcpu); - return; - } - } - if (vcpu->arch.nmi_injected) { - vmx_inject_nmi(vcpu); - enable_intr_window(vcpu); + vmx_update_window_states(vcpu); + + if (vcpu->arch.nmi_pending && !vcpu->arch.nmi_injected) { + if (vcpu->arch.interrupt.pending) { + enable_nmi_window(vcpu); + } else if (vcpu->arch.nmi_window_open) { + vcpu->arch.nmi_pending = false; + vcpu->arch.nmi_injected = true; + } else { + enable_nmi_window(vcpu); return; } } + if (vcpu->arch.nmi_injected) { + vmx_inject_nmi(vcpu); + if (vcpu->arch.nmi_pending) + enable_nmi_window(vcpu); + else if (kvm_cpu_has_interrupt(vcpu)) + enable_irq_window(vcpu); + return; + } if (!vcpu->arch.interrupt.pending && kvm_cpu_has_interrupt(vcpu)) { - if (vmx_irq_enabled(vcpu)) + if (vcpu->arch.interrupt_window_open) kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu)); else enable_irq_window(vcpu); @@ -3174,6 +3286,8 @@ static void vmx_intr_assist(struct kvm_vcpu *vcpu) if (vcpu->arch.interrupt.pending) { vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr); kvm_timer_intr_post(vcpu, vcpu->arch.interrupt.nr); + if (kvm_cpu_has_interrupt(vcpu)) + enable_irq_window(vcpu); } } @@ -3213,6 +3327,10 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) struct vcpu_vmx *vmx = to_vmx(vcpu); u32 intr_info; + /* Record the guest's net vcpu time for enforced NMI injections. */ + if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) + vmx->entry_time = ktime_get(); + /* Handle invalid guest state instead of entering VMX */ if (vmx->emulation_required && emulate_invalid_guest_state) { handle_invalid_guest_state(vcpu, kvm_run); @@ -3327,9 +3445,7 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) if (vmx->rmode.irq.pending) fixup_rmode_irq(vmx); - vcpu->arch.interrupt_window_open = - (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & - (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS)) == 0; + vmx_update_window_states(vcpu); asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS)); vmx->launched = 1; @@ -3337,7 +3453,7 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) intr_info = vmcs_read32(VM_EXIT_INTR_INFO); /* We need to handle NMIs before interrupts are enabled */ - if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200 && + if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR && (intr_info & INTR_INFO_VALID_MASK)) { KVMTRACE_0D(NMI, vcpu, handler); asm("int $2"); @@ -3455,6 +3571,11 @@ static int get_ept_level(void) return VMX_EPT_DEFAULT_GAW + 1; } +static int vmx_get_mt_mask_shift(void) +{ + return VMX_EPT_MT_EPTE_SHIFT; +} + static struct kvm_x86_ops vmx_x86_ops = { .cpu_has_kvm_support = cpu_has_kvm_support, .disabled_by_bios = vmx_disabled_by_bios, @@ -3510,6 +3631,7 @@ static struct kvm_x86_ops vmx_x86_ops = { .set_tss_addr = vmx_set_tss_addr, .get_tdp_level = get_ept_level, + .get_mt_mask_shift = vmx_get_mt_mask_shift, }; static int __init vmx_init(void) @@ -3566,10 +3688,10 @@ static int __init vmx_init(void) bypass_guest_pf = 0; kvm_mmu_set_base_ptes(VMX_EPT_READABLE_MASK | VMX_EPT_WRITABLE_MASK | - VMX_EPT_DEFAULT_MT << VMX_EPT_MT_EPTE_SHIFT | VMX_EPT_IGMT_BIT); kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull, - VMX_EPT_EXECUTABLE_MASK); + VMX_EPT_EXECUTABLE_MASK, + VMX_EPT_DEFAULT_MT << VMX_EPT_MT_EPTE_SHIFT); kvm_enable_tdp(); } else kvm_disable_tdp(); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index f1f8ff2f1fa2..cc17546a2406 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -34,11 +34,13 @@ #include <linux/module.h> #include <linux/mman.h> #include <linux/highmem.h> +#include <linux/iommu.h> #include <linux/intel-iommu.h> #include <asm/uaccess.h> #include <asm/msr.h> #include <asm/desc.h> +#include <asm/mtrr.h> #define MAX_IO_MSRS 256 #define CR0_RESERVED_BITS \ @@ -86,6 +88,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "halt_wakeup", VCPU_STAT(halt_wakeup) }, { "hypercalls", VCPU_STAT(hypercalls) }, { "request_irq", VCPU_STAT(request_irq_exits) }, + { "request_nmi", VCPU_STAT(request_nmi_exits) }, { "irq_exits", VCPU_STAT(irq_exits) }, { "host_state_reload", VCPU_STAT(host_state_reload) }, { "efer_reload", VCPU_STAT(efer_reload) }, @@ -93,6 +96,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "insn_emulation", VCPU_STAT(insn_emulation) }, { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) }, { "irq_injections", VCPU_STAT(irq_injections) }, + { "nmi_injections", VCPU_STAT(nmi_injections) }, { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) }, { "mmu_pte_write", VM_STAT(mmu_pte_write) }, { "mmu_pte_updated", VM_STAT(mmu_pte_updated) }, @@ -101,6 +105,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "mmu_recycled", VM_STAT(mmu_recycled) }, { "mmu_cache_miss", VM_STAT(mmu_cache_miss) }, { "mmu_unsync", VM_STAT(mmu_unsync) }, + { "mmu_unsync_global", VM_STAT(mmu_unsync_global) }, { "remote_tlb_flush", VM_STAT(remote_tlb_flush) }, { "largepages", VM_STAT(lpages) }, { NULL } @@ -312,6 +317,7 @@ void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) kvm_x86_ops->set_cr0(vcpu, cr0); vcpu->arch.cr0 = cr0; + kvm_mmu_sync_global(vcpu); kvm_mmu_reset_context(vcpu); return; } @@ -355,6 +361,7 @@ void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) } kvm_x86_ops->set_cr4(vcpu, cr4); vcpu->arch.cr4 = cr4; + kvm_mmu_sync_global(vcpu); kvm_mmu_reset_context(vcpu); } EXPORT_SYMBOL_GPL(kvm_set_cr4); @@ -449,7 +456,7 @@ static u32 msrs_to_save[] = { MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, #endif MSR_IA32_TIME_STAMP_COUNTER, MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, - MSR_IA32_PERF_STATUS, + MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT }; static unsigned num_msrs_to_save; @@ -648,10 +655,38 @@ static bool msr_mtrr_valid(unsigned msr) static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data) { + u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges; + if (!msr_mtrr_valid(msr)) return 1; - vcpu->arch.mtrr[msr - 0x200] = data; + if (msr == MSR_MTRRdefType) { + vcpu->arch.mtrr_state.def_type = data; + vcpu->arch.mtrr_state.enabled = (data & 0xc00) >> 10; + } else if (msr == MSR_MTRRfix64K_00000) + p[0] = data; + else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000) + p[1 + msr - MSR_MTRRfix16K_80000] = data; + else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000) + p[3 + msr - MSR_MTRRfix4K_C0000] = data; + else if (msr == MSR_IA32_CR_PAT) + vcpu->arch.pat = data; + else { /* Variable MTRRs */ + int idx, is_mtrr_mask; + u64 *pt; + + idx = (msr - 0x200) / 2; + is_mtrr_mask = msr - 0x200 - 2 * idx; + if (!is_mtrr_mask) + pt = + (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo; + else + pt = + (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo; + *pt = data; + } + + kvm_mmu_reset_context(vcpu); return 0; } @@ -747,10 +782,37 @@ int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) static int get_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) { + u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges; + if (!msr_mtrr_valid(msr)) return 1; - *pdata = vcpu->arch.mtrr[msr - 0x200]; + if (msr == MSR_MTRRdefType) + *pdata = vcpu->arch.mtrr_state.def_type + + (vcpu->arch.mtrr_state.enabled << 10); + else if (msr == MSR_MTRRfix64K_00000) + *pdata = p[0]; + else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000) + *pdata = p[1 + msr - MSR_MTRRfix16K_80000]; + else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000) + *pdata = p[3 + msr - MSR_MTRRfix4K_C0000]; + else if (msr == MSR_IA32_CR_PAT) + *pdata = vcpu->arch.pat; + else { /* Variable MTRRs */ + int idx, is_mtrr_mask; + u64 *pt; + + idx = (msr - 0x200) / 2; + is_mtrr_mask = msr - 0x200 - 2 * idx; + if (!is_mtrr_mask) + pt = + (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo; + else + pt = + (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo; + *pdata = *pt; + } + return 0; } @@ -903,7 +965,6 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_IRQCHIP: case KVM_CAP_HLT: case KVM_CAP_MMU_SHADOW_CACHE_CONTROL: - case KVM_CAP_USER_MEMORY: case KVM_CAP_SET_TSS_ADDR: case KVM_CAP_EXT_CPUID: case KVM_CAP_CLOCKSOURCE: @@ -929,7 +990,7 @@ int kvm_dev_ioctl_check_extension(long ext) r = !tdp_enabled; break; case KVM_CAP_IOMMU: - r = intel_iommu_found(); + r = iommu_found(); break; default: r = 0; @@ -1188,6 +1249,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, int t, times = entry->eax & 0xff; entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC; + entry->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT; for (t = 1; t < times && *nent < maxnent; ++t) { do_cpuid_1_ent(&entry[t], function, 0); entry[t].flags |= KVM_CPUID_FLAG_STATEFUL_FUNC; @@ -1218,7 +1280,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; /* read more entries until level_type is zero */ for (i = 1; *nent < maxnent; ++i) { - level_type = entry[i - 1].ecx & 0xff; + level_type = entry[i - 1].ecx & 0xff00; if (!level_type) break; do_cpuid_1_ent(&entry[i], function, i); @@ -1318,6 +1380,15 @@ static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, return 0; } +static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu) +{ + vcpu_load(vcpu); + kvm_inject_nmi(vcpu); + vcpu_put(vcpu); + + return 0; +} + static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu, struct kvm_tpr_access_ctl *tac) { @@ -1377,6 +1448,13 @@ long kvm_arch_vcpu_ioctl(struct file *filp, r = 0; break; } + case KVM_NMI: { + r = kvm_vcpu_ioctl_nmi(vcpu); + if (r) + goto out; + r = 0; + break; + } case KVM_SET_CPUID: { struct kvm_cpuid __user *cpuid_arg = argp; struct kvm_cpuid cpuid; @@ -1968,7 +2046,7 @@ int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes); if (ret < 0) return 0; - kvm_mmu_pte_write(vcpu, gpa, val, bytes); + kvm_mmu_pte_write(vcpu, gpa, val, bytes, 1); return 1; } @@ -2404,8 +2482,6 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, val = kvm_register_read(vcpu, VCPU_REGS_RAX); memcpy(vcpu->arch.pio_data, &val, 4); - kvm_x86_ops->skip_emulated_instruction(vcpu); - pio_dev = vcpu_find_pio_dev(vcpu, port, size, !in); if (pio_dev) { kernel_pio(pio_dev, vcpu, vcpu->arch.pio_data); @@ -2541,7 +2617,7 @@ int kvm_arch_init(void *opaque) kvm_mmu_set_nonpresent_ptes(0ull, 0ull); kvm_mmu_set_base_ptes(PT_PRESENT_MASK); kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, - PT_DIRTY_MASK, PT64_NX_MASK, 0); + PT_DIRTY_MASK, PT64_NX_MASK, 0, 0); return 0; out: @@ -2729,7 +2805,7 @@ static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i) e->flags &= ~KVM_CPUID_FLAG_STATE_READ_NEXT; /* when no next entry is found, the current entry[i] is reselected */ - for (j = i + 1; j == i; j = (j + 1) % nent) { + for (j = i + 1; ; j = (j + 1) % nent) { struct kvm_cpuid_entry2 *ej = &vcpu->arch.cpuid_entries[j]; if (ej->function == e->function) { ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT; @@ -2973,7 +3049,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) pr_debug("vcpu %d received sipi with vector # %x\n", vcpu->vcpu_id, vcpu->arch.sipi_vector); kvm_lapic_reset(vcpu); - r = kvm_x86_ops->vcpu_reset(vcpu); + r = kvm_arch_vcpu_reset(vcpu); if (r) return r; vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; @@ -3275,9 +3351,9 @@ static void seg_desct_to_kvm_desct(struct desc_struct *seg_desc, u16 selector, kvm_desct->padding = 0; } -static void get_segment_descritptor_dtable(struct kvm_vcpu *vcpu, - u16 selector, - struct descriptor_table *dtable) +static void get_segment_descriptor_dtable(struct kvm_vcpu *vcpu, + u16 selector, + struct descriptor_table *dtable) { if (selector & 1 << 2) { struct kvm_segment kvm_seg; @@ -3302,7 +3378,7 @@ static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, struct descriptor_table dtable; u16 index = selector >> 3; - get_segment_descritptor_dtable(vcpu, selector, &dtable); + get_segment_descriptor_dtable(vcpu, selector, &dtable); if (dtable.limit < index * 8 + 7) { kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc); @@ -3321,7 +3397,7 @@ static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, struct descriptor_table dtable; u16 index = selector >> 3; - get_segment_descritptor_dtable(vcpu, selector, &dtable); + get_segment_descriptor_dtable(vcpu, selector, &dtable); if (dtable.limit < index * 8 + 7) return 1; @@ -3900,6 +3976,7 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) /* We do fxsave: this must be aligned. */ BUG_ON((unsigned long)&vcpu->arch.host_fx_image & 0xF); + vcpu->arch.mtrr_state.have_fixed = 1; vcpu_load(vcpu); r = kvm_arch_vcpu_reset(vcpu); if (r == 0) @@ -3925,6 +4002,9 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu) { + vcpu->arch.nmi_pending = false; + vcpu->arch.nmi_injected = false; + return kvm_x86_ops->vcpu_reset(vcpu); } @@ -4012,6 +4092,7 @@ struct kvm *kvm_arch_create_vm(void) return ERR_PTR(-ENOMEM); INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); + INIT_LIST_HEAD(&kvm->arch.oos_global_pages); INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */ @@ -4048,8 +4129,8 @@ static void kvm_free_vcpus(struct kvm *kvm) void kvm_arch_destroy_vm(struct kvm *kvm) { - kvm_iommu_unmap_guest(kvm); kvm_free_all_assigned_devices(kvm); + kvm_iommu_unmap_guest(kvm); kvm_free_pit(kvm); kfree(kvm->arch.vpic); kfree(kvm->arch.vioapic); @@ -4127,7 +4208,8 @@ void kvm_arch_flush_shadow(struct kvm *kvm) int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) { return vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE - || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED; + || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED + || vcpu->arch.nmi_pending; } static void vcpu_kick_intr(void *info) diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c index ea051173b0da..d174db7a3370 100644 --- a/arch/x86/kvm/x86_emulate.c +++ b/arch/x86/kvm/x86_emulate.c @@ -58,6 +58,7 @@ #define SrcMem32 (4<<4) /* Memory operand (32-bit). */ #define SrcImm (5<<4) /* Immediate operand. */ #define SrcImmByte (6<<4) /* 8-bit sign-extended immediate operand. */ +#define SrcOne (7<<4) /* Implied '1' */ #define SrcMask (7<<4) /* Generic ModRM decode. */ #define ModRM (1<<7) @@ -70,17 +71,23 @@ #define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */ #define GroupDual (1<<15) /* Alternate decoding of mod == 3 */ #define GroupMask 0xff /* Group number stored in bits 0:7 */ +/* Source 2 operand type */ +#define Src2None (0<<29) +#define Src2CL (1<<29) +#define Src2ImmByte (2<<29) +#define Src2One (3<<29) +#define Src2Mask (7<<29) enum { Group1_80, Group1_81, Group1_82, Group1_83, Group1A, Group3_Byte, Group3, Group4, Group5, Group7, }; -static u16 opcode_table[256] = { +static u32 opcode_table[256] = { /* 0x00 - 0x07 */ ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, - 0, 0, 0, 0, + ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, 0, 0, /* 0x08 - 0x0F */ ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, @@ -195,7 +202,7 @@ static u16 opcode_table[256] = { ImplicitOps, ImplicitOps, Group | Group4, Group | Group5, }; -static u16 twobyte_table[256] = { +static u32 twobyte_table[256] = { /* 0x00 - 0x0F */ 0, Group | GroupDual | Group7, 0, 0, 0, 0, ImplicitOps, 0, ImplicitOps, ImplicitOps, 0, 0, 0, ImplicitOps | ModRM, 0, 0, @@ -230,9 +237,14 @@ static u16 twobyte_table[256] = { /* 0x90 - 0x9F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xA0 - 0xA7 */ - 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, 0, 0, + 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, + DstMem | SrcReg | Src2ImmByte | ModRM, + DstMem | SrcReg | Src2CL | ModRM, 0, 0, /* 0xA8 - 0xAF */ - 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, ModRM, 0, + 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, + DstMem | SrcReg | Src2ImmByte | ModRM, + DstMem | SrcReg | Src2CL | ModRM, + ModRM, 0, /* 0xB0 - 0xB7 */ ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 0, DstMem | SrcReg | ModRM | BitOp, @@ -253,7 +265,7 @@ static u16 twobyte_table[256] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; -static u16 group_table[] = { +static u32 group_table[] = { [Group1_80*8] = ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, @@ -297,9 +309,9 @@ static u16 group_table[] = { SrcMem16 | ModRM | Mov, SrcMem | ModRM | ByteOp, }; -static u16 group2_table[] = { +static u32 group2_table[] = { [Group7*8] = - SrcNone | ModRM, 0, 0, 0, + SrcNone | ModRM, 0, 0, SrcNone | ModRM, SrcNone | ModRM | DstMem | Mov, 0, SrcMem16 | ModRM | Mov, 0, }; @@ -359,49 +371,48 @@ static u16 group2_table[] = { "andl %"_msk",%"_LO32 _tmp"; " \ "orl %"_LO32 _tmp",%"_sav"; " +#ifdef CONFIG_X86_64 +#define ON64(x) x +#else +#define ON64(x) +#endif + +#define ____emulate_2op(_op, _src, _dst, _eflags, _x, _y, _suffix) \ + do { \ + __asm__ __volatile__ ( \ + _PRE_EFLAGS("0", "4", "2") \ + _op _suffix " %"_x"3,%1; " \ + _POST_EFLAGS("0", "4", "2") \ + : "=m" (_eflags), "=m" ((_dst).val), \ + "=&r" (_tmp) \ + : _y ((_src).val), "i" (EFLAGS_MASK)); \ + } while (0) + + /* Raw emulation: instruction has two explicit operands. */ #define __emulate_2op_nobyte(_op,_src,_dst,_eflags,_wx,_wy,_lx,_ly,_qx,_qy) \ - do { \ - unsigned long _tmp; \ - \ - switch ((_dst).bytes) { \ - case 2: \ - __asm__ __volatile__ ( \ - _PRE_EFLAGS("0", "4", "2") \ - _op"w %"_wx"3,%1; " \ - _POST_EFLAGS("0", "4", "2") \ - : "=m" (_eflags), "=m" ((_dst).val), \ - "=&r" (_tmp) \ - : _wy ((_src).val), "i" (EFLAGS_MASK)); \ - break; \ - case 4: \ - __asm__ __volatile__ ( \ - _PRE_EFLAGS("0", "4", "2") \ - _op"l %"_lx"3,%1; " \ - _POST_EFLAGS("0", "4", "2") \ - : "=m" (_eflags), "=m" ((_dst).val), \ - "=&r" (_tmp) \ - : _ly ((_src).val), "i" (EFLAGS_MASK)); \ - break; \ - case 8: \ - __emulate_2op_8byte(_op, _src, _dst, \ - _eflags, _qx, _qy); \ - break; \ - } \ + do { \ + unsigned long _tmp; \ + \ + switch ((_dst).bytes) { \ + case 2: \ + ____emulate_2op(_op,_src,_dst,_eflags,_wx,_wy,"w"); \ + break; \ + case 4: \ + ____emulate_2op(_op,_src,_dst,_eflags,_lx,_ly,"l"); \ + break; \ + case 8: \ + ON64(____emulate_2op(_op,_src,_dst,_eflags,_qx,_qy,"q")); \ + break; \ + } \ } while (0) #define __emulate_2op(_op,_src,_dst,_eflags,_bx,_by,_wx,_wy,_lx,_ly,_qx,_qy) \ do { \ - unsigned long __tmp; \ + unsigned long _tmp; \ switch ((_dst).bytes) { \ case 1: \ - __asm__ __volatile__ ( \ - _PRE_EFLAGS("0", "4", "2") \ - _op"b %"_bx"3,%1; " \ - _POST_EFLAGS("0", "4", "2") \ - : "=m" (_eflags), "=m" ((_dst).val), \ - "=&r" (__tmp) \ - : _by ((_src).val), "i" (EFLAGS_MASK)); \ + ____emulate_2op(_op,_src,_dst,_eflags,_bx,_by,"b"); \ break; \ default: \ __emulate_2op_nobyte(_op, _src, _dst, _eflags, \ @@ -425,71 +436,68 @@ static u16 group2_table[] = { __emulate_2op_nobyte(_op, _src, _dst, _eflags, \ "w", "r", _LO32, "r", "", "r") -/* Instruction has only one explicit operand (no source operand). */ -#define emulate_1op(_op, _dst, _eflags) \ - do { \ - unsigned long _tmp; \ - \ - switch ((_dst).bytes) { \ - case 1: \ - __asm__ __volatile__ ( \ - _PRE_EFLAGS("0", "3", "2") \ - _op"b %1; " \ - _POST_EFLAGS("0", "3", "2") \ - : "=m" (_eflags), "=m" ((_dst).val), \ - "=&r" (_tmp) \ - : "i" (EFLAGS_MASK)); \ - break; \ - case 2: \ - __asm__ __volatile__ ( \ - _PRE_EFLAGS("0", "3", "2") \ - _op"w %1; " \ - _POST_EFLAGS("0", "3", "2") \ - : "=m" (_eflags), "=m" ((_dst).val), \ - "=&r" (_tmp) \ - : "i" (EFLAGS_MASK)); \ - break; \ - case 4: \ - __asm__ __volatile__ ( \ - _PRE_EFLAGS("0", "3", "2") \ - _op"l %1; " \ - _POST_EFLAGS("0", "3", "2") \ - : "=m" (_eflags), "=m" ((_dst).val), \ - "=&r" (_tmp) \ - : "i" (EFLAGS_MASK)); \ - break; \ - case 8: \ - __emulate_1op_8byte(_op, _dst, _eflags); \ - break; \ - } \ +/* Instruction has three operands and one operand is stored in ECX register */ +#define __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, _suffix, _type) \ + do { \ + unsigned long _tmp; \ + _type _clv = (_cl).val; \ + _type _srcv = (_src).val; \ + _type _dstv = (_dst).val; \ + \ + __asm__ __volatile__ ( \ + _PRE_EFLAGS("0", "5", "2") \ + _op _suffix " %4,%1 \n" \ + _POST_EFLAGS("0", "5", "2") \ + : "=m" (_eflags), "+r" (_dstv), "=&r" (_tmp) \ + : "c" (_clv) , "r" (_srcv), "i" (EFLAGS_MASK) \ + ); \ + \ + (_cl).val = (unsigned long) _clv; \ + (_src).val = (unsigned long) _srcv; \ + (_dst).val = (unsigned long) _dstv; \ } while (0) -/* Emulate an instruction with quadword operands (x86/64 only). */ -#if defined(CONFIG_X86_64) -#define __emulate_2op_8byte(_op, _src, _dst, _eflags, _qx, _qy) \ - do { \ - __asm__ __volatile__ ( \ - _PRE_EFLAGS("0", "4", "2") \ - _op"q %"_qx"3,%1; " \ - _POST_EFLAGS("0", "4", "2") \ - : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \ - : _qy ((_src).val), "i" (EFLAGS_MASK)); \ +#define emulate_2op_cl(_op, _cl, _src, _dst, _eflags) \ + do { \ + switch ((_dst).bytes) { \ + case 2: \ + __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \ + "w", unsigned short); \ + break; \ + case 4: \ + __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \ + "l", unsigned int); \ + break; \ + case 8: \ + ON64(__emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \ + "q", unsigned long)); \ + break; \ + } \ } while (0) -#define __emulate_1op_8byte(_op, _dst, _eflags) \ - do { \ - __asm__ __volatile__ ( \ - _PRE_EFLAGS("0", "3", "2") \ - _op"q %1; " \ - _POST_EFLAGS("0", "3", "2") \ - : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \ - : "i" (EFLAGS_MASK)); \ +#define __emulate_1op(_op, _dst, _eflags, _suffix) \ + do { \ + unsigned long _tmp; \ + \ + __asm__ __volatile__ ( \ + _PRE_EFLAGS("0", "3", "2") \ + _op _suffix " %1; " \ + _POST_EFLAGS("0", "3", "2") \ + : "=m" (_eflags), "+m" ((_dst).val), \ + "=&r" (_tmp) \ + : "i" (EFLAGS_MASK)); \ } while (0) -#elif defined(__i386__) -#define __emulate_2op_8byte(_op, _src, _dst, _eflags, _qx, _qy) -#define __emulate_1op_8byte(_op, _dst, _eflags) -#endif /* __i386__ */ +/* Instruction has only one explicit operand (no source operand). */ +#define emulate_1op(_op, _dst, _eflags) \ + do { \ + switch ((_dst).bytes) { \ + case 1: __emulate_1op(_op, _dst, _eflags, "b"); break; \ + case 2: __emulate_1op(_op, _dst, _eflags, "w"); break; \ + case 4: __emulate_1op(_op, _dst, _eflags, "l"); break; \ + case 8: ON64(__emulate_1op(_op, _dst, _eflags, "q")); break; \ + } \ + } while (0) /* Fetch next part of the instruction being emulated. */ #define insn_fetch(_type, _size, _eip) \ @@ -1041,6 +1049,33 @@ done_prefixes: c->src.bytes = 1; c->src.val = insn_fetch(s8, 1, c->eip); break; + case SrcOne: + c->src.bytes = 1; + c->src.val = 1; + break; + } + + /* + * Decode and fetch the second source operand: register, memory + * or immediate. + */ + switch (c->d & Src2Mask) { + case Src2None: + break; + case Src2CL: + c->src2.bytes = 1; + c->src2.val = c->regs[VCPU_REGS_RCX] & 0x8; + break; + case Src2ImmByte: + c->src2.type = OP_IMM; + c->src2.ptr = (unsigned long *)c->eip; + c->src2.bytes = 1; + c->src2.val = insn_fetch(u8, 1, c->eip); + break; + case Src2One: + c->src2.bytes = 1; + c->src2.val = 1; + break; } /* Decode and fetch the destination operand: register or memory. */ @@ -1100,20 +1135,33 @@ static inline void emulate_push(struct x86_emulate_ctxt *ctxt) c->regs[VCPU_REGS_RSP]); } -static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops) +static int emulate_pop(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops) { struct decode_cache *c = &ctxt->decode; int rc; - rc = ops->read_std(register_address(c, ss_base(ctxt), - c->regs[VCPU_REGS_RSP]), - &c->dst.val, c->dst.bytes, ctxt->vcpu); + rc = ops->read_emulated(register_address(c, ss_base(ctxt), + c->regs[VCPU_REGS_RSP]), + &c->src.val, c->src.bytes, ctxt->vcpu); if (rc != 0) return rc; - register_address_increment(c, &c->regs[VCPU_REGS_RSP], c->dst.bytes); + register_address_increment(c, &c->regs[VCPU_REGS_RSP], c->src.bytes); + return rc; +} + +static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops) +{ + struct decode_cache *c = &ctxt->decode; + int rc; + c->src.bytes = c->dst.bytes; + rc = emulate_pop(ctxt, ops); + if (rc != 0) + return rc; + c->dst.val = c->src.val; return 0; } @@ -1415,24 +1463,15 @@ special_insn: emulate_1op("dec", c->dst, ctxt->eflags); break; case 0x50 ... 0x57: /* push reg */ - c->dst.type = OP_MEM; - c->dst.bytes = c->op_bytes; - c->dst.val = c->src.val; - register_address_increment(c, &c->regs[VCPU_REGS_RSP], - -c->op_bytes); - c->dst.ptr = (void *) register_address( - c, ss_base(ctxt), c->regs[VCPU_REGS_RSP]); + emulate_push(ctxt); break; case 0x58 ... 0x5f: /* pop reg */ pop_instruction: - if ((rc = ops->read_std(register_address(c, ss_base(ctxt), - c->regs[VCPU_REGS_RSP]), c->dst.ptr, - c->op_bytes, ctxt->vcpu)) != 0) + c->src.bytes = c->op_bytes; + rc = emulate_pop(ctxt, ops); + if (rc != 0) goto done; - - register_address_increment(c, &c->regs[VCPU_REGS_RSP], - c->op_bytes); - c->dst.type = OP_NONE; /* Disable writeback. */ + c->dst.val = c->src.val; break; case 0x63: /* movsxd */ if (ctxt->mode != X86EMUL_MODE_PROT64) @@ -1591,7 +1630,9 @@ special_insn: emulate_push(ctxt); break; case 0x9d: /* popf */ + c->dst.type = OP_REG; c->dst.ptr = (unsigned long *) &ctxt->eflags; + c->dst.bytes = c->op_bytes; goto pop_instruction; case 0xa0 ... 0xa1: /* mov */ c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX]; @@ -1689,7 +1730,9 @@ special_insn: emulate_grp2(ctxt); break; case 0xc3: /* ret */ + c->dst.type = OP_REG; c->dst.ptr = &c->eip; + c->dst.bytes = c->op_bytes; goto pop_instruction; case 0xc6 ... 0xc7: /* mov (sole member of Grp11) */ mov: @@ -1778,7 +1821,7 @@ special_insn: c->eip = saved_eip; goto cannot_emulate; } - return 0; + break; case 0xf4: /* hlt */ ctxt->vcpu->arch.halt_request = 1; break; @@ -1999,12 +2042,20 @@ twobyte_insn: c->src.val &= (c->dst.bytes << 3) - 1; emulate_2op_SrcV_nobyte("bt", c->src, c->dst, ctxt->eflags); break; + case 0xa4: /* shld imm8, r, r/m */ + case 0xa5: /* shld cl, r, r/m */ + emulate_2op_cl("shld", c->src2, c->src, c->dst, ctxt->eflags); + break; case 0xab: bts: /* bts */ /* only subword offset */ c->src.val &= (c->dst.bytes << 3) - 1; emulate_2op_SrcV_nobyte("bts", c->src, c->dst, ctxt->eflags); break; + case 0xac: /* shrd imm8, r, r/m */ + case 0xad: /* shrd cl, r, r/m */ + emulate_2op_cl("shrd", c->src2, c->src, c->dst, ctxt->eflags); + break; case 0xae: /* clflush */ break; case 0xb0 ... 0xb1: /* cmpxchg */ diff --git a/arch/x86/mach-default/setup.c b/arch/x86/mach-default/setup.c index 37b9ae4d44c5..df167f265622 100644 --- a/arch/x86/mach-default/setup.c +++ b/arch/x86/mach-default/setup.c @@ -133,29 +133,28 @@ void __init time_init_hook(void) **/ void mca_nmi_hook(void) { - /* If I recall correctly, there's a whole bunch of other things that + /* + * If I recall correctly, there's a whole bunch of other things that * we can do to check for NMI problems, but that's all I know about * at the moment. */ - - printk("NMI generated from unknown source!\n"); + pr_warning("NMI generated from unknown source!\n"); } #endif static __init int no_ipi_broadcast(char *str) { get_option(&str, &no_broadcast); - printk ("Using %s mode\n", no_broadcast ? "No IPI Broadcast" : - "IPI Broadcast"); + pr_info("Using %s mode\n", + no_broadcast ? "No IPI Broadcast" : "IPI Broadcast"); return 1; } - __setup("no_ipi_broadcast=", no_ipi_broadcast); static int __init print_ipi_mode(void) { - printk ("Using IPI %s mode\n", no_broadcast ? "No-Shortcut" : - "Shortcut"); + pr_info("Using IPI %s mode\n", + no_broadcast ? "No-Shortcut" : "Shortcut"); return 0; } diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 8655b5bb0963..f99a6c6c432e 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -435,8 +435,12 @@ static void __init set_highmem_pages_init(void) #endif /* !CONFIG_NUMA */ #else -# define permanent_kmaps_init(pgd_base) do { } while (0) -# define set_highmem_pages_init() do { } while (0) +static inline void permanent_kmaps_init(pgd_t *pgd_base) +{ +} +static inline void set_highmem_pages_init(void) +{ +} #endif /* CONFIG_HIGHMEM */ void __init native_pagetable_setup_start(pgd_t *base) diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index 1d88d2b39771..9e5752fe4d15 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -4,7 +4,7 @@ #include <linux/irq.h> #include <linux/dmi.h> #include <asm/numa.h> -#include "pci.h" +#include <asm/pci_x86.h> struct pci_root_info { char *name; diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c index 22e057665e55..9bb09823b362 100644 --- a/arch/x86/pci/amd_bus.c +++ b/arch/x86/pci/amd_bus.c @@ -2,7 +2,7 @@ #include <linux/pci.h> #include <linux/topology.h> #include <linux/cpu.h> -#include "pci.h" +#include <asm/pci_x86.h> #ifdef CONFIG_X86_64 #include <asm/pci-direct.h> diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index bb1a01f089e2..62ddb73e09ed 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -14,8 +14,7 @@ #include <asm/segment.h> #include <asm/io.h> #include <asm/smp.h> - -#include "pci.h" +#include <asm/pci_x86.h> unsigned int pci_probe = PCI_PROBE_BIOS | PCI_PROBE_CONF1 | PCI_PROBE_CONF2 | PCI_PROBE_MMCONF; diff --git a/arch/x86/pci/direct.c b/arch/x86/pci/direct.c index 9a5af6c8fbe9..bd13c3e4c6db 100644 --- a/arch/x86/pci/direct.c +++ b/arch/x86/pci/direct.c @@ -5,7 +5,7 @@ #include <linux/pci.h> #include <linux/init.h> #include <linux/dmi.h> -#include "pci.h" +#include <asm/pci_x86.h> /* * Functions for accessing PCI base (first 256 bytes) and extended diff --git a/arch/x86/pci/early.c b/arch/x86/pci/early.c index 86631ccbc25a..f6adf2c6d751 100644 --- a/arch/x86/pci/early.c +++ b/arch/x86/pci/early.c @@ -2,7 +2,7 @@ #include <linux/pci.h> #include <asm/pci-direct.h> #include <asm/io.h> -#include "pci.h" +#include <asm/pci_x86.h> /* Direct PCI access. This is used for PCI accesses in early boot before the PCI subsystem works. */ diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c index 2051dc96b8e9..7d388d5cf548 100644 --- a/arch/x86/pci/fixup.c +++ b/arch/x86/pci/fixup.c @@ -6,8 +6,7 @@ #include <linux/dmi.h> #include <linux/pci.h> #include <linux/init.h> -#include "pci.h" - +#include <asm/pci_x86.h> static void __devinit pci_fixup_i450nx(struct pci_dev *d) { diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c index 844df0cbbd3e..e51bf2cda4b0 100644 --- a/arch/x86/pci/i386.c +++ b/arch/x86/pci/i386.c @@ -34,8 +34,8 @@ #include <asm/pat.h> #include <asm/e820.h> +#include <asm/pci_x86.h> -#include "pci.h" static int skip_isa_ioresource_align(struct pci_dev *dev) { diff --git a/arch/x86/pci/init.c b/arch/x86/pci/init.c index d6c950f81858..bec3b048e72b 100644 --- a/arch/x86/pci/init.c +++ b/arch/x86/pci/init.c @@ -1,6 +1,6 @@ #include <linux/pci.h> #include <linux/init.h> -#include "pci.h" +#include <asm/pci_x86.h> /* arch_initcall has too random ordering, so call the initializers in the right sequence from here. */ diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c index bf69dbe08bff..373b9afe6d44 100644 --- a/arch/x86/pci/irq.c +++ b/arch/x86/pci/irq.c @@ -16,8 +16,7 @@ #include <asm/io_apic.h> #include <linux/irq.h> #include <linux/acpi.h> - -#include "pci.h" +#include <asm/pci_x86.h> #define PIRQ_SIGNATURE (('$' << 0) + ('P' << 8) + ('I' << 16) + ('R' << 24)) #define PIRQ_VERSION 0x0100 diff --git a/arch/x86/pci/legacy.c b/arch/x86/pci/legacy.c index b722dd481b39..f1065b129e9c 100644 --- a/arch/x86/pci/legacy.c +++ b/arch/x86/pci/legacy.c @@ -3,7 +3,7 @@ */ #include <linux/init.h> #include <linux/pci.h> -#include "pci.h" +#include <asm/pci_x86.h> /* * Discover remaining PCI buses in case there are peer host bridges. diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c index 654a2234f8f3..89bf9242c80a 100644 --- a/arch/x86/pci/mmconfig-shared.c +++ b/arch/x86/pci/mmconfig-shared.c @@ -15,8 +15,7 @@ #include <linux/acpi.h> #include <linux/bitmap.h> #include <asm/e820.h> - -#include "pci.h" +#include <asm/pci_x86.h> /* aperture is up to 256MB but BIOS may reserve less */ #define MMCONFIG_APER_MIN (2 * 1024*1024) diff --git a/arch/x86/pci/mmconfig_32.c b/arch/x86/pci/mmconfig_32.c index f3c761dce695..8b2d561046a3 100644 --- a/arch/x86/pci/mmconfig_32.c +++ b/arch/x86/pci/mmconfig_32.c @@ -13,7 +13,7 @@ #include <linux/init.h> #include <linux/acpi.h> #include <asm/e820.h> -#include "pci.h" +#include <asm/pci_x86.h> /* Assume systems with more busses have correct MCFG */ #define mmcfg_virt_addr ((void __iomem *) fix_to_virt(FIX_PCIE_MCFG)) diff --git a/arch/x86/pci/mmconfig_64.c b/arch/x86/pci/mmconfig_64.c index a1994163c99d..30007ffc8e11 100644 --- a/arch/x86/pci/mmconfig_64.c +++ b/arch/x86/pci/mmconfig_64.c @@ -10,8 +10,7 @@ #include <linux/acpi.h> #include <linux/bitmap.h> #include <asm/e820.h> - -#include "pci.h" +#include <asm/pci_x86.h> /* Static virtual mapping of the MMCONFIG aperture */ struct mmcfg_virt { diff --git a/arch/x86/pci/numaq_32.c b/arch/x86/pci/numaq_32.c index 1177845d3186..2089354968a2 100644 --- a/arch/x86/pci/numaq_32.c +++ b/arch/x86/pci/numaq_32.c @@ -7,7 +7,7 @@ #include <linux/nodemask.h> #include <mach_apic.h> #include <asm/mpspec.h> -#include "pci.h" +#include <asm/pci_x86.h> #define XQUAD_PORTIO_BASE 0xfe400000 #define XQUAD_PORTIO_QUAD 0x40000 /* 256k per quad. */ diff --git a/arch/x86/pci/olpc.c b/arch/x86/pci/olpc.c index e11e9e803d5f..b889d824f7c6 100644 --- a/arch/x86/pci/olpc.c +++ b/arch/x86/pci/olpc.c @@ -29,7 +29,7 @@ #include <linux/init.h> #include <asm/olpc.h> #include <asm/geode.h> -#include "pci.h" +#include <asm/pci_x86.h> /* * In the tables below, the first two line (8 longwords) are the diff --git a/arch/x86/pci/pcbios.c b/arch/x86/pci/pcbios.c index 37472fc6f729..b82cae970dfd 100644 --- a/arch/x86/pci/pcbios.c +++ b/arch/x86/pci/pcbios.c @@ -6,9 +6,8 @@ #include <linux/init.h> #include <linux/module.h> #include <linux/uaccess.h> -#include "pci.h" -#include "pci-functions.h" - +#include <asm/pci_x86.h> +#include <asm/mach-default/pci-functions.h> /* BIOS32 signature: "_32_" */ #define BIOS32_SIGNATURE (('_' << 0) + ('3' << 8) + ('2' << 16) + ('_' << 24)) diff --git a/arch/x86/pci/visws.c b/arch/x86/pci/visws.c index 42f4cb19faca..16d0c0eb0d19 100644 --- a/arch/x86/pci/visws.c +++ b/arch/x86/pci/visws.c @@ -9,11 +9,10 @@ #include <linux/init.h> #include <asm/setup.h> +#include <asm/pci_x86.h> #include <asm/visws/cobalt.h> #include <asm/visws/lithium.h> -#include "pci.h" - static int pci_visws_enable_irq(struct pci_dev *dev) { return 0; } static void pci_visws_disable_irq(struct pci_dev *dev) { } diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index 65d75a6be0ba..14f240623497 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -132,8 +132,7 @@ static void do_stolen_accounting(void) *snap = state; /* Add the appropriate number of ticks of stolen time, - including any left-overs from last time. Passing NULL to - account_steal_time accounts the time as stolen. */ + including any left-overs from last time. */ stolen = runnable + offline + __get_cpu_var(residual_stolen); if (stolen < 0) @@ -141,11 +140,10 @@ static void do_stolen_accounting(void) ticks = iter_div_u64_rem(stolen, NS_PER_TICK, &stolen); __get_cpu_var(residual_stolen) = stolen; - account_steal_time(NULL, ticks); + account_steal_ticks(ticks); /* Add the appropriate number of ticks of blocked time, - including any left-overs from last time. Passing idle to - account_steal_time accounts the time as idle/wait. */ + including any left-overs from last time. */ blocked += __get_cpu_var(residual_blocked); if (blocked < 0) @@ -153,7 +151,7 @@ static void do_stolen_accounting(void) ticks = iter_div_u64_rem(blocked, NS_PER_TICK, &blocked); __get_cpu_var(residual_blocked) = blocked; - account_steal_time(idle_task(smp_processor_id()), ticks); + account_idle_ticks(ticks); } /* |