summaryrefslogtreecommitdiff
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig13
-rw-r--r--arch/x86/ia32/ia32_signal.c3
-rw-r--r--arch/x86/ia32/ipc32.c1
-rw-r--r--arch/x86/ia32/sys_ia32.c2
-rw-r--r--arch/x86/include/asm/amd_iommu_types.h61
-rw-r--r--arch/x86/include/asm/apic.h3
-rw-r--r--arch/x86/include/asm/efi.h1
-rw-r--r--arch/x86/include/asm/kvm_host.h47
-rw-r--r--arch/x86/include/asm/kvm_x86_emulate.h11
-rw-r--r--arch/x86/include/asm/mpspec.h2
-rw-r--r--arch/x86/include/asm/mtrr.h25
-rw-r--r--arch/x86/include/asm/pci_x86.h (renamed from arch/x86/pci/pci.h)17
-rw-r--r--arch/x86/include/asm/svm.h (renamed from arch/x86/kvm/svm.h)0
-rw-r--r--arch/x86/include/asm/sys_ia32.h101
-rw-r--r--arch/x86/include/asm/uv/uv_bau.h46
-rw-r--r--arch/x86/include/asm/virtext.h132
-rw-r--r--arch/x86/include/asm/vmx.h (renamed from arch/x86/kvm/vmx.h)27
-rw-r--r--arch/x86/kernel/amd_iommu.c666
-rw-r--r--arch/x86/kernel/amd_iommu_init.c19
-rw-r--r--arch/x86/kernel/apic.c8
-rw-r--r--arch/x86/kernel/bios_uv.c2
-rw-r--r--arch/x86/kernel/cpu/mtrr/generic.c12
-rw-r--r--arch/x86/kernel/cpu/mtrr/main.c10
-rw-r--r--arch/x86/kernel/cpu/mtrr/mtrr.h18
-rw-r--r--arch/x86/kernel/cpuid.c6
-rw-r--r--arch/x86/kernel/crash.c18
-rw-r--r--arch/x86/kernel/early_printk.c2
-rw-r--r--arch/x86/kernel/genx2apic_phys.c4
-rw-r--r--arch/x86/kernel/head64.c2
-rw-r--r--arch/x86/kernel/init_task.c1
-rw-r--r--arch/x86/kernel/io_apic.c20
-rw-r--r--arch/x86/kernel/kvmclock.c10
-rw-r--r--arch/x86/kernel/ldt.c4
-rw-r--r--arch/x86/kernel/mmconf-fam10h_64.c3
-rw-r--r--arch/x86/kernel/mpparse.c10
-rw-r--r--arch/x86/kernel/nmi.c3
-rw-r--r--arch/x86/kernel/pci-gart_64.c2
-rw-r--r--arch/x86/kernel/reboot.c64
-rw-r--r--arch/x86/kernel/tlb_uv.c9
-rw-r--r--arch/x86/kernel/traps.c33
-rw-r--r--arch/x86/kernel/xsave.c2
-rw-r--r--arch/x86/kvm/Makefile4
-rw-r--r--arch/x86/kvm/i8254.c19
-rw-r--r--arch/x86/kvm/i8259.c52
-rw-r--r--arch/x86/kvm/irq.h6
-rw-r--r--arch/x86/kvm/kvm_svm.h2
-rw-r--r--arch/x86/kvm/lapic.c58
-rw-r--r--arch/x86/kvm/mmu.c444
-rw-r--r--arch/x86/kvm/paging_tmpl.h44
-rw-r--r--arch/x86/kvm/svm.c48
-rw-r--r--arch/x86/kvm/vmx.c350
-rw-r--r--arch/x86/kvm/x86.c120
-rw-r--r--arch/x86/kvm/x86_emulate.c297
-rw-r--r--arch/x86/mach-default/setup.c15
-rw-r--r--arch/x86/mm/init_32.c8
-rw-r--r--arch/x86/pci/acpi.c2
-rw-r--r--arch/x86/pci/amd_bus.c2
-rw-r--r--arch/x86/pci/common.c3
-rw-r--r--arch/x86/pci/direct.c2
-rw-r--r--arch/x86/pci/early.c2
-rw-r--r--arch/x86/pci/fixup.c3
-rw-r--r--arch/x86/pci/i386.c2
-rw-r--r--arch/x86/pci/init.c2
-rw-r--r--arch/x86/pci/irq.c3
-rw-r--r--arch/x86/pci/legacy.c2
-rw-r--r--arch/x86/pci/mmconfig-shared.c3
-rw-r--r--arch/x86/pci/mmconfig_32.c2
-rw-r--r--arch/x86/pci/mmconfig_64.c3
-rw-r--r--arch/x86/pci/numaq_32.c2
-rw-r--r--arch/x86/pci/olpc.c2
-rw-r--r--arch/x86/pci/pcbios.c5
-rw-r--r--arch/x86/pci/visws.c3
-rw-r--r--arch/x86/xen/time.c10
73 files changed, 2259 insertions, 681 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 249d1e0824b5..862adb9bf0d4 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -586,6 +586,16 @@ config AMD_IOMMU
your BIOS for an option to enable it or if you have an IVRS ACPI
table.
+config AMD_IOMMU_STATS
+ bool "Export AMD IOMMU statistics to debugfs"
+ depends on AMD_IOMMU
+ select DEBUG_FS
+ help
+ This option enables code in the AMD IOMMU driver to collect various
+ statistics about whats happening in the driver and exports that
+ information to userspace via debugfs.
+ If unsure, say N.
+
# need this always selected by IOMMU for the VIA workaround
config SWIOTLB
def_bool y if X86_64
@@ -599,6 +609,9 @@ config SWIOTLB
config IOMMU_HELPER
def_bool (CALGARY_IOMMU || GART_IOMMU || SWIOTLB || AMD_IOMMU)
+config IOMMU_API
+ def_bool (AMD_IOMMU || DMAR)
+
config MAXSMP
bool "Configure Maximum number of SMP Processors and NUMA Nodes"
depends on X86_64 && SMP && DEBUG_KERNEL && EXPERIMENTAL
diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c
index b195f85526e3..9dabd00e9805 100644
--- a/arch/x86/ia32/ia32_signal.c
+++ b/arch/x86/ia32/ia32_signal.c
@@ -24,15 +24,14 @@
#include <asm/ucontext.h>
#include <asm/uaccess.h>
#include <asm/i387.h>
-#include <asm/ia32.h>
#include <asm/ptrace.h>
#include <asm/ia32_unistd.h>
#include <asm/user32.h>
#include <asm/sigcontext32.h>
#include <asm/proto.h>
#include <asm/vdso.h>
-
#include <asm/sigframe.h>
+#include <asm/sys_ia32.h>
#define DEBUG_SIG 0
diff --git a/arch/x86/ia32/ipc32.c b/arch/x86/ia32/ipc32.c
index d21991ce606c..29cdcd02ead3 100644
--- a/arch/x86/ia32/ipc32.c
+++ b/arch/x86/ia32/ipc32.c
@@ -8,6 +8,7 @@
#include <linux/shm.h>
#include <linux/ipc.h>
#include <linux/compat.h>
+#include <asm/sys_ia32.h>
asmlinkage long sys32_ipc(u32 call, int first, int second, int third,
compat_uptr_t ptr, u32 fifth)
diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c
index 2e09dcd3c0a6..6c0d7f6231af 100644
--- a/arch/x86/ia32/sys_ia32.c
+++ b/arch/x86/ia32/sys_ia32.c
@@ -44,8 +44,8 @@
#include <asm/types.h>
#include <asm/uaccess.h>
#include <asm/atomic.h>
-#include <asm/ia32.h>
#include <asm/vgtod.h>
+#include <asm/sys_ia32.h>
#define AA(__x) ((unsigned long)(__x))
diff --git a/arch/x86/include/asm/amd_iommu_types.h b/arch/x86/include/asm/amd_iommu_types.h
index ac302a2fa339..95c8cd9d22b5 100644
--- a/arch/x86/include/asm/amd_iommu_types.h
+++ b/arch/x86/include/asm/amd_iommu_types.h
@@ -190,16 +190,23 @@
/* FIXME: move this macro to <linux/pci.h> */
#define PCI_BUS(x) (((x) >> 8) & 0xff)
+/* Protection domain flags */
+#define PD_DMA_OPS_MASK (1UL << 0) /* domain used for dma_ops */
+#define PD_DEFAULT_MASK (1UL << 1) /* domain is a default dma_ops
+ domain for an IOMMU */
+
/*
* This structure contains generic data for IOMMU protection domains
* independent of their use.
*/
struct protection_domain {
- spinlock_t lock; /* mostly used to lock the page table*/
- u16 id; /* the domain id written to the device table */
- int mode; /* paging mode (0-6 levels) */
- u64 *pt_root; /* page table root pointer */
- void *priv; /* private data */
+ spinlock_t lock; /* mostly used to lock the page table*/
+ u16 id; /* the domain id written to the device table */
+ int mode; /* paging mode (0-6 levels) */
+ u64 *pt_root; /* page table root pointer */
+ unsigned long flags; /* flags to find out type of domain */
+ unsigned dev_cnt; /* devices assigned to this domain */
+ void *priv; /* private data */
};
/*
@@ -295,7 +302,7 @@ struct amd_iommu {
bool int_enabled;
/* if one, we need to send a completion wait command */
- int need_sync;
+ bool need_sync;
/* default dma_ops domain for that IOMMU */
struct dma_ops_domain *default_dom;
@@ -374,7 +381,7 @@ extern struct protection_domain **amd_iommu_pd_table;
extern unsigned long *amd_iommu_pd_alloc_bitmap;
/* will be 1 if device isolation is enabled */
-extern int amd_iommu_isolate;
+extern bool amd_iommu_isolate;
/*
* If true, the addresses will be flushed on unmap time, not when
@@ -382,18 +389,6 @@ extern int amd_iommu_isolate;
*/
extern bool amd_iommu_unmap_flush;
-/* takes a PCI device id and prints it out in a readable form */
-static inline void print_devid(u16 devid, int nl)
-{
- int bus = devid >> 8;
- int dev = devid >> 3 & 0x1f;
- int fn = devid & 0x07;
-
- printk("%02x:%02x.%x", bus, dev, fn);
- if (nl)
- printk("\n");
-}
-
/* takes bus and device/function and returns the device id
* FIXME: should that be in generic PCI code? */
static inline u16 calc_devid(u8 bus, u8 devfn)
@@ -401,4 +396,32 @@ static inline u16 calc_devid(u8 bus, u8 devfn)
return (((u16)bus) << 8) | devfn;
}
+#ifdef CONFIG_AMD_IOMMU_STATS
+
+struct __iommu_counter {
+ char *name;
+ struct dentry *dent;
+ u64 value;
+};
+
+#define DECLARE_STATS_COUNTER(nm) \
+ static struct __iommu_counter nm = { \
+ .name = #nm, \
+ }
+
+#define INC_STATS_COUNTER(name) name.value += 1
+#define ADD_STATS_COUNTER(name, x) name.value += (x)
+#define SUB_STATS_COUNTER(name, x) name.value -= (x)
+
+#else /* CONFIG_AMD_IOMMU_STATS */
+
+#define DECLARE_STATS_COUNTER(name)
+#define INC_STATS_COUNTER(name)
+#define ADD_STATS_COUNTER(name, x)
+#define SUB_STATS_COUNTER(name, x)
+
+static inline void amd_iommu_stats_init(void) { }
+
+#endif /* CONFIG_AMD_IOMMU_STATS */
+
#endif /* _ASM_X86_AMD_IOMMU_TYPES_H */
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 25caa0738af5..ab1d51a8855e 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -54,7 +54,6 @@ extern int disable_apic;
extern int is_vsmp_box(void);
extern void xapic_wait_icr_idle(void);
extern u32 safe_xapic_wait_icr_idle(void);
-extern u64 xapic_icr_read(void);
extern void xapic_icr_write(u32, u32);
extern int setup_profiling_timer(unsigned int);
@@ -93,7 +92,7 @@ static inline u32 native_apic_msr_read(u32 reg)
}
#ifndef CONFIG_X86_32
-extern int x2apic, x2apic_preenabled;
+extern int x2apic;
extern void check_x2apic(void);
extern void enable_x2apic(void);
extern void enable_IR_x2apic(void);
diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index a2e545c91c35..ca5ffb2856b6 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -90,6 +90,7 @@ extern void __iomem *efi_ioremap(unsigned long addr, unsigned long size);
#endif /* CONFIG_X86_32 */
+extern int add_efi_memmap;
extern void efi_reserve_early(void);
extern void efi_call_phys_prelog(void);
extern void efi_call_phys_epilog(void);
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 8346be87cfa1..730843d1d2fb 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -21,6 +21,7 @@
#include <asm/pvclock-abi.h>
#include <asm/desc.h>
+#include <asm/mtrr.h>
#define KVM_MAX_VCPUS 16
#define KVM_MEMORY_SLOTS 32
@@ -86,6 +87,7 @@
#define KVM_MIN_FREE_MMU_PAGES 5
#define KVM_REFILL_PAGES 25
#define KVM_MAX_CPUID_ENTRIES 40
+#define KVM_NR_FIXED_MTRR_REGION 88
#define KVM_NR_VAR_MTRR 8
extern spinlock_t kvm_lock;
@@ -180,6 +182,8 @@ struct kvm_mmu_page {
struct list_head link;
struct hlist_node hash_link;
+ struct list_head oos_link;
+
/*
* The following two entries are used to key the shadow page in the
* hash table.
@@ -190,13 +194,16 @@ struct kvm_mmu_page {
u64 *spt;
/* hold the gfn of each spte inside spt */
gfn_t *gfns;
- unsigned long slot_bitmap; /* One bit set per slot which has memory
- * in this shadow page.
- */
+ /*
+ * One bit set per slot which has memory
+ * in this shadow page.
+ */
+ DECLARE_BITMAP(slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
int multimapped; /* More than one parent_pte? */
int root_count; /* Currently serving as active root */
bool unsync;
- bool unsync_children;
+ bool global;
+ unsigned int unsync_children;
union {
u64 *parent_pte; /* !multimapped */
struct hlist_head parent_ptes; /* multimapped, kvm_pte_chain */
@@ -327,8 +334,10 @@ struct kvm_vcpu_arch {
bool nmi_pending;
bool nmi_injected;
+ bool nmi_window_open;
- u64 mtrr[0x100];
+ struct mtrr_state_type mtrr_state;
+ u32 pat;
};
struct kvm_mem_alias {
@@ -350,11 +359,13 @@ struct kvm_arch{
*/
struct list_head active_mmu_pages;
struct list_head assigned_dev_head;
- struct dmar_domain *intel_iommu_domain;
+ struct list_head oos_global_pages;
+ struct iommu_domain *iommu_domain;
struct kvm_pic *vpic;
struct kvm_ioapic *vioapic;
struct kvm_pit *vpit;
struct hlist_head irq_ack_notifier_list;
+ int vapics_in_nmi_mode;
int round_robin_prev_vcpu;
unsigned int tss_addr;
@@ -378,6 +389,7 @@ struct kvm_vm_stat {
u32 mmu_recycled;
u32 mmu_cache_miss;
u32 mmu_unsync;
+ u32 mmu_unsync_global;
u32 remote_tlb_flush;
u32 lpages;
};
@@ -397,6 +409,7 @@ struct kvm_vcpu_stat {
u32 halt_exits;
u32 halt_wakeup;
u32 request_irq_exits;
+ u32 request_nmi_exits;
u32 irq_exits;
u32 host_state_reload;
u32 efer_reload;
@@ -405,6 +418,7 @@ struct kvm_vcpu_stat {
u32 insn_emulation_fail;
u32 hypercalls;
u32 irq_injections;
+ u32 nmi_injections;
};
struct descriptor_table {
@@ -477,6 +491,7 @@ struct kvm_x86_ops {
int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
int (*get_tdp_level)(void);
+ int (*get_mt_mask_shift)(void);
};
extern struct kvm_x86_ops *kvm_x86_ops;
@@ -490,7 +505,7 @@ int kvm_mmu_setup(struct kvm_vcpu *vcpu);
void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte);
void kvm_mmu_set_base_ptes(u64 base_pte);
void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
- u64 dirty_mask, u64 nx_mask, u64 x_mask);
+ u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 mt_mask);
int kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot);
@@ -587,12 +602,14 @@ unsigned long segment_base(u16 selector);
void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu);
void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
- const u8 *new, int bytes);
+ const u8 *new, int bytes,
+ bool guest_initiated);
int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva);
void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
int kvm_mmu_load(struct kvm_vcpu *vcpu);
void kvm_mmu_unload(struct kvm_vcpu *vcpu);
void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu);
+void kvm_mmu_sync_global(struct kvm_vcpu *vcpu);
int kvm_emulate_hypercall(struct kvm_vcpu *vcpu);
@@ -607,6 +624,8 @@ void kvm_disable_tdp(void);
int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3);
int complete_pio(struct kvm_vcpu *vcpu);
+struct kvm_memory_slot *gfn_to_memslot_unaliased(struct kvm *kvm, gfn_t gfn);
+
static inline struct kvm_mmu_page *page_header(hpa_t shadow_page)
{
struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT);
@@ -702,18 +721,6 @@ static inline void kvm_inject_gp(struct kvm_vcpu *vcpu, u32 error_code)
kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
}
-#define ASM_VMX_VMCLEAR_RAX ".byte 0x66, 0x0f, 0xc7, 0x30"
-#define ASM_VMX_VMLAUNCH ".byte 0x0f, 0x01, 0xc2"
-#define ASM_VMX_VMRESUME ".byte 0x0f, 0x01, 0xc3"
-#define ASM_VMX_VMPTRLD_RAX ".byte 0x0f, 0xc7, 0x30"
-#define ASM_VMX_VMREAD_RDX_RAX ".byte 0x0f, 0x78, 0xd0"
-#define ASM_VMX_VMWRITE_RAX_RDX ".byte 0x0f, 0x79, 0xd0"
-#define ASM_VMX_VMWRITE_RSP_RDX ".byte 0x0f, 0x79, 0xd4"
-#define ASM_VMX_VMXOFF ".byte 0x0f, 0x01, 0xc4"
-#define ASM_VMX_VMXON_RAX ".byte 0xf3, 0x0f, 0xc7, 0x30"
-#define ASM_VMX_INVEPT ".byte 0x66, 0x0f, 0x38, 0x80, 0x08"
-#define ASM_VMX_INVVPID ".byte 0x66, 0x0f, 0x38, 0x81, 0x08"
-
#define MSR_IA32_TIME_STAMP_COUNTER 0x010
#define TSS_IOPB_BASE_OFFSET 0x66
diff --git a/arch/x86/include/asm/kvm_x86_emulate.h b/arch/x86/include/asm/kvm_x86_emulate.h
index 25179a29f208..6a159732881a 100644
--- a/arch/x86/include/asm/kvm_x86_emulate.h
+++ b/arch/x86/include/asm/kvm_x86_emulate.h
@@ -123,6 +123,7 @@ struct decode_cache {
u8 ad_bytes;
u8 rex_prefix;
struct operand src;
+ struct operand src2;
struct operand dst;
bool has_seg_override;
u8 seg_override;
@@ -146,22 +147,18 @@ struct x86_emulate_ctxt {
/* Register state before/after emulation. */
struct kvm_vcpu *vcpu;
- /* Linear faulting address (if emulating a page-faulting instruction) */
unsigned long eflags;
-
/* Emulated execution mode, represented by an X86EMUL_MODE value. */
int mode;
-
u32 cs_base;
/* decode cache */
-
struct decode_cache decode;
};
/* Repeat String Operation Prefix */
-#define REPE_PREFIX 1
-#define REPNE_PREFIX 2
+#define REPE_PREFIX 1
+#define REPNE_PREFIX 2
/* Execution mode, passed to the emulator. */
#define X86EMUL_MODE_REAL 0 /* Real mode. */
@@ -170,7 +167,7 @@ struct x86_emulate_ctxt {
#define X86EMUL_MODE_PROT64 8 /* 64-bit (long) mode. */
/* Host execution mode. */
-#if defined(__i386__)
+#if defined(CONFIG_X86_32)
#define X86EMUL_MODE_HOST X86EMUL_MODE_PROT32
#elif defined(CONFIG_X86_64)
#define X86EMUL_MODE_HOST X86EMUL_MODE_PROT64
diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h
index 91885c28f66b..62d14ce3cd00 100644
--- a/arch/x86/include/asm/mpspec.h
+++ b/arch/x86/include/asm/mpspec.h
@@ -6,13 +6,13 @@
#include <asm/mpspec_def.h>
extern int apic_version[MAX_APICS];
+extern int pic_mode;
#ifdef CONFIG_X86_32
#include <mach_mpspec.h>
extern unsigned int def_to_bigsmp;
extern u8 apicid_2_node[];
-extern int pic_mode;
#ifdef CONFIG_X86_NUMAQ
extern int mp_bus_id_to_node[MAX_MP_BUSSES];
diff --git a/arch/x86/include/asm/mtrr.h b/arch/x86/include/asm/mtrr.h
index 7c1e4258b31e..cb988aab716d 100644
--- a/arch/x86/include/asm/mtrr.h
+++ b/arch/x86/include/asm/mtrr.h
@@ -57,6 +57,31 @@ struct mtrr_gentry {
};
#endif /* !__i386__ */
+struct mtrr_var_range {
+ u32 base_lo;
+ u32 base_hi;
+ u32 mask_lo;
+ u32 mask_hi;
+};
+
+/* In the Intel processor's MTRR interface, the MTRR type is always held in
+ an 8 bit field: */
+typedef u8 mtrr_type;
+
+#define MTRR_NUM_FIXED_RANGES 88
+#define MTRR_MAX_VAR_RANGES 256
+
+struct mtrr_state_type {
+ struct mtrr_var_range var_ranges[MTRR_MAX_VAR_RANGES];
+ mtrr_type fixed_ranges[MTRR_NUM_FIXED_RANGES];
+ unsigned char enabled;
+ unsigned char have_fixed;
+ mtrr_type def_type;
+};
+
+#define MTRRphysBase_MSR(reg) (0x200 + 2 * (reg))
+#define MTRRphysMask_MSR(reg) (0x200 + 2 * (reg) + 1)
+
/* These are the various ioctls */
#define MTRRIOC_ADD_ENTRY _IOW(MTRR_IOCTL_BASE, 0, struct mtrr_sentry)
#define MTRRIOC_SET_ENTRY _IOW(MTRR_IOCTL_BASE, 1, struct mtrr_sentry)
diff --git a/arch/x86/pci/pci.h b/arch/x86/include/asm/pci_x86.h
index 1959018aac02..e60fd3e14bdf 100644
--- a/arch/x86/pci/pci.h
+++ b/arch/x86/include/asm/pci_x86.h
@@ -57,7 +57,8 @@ extern struct pci_ops pci_root_ops;
struct irq_info {
u8 bus, devfn; /* Bus, device and function */
struct {
- u8 link; /* IRQ line ID, chipset dependent, 0=not routed */
+ u8 link; /* IRQ line ID, chipset dependent,
+ 0 = not routed */
u16 bitmap; /* Available IRQs */
} __attribute__((packed)) irq[4];
u8 slot; /* Slot number, 0=onboard */
@@ -69,11 +70,13 @@ struct irq_routing_table {
u16 version; /* PIRQ_VERSION */
u16 size; /* Table size in bytes */
u8 rtr_bus, rtr_devfn; /* Where the interrupt router lies */
- u16 exclusive_irqs; /* IRQs devoted exclusively to PCI usage */
- u16 rtr_vendor, rtr_device; /* Vendor and device ID of interrupt router */
+ u16 exclusive_irqs; /* IRQs devoted exclusively to
+ PCI usage */
+ u16 rtr_vendor, rtr_device; /* Vendor and device ID of
+ interrupt router */
u32 miniport_data; /* Crap */
u8 rfu[11];
- u8 checksum; /* Modulo 256 checksum must give zero */
+ u8 checksum; /* Modulo 256 checksum must give 0 */
struct irq_info slots[0];
} __attribute__((packed));
@@ -148,15 +151,15 @@ static inline unsigned int mmio_config_readl(void __iomem *pos)
static inline void mmio_config_writeb(void __iomem *pos, u8 val)
{
- asm volatile("movb %%al,(%1)" :: "a" (val), "r" (pos) : "memory");
+ asm volatile("movb %%al,(%1)" : : "a" (val), "r" (pos) : "memory");
}
static inline void mmio_config_writew(void __iomem *pos, u16 val)
{
- asm volatile("movw %%ax,(%1)" :: "a" (val), "r" (pos) : "memory");
+ asm volatile("movw %%ax,(%1)" : : "a" (val), "r" (pos) : "memory");
}
static inline void mmio_config_writel(void __iomem *pos, u32 val)
{
- asm volatile("movl %%eax,(%1)" :: "a" (val), "r" (pos) : "memory");
+ asm volatile("movl %%eax,(%1)" : : "a" (val), "r" (pos) : "memory");
}
diff --git a/arch/x86/kvm/svm.h b/arch/x86/include/asm/svm.h
index 1b8afa78e869..1b8afa78e869 100644
--- a/arch/x86/kvm/svm.h
+++ b/arch/x86/include/asm/svm.h
diff --git a/arch/x86/include/asm/sys_ia32.h b/arch/x86/include/asm/sys_ia32.h
new file mode 100644
index 000000000000..ffb08be2a530
--- /dev/null
+++ b/arch/x86/include/asm/sys_ia32.h
@@ -0,0 +1,101 @@
+/*
+ * sys_ia32.h - Linux ia32 syscall interfaces
+ *
+ * Copyright (c) 2008 Jaswinder Singh Rajput
+ *
+ * This file is released under the GPLv2.
+ * See the file COPYING for more details.
+ */
+
+#ifndef _ASM_X86_SYS_IA32_H
+#define _ASM_X86_SYS_IA32_H
+
+#include <linux/compiler.h>
+#include <linux/linkage.h>
+#include <linux/types.h>
+#include <linux/signal.h>
+#include <asm/compat.h>
+#include <asm/ia32.h>
+
+/* ia32/sys_ia32.c */
+asmlinkage long sys32_truncate64(char __user *, unsigned long, unsigned long);
+asmlinkage long sys32_ftruncate64(unsigned int, unsigned long, unsigned long);
+
+asmlinkage long sys32_stat64(char __user *, struct stat64 __user *);
+asmlinkage long sys32_lstat64(char __user *, struct stat64 __user *);
+asmlinkage long sys32_fstat64(unsigned int, struct stat64 __user *);
+asmlinkage long sys32_fstatat(unsigned int, char __user *,
+ struct stat64 __user *, int);
+struct mmap_arg_struct;
+asmlinkage long sys32_mmap(struct mmap_arg_struct __user *);
+asmlinkage long sys32_mprotect(unsigned long, size_t, unsigned long);
+
+asmlinkage long sys32_pipe(int __user *);
+struct sigaction32;
+struct old_sigaction32;
+asmlinkage long sys32_rt_sigaction(int, struct sigaction32 __user *,
+ struct sigaction32 __user *, unsigned int);
+asmlinkage long sys32_sigaction(int, struct old_sigaction32 __user *,
+ struct old_sigaction32 __user *);
+asmlinkage long sys32_rt_sigprocmask(int, compat_sigset_t __user *,
+ compat_sigset_t __user *, unsigned int);
+asmlinkage long sys32_alarm(unsigned int);
+
+struct sel_arg_struct;
+asmlinkage long sys32_old_select(struct sel_arg_struct __user *);
+asmlinkage long sys32_waitpid(compat_pid_t, unsigned int *, int);
+asmlinkage long sys32_sysfs(int, u32, u32);
+
+asmlinkage long sys32_sched_rr_get_interval(compat_pid_t,
+ struct compat_timespec __user *);
+asmlinkage long sys32_rt_sigpending(compat_sigset_t __user *, compat_size_t);
+asmlinkage long sys32_rt_sigqueueinfo(int, int, compat_siginfo_t __user *);
+
+#ifdef CONFIG_SYSCTL_SYSCALL
+struct sysctl_ia32;
+asmlinkage long sys32_sysctl(struct sysctl_ia32 __user *);
+#endif
+
+asmlinkage long sys32_pread(unsigned int, char __user *, u32, u32, u32);
+asmlinkage long sys32_pwrite(unsigned int, char __user *, u32, u32, u32);
+
+asmlinkage long sys32_personality(unsigned long);
+asmlinkage long sys32_sendfile(int, int, compat_off_t __user *, s32);
+
+asmlinkage long sys32_mmap2(unsigned long, unsigned long, unsigned long,
+ unsigned long, unsigned long, unsigned long);
+
+struct oldold_utsname;
+struct old_utsname;
+asmlinkage long sys32_olduname(struct oldold_utsname __user *);
+long sys32_uname(struct old_utsname __user *);
+
+long sys32_ustat(unsigned, struct ustat32 __user *);
+
+asmlinkage long sys32_execve(char __user *, compat_uptr_t __user *,
+ compat_uptr_t __user *, struct pt_regs *);
+asmlinkage long sys32_clone(unsigned int, unsigned int, struct pt_regs *);
+
+long sys32_lseek(unsigned int, int, unsigned int);
+long sys32_kill(int, int);
+long sys32_fadvise64_64(int, __u32, __u32, __u32, __u32, int);
+long sys32_vm86_warning(void);
+long sys32_lookup_dcookie(u32, u32, char __user *, size_t);
+
+asmlinkage ssize_t sys32_readahead(int, unsigned, unsigned, size_t);
+asmlinkage long sys32_sync_file_range(int, unsigned, unsigned,
+ unsigned, unsigned, int);
+asmlinkage long sys32_fadvise64(int, unsigned, unsigned, size_t, int);
+asmlinkage long sys32_fallocate(int, int, unsigned,
+ unsigned, unsigned, unsigned);
+
+/* ia32/ia32_signal.c */
+asmlinkage long sys32_sigsuspend(int, int, old_sigset_t);
+asmlinkage long sys32_sigaltstack(const stack_ia32_t __user *,
+ stack_ia32_t __user *, struct pt_regs *);
+asmlinkage long sys32_sigreturn(struct pt_regs *);
+asmlinkage long sys32_rt_sigreturn(struct pt_regs *);
+
+/* ia32/ipc32.c */
+asmlinkage long sys32_ipc(u32, int, int, int, compat_uptr_t, u32);
+#endif /* _ASM_X86_SYS_IA32_H */
diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h
index e2363253bbbf..50423c7b56b2 100644
--- a/arch/x86/include/asm/uv/uv_bau.h
+++ b/arch/x86/include/asm/uv/uv_bau.h
@@ -133,61 +133,61 @@ struct bau_msg_payload {
* see table 4.2.3.0.1 in broacast_assist spec.
*/
struct bau_msg_header {
- int dest_subnodeid:6; /* must be zero */
+ unsigned int dest_subnodeid:6; /* must be zero */
/* bits 5:0 */
- int base_dest_nodeid:15; /* nasid>>1 (pnode) of first bit in node_map */
- /* bits 20:6 */
- int command:8; /* message type */
+ unsigned int base_dest_nodeid:15; /* nasid>>1 (pnode) of */
+ /* bits 20:6 */ /* first bit in node_map */
+ unsigned int command:8; /* message type */
/* bits 28:21 */
/* 0x38: SN3net EndPoint Message */
- int rsvd_1:3; /* must be zero */
+ unsigned int rsvd_1:3; /* must be zero */
/* bits 31:29 */
/* int will align on 32 bits */
- int rsvd_2:9; /* must be zero */
+ unsigned int rsvd_2:9; /* must be zero */
/* bits 40:32 */
/* Suppl_A is 56-41 */
- int payload_2a:8; /* becomes byte 16 of msg */
+ unsigned int payload_2a:8;/* becomes byte 16 of msg */
/* bits 48:41 */ /* not currently using */
- int payload_2b:8; /* becomes byte 17 of msg */
+ unsigned int payload_2b:8;/* becomes byte 17 of msg */
/* bits 56:49 */ /* not currently using */
/* Address field (96:57) is never used as an
address (these are address bits 42:3) */
- int rsvd_3:1; /* must be zero */
+ unsigned int rsvd_3:1; /* must be zero */
/* bit 57 */
/* address bits 27:4 are payload */
/* these 24 bits become bytes 12-14 of msg */
- int replied_to:1; /* sent as 0 by the source to byte 12 */
+ unsigned int replied_to:1;/* sent as 0 by the source to byte 12 */
/* bit 58 */
- int payload_1a:5; /* not currently used */
+ unsigned int payload_1a:5;/* not currently used */
/* bits 63:59 */
- int payload_1b:8; /* not currently used */
+ unsigned int payload_1b:8;/* not currently used */
/* bits 71:64 */
- int payload_1c:8; /* not currently used */
+ unsigned int payload_1c:8;/* not currently used */
/* bits 79:72 */
- int payload_1d:2; /* not currently used */
+ unsigned int payload_1d:2;/* not currently used */
/* bits 81:80 */
- int rsvd_4:7; /* must be zero */
+ unsigned int rsvd_4:7; /* must be zero */
/* bits 88:82 */
- int sw_ack_flag:1; /* software acknowledge flag */
+ unsigned int sw_ack_flag:1;/* software acknowledge flag */
/* bit 89 */
/* INTD trasactions at destination are to
wait for software acknowledge */
- int rsvd_5:6; /* must be zero */
+ unsigned int rsvd_5:6; /* must be zero */
/* bits 95:90 */
- int rsvd_6:5; /* must be zero */
+ unsigned int rsvd_6:5; /* must be zero */
/* bits 100:96 */
- int int_both:1; /* if 1, interrupt both sockets on the blade */
+ unsigned int int_both:1;/* if 1, interrupt both sockets on the blade */
/* bit 101*/
- int fairness:3; /* usually zero */
+ unsigned int fairness:3;/* usually zero */
/* bits 104:102 */
- int multilevel:1; /* multi-level multicast format */
+ unsigned int multilevel:1; /* multi-level multicast format */
/* bit 105 */
/* 0 for TLB: endpoint multi-unicast messages */
- int chaining:1; /* next descriptor is part of this activation*/
+ unsigned int chaining:1;/* next descriptor is part of this activation*/
/* bit 106 */
- int rsvd_7:21; /* must be zero */
+ unsigned int rsvd_7:21; /* must be zero */
/* bits 127:107 */
};
diff --git a/arch/x86/include/asm/virtext.h b/arch/x86/include/asm/virtext.h
new file mode 100644
index 000000000000..593636275238
--- /dev/null
+++ b/arch/x86/include/asm/virtext.h
@@ -0,0 +1,132 @@
+/* CPU virtualization extensions handling
+ *
+ * This should carry the code for handling CPU virtualization extensions
+ * that needs to live in the kernel core.
+ *
+ * Author: Eduardo Habkost <ehabkost@redhat.com>
+ *
+ * Copyright (C) 2008, Red Hat Inc.
+ *
+ * Contains code from KVM, Copyright (C) 2006 Qumranet, Inc.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ */
+#ifndef _ASM_X86_VIRTEX_H
+#define _ASM_X86_VIRTEX_H
+
+#include <asm/processor.h>
+#include <asm/system.h>
+
+#include <asm/vmx.h>
+#include <asm/svm.h>
+
+/*
+ * VMX functions:
+ */
+
+static inline int cpu_has_vmx(void)
+{
+ unsigned long ecx = cpuid_ecx(1);
+ return test_bit(5, &ecx); /* CPUID.1:ECX.VMX[bit 5] -> VT */
+}
+
+
+/** Disable VMX on the current CPU
+ *
+ * vmxoff causes a undefined-opcode exception if vmxon was not run
+ * on the CPU previously. Only call this function if you know VMX
+ * is enabled.
+ */
+static inline void cpu_vmxoff(void)
+{
+ asm volatile (ASM_VMX_VMXOFF : : : "cc");
+ write_cr4(read_cr4() & ~X86_CR4_VMXE);
+}
+
+static inline int cpu_vmx_enabled(void)
+{
+ return read_cr4() & X86_CR4_VMXE;
+}
+
+/** Disable VMX if it is enabled on the current CPU
+ *
+ * You shouldn't call this if cpu_has_vmx() returns 0.
+ */
+static inline void __cpu_emergency_vmxoff(void)
+{
+ if (cpu_vmx_enabled())
+ cpu_vmxoff();
+}
+
+/** Disable VMX if it is supported and enabled on the current CPU
+ */
+static inline void cpu_emergency_vmxoff(void)
+{
+ if (cpu_has_vmx())
+ __cpu_emergency_vmxoff();
+}
+
+
+
+
+/*
+ * SVM functions:
+ */
+
+/** Check if the CPU has SVM support
+ *
+ * You can use the 'msg' arg to get a message describing the problem,
+ * if the function returns zero. Simply pass NULL if you are not interested
+ * on the messages; gcc should take care of not generating code for
+ * the messages on this case.
+ */
+static inline int cpu_has_svm(const char **msg)
+{
+ uint32_t eax, ebx, ecx, edx;
+
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) {
+ if (msg)
+ *msg = "not amd";
+ return 0;
+ }
+
+ cpuid(0x80000000, &eax, &ebx, &ecx, &edx);
+ if (eax < SVM_CPUID_FUNC) {
+ if (msg)
+ *msg = "can't execute cpuid_8000000a";
+ return 0;
+ }
+
+ cpuid(0x80000001, &eax, &ebx, &ecx, &edx);
+ if (!(ecx & (1 << SVM_CPUID_FEATURE_SHIFT))) {
+ if (msg)
+ *msg = "svm not available";
+ return 0;
+ }
+ return 1;
+}
+
+
+/** Disable SVM on the current CPU
+ *
+ * You should call this only if cpu_has_svm() returned true.
+ */
+static inline void cpu_svm_disable(void)
+{
+ uint64_t efer;
+
+ wrmsrl(MSR_VM_HSAVE_PA, 0);
+ rdmsrl(MSR_EFER, efer);
+ wrmsrl(MSR_EFER, efer & ~MSR_EFER_SVME_MASK);
+}
+
+/** Makes sure SVM is disabled, if it is supported on the CPU
+ */
+static inline void cpu_emergency_svm_disable(void)
+{
+ if (cpu_has_svm(NULL))
+ cpu_svm_disable();
+}
+
+#endif /* _ASM_X86_VIRTEX_H */
diff --git a/arch/x86/kvm/vmx.h b/arch/x86/include/asm/vmx.h
index ec5edc339da6..d0238e6151d8 100644
--- a/arch/x86/kvm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -63,10 +63,13 @@
#define VM_EXIT_HOST_ADDR_SPACE_SIZE 0x00000200
#define VM_EXIT_ACK_INTR_ON_EXIT 0x00008000
+#define VM_EXIT_SAVE_IA32_PAT 0x00040000
+#define VM_EXIT_LOAD_IA32_PAT 0x00080000
#define VM_ENTRY_IA32E_MODE 0x00000200
#define VM_ENTRY_SMM 0x00000400
#define VM_ENTRY_DEACT_DUAL_MONITOR 0x00000800
+#define VM_ENTRY_LOAD_IA32_PAT 0x00004000
/* VMCS Encodings */
enum vmcs_field {
@@ -112,6 +115,8 @@ enum vmcs_field {
VMCS_LINK_POINTER_HIGH = 0x00002801,
GUEST_IA32_DEBUGCTL = 0x00002802,
GUEST_IA32_DEBUGCTL_HIGH = 0x00002803,
+ GUEST_IA32_PAT = 0x00002804,
+ GUEST_IA32_PAT_HIGH = 0x00002805,
GUEST_PDPTR0 = 0x0000280a,
GUEST_PDPTR0_HIGH = 0x0000280b,
GUEST_PDPTR1 = 0x0000280c,
@@ -120,6 +125,8 @@ enum vmcs_field {
GUEST_PDPTR2_HIGH = 0x0000280f,
GUEST_PDPTR3 = 0x00002810,
GUEST_PDPTR3_HIGH = 0x00002811,
+ HOST_IA32_PAT = 0x00002c00,
+ HOST_IA32_PAT_HIGH = 0x00002c01,
PIN_BASED_VM_EXEC_CONTROL = 0x00004000,
CPU_BASED_VM_EXEC_CONTROL = 0x00004002,
EXCEPTION_BITMAP = 0x00004004,
@@ -331,8 +338,9 @@ enum vmcs_field {
#define AR_RESERVD_MASK 0xfffe0f00
-#define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT 9
-#define IDENTITY_PAGETABLE_PRIVATE_MEMSLOT 10
+#define TSS_PRIVATE_MEMSLOT (KVM_MEMORY_SLOTS + 0)
+#define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT (KVM_MEMORY_SLOTS + 1)
+#define IDENTITY_PAGETABLE_PRIVATE_MEMSLOT (KVM_MEMORY_SLOTS + 2)
#define VMX_NR_VPIDS (1 << 16)
#define VMX_VPID_EXTENT_SINGLE_CONTEXT 1
@@ -356,4 +364,19 @@ enum vmcs_field {
#define VMX_EPT_IDENTITY_PAGETABLE_ADDR 0xfffbc000ul
+
+#define ASM_VMX_VMCLEAR_RAX ".byte 0x66, 0x0f, 0xc7, 0x30"
+#define ASM_VMX_VMLAUNCH ".byte 0x0f, 0x01, 0xc2"
+#define ASM_VMX_VMRESUME ".byte 0x0f, 0x01, 0xc3"
+#define ASM_VMX_VMPTRLD_RAX ".byte 0x0f, 0xc7, 0x30"
+#define ASM_VMX_VMREAD_RDX_RAX ".byte 0x0f, 0x78, 0xd0"
+#define ASM_VMX_VMWRITE_RAX_RDX ".byte 0x0f, 0x79, 0xd0"
+#define ASM_VMX_VMWRITE_RSP_RDX ".byte 0x0f, 0x79, 0xd4"
+#define ASM_VMX_VMXOFF ".byte 0x0f, 0x01, 0xc4"
+#define ASM_VMX_VMXON_RAX ".byte 0xf3, 0x0f, 0xc7, 0x30"
+#define ASM_VMX_INVEPT ".byte 0x66, 0x0f, 0x38, 0x80, 0x08"
+#define ASM_VMX_INVVPID ".byte 0x66, 0x0f, 0x38, 0x81, 0x08"
+
+
+
#endif
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 2e2da717b350..5113c080f0c4 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -20,8 +20,12 @@
#include <linux/pci.h>
#include <linux/gfp.h>
#include <linux/bitops.h>
+#include <linux/debugfs.h>
#include <linux/scatterlist.h>
#include <linux/iommu-helper.h>
+#ifdef CONFIG_IOMMU_API
+#include <linux/iommu.h>
+#endif
#include <asm/proto.h>
#include <asm/iommu.h>
#include <asm/gart.h>
@@ -38,6 +42,10 @@ static DEFINE_RWLOCK(amd_iommu_devtable_lock);
static LIST_HEAD(iommu_pd_list);
static DEFINE_SPINLOCK(iommu_pd_list_lock);
+#ifdef CONFIG_IOMMU_API
+static struct iommu_ops amd_iommu_ops;
+#endif
+
/*
* general struct to manage commands send to an IOMMU
*/
@@ -47,6 +55,68 @@ struct iommu_cmd {
static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
struct unity_map_entry *e);
+static struct dma_ops_domain *find_protection_domain(u16 devid);
+
+
+#ifdef CONFIG_AMD_IOMMU_STATS
+
+/*
+ * Initialization code for statistics collection
+ */
+
+DECLARE_STATS_COUNTER(compl_wait);
+DECLARE_STATS_COUNTER(cnt_map_single);
+DECLARE_STATS_COUNTER(cnt_unmap_single);
+DECLARE_STATS_COUNTER(cnt_map_sg);
+DECLARE_STATS_COUNTER(cnt_unmap_sg);
+DECLARE_STATS_COUNTER(cnt_alloc_coherent);
+DECLARE_STATS_COUNTER(cnt_free_coherent);
+DECLARE_STATS_COUNTER(cross_page);
+DECLARE_STATS_COUNTER(domain_flush_single);
+DECLARE_STATS_COUNTER(domain_flush_all);
+DECLARE_STATS_COUNTER(alloced_io_mem);
+DECLARE_STATS_COUNTER(total_map_requests);
+
+static struct dentry *stats_dir;
+static struct dentry *de_isolate;
+static struct dentry *de_fflush;
+
+static void amd_iommu_stats_add(struct __iommu_counter *cnt)
+{
+ if (stats_dir == NULL)
+ return;
+
+ cnt->dent = debugfs_create_u64(cnt->name, 0444, stats_dir,
+ &cnt->value);
+}
+
+static void amd_iommu_stats_init(void)
+{
+ stats_dir = debugfs_create_dir("amd-iommu", NULL);
+ if (stats_dir == NULL)
+ return;
+
+ de_isolate = debugfs_create_bool("isolation", 0444, stats_dir,
+ (u32 *)&amd_iommu_isolate);
+
+ de_fflush = debugfs_create_bool("fullflush", 0444, stats_dir,
+ (u32 *)&amd_iommu_unmap_flush);
+
+ amd_iommu_stats_add(&compl_wait);
+ amd_iommu_stats_add(&cnt_map_single);
+ amd_iommu_stats_add(&cnt_unmap_single);
+ amd_iommu_stats_add(&cnt_map_sg);
+ amd_iommu_stats_add(&cnt_unmap_sg);
+ amd_iommu_stats_add(&cnt_alloc_coherent);
+ amd_iommu_stats_add(&cnt_free_coherent);
+ amd_iommu_stats_add(&cross_page);
+ amd_iommu_stats_add(&domain_flush_single);
+ amd_iommu_stats_add(&domain_flush_all);
+ amd_iommu_stats_add(&alloced_io_mem);
+ amd_iommu_stats_add(&total_map_requests);
+}
+
+#endif
/* returns !0 if the IOMMU is caching non-present entries in its TLB */
static int iommu_has_npcache(struct amd_iommu *iommu)
@@ -189,13 +259,55 @@ static int iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd)
spin_lock_irqsave(&iommu->lock, flags);
ret = __iommu_queue_command(iommu, cmd);
if (!ret)
- iommu->need_sync = 1;
+ iommu->need_sync = true;
spin_unlock_irqrestore(&iommu->lock, flags);
return ret;
}
/*
+ * This function waits until an IOMMU has completed a completion
+ * wait command
+ */
+static void __iommu_wait_for_completion(struct amd_iommu *iommu)
+{
+ int ready = 0;
+ unsigned status = 0;
+ unsigned long i = 0;
+
+ INC_STATS_COUNTER(compl_wait);
+
+ while (!ready && (i < EXIT_LOOP_COUNT)) {
+ ++i;
+ /* wait for the bit to become one */
+ status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET);
+ ready = status & MMIO_STATUS_COM_WAIT_INT_MASK;
+ }
+
+ /* set bit back to zero */
+ status &= ~MMIO_STATUS_COM_WAIT_INT_MASK;
+ writel(status, iommu->mmio_base + MMIO_STATUS_OFFSET);
+
+ if (unlikely(i == EXIT_LOOP_COUNT))
+ panic("AMD IOMMU: Completion wait loop failed\n");
+}
+
+/*
+ * This function queues a completion wait command into the command
+ * buffer of an IOMMU
+ */
+static int __iommu_completion_wait(struct amd_iommu *iommu)
+{
+ struct iommu_cmd cmd;
+
+ memset(&cmd, 0, sizeof(cmd));
+ cmd.data[0] = CMD_COMPL_WAIT_INT_MASK;
+ CMD_SET_TYPE(&cmd, CMD_COMPL_WAIT);
+
+ return __iommu_queue_command(iommu, &cmd);
+}
+
+/*
* This function is called whenever we need to ensure that the IOMMU has
* completed execution of all commands we sent. It sends a
* COMPLETION_WAIT command and waits for it to finish. The IOMMU informs
@@ -204,40 +316,22 @@ static int iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd)
*/
static int iommu_completion_wait(struct amd_iommu *iommu)
{
- int ret = 0, ready = 0;
- unsigned status = 0;
- struct iommu_cmd cmd;
- unsigned long flags, i = 0;
-
- memset(&cmd, 0, sizeof(cmd));
- cmd.data[0] = CMD_COMPL_WAIT_INT_MASK;
- CMD_SET_TYPE(&cmd, CMD_COMPL_WAIT);
+ int ret = 0;
+ unsigned long flags;
spin_lock_irqsave(&iommu->lock, flags);
if (!iommu->need_sync)
goto out;
- iommu->need_sync = 0;
+ ret = __iommu_completion_wait(iommu);
- ret = __iommu_queue_command(iommu, &cmd);
+ iommu->need_sync = false;
if (ret)
goto out;
- while (!ready && (i < EXIT_LOOP_COUNT)) {
- ++i;
- /* wait for the bit to become one */
- status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET);
- ready = status & MMIO_STATUS_COM_WAIT_INT_MASK;
- }
-
- /* set bit back to zero */
- status &= ~MMIO_STATUS_COM_WAIT_INT_MASK;
- writel(status, iommu->mmio_base + MMIO_STATUS_OFFSET);
-
- if (unlikely(i == EXIT_LOOP_COUNT))
- panic("AMD IOMMU: Completion wait loop failed\n");
+ __iommu_wait_for_completion(iommu);
out:
spin_unlock_irqrestore(&iommu->lock, flags);
@@ -264,6 +358,21 @@ static int iommu_queue_inv_dev_entry(struct amd_iommu *iommu, u16 devid)
return ret;
}
+static void __iommu_build_inv_iommu_pages(struct iommu_cmd *cmd, u64 address,
+ u16 domid, int pde, int s)
+{
+ memset(cmd, 0, sizeof(*cmd));
+ address &= PAGE_MASK;
+ CMD_SET_TYPE(cmd, CMD_INV_IOMMU_PAGES);
+ cmd->data[1] |= domid;
+ cmd->data[2] = lower_32_bits(address);
+ cmd->data[3] = upper_32_bits(address);
+ if (s) /* size bit - we flush more than one 4kb page */
+ cmd->data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK;
+ if (pde) /* PDE bit - we wan't flush everything not only the PTEs */
+ cmd->data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK;
+}
+
/*
* Generic command send function for invalidaing TLB entries
*/
@@ -273,16 +382,7 @@ static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu,
struct iommu_cmd cmd;
int ret;
- memset(&cmd, 0, sizeof(cmd));
- address &= PAGE_MASK;
- CMD_SET_TYPE(&cmd, CMD_INV_IOMMU_PAGES);
- cmd.data[1] |= domid;
- cmd.data[2] = lower_32_bits(address);
- cmd.data[3] = upper_32_bits(address);
- if (s) /* size bit - we flush more than one 4kb page */
- cmd.data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK;
- if (pde) /* PDE bit - we wan't flush everything not only the PTEs */
- cmd.data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK;
+ __iommu_build_inv_iommu_pages(&cmd, address, domid, pde, s);
ret = iommu_queue_command(iommu, &cmd);
@@ -321,9 +421,35 @@ static void iommu_flush_tlb(struct amd_iommu *iommu, u16 domid)
{
u64 address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS;
+ INC_STATS_COUNTER(domain_flush_single);
+
iommu_queue_inv_iommu_pages(iommu, address, domid, 0, 1);
}
+/*
+ * This function is used to flush the IO/TLB for a given protection domain
+ * on every IOMMU in the system
+ */
+static void iommu_flush_domain(u16 domid)
+{
+ unsigned long flags;
+ struct amd_iommu *iommu;
+ struct iommu_cmd cmd;
+
+ INC_STATS_COUNTER(domain_flush_all);
+
+ __iommu_build_inv_iommu_pages(&cmd, CMD_INV_IOMMU_ALL_PAGES_ADDRESS,
+ domid, 1, 1);
+
+ list_for_each_entry(iommu, &amd_iommu_list, list) {
+ spin_lock_irqsave(&iommu->lock, flags);
+ __iommu_queue_command(iommu, &cmd);
+ __iommu_completion_wait(iommu);
+ __iommu_wait_for_completion(iommu);
+ spin_unlock_irqrestore(&iommu->lock, flags);
+ }
+}
+
/****************************************************************************
*
* The functions below are used the create the page table mappings for
@@ -338,10 +464,10 @@ static void iommu_flush_tlb(struct amd_iommu *iommu, u16 domid)
* supporting all features of AMD IOMMU page tables like level skipping
* and full 64 bit address spaces.
*/
-static int iommu_map(struct protection_domain *dom,
- unsigned long bus_addr,
- unsigned long phys_addr,
- int prot)
+static int iommu_map_page(struct protection_domain *dom,
+ unsigned long bus_addr,
+ unsigned long phys_addr,
+ int prot)
{
u64 __pte, *pte, *page;
@@ -388,6 +514,28 @@ static int iommu_map(struct protection_domain *dom,
return 0;
}
+static void iommu_unmap_page(struct protection_domain *dom,
+ unsigned long bus_addr)
+{
+ u64 *pte;
+
+ pte = &dom->pt_root[IOMMU_PTE_L2_INDEX(bus_addr)];
+
+ if (!IOMMU_PTE_PRESENT(*pte))
+ return;
+
+ pte = IOMMU_PTE_PAGE(*pte);
+ pte = &pte[IOMMU_PTE_L1_INDEX(bus_addr)];
+
+ if (!IOMMU_PTE_PRESENT(*pte))
+ return;
+
+ pte = IOMMU_PTE_PAGE(*pte);
+ pte = &pte[IOMMU_PTE_L1_INDEX(bus_addr)];
+
+ *pte = 0;
+}
+
/*
* This function checks if a specific unity mapping entry is needed for
* this specific IOMMU.
@@ -440,7 +588,7 @@ static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
for (addr = e->address_start; addr < e->address_end;
addr += PAGE_SIZE) {
- ret = iommu_map(&dma_dom->domain, addr, addr, e->prot);
+ ret = iommu_map_page(&dma_dom->domain, addr, addr, e->prot);
if (ret)
return ret;
/*
@@ -571,6 +719,16 @@ static u16 domain_id_alloc(void)
return id;
}
+static void domain_id_free(int id)
+{
+ unsigned long flags;
+
+ write_lock_irqsave(&amd_iommu_devtable_lock, flags);
+ if (id > 0 && id < MAX_DOMAIN_ID)
+ __clear_bit(id, amd_iommu_pd_alloc_bitmap);
+ write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
+}
+
/*
* Used to reserve address ranges in the aperture (e.g. for exclusion
* ranges.
@@ -587,12 +745,12 @@ static void dma_ops_reserve_addresses(struct dma_ops_domain *dom,
iommu_area_reserve(dom->bitmap, start_page, pages);
}
-static void dma_ops_free_pagetable(struct dma_ops_domain *dma_dom)
+static void free_pagetable(struct protection_domain *domain)
{
int i, j;
u64 *p1, *p2, *p3;
- p1 = dma_dom->domain.pt_root;
+ p1 = domain->pt_root;
if (!p1)
return;
@@ -613,6 +771,8 @@ static void dma_ops_free_pagetable(struct dma_ops_domain *dma_dom)
}
free_page((unsigned long)p1);
+
+ domain->pt_root = NULL;
}
/*
@@ -624,7 +784,7 @@ static void dma_ops_domain_free(struct dma_ops_domain *dom)
if (!dom)
return;
- dma_ops_free_pagetable(dom);
+ free_pagetable(&dom->domain);
kfree(dom->pte_pages);
@@ -663,6 +823,7 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu,
goto free_dma_dom;
dma_dom->domain.mode = PAGE_MODE_3_LEVEL;
dma_dom->domain.pt_root = (void *)get_zeroed_page(GFP_KERNEL);
+ dma_dom->domain.flags = PD_DMA_OPS_MASK;
dma_dom->domain.priv = dma_dom;
if (!dma_dom->domain.pt_root)
goto free_dma_dom;
@@ -725,6 +886,15 @@ free_dma_dom:
}
/*
+ * little helper function to check whether a given protection domain is a
+ * dma_ops domain
+ */
+static bool dma_ops_domain(struct protection_domain *domain)
+{
+ return domain->flags & PD_DMA_OPS_MASK;
+}
+
+/*
* Find out the protection domain structure for a given PCI device. This
* will give us the pointer to the page table root for example.
*/
@@ -744,14 +914,15 @@ static struct protection_domain *domain_for_device(u16 devid)
* If a device is not yet associated with a domain, this function does
* assigns it visible for the hardware
*/
-static void set_device_domain(struct amd_iommu *iommu,
- struct protection_domain *domain,
- u16 devid)
+static void attach_device(struct amd_iommu *iommu,
+ struct protection_domain *domain,
+ u16 devid)
{
unsigned long flags;
-
u64 pte_root = virt_to_phys(domain->pt_root);
+ domain->dev_cnt += 1;
+
pte_root |= (domain->mode & DEV_ENTRY_MODE_MASK)
<< DEV_ENTRY_MODE_SHIFT;
pte_root |= IOMMU_PTE_IR | IOMMU_PTE_IW | IOMMU_PTE_P | IOMMU_PTE_TV;
@@ -767,6 +938,116 @@ static void set_device_domain(struct amd_iommu *iommu,
iommu_queue_inv_dev_entry(iommu, devid);
}
+/*
+ * Removes a device from a protection domain (unlocked)
+ */
+static void __detach_device(struct protection_domain *domain, u16 devid)
+{
+
+ /* lock domain */
+ spin_lock(&domain->lock);
+
+ /* remove domain from the lookup table */
+ amd_iommu_pd_table[devid] = NULL;
+
+ /* remove entry from the device table seen by the hardware */
+ amd_iommu_dev_table[devid].data[0] = IOMMU_PTE_P | IOMMU_PTE_TV;
+ amd_iommu_dev_table[devid].data[1] = 0;
+ amd_iommu_dev_table[devid].data[2] = 0;
+
+ /* decrease reference counter */
+ domain->dev_cnt -= 1;
+
+ /* ready */
+ spin_unlock(&domain->lock);
+}
+
+/*
+ * Removes a device from a protection domain (with devtable_lock held)
+ */
+static void detach_device(struct protection_domain *domain, u16 devid)
+{
+ unsigned long flags;
+
+ /* lock device table */
+ write_lock_irqsave(&amd_iommu_devtable_lock, flags);
+ __detach_device(domain, devid);
+ write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
+}
+
+static int device_change_notifier(struct notifier_block *nb,
+ unsigned long action, void *data)
+{
+ struct device *dev = data;
+ struct pci_dev *pdev = to_pci_dev(dev);
+ u16 devid = calc_devid(pdev->bus->number, pdev->devfn);
+ struct protection_domain *domain;
+ struct dma_ops_domain *dma_domain;
+ struct amd_iommu *iommu;
+ int order = amd_iommu_aperture_order;
+ unsigned long flags;
+
+ if (devid > amd_iommu_last_bdf)
+ goto out;
+
+ devid = amd_iommu_alias_table[devid];
+
+ iommu = amd_iommu_rlookup_table[devid];
+ if (iommu == NULL)
+ goto out;
+
+ domain = domain_for_device(devid);
+
+ if (domain && !dma_ops_domain(domain))
+ WARN_ONCE(1, "AMD IOMMU WARNING: device %s already bound "
+ "to a non-dma-ops domain\n", dev_name(dev));
+
+ switch (action) {
+ case BUS_NOTIFY_BOUND_DRIVER:
+ if (domain)
+ goto out;
+ dma_domain = find_protection_domain(devid);
+ if (!dma_domain)
+ dma_domain = iommu->default_dom;
+ attach_device(iommu, &dma_domain->domain, devid);
+ printk(KERN_INFO "AMD IOMMU: Using protection domain %d for "
+ "device %s\n", dma_domain->domain.id, dev_name(dev));
+ break;
+ case BUS_NOTIFY_UNBIND_DRIVER:
+ if (!domain)
+ goto out;
+ detach_device(domain, devid);
+ break;
+ case BUS_NOTIFY_ADD_DEVICE:
+ /* allocate a protection domain if a device is added */
+ dma_domain = find_protection_domain(devid);
+ if (dma_domain)
+ goto out;
+ dma_domain = dma_ops_domain_alloc(iommu, order);
+ if (!dma_domain)
+ goto out;
+ dma_domain->target_dev = devid;
+
+ spin_lock_irqsave(&iommu_pd_list_lock, flags);
+ list_add_tail(&dma_domain->list, &iommu_pd_list);
+ spin_unlock_irqrestore(&iommu_pd_list_lock, flags);
+
+ break;
+ default:
+ goto out;
+ }
+
+ iommu_queue_inv_dev_entry(iommu, devid);
+ iommu_completion_wait(iommu);
+
+out:
+ return 0;
+}
+
+struct notifier_block device_nb = {
+ .notifier_call = device_change_notifier,
+};
+
/*****************************************************************************
*
* The next functions belong to the dma_ops mapping/unmapping code.
@@ -802,7 +1083,6 @@ static struct dma_ops_domain *find_protection_domain(u16 devid)
list_for_each_entry(entry, &iommu_pd_list, list) {
if (entry->target_dev == devid) {
ret = entry;
- list_del(&ret->list);
break;
}
}
@@ -853,14 +1133,13 @@ static int get_device_resources(struct device *dev,
if (!dma_dom)
dma_dom = (*iommu)->default_dom;
*domain = &dma_dom->domain;
- set_device_domain(*iommu, *domain, *bdf);
+ attach_device(*iommu, *domain, *bdf);
printk(KERN_INFO "AMD IOMMU: Using protection domain %d for "
- "device ", (*domain)->id);
- print_devid(_bdf, 1);
+ "device %s\n", (*domain)->id, dev_name(dev));
}
if (domain_for_device(_bdf) == NULL)
- set_device_domain(*iommu, *domain, _bdf);
+ attach_device(*iommu, *domain, _bdf);
return 1;
}
@@ -946,6 +1225,11 @@ static dma_addr_t __map_single(struct device *dev,
pages = iommu_num_pages(paddr, size, PAGE_SIZE);
paddr &= PAGE_MASK;
+ INC_STATS_COUNTER(total_map_requests);
+
+ if (pages > 1)
+ INC_STATS_COUNTER(cross_page);
+
if (align)
align_mask = (1UL << get_order(size)) - 1;
@@ -962,6 +1246,8 @@ static dma_addr_t __map_single(struct device *dev,
}
address += offset;
+ ADD_STATS_COUNTER(alloced_io_mem, size);
+
if (unlikely(dma_dom->need_flush && !amd_iommu_unmap_flush)) {
iommu_flush_tlb(iommu, dma_dom->domain.id);
dma_dom->need_flush = false;
@@ -998,6 +1284,8 @@ static void __unmap_single(struct amd_iommu *iommu,
start += PAGE_SIZE;
}
+ SUB_STATS_COUNTER(alloced_io_mem, size);
+
dma_ops_free_addresses(dma_dom, dma_addr, pages);
if (amd_iommu_unmap_flush || dma_dom->need_flush) {
@@ -1019,6 +1307,8 @@ static dma_addr_t map_single(struct device *dev, phys_addr_t paddr,
dma_addr_t addr;
u64 dma_mask;
+ INC_STATS_COUNTER(cnt_map_single);
+
if (!check_device(dev))
return bad_dma_address;
@@ -1030,6 +1320,9 @@ static dma_addr_t map_single(struct device *dev, phys_addr_t paddr,
/* device not handled by any AMD IOMMU */
return (dma_addr_t)paddr;
+ if (!dma_ops_domain(domain))
+ return bad_dma_address;
+
spin_lock_irqsave(&domain->lock, flags);
addr = __map_single(dev, iommu, domain->priv, paddr, size, dir, false,
dma_mask);
@@ -1055,11 +1348,16 @@ static void unmap_single(struct device *dev, dma_addr_t dma_addr,
struct protection_domain *domain;
u16 devid;
+ INC_STATS_COUNTER(cnt_unmap_single);
+
if (!check_device(dev) ||
!get_device_resources(dev, &iommu, &domain, &devid))
/* device not handled by any AMD IOMMU */
return;
+ if (!dma_ops_domain(domain))
+ return;
+
spin_lock_irqsave(&domain->lock, flags);
__unmap_single(iommu, domain->priv, dma_addr, size, dir);
@@ -1104,6 +1402,8 @@ static int map_sg(struct device *dev, struct scatterlist *sglist,
int mapped_elems = 0;
u64 dma_mask;
+ INC_STATS_COUNTER(cnt_map_sg);
+
if (!check_device(dev))
return 0;
@@ -1114,6 +1414,9 @@ static int map_sg(struct device *dev, struct scatterlist *sglist,
if (!iommu || !domain)
return map_sg_no_iommu(dev, sglist, nelems, dir);
+ if (!dma_ops_domain(domain))
+ return 0;
+
spin_lock_irqsave(&domain->lock, flags);
for_each_sg(sglist, s, nelems, i) {
@@ -1163,10 +1466,15 @@ static void unmap_sg(struct device *dev, struct scatterlist *sglist,
u16 devid;
int i;
+ INC_STATS_COUNTER(cnt_unmap_sg);
+
if (!check_device(dev) ||
!get_device_resources(dev, &iommu, &domain, &devid))
return;
+ if (!dma_ops_domain(domain))
+ return;
+
spin_lock_irqsave(&domain->lock, flags);
for_each_sg(sglist, s, nelems, i) {
@@ -1194,6 +1502,8 @@ static void *alloc_coherent(struct device *dev, size_t size,
phys_addr_t paddr;
u64 dma_mask = dev->coherent_dma_mask;
+ INC_STATS_COUNTER(cnt_alloc_coherent);
+
if (!check_device(dev))
return NULL;
@@ -1212,6 +1522,9 @@ static void *alloc_coherent(struct device *dev, size_t size,
return virt_addr;
}
+ if (!dma_ops_domain(domain))
+ goto out_free;
+
if (!dma_mask)
dma_mask = *dev->dma_mask;
@@ -1220,18 +1533,20 @@ static void *alloc_coherent(struct device *dev, size_t size,
*dma_addr = __map_single(dev, iommu, domain->priv, paddr,
size, DMA_BIDIRECTIONAL, true, dma_mask);
- if (*dma_addr == bad_dma_address) {
- free_pages((unsigned long)virt_addr, get_order(size));
- virt_addr = NULL;
- goto out;
- }
+ if (*dma_addr == bad_dma_address)
+ goto out_free;
iommu_completion_wait(iommu);
-out:
spin_unlock_irqrestore(&domain->lock, flags);
return virt_addr;
+
+out_free:
+
+ free_pages((unsigned long)virt_addr, get_order(size));
+
+ return NULL;
}
/*
@@ -1245,6 +1560,8 @@ static void free_coherent(struct device *dev, size_t size,
struct protection_domain *domain;
u16 devid;
+ INC_STATS_COUNTER(cnt_free_coherent);
+
if (!check_device(dev))
return;
@@ -1253,6 +1570,9 @@ static void free_coherent(struct device *dev, size_t size,
if (!iommu || !domain)
goto free_mem;
+ if (!dma_ops_domain(domain))
+ goto free_mem;
+
spin_lock_irqsave(&domain->lock, flags);
__unmap_single(iommu, domain->priv, dma_addr, size, DMA_BIDIRECTIONAL);
@@ -1296,7 +1616,7 @@ static int amd_iommu_dma_supported(struct device *dev, u64 mask)
* we don't need to preallocate the protection domains anymore.
* For now we have to.
*/
-void prealloc_protection_domains(void)
+static void prealloc_protection_domains(void)
{
struct pci_dev *dev = NULL;
struct dma_ops_domain *dma_dom;
@@ -1305,7 +1625,7 @@ void prealloc_protection_domains(void)
u16 devid;
while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
- devid = (dev->bus->number << 8) | dev->devfn;
+ devid = calc_devid(dev->bus->number, dev->devfn);
if (devid > amd_iommu_last_bdf)
continue;
devid = amd_iommu_alias_table[devid];
@@ -1352,6 +1672,7 @@ int __init amd_iommu_init_dma_ops(void)
iommu->default_dom = dma_ops_domain_alloc(iommu, order);
if (iommu->default_dom == NULL)
return -ENOMEM;
+ iommu->default_dom->domain.flags |= PD_DEFAULT_MASK;
ret = iommu_init_unity_mappings(iommu);
if (ret)
goto free_domains;
@@ -1375,6 +1696,12 @@ int __init amd_iommu_init_dma_ops(void)
/* Make the driver finally visible to the drivers */
dma_ops = &amd_iommu_dma_ops;
+ register_iommu(&amd_iommu_ops);
+
+ bus_register_notifier(&pci_bus_type, &device_nb);
+
+ amd_iommu_stats_init();
+
return 0;
free_domains:
@@ -1386,3 +1713,224 @@ free_domains:
return ret;
}
+
+/*****************************************************************************
+ *
+ * The following functions belong to the exported interface of AMD IOMMU
+ *
+ * This interface allows access to lower level functions of the IOMMU
+ * like protection domain handling and assignement of devices to domains
+ * which is not possible with the dma_ops interface.
+ *
+ *****************************************************************************/
+
+static void cleanup_domain(struct protection_domain *domain)
+{
+ unsigned long flags;
+ u16 devid;
+
+ write_lock_irqsave(&amd_iommu_devtable_lock, flags);
+
+ for (devid = 0; devid <= amd_iommu_last_bdf; ++devid)
+ if (amd_iommu_pd_table[devid] == domain)
+ __detach_device(domain, devid);
+
+ write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
+}
+
+static int amd_iommu_domain_init(struct iommu_domain *dom)
+{
+ struct protection_domain *domain;
+
+ domain = kzalloc(sizeof(*domain), GFP_KERNEL);
+ if (!domain)
+ return -ENOMEM;
+
+ spin_lock_init(&domain->lock);
+ domain->mode = PAGE_MODE_3_LEVEL;
+ domain->id = domain_id_alloc();
+ if (!domain->id)
+ goto out_free;
+ domain->pt_root = (void *)get_zeroed_page(GFP_KERNEL);
+ if (!domain->pt_root)
+ goto out_free;
+
+ dom->priv = domain;
+
+ return 0;
+
+out_free:
+ kfree(domain);
+
+ return -ENOMEM;
+}
+
+static void amd_iommu_domain_destroy(struct iommu_domain *dom)
+{
+ struct protection_domain *domain = dom->priv;
+
+ if (!domain)
+ return;
+
+ if (domain->dev_cnt > 0)
+ cleanup_domain(domain);
+
+ BUG_ON(domain->dev_cnt != 0);
+
+ free_pagetable(domain);
+
+ domain_id_free(domain->id);
+
+ kfree(domain);
+
+ dom->priv = NULL;
+}
+
+static void amd_iommu_detach_device(struct iommu_domain *dom,
+ struct device *dev)
+{
+ struct protection_domain *domain = dom->priv;
+ struct amd_iommu *iommu;
+ struct pci_dev *pdev;
+ u16 devid;
+
+ if (dev->bus != &pci_bus_type)
+ return;
+
+ pdev = to_pci_dev(dev);
+
+ devid = calc_devid(pdev->bus->number, pdev->devfn);
+
+ if (devid > 0)
+ detach_device(domain, devid);
+
+ iommu = amd_iommu_rlookup_table[devid];
+ if (!iommu)
+ return;
+
+ iommu_queue_inv_dev_entry(iommu, devid);
+ iommu_completion_wait(iommu);
+}
+
+static int amd_iommu_attach_device(struct iommu_domain *dom,
+ struct device *dev)
+{
+ struct protection_domain *domain = dom->priv;
+ struct protection_domain *old_domain;
+ struct amd_iommu *iommu;
+ struct pci_dev *pdev;
+ u16 devid;
+
+ if (dev->bus != &pci_bus_type)
+ return -EINVAL;
+
+ pdev = to_pci_dev(dev);
+
+ devid = calc_devid(pdev->bus->number, pdev->devfn);
+
+ if (devid >= amd_iommu_last_bdf ||
+ devid != amd_iommu_alias_table[devid])
+ return -EINVAL;
+
+ iommu = amd_iommu_rlookup_table[devid];
+ if (!iommu)
+ return -EINVAL;
+
+ old_domain = domain_for_device(devid);
+ if (old_domain)
+ return -EBUSY;
+
+ attach_device(iommu, domain, devid);
+
+ iommu_completion_wait(iommu);
+
+ return 0;
+}
+
+static int amd_iommu_map_range(struct iommu_domain *dom,
+ unsigned long iova, phys_addr_t paddr,
+ size_t size, int iommu_prot)
+{
+ struct protection_domain *domain = dom->priv;
+ unsigned long i, npages = iommu_num_pages(paddr, size, PAGE_SIZE);
+ int prot = 0;
+ int ret;
+
+ if (iommu_prot & IOMMU_READ)
+ prot |= IOMMU_PROT_IR;
+ if (iommu_prot & IOMMU_WRITE)
+ prot |= IOMMU_PROT_IW;
+
+ iova &= PAGE_MASK;
+ paddr &= PAGE_MASK;
+
+ for (i = 0; i < npages; ++i) {
+ ret = iommu_map_page(domain, iova, paddr, prot);
+ if (ret)
+ return ret;
+
+ iova += PAGE_SIZE;
+ paddr += PAGE_SIZE;
+ }
+
+ return 0;
+}
+
+static void amd_iommu_unmap_range(struct iommu_domain *dom,
+ unsigned long iova, size_t size)
+{
+
+ struct protection_domain *domain = dom->priv;
+ unsigned long i, npages = iommu_num_pages(iova, size, PAGE_SIZE);
+
+ iova &= PAGE_MASK;
+
+ for (i = 0; i < npages; ++i) {
+ iommu_unmap_page(domain, iova);
+ iova += PAGE_SIZE;
+ }
+
+ iommu_flush_domain(domain->id);
+}
+
+static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom,
+ unsigned long iova)
+{
+ struct protection_domain *domain = dom->priv;
+ unsigned long offset = iova & ~PAGE_MASK;
+ phys_addr_t paddr;
+ u64 *pte;
+
+ pte = &domain->pt_root[IOMMU_PTE_L2_INDEX(iova)];
+
+ if (!IOMMU_PTE_PRESENT(*pte))
+ return 0;
+
+ pte = IOMMU_PTE_PAGE(*pte);
+ pte = &pte[IOMMU_PTE_L1_INDEX(iova)];
+
+ if (!IOMMU_PTE_PRESENT(*pte))
+ return 0;
+
+ pte = IOMMU_PTE_PAGE(*pte);
+ pte = &pte[IOMMU_PTE_L0_INDEX(iova)];
+
+ if (!IOMMU_PTE_PRESENT(*pte))
+ return 0;
+
+ paddr = *pte & IOMMU_PAGE_MASK;
+ paddr |= offset;
+
+ return paddr;
+}
+
+static struct iommu_ops amd_iommu_ops = {
+ .domain_init = amd_iommu_domain_init,
+ .domain_destroy = amd_iommu_domain_destroy,
+ .attach_dev = amd_iommu_attach_device,
+ .detach_dev = amd_iommu_detach_device,
+ .map = amd_iommu_map_range,
+ .unmap = amd_iommu_unmap_range,
+ .iova_to_phys = amd_iommu_iova_to_phys,
+};
+
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index c625800c55ca..42c33cebf00f 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -122,7 +122,8 @@ u16 amd_iommu_last_bdf; /* largest PCI device id we have
LIST_HEAD(amd_iommu_unity_map); /* a list of required unity mappings
we find in ACPI */
unsigned amd_iommu_aperture_order = 26; /* size of aperture in power of 2 */
-int amd_iommu_isolate = 1; /* if 1, device isolation is enabled */
+bool amd_iommu_isolate = true; /* if true, device isolation is
+ enabled */
bool amd_iommu_unmap_flush; /* if true, flush on every unmap */
LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the
@@ -243,20 +244,16 @@ static void __init iommu_feature_disable(struct amd_iommu *iommu, u8 bit)
}
/* Function to enable the hardware */
-void __init iommu_enable(struct amd_iommu *iommu)
+static void __init iommu_enable(struct amd_iommu *iommu)
{
- printk(KERN_INFO "AMD IOMMU: Enabling IOMMU "
- "at %02x:%02x.%x cap 0x%hx\n",
- iommu->dev->bus->number,
- PCI_SLOT(iommu->dev->devfn),
- PCI_FUNC(iommu->dev->devfn),
- iommu->cap_ptr);
+ printk(KERN_INFO "AMD IOMMU: Enabling IOMMU at %s cap 0x%hx\n",
+ dev_name(&iommu->dev->dev), iommu->cap_ptr);
iommu_feature_enable(iommu, CONTROL_IOMMU_EN);
}
/* Function to enable IOMMU event logging and event interrupts */
-void __init iommu_enable_event_logging(struct amd_iommu *iommu)
+static void __init iommu_enable_event_logging(struct amd_iommu *iommu)
{
iommu_feature_enable(iommu, CONTROL_EVT_LOG_EN);
iommu_feature_enable(iommu, CONTROL_EVT_INT_EN);
@@ -1218,9 +1215,9 @@ static int __init parse_amd_iommu_options(char *str)
{
for (; *str; ++str) {
if (strncmp(str, "isolate", 7) == 0)
- amd_iommu_isolate = 1;
+ amd_iommu_isolate = true;
if (strncmp(str, "share", 5) == 0)
- amd_iommu_isolate = 0;
+ amd_iommu_isolate = false;
if (strncmp(str, "fullflush", 9) == 0)
amd_iommu_unmap_flush = true;
}
diff --git a/arch/x86/kernel/apic.c b/arch/x86/kernel/apic.c
index 99589245fd8d..b13d3c4dbd42 100644
--- a/arch/x86/kernel/apic.c
+++ b/arch/x86/kernel/apic.c
@@ -98,8 +98,8 @@ __setup("apicpmtimer", setup_apicpmtimer);
#ifdef HAVE_X2APIC
int x2apic;
/* x2apic enabled before OS handover */
-int x2apic_preenabled;
-int disable_x2apic;
+static int x2apic_preenabled;
+static int disable_x2apic;
static __init int setup_nox2apic(char *str)
{
disable_x2apic = 1;
@@ -226,7 +226,7 @@ void xapic_icr_write(u32 low, u32 id)
apic_write(APIC_ICR, low);
}
-u64 xapic_icr_read(void)
+static u64 xapic_icr_read(void)
{
u32 icr1, icr2;
@@ -266,7 +266,7 @@ void x2apic_icr_write(u32 low, u32 id)
wrmsrl(APIC_BASE_MSR + (APIC_ICR >> 4), ((__u64) id) << 32 | low);
}
-u64 x2apic_icr_read(void)
+static u64 x2apic_icr_read(void)
{
unsigned long val;
diff --git a/arch/x86/kernel/bios_uv.c b/arch/x86/kernel/bios_uv.c
index 2a0a2a3cac26..f63882728d91 100644
--- a/arch/x86/kernel/bios_uv.c
+++ b/arch/x86/kernel/bios_uv.c
@@ -25,7 +25,7 @@
#include <asm/uv/bios.h>
#include <asm/uv/uv_hub.h>
-struct uv_systab uv_systab;
+static struct uv_systab uv_systab;
s64 uv_bios_call(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, u64 a4, u64 a5)
{
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 4e8d77f01eeb..b59ddcc88cd8 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -14,14 +14,6 @@
#include <asm/pat.h>
#include "mtrr.h"
-struct mtrr_state {
- struct mtrr_var_range var_ranges[MAX_VAR_RANGES];
- mtrr_type fixed_ranges[NUM_FIXED_RANGES];
- unsigned char enabled;
- unsigned char have_fixed;
- mtrr_type def_type;
-};
-
struct fixed_range_block {
int base_msr; /* start address of an MTRR block */
int ranges; /* number of MTRRs in this block */
@@ -35,10 +27,12 @@ static struct fixed_range_block fixed_range_blocks[] = {
};
static unsigned long smp_changes_mask;
-static struct mtrr_state mtrr_state = {};
static int mtrr_state_set;
u64 mtrr_tom2;
+struct mtrr_state_type mtrr_state = {};
+EXPORT_SYMBOL_GPL(mtrr_state);
+
#undef MODULE_PARAM_PREFIX
#define MODULE_PARAM_PREFIX "mtrr."
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index 1159e269e596..d259e5d2e054 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -49,7 +49,7 @@
u32 num_var_ranges = 0;
-unsigned int mtrr_usage_table[MAX_VAR_RANGES];
+unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES];
static DEFINE_MUTEX(mtrr_mutex);
u64 size_or_mask, size_and_mask;
@@ -574,7 +574,7 @@ struct mtrr_value {
unsigned long lsize;
};
-static struct mtrr_value mtrr_state[MAX_VAR_RANGES];
+static struct mtrr_value mtrr_state[MTRR_MAX_VAR_RANGES];
static int mtrr_save(struct sys_device * sysdev, pm_message_t state)
{
@@ -824,16 +824,14 @@ static int enable_mtrr_cleanup __initdata =
static int __init disable_mtrr_cleanup_setup(char *str)
{
- if (enable_mtrr_cleanup != -1)
- enable_mtrr_cleanup = 0;
+ enable_mtrr_cleanup = 0;
return 0;
}
early_param("disable_mtrr_cleanup", disable_mtrr_cleanup_setup);
static int __init enable_mtrr_cleanup_setup(char *str)
{
- if (enable_mtrr_cleanup != -1)
- enable_mtrr_cleanup = 1;
+ enable_mtrr_cleanup = 1;
return 0;
}
early_param("enable_mtrr_cleanup", enable_mtrr_cleanup_setup);
diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h
index 2dc4ec656b23..ffd60409cc6d 100644
--- a/arch/x86/kernel/cpu/mtrr/mtrr.h
+++ b/arch/x86/kernel/cpu/mtrr/mtrr.h
@@ -8,11 +8,6 @@
#define MTRRcap_MSR 0x0fe
#define MTRRdefType_MSR 0x2ff
-#define MTRRphysBase_MSR(reg) (0x200 + 2 * (reg))
-#define MTRRphysMask_MSR(reg) (0x200 + 2 * (reg) + 1)
-
-#define NUM_FIXED_RANGES 88
-#define MAX_VAR_RANGES 256
#define MTRRfix64K_00000_MSR 0x250
#define MTRRfix16K_80000_MSR 0x258
#define MTRRfix16K_A0000_MSR 0x259
@@ -29,11 +24,7 @@
#define MTRR_CHANGE_MASK_VARIABLE 0x02
#define MTRR_CHANGE_MASK_DEFTYPE 0x04
-/* In the Intel processor's MTRR interface, the MTRR type is always held in
- an 8 bit field: */
-typedef u8 mtrr_type;
-
-extern unsigned int mtrr_usage_table[MAX_VAR_RANGES];
+extern unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES];
struct mtrr_ops {
u32 vendor;
@@ -70,13 +61,6 @@ struct set_mtrr_context {
u32 ccr3;
};
-struct mtrr_var_range {
- u32 base_lo;
- u32 base_hi;
- u32 mask_lo;
- u32 mask_hi;
-};
-
void set_mtrr_done(struct set_mtrr_context *ctxt);
void set_mtrr_cache_disable(struct set_mtrr_context *ctxt);
void set_mtrr_prepare_save(struct set_mtrr_context *ctxt);
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c
index 62a3c23bd703..2ac1f0c2beb3 100644
--- a/arch/x86/kernel/cpuid.c
+++ b/arch/x86/kernel/cpuid.c
@@ -39,10 +39,10 @@
#include <linux/device.h>
#include <linux/cpu.h>
#include <linux/notifier.h>
+#include <linux/uaccess.h>
#include <asm/processor.h>
#include <asm/msr.h>
-#include <asm/uaccess.h>
#include <asm/system.h>
static struct class *cpuid_class;
@@ -82,7 +82,7 @@ static loff_t cpuid_seek(struct file *file, loff_t offset, int orig)
}
static ssize_t cpuid_read(struct file *file, char __user *buf,
- size_t count, loff_t * ppos)
+ size_t count, loff_t *ppos)
{
char __user *tmp = buf;
struct cpuid_regs cmd;
@@ -117,7 +117,7 @@ static int cpuid_open(struct inode *inode, struct file *file)
unsigned int cpu;
struct cpuinfo_x86 *c;
int ret = 0;
-
+
lock_kernel();
cpu = iminor(file->f_path.dentry->d_inode);
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index d84a852e4cd7..c689d19e35ab 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -26,6 +26,7 @@
#include <linux/kdebug.h>
#include <asm/smp.h>
#include <asm/reboot.h>
+#include <asm/virtext.h>
#include <mach_ipi.h>
@@ -49,6 +50,15 @@ static void kdump_nmi_callback(int cpu, struct die_args *args)
#endif
crash_save_cpu(regs, cpu);
+ /* Disable VMX or SVM if needed.
+ *
+ * We need to disable virtualization on all CPUs.
+ * Having VMX or SVM enabled on any CPU may break rebooting
+ * after the kdump kernel has finished its task.
+ */
+ cpu_emergency_vmxoff();
+ cpu_emergency_svm_disable();
+
disable_local_APIC();
}
@@ -80,6 +90,14 @@ void native_machine_crash_shutdown(struct pt_regs *regs)
local_irq_disable();
kdump_nmi_shootdown_cpus();
+
+ /* Booting kdump kernel with VMX or SVM enabled won't work,
+ * because (among other limitations) we can't disable paging
+ * with the virt flags.
+ */
+ cpu_emergency_vmxoff();
+ cpu_emergency_svm_disable();
+
lapic_shutdown();
#if defined(CONFIG_X86_IO_APIC)
disable_IO_APIC();
diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c
index 23b138e31e9c..504ad198e4ad 100644
--- a/arch/x86/kernel/early_printk.c
+++ b/arch/x86/kernel/early_printk.c
@@ -886,7 +886,7 @@ asmlinkage void early_printk(const char *fmt, ...)
va_list ap;
va_start(ap, fmt);
- n = vscnprintf(buf, 512, fmt, ap);
+ n = vscnprintf(buf, sizeof(buf), fmt, ap);
early_console->write(early_console, buf, n);
va_end(ap);
}
diff --git a/arch/x86/kernel/genx2apic_phys.c b/arch/x86/kernel/genx2apic_phys.c
index 62895cf315ff..21bcc0e098ba 100644
--- a/arch/x86/kernel/genx2apic_phys.c
+++ b/arch/x86/kernel/genx2apic_phys.c
@@ -161,12 +161,12 @@ static unsigned int phys_pkg_id(int index_msb)
return current_cpu_data.initial_apicid >> index_msb;
}
-void x2apic_send_IPI_self(int vector)
+static void x2apic_send_IPI_self(int vector)
{
apic_write(APIC_SELF_IPI, vector);
}
-void init_x2apic_ldr(void)
+static void init_x2apic_ldr(void)
{
return;
}
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 388e05a5fc17..b9a4d8c4b935 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -27,7 +27,7 @@
#include <asm/trampoline.h>
/* boot cpu pda */
-static struct x8664_pda _boot_cpu_pda __read_mostly;
+static struct x8664_pda _boot_cpu_pda;
#ifdef CONFIG_SMP
/*
diff --git a/arch/x86/kernel/init_task.c b/arch/x86/kernel/init_task.c
index d39918076bb4..df3bf269beab 100644
--- a/arch/x86/kernel/init_task.c
+++ b/arch/x86/kernel/init_task.c
@@ -10,7 +10,6 @@
#include <asm/pgtable.h>
#include <asm/desc.h>
-static struct fs_struct init_fs = INIT_FS;
static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
struct mm_struct init_mm = INIT_MM(init_mm);
diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c
index a25c3f76b8ac..3639442aa7a4 100644
--- a/arch/x86/kernel/io_apic.c
+++ b/arch/x86/kernel/io_apic.c
@@ -170,7 +170,7 @@ static struct irq_cfg irq_cfgx[NR_IRQS] = {
[15] = { .vector = IRQ15_VECTOR, },
};
-void __init arch_early_irq_init(void)
+int __init arch_early_irq_init(void)
{
struct irq_cfg *cfg;
struct irq_desc *desc;
@@ -188,6 +188,8 @@ void __init arch_early_irq_init(void)
if (i < NR_IRQS_LEGACY)
cpumask_setall(cfg[i].domain);
}
+
+ return 0;
}
#ifdef CONFIG_SPARSE_IRQ
@@ -230,7 +232,7 @@ static struct irq_cfg *get_one_free_irq_cfg(int cpu)
return cfg;
}
-void arch_init_chip_data(struct irq_desc *desc, int cpu)
+int arch_init_chip_data(struct irq_desc *desc, int cpu)
{
struct irq_cfg *cfg;
@@ -242,6 +244,8 @@ void arch_init_chip_data(struct irq_desc *desc, int cpu)
BUG_ON(1);
}
}
+
+ return 0;
}
#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC
@@ -702,7 +706,7 @@ static void __unmask_IO_APIC_irq(struct irq_cfg *cfg)
}
#ifdef CONFIG_X86_64
-void io_apic_sync(struct irq_pin_list *entry)
+static void io_apic_sync(struct irq_pin_list *entry)
{
/*
* Synchronize the IO-APIC and the CPU by doing
@@ -1400,8 +1404,6 @@ void __setup_vector_irq(int cpu)
/* Mark the inuse vectors */
for_each_irq_desc(irq, desc) {
- if (!desc)
- continue;
cfg = desc->chip_data;
if (!cpumask_test_cpu(cpu, cfg->domain))
continue;
@@ -1783,8 +1785,6 @@ __apicdebuginit(void) print_IO_APIC(void)
for_each_irq_desc(irq, desc) {
struct irq_pin_list *entry;
- if (!desc)
- continue;
cfg = desc->chip_data;
entry = cfg->irq_2_pin;
if (!entry)
@@ -2425,9 +2425,6 @@ static void ir_irq_migration(struct work_struct *work)
struct irq_desc *desc;
for_each_irq_desc(irq, desc) {
- if (!desc)
- continue;
-
if (desc->status & IRQ_MOVE_PENDING) {
unsigned long flags;
@@ -2713,9 +2710,6 @@ static inline void init_IO_APIC_traps(void)
* 0x80, because int 0x80 is hm, kind of importantish. ;)
*/
for_each_irq_desc(irq, desc) {
- if (!desc)
- continue;
-
cfg = desc->chip_data;
if (IO_APIC_IRQ(irq) && cfg && !cfg->vector) {
/*
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index e169ae9b6a62..652fce6d2cce 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -89,17 +89,17 @@ static cycle_t kvm_clock_read(void)
*/
static unsigned long kvm_get_tsc_khz(void)
{
- return preset_lpj;
+ struct pvclock_vcpu_time_info *src;
+ src = &per_cpu(hv_clock, 0);
+ return pvclock_tsc_khz(src);
}
static void kvm_get_preset_lpj(void)
{
- struct pvclock_vcpu_time_info *src;
unsigned long khz;
u64 lpj;
- src = &per_cpu(hv_clock, 0);
- khz = pvclock_tsc_khz(src);
+ khz = kvm_get_tsc_khz();
lpj = ((u64)khz * 1000);
do_div(lpj, HZ);
@@ -194,5 +194,7 @@ void __init kvmclock_init(void)
#endif
kvm_get_preset_lpj();
clocksource_register(&kvm_clock);
+ pv_info.paravirt_enabled = 1;
+ pv_info.name = "KVM";
}
}
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index eee32b43fee3..71f1d99a635d 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -12,8 +12,8 @@
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/vmalloc.h>
+#include <linux/uaccess.h>
-#include <asm/uaccess.h>
#include <asm/system.h>
#include <asm/ldt.h>
#include <asm/desc.h>
@@ -93,7 +93,7 @@ static inline int copy_ldt(mm_context_t *new, mm_context_t *old)
if (err < 0)
return err;
- for(i = 0; i < old->size; i++)
+ for (i = 0; i < old->size; i++)
write_ldt_entry(new->ldt, i, old->ldt + i * LDT_ENTRY_SIZE);
return 0;
}
diff --git a/arch/x86/kernel/mmconf-fam10h_64.c b/arch/x86/kernel/mmconf-fam10h_64.c
index efc2f361fe85..666e43df51f9 100644
--- a/arch/x86/kernel/mmconf-fam10h_64.c
+++ b/arch/x86/kernel/mmconf-fam10h_64.c
@@ -13,8 +13,7 @@
#include <asm/msr.h>
#include <asm/acpi.h>
#include <asm/mmconfig.h>
-
-#include "../pci/pci.h"
+#include <asm/pci_x86.h>
struct pci_hostbridge_probe {
u32 bus;
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index 45e3b69808ba..c5c5b8df1dbc 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -16,14 +16,14 @@
#include <linux/bitops.h>
#include <linux/acpi.h>
#include <linux/module.h>
+#include <linux/smp.h>
+#include <linux/acpi.h>
-#include <asm/smp.h>
#include <asm/mtrr.h>
#include <asm/mpspec.h>
#include <asm/pgalloc.h>
#include <asm/io_apic.h>
#include <asm/proto.h>
-#include <asm/acpi.h>
#include <asm/bios_ebda.h>
#include <asm/e820.h>
#include <asm/trampoline.h>
@@ -95,8 +95,8 @@ static void __init MP_bus_info(struct mpc_config_bus *m)
#endif
if (strncmp(str, BUSTYPE_ISA, sizeof(BUSTYPE_ISA) - 1) == 0) {
- set_bit(m->mpc_busid, mp_bus_not_pci);
-#if defined(CONFIG_EISA) || defined (CONFIG_MCA)
+ set_bit(m->mpc_busid, mp_bus_not_pci);
+#if defined(CONFIG_EISA) || defined(CONFIG_MCA)
mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA;
#endif
} else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI) - 1) == 0) {
@@ -104,7 +104,7 @@ static void __init MP_bus_info(struct mpc_config_bus *m)
x86_quirks->mpc_oem_pci_bus(m);
clear_bit(m->mpc_busid, mp_bus_not_pci);
-#if defined(CONFIG_EISA) || defined (CONFIG_MCA)
+#if defined(CONFIG_EISA) || defined(CONFIG_MCA)
mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI;
} else if (strncmp(str, BUSTYPE_EISA, sizeof(BUSTYPE_EISA) - 1) == 0) {
mp_bus_id_to_type[m->mpc_busid] = MP_BUS_EISA;
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index 8bd1bf9622a7..45a09ccdc214 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -26,11 +26,10 @@
#include <linux/kernel_stat.h>
#include <linux/kdebug.h>
#include <linux/smp.h>
+#include <linux/nmi.h>
#include <asm/i8259.h>
#include <asm/io_apic.h>
-#include <asm/smp.h>
-#include <asm/nmi.h>
#include <asm/proto.h>
#include <asm/timer.h>
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
index a35eaa379ff6..00c2bcd41463 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -52,7 +52,7 @@ static u32 *iommu_gatt_base; /* Remapping table */
* to trigger bugs with some popular PCI cards, in particular 3ware (but
* has been also also seen with Qlogic at least).
*/
-int iommu_fullflush = 1;
+static int iommu_fullflush = 1;
/* Allocation bitmap for the remapping area: */
static DEFINE_SPINLOCK(iommu_bitmap_lock);
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index de4a9d643bee..2b46eb41643b 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -12,6 +12,8 @@
#include <asm/proto.h>
#include <asm/reboot_fixups.h>
#include <asm/reboot.h>
+#include <asm/pci_x86.h>
+#include <asm/virtext.h>
#ifdef CONFIG_X86_32
# include <linux/dmi.h>
@@ -23,7 +25,6 @@
#include <mach_ipi.h>
-
/*
* Power off function, if any
*/
@@ -39,6 +40,12 @@ int reboot_force;
static int reboot_cpu = -1;
#endif
+/* This is set if we need to go through the 'emergency' path.
+ * When machine_emergency_restart() is called, we may be on
+ * an inconsistent state and won't be able to do a clean cleanup
+ */
+static int reboot_emergency;
+
/* This is set by the PCI code if either type 1 or type 2 PCI is detected */
bool port_cf9_safe = false;
@@ -368,6 +375,48 @@ static inline void kb_wait(void)
}
}
+static void vmxoff_nmi(int cpu, struct die_args *args)
+{
+ cpu_emergency_vmxoff();
+}
+
+/* Use NMIs as IPIs to tell all CPUs to disable virtualization
+ */
+static void emergency_vmx_disable_all(void)
+{
+ /* Just make sure we won't change CPUs while doing this */
+ local_irq_disable();
+
+ /* We need to disable VMX on all CPUs before rebooting, otherwise
+ * we risk hanging up the machine, because the CPU ignore INIT
+ * signals when VMX is enabled.
+ *
+ * We can't take any locks and we may be on an inconsistent
+ * state, so we use NMIs as IPIs to tell the other CPUs to disable
+ * VMX and halt.
+ *
+ * For safety, we will avoid running the nmi_shootdown_cpus()
+ * stuff unnecessarily, but we don't have a way to check
+ * if other CPUs have VMX enabled. So we will call it only if the
+ * CPU we are running on has VMX enabled.
+ *
+ * We will miss cases where VMX is not enabled on all CPUs. This
+ * shouldn't do much harm because KVM always enable VMX on all
+ * CPUs anyway. But we can miss it on the small window where KVM
+ * is still enabling VMX.
+ */
+ if (cpu_has_vmx() && cpu_vmx_enabled()) {
+ /* Disable VMX on this CPU.
+ */
+ cpu_vmxoff();
+
+ /* Halt and disable VMX on the other CPUs */
+ nmi_shootdown_cpus(vmxoff_nmi);
+
+ }
+}
+
+
void __attribute__((weak)) mach_reboot_fixups(void)
{
}
@@ -376,6 +425,9 @@ static void native_machine_emergency_restart(void)
{
int i;
+ if (reboot_emergency)
+ emergency_vmx_disable_all();
+
/* Tell the BIOS if we want cold or warm reboot */
*((unsigned short *)__va(0x472)) = reboot_mode;
@@ -482,13 +534,19 @@ void native_machine_shutdown(void)
#endif
}
+static void __machine_emergency_restart(int emergency)
+{
+ reboot_emergency = emergency;
+ machine_ops.emergency_restart();
+}
+
static void native_machine_restart(char *__unused)
{
printk("machine restart\n");
if (!reboot_force)
machine_shutdown();
- machine_emergency_restart();
+ __machine_emergency_restart(0);
}
static void native_machine_halt(void)
@@ -532,7 +590,7 @@ void machine_shutdown(void)
void machine_emergency_restart(void)
{
- machine_ops.emergency_restart();
+ __machine_emergency_restart(1);
}
void machine_restart(char *cmd)
diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
index 6a00e5faaa74..f885023167e0 100644
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ -582,7 +582,6 @@ static int __init uv_ptc_init(void)
static struct bau_control * __init uv_table_bases_init(int blade, int node)
{
int i;
- int *ip;
struct bau_msg_status *msp;
struct bau_control *bau_tabp;
@@ -599,13 +598,6 @@ static struct bau_control * __init uv_table_bases_init(int blade, int node)
bau_cpubits_clear(&msp->seen_by, (int)
uv_blade_nr_possible_cpus(blade));
- bau_tabp->watching =
- kmalloc_node(sizeof(int) * DEST_NUM_RESOURCES, GFP_KERNEL, node);
- BUG_ON(!bau_tabp->watching);
-
- for (i = 0, ip = bau_tabp->watching; i < DEST_Q_SIZE; i++, ip++)
- *ip = 0;
-
uv_bau_table_bases[blade] = bau_tabp;
return bau_tabp;
@@ -628,7 +620,6 @@ uv_table_bases_finish(int blade, int node, int cur_cpu,
bcp->bau_msg_head = bau_tablesp->va_queue_first;
bcp->va_queue_first = bau_tablesp->va_queue_first;
bcp->va_queue_last = bau_tablesp->va_queue_last;
- bcp->watching = bau_tablesp->watching;
bcp->msg_statuses = bau_tablesp->msg_statuses;
bcp->descriptor_base = adp;
}
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 2d1f4c7e4052..ce6650eb64e9 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -292,8 +292,10 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
tsk->thread.error_code = error_code;
tsk->thread.trap_no = 8;
- /* This is always a kernel trap and never fixable (and thus must
- never return). */
+ /*
+ * This is always a kernel trap and never fixable (and thus must
+ * never return).
+ */
for (;;)
die(str, regs, error_code);
}
@@ -520,9 +522,11 @@ dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code)
}
#ifdef CONFIG_X86_64
-/* Help handler running on IST stack to switch back to user stack
- for scheduling or signal handling. The actual stack switch is done in
- entry.S */
+/*
+ * Help handler running on IST stack to switch back to user stack
+ * for scheduling or signal handling. The actual stack switch is done in
+ * entry.S
+ */
asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs)
{
struct pt_regs *regs = eregs;
@@ -532,8 +536,10 @@ asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs)
/* Exception from user space */
else if (user_mode(eregs))
regs = task_pt_regs(current);
- /* Exception from kernel and interrupts are enabled. Move to
- kernel process stack. */
+ /*
+ * Exception from kernel and interrupts are enabled. Move to
+ * kernel process stack.
+ */
else if (eregs->flags & X86_EFLAGS_IF)
regs = (struct pt_regs *)(eregs->sp -= sizeof(struct pt_regs));
if (eregs != regs)
@@ -685,12 +691,7 @@ void math_error(void __user *ip)
cwd = get_fpu_cwd(task);
swd = get_fpu_swd(task);
- err = swd & ~cwd & 0x3f;
-
-#ifdef CONFIG_X86_32
- if (!err)
- return;
-#endif
+ err = swd & ~cwd;
if (err & 0x001) { /* Invalid op */
/*
@@ -708,7 +709,11 @@ void math_error(void __user *ip)
} else if (err & 0x020) { /* Precision */
info.si_code = FPE_FLTRES;
} else {
- info.si_code = __SI_FAULT|SI_KERNEL; /* WTF? */
+ /*
+ * If we're using IRQ 13, or supposedly even some trap 16
+ * implementations, it's possible we get a spurious trap...
+ */
+ return; /* Spurious trap, no error */
}
force_sig_info(SIGFPE, &info, task);
}
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index 15c3e6999182..2b54fe002e94 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -159,7 +159,7 @@ int save_i387_xstate(void __user *buf)
* Restore the extended state if present. Otherwise, restore the FP/SSE
* state.
*/
-int restore_user_xstate(void __user *buf)
+static int restore_user_xstate(void __user *buf)
{
struct _fpx_sw_bytes fx_sw_user;
u64 mask;
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index c02343594b4d..d3ec292f00f2 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -7,8 +7,8 @@ common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
ifeq ($(CONFIG_KVM_TRACE),y)
common-objs += $(addprefix ../../../virt/kvm/, kvm_trace.o)
endif
-ifeq ($(CONFIG_DMAR),y)
-common-objs += $(addprefix ../../../virt/kvm/, vtd.o)
+ifeq ($(CONFIG_IOMMU_API),y)
+common-objs += $(addprefix ../../../virt/kvm/, iommu.o)
endif
EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 59ebd37ad79e..e665d1c623ca 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -603,10 +603,29 @@ void kvm_free_pit(struct kvm *kvm)
static void __inject_pit_timer_intr(struct kvm *kvm)
{
+ struct kvm_vcpu *vcpu;
+ int i;
+
mutex_lock(&kvm->lock);
kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1);
kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0);
mutex_unlock(&kvm->lock);
+
+ /*
+ * Provides NMI watchdog support via Virtual Wire mode.
+ * The route is: PIT -> PIC -> LVT0 in NMI mode.
+ *
+ * Note: Our Virtual Wire implementation is simplified, only
+ * propagating PIT interrupts to all VCPUs when they have set
+ * LVT0 to NMI delivery. Other PIC interrupts are just sent to
+ * VCPU0, and only if its LVT0 is in EXTINT mode.
+ */
+ if (kvm->arch.vapics_in_nmi_mode > 0)
+ for (i = 0; i < KVM_MAX_VCPUS; ++i) {
+ vcpu = kvm->vcpus[i];
+ if (vcpu)
+ kvm_apic_nmi_wd_deliver(vcpu);
+ }
}
void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index 17e41e165f1a..179dcb0103fd 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -26,10 +26,40 @@
* Port from Qemu.
*/
#include <linux/mm.h>
+#include <linux/bitops.h>
#include "irq.h"
#include <linux/kvm_host.h>
+static void pic_lock(struct kvm_pic *s)
+{
+ spin_lock(&s->lock);
+}
+
+static void pic_unlock(struct kvm_pic *s)
+{
+ struct kvm *kvm = s->kvm;
+ unsigned acks = s->pending_acks;
+ bool wakeup = s->wakeup_needed;
+ struct kvm_vcpu *vcpu;
+
+ s->pending_acks = 0;
+ s->wakeup_needed = false;
+
+ spin_unlock(&s->lock);
+
+ while (acks) {
+ kvm_notify_acked_irq(kvm, __ffs(acks));
+ acks &= acks - 1;
+ }
+
+ if (wakeup) {
+ vcpu = s->kvm->vcpus[0];
+ if (vcpu)
+ kvm_vcpu_kick(vcpu);
+ }
+}
+
static void pic_clear_isr(struct kvm_kpic_state *s, int irq)
{
s->isr &= ~(1 << irq);
@@ -136,17 +166,21 @@ static void pic_update_irq(struct kvm_pic *s)
void kvm_pic_update_irq(struct kvm_pic *s)
{
+ pic_lock(s);
pic_update_irq(s);
+ pic_unlock(s);
}
void kvm_pic_set_irq(void *opaque, int irq, int level)
{
struct kvm_pic *s = opaque;
+ pic_lock(s);
if (irq >= 0 && irq < PIC_NUM_PINS) {
pic_set_irq1(&s->pics[irq >> 3], irq & 7, level);
pic_update_irq(s);
}
+ pic_unlock(s);
}
/*
@@ -172,6 +206,7 @@ int kvm_pic_read_irq(struct kvm *kvm)
int irq, irq2, intno;
struct kvm_pic *s = pic_irqchip(kvm);
+ pic_lock(s);
irq = pic_get_irq(&s->pics[0]);
if (irq >= 0) {
pic_intack(&s->pics[0], irq);
@@ -196,6 +231,7 @@ int kvm_pic_read_irq(struct kvm *kvm)
intno = s->pics[0].irq_base + irq;
}
pic_update_irq(s);
+ pic_unlock(s);
kvm_notify_acked_irq(kvm, irq);
return intno;
@@ -203,7 +239,7 @@ int kvm_pic_read_irq(struct kvm *kvm)
void kvm_pic_reset(struct kvm_kpic_state *s)
{
- int irq, irqbase;
+ int irq, irqbase, n;
struct kvm *kvm = s->pics_state->irq_request_opaque;
struct kvm_vcpu *vcpu0 = kvm->vcpus[0];
@@ -214,8 +250,10 @@ void kvm_pic_reset(struct kvm_kpic_state *s)
for (irq = 0; irq < PIC_NUM_PINS/2; irq++) {
if (vcpu0 && kvm_apic_accept_pic_intr(vcpu0))
- if (s->irr & (1 << irq) || s->isr & (1 << irq))
- kvm_notify_acked_irq(kvm, irq+irqbase);
+ if (s->irr & (1 << irq) || s->isr & (1 << irq)) {
+ n = irq + irqbase;
+ s->pics_state->pending_acks |= 1 << n;
+ }
}
s->last_irr = 0;
s->irr = 0;
@@ -406,6 +444,7 @@ static void picdev_write(struct kvm_io_device *this,
printk(KERN_ERR "PIC: non byte write\n");
return;
}
+ pic_lock(s);
switch (addr) {
case 0x20:
case 0x21:
@@ -418,6 +457,7 @@ static void picdev_write(struct kvm_io_device *this,
elcr_ioport_write(&s->pics[addr & 1], addr, data);
break;
}
+ pic_unlock(s);
}
static void picdev_read(struct kvm_io_device *this,
@@ -431,6 +471,7 @@ static void picdev_read(struct kvm_io_device *this,
printk(KERN_ERR "PIC: non byte read\n");
return;
}
+ pic_lock(s);
switch (addr) {
case 0x20:
case 0x21:
@@ -444,6 +485,7 @@ static void picdev_read(struct kvm_io_device *this,
break;
}
*(unsigned char *)val = data;
+ pic_unlock(s);
}
/*
@@ -459,7 +501,7 @@ static void pic_irq_request(void *opaque, int level)
s->output = level;
if (vcpu && level && (s->pics[0].isr_ack & (1 << irq))) {
s->pics[0].isr_ack &= ~(1 << irq);
- kvm_vcpu_kick(vcpu);
+ s->wakeup_needed = true;
}
}
@@ -469,6 +511,8 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm)
s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL);
if (!s)
return NULL;
+ spin_lock_init(&s->lock);
+ s->kvm = kvm;
s->pics[0].elcr_mask = 0xf8;
s->pics[1].elcr_mask = 0xde;
s->irq_request = pic_irq_request;
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index f17c8f5bbf31..2bf32a03ceec 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -25,6 +25,7 @@
#include <linux/mm_types.h>
#include <linux/hrtimer.h>
#include <linux/kvm_host.h>
+#include <linux/spinlock.h>
#include "iodev.h"
#include "ioapic.h"
@@ -59,6 +60,10 @@ struct kvm_kpic_state {
};
struct kvm_pic {
+ spinlock_t lock;
+ bool wakeup_needed;
+ unsigned pending_acks;
+ struct kvm *kvm;
struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */
irq_request_func *irq_request;
void *irq_request_opaque;
@@ -87,6 +92,7 @@ void kvm_pic_reset(struct kvm_kpic_state *s);
void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec);
void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu);
void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu);
+void kvm_apic_nmi_wd_deliver(struct kvm_vcpu *vcpu);
void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu);
void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu);
void __kvm_migrate_timers(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/kvm_svm.h b/arch/x86/kvm/kvm_svm.h
index 65ef0fc2c036..8e5ee99551f6 100644
--- a/arch/x86/kvm/kvm_svm.h
+++ b/arch/x86/kvm/kvm_svm.h
@@ -7,7 +7,7 @@
#include <linux/kvm_host.h>
#include <asm/msr.h>
-#include "svm.h"
+#include <asm/svm.h>
static const u32 host_save_user_msrs[] = {
#ifdef CONFIG_X86_64
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 0fc3cab48943..afac68c0815c 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -130,6 +130,11 @@ static inline int apic_lvtt_period(struct kvm_lapic *apic)
return apic_get_reg(apic, APIC_LVTT) & APIC_LVT_TIMER_PERIODIC;
}
+static inline int apic_lvt_nmi_mode(u32 lvt_val)
+{
+ return (lvt_val & (APIC_MODE_MASK | APIC_LVT_MASKED)) == APIC_DM_NMI;
+}
+
static unsigned int apic_lvt_mask[APIC_LVT_NUM] = {
LVT_MASK | APIC_LVT_TIMER_PERIODIC, /* LVTT */
LVT_MASK | APIC_MODE_MASK, /* LVTTHMR */
@@ -354,6 +359,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
case APIC_DM_NMI:
kvm_inject_nmi(vcpu);
+ kvm_vcpu_kick(vcpu);
break;
case APIC_DM_INIT:
@@ -380,6 +386,14 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
}
break;
+ case APIC_DM_EXTINT:
+ /*
+ * Should only be called by kvm_apic_local_deliver() with LVT0,
+ * before NMI watchdog was enabled. Already handled by
+ * kvm_apic_accept_pic_intr().
+ */
+ break;
+
default:
printk(KERN_ERR "TODO: unsupported delivery mode %x\n",
delivery_mode);
@@ -663,6 +677,20 @@ static void start_apic_timer(struct kvm_lapic *apic)
apic->timer.period)));
}
+static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val)
+{
+ int nmi_wd_enabled = apic_lvt_nmi_mode(apic_get_reg(apic, APIC_LVT0));
+
+ if (apic_lvt_nmi_mode(lvt0_val)) {
+ if (!nmi_wd_enabled) {
+ apic_debug("Receive NMI setting on APIC_LVT0 "
+ "for cpu %d\n", apic->vcpu->vcpu_id);
+ apic->vcpu->kvm->arch.vapics_in_nmi_mode++;
+ }
+ } else if (nmi_wd_enabled)
+ apic->vcpu->kvm->arch.vapics_in_nmi_mode--;
+}
+
static void apic_mmio_write(struct kvm_io_device *this,
gpa_t address, int len, const void *data)
{
@@ -743,10 +771,11 @@ static void apic_mmio_write(struct kvm_io_device *this,
apic_set_reg(apic, APIC_ICR2, val & 0xff000000);
break;
+ case APIC_LVT0:
+ apic_manage_nmi_watchdog(apic, val);
case APIC_LVTT:
case APIC_LVTTHMR:
case APIC_LVTPC:
- case APIC_LVT0:
case APIC_LVT1:
case APIC_LVTERR:
/* TODO: Check vector */
@@ -961,12 +990,26 @@ int apic_has_pending_timer(struct kvm_vcpu *vcpu)
return 0;
}
-static int __inject_apic_timer_irq(struct kvm_lapic *apic)
+static int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type)
+{
+ u32 reg = apic_get_reg(apic, lvt_type);
+ int vector, mode, trig_mode;
+
+ if (apic_hw_enabled(apic) && !(reg & APIC_LVT_MASKED)) {
+ vector = reg & APIC_VECTOR_MASK;
+ mode = reg & APIC_MODE_MASK;
+ trig_mode = reg & APIC_LVT_LEVEL_TRIGGER;
+ return __apic_accept_irq(apic, mode, vector, 1, trig_mode);
+ }
+ return 0;
+}
+
+void kvm_apic_nmi_wd_deliver(struct kvm_vcpu *vcpu)
{
- int vector;
+ struct kvm_lapic *apic = vcpu->arch.apic;
- vector = apic_lvt_vector(apic, APIC_LVTT);
- return __apic_accept_irq(apic, APIC_DM_FIXED, vector, 1, 0);
+ if (apic)
+ kvm_apic_local_deliver(apic, APIC_LVT0);
}
static enum hrtimer_restart apic_timer_fn(struct hrtimer *data)
@@ -1061,9 +1104,8 @@ void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu)
{
struct kvm_lapic *apic = vcpu->arch.apic;
- if (apic && apic_lvt_enabled(apic, APIC_LVTT) &&
- atomic_read(&apic->timer.pending) > 0) {
- if (__inject_apic_timer_irq(apic))
+ if (apic && atomic_read(&apic->timer.pending) > 0) {
+ if (kvm_apic_local_deliver(apic, APIC_LVTT))
atomic_dec(&apic->timer.pending);
}
}
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 410ddbc1aa2e..83f11c7474a1 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -17,7 +17,6 @@
*
*/
-#include "vmx.h"
#include "mmu.h"
#include <linux/kvm_host.h>
@@ -33,6 +32,7 @@
#include <asm/page.h>
#include <asm/cmpxchg.h>
#include <asm/io.h>
+#include <asm/vmx.h>
/*
* When setting this variable to true it enables Two-Dimensional-Paging
@@ -168,6 +168,7 @@ static u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */
static u64 __read_mostly shadow_user_mask;
static u64 __read_mostly shadow_accessed_mask;
static u64 __read_mostly shadow_dirty_mask;
+static u64 __read_mostly shadow_mt_mask;
void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte)
{
@@ -183,13 +184,14 @@ void kvm_mmu_set_base_ptes(u64 base_pte)
EXPORT_SYMBOL_GPL(kvm_mmu_set_base_ptes);
void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
- u64 dirty_mask, u64 nx_mask, u64 x_mask)
+ u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 mt_mask)
{
shadow_user_mask = user_mask;
shadow_accessed_mask = accessed_mask;
shadow_dirty_mask = dirty_mask;
shadow_nx_mask = nx_mask;
shadow_x_mask = x_mask;
+ shadow_mt_mask = mt_mask;
}
EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
@@ -384,7 +386,9 @@ static void account_shadowed(struct kvm *kvm, gfn_t gfn)
{
int *write_count;
- write_count = slot_largepage_idx(gfn, gfn_to_memslot(kvm, gfn));
+ gfn = unalias_gfn(kvm, gfn);
+ write_count = slot_largepage_idx(gfn,
+ gfn_to_memslot_unaliased(kvm, gfn));
*write_count += 1;
}
@@ -392,16 +396,20 @@ static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn)
{
int *write_count;
- write_count = slot_largepage_idx(gfn, gfn_to_memslot(kvm, gfn));
+ gfn = unalias_gfn(kvm, gfn);
+ write_count = slot_largepage_idx(gfn,
+ gfn_to_memslot_unaliased(kvm, gfn));
*write_count -= 1;
WARN_ON(*write_count < 0);
}
static int has_wrprotected_page(struct kvm *kvm, gfn_t gfn)
{
- struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
+ struct kvm_memory_slot *slot;
int *largepage_idx;
+ gfn = unalias_gfn(kvm, gfn);
+ slot = gfn_to_memslot_unaliased(kvm, gfn);
if (slot) {
largepage_idx = slot_largepage_idx(gfn, slot);
return *largepage_idx;
@@ -613,7 +621,7 @@ static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
return NULL;
}
-static void rmap_write_protect(struct kvm *kvm, u64 gfn)
+static int rmap_write_protect(struct kvm *kvm, u64 gfn)
{
unsigned long *rmapp;
u64 *spte;
@@ -659,8 +667,7 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn)
spte = rmap_next(kvm, rmapp, spte);
}
- if (write_protected)
- kvm_flush_remote_tlbs(kvm);
+ return write_protected;
}
static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp)
@@ -786,9 +793,11 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
+ INIT_LIST_HEAD(&sp->oos_link);
ASSERT(is_empty_shadow_page(sp->spt));
- sp->slot_bitmap = 0;
+ bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
sp->multimapped = 0;
+ sp->global = 1;
sp->parent_pte = parent_pte;
--vcpu->kvm->arch.n_free_mmu_pages;
return sp;
@@ -900,8 +909,9 @@ static void kvm_mmu_update_unsync_bitmap(u64 *spte)
struct kvm_mmu_page *sp = page_header(__pa(spte));
index = spte - sp->spt;
- __set_bit(index, sp->unsync_child_bitmap);
- sp->unsync_children = 1;
+ if (!__test_and_set_bit(index, sp->unsync_child_bitmap))
+ sp->unsync_children++;
+ WARN_ON(!sp->unsync_children);
}
static void kvm_mmu_update_parents_unsync(struct kvm_mmu_page *sp)
@@ -928,7 +938,6 @@ static void kvm_mmu_update_parents_unsync(struct kvm_mmu_page *sp)
static int unsync_walk_fn(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
{
- sp->unsync_children = 1;
kvm_mmu_update_parents_unsync(sp);
return 1;
}
@@ -959,38 +968,66 @@ static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
{
}
+#define KVM_PAGE_ARRAY_NR 16
+
+struct kvm_mmu_pages {
+ struct mmu_page_and_offset {
+ struct kvm_mmu_page *sp;
+ unsigned int idx;
+ } page[KVM_PAGE_ARRAY_NR];
+ unsigned int nr;
+};
+
#define for_each_unsync_children(bitmap, idx) \
for (idx = find_first_bit(bitmap, 512); \
idx < 512; \
idx = find_next_bit(bitmap, 512, idx+1))
-static int mmu_unsync_walk(struct kvm_mmu_page *sp,
- struct kvm_unsync_walk *walker)
+int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp,
+ int idx)
{
- int i, ret;
+ int i;
- if (!sp->unsync_children)
- return 0;
+ if (sp->unsync)
+ for (i=0; i < pvec->nr; i++)
+ if (pvec->page[i].sp == sp)
+ return 0;
+
+ pvec->page[pvec->nr].sp = sp;
+ pvec->page[pvec->nr].idx = idx;
+ pvec->nr++;
+ return (pvec->nr == KVM_PAGE_ARRAY_NR);
+}
+
+static int __mmu_unsync_walk(struct kvm_mmu_page *sp,
+ struct kvm_mmu_pages *pvec)
+{
+ int i, ret, nr_unsync_leaf = 0;
for_each_unsync_children(sp->unsync_child_bitmap, i) {
u64 ent = sp->spt[i];
- if (is_shadow_present_pte(ent)) {
+ if (is_shadow_present_pte(ent) && !is_large_pte(ent)) {
struct kvm_mmu_page *child;
child = page_header(ent & PT64_BASE_ADDR_MASK);
if (child->unsync_children) {
- ret = mmu_unsync_walk(child, walker);
- if (ret)
+ if (mmu_pages_add(pvec, child, i))
+ return -ENOSPC;
+
+ ret = __mmu_unsync_walk(child, pvec);
+ if (!ret)
+ __clear_bit(i, sp->unsync_child_bitmap);
+ else if (ret > 0)
+ nr_unsync_leaf += ret;
+ else
return ret;
- __clear_bit(i, sp->unsync_child_bitmap);
}
if (child->unsync) {
- ret = walker->entry(child, walker);
- __clear_bit(i, sp->unsync_child_bitmap);
- if (ret)
- return ret;
+ nr_unsync_leaf++;
+ if (mmu_pages_add(pvec, child, i))
+ return -ENOSPC;
}
}
}
@@ -998,7 +1035,17 @@ static int mmu_unsync_walk(struct kvm_mmu_page *sp,
if (find_first_bit(sp->unsync_child_bitmap, 512) == 512)
sp->unsync_children = 0;
- return 0;
+ return nr_unsync_leaf;
+}
+
+static int mmu_unsync_walk(struct kvm_mmu_page *sp,
+ struct kvm_mmu_pages *pvec)
+{
+ if (!sp->unsync_children)
+ return 0;
+
+ mmu_pages_add(pvec, sp, 0);
+ return __mmu_unsync_walk(sp, pvec);
}
static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
@@ -1021,10 +1068,18 @@ static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
return NULL;
}
+static void kvm_unlink_unsync_global(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ list_del(&sp->oos_link);
+ --kvm->stat.mmu_unsync_global;
+}
+
static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
WARN_ON(!sp->unsync);
sp->unsync = 0;
+ if (sp->global)
+ kvm_unlink_unsync_global(kvm, sp);
--kvm->stat.mmu_unsync;
}
@@ -1037,7 +1092,8 @@ static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
return 1;
}
- rmap_write_protect(vcpu->kvm, sp->gfn);
+ if (rmap_write_protect(vcpu->kvm, sp->gfn))
+ kvm_flush_remote_tlbs(vcpu->kvm);
kvm_unlink_unsync_page(vcpu->kvm, sp);
if (vcpu->arch.mmu.sync_page(vcpu, sp)) {
kvm_mmu_zap_page(vcpu->kvm, sp);
@@ -1048,30 +1104,89 @@ static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
return 0;
}
-struct sync_walker {
- struct kvm_vcpu *vcpu;
- struct kvm_unsync_walk walker;
+struct mmu_page_path {
+ struct kvm_mmu_page *parent[PT64_ROOT_LEVEL-1];
+ unsigned int idx[PT64_ROOT_LEVEL-1];
};
-static int mmu_sync_fn(struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk)
+#define for_each_sp(pvec, sp, parents, i) \
+ for (i = mmu_pages_next(&pvec, &parents, -1), \
+ sp = pvec.page[i].sp; \
+ i < pvec.nr && ({ sp = pvec.page[i].sp; 1;}); \
+ i = mmu_pages_next(&pvec, &parents, i))
+
+int mmu_pages_next(struct kvm_mmu_pages *pvec, struct mmu_page_path *parents,
+ int i)
{
- struct sync_walker *sync_walk = container_of(walk, struct sync_walker,
- walker);
- struct kvm_vcpu *vcpu = sync_walk->vcpu;
+ int n;
- kvm_sync_page(vcpu, sp);
- return (need_resched() || spin_needbreak(&vcpu->kvm->mmu_lock));
+ for (n = i+1; n < pvec->nr; n++) {
+ struct kvm_mmu_page *sp = pvec->page[n].sp;
+
+ if (sp->role.level == PT_PAGE_TABLE_LEVEL) {
+ parents->idx[0] = pvec->page[n].idx;
+ return n;
+ }
+
+ parents->parent[sp->role.level-2] = sp;
+ parents->idx[sp->role.level-1] = pvec->page[n].idx;
+ }
+
+ return n;
}
-static void mmu_sync_children(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+void mmu_pages_clear_parents(struct mmu_page_path *parents)
{
- struct sync_walker walker = {
- .walker = { .entry = mmu_sync_fn, },
- .vcpu = vcpu,
- };
+ struct kvm_mmu_page *sp;
+ unsigned int level = 0;
+
+ do {
+ unsigned int idx = parents->idx[level];
+
+ sp = parents->parent[level];
+ if (!sp)
+ return;
+
+ --sp->unsync_children;
+ WARN_ON((int)sp->unsync_children < 0);
+ __clear_bit(idx, sp->unsync_child_bitmap);
+ level++;
+ } while (level < PT64_ROOT_LEVEL-1 && !sp->unsync_children);
+}
+
+static void kvm_mmu_pages_init(struct kvm_mmu_page *parent,
+ struct mmu_page_path *parents,
+ struct kvm_mmu_pages *pvec)
+{
+ parents->parent[parent->role.level-1] = NULL;
+ pvec->nr = 0;
+}
+
+static void mmu_sync_children(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_page *parent)
+{
+ int i;
+ struct kvm_mmu_page *sp;
+ struct mmu_page_path parents;
+ struct kvm_mmu_pages pages;
+
+ kvm_mmu_pages_init(parent, &parents, &pages);
+ while (mmu_unsync_walk(parent, &pages)) {
+ int protected = 0;
- while (mmu_unsync_walk(sp, &walker.walker))
+ for_each_sp(pages, sp, parents, i)
+ protected |= rmap_write_protect(vcpu->kvm, sp->gfn);
+
+ if (protected)
+ kvm_flush_remote_tlbs(vcpu->kvm);
+
+ for_each_sp(pages, sp, parents, i) {
+ kvm_sync_page(vcpu, sp);
+ mmu_pages_clear_parents(&parents);
+ }
cond_resched_lock(&vcpu->kvm->mmu_lock);
+ kvm_mmu_pages_init(parent, &parents, &pages);
+ }
}
static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
@@ -1129,7 +1244,8 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
sp->role = role;
hlist_add_head(&sp->hash_link, bucket);
if (!metaphysical) {
- rmap_write_protect(vcpu->kvm, gfn);
+ if (rmap_write_protect(vcpu->kvm, gfn))
+ kvm_flush_remote_tlbs(vcpu->kvm);
account_shadowed(vcpu->kvm, gfn);
}
if (shadow_trap_nonpresent_pte != shadow_notrap_nonpresent_pte)
@@ -1153,6 +1269,8 @@ static int walk_shadow(struct kvm_shadow_walk *walker,
if (level == PT32E_ROOT_LEVEL) {
shadow_addr = vcpu->arch.mmu.pae_root[(addr >> 30) & 3];
shadow_addr &= PT64_BASE_ADDR_MASK;
+ if (!shadow_addr)
+ return 1;
--level;
}
@@ -1237,33 +1355,29 @@ static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
}
}
-struct zap_walker {
- struct kvm_unsync_walk walker;
- struct kvm *kvm;
- int zapped;
-};
-
-static int mmu_zap_fn(struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk)
+static int mmu_zap_unsync_children(struct kvm *kvm,
+ struct kvm_mmu_page *parent)
{
- struct zap_walker *zap_walk = container_of(walk, struct zap_walker,
- walker);
- kvm_mmu_zap_page(zap_walk->kvm, sp);
- zap_walk->zapped = 1;
- return 0;
-}
+ int i, zapped = 0;
+ struct mmu_page_path parents;
+ struct kvm_mmu_pages pages;
-static int mmu_zap_unsync_children(struct kvm *kvm, struct kvm_mmu_page *sp)
-{
- struct zap_walker walker = {
- .walker = { .entry = mmu_zap_fn, },
- .kvm = kvm,
- .zapped = 0,
- };
-
- if (sp->role.level == PT_PAGE_TABLE_LEVEL)
+ if (parent->role.level == PT_PAGE_TABLE_LEVEL)
return 0;
- mmu_unsync_walk(sp, &walker.walker);
- return walker.zapped;
+
+ kvm_mmu_pages_init(parent, &parents, &pages);
+ while (mmu_unsync_walk(parent, &pages)) {
+ struct kvm_mmu_page *sp;
+
+ for_each_sp(pages, sp, parents, i) {
+ kvm_mmu_zap_page(kvm, sp);
+ mmu_pages_clear_parents(&parents);
+ }
+ zapped += pages.nr;
+ kvm_mmu_pages_init(parent, &parents, &pages);
+ }
+
+ return zapped;
}
static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
@@ -1362,7 +1476,7 @@ static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn)
int slot = memslot_id(kvm, gfn_to_memslot(kvm, gfn));
struct kvm_mmu_page *sp = page_header(__pa(pte));
- __set_bit(slot, &sp->slot_bitmap);
+ __set_bit(slot, sp->slot_bitmap);
}
static void mmu_convert_notrap(struct kvm_mmu_page *sp)
@@ -1393,6 +1507,110 @@ struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva)
return page;
}
+/*
+ * The function is based on mtrr_type_lookup() in
+ * arch/x86/kernel/cpu/mtrr/generic.c
+ */
+static int get_mtrr_type(struct mtrr_state_type *mtrr_state,
+ u64 start, u64 end)
+{
+ int i;
+ u64 base, mask;
+ u8 prev_match, curr_match;
+ int num_var_ranges = KVM_NR_VAR_MTRR;
+
+ if (!mtrr_state->enabled)
+ return 0xFF;
+
+ /* Make end inclusive end, instead of exclusive */
+ end--;
+
+ /* Look in fixed ranges. Just return the type as per start */
+ if (mtrr_state->have_fixed && (start < 0x100000)) {
+ int idx;
+
+ if (start < 0x80000) {
+ idx = 0;
+ idx += (start >> 16);
+ return mtrr_state->fixed_ranges[idx];
+ } else if (start < 0xC0000) {
+ idx = 1 * 8;
+ idx += ((start - 0x80000) >> 14);
+ return mtrr_state->fixed_ranges[idx];
+ } else if (start < 0x1000000) {
+ idx = 3 * 8;
+ idx += ((start - 0xC0000) >> 12);
+ return mtrr_state->fixed_ranges[idx];
+ }
+ }
+
+ /*
+ * Look in variable ranges
+ * Look of multiple ranges matching this address and pick type
+ * as per MTRR precedence
+ */
+ if (!(mtrr_state->enabled & 2))
+ return mtrr_state->def_type;
+
+ prev_match = 0xFF;
+ for (i = 0; i < num_var_ranges; ++i) {
+ unsigned short start_state, end_state;
+
+ if (!(mtrr_state->var_ranges[i].mask_lo & (1 << 11)))
+ continue;
+
+ base = (((u64)mtrr_state->var_ranges[i].base_hi) << 32) +
+ (mtrr_state->var_ranges[i].base_lo & PAGE_MASK);
+ mask = (((u64)mtrr_state->var_ranges[i].mask_hi) << 32) +
+ (mtrr_state->var_ranges[i].mask_lo & PAGE_MASK);
+
+ start_state = ((start & mask) == (base & mask));
+ end_state = ((end & mask) == (base & mask));
+ if (start_state != end_state)
+ return 0xFE;
+
+ if ((start & mask) != (base & mask))
+ continue;
+
+ curr_match = mtrr_state->var_ranges[i].base_lo & 0xff;
+ if (prev_match == 0xFF) {
+ prev_match = curr_match;
+ continue;
+ }
+
+ if (prev_match == MTRR_TYPE_UNCACHABLE ||
+ curr_match == MTRR_TYPE_UNCACHABLE)
+ return MTRR_TYPE_UNCACHABLE;
+
+ if ((prev_match == MTRR_TYPE_WRBACK &&
+ curr_match == MTRR_TYPE_WRTHROUGH) ||
+ (prev_match == MTRR_TYPE_WRTHROUGH &&
+ curr_match == MTRR_TYPE_WRBACK)) {
+ prev_match = MTRR_TYPE_WRTHROUGH;
+ curr_match = MTRR_TYPE_WRTHROUGH;
+ }
+
+ if (prev_match != curr_match)
+ return MTRR_TYPE_UNCACHABLE;
+ }
+
+ if (prev_match != 0xFF)
+ return prev_match;
+
+ return mtrr_state->def_type;
+}
+
+static u8 get_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn)
+{
+ u8 mtrr;
+
+ mtrr = get_mtrr_type(&vcpu->arch.mtrr_state, gfn << PAGE_SHIFT,
+ (gfn << PAGE_SHIFT) + PAGE_SIZE);
+ if (mtrr == 0xfe || mtrr == 0xff)
+ mtrr = MTRR_TYPE_WRBACK;
+ return mtrr;
+}
+
static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
{
unsigned index;
@@ -1409,9 +1627,15 @@ static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
if (s->role.word != sp->role.word)
return 1;
}
- kvm_mmu_mark_parents_unsync(vcpu, sp);
++vcpu->kvm->stat.mmu_unsync;
sp->unsync = 1;
+
+ if (sp->global) {
+ list_add(&sp->oos_link, &vcpu->kvm->arch.oos_global_pages);
+ ++vcpu->kvm->stat.mmu_unsync_global;
+ } else
+ kvm_mmu_mark_parents_unsync(vcpu, sp);
+
mmu_convert_notrap(sp);
return 0;
}
@@ -1437,11 +1661,24 @@ static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
unsigned pte_access, int user_fault,
int write_fault, int dirty, int largepage,
- gfn_t gfn, pfn_t pfn, bool speculative,
+ int global, gfn_t gfn, pfn_t pfn, bool speculative,
bool can_unsync)
{
u64 spte;
int ret = 0;
+ u64 mt_mask = shadow_mt_mask;
+ struct kvm_mmu_page *sp = page_header(__pa(shadow_pte));
+
+ if (!(vcpu->arch.cr4 & X86_CR4_PGE))
+ global = 0;
+ if (!global && sp->global) {
+ sp->global = 0;
+ if (sp->unsync) {
+ kvm_unlink_unsync_global(vcpu->kvm, sp);
+ kvm_mmu_mark_parents_unsync(vcpu, sp);
+ }
+ }
+
/*
* We don't set the accessed bit, since we sometimes want to see
* whether the guest actually used the pte (in order to detect
@@ -1460,6 +1697,11 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
spte |= shadow_user_mask;
if (largepage)
spte |= PT_PAGE_SIZE_MASK;
+ if (mt_mask) {
+ mt_mask = get_memory_type(vcpu, gfn) <<
+ kvm_x86_ops->get_mt_mask_shift();
+ spte |= mt_mask;
+ }
spte |= (u64)pfn << PAGE_SHIFT;
@@ -1474,6 +1716,15 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
spte |= PT_WRITABLE_MASK;
+ /*
+ * Optimization: for pte sync, if spte was writable the hash
+ * lookup is unnecessary (and expensive). Write protection
+ * is responsibility of mmu_get_page / kvm_sync_page.
+ * Same reasoning can be applied to dirty page accounting.
+ */
+ if (!can_unsync && is_writeble_pte(*shadow_pte))
+ goto set_pte;
+
if (mmu_need_write_protect(vcpu, gfn, can_unsync)) {
pgprintk("%s: found shadow page for %lx, marking ro\n",
__func__, gfn);
@@ -1495,8 +1746,8 @@ set_pte:
static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
unsigned pt_access, unsigned pte_access,
int user_fault, int write_fault, int dirty,
- int *ptwrite, int largepage, gfn_t gfn,
- pfn_t pfn, bool speculative)
+ int *ptwrite, int largepage, int global,
+ gfn_t gfn, pfn_t pfn, bool speculative)
{
int was_rmapped = 0;
int was_writeble = is_writeble_pte(*shadow_pte);
@@ -1529,7 +1780,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
}
}
if (set_spte(vcpu, shadow_pte, pte_access, user_fault, write_fault,
- dirty, largepage, gfn, pfn, speculative, true)) {
+ dirty, largepage, global, gfn, pfn, speculative, true)) {
if (write_fault)
*ptwrite = 1;
kvm_x86_ops->tlb_flush(vcpu);
@@ -1586,7 +1837,7 @@ static int direct_map_entry(struct kvm_shadow_walk *_walk,
|| (walk->largepage && level == PT_DIRECTORY_LEVEL)) {
mmu_set_spte(vcpu, sptep, ACC_ALL, ACC_ALL,
0, walk->write, 1, &walk->pt_write,
- walk->largepage, gfn, walk->pfn, false);
+ walk->largepage, 0, gfn, walk->pfn, false);
++vcpu->stat.pf_fixed;
return 1;
}
@@ -1773,6 +2024,15 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu)
}
}
+static void mmu_sync_global(struct kvm_vcpu *vcpu)
+{
+ struct kvm *kvm = vcpu->kvm;
+ struct kvm_mmu_page *sp, *n;
+
+ list_for_each_entry_safe(sp, n, &kvm->arch.oos_global_pages, oos_link)
+ kvm_sync_page(vcpu, sp);
+}
+
void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
{
spin_lock(&vcpu->kvm->mmu_lock);
@@ -1780,6 +2040,13 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
spin_unlock(&vcpu->kvm->mmu_lock);
}
+void kvm_mmu_sync_global(struct kvm_vcpu *vcpu)
+{
+ spin_lock(&vcpu->kvm->mmu_lock);
+ mmu_sync_global(vcpu);
+ spin_unlock(&vcpu->kvm->mmu_lock);
+}
+
static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr)
{
return vaddr;
@@ -2178,7 +2445,8 @@ static void kvm_mmu_access_page(struct kvm_vcpu *vcpu, gfn_t gfn)
}
void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
- const u8 *new, int bytes)
+ const u8 *new, int bytes,
+ bool guest_initiated)
{
gfn_t gfn = gpa >> PAGE_SHIFT;
struct kvm_mmu_page *sp;
@@ -2204,15 +2472,17 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
kvm_mmu_free_some_pages(vcpu);
++vcpu->kvm->stat.mmu_pte_write;
kvm_mmu_audit(vcpu, "pre pte write");
- if (gfn == vcpu->arch.last_pt_write_gfn
- && !last_updated_pte_accessed(vcpu)) {
- ++vcpu->arch.last_pt_write_count;
- if (vcpu->arch.last_pt_write_count >= 3)
- flooded = 1;
- } else {
- vcpu->arch.last_pt_write_gfn = gfn;
- vcpu->arch.last_pt_write_count = 1;
- vcpu->arch.last_pte_updated = NULL;
+ if (guest_initiated) {
+ if (gfn == vcpu->arch.last_pt_write_gfn
+ && !last_updated_pte_accessed(vcpu)) {
+ ++vcpu->arch.last_pt_write_count;
+ if (vcpu->arch.last_pt_write_count >= 3)
+ flooded = 1;
+ } else {
+ vcpu->arch.last_pt_write_gfn = gfn;
+ vcpu->arch.last_pt_write_count = 1;
+ vcpu->arch.last_pte_updated = NULL;
+ }
}
index = kvm_page_table_hashfn(gfn);
bucket = &vcpu->kvm->arch.mmu_page_hash[index];
@@ -2352,9 +2622,7 @@ EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
{
- spin_lock(&vcpu->kvm->mmu_lock);
vcpu->arch.mmu.invlpg(vcpu, gva);
- spin_unlock(&vcpu->kvm->mmu_lock);
kvm_mmu_flush_tlb(vcpu);
++vcpu->stat.invlpg;
}
@@ -2451,7 +2719,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
int i;
u64 *pt;
- if (!test_bit(slot, &sp->slot_bitmap))
+ if (!test_bit(slot, sp->slot_bitmap))
continue;
pt = sp->spt;
@@ -2860,8 +3128,8 @@ static void audit_write_protection(struct kvm_vcpu *vcpu)
if (sp->role.metaphysical)
continue;
- slot = gfn_to_memslot(vcpu->kvm, sp->gfn);
gfn = unalias_gfn(vcpu->kvm, sp->gfn);
+ slot = gfn_to_memslot_unaliased(vcpu->kvm, sp->gfn);
rmapp = &slot->rmap[gfn - slot->base_gfn];
if (*rmapp)
printk(KERN_ERR "%s: (%s) shadow page has writable"
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 84eee43bbe74..9fd78b6e17ad 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -82,6 +82,7 @@ struct shadow_walker {
int *ptwrite;
pfn_t pfn;
u64 *sptep;
+ gpa_t pte_gpa;
};
static gfn_t gpte_to_gfn(pt_element_t gpte)
@@ -222,7 +223,7 @@ walk:
if (ret)
goto walk;
pte |= PT_DIRTY_MASK;
- kvm_mmu_pte_write(vcpu, pte_gpa, (u8 *)&pte, sizeof(pte));
+ kvm_mmu_pte_write(vcpu, pte_gpa, (u8 *)&pte, sizeof(pte), 0);
walker->ptes[walker->level - 1] = pte;
}
@@ -274,7 +275,8 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
return;
kvm_get_pfn(pfn);
mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0,
- gpte & PT_DIRTY_MASK, NULL, largepage, gpte_to_gfn(gpte),
+ gpte & PT_DIRTY_MASK, NULL, largepage,
+ gpte & PT_GLOBAL_MASK, gpte_to_gfn(gpte),
pfn, true);
}
@@ -301,8 +303,9 @@ static int FNAME(shadow_walk_entry)(struct kvm_shadow_walk *_sw,
mmu_set_spte(vcpu, sptep, access, gw->pte_access & access,
sw->user_fault, sw->write_fault,
gw->ptes[gw->level-1] & PT_DIRTY_MASK,
- sw->ptwrite, sw->largepage, gw->gfn, sw->pfn,
- false);
+ sw->ptwrite, sw->largepage,
+ gw->ptes[gw->level-1] & PT_GLOBAL_MASK,
+ gw->gfn, sw->pfn, false);
sw->sptep = sptep;
return 1;
}
@@ -466,10 +469,22 @@ static int FNAME(shadow_invlpg_entry)(struct kvm_shadow_walk *_sw,
struct kvm_vcpu *vcpu, u64 addr,
u64 *sptep, int level)
{
+ struct shadow_walker *sw =
+ container_of(_sw, struct shadow_walker, walker);
- if (level == PT_PAGE_TABLE_LEVEL) {
- if (is_shadow_present_pte(*sptep))
+ /* FIXME: properly handle invlpg on large guest pages */
+ if (level == PT_PAGE_TABLE_LEVEL ||
+ ((level == PT_DIRECTORY_LEVEL) && is_large_pte(*sptep))) {
+ struct kvm_mmu_page *sp = page_header(__pa(sptep));
+
+ sw->pte_gpa = (sp->gfn << PAGE_SHIFT);
+ sw->pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t);
+
+ if (is_shadow_present_pte(*sptep)) {
rmap_remove(vcpu->kvm, sptep);
+ if (is_large_pte(*sptep))
+ --vcpu->kvm->stat.lpages;
+ }
set_shadow_pte(sptep, shadow_trap_nonpresent_pte);
return 1;
}
@@ -480,11 +495,26 @@ static int FNAME(shadow_invlpg_entry)(struct kvm_shadow_walk *_sw,
static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
{
+ pt_element_t gpte;
struct shadow_walker walker = {
.walker = { .entry = FNAME(shadow_invlpg_entry), },
+ .pte_gpa = -1,
};
+ spin_lock(&vcpu->kvm->mmu_lock);
walk_shadow(&walker.walker, vcpu, gva);
+ spin_unlock(&vcpu->kvm->mmu_lock);
+ if (walker.pte_gpa == -1)
+ return;
+ if (kvm_read_guest_atomic(vcpu->kvm, walker.pte_gpa, &gpte,
+ sizeof(pt_element_t)))
+ return;
+ if (is_present_pte(gpte) && (gpte & PT_ACCESSED_MASK)) {
+ if (mmu_topup_memory_caches(vcpu))
+ return;
+ kvm_mmu_pte_write(vcpu, walker.pte_gpa, (const u8 *)&gpte,
+ sizeof(pt_element_t), 0);
+ }
}
static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr)
@@ -580,7 +610,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
nr_present++;
pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
set_spte(vcpu, &sp->spt[i], pte_access, 0, 0,
- is_dirty_pte(gpte), 0, gfn,
+ is_dirty_pte(gpte), 0, gpte & PT_GLOBAL_MASK, gfn,
spte_to_pfn(sp->spt[i]), true, false);
}
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 9c4ce657d963..1452851ae258 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -28,6 +28,8 @@
#include <asm/desc.h>
+#include <asm/virtext.h>
+
#define __ex(x) __kvm_handle_fault_on_reboot(x)
MODULE_AUTHOR("Qumranet");
@@ -245,34 +247,19 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
static int has_svm(void)
{
- uint32_t eax, ebx, ecx, edx;
-
- if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) {
- printk(KERN_INFO "has_svm: not amd\n");
- return 0;
- }
+ const char *msg;
- cpuid(0x80000000, &eax, &ebx, &ecx, &edx);
- if (eax < SVM_CPUID_FUNC) {
- printk(KERN_INFO "has_svm: can't execute cpuid_8000000a\n");
+ if (!cpu_has_svm(&msg)) {
+ printk(KERN_INFO "has_svn: %s\n", msg);
return 0;
}
- cpuid(0x80000001, &eax, &ebx, &ecx, &edx);
- if (!(ecx & (1 << SVM_CPUID_FEATURE_SHIFT))) {
- printk(KERN_DEBUG "has_svm: svm not available\n");
- return 0;
- }
return 1;
}
static void svm_hardware_disable(void *garbage)
{
- uint64_t efer;
-
- wrmsrl(MSR_VM_HSAVE_PA, 0);
- rdmsrl(MSR_EFER, efer);
- wrmsrl(MSR_EFER, efer & ~MSR_EFER_SVME_MASK);
+ cpu_svm_disable();
}
static void svm_hardware_enable(void *garbage)
@@ -772,6 +759,22 @@ static void svm_get_segment(struct kvm_vcpu *vcpu,
var->l = (s->attrib >> SVM_SELECTOR_L_SHIFT) & 1;
var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1;
var->g = (s->attrib >> SVM_SELECTOR_G_SHIFT) & 1;
+
+ /*
+ * SVM always stores 0 for the 'G' bit in the CS selector in
+ * the VMCB on a VMEXIT. This hurts cross-vendor migration:
+ * Intel's VMENTRY has a check on the 'G' bit.
+ */
+ if (seg == VCPU_SREG_CS)
+ var->g = s->limit > 0xfffff;
+
+ /*
+ * Work around a bug where the busy flag in the tr selector
+ * isn't exposed
+ */
+ if (seg == VCPU_SREG_TR)
+ var->type |= 0x2;
+
var->unusable = !var->present;
}
@@ -1099,6 +1102,7 @@ static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
rep = (io_info & SVM_IOIO_REP_MASK) != 0;
down = (svm->vmcb->save.rflags & X86_EFLAGS_DF) != 0;
+ skip_emulated_instruction(&svm->vcpu);
return kvm_emulate_pio(&svm->vcpu, kvm_run, in, size, port);
}
@@ -1912,6 +1916,11 @@ static int get_npt_level(void)
#endif
}
+static int svm_get_mt_mask_shift(void)
+{
+ return 0;
+}
+
static struct kvm_x86_ops svm_x86_ops = {
.cpu_has_kvm_support = has_svm,
.disabled_by_bios = is_disabled,
@@ -1967,6 +1976,7 @@ static struct kvm_x86_ops svm_x86_ops = {
.set_tss_addr = svm_set_tss_addr,
.get_tdp_level = get_npt_level,
+ .get_mt_mask_shift = svm_get_mt_mask_shift,
};
static int __init svm_init(void)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index a4018b01e1f9..6259d7467648 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -16,7 +16,6 @@
*/
#include "irq.h"
-#include "vmx.h"
#include "mmu.h"
#include <linux/kvm_host.h>
@@ -31,6 +30,8 @@
#include <asm/io.h>
#include <asm/desc.h>
+#include <asm/vmx.h>
+#include <asm/virtext.h>
#define __ex(x) __kvm_handle_fault_on_reboot(x)
@@ -90,6 +91,11 @@ struct vcpu_vmx {
} rmode;
int vpid;
bool emulation_required;
+
+ /* Support for vnmi-less CPUs */
+ int soft_vnmi_blocked;
+ ktime_t entry_time;
+ s64 vnmi_blocked_time;
};
static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
@@ -122,7 +128,7 @@ static struct vmcs_config {
u32 vmentry_ctrl;
} vmcs_config;
-struct vmx_capability {
+static struct vmx_capability {
u32 ept;
u32 vpid;
} vmx_capability;
@@ -957,6 +963,13 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
pr_unimpl(vcpu, "unimplemented perfctr wrmsr: 0x%x data 0x%llx\n", msr_index, data);
break;
+ case MSR_IA32_CR_PAT:
+ if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
+ vmcs_write64(GUEST_IA32_PAT, data);
+ vcpu->arch.pat = data;
+ break;
+ }
+ /* Otherwise falls through to kvm_set_msr_common */
default:
vmx_load_host_state(vmx);
msr = find_msr_entry(vmx, msr_index);
@@ -1032,8 +1045,7 @@ static int vmx_get_irq(struct kvm_vcpu *vcpu)
static __init int cpu_has_kvm_support(void)
{
- unsigned long ecx = cpuid_ecx(1);
- return test_bit(5, &ecx); /* CPUID.1:ECX.VMX[bit 5] -> VT */
+ return cpu_has_vmx();
}
static __init int vmx_disabled_by_bios(void)
@@ -1079,13 +1091,22 @@ static void vmclear_local_vcpus(void)
__vcpu_clear(vmx);
}
-static void hardware_disable(void *garbage)
+
+/* Just like cpu_vmxoff(), but with the __kvm_handle_fault_on_reboot()
+ * tricks.
+ */
+static void kvm_cpu_vmxoff(void)
{
- vmclear_local_vcpus();
asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc");
write_cr4(read_cr4() & ~X86_CR4_VMXE);
}
+static void hardware_disable(void *garbage)
+{
+ vmclear_local_vcpus();
+ kvm_cpu_vmxoff();
+}
+
static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
u32 msr, u32 *result)
{
@@ -1176,12 +1197,13 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
#ifdef CONFIG_X86_64
min |= VM_EXIT_HOST_ADDR_SPACE_SIZE;
#endif
- opt = 0;
+ opt = VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT;
if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS,
&_vmexit_control) < 0)
return -EIO;
- min = opt = 0;
+ min = 0;
+ opt = VM_ENTRY_LOAD_IA32_PAT;
if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS,
&_vmentry_control) < 0)
return -EIO;
@@ -2087,8 +2109,9 @@ static void vmx_disable_intercept_for_msr(struct page *msr_bitmap, u32 msr)
*/
static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
{
- u32 host_sysenter_cs;
+ u32 host_sysenter_cs, msr_low, msr_high;
u32 junk;
+ u64 host_pat;
unsigned long a;
struct descriptor_table dt;
int i;
@@ -2176,6 +2199,20 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
rdmsrl(MSR_IA32_SYSENTER_EIP, a);
vmcs_writel(HOST_IA32_SYSENTER_EIP, a); /* 22.2.3 */
+ if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) {
+ rdmsr(MSR_IA32_CR_PAT, msr_low, msr_high);
+ host_pat = msr_low | ((u64) msr_high << 32);
+ vmcs_write64(HOST_IA32_PAT, host_pat);
+ }
+ if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
+ rdmsr(MSR_IA32_CR_PAT, msr_low, msr_high);
+ host_pat = msr_low | ((u64) msr_high << 32);
+ /* Write the default value follow host pat */
+ vmcs_write64(GUEST_IA32_PAT, host_pat);
+ /* Keep arch.pat sync with GUEST_IA32_PAT */
+ vmx->vcpu.arch.pat = host_pat;
+ }
+
for (i = 0; i < NR_VMX_MSR; ++i) {
u32 index = vmx_msr_index[i];
u32 data_low, data_high;
@@ -2230,6 +2267,8 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
vmx->vcpu.arch.rmode.active = 0;
+ vmx->soft_vnmi_blocked = 0;
+
vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val();
kvm_set_cr8(&vmx->vcpu, 0);
msr = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
@@ -2335,6 +2374,29 @@ out:
return ret;
}
+static void enable_irq_window(struct kvm_vcpu *vcpu)
+{
+ u32 cpu_based_vm_exec_control;
+
+ cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
+ cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING;
+ vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
+}
+
+static void enable_nmi_window(struct kvm_vcpu *vcpu)
+{
+ u32 cpu_based_vm_exec_control;
+
+ if (!cpu_has_virtual_nmis()) {
+ enable_irq_window(vcpu);
+ return;
+ }
+
+ cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
+ cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_NMI_PENDING;
+ vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
+}
+
static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -2358,10 +2420,54 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq)
static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ if (!cpu_has_virtual_nmis()) {
+ /*
+ * Tracking the NMI-blocked state in software is built upon
+ * finding the next open IRQ window. This, in turn, depends on
+ * well-behaving guests: They have to keep IRQs disabled at
+ * least as long as the NMI handler runs. Otherwise we may
+ * cause NMI nesting, maybe breaking the guest. But as this is
+ * highly unlikely, we can live with the residual risk.
+ */
+ vmx->soft_vnmi_blocked = 1;
+ vmx->vnmi_blocked_time = 0;
+ }
+
+ ++vcpu->stat.nmi_injections;
+ if (vcpu->arch.rmode.active) {
+ vmx->rmode.irq.pending = true;
+ vmx->rmode.irq.vector = NMI_VECTOR;
+ vmx->rmode.irq.rip = kvm_rip_read(vcpu);
+ vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
+ NMI_VECTOR | INTR_TYPE_SOFT_INTR |
+ INTR_INFO_VALID_MASK);
+ vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1);
+ kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1);
+ return;
+ }
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
}
+static void vmx_update_window_states(struct kvm_vcpu *vcpu)
+{
+ u32 guest_intr = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
+
+ vcpu->arch.nmi_window_open =
+ !(guest_intr & (GUEST_INTR_STATE_STI |
+ GUEST_INTR_STATE_MOV_SS |
+ GUEST_INTR_STATE_NMI));
+ if (!cpu_has_virtual_nmis() && to_vmx(vcpu)->soft_vnmi_blocked)
+ vcpu->arch.nmi_window_open = 0;
+
+ vcpu->arch.interrupt_window_open =
+ ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
+ !(guest_intr & (GUEST_INTR_STATE_STI |
+ GUEST_INTR_STATE_MOV_SS)));
+}
+
static void kvm_do_inject_irq(struct kvm_vcpu *vcpu)
{
int word_index = __ffs(vcpu->arch.irq_summary);
@@ -2374,40 +2480,49 @@ static void kvm_do_inject_irq(struct kvm_vcpu *vcpu)
kvm_queue_interrupt(vcpu, irq);
}
-
static void do_interrupt_requests(struct kvm_vcpu *vcpu,
struct kvm_run *kvm_run)
{
- u32 cpu_based_vm_exec_control;
-
- vcpu->arch.interrupt_window_open =
- ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
- (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0);
+ vmx_update_window_states(vcpu);
- if (vcpu->arch.interrupt_window_open &&
- vcpu->arch.irq_summary && !vcpu->arch.interrupt.pending)
- kvm_do_inject_irq(vcpu);
+ if (vcpu->arch.nmi_pending && !vcpu->arch.nmi_injected) {
+ if (vcpu->arch.interrupt.pending) {
+ enable_nmi_window(vcpu);
+ } else if (vcpu->arch.nmi_window_open) {
+ vcpu->arch.nmi_pending = false;
+ vcpu->arch.nmi_injected = true;
+ } else {
+ enable_nmi_window(vcpu);
+ return;
+ }
+ }
+ if (vcpu->arch.nmi_injected) {
+ vmx_inject_nmi(vcpu);
+ if (vcpu->arch.nmi_pending)
+ enable_nmi_window(vcpu);
+ else if (vcpu->arch.irq_summary
+ || kvm_run->request_interrupt_window)
+ enable_irq_window(vcpu);
+ return;
+ }
- if (vcpu->arch.interrupt_window_open && vcpu->arch.interrupt.pending)
- vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr);
+ if (vcpu->arch.interrupt_window_open) {
+ if (vcpu->arch.irq_summary && !vcpu->arch.interrupt.pending)
+ kvm_do_inject_irq(vcpu);
- cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
+ if (vcpu->arch.interrupt.pending)
+ vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr);
+ }
if (!vcpu->arch.interrupt_window_open &&
(vcpu->arch.irq_summary || kvm_run->request_interrupt_window))
- /*
- * Interrupts blocked. Wait for unblock.
- */
- cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING;
- else
- cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
- vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
+ enable_irq_window(vcpu);
}
static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
{
int ret;
struct kvm_userspace_memory_region tss_mem = {
- .slot = 8,
+ .slot = TSS_PRIVATE_MEMSLOT,
.guest_phys_addr = addr,
.memory_size = PAGE_SIZE * 3,
.flags = 0,
@@ -2492,7 +2607,7 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
set_bit(irq / BITS_PER_LONG, &vcpu->arch.irq_summary);
}
- if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) /* nmi */
+ if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR)
return 1; /* already handled by vmx_vcpu_run() */
if (is_no_device(intr_info)) {
@@ -2581,6 +2696,7 @@ static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
rep = (exit_qualification & 32) != 0;
port = exit_qualification >> 16;
+ skip_emulated_instruction(vcpu);
return kvm_emulate_pio(vcpu, kvm_run, in, size, port);
}
@@ -2767,6 +2883,7 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu,
vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
KVMTRACE_0D(PEND_INTR, vcpu, handler);
+ ++vcpu->stat.irq_window_exits;
/*
* If the user space waits to inject interrupts, exit as soon as
@@ -2775,7 +2892,6 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu,
if (kvm_run->request_interrupt_window &&
!vcpu->arch.irq_summary) {
kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
- ++vcpu->stat.irq_window_exits;
return 0;
}
return 1;
@@ -2832,6 +2948,7 @@ static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
static int handle_task_switch(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
unsigned long exit_qualification;
u16 tss_selector;
int reason;
@@ -2839,6 +2956,15 @@ static int handle_task_switch(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
reason = (u32)exit_qualification >> 30;
+ if (reason == TASK_SWITCH_GATE && vmx->vcpu.arch.nmi_injected &&
+ (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
+ (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK)
+ == INTR_TYPE_NMI_INTR) {
+ vcpu->arch.nmi_injected = false;
+ if (cpu_has_virtual_nmis())
+ vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
+ GUEST_INTR_STATE_NMI);
+ }
tss_selector = exit_qualification;
return kvm_task_switch(vcpu, tss_selector, reason);
@@ -2927,16 +3053,12 @@ static void handle_invalid_guest_state(struct kvm_vcpu *vcpu,
while (!guest_state_valid(vcpu)) {
err = emulate_instruction(vcpu, kvm_run, 0, 0, 0);
- switch (err) {
- case EMULATE_DONE:
- break;
- case EMULATE_DO_MMIO:
- kvm_report_emulation_failure(vcpu, "mmio");
- /* TODO: Handle MMIO */
- return;
- default:
- kvm_report_emulation_failure(vcpu, "emulation failure");
- return;
+ if (err == EMULATE_DO_MMIO)
+ break;
+
+ if (err != EMULATE_DONE) {
+ kvm_report_emulation_failure(vcpu, "emulation failure");
+ return;
}
if (signal_pending(current))
@@ -2948,8 +3070,10 @@ static void handle_invalid_guest_state(struct kvm_vcpu *vcpu,
local_irq_disable();
preempt_disable();
- /* Guest state should be valid now, no more emulation should be needed */
- vmx->emulation_required = 0;
+ /* Guest state should be valid now except if we need to
+ * emulate an MMIO */
+ if (guest_state_valid(vcpu))
+ vmx->emulation_required = 0;
}
/*
@@ -2996,6 +3120,11 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
KVMTRACE_3D(VMEXIT, vcpu, exit_reason, (u32)kvm_rip_read(vcpu),
(u32)((u64)kvm_rip_read(vcpu) >> 32), entryexit);
+ /* If we need to emulate an MMIO from handle_invalid_guest_state
+ * we just return 0 */
+ if (vmx->emulation_required && emulate_invalid_guest_state)
+ return 0;
+
/* Access CR3 don't cause VMExit in paging mode, so we need
* to sync with guest real CR3. */
if (vm_need_ept() && is_paging(vcpu)) {
@@ -3012,9 +3141,32 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
(exit_reason != EXIT_REASON_EXCEPTION_NMI &&
- exit_reason != EXIT_REASON_EPT_VIOLATION))
- printk(KERN_WARNING "%s: unexpected, valid vectoring info and "
- "exit reason is 0x%x\n", __func__, exit_reason);
+ exit_reason != EXIT_REASON_EPT_VIOLATION &&
+ exit_reason != EXIT_REASON_TASK_SWITCH))
+ printk(KERN_WARNING "%s: unexpected, valid vectoring info "
+ "(0x%x) and exit reason is 0x%x\n",
+ __func__, vectoring_info, exit_reason);
+
+ if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) {
+ if (vcpu->arch.interrupt_window_open) {
+ vmx->soft_vnmi_blocked = 0;
+ vcpu->arch.nmi_window_open = 1;
+ } else if (vmx->vnmi_blocked_time > 1000000000LL &&
+ vcpu->arch.nmi_pending) {
+ /*
+ * This CPU don't support us in finding the end of an
+ * NMI-blocked window if the guest runs with IRQs
+ * disabled. So we pull the trigger after 1 s of
+ * futile waiting, but inform the user about this.
+ */
+ printk(KERN_WARNING "%s: Breaking out of NMI-blocked "
+ "state on VCPU %d after 1 s timeout\n",
+ __func__, vcpu->vcpu_id);
+ vmx->soft_vnmi_blocked = 0;
+ vmx->vcpu.arch.nmi_window_open = 1;
+ }
+ }
+
if (exit_reason < kvm_vmx_max_exit_handlers
&& kvm_vmx_exit_handlers[exit_reason])
return kvm_vmx_exit_handlers[exit_reason](vcpu, kvm_run);
@@ -3042,51 +3194,6 @@ static void update_tpr_threshold(struct kvm_vcpu *vcpu)
vmcs_write32(TPR_THRESHOLD, (max_irr > tpr) ? tpr >> 4 : max_irr >> 4);
}
-static void enable_irq_window(struct kvm_vcpu *vcpu)
-{
- u32 cpu_based_vm_exec_control;
-
- cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
- cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING;
- vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
-}
-
-static void enable_nmi_window(struct kvm_vcpu *vcpu)
-{
- u32 cpu_based_vm_exec_control;
-
- if (!cpu_has_virtual_nmis())
- return;
-
- cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
- cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_NMI_PENDING;
- vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
-}
-
-static int vmx_nmi_enabled(struct kvm_vcpu *vcpu)
-{
- u32 guest_intr = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
- return !(guest_intr & (GUEST_INTR_STATE_NMI |
- GUEST_INTR_STATE_MOV_SS |
- GUEST_INTR_STATE_STI));
-}
-
-static int vmx_irq_enabled(struct kvm_vcpu *vcpu)
-{
- u32 guest_intr = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
- return (!(guest_intr & (GUEST_INTR_STATE_MOV_SS |
- GUEST_INTR_STATE_STI)) &&
- (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF));
-}
-
-static void enable_intr_window(struct kvm_vcpu *vcpu)
-{
- if (vcpu->arch.nmi_pending)
- enable_nmi_window(vcpu);
- else if (kvm_cpu_has_interrupt(vcpu))
- enable_irq_window(vcpu);
-}
-
static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
{
u32 exit_intr_info;
@@ -3109,7 +3216,9 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
if (unblock_nmi && vector != DF_VECTOR)
vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
GUEST_INTR_STATE_NMI);
- }
+ } else if (unlikely(vmx->soft_vnmi_blocked))
+ vmx->vnmi_blocked_time +=
+ ktime_to_ns(ktime_sub(ktime_get(), vmx->entry_time));
idt_vectoring_info = vmx->idt_vectoring_info;
idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK;
@@ -3147,26 +3256,29 @@ static void vmx_intr_assist(struct kvm_vcpu *vcpu)
{
update_tpr_threshold(vcpu);
- if (cpu_has_virtual_nmis()) {
- if (vcpu->arch.nmi_pending && !vcpu->arch.nmi_injected) {
- if (vcpu->arch.interrupt.pending) {
- enable_nmi_window(vcpu);
- } else if (vmx_nmi_enabled(vcpu)) {
- vcpu->arch.nmi_pending = false;
- vcpu->arch.nmi_injected = true;
- } else {
- enable_intr_window(vcpu);
- return;
- }
- }
- if (vcpu->arch.nmi_injected) {
- vmx_inject_nmi(vcpu);
- enable_intr_window(vcpu);
+ vmx_update_window_states(vcpu);
+
+ if (vcpu->arch.nmi_pending && !vcpu->arch.nmi_injected) {
+ if (vcpu->arch.interrupt.pending) {
+ enable_nmi_window(vcpu);
+ } else if (vcpu->arch.nmi_window_open) {
+ vcpu->arch.nmi_pending = false;
+ vcpu->arch.nmi_injected = true;
+ } else {
+ enable_nmi_window(vcpu);
return;
}
}
+ if (vcpu->arch.nmi_injected) {
+ vmx_inject_nmi(vcpu);
+ if (vcpu->arch.nmi_pending)
+ enable_nmi_window(vcpu);
+ else if (kvm_cpu_has_interrupt(vcpu))
+ enable_irq_window(vcpu);
+ return;
+ }
if (!vcpu->arch.interrupt.pending && kvm_cpu_has_interrupt(vcpu)) {
- if (vmx_irq_enabled(vcpu))
+ if (vcpu->arch.interrupt_window_open)
kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu));
else
enable_irq_window(vcpu);
@@ -3174,6 +3286,8 @@ static void vmx_intr_assist(struct kvm_vcpu *vcpu)
if (vcpu->arch.interrupt.pending) {
vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr);
kvm_timer_intr_post(vcpu, vcpu->arch.interrupt.nr);
+ if (kvm_cpu_has_interrupt(vcpu))
+ enable_irq_window(vcpu);
}
}
@@ -3213,6 +3327,10 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
struct vcpu_vmx *vmx = to_vmx(vcpu);
u32 intr_info;
+ /* Record the guest's net vcpu time for enforced NMI injections. */
+ if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked))
+ vmx->entry_time = ktime_get();
+
/* Handle invalid guest state instead of entering VMX */
if (vmx->emulation_required && emulate_invalid_guest_state) {
handle_invalid_guest_state(vcpu, kvm_run);
@@ -3327,9 +3445,7 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
if (vmx->rmode.irq.pending)
fixup_rmode_irq(vmx);
- vcpu->arch.interrupt_window_open =
- (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
- (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS)) == 0;
+ vmx_update_window_states(vcpu);
asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
vmx->launched = 1;
@@ -3337,7 +3453,7 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
/* We need to handle NMIs before interrupts are enabled */
- if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200 &&
+ if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR &&
(intr_info & INTR_INFO_VALID_MASK)) {
KVMTRACE_0D(NMI, vcpu, handler);
asm("int $2");
@@ -3455,6 +3571,11 @@ static int get_ept_level(void)
return VMX_EPT_DEFAULT_GAW + 1;
}
+static int vmx_get_mt_mask_shift(void)
+{
+ return VMX_EPT_MT_EPTE_SHIFT;
+}
+
static struct kvm_x86_ops vmx_x86_ops = {
.cpu_has_kvm_support = cpu_has_kvm_support,
.disabled_by_bios = vmx_disabled_by_bios,
@@ -3510,6 +3631,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
.set_tss_addr = vmx_set_tss_addr,
.get_tdp_level = get_ept_level,
+ .get_mt_mask_shift = vmx_get_mt_mask_shift,
};
static int __init vmx_init(void)
@@ -3566,10 +3688,10 @@ static int __init vmx_init(void)
bypass_guest_pf = 0;
kvm_mmu_set_base_ptes(VMX_EPT_READABLE_MASK |
VMX_EPT_WRITABLE_MASK |
- VMX_EPT_DEFAULT_MT << VMX_EPT_MT_EPTE_SHIFT |
VMX_EPT_IGMT_BIT);
kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull,
- VMX_EPT_EXECUTABLE_MASK);
+ VMX_EPT_EXECUTABLE_MASK,
+ VMX_EPT_DEFAULT_MT << VMX_EPT_MT_EPTE_SHIFT);
kvm_enable_tdp();
} else
kvm_disable_tdp();
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index f1f8ff2f1fa2..cc17546a2406 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -34,11 +34,13 @@
#include <linux/module.h>
#include <linux/mman.h>
#include <linux/highmem.h>
+#include <linux/iommu.h>
#include <linux/intel-iommu.h>
#include <asm/uaccess.h>
#include <asm/msr.h>
#include <asm/desc.h>
+#include <asm/mtrr.h>
#define MAX_IO_MSRS 256
#define CR0_RESERVED_BITS \
@@ -86,6 +88,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
{ "halt_wakeup", VCPU_STAT(halt_wakeup) },
{ "hypercalls", VCPU_STAT(hypercalls) },
{ "request_irq", VCPU_STAT(request_irq_exits) },
+ { "request_nmi", VCPU_STAT(request_nmi_exits) },
{ "irq_exits", VCPU_STAT(irq_exits) },
{ "host_state_reload", VCPU_STAT(host_state_reload) },
{ "efer_reload", VCPU_STAT(efer_reload) },
@@ -93,6 +96,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
{ "insn_emulation", VCPU_STAT(insn_emulation) },
{ "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },
{ "irq_injections", VCPU_STAT(irq_injections) },
+ { "nmi_injections", VCPU_STAT(nmi_injections) },
{ "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
{ "mmu_pte_write", VM_STAT(mmu_pte_write) },
{ "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
@@ -101,6 +105,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
{ "mmu_recycled", VM_STAT(mmu_recycled) },
{ "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
{ "mmu_unsync", VM_STAT(mmu_unsync) },
+ { "mmu_unsync_global", VM_STAT(mmu_unsync_global) },
{ "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
{ "largepages", VM_STAT(lpages) },
{ NULL }
@@ -312,6 +317,7 @@ void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
kvm_x86_ops->set_cr0(vcpu, cr0);
vcpu->arch.cr0 = cr0;
+ kvm_mmu_sync_global(vcpu);
kvm_mmu_reset_context(vcpu);
return;
}
@@ -355,6 +361,7 @@ void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
}
kvm_x86_ops->set_cr4(vcpu, cr4);
vcpu->arch.cr4 = cr4;
+ kvm_mmu_sync_global(vcpu);
kvm_mmu_reset_context(vcpu);
}
EXPORT_SYMBOL_GPL(kvm_set_cr4);
@@ -449,7 +456,7 @@ static u32 msrs_to_save[] = {
MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
#endif
MSR_IA32_TIME_STAMP_COUNTER, MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
- MSR_IA32_PERF_STATUS,
+ MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT
};
static unsigned num_msrs_to_save;
@@ -648,10 +655,38 @@ static bool msr_mtrr_valid(unsigned msr)
static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
{
+ u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges;
+
if (!msr_mtrr_valid(msr))
return 1;
- vcpu->arch.mtrr[msr - 0x200] = data;
+ if (msr == MSR_MTRRdefType) {
+ vcpu->arch.mtrr_state.def_type = data;
+ vcpu->arch.mtrr_state.enabled = (data & 0xc00) >> 10;
+ } else if (msr == MSR_MTRRfix64K_00000)
+ p[0] = data;
+ else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000)
+ p[1 + msr - MSR_MTRRfix16K_80000] = data;
+ else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000)
+ p[3 + msr - MSR_MTRRfix4K_C0000] = data;
+ else if (msr == MSR_IA32_CR_PAT)
+ vcpu->arch.pat = data;
+ else { /* Variable MTRRs */
+ int idx, is_mtrr_mask;
+ u64 *pt;
+
+ idx = (msr - 0x200) / 2;
+ is_mtrr_mask = msr - 0x200 - 2 * idx;
+ if (!is_mtrr_mask)
+ pt =
+ (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo;
+ else
+ pt =
+ (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo;
+ *pt = data;
+ }
+
+ kvm_mmu_reset_context(vcpu);
return 0;
}
@@ -747,10 +782,37 @@ int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
static int get_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
{
+ u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges;
+
if (!msr_mtrr_valid(msr))
return 1;
- *pdata = vcpu->arch.mtrr[msr - 0x200];
+ if (msr == MSR_MTRRdefType)
+ *pdata = vcpu->arch.mtrr_state.def_type +
+ (vcpu->arch.mtrr_state.enabled << 10);
+ else if (msr == MSR_MTRRfix64K_00000)
+ *pdata = p[0];
+ else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000)
+ *pdata = p[1 + msr - MSR_MTRRfix16K_80000];
+ else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000)
+ *pdata = p[3 + msr - MSR_MTRRfix4K_C0000];
+ else if (msr == MSR_IA32_CR_PAT)
+ *pdata = vcpu->arch.pat;
+ else { /* Variable MTRRs */
+ int idx, is_mtrr_mask;
+ u64 *pt;
+
+ idx = (msr - 0x200) / 2;
+ is_mtrr_mask = msr - 0x200 - 2 * idx;
+ if (!is_mtrr_mask)
+ pt =
+ (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo;
+ else
+ pt =
+ (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo;
+ *pdata = *pt;
+ }
+
return 0;
}
@@ -903,7 +965,6 @@ int kvm_dev_ioctl_check_extension(long ext)
case KVM_CAP_IRQCHIP:
case KVM_CAP_HLT:
case KVM_CAP_MMU_SHADOW_CACHE_CONTROL:
- case KVM_CAP_USER_MEMORY:
case KVM_CAP_SET_TSS_ADDR:
case KVM_CAP_EXT_CPUID:
case KVM_CAP_CLOCKSOURCE:
@@ -929,7 +990,7 @@ int kvm_dev_ioctl_check_extension(long ext)
r = !tdp_enabled;
break;
case KVM_CAP_IOMMU:
- r = intel_iommu_found();
+ r = iommu_found();
break;
default:
r = 0;
@@ -1188,6 +1249,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
int t, times = entry->eax & 0xff;
entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
+ entry->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;
for (t = 1; t < times && *nent < maxnent; ++t) {
do_cpuid_1_ent(&entry[t], function, 0);
entry[t].flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
@@ -1218,7 +1280,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
/* read more entries until level_type is zero */
for (i = 1; *nent < maxnent; ++i) {
- level_type = entry[i - 1].ecx & 0xff;
+ level_type = entry[i - 1].ecx & 0xff00;
if (!level_type)
break;
do_cpuid_1_ent(&entry[i], function, i);
@@ -1318,6 +1380,15 @@ static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
return 0;
}
+static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu)
+{
+ vcpu_load(vcpu);
+ kvm_inject_nmi(vcpu);
+ vcpu_put(vcpu);
+
+ return 0;
+}
+
static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu,
struct kvm_tpr_access_ctl *tac)
{
@@ -1377,6 +1448,13 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
r = 0;
break;
}
+ case KVM_NMI: {
+ r = kvm_vcpu_ioctl_nmi(vcpu);
+ if (r)
+ goto out;
+ r = 0;
+ break;
+ }
case KVM_SET_CPUID: {
struct kvm_cpuid __user *cpuid_arg = argp;
struct kvm_cpuid cpuid;
@@ -1968,7 +2046,7 @@ int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes);
if (ret < 0)
return 0;
- kvm_mmu_pte_write(vcpu, gpa, val, bytes);
+ kvm_mmu_pte_write(vcpu, gpa, val, bytes, 1);
return 1;
}
@@ -2404,8 +2482,6 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
val = kvm_register_read(vcpu, VCPU_REGS_RAX);
memcpy(vcpu->arch.pio_data, &val, 4);
- kvm_x86_ops->skip_emulated_instruction(vcpu);
-
pio_dev = vcpu_find_pio_dev(vcpu, port, size, !in);
if (pio_dev) {
kernel_pio(pio_dev, vcpu, vcpu->arch.pio_data);
@@ -2541,7 +2617,7 @@ int kvm_arch_init(void *opaque)
kvm_mmu_set_nonpresent_ptes(0ull, 0ull);
kvm_mmu_set_base_ptes(PT_PRESENT_MASK);
kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
- PT_DIRTY_MASK, PT64_NX_MASK, 0);
+ PT_DIRTY_MASK, PT64_NX_MASK, 0, 0);
return 0;
out:
@@ -2729,7 +2805,7 @@ static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i)
e->flags &= ~KVM_CPUID_FLAG_STATE_READ_NEXT;
/* when no next entry is found, the current entry[i] is reselected */
- for (j = i + 1; j == i; j = (j + 1) % nent) {
+ for (j = i + 1; ; j = (j + 1) % nent) {
struct kvm_cpuid_entry2 *ej = &vcpu->arch.cpuid_entries[j];
if (ej->function == e->function) {
ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;
@@ -2973,7 +3049,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
pr_debug("vcpu %d received sipi with vector # %x\n",
vcpu->vcpu_id, vcpu->arch.sipi_vector);
kvm_lapic_reset(vcpu);
- r = kvm_x86_ops->vcpu_reset(vcpu);
+ r = kvm_arch_vcpu_reset(vcpu);
if (r)
return r;
vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
@@ -3275,9 +3351,9 @@ static void seg_desct_to_kvm_desct(struct desc_struct *seg_desc, u16 selector,
kvm_desct->padding = 0;
}
-static void get_segment_descritptor_dtable(struct kvm_vcpu *vcpu,
- u16 selector,
- struct descriptor_table *dtable)
+static void get_segment_descriptor_dtable(struct kvm_vcpu *vcpu,
+ u16 selector,
+ struct descriptor_table *dtable)
{
if (selector & 1 << 2) {
struct kvm_segment kvm_seg;
@@ -3302,7 +3378,7 @@ static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
struct descriptor_table dtable;
u16 index = selector >> 3;
- get_segment_descritptor_dtable(vcpu, selector, &dtable);
+ get_segment_descriptor_dtable(vcpu, selector, &dtable);
if (dtable.limit < index * 8 + 7) {
kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc);
@@ -3321,7 +3397,7 @@ static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
struct descriptor_table dtable;
u16 index = selector >> 3;
- get_segment_descritptor_dtable(vcpu, selector, &dtable);
+ get_segment_descriptor_dtable(vcpu, selector, &dtable);
if (dtable.limit < index * 8 + 7)
return 1;
@@ -3900,6 +3976,7 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
/* We do fxsave: this must be aligned. */
BUG_ON((unsigned long)&vcpu->arch.host_fx_image & 0xF);
+ vcpu->arch.mtrr_state.have_fixed = 1;
vcpu_load(vcpu);
r = kvm_arch_vcpu_reset(vcpu);
if (r == 0)
@@ -3925,6 +4002,9 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
{
+ vcpu->arch.nmi_pending = false;
+ vcpu->arch.nmi_injected = false;
+
return kvm_x86_ops->vcpu_reset(vcpu);
}
@@ -4012,6 +4092,7 @@ struct kvm *kvm_arch_create_vm(void)
return ERR_PTR(-ENOMEM);
INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
+ INIT_LIST_HEAD(&kvm->arch.oos_global_pages);
INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
/* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
@@ -4048,8 +4129,8 @@ static void kvm_free_vcpus(struct kvm *kvm)
void kvm_arch_destroy_vm(struct kvm *kvm)
{
- kvm_iommu_unmap_guest(kvm);
kvm_free_all_assigned_devices(kvm);
+ kvm_iommu_unmap_guest(kvm);
kvm_free_pit(kvm);
kfree(kvm->arch.vpic);
kfree(kvm->arch.vioapic);
@@ -4127,7 +4208,8 @@ void kvm_arch_flush_shadow(struct kvm *kvm)
int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
{
return vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE
- || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED;
+ || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED
+ || vcpu->arch.nmi_pending;
}
static void vcpu_kick_intr(void *info)
diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
index ea051173b0da..d174db7a3370 100644
--- a/arch/x86/kvm/x86_emulate.c
+++ b/arch/x86/kvm/x86_emulate.c
@@ -58,6 +58,7 @@
#define SrcMem32 (4<<4) /* Memory operand (32-bit). */
#define SrcImm (5<<4) /* Immediate operand. */
#define SrcImmByte (6<<4) /* 8-bit sign-extended immediate operand. */
+#define SrcOne (7<<4) /* Implied '1' */
#define SrcMask (7<<4)
/* Generic ModRM decode. */
#define ModRM (1<<7)
@@ -70,17 +71,23 @@
#define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */
#define GroupDual (1<<15) /* Alternate decoding of mod == 3 */
#define GroupMask 0xff /* Group number stored in bits 0:7 */
+/* Source 2 operand type */
+#define Src2None (0<<29)
+#define Src2CL (1<<29)
+#define Src2ImmByte (2<<29)
+#define Src2One (3<<29)
+#define Src2Mask (7<<29)
enum {
Group1_80, Group1_81, Group1_82, Group1_83,
Group1A, Group3_Byte, Group3, Group4, Group5, Group7,
};
-static u16 opcode_table[256] = {
+static u32 opcode_table[256] = {
/* 0x00 - 0x07 */
ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
- 0, 0, 0, 0,
+ ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, 0, 0,
/* 0x08 - 0x0F */
ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
@@ -195,7 +202,7 @@ static u16 opcode_table[256] = {
ImplicitOps, ImplicitOps, Group | Group4, Group | Group5,
};
-static u16 twobyte_table[256] = {
+static u32 twobyte_table[256] = {
/* 0x00 - 0x0F */
0, Group | GroupDual | Group7, 0, 0, 0, 0, ImplicitOps, 0,
ImplicitOps, ImplicitOps, 0, 0, 0, ImplicitOps | ModRM, 0, 0,
@@ -230,9 +237,14 @@ static u16 twobyte_table[256] = {
/* 0x90 - 0x9F */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/* 0xA0 - 0xA7 */
- 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, 0, 0,
+ 0, 0, 0, DstMem | SrcReg | ModRM | BitOp,
+ DstMem | SrcReg | Src2ImmByte | ModRM,
+ DstMem | SrcReg | Src2CL | ModRM, 0, 0,
/* 0xA8 - 0xAF */
- 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, ModRM, 0,
+ 0, 0, 0, DstMem | SrcReg | ModRM | BitOp,
+ DstMem | SrcReg | Src2ImmByte | ModRM,
+ DstMem | SrcReg | Src2CL | ModRM,
+ ModRM, 0,
/* 0xB0 - 0xB7 */
ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 0,
DstMem | SrcReg | ModRM | BitOp,
@@ -253,7 +265,7 @@ static u16 twobyte_table[256] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
-static u16 group_table[] = {
+static u32 group_table[] = {
[Group1_80*8] =
ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM,
ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM,
@@ -297,9 +309,9 @@ static u16 group_table[] = {
SrcMem16 | ModRM | Mov, SrcMem | ModRM | ByteOp,
};
-static u16 group2_table[] = {
+static u32 group2_table[] = {
[Group7*8] =
- SrcNone | ModRM, 0, 0, 0,
+ SrcNone | ModRM, 0, 0, SrcNone | ModRM,
SrcNone | ModRM | DstMem | Mov, 0,
SrcMem16 | ModRM | Mov, 0,
};
@@ -359,49 +371,48 @@ static u16 group2_table[] = {
"andl %"_msk",%"_LO32 _tmp"; " \
"orl %"_LO32 _tmp",%"_sav"; "
+#ifdef CONFIG_X86_64
+#define ON64(x) x
+#else
+#define ON64(x)
+#endif
+
+#define ____emulate_2op(_op, _src, _dst, _eflags, _x, _y, _suffix) \
+ do { \
+ __asm__ __volatile__ ( \
+ _PRE_EFLAGS("0", "4", "2") \
+ _op _suffix " %"_x"3,%1; " \
+ _POST_EFLAGS("0", "4", "2") \
+ : "=m" (_eflags), "=m" ((_dst).val), \
+ "=&r" (_tmp) \
+ : _y ((_src).val), "i" (EFLAGS_MASK)); \
+ } while (0)
+
+
/* Raw emulation: instruction has two explicit operands. */
#define __emulate_2op_nobyte(_op,_src,_dst,_eflags,_wx,_wy,_lx,_ly,_qx,_qy) \
- do { \
- unsigned long _tmp; \
- \
- switch ((_dst).bytes) { \
- case 2: \
- __asm__ __volatile__ ( \
- _PRE_EFLAGS("0", "4", "2") \
- _op"w %"_wx"3,%1; " \
- _POST_EFLAGS("0", "4", "2") \
- : "=m" (_eflags), "=m" ((_dst).val), \
- "=&r" (_tmp) \
- : _wy ((_src).val), "i" (EFLAGS_MASK)); \
- break; \
- case 4: \
- __asm__ __volatile__ ( \
- _PRE_EFLAGS("0", "4", "2") \
- _op"l %"_lx"3,%1; " \
- _POST_EFLAGS("0", "4", "2") \
- : "=m" (_eflags), "=m" ((_dst).val), \
- "=&r" (_tmp) \
- : _ly ((_src).val), "i" (EFLAGS_MASK)); \
- break; \
- case 8: \
- __emulate_2op_8byte(_op, _src, _dst, \
- _eflags, _qx, _qy); \
- break; \
- } \
+ do { \
+ unsigned long _tmp; \
+ \
+ switch ((_dst).bytes) { \
+ case 2: \
+ ____emulate_2op(_op,_src,_dst,_eflags,_wx,_wy,"w"); \
+ break; \
+ case 4: \
+ ____emulate_2op(_op,_src,_dst,_eflags,_lx,_ly,"l"); \
+ break; \
+ case 8: \
+ ON64(____emulate_2op(_op,_src,_dst,_eflags,_qx,_qy,"q")); \
+ break; \
+ } \
} while (0)
#define __emulate_2op(_op,_src,_dst,_eflags,_bx,_by,_wx,_wy,_lx,_ly,_qx,_qy) \
do { \
- unsigned long __tmp; \
+ unsigned long _tmp; \
switch ((_dst).bytes) { \
case 1: \
- __asm__ __volatile__ ( \
- _PRE_EFLAGS("0", "4", "2") \
- _op"b %"_bx"3,%1; " \
- _POST_EFLAGS("0", "4", "2") \
- : "=m" (_eflags), "=m" ((_dst).val), \
- "=&r" (__tmp) \
- : _by ((_src).val), "i" (EFLAGS_MASK)); \
+ ____emulate_2op(_op,_src,_dst,_eflags,_bx,_by,"b"); \
break; \
default: \
__emulate_2op_nobyte(_op, _src, _dst, _eflags, \
@@ -425,71 +436,68 @@ static u16 group2_table[] = {
__emulate_2op_nobyte(_op, _src, _dst, _eflags, \
"w", "r", _LO32, "r", "", "r")
-/* Instruction has only one explicit operand (no source operand). */
-#define emulate_1op(_op, _dst, _eflags) \
- do { \
- unsigned long _tmp; \
- \
- switch ((_dst).bytes) { \
- case 1: \
- __asm__ __volatile__ ( \
- _PRE_EFLAGS("0", "3", "2") \
- _op"b %1; " \
- _POST_EFLAGS("0", "3", "2") \
- : "=m" (_eflags), "=m" ((_dst).val), \
- "=&r" (_tmp) \
- : "i" (EFLAGS_MASK)); \
- break; \
- case 2: \
- __asm__ __volatile__ ( \
- _PRE_EFLAGS("0", "3", "2") \
- _op"w %1; " \
- _POST_EFLAGS("0", "3", "2") \
- : "=m" (_eflags), "=m" ((_dst).val), \
- "=&r" (_tmp) \
- : "i" (EFLAGS_MASK)); \
- break; \
- case 4: \
- __asm__ __volatile__ ( \
- _PRE_EFLAGS("0", "3", "2") \
- _op"l %1; " \
- _POST_EFLAGS("0", "3", "2") \
- : "=m" (_eflags), "=m" ((_dst).val), \
- "=&r" (_tmp) \
- : "i" (EFLAGS_MASK)); \
- break; \
- case 8: \
- __emulate_1op_8byte(_op, _dst, _eflags); \
- break; \
- } \
+/* Instruction has three operands and one operand is stored in ECX register */
+#define __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, _suffix, _type) \
+ do { \
+ unsigned long _tmp; \
+ _type _clv = (_cl).val; \
+ _type _srcv = (_src).val; \
+ _type _dstv = (_dst).val; \
+ \
+ __asm__ __volatile__ ( \
+ _PRE_EFLAGS("0", "5", "2") \
+ _op _suffix " %4,%1 \n" \
+ _POST_EFLAGS("0", "5", "2") \
+ : "=m" (_eflags), "+r" (_dstv), "=&r" (_tmp) \
+ : "c" (_clv) , "r" (_srcv), "i" (EFLAGS_MASK) \
+ ); \
+ \
+ (_cl).val = (unsigned long) _clv; \
+ (_src).val = (unsigned long) _srcv; \
+ (_dst).val = (unsigned long) _dstv; \
} while (0)
-/* Emulate an instruction with quadword operands (x86/64 only). */
-#if defined(CONFIG_X86_64)
-#define __emulate_2op_8byte(_op, _src, _dst, _eflags, _qx, _qy) \
- do { \
- __asm__ __volatile__ ( \
- _PRE_EFLAGS("0", "4", "2") \
- _op"q %"_qx"3,%1; " \
- _POST_EFLAGS("0", "4", "2") \
- : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \
- : _qy ((_src).val), "i" (EFLAGS_MASK)); \
+#define emulate_2op_cl(_op, _cl, _src, _dst, _eflags) \
+ do { \
+ switch ((_dst).bytes) { \
+ case 2: \
+ __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \
+ "w", unsigned short); \
+ break; \
+ case 4: \
+ __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \
+ "l", unsigned int); \
+ break; \
+ case 8: \
+ ON64(__emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \
+ "q", unsigned long)); \
+ break; \
+ } \
} while (0)
-#define __emulate_1op_8byte(_op, _dst, _eflags) \
- do { \
- __asm__ __volatile__ ( \
- _PRE_EFLAGS("0", "3", "2") \
- _op"q %1; " \
- _POST_EFLAGS("0", "3", "2") \
- : "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \
- : "i" (EFLAGS_MASK)); \
+#define __emulate_1op(_op, _dst, _eflags, _suffix) \
+ do { \
+ unsigned long _tmp; \
+ \
+ __asm__ __volatile__ ( \
+ _PRE_EFLAGS("0", "3", "2") \
+ _op _suffix " %1; " \
+ _POST_EFLAGS("0", "3", "2") \
+ : "=m" (_eflags), "+m" ((_dst).val), \
+ "=&r" (_tmp) \
+ : "i" (EFLAGS_MASK)); \
} while (0)
-#elif defined(__i386__)
-#define __emulate_2op_8byte(_op, _src, _dst, _eflags, _qx, _qy)
-#define __emulate_1op_8byte(_op, _dst, _eflags)
-#endif /* __i386__ */
+/* Instruction has only one explicit operand (no source operand). */
+#define emulate_1op(_op, _dst, _eflags) \
+ do { \
+ switch ((_dst).bytes) { \
+ case 1: __emulate_1op(_op, _dst, _eflags, "b"); break; \
+ case 2: __emulate_1op(_op, _dst, _eflags, "w"); break; \
+ case 4: __emulate_1op(_op, _dst, _eflags, "l"); break; \
+ case 8: ON64(__emulate_1op(_op, _dst, _eflags, "q")); break; \
+ } \
+ } while (0)
/* Fetch next part of the instruction being emulated. */
#define insn_fetch(_type, _size, _eip) \
@@ -1041,6 +1049,33 @@ done_prefixes:
c->src.bytes = 1;
c->src.val = insn_fetch(s8, 1, c->eip);
break;
+ case SrcOne:
+ c->src.bytes = 1;
+ c->src.val = 1;
+ break;
+ }
+
+ /*
+ * Decode and fetch the second source operand: register, memory
+ * or immediate.
+ */
+ switch (c->d & Src2Mask) {
+ case Src2None:
+ break;
+ case Src2CL:
+ c->src2.bytes = 1;
+ c->src2.val = c->regs[VCPU_REGS_RCX] & 0x8;
+ break;
+ case Src2ImmByte:
+ c->src2.type = OP_IMM;
+ c->src2.ptr = (unsigned long *)c->eip;
+ c->src2.bytes = 1;
+ c->src2.val = insn_fetch(u8, 1, c->eip);
+ break;
+ case Src2One:
+ c->src2.bytes = 1;
+ c->src2.val = 1;
+ break;
}
/* Decode and fetch the destination operand: register or memory. */
@@ -1100,20 +1135,33 @@ static inline void emulate_push(struct x86_emulate_ctxt *ctxt)
c->regs[VCPU_REGS_RSP]);
}
-static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops)
+static int emulate_pop(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops)
{
struct decode_cache *c = &ctxt->decode;
int rc;
- rc = ops->read_std(register_address(c, ss_base(ctxt),
- c->regs[VCPU_REGS_RSP]),
- &c->dst.val, c->dst.bytes, ctxt->vcpu);
+ rc = ops->read_emulated(register_address(c, ss_base(ctxt),
+ c->regs[VCPU_REGS_RSP]),
+ &c->src.val, c->src.bytes, ctxt->vcpu);
if (rc != 0)
return rc;
- register_address_increment(c, &c->regs[VCPU_REGS_RSP], c->dst.bytes);
+ register_address_increment(c, &c->regs[VCPU_REGS_RSP], c->src.bytes);
+ return rc;
+}
+
+static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops)
+{
+ struct decode_cache *c = &ctxt->decode;
+ int rc;
+ c->src.bytes = c->dst.bytes;
+ rc = emulate_pop(ctxt, ops);
+ if (rc != 0)
+ return rc;
+ c->dst.val = c->src.val;
return 0;
}
@@ -1415,24 +1463,15 @@ special_insn:
emulate_1op("dec", c->dst, ctxt->eflags);
break;
case 0x50 ... 0x57: /* push reg */
- c->dst.type = OP_MEM;
- c->dst.bytes = c->op_bytes;
- c->dst.val = c->src.val;
- register_address_increment(c, &c->regs[VCPU_REGS_RSP],
- -c->op_bytes);
- c->dst.ptr = (void *) register_address(
- c, ss_base(ctxt), c->regs[VCPU_REGS_RSP]);
+ emulate_push(ctxt);
break;
case 0x58 ... 0x5f: /* pop reg */
pop_instruction:
- if ((rc = ops->read_std(register_address(c, ss_base(ctxt),
- c->regs[VCPU_REGS_RSP]), c->dst.ptr,
- c->op_bytes, ctxt->vcpu)) != 0)
+ c->src.bytes = c->op_bytes;
+ rc = emulate_pop(ctxt, ops);
+ if (rc != 0)
goto done;
-
- register_address_increment(c, &c->regs[VCPU_REGS_RSP],
- c->op_bytes);
- c->dst.type = OP_NONE; /* Disable writeback. */
+ c->dst.val = c->src.val;
break;
case 0x63: /* movsxd */
if (ctxt->mode != X86EMUL_MODE_PROT64)
@@ -1591,7 +1630,9 @@ special_insn:
emulate_push(ctxt);
break;
case 0x9d: /* popf */
+ c->dst.type = OP_REG;
c->dst.ptr = (unsigned long *) &ctxt->eflags;
+ c->dst.bytes = c->op_bytes;
goto pop_instruction;
case 0xa0 ... 0xa1: /* mov */
c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX];
@@ -1689,7 +1730,9 @@ special_insn:
emulate_grp2(ctxt);
break;
case 0xc3: /* ret */
+ c->dst.type = OP_REG;
c->dst.ptr = &c->eip;
+ c->dst.bytes = c->op_bytes;
goto pop_instruction;
case 0xc6 ... 0xc7: /* mov (sole member of Grp11) */
mov:
@@ -1778,7 +1821,7 @@ special_insn:
c->eip = saved_eip;
goto cannot_emulate;
}
- return 0;
+ break;
case 0xf4: /* hlt */
ctxt->vcpu->arch.halt_request = 1;
break;
@@ -1999,12 +2042,20 @@ twobyte_insn:
c->src.val &= (c->dst.bytes << 3) - 1;
emulate_2op_SrcV_nobyte("bt", c->src, c->dst, ctxt->eflags);
break;
+ case 0xa4: /* shld imm8, r, r/m */
+ case 0xa5: /* shld cl, r, r/m */
+ emulate_2op_cl("shld", c->src2, c->src, c->dst, ctxt->eflags);
+ break;
case 0xab:
bts: /* bts */
/* only subword offset */
c->src.val &= (c->dst.bytes << 3) - 1;
emulate_2op_SrcV_nobyte("bts", c->src, c->dst, ctxt->eflags);
break;
+ case 0xac: /* shrd imm8, r, r/m */
+ case 0xad: /* shrd cl, r, r/m */
+ emulate_2op_cl("shrd", c->src2, c->src, c->dst, ctxt->eflags);
+ break;
case 0xae: /* clflush */
break;
case 0xb0 ... 0xb1: /* cmpxchg */
diff --git a/arch/x86/mach-default/setup.c b/arch/x86/mach-default/setup.c
index 37b9ae4d44c5..df167f265622 100644
--- a/arch/x86/mach-default/setup.c
+++ b/arch/x86/mach-default/setup.c
@@ -133,29 +133,28 @@ void __init time_init_hook(void)
**/
void mca_nmi_hook(void)
{
- /* If I recall correctly, there's a whole bunch of other things that
+ /*
+ * If I recall correctly, there's a whole bunch of other things that
* we can do to check for NMI problems, but that's all I know about
* at the moment.
*/
-
- printk("NMI generated from unknown source!\n");
+ pr_warning("NMI generated from unknown source!\n");
}
#endif
static __init int no_ipi_broadcast(char *str)
{
get_option(&str, &no_broadcast);
- printk ("Using %s mode\n", no_broadcast ? "No IPI Broadcast" :
- "IPI Broadcast");
+ pr_info("Using %s mode\n",
+ no_broadcast ? "No IPI Broadcast" : "IPI Broadcast");
return 1;
}
-
__setup("no_ipi_broadcast=", no_ipi_broadcast);
static int __init print_ipi_mode(void)
{
- printk ("Using IPI %s mode\n", no_broadcast ? "No-Shortcut" :
- "Shortcut");
+ pr_info("Using IPI %s mode\n",
+ no_broadcast ? "No-Shortcut" : "Shortcut");
return 0;
}
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 8655b5bb0963..f99a6c6c432e 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -435,8 +435,12 @@ static void __init set_highmem_pages_init(void)
#endif /* !CONFIG_NUMA */
#else
-# define permanent_kmaps_init(pgd_base) do { } while (0)
-# define set_highmem_pages_init() do { } while (0)
+static inline void permanent_kmaps_init(pgd_t *pgd_base)
+{
+}
+static inline void set_highmem_pages_init(void)
+{
+}
#endif /* CONFIG_HIGHMEM */
void __init native_pagetable_setup_start(pgd_t *base)
diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
index 1d88d2b39771..9e5752fe4d15 100644
--- a/arch/x86/pci/acpi.c
+++ b/arch/x86/pci/acpi.c
@@ -4,7 +4,7 @@
#include <linux/irq.h>
#include <linux/dmi.h>
#include <asm/numa.h>
-#include "pci.h"
+#include <asm/pci_x86.h>
struct pci_root_info {
char *name;
diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c
index 22e057665e55..9bb09823b362 100644
--- a/arch/x86/pci/amd_bus.c
+++ b/arch/x86/pci/amd_bus.c
@@ -2,7 +2,7 @@
#include <linux/pci.h>
#include <linux/topology.h>
#include <linux/cpu.h>
-#include "pci.h"
+#include <asm/pci_x86.h>
#ifdef CONFIG_X86_64
#include <asm/pci-direct.h>
diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
index bb1a01f089e2..62ddb73e09ed 100644
--- a/arch/x86/pci/common.c
+++ b/arch/x86/pci/common.c
@@ -14,8 +14,7 @@
#include <asm/segment.h>
#include <asm/io.h>
#include <asm/smp.h>
-
-#include "pci.h"
+#include <asm/pci_x86.h>
unsigned int pci_probe = PCI_PROBE_BIOS | PCI_PROBE_CONF1 | PCI_PROBE_CONF2 |
PCI_PROBE_MMCONF;
diff --git a/arch/x86/pci/direct.c b/arch/x86/pci/direct.c
index 9a5af6c8fbe9..bd13c3e4c6db 100644
--- a/arch/x86/pci/direct.c
+++ b/arch/x86/pci/direct.c
@@ -5,7 +5,7 @@
#include <linux/pci.h>
#include <linux/init.h>
#include <linux/dmi.h>
-#include "pci.h"
+#include <asm/pci_x86.h>
/*
* Functions for accessing PCI base (first 256 bytes) and extended
diff --git a/arch/x86/pci/early.c b/arch/x86/pci/early.c
index 86631ccbc25a..f6adf2c6d751 100644
--- a/arch/x86/pci/early.c
+++ b/arch/x86/pci/early.c
@@ -2,7 +2,7 @@
#include <linux/pci.h>
#include <asm/pci-direct.h>
#include <asm/io.h>
-#include "pci.h"
+#include <asm/pci_x86.h>
/* Direct PCI access. This is used for PCI accesses in early boot before
the PCI subsystem works. */
diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c
index 2051dc96b8e9..7d388d5cf548 100644
--- a/arch/x86/pci/fixup.c
+++ b/arch/x86/pci/fixup.c
@@ -6,8 +6,7 @@
#include <linux/dmi.h>
#include <linux/pci.h>
#include <linux/init.h>
-#include "pci.h"
-
+#include <asm/pci_x86.h>
static void __devinit pci_fixup_i450nx(struct pci_dev *d)
{
diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c
index 844df0cbbd3e..e51bf2cda4b0 100644
--- a/arch/x86/pci/i386.c
+++ b/arch/x86/pci/i386.c
@@ -34,8 +34,8 @@
#include <asm/pat.h>
#include <asm/e820.h>
+#include <asm/pci_x86.h>
-#include "pci.h"
static int
skip_isa_ioresource_align(struct pci_dev *dev) {
diff --git a/arch/x86/pci/init.c b/arch/x86/pci/init.c
index d6c950f81858..bec3b048e72b 100644
--- a/arch/x86/pci/init.c
+++ b/arch/x86/pci/init.c
@@ -1,6 +1,6 @@
#include <linux/pci.h>
#include <linux/init.h>
-#include "pci.h"
+#include <asm/pci_x86.h>
/* arch_initcall has too random ordering, so call the initializers
in the right sequence from here. */
diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c
index bf69dbe08bff..373b9afe6d44 100644
--- a/arch/x86/pci/irq.c
+++ b/arch/x86/pci/irq.c
@@ -16,8 +16,7 @@
#include <asm/io_apic.h>
#include <linux/irq.h>
#include <linux/acpi.h>
-
-#include "pci.h"
+#include <asm/pci_x86.h>
#define PIRQ_SIGNATURE (('$' << 0) + ('P' << 8) + ('I' << 16) + ('R' << 24))
#define PIRQ_VERSION 0x0100
diff --git a/arch/x86/pci/legacy.c b/arch/x86/pci/legacy.c
index b722dd481b39..f1065b129e9c 100644
--- a/arch/x86/pci/legacy.c
+++ b/arch/x86/pci/legacy.c
@@ -3,7 +3,7 @@
*/
#include <linux/init.h>
#include <linux/pci.h>
-#include "pci.h"
+#include <asm/pci_x86.h>
/*
* Discover remaining PCI buses in case there are peer host bridges.
diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c
index 654a2234f8f3..89bf9242c80a 100644
--- a/arch/x86/pci/mmconfig-shared.c
+++ b/arch/x86/pci/mmconfig-shared.c
@@ -15,8 +15,7 @@
#include <linux/acpi.h>
#include <linux/bitmap.h>
#include <asm/e820.h>
-
-#include "pci.h"
+#include <asm/pci_x86.h>
/* aperture is up to 256MB but BIOS may reserve less */
#define MMCONFIG_APER_MIN (2 * 1024*1024)
diff --git a/arch/x86/pci/mmconfig_32.c b/arch/x86/pci/mmconfig_32.c
index f3c761dce695..8b2d561046a3 100644
--- a/arch/x86/pci/mmconfig_32.c
+++ b/arch/x86/pci/mmconfig_32.c
@@ -13,7 +13,7 @@
#include <linux/init.h>
#include <linux/acpi.h>
#include <asm/e820.h>
-#include "pci.h"
+#include <asm/pci_x86.h>
/* Assume systems with more busses have correct MCFG */
#define mmcfg_virt_addr ((void __iomem *) fix_to_virt(FIX_PCIE_MCFG))
diff --git a/arch/x86/pci/mmconfig_64.c b/arch/x86/pci/mmconfig_64.c
index a1994163c99d..30007ffc8e11 100644
--- a/arch/x86/pci/mmconfig_64.c
+++ b/arch/x86/pci/mmconfig_64.c
@@ -10,8 +10,7 @@
#include <linux/acpi.h>
#include <linux/bitmap.h>
#include <asm/e820.h>
-
-#include "pci.h"
+#include <asm/pci_x86.h>
/* Static virtual mapping of the MMCONFIG aperture */
struct mmcfg_virt {
diff --git a/arch/x86/pci/numaq_32.c b/arch/x86/pci/numaq_32.c
index 1177845d3186..2089354968a2 100644
--- a/arch/x86/pci/numaq_32.c
+++ b/arch/x86/pci/numaq_32.c
@@ -7,7 +7,7 @@
#include <linux/nodemask.h>
#include <mach_apic.h>
#include <asm/mpspec.h>
-#include "pci.h"
+#include <asm/pci_x86.h>
#define XQUAD_PORTIO_BASE 0xfe400000
#define XQUAD_PORTIO_QUAD 0x40000 /* 256k per quad. */
diff --git a/arch/x86/pci/olpc.c b/arch/x86/pci/olpc.c
index e11e9e803d5f..b889d824f7c6 100644
--- a/arch/x86/pci/olpc.c
+++ b/arch/x86/pci/olpc.c
@@ -29,7 +29,7 @@
#include <linux/init.h>
#include <asm/olpc.h>
#include <asm/geode.h>
-#include "pci.h"
+#include <asm/pci_x86.h>
/*
* In the tables below, the first two line (8 longwords) are the
diff --git a/arch/x86/pci/pcbios.c b/arch/x86/pci/pcbios.c
index 37472fc6f729..b82cae970dfd 100644
--- a/arch/x86/pci/pcbios.c
+++ b/arch/x86/pci/pcbios.c
@@ -6,9 +6,8 @@
#include <linux/init.h>
#include <linux/module.h>
#include <linux/uaccess.h>
-#include "pci.h"
-#include "pci-functions.h"
-
+#include <asm/pci_x86.h>
+#include <asm/mach-default/pci-functions.h>
/* BIOS32 signature: "_32_" */
#define BIOS32_SIGNATURE (('_' << 0) + ('3' << 8) + ('2' << 16) + ('_' << 24))
diff --git a/arch/x86/pci/visws.c b/arch/x86/pci/visws.c
index 42f4cb19faca..16d0c0eb0d19 100644
--- a/arch/x86/pci/visws.c
+++ b/arch/x86/pci/visws.c
@@ -9,11 +9,10 @@
#include <linux/init.h>
#include <asm/setup.h>
+#include <asm/pci_x86.h>
#include <asm/visws/cobalt.h>
#include <asm/visws/lithium.h>
-#include "pci.h"
-
static int pci_visws_enable_irq(struct pci_dev *dev) { return 0; }
static void pci_visws_disable_irq(struct pci_dev *dev) { }
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index 65d75a6be0ba..14f240623497 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -132,8 +132,7 @@ static void do_stolen_accounting(void)
*snap = state;
/* Add the appropriate number of ticks of stolen time,
- including any left-overs from last time. Passing NULL to
- account_steal_time accounts the time as stolen. */
+ including any left-overs from last time. */
stolen = runnable + offline + __get_cpu_var(residual_stolen);
if (stolen < 0)
@@ -141,11 +140,10 @@ static void do_stolen_accounting(void)
ticks = iter_div_u64_rem(stolen, NS_PER_TICK, &stolen);
__get_cpu_var(residual_stolen) = stolen;
- account_steal_time(NULL, ticks);
+ account_steal_ticks(ticks);
/* Add the appropriate number of ticks of blocked time,
- including any left-overs from last time. Passing idle to
- account_steal_time accounts the time as idle/wait. */
+ including any left-overs from last time. */
blocked += __get_cpu_var(residual_blocked);
if (blocked < 0)
@@ -153,7 +151,7 @@ static void do_stolen_accounting(void)
ticks = iter_div_u64_rem(blocked, NS_PER_TICK, &blocked);
__get_cpu_var(residual_blocked) = blocked;
- account_steal_time(idle_task(smp_processor_id()), ticks);
+ account_idle_ticks(ticks);
}
/*