Diffstat (limited to 'arch/x86')
174 files changed, 3410 insertions, 2750 deletions
diff --git a/arch/x86/Kbuild b/arch/x86/Kbuild
index ad8ec356fb36..0e103236b754 100644
--- a/arch/x86/Kbuild
+++ b/arch/x86/Kbuild
@@ -14,3 +14,4 @@ obj-y += crypto/
 obj-y += vdso/
 obj-$(CONFIG_IA32_EMULATION) += ia32/
 
+obj-y += platform/
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index dfabfefc21c4..97b528d660ad 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1,6 +1,3 @@
-# x86 configuration
-mainmenu "Linux Kernel Configuration for x86"
-
 # Select 32 or 64 bit
 config 64BIT
 	bool "64-bit kernel" if ARCH = "x86"
@@ -24,7 +21,7 @@ config X86
 	select HAVE_UNSTABLE_SCHED_CLOCK
 	select HAVE_IDE
 	select HAVE_OPROFILE
-	select HAVE_PERF_EVENTS if (!M386 && !M486)
+	select HAVE_PERF_EVENTS
 	select HAVE_IRQ_WORK
 	select HAVE_IOREMAP_PROT
 	select HAVE_KPROBES
@@ -347,6 +344,7 @@ endif
 
 config X86_VSMP
 	bool "ScaleMP vSMP"
+	select PARAVIRT_GUEST
 	select PARAVIRT
 	depends on X86_64 && PCI
 	depends on X86_EXTENDED_PLATFORM
@@ -1143,16 +1141,16 @@ config NUMA
 comment "NUMA (Summit) requires SMP, 64GB highmem support, ACPI"
 	depends on X86_32 && X86_SUMMIT && (!HIGHMEM64G || !ACPI)
 
-config K8_NUMA
+config AMD_NUMA
 	def_bool y
 	prompt "Old style AMD Opteron NUMA detection"
 	depends on X86_64 && NUMA && PCI
 	---help---
-	  Enable K8 NUMA node topology detection.  You should say Y here if
-	  you have a multi processor AMD K8 system. This uses an old
-	  method to read the NUMA configuration directly from the builtin
-	  Northbridge of Opteron. It is recommended to use X86_64_ACPI_NUMA
-	  instead, which also takes priority if both are compiled in.
+	  Enable AMD NUMA node topology detection. You should say Y here if
+	  you have a multi processor AMD system. This uses an old method to
+	  read the NUMA configuration directly from the builtin Northbridge
+	  of Opteron. It is recommended to use X86_64_ACPI_NUMA instead,
+	  which also takes priority if both are compiled in.
 
 config X86_64_ACPI_NUMA
 	def_bool y
@@ -1895,6 +1893,11 @@ config PCI_OLPC
 	def_bool y
 	depends on PCI && OLPC && (PCI_GOOLPC || PCI_GOANY)
 
+config PCI_XEN
+	def_bool y
+	depends on PCI && XEN
+	select SWIOTLB_XEN
+
 config PCI_DOMAINS
 	def_bool y
 	depends on PCI
diff --git a/arch/x86/Makefile_32.cpu b/arch/x86/Makefile_32.cpu
index 1255d953c65d..f2ee1abb1df9 100644
--- a/arch/x86/Makefile_32.cpu
+++ b/arch/x86/Makefile_32.cpu
@@ -51,7 +51,18 @@ cflags-$(CONFIG_X86_GENERIC) 	+= $(call tune,generic,$(call tune,i686))
 # prologue (push %ebp, mov %esp, %ebp) which breaks the function graph
 # tracer assumptions. For i686, generic, core2 this is set by the
 # compiler anyway
-cflags-$(CONFIG_FUNCTION_GRAPH_TRACER) += $(call cc-option,-maccumulate-outgoing-args)
+ifeq ($(CONFIG_FUNCTION_GRAPH_TRACER), y)
+ADD_ACCUMULATE_OUTGOING_ARGS := y
+endif
+
+# Work around to a bug with asm goto with first implementations of it
+# in gcc causing gcc to mess up the push and pop of the stack in some
+# uses of asm goto.
+ifeq ($(CONFIG_JUMP_LABEL), y)
+ADD_ACCUMULATE_OUTGOING_ARGS := y
+endif
+
+cflags-$(ADD_ACCUMULATE_OUTGOING_ARGS) += $(call cc-option,-maccumulate-outgoing-args)
 
 # Bug fix for binutils: this option is required in order to keep
 # binutils from generating NOPL instructions against our will.
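The CONFIG_JUMP_LABEL hunk above works around early gcc `asm goto` miscompilation by forcing -maccumulate-outgoing-args. For readers unfamiliar with the construct, a minimal sketch of the jump-label-style `asm goto` pattern that triggered the bug is shown below; this is an illustration, not code from this patch, and the 5-byte NOP encoding is just one common choice.

/* Illustrative only: the asm goto pattern used by jump labels (gcc >= 4.5). */
static inline bool my_static_branch(void)
{
	asm goto("1: .byte 0x0f, 0x1f, 0x44, 0x00, 0x00\n"	/* 5-byte NOP */
		 /* runtime patching may turn the NOP into a jmp to l_yes */
		 : : : : l_yes);
	return false;	/* branch currently disabled */
l_yes:
	return true;	/* reached only once the site is patched */
}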
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index 52f85a196fa0..35af09d13dc1 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -182,7 +182,7 @@ no_longmode:
 	hlt
 	jmp     1b
 
-#include "../../kernel/verify_cpu_64.S"
+#include "../../kernel/verify_cpu.S"
 
 /*
  * Be careful here startup_64 needs to be at a predictable
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index 23f315c9f215..325c05294fc4 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -355,7 +355,7 @@ asmlinkage void decompress_kernel(void *rmode, memptr heap,
 	if (heap > 0x3fffffffffffUL)
 		error("Destination address too large");
 #else
-	if (heap > ((-__PAGE_OFFSET-(512<<20)-1) & 0x7fffffff))
+	if (heap > ((-__PAGE_OFFSET-(128<<20)-1) & 0x7fffffff))
 		error("Destination address too large");
 #endif
 #ifndef CONFIG_RELOCATABLE
diff --git a/arch/x86/crypto/ghash-clmulni-intel_glue.c b/arch/x86/crypto/ghash-clmulni-intel_glue.c
index cbcc8d8ea93a..7a6e68e4f748 100644
--- a/arch/x86/crypto/ghash-clmulni-intel_glue.c
+++ b/arch/x86/crypto/ghash-clmulni-intel_glue.c
@@ -10,6 +10,7 @@
  * by the Free Software Foundation.
  */
 
+#include <linux/err.h>
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/kernel.h>
diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c
index 849813f398e7..5852519b2d0f 100644
--- a/arch/x86/ia32/sys_ia32.c
+++ b/arch/x86/ia32/sys_ia32.c
@@ -28,7 +28,6 @@
 #include <linux/syscalls.h>
 #include <linux/times.h>
 #include <linux/utsname.h>
-#include <linux/smp_lock.h>
 #include <linux/mm.h>
 #include <linux/uio.h>
 #include <linux/poll.h>
diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h
index 92091de11113..55d106b5e31b 100644
--- a/arch/x86/include/asm/acpi.h
+++ b/arch/x86/include/asm/acpi.h
@@ -93,6 +93,9 @@ extern u8 acpi_sci_flags;
 extern int acpi_sci_override_gsi;
 void acpi_pic_sci_set_trigger(unsigned int, u16);
 
+extern int (*__acpi_register_gsi)(struct device *dev, u32 gsi,
+				  int trigger, int polarity);
+
 static inline void disable_acpi(void)
 {
 	acpi_disabled = 1;
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index 76561d20ea2f..4a2adaa9aefc 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -180,8 +180,15 @@ extern void *text_poke_early(void *addr, const void *opcode, size_t len);
 * On the local CPU you need to be protected again NMI or MCE handlers seeing an
 * inconsistent instruction while you patch.
 */
+struct text_poke_param {
+	void *addr;
+	const void *opcode;
+	size_t len;
+};
+
 extern void *text_poke(void *addr, const void *opcode, size_t len);
 extern void *text_poke_smp(void *addr, const void *opcode, size_t len);
+extern void text_poke_smp_batch(struct text_poke_param *params, int n);
 
 #if defined(CONFIG_DYNAMIC_FTRACE) || defined(HAVE_JUMP_LABEL)
 #define IDEAL_NOP_SIZE_5 5
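A minimal sketch of how a caller might use the new batch interface declared above, so several sites are patched under a single stop_machine() round (struct and function names come from the header; the call site itself is hypothetical):

static void patch_two_sites(void *addr1, const void *op1, size_t len1,
			    void *addr2, const void *op2, size_t len2)
{
	struct text_poke_param params[] = {
		{ .addr = addr1, .opcode = op1, .len = len1 },
		{ .addr = addr2, .opcode = op2, .len = len2 },
	};

	text_poke_smp_batch(params, ARRAY_SIZE(params));
}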
diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h
index c8517f81b21e..6aee50d655d1 100644
--- a/arch/x86/include/asm/amd_nb.h
+++ b/arch/x86/include/asm/amd_nb.h
@@ -3,36 +3,53 @@
 
 #include <linux/pci.h>
 
-extern struct pci_device_id k8_nb_ids[];
+extern struct pci_device_id amd_nb_misc_ids[];
 struct bootnode;
 
-extern int early_is_k8_nb(u32 value);
-extern int cache_k8_northbridges(void);
-extern void k8_flush_garts(void);
-extern int k8_get_nodes(struct bootnode *nodes);
-extern int k8_numa_init(unsigned long start_pfn, unsigned long end_pfn);
-extern int k8_scan_nodes(void);
+extern int early_is_amd_nb(u32 value);
+extern int amd_cache_northbridges(void);
+extern void amd_flush_garts(void);
+extern int amd_get_nodes(struct bootnode *nodes);
+extern int amd_numa_init(unsigned long start_pfn, unsigned long end_pfn);
+extern int amd_scan_nodes(void);
 
-struct k8_northbridge_info {
+struct amd_northbridge {
+	struct pci_dev *misc;
+};
+
+struct amd_northbridge_info {
 	u16 num;
-	u8 gart_supported;
-	struct pci_dev **nb_misc;
+	u64 flags;
+	struct amd_northbridge *nb;
 };
-extern struct k8_northbridge_info k8_northbridges;
+extern struct amd_northbridge_info amd_northbridges;
+
+#define AMD_NB_GART			0x1
+#define AMD_NB_L3_INDEX_DISABLE		0x2
 
 #ifdef CONFIG_AMD_NB
 
-static inline struct pci_dev *node_to_k8_nb_misc(int node)
+static inline int amd_nb_num(void)
 {
-	return (node < k8_northbridges.num) ? k8_northbridges.nb_misc[node] : NULL;
+	return amd_northbridges.num;
 }
 
-#else
+static inline int amd_nb_has_feature(int feature)
+{
+	return ((amd_northbridges.flags & feature) == feature);
+}
 
-static inline struct pci_dev *node_to_k8_nb_misc(int node)
+static inline struct amd_northbridge *node_to_amd_nb(int node)
 {
-	return NULL;
+	return (node < amd_northbridges.num) ? &amd_northbridges.nb[node] : NULL;
 }
+
+#else
+
+#define amd_nb_num(x)		0
+#define amd_nb_has_feature(x)	false
+#define node_to_amd_nb(x)	NULL
+
 #endif
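Usage sketch for the renamed northbridge helpers above (hypothetical caller, assuming CONFIG_AMD_NB): node_to_amd_nb() replaces node_to_k8_nb_misc(), with the "misc" PCI device now one field of struct amd_northbridge and GART support expressed as a feature flag.

static struct pci_dev *nb_misc_dev_for_node(int node)
{
	struct amd_northbridge *nb = node_to_amd_nb(node);

	if (!nb || !amd_nb_has_feature(AMD_NB_GART))
		return NULL;		/* no NB for this node, or no GART */

	return nb->misc;		/* northbridge "misc" PCI function */
}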
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 286de34b0ed6..cf12007796db 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -141,13 +141,13 @@ static inline void native_apic_msr_write(u32 reg, u32 v)
 
 static inline u32 native_apic_msr_read(u32 reg)
 {
-	u32 low, high;
+	u64 msr;
 
 	if (reg == APIC_DFR)
 		return -1;
 
-	rdmsr(APIC_BASE_MSR + (reg >> 4), low, high);
-	return low;
+	rdmsrl(APIC_BASE_MSR + (reg >> 4), msr);
+	return (u32)msr;
 }
 
 static inline void native_x2apic_wait_icr_idle(void)
@@ -181,12 +181,12 @@ extern void enable_x2apic(void);
 extern void x2apic_icr_write(u32 low, u32 id);
 static inline int x2apic_enabled(void)
 {
-	int msr, msr2;
+	u64 msr;
 
 	if (!cpu_has_x2apic)
 		return 0;
 
-	rdmsr(MSR_IA32_APICBASE, msr, msr2);
+	rdmsrl(MSR_IA32_APICBASE, msr);
 	if (msr & X2APIC_ENABLE)
 		return 1;
 	return 0;
@@ -238,6 +238,7 @@ extern void setup_boot_APIC_clock(void);
 extern void setup_secondary_APIC_clock(void);
 extern int APIC_init_uniprocessor(void);
 extern void enable_NMI_through_LVT0(void);
+extern int apic_force_enable(void);
 
 /*
  * On 32bit this is mach-xxx local
diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h
index a859ca461fb0..47a30ff8e517 100644
--- a/arch/x86/include/asm/apicdef.h
+++ b/arch/x86/include/asm/apicdef.h
@@ -145,6 +145,7 @@
 
 #ifdef CONFIG_X86_32
 # define MAX_IO_APICS 64
+# define MAX_LOCAL_APIC 256
 #else
 # define MAX_IO_APICS 128
 # define MAX_LOCAL_APIC 32768
diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h
index 5be1542fbfaf..e99d55d74df5 100644
--- a/arch/x86/include/asm/e820.h
+++ b/arch/x86/include/asm/e820.h
@@ -72,6 +72,9 @@ struct e820map {
 #define BIOS_BEGIN		0x000a0000
 #define BIOS_END		0x00100000
 
+#define BIOS_ROM_BASE		0xffe00000
+#define BIOS_ROM_END		0xffffffff
+
 #ifdef __KERNEL__
 /* see comment in arch/x86/kernel/e820.c */
 extern struct e820map e820;
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index 4d293dced62f..9479a037419f 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -216,8 +216,8 @@ static inline unsigned long virt_to_fix(const unsigned long vaddr)
 }
 
 /* Return an pointer with offset calculated */
-static inline unsigned long __set_fixmap_offset(enum fixed_addresses idx,
-				phys_addr_t phys, pgprot_t flags)
+static __always_inline unsigned long
+__set_fixmap_offset(enum fixed_addresses idx, phys_addr_t phys, pgprot_t flags)
 {
 	__set_fixmap(idx, phys, flags);
 	return fix_to_virt(idx) + (phys & (PAGE_SIZE - 1));
diff --git a/arch/x86/include/asm/highmem.h b/arch/x86/include/asm/highmem.h
index 8caac76ac324..3bd04022fd0c 100644
--- a/arch/x86/include/asm/highmem.h
+++ b/arch/x86/include/asm/highmem.h
@@ -59,11 +59,12 @@ extern void kunmap_high(struct page *page);
 
 void *kmap(struct page *page);
 void kunmap(struct page *page);
-void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot);
-void *kmap_atomic(struct page *page, enum km_type type);
-void kunmap_atomic_notypecheck(void *kvaddr, enum km_type type);
-void *kmap_atomic_pfn(unsigned long pfn, enum km_type type);
-void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot);
+
+void *kmap_atomic_prot(struct page *page, pgprot_t prot);
+void *__kmap_atomic(struct page *page);
+void __kunmap_atomic(void *kvaddr);
+void *kmap_atomic_pfn(unsigned long pfn);
+void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot);
 struct page *kmap_atomic_to_page(void *ptr);
 
 #define flush_cache_kmaps()	do { } while (0)
diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
index f0203f4791a8..072273082528 100644
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@ -41,6 +41,8 @@
 #include <asm-generic/int-ll64.h>
 #include <asm/page.h>
 
+#include <xen/xen.h>
+
 #define build_mmio_read(name, size, type, reg, barrier) \
 static inline type name(const volatile void __iomem *addr) \
 { type ret; asm volatile("mov" size " %1,%0":reg (ret) \
@@ -351,6 +353,17 @@ extern void early_iounmap(void __iomem *addr, unsigned long size);
 extern void fixup_early_ioremap(void);
 extern bool is_early_ioremap_ptep(pte_t *ptep);
 
+#ifdef CONFIG_XEN
+struct bio_vec;
+
+extern bool xen_biovec_phys_mergeable(const struct bio_vec *vec1,
+				      const struct bio_vec *vec2);
+
+#define BIOVEC_PHYS_MERGEABLE(vec1, vec2)				\
+	(__BIOVEC_PHYS_MERGEABLE(vec1, vec2) &&				\
+	 (!xen_domain() || xen_biovec_phys_mergeable(vec1, vec2)))
+#endif	/* CONFIG_XEN */
+
 #define IO_SPACE_LIMIT 0xffff
 
 #endif /* _ASM_X86_IO_H */
diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h
index c8be4566c3d2..0c5ca4e30d7b 100644
--- a/arch/x86/include/asm/io_apic.h
+++ b/arch/x86/include/asm/io_apic.h
@@ -159,7 +159,7 @@ struct io_apic_irq_attr;
 extern int io_apic_set_pci_routing(struct device *dev, int irq,
		 struct io_apic_irq_attr *irq_attr);
 void setup_IO_APIC_irq_extra(u32 gsi);
-extern void ioapic_init_mappings(void);
+extern void ioapic_and_gsi_init(void);
 extern void ioapic_insert_resources(void);
 
 extern struct IO_APIC_route_entry **alloc_ioapic_entries(void);
@@ -168,9 +168,9 @@ extern int save_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries);
 extern void mask_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries);
 extern int restore_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries);
 
-extern void probe_nr_irqs_gsi(void);
-
+extern int get_nr_irqs_gsi(void);
 extern void setup_ioapic_ids_from_mpc(void);
+extern void setup_ioapic_ids_from_mpc_nocheck(void);
 
 struct mp_ioapic_gsi{
 	u32 gsi_base;
@@ -188,9 +188,8 @@ extern void __init pre_init_apic_IRQ0(void);
 #define io_apic_assign_pci_irqs 0
 #define setup_ioapic_ids_from_mpc x86_init_noop
 static const int timer_through_8259 = 0;
-static inline void ioapic_init_mappings(void)	{ }
+static inline void ioapic_and_gsi_init(void) { }
 static inline void ioapic_insert_resources(void) { }
-static inline void probe_nr_irqs_gsi(void)	{ }
 
 #define gsi_top (NR_IRQS_LEGACY)
 static inline int mp_find_ioapic(u32 gsi) { return 0; }
diff --git a/arch/x86/include/asm/iomap.h b/arch/x86/include/asm/iomap.h
index c4191b3b7056..363e33eb6ec1 100644
--- a/arch/x86/include/asm/iomap.h
+++ b/arch/x86/include/asm/iomap.h
@@ -27,10 +27,10 @@
 #include <asm/tlbflush.h>
 
 void __iomem *
-iomap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot);
+iomap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot);
 
 void
-iounmap_atomic(void __iomem *kvaddr, enum km_type type);
+iounmap_atomic(void __iomem *kvaddr);
 
 int
 iomap_create_wc(resource_size_t base, unsigned long size, pgprot_t *prot);
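The highmem.h and iomap.h hunks above drop the km_type argument; atomic kmaps become a small per-CPU stack instead of fixed slots. A before/after sketch of a typical caller (hypothetical, assuming the generic kmap_atomic()/kunmap_atomic() wrappers from the same kernel series):

static void copy_from_highpage(struct page *page, void *dst, size_t len)
{
	char *src = kmap_atomic(page);	/* was: kmap_atomic(page, KM_USER0) */

	memcpy(dst, src, len);
	kunmap_atomic(src);		/* was: kunmap_atomic(src, KM_USER0) */
}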
diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h
index 0bf5b0083650..ba870bb6dd8e 100644
--- a/arch/x86/include/asm/irq.h
+++ b/arch/x86/include/asm/irq.h
@@ -15,16 +15,10 @@ static inline int irq_canonicalize(int irq)
 	return ((irq == 2) ? 9 : irq);
 }
 
-#ifdef CONFIG_X86_LOCAL_APIC
-# define ARCH_HAS_NMI_WATCHDOG
-#endif
-
 #ifdef CONFIG_X86_32
 extern void irq_ctx_init(int cpu);
-extern void irq_ctx_exit(int cpu);
 #else
 # define irq_ctx_init(cpu) do { } while (0)
-# define irq_ctx_exit(cpu) do { } while (0)
 #endif
 
 #define __ARCH_HAS_DO_SOFTIRQ
diff --git a/arch/x86/include/asm/kdebug.h b/arch/x86/include/asm/kdebug.h
index 5bdfca86581b..f23eb2528464 100644
--- a/arch/x86/include/asm/kdebug.h
+++ b/arch/x86/include/asm/kdebug.h
@@ -28,7 +28,7 @@ extern void die(const char *, struct pt_regs *,long);
 extern int __must_check __die(const char *, struct pt_regs *, long);
 extern void show_registers(struct pt_regs *regs);
 extern void show_trace(struct task_struct *t, struct pt_regs *regs,
-		       unsigned long *sp, unsigned long bp);
+		       unsigned long *sp);
 extern void __show_regs(struct pt_regs *regs, int all);
 extern void show_regs(struct pt_regs *regs);
 extern unsigned long oops_begin(void);
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 9e6fe391094e..f702f82aa1eb 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -79,7 +79,7 @@
 #define KVM_NUM_MMU_PAGES (1 << KVM_MMU_HASH_SHIFT)
 #define KVM_MIN_FREE_MMU_PAGES 5
 #define KVM_REFILL_PAGES 25
-#define KVM_MAX_CPUID_ENTRIES 40
+#define KVM_MAX_CPUID_ENTRIES 80
 #define KVM_NR_FIXED_MTRR_REGION 88
 #define KVM_NR_VAR_MTRR 8
diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h
index c82868e9f905..0c90dd9f0505 100644
--- a/arch/x86/include/asm/mpspec.h
+++ b/arch/x86/include/asm/mpspec.h
@@ -5,8 +5,9 @@
 
 #include <asm/mpspec_def.h>
 #include <asm/x86_init.h>
+#include <asm/apicdef.h>
 
-extern int apic_version[MAX_APICS];
+extern int apic_version[];
 extern int pic_mode;
 
 #ifdef CONFIG_X86_32
@@ -107,7 +108,7 @@ extern int mp_register_gsi(struct device *dev, u32 gsi, int edge_level,
			   int active_high_low);
 #endif /* CONFIG_ACPI */
 
-#define PHYSID_ARRAY_SIZE	BITS_TO_LONGS(MAX_APICS)
+#define PHYSID_ARRAY_SIZE	BITS_TO_LONGS(MAX_LOCAL_APIC)
 
 struct physid_mask {
 	unsigned long mask[PHYSID_ARRAY_SIZE];
@@ -122,31 +123,31 @@ typedef struct physid_mask physid_mask_t;
 	test_and_set_bit(physid, (map).mask)
 
 #define physids_and(dst, src1, src2)					\
-	bitmap_and((dst).mask, (src1).mask, (src2).mask, MAX_APICS)
+	bitmap_and((dst).mask, (src1).mask, (src2).mask, MAX_LOCAL_APIC)
 
 #define physids_or(dst, src1, src2)					\
-	bitmap_or((dst).mask, (src1).mask, (src2).mask, MAX_APICS)
+	bitmap_or((dst).mask, (src1).mask, (src2).mask, MAX_LOCAL_APIC)
 
 #define physids_clear(map)					\
-	bitmap_zero((map).mask, MAX_APICS)
+	bitmap_zero((map).mask, MAX_LOCAL_APIC)
 
 #define physids_complement(dst, src)				\
-	bitmap_complement((dst).mask, (src).mask, MAX_APICS)
+	bitmap_complement((dst).mask, (src).mask, MAX_LOCAL_APIC)
 
 #define physids_empty(map)					\
-	bitmap_empty((map).mask, MAX_APICS)
+	bitmap_empty((map).mask, MAX_LOCAL_APIC)
 
 #define physids_equal(map1, map2)				\
-	bitmap_equal((map1).mask, (map2).mask, MAX_APICS)
+	bitmap_equal((map1).mask, (map2).mask, MAX_LOCAL_APIC)
 
 #define physids_weight(map)					\
-	bitmap_weight((map).mask, MAX_APICS)
+	bitmap_weight((map).mask, MAX_LOCAL_APIC)
 
 #define physids_shift_right(d, s, n)				\
-	bitmap_shift_right((d).mask, (s).mask, n, MAX_APICS)
+	bitmap_shift_right((d).mask, (s).mask, n, MAX_LOCAL_APIC)
 
 #define physids_shift_left(d, s, n)				\
-	bitmap_shift_left((d).mask, (s).mask, n, MAX_APICS)
+	bitmap_shift_left((d).mask, (s).mask, n, MAX_LOCAL_APIC)
 
 static inline unsigned long physids_coerce(physid_mask_t *map)
 {
@@ -159,14 +160,6 @@ static inline void physids_promote(unsigned long physids, physid_mask_t *map)
 	map->mask[0] = physids;
 }
 
-/* Note: will create very large stack frames if physid_mask_t is big */
-#define physid_mask_of_physid(physid)					\
-	({								\
-		physid_mask_t __physid_mask = PHYSID_MASK_NONE;		\
-		physid_set(physid, __physid_mask);			\
-		__physid_mask;						\
-	})
-
 static inline void physid_set_mask_of_physid(int physid, physid_mask_t *map)
 {
 	physids_clear(*map);
diff --git a/arch/x86/include/asm/mpspec_def.h b/arch/x86/include/asm/mpspec_def.h
index 4a7f96d7c188..c0a955a9a087 100644
--- a/arch/x86/include/asm/mpspec_def.h
+++ b/arch/x86/include/asm/mpspec_def.h
@@ -15,13 +15,6 @@
 
 #ifdef CONFIG_X86_32
 # define MAX_MPC_ENTRY 1024
-# define MAX_APICS      256
-#else
-# if NR_CPUS <= 255
-#  define MAX_APICS     255
-# else
-#  define MAX_APICS   32768
-# endif
 #endif
 
 /* Intel MP Floating Pointer Structure */
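With physid_mask_of_physid() gone (it returned a physid_mask_t by value, which makes for a huge stack frame now that the mask is sized by MAX_LOCAL_APIC), callers build the mask in place. A hypothetical migration sketch:

static void apicid_to_mask(int apicid, physid_mask_t *mask)
{
	/* was: *mask = physid_mask_of_physid(apicid); */
	physid_set_mask_of_physid(apicid, mask);	/* clears, then sets one bit */
}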
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 83c4bb1d917d..86030f63ba02 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -121,13 +121,18 @@
 #define MSR_AMD64_IBSDCLINAD		0xc0011038
 #define MSR_AMD64_IBSDCPHYSAD		0xc0011039
 #define MSR_AMD64_IBSCTL		0xc001103a
+#define MSR_AMD64_IBSBRTARGET		0xc001103b
+
+/* Fam 15h MSRs */
+#define MSR_F15H_PERF_CTL		0xc0010200
+#define MSR_F15H_PERF_CTR		0xc0010201
 
 /* Fam 10h MSRs */
 #define MSR_FAM10H_MMIO_CONF_BASE	0xc0010058
 #define FAM10H_MMIO_CONF_ENABLE		(1<<0)
 #define FAM10H_MMIO_CONF_BUSRANGE_MASK	0xf
 #define FAM10H_MMIO_CONF_BUSRANGE_SHIFT 2
-#define FAM10H_MMIO_CONF_BASE_MASK	0xfffffff
+#define FAM10H_MMIO_CONF_BASE_MASK	0xfffffffULL
 #define FAM10H_MMIO_CONF_BASE_SHIFT	20
 #define MSR_FAM10H_NODE_ID		0xc001100c
diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h
index 932f0f86b4b7..c4021b953510 100644
--- a/arch/x86/include/asm/nmi.h
+++ b/arch/x86/include/asm/nmi.h
@@ -5,41 +5,15 @@
 #include <asm/irq.h>
 #include <asm/io.h>
 
-#ifdef ARCH_HAS_NMI_WATCHDOG
-
-/**
- * do_nmi_callback
- *
- * Check to see if a callback exists and execute it.  Return 1
- * if the handler exists and was handled successfully.
- */
-int do_nmi_callback(struct pt_regs *regs, int cpu);
+#ifdef CONFIG_X86_LOCAL_APIC
 
 extern void die_nmi(char *str, struct pt_regs *regs, int do_panic);
-extern int check_nmi_watchdog(void);
-#if !defined(CONFIG_LOCKUP_DETECTOR)
-extern int nmi_watchdog_enabled;
-#endif
 extern int avail_to_resrv_perfctr_nmi_bit(unsigned int);
 extern int reserve_perfctr_nmi(unsigned int);
 extern void release_perfctr_nmi(unsigned int);
 extern int reserve_evntsel_nmi(unsigned int);
 extern void release_evntsel_nmi(unsigned int);
 
-extern void setup_apic_nmi_watchdog(void *);
-extern void stop_apic_nmi_watchdog(void *);
-extern void disable_timer_nmi_watchdog(void);
-extern void enable_timer_nmi_watchdog(void);
-extern int nmi_watchdog_tick(struct pt_regs *regs, unsigned reason);
-extern void cpu_nmi_set_wd_enabled(void);
-
-extern atomic_t nmi_active;
-extern unsigned int nmi_watchdog;
-#define NMI_NONE	0
-#define NMI_IO_APIC	1
-#define NMI_LOCAL_APIC	2
-#define NMI_INVALID	3
-
 struct ctl_table;
 extern int proc_nmi_enabled(struct ctl_table *, int ,
			void __user *, size_t *, loff_t *);
@@ -47,33 +21,8 @@ extern int unknown_nmi_panic;
 
 void arch_trigger_all_cpu_backtrace(void);
 #define arch_trigger_all_cpu_backtrace arch_trigger_all_cpu_backtrace
-
-static inline void localise_nmi_watchdog(void)
-{
-	if (nmi_watchdog == NMI_IO_APIC)
-		nmi_watchdog = NMI_LOCAL_APIC;
-}
-
-/* check if nmi_watchdog is active (ie was specified at boot) */
-static inline int nmi_watchdog_active(void)
-{
-	/*
-	 * actually it should be:
-	 *	return (nmi_watchdog == NMI_LOCAL_APIC ||
-	 *		nmi_watchdog == NMI_IO_APIC)
-	 * but since they are power of two we could use a
-	 * cheaper way --cvg
-	 */
-	return nmi_watchdog & (NMI_LOCAL_APIC | NMI_IO_APIC);
-}
 #endif
 
-void lapic_watchdog_stop(void);
-int lapic_watchdog_init(unsigned nmi_hz);
-int lapic_wd_event(unsigned nmi_hz);
-unsigned lapic_adjust_nmi_hz(unsigned hz);
-void disable_lapic_nmi_watchdog(void);
-void enable_lapic_nmi_watchdog(void);
 void stop_nmi(void);
 void restart_nmi(void);
diff --git a/arch/x86/include/asm/olpc.h b/arch/x86/include/asm/olpc.h
index 101229b0d8ed..42a978c0c1b3 100644
--- a/arch/x86/include/asm/olpc.h
+++ b/arch/x86/include/asm/olpc.h
@@ -89,6 +89,8 @@ extern int olpc_ec_mask_unset(uint8_t bits);
 /* EC commands */
 
 #define EC_FIRMWARE_REV		0x08
+#define EC_WLAN_ENTER_RESET	0x35
+#define EC_WLAN_LEAVE_RESET	0x25
 
 /* SCI source values */
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 18e3b8a8709f..ef9975812c77 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -824,27 +824,27 @@ static __always_inline void arch_spin_unlock(struct arch_spinlock *lock)
 #define __PV_IS_CALLEE_SAVE(func)			\
 	((struct paravirt_callee_save) { func })
 
-static inline unsigned long arch_local_save_flags(void)
+static inline notrace unsigned long arch_local_save_flags(void)
 {
 	return PVOP_CALLEE0(unsigned long, pv_irq_ops.save_fl);
 }
 
-static inline void arch_local_irq_restore(unsigned long f)
+static inline notrace void arch_local_irq_restore(unsigned long f)
 {
 	PVOP_VCALLEE1(pv_irq_ops.restore_fl, f);
 }
 
-static inline void arch_local_irq_disable(void)
+static inline notrace void arch_local_irq_disable(void)
 {
 	PVOP_VCALLEE0(pv_irq_ops.irq_disable);
 }
 
-static inline void arch_local_irq_enable(void)
+static inline notrace void arch_local_irq_enable(void)
 {
 	PVOP_VCALLEE0(pv_irq_ops.irq_enable);
 }
 
-static inline unsigned long arch_local_irq_save(void)
+static inline notrace unsigned long arch_local_irq_save(void)
 {
 	unsigned long f;
diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h
index d395540ff894..ca0437c714b2 100644
--- a/arch/x86/include/asm/pci.h
+++ b/arch/x86/include/asm/pci.h
@@ -7,6 +7,7 @@
 #include <linux/string.h>
 #include <asm/scatterlist.h>
 #include <asm/io.h>
+#include <asm/x86_init.h>
 
 #ifdef __KERNEL__
 
@@ -94,8 +95,36 @@ static inline void early_quirks(void) { }
 
 extern void pci_iommu_alloc(void);
 
-/* MSI arch hook */
-#define arch_setup_msi_irqs arch_setup_msi_irqs
+#ifdef CONFIG_PCI_MSI
+/* MSI arch specific hooks */
+static inline int x86_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
+{
+	return x86_msi.setup_msi_irqs(dev, nvec, type);
+}
+
+static inline void x86_teardown_msi_irqs(struct pci_dev *dev)
+{
+	x86_msi.teardown_msi_irqs(dev);
+}
+
+static inline void x86_teardown_msi_irq(unsigned int irq)
+{
+	x86_msi.teardown_msi_irq(irq);
+}
+#define arch_setup_msi_irqs x86_setup_msi_irqs
+#define arch_teardown_msi_irqs x86_teardown_msi_irqs
+#define arch_teardown_msi_irq x86_teardown_msi_irq
+/* implemented in arch/x86/kernel/apic/io_apic. */
+int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type);
+void native_teardown_msi_irq(unsigned int irq);
+/* default to the implementation in drivers/lib/msi.c */
+#define HAVE_DEFAULT_MSI_TEARDOWN_IRQS
+void default_teardown_msi_irqs(struct pci_dev *dev);
+#else
+#define native_setup_msi_irqs		NULL
+#define native_teardown_msi_irq		NULL
+#define default_teardown_msi_irqs	NULL
+#endif
 
 #define PCI_DMA_BUS_IS_PHYS (dma_ops->is_phys)
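The MSI hooks above route through the new x86_msi ops table (declared in x86_init.h later in this diff), so a platform can substitute its own MSI setup for the native IO-APIC path. A hypothetical override sketch:

static int my_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
{
	/* platform-specific vector allocation would go here */
	return -ENOSYS;
}

static void __init my_platform_msi_init(void)
{
	x86_msi.setup_msi_irqs = my_setup_msi_irqs;
	x86_msi.teardown_msi_irq = native_teardown_msi_irq;
}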
diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h
index 49c7219826f9..704526734bef 100644
--- a/arch/x86/include/asm/pci_x86.h
+++ b/arch/x86/include/asm/pci_x86.h
@@ -47,6 +47,7 @@ enum pci_bf_sort_state {
 
 extern unsigned int pcibios_max_latency;
 
 void pcibios_resource_survey(void);
+void pcibios_set_cache_line_size(void);
 
 /* pci-pc.c */
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index 6e742cc4251b..d9d4dae305f6 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -111,20 +111,20 @@ union cpuid10_edx {
 #define X86_PMC_IDX_FIXED_BTS				(X86_PMC_IDX_FIXED + 16)
 
 /* IbsFetchCtl bits/masks */
-#define IBS_FETCH_RAND_EN		(1ULL<<57)
-#define IBS_FETCH_VAL			(1ULL<<49)
-#define IBS_FETCH_ENABLE		(1ULL<<48)
-#define IBS_FETCH_CNT			0xFFFF0000ULL
-#define IBS_FETCH_MAX_CNT		0x0000FFFFULL
+#define IBS_FETCH_RAND_EN	(1ULL<<57)
+#define IBS_FETCH_VAL		(1ULL<<49)
+#define IBS_FETCH_ENABLE	(1ULL<<48)
+#define IBS_FETCH_CNT		0xFFFF0000ULL
+#define IBS_FETCH_MAX_CNT	0x0000FFFFULL
 
 /* IbsOpCtl bits */
-#define IBS_OP_CNT_CTL			(1ULL<<19)
-#define IBS_OP_VAL			(1ULL<<18)
-#define IBS_OP_ENABLE			(1ULL<<17)
-#define IBS_OP_MAX_CNT			0x0000FFFFULL
+#define IBS_OP_CNT_CTL		(1ULL<<19)
+#define IBS_OP_VAL		(1ULL<<18)
+#define IBS_OP_ENABLE		(1ULL<<17)
+#define IBS_OP_MAX_CNT		0x0000FFFFULL
+#define IBS_OP_MAX_CNT_EXT	0x007FFFFFULL	/* not a register bit mask */
 
 #ifdef CONFIG_PERF_EVENTS
-extern void init_hw_perf_events(void);
 extern void perf_events_lapic_init(void);
 
 #define PERF_EVENT_INDEX_OFFSET			0
@@ -155,7 +155,6 @@ extern unsigned long perf_misc_flags(struct pt_regs *regs);
 }
 
 #else
-static inline void init_hw_perf_events(void)		{ }
 static inline void perf_events_lapic_init(void)	{ }
 #endif
 
diff --git a/arch/x86/include/asm/perf_event_p4.h b/arch/x86/include/asm/perf_event_p4.h
index a70cd216be5d..295e2ff18a6a 100644
--- a/arch/x86/include/asm/perf_event_p4.h
+++ b/arch/x86/include/asm/perf_event_p4.h
@@ -744,14 +744,6 @@ enum P4_ESCR_EMASKS {
 };
 
 /*
- * P4 PEBS specifics (Replay Event only)
- *
- * Format (bits):
- *   0-6: metric from P4_PEBS_METRIC enum
- *    7 : reserved
- *    8 : reserved
- * 9-11 : reserved
- *
 * Note we have UOP and PEBS bits reserved for now
 * just in case if we will need them once
 */
@@ -788,5 +780,60 @@ enum P4_PEBS_METRIC {
	P4_PEBS_METRIC__max
 };
 
+/*
+ * Notes on internal configuration of ESCR+CCCR tuples
+ *
+ * Since P4 has quite the different architecture of
+ * performance registers in compare with "architectural"
+ * once and we have on 64 bits to keep configuration
+ * of performance event, the following trick is used.
+ *
+ * 1) Since both ESCR and CCCR registers have only low
+ *    32 bits valuable, we pack them into a single 64 bit
+ *    configuration. Low 32 bits of such config correspond
+ *    to low 32 bits of CCCR register and high 32 bits
+ *    correspond to low 32 bits of ESCR register.
+ *
+ * 2) The meaning of every bit of such config field can
+ *    be found in Intel SDM but it should be noted that
+ *    we "borrow" some reserved bits for own usage and
+ *    clean them or set to a proper value when we do
+ *    a real write to hardware registers.
+ *
+ * 3) The format of bits of config is the following
+ *    and should be either 0 or set to some predefined
+ *    values:
+ *
+ *    Low 32 bits
+ *    -----------
+ *      0-6: P4_PEBS_METRIC enum
+ *     7-11: reserved
+ *       12: reserved (Enable)
+ *    13-15: reserved (ESCR select)
+ *    16-17: Active Thread
+ *       18: Compare
+ *       19: Complement
+ *    20-23: Threshold
+ *       24: Edge
+ *       25: reserved (FORCE_OVF)
+ *       26: reserved (OVF_PMI_T0)
+ *       27: reserved (OVF_PMI_T1)
+ *    28-29: reserved
+ *       30: reserved (Cascade)
+ *       31: reserved (OVF)
+ *
+ *    High 32 bits
+ *    ------------
+ *        0: reserved (T1_USR)
+ *        1: reserved (T1_OS)
+ *        2: reserved (T0_USR)
+ *        3: reserved (T0_OS)
+ *        4: Tag Enable
+ *      5-8: Tag Value
+ *     9-24: Event Mask (may use P4_ESCR_EMASK_BIT helper)
+ *    25-30: enum P4_EVENTS
+ *       31: reserved (HT thread)
+ */
+
 #endif /* PERF_EVENT_P4_H */
diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h
index 8abde9ec90bf..0c92113c4cb6 100644
--- a/arch/x86/include/asm/pgtable_32.h
+++ b/arch/x86/include/asm/pgtable_32.h
@@ -49,24 +49,14 @@ extern void set_pmd_pfn(unsigned long, unsigned long, pgprot_t);
 #endif
 
 #if defined(CONFIG_HIGHPTE)
-#define __KM_PTE			\
-	(in_nmi() ? KM_NMI_PTE : 	\
-	 in_irq() ? KM_IRQ_PTE :	\
-	 KM_PTE0)
 #define pte_offset_map(dir, address)					\
-	((pte_t *)kmap_atomic(pmd_page(*(dir)), __KM_PTE) +		\
+	((pte_t *)kmap_atomic(pmd_page(*(dir))) +			\
	 pte_index((address)))
-#define pte_offset_map_nested(dir, address)				\
-	((pte_t *)kmap_atomic(pmd_page(*(dir)), KM_PTE1) +		\
-	 pte_index((address)))
-#define pte_unmap(pte) kunmap_atomic((pte), __KM_PTE)
-#define pte_unmap_nested(pte) kunmap_atomic((pte), KM_PTE1)
+#define pte_unmap(pte) kunmap_atomic((pte))
 #else
 #define pte_offset_map(dir, address)					\
	((pte_t *)page_address(pmd_page(*(dir))) + pte_index((address)))
-#define pte_offset_map_nested(dir, address) pte_offset_map((dir), (address))
 #define pte_unmap(pte) do { } while (0)
-#define pte_unmap_nested(pte) do { } while (0)
 #endif
 
 /* Clear a kernel PTE and flush it from the TLB */
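With stacked atomic kmaps, pte_offset_map_nested()/pte_unmap_nested() are no longer needed: two simultaneous mappings simply nest. A hypothetical caller after the conversion:

static void compare_ptes(pmd_t *pmd_a, unsigned long addr_a,
			 pmd_t *pmd_b, unsigned long addr_b)
{
	pte_t *pte_a = pte_offset_map(pmd_a, addr_a);
	pte_t *pte_b = pte_offset_map(pmd_b, addr_b);	/* was pte_offset_map_nested() */

	/* ... inspect *pte_a and *pte_b ... */

	pte_unmap(pte_b);	/* unmap in reverse (stack) order */
	pte_unmap(pte_a);
}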
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index f96ac9bedf75..f86da20347f2 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -127,9 +127,7 @@ static inline int pgd_large(pgd_t pgd) { return 0; }
 
 /* x86-64 always has all page tables mapped. */
 #define pte_offset_map(dir, address) pte_offset_kernel((dir), (address))
-#define pte_offset_map_nested(dir, address) pte_offset_kernel((dir), (address))
 #define pte_unmap(pte) ((void)(pte))/* NOP */
-#define pte_unmap_nested(pte) ((void)(pte)) /* NOP */
 
 #define update_mmu_cache(vma, address, ptep) do { } while (0)
diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h
index 7f7e577a0e39..31d84acc1512 100644
--- a/arch/x86/include/asm/pvclock.h
+++ b/arch/x86/include/asm/pvclock.h
@@ -11,6 +11,7 @@ unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src);
 void pvclock_read_wallclock(struct pvclock_wall_clock *wall,
			    struct pvclock_vcpu_time_info *vcpu,
			    struct timespec *ts);
+void pvclock_resume(void);
 
 /*
 * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction,
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index 4cfc90824068..4c2f63c7fc1b 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -50,7 +50,7 @@ struct smp_ops {
	void (*smp_prepare_cpus)(unsigned max_cpus);
	void (*smp_cpus_done)(unsigned max_cpus);
 
-	void (*smp_send_stop)(void);
+	void (*stop_other_cpus)(int wait);
	void (*smp_send_reschedule)(int cpu);
 
	int (*cpu_up)(unsigned cpu);
@@ -73,7 +73,12 @@ extern struct smp_ops smp_ops;
 
 static inline void smp_send_stop(void)
 {
-	smp_ops.smp_send_stop();
+	smp_ops.stop_other_cpus(0);
+}
+
+static inline void stop_other_cpus(void)
+{
+	smp_ops.stop_other_cpus(1);
 }
 
 static inline void smp_prepare_boot_cpu(void)
diff --git a/arch/x86/include/asm/smpboot_hooks.h b/arch/x86/include/asm/smpboot_hooks.h
index 1def60114906..6c22bf353f26 100644
--- a/arch/x86/include/asm/smpboot_hooks.h
+++ b/arch/x86/include/asm/smpboot_hooks.h
@@ -48,7 +48,6 @@ static inline void __init smpboot_setup_io_apic(void)
		setup_IO_APIC();
	else {
		nr_ioapics = 0;
-		localise_nmi_watchdog();
	}
 #endif
 }
diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h
index 2b16a2ad23dc..52b5c7ed3608 100644
--- a/arch/x86/include/asm/stacktrace.h
+++ b/arch/x86/include/asm/stacktrace.h
@@ -7,6 +7,7 @@
 #define _ASM_X86_STACKTRACE_H
 
 #include <linux/uaccess.h>
+#include <linux/ptrace.h>
 
 extern int kstack_depth_to_print;
 
@@ -46,7 +47,7 @@ struct stacktrace_ops {
 };
 
 void dump_trace(struct task_struct *tsk, struct pt_regs *regs,
-		unsigned long *stack, unsigned long bp,
+		unsigned long *stack,
		const struct stacktrace_ops *ops, void *data);
 
 #ifdef CONFIG_X86_32
@@ -57,13 +58,39 @@ void dump_trace(struct task_struct *tsk, struct pt_regs *regs,
 #define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :)
 #endif
 
+#ifdef CONFIG_FRAME_POINTER
+static inline unsigned long
+stack_frame(struct task_struct *task, struct pt_regs *regs)
+{
+	unsigned long bp;
+
+	if (regs)
+		return regs->bp;
+
+	if (task == current) {
+		/* Grab bp right from our regs */
+		get_bp(bp);
+		return bp;
+	}
+
+	/* bp is the last reg pushed by switch_to */
+	return *(unsigned long *)task->thread.sp;
+}
+#else
+static inline unsigned long
+stack_frame(struct task_struct *task, struct pt_regs *regs)
+{
+	return 0;
+}
+#endif
+
 extern void
 show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
-		   unsigned long *stack, unsigned long bp, char *log_lvl);
+		   unsigned long *stack, char *log_lvl);
 
 extern void
 show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
-		   unsigned long *sp, unsigned long bp, char *log_lvl);
+		   unsigned long *sp, char *log_lvl);
 
 extern unsigned int code_bytes;
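Callers of dump_trace() and show_trace() no longer pass bp; the new stack_frame() helper recovers it from regs, the current frame, or the task's saved stack pointer (and returns 0 without frame pointers). A hypothetical caller after the conversion:

static void show_current_trace(const struct stacktrace_ops *ops, void *data)
{
	/* bp argument is gone; dump_trace() can use stack_frame() itself */
	dump_trace(current, NULL, NULL, ops, data);
}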
diff --git a/arch/x86/include/asm/timer.h b/arch/x86/include/asm/timer.h
index 5469630b27f5..fa7b9176b76c 100644
--- a/arch/x86/include/asm/timer.h
+++ b/arch/x86/include/asm/timer.h
@@ -10,12 +10,6 @@
 unsigned long long native_sched_clock(void);
 extern int recalibrate_cpu_khz(void);
 
-#if defined(CONFIG_X86_32) && defined(CONFIG_X86_IO_APIC)
-extern int timer_ack;
-#else
-# define timer_ack (0)
-#endif
-
 extern int no_timer_check;
 
 /* Accelerators for sched_clock()
diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h
index bf6b88ef8eeb..a501741c2335 100644
--- a/arch/x86/include/asm/uv/uv_hub.h
+++ b/arch/x86/include/asm/uv/uv_hub.h
@@ -5,7 +5,7 @@
 *
 * SGI UV architectural definitions
 *
- * Copyright (C) 2007-2008 Silicon Graphics, Inc. All rights reserved.
+ * Copyright (C) 2007-2010 Silicon Graphics, Inc. All rights reserved.
 */
 
 #ifndef _ASM_X86_UV_UV_HUB_H
@@ -77,7 +77,8 @@
 *
 *		1111110000000000
 *		5432109876543210
- *		pppppppppplc0cch
+ *		pppppppppplc0cch	Nehalem-EX
+ *		ppppppppplcc0cch	Westmere-EX
 *		sssssssssss
 *
 *			p  = pnode bits
@@ -148,12 +149,25 @@ struct uv_hub_info_s {
	unsigned char		m_val;
	unsigned char		n_val;
	struct uv_scir_s	scir;
+	unsigned char		apic_pnode_shift;
 };
 
 DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info);
 #define uv_hub_info		(&__get_cpu_var(__uv_hub_info))
 #define uv_cpu_hub_info(cpu)	(&per_cpu(__uv_hub_info, cpu))
 
+union uvh_apicid {
+	unsigned long	v;
+	struct uvh_apicid_s {
+		unsigned long	local_apic_mask  : 24;
+		unsigned long	local_apic_shift :  5;
+		unsigned long	unused1          :  3;
+		unsigned long	pnode_mask       : 24;
+		unsigned long	pnode_shift      :  5;
+		unsigned long	unused2          :  3;
+	} s;
+};
+
 /*
 * Local & Global MMR space macros.
 *	Note: macros are intended to be used ONLY by inline functions
@@ -182,8 +196,11 @@ DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info);
 #define UV_GLOBAL_MMR64_PNODE_BITS(p)					\
	(((unsigned long)(p)) << UV_GLOBAL_MMR64_PNODE_SHIFT)
 
+#define UVH_APICID		0x002D0E00L
 #define UV_APIC_PNODE_SHIFT	6
 
+#define UV_APICID_HIBIT_MASK	0xffff0000
+
 /* Local Bus from cpu's perspective */
 #define LOCAL_BUS_BASE		0x1c00000
 #define LOCAL_BUS_SIZE		(4 * 1024 * 1024)
@@ -280,7 +297,7 @@ static inline void *uv_pnode_offset_to_vaddr(int pnode, unsigned long offset)
 */
 static inline int uv_apicid_to_pnode(int apicid)
 {
-	return (apicid >> UV_APIC_PNODE_SHIFT);
+	return (apicid >> uv_hub_info->apic_pnode_shift);
 }
 
 /*
@@ -476,8 +493,10 @@ static inline void uv_set_cpu_scir_bits(int cpu, unsigned char value)
	}
 }
 
+extern unsigned int uv_apicid_hibits;
 static unsigned long uv_hub_ipi_value(int apicid, int vector, int mode)
 {
+	apicid |= uv_apicid_hibits;
	return (1UL << UVH_IPI_INT_SEND_SHFT) |
			((apicid) << UVH_IPI_INT_APIC_ID_SHFT) |
			(mode << UVH_IPI_INT_DELIVERY_MODE_SHFT) |
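Decoding now goes through the per-hub apic_pnode_shift instead of the fixed UV_APIC_PNODE_SHIFT, which is what lets the Nehalem-EX and Westmere-EX layouts in the bit diagram above coexist. An illustrative (hypothetical) caller:

static int pnode_of_apicid(int apicid)
{
	/* shift is discovered per hub at boot, not hard-coded anymore */
	return apicid >> uv_hub_info->apic_pnode_shift;
}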
diff --git a/arch/x86/include/asm/uv/uv_mmrs.h b/arch/x86/include/asm/uv/uv_mmrs.h
index b2f2d2e05cec..20cafeac7455 100644
--- a/arch/x86/include/asm/uv/uv_mmrs.h
+++ b/arch/x86/include/asm/uv/uv_mmrs.h
@@ -5,7 +5,7 @@
 *
 * SGI UV MMR definitions
 *
- * Copyright (C) 2007-2008 Silicon Graphics, Inc. All rights reserved.
+ * Copyright (C) 2007-2010 Silicon Graphics, Inc. All rights reserved.
 */
 
 #ifndef _ASM_X86_UV_UV_MMRS_H
@@ -754,6 +754,23 @@ union uvh_lb_bau_sb_descriptor_base_u {
 };
 
 /* ========================================================================= */
+/*                   UVH_LB_TARGET_PHYSICAL_APIC_ID_MASK                     */
+/* ========================================================================= */
+#define UVH_LB_TARGET_PHYSICAL_APIC_ID_MASK		0x320130UL
+#define UVH_LB_TARGET_PHYSICAL_APIC_ID_MASK_32		0x009f0
+
+#define UVH_LB_TARGET_PHYSICAL_APIC_ID_MASK_BIT_ENABLES_SHFT	0
+#define UVH_LB_TARGET_PHYSICAL_APIC_ID_MASK_BIT_ENABLES_MASK	0x00000000ffffffffUL
+
+union uvh_lb_target_physical_apic_id_mask_u {
+	unsigned long	v;
+	struct uvh_lb_target_physical_apic_id_mask_s {
+		unsigned long	bit_enables : 32;  /* RW */
+		unsigned long	rsvd_32_63  : 32;  /*    */
+	} s;
+};
+
+/* ========================================================================= */
 /*                               UVH_NODE_ID                                 */
 /* ========================================================================= */
 #define UVH_NODE_ID 0x0UL
@@ -806,6 +823,78 @@ union uvh_node_present_table_u {
 };
 
 /* ========================================================================= */
+/*                 UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR                  */
+/* ========================================================================= */
+#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR	0x16000c8UL
+
+#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_BASE_SHFT	24
+#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_BASE_MASK	0x00000000ff000000UL
+#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_M_ALIAS_SHFT	48
+#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_M_ALIAS_MASK	0x001f000000000000UL
+#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_ENABLE_SHFT	63
+#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_ENABLE_MASK	0x8000000000000000UL
+
+union uvh_rh_gam_alias210_overlay_config_0_mmr_u {
+	unsigned long	v;
+	struct uvh_rh_gam_alias210_overlay_config_0_mmr_s {
+		unsigned long	rsvd_0_23 : 24;  /*    */
+		unsigned long	base      :  8;  /* RW */
+		unsigned long	rsvd_32_47: 16;  /*    */
+		unsigned long	m_alias   :  5;  /* RW */
+		unsigned long	rsvd_53_62: 10;  /*    */
+		unsigned long	enable    :  1;  /* RW */
+	} s;
+};
+
+/* ========================================================================= */
+/*                 UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR                  */
+/* ========================================================================= */
+#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR	0x16000d8UL
+
+#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_BASE_SHFT	24
+#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_BASE_MASK	0x00000000ff000000UL
+#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_M_ALIAS_SHFT	48
+#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_M_ALIAS_MASK	0x001f000000000000UL
+#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_ENABLE_SHFT	63
+#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_ENABLE_MASK	0x8000000000000000UL
+
+union uvh_rh_gam_alias210_overlay_config_1_mmr_u {
+	unsigned long	v;
+	struct uvh_rh_gam_alias210_overlay_config_1_mmr_s {
+		unsigned long	rsvd_0_23 : 24;  /*    */
+		unsigned long	base      :  8;  /* RW */
+		unsigned long	rsvd_32_47: 16;  /*    */
+		unsigned long	m_alias   :  5;  /* RW */
+		unsigned long	rsvd_53_62: 10;  /*    */
+		unsigned long	enable    :  1;  /* RW */
+	} s;
+};
+
+/* ========================================================================= */
+/*                 UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR                  */
+/* ========================================================================= */
+#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR	0x16000e8UL
+
+#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_BASE_SHFT	24
+#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_BASE_MASK	0x00000000ff000000UL
+#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_M_ALIAS_SHFT	48
+#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_M_ALIAS_MASK	0x001f000000000000UL
+#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_ENABLE_SHFT	63
+#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_ENABLE_MASK	0x8000000000000000UL
+
+union uvh_rh_gam_alias210_overlay_config_2_mmr_u {
+	unsigned long	v;
+	struct uvh_rh_gam_alias210_overlay_config_2_mmr_s {
+		unsigned long	rsvd_0_23 : 24;  /*    */
+		unsigned long	base      :  8;  /* RW */
+		unsigned long	rsvd_32_47: 16;  /*    */
+		unsigned long	m_alias   :  5;  /* RW */
+		unsigned long	rsvd_53_62: 10;  /*    */
+		unsigned long	enable    :  1;  /* RW */
+	} s;
+};
+
+/* ========================================================================= */
 /*                 UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR                 */
 /* ========================================================================= */
 #define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR	0x16000d0UL
@@ -857,6 +946,29 @@ union uvh_rh_gam_alias210_redirect_config_2_mmr_u {
 };
 
 /* ========================================================================= */
+/*                          UVH_RH_GAM_CONFIG_MMR                            */
+/* ========================================================================= */
+#define UVH_RH_GAM_CONFIG_MMR				0x1600000UL
+
+#define UVH_RH_GAM_CONFIG_MMR_M_SKT_SHFT	0
+#define UVH_RH_GAM_CONFIG_MMR_M_SKT_MASK	0x000000000000003fUL
+#define UVH_RH_GAM_CONFIG_MMR_N_SKT_SHFT	6
+#define UVH_RH_GAM_CONFIG_MMR_N_SKT_MASK	0x00000000000003c0UL
+#define UVH_RH_GAM_CONFIG_MMR_MMIOL_CFG_SHFT	12
+#define UVH_RH_GAM_CONFIG_MMR_MMIOL_CFG_MASK	0x0000000000001000UL
+
+union uvh_rh_gam_config_mmr_u {
+	unsigned long	v;
+	struct uvh_rh_gam_config_mmr_s {
+		unsigned long	m_skt     :  6;  /* RW */
+		unsigned long	n_skt     :  4;  /* RW */
+		unsigned long	rsvd_10_11:  2;  /*    */
+		unsigned long	mmiol_cfg :  1;  /* RW */
+		unsigned long	rsvd_13_63: 51;  /*    */
+	} s;
+};
+
+/* ========================================================================= */
 /*                    UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR                      */
 /* ========================================================================= */
 #define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR		0x1600010UL
@@ -987,97 +1099,5 @@ union uvh_rtc1_int_config_u {
	} s;
 };
 
-/* ========================================================================= */
-/*                          UVH_SI_ADDR_MAP_CONFIG                           */
-/* ========================================================================= */
-#define UVH_SI_ADDR_MAP_CONFIG				0xc80000UL
-
-#define UVH_SI_ADDR_MAP_CONFIG_M_SKT_SHFT	0
-#define UVH_SI_ADDR_MAP_CONFIG_M_SKT_MASK	0x000000000000003fUL
-#define UVH_SI_ADDR_MAP_CONFIG_N_SKT_SHFT	8
-#define UVH_SI_ADDR_MAP_CONFIG_N_SKT_MASK	0x0000000000000f00UL
-
-union uvh_si_addr_map_config_u {
-	unsigned long	v;
-	struct uvh_si_addr_map_config_s {
-		unsigned long	m_skt     :  6;  /* RW */
-		unsigned long	rsvd_6_7  :  2;  /*    */
-		unsigned long	n_skt     :  4;  /* RW */
-		unsigned long	rsvd_12_63: 52;  /*    */
-	} s;
-};
-
-/* ========================================================================= */
-/*                       UVH_SI_ALIAS0_OVERLAY_CONFIG                        */
-/* ========================================================================= */
-#define UVH_SI_ALIAS0_OVERLAY_CONFIG			0xc80008UL
-
-#define UVH_SI_ALIAS0_OVERLAY_CONFIG_BASE_SHFT		24
-#define UVH_SI_ALIAS0_OVERLAY_CONFIG_BASE_MASK		0x00000000ff000000UL
-#define UVH_SI_ALIAS0_OVERLAY_CONFIG_M_ALIAS_SHFT	48
-#define UVH_SI_ALIAS0_OVERLAY_CONFIG_M_ALIAS_MASK	0x001f000000000000UL
-#define UVH_SI_ALIAS0_OVERLAY_CONFIG_ENABLE_SHFT	63
-#define UVH_SI_ALIAS0_OVERLAY_CONFIG_ENABLE_MASK	0x8000000000000000UL
-
-union uvh_si_alias0_overlay_config_u {
-	unsigned long	v;
-	struct uvh_si_alias0_overlay_config_s {
-		unsigned long	rsvd_0_23 : 24;  /*    */
-		unsigned long	base      :  8;  /* RW */
-		unsigned long	rsvd_32_47: 16;  /*    */
-		unsigned long	m_alias   :  5;  /* RW */
-		unsigned long	rsvd_53_62: 10;  /*    */
-		unsigned long	enable    :  1;  /* RW */
-	} s;
-};
-
-/* ========================================================================= */
-/*                       UVH_SI_ALIAS1_OVERLAY_CONFIG                        */
-/* ========================================================================= */
-#define UVH_SI_ALIAS1_OVERLAY_CONFIG			0xc80010UL
-
-#define UVH_SI_ALIAS1_OVERLAY_CONFIG_BASE_SHFT		24
-#define UVH_SI_ALIAS1_OVERLAY_CONFIG_BASE_MASK		0x00000000ff000000UL
-#define UVH_SI_ALIAS1_OVERLAY_CONFIG_M_ALIAS_SHFT	48
-#define UVH_SI_ALIAS1_OVERLAY_CONFIG_M_ALIAS_MASK	0x001f000000000000UL
-#define UVH_SI_ALIAS1_OVERLAY_CONFIG_ENABLE_SHFT	63
-#define UVH_SI_ALIAS1_OVERLAY_CONFIG_ENABLE_MASK	0x8000000000000000UL
-
-union uvh_si_alias1_overlay_config_u {
-	unsigned long	v;
-	struct uvh_si_alias1_overlay_config_s {
-		unsigned long	rsvd_0_23 : 24;  /*    */
-		unsigned long	base      :  8;  /* RW */
-		unsigned long	rsvd_32_47: 16;  /*    */
-		unsigned long	m_alias   :  5;  /* RW */
-		unsigned long	rsvd_53_62: 10;  /*    */
-		unsigned long	enable    :  1;  /* RW */
-	} s;
-};
-
-/* ========================================================================= */
-/*                       UVH_SI_ALIAS2_OVERLAY_CONFIG                        */
-/* ========================================================================= */
-#define UVH_SI_ALIAS2_OVERLAY_CONFIG			0xc80018UL
-
-#define UVH_SI_ALIAS2_OVERLAY_CONFIG_BASE_SHFT		24
-#define UVH_SI_ALIAS2_OVERLAY_CONFIG_BASE_MASK		0x00000000ff000000UL
-#define UVH_SI_ALIAS2_OVERLAY_CONFIG_M_ALIAS_SHFT	48
-#define UVH_SI_ALIAS2_OVERLAY_CONFIG_M_ALIAS_MASK	0x001f000000000000UL
-#define UVH_SI_ALIAS2_OVERLAY_CONFIG_ENABLE_SHFT	63
-#define UVH_SI_ALIAS2_OVERLAY_CONFIG_ENABLE_MASK	0x8000000000000000UL
-
-union uvh_si_alias2_overlay_config_u {
-	unsigned long	v;
-	struct uvh_si_alias2_overlay_config_s {
-		unsigned long	rsvd_0_23 : 24;  /*    */
-		unsigned long	base      :  8;  /* RW */
-		unsigned long	rsvd_32_47: 16;  /*    */
-		unsigned long	m_alias   :  5;  /* RW */
-		unsigned long	rsvd_53_62: 10;  /*    */
-		unsigned long	enable    :  1;  /* RW */
-	} s;
-};
-
-#endif /* _ASM_X86_UV_UV_MMRS_H */
+#endif /* __ASM_UV_MMRS_X86_H__ */
diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
index baa579c8e038..64642ad019fb 100644
--- a/arch/x86/include/asm/x86_init.h
+++ b/arch/x86/include/asm/x86_init.h
@@ -154,9 +154,18 @@ struct x86_platform_ops {
	int (*i8042_detect)(void);
 };
 
+struct pci_dev;
+
+struct x86_msi_ops {
+	int (*setup_msi_irqs)(struct pci_dev *dev, int nvec, int type);
+	void (*teardown_msi_irq)(unsigned int irq);
+	void (*teardown_msi_irqs)(struct pci_dev *dev);
+};
+
 extern struct x86_init_ops x86_init;
 extern struct x86_cpuinit_ops x86_cpuinit;
 extern struct x86_platform_ops x86_platform;
+extern struct x86_msi_ops x86_msi;
 
 extern void x86_init_noop(void);
 extern void x86_init_uint_noop(unsigned int unused);
diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h
index 7fda040a76cd..a3c28ae4025b 100644
--- a/arch/x86/include/asm/xen/hypercall.h
+++ b/arch/x86/include/asm/xen/hypercall.h
@@ -200,6 +200,23 @@ extern struct { char _entry[32]; } hypercall_page[];
	(type)__res;							\
 })
 
+static inline long
+privcmd_call(unsigned call,
+	     unsigned long a1, unsigned long a2,
+	     unsigned long a3, unsigned long a4,
+	     unsigned long a5)
+{
+	__HYPERCALL_DECLS;
+	__HYPERCALL_5ARG(a1, a2, a3, a4, a5);
+
+	asm volatile("call *%[call]"
+		     : __HYPERCALL_5PARAM
+		     : [call] "a" (&hypercall_page[call])
+		     : __HYPERCALL_CLOBBER5);
+
+	return (long)__res;
+}
+
 static inline int
 HYPERVISOR_set_trap_table(struct trap_info *table)
 {
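privcmd_call() lets a caller issue a hypercall whose number is only known at run time, by indexing into hypercall_page directly (this is what the privcmd driver needs for user-supplied hypercalls). A hypothetical use, assuming __HYPERVISOR_xen_version and XENVER_version from the Xen interface headers:

static long xen_version_via_privcmd(void)
{
	/* equivalent to HYPERVISOR_xen_version(XENVER_version, NULL) */
	return privcmd_call(__HYPERVISOR_xen_version,
			    XENVER_version, 0, 0, 0, 0);
}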
diff --git a/arch/x86/include/asm/xen/interface.h b/arch/x86/include/asm/xen/interface.h
index e8506c1f0c55..1c10c88ee4e1 100644
--- a/arch/x86/include/asm/xen/interface.h
+++ b/arch/x86/include/asm/xen/interface.h
@@ -61,9 +61,9 @@ DEFINE_GUEST_HANDLE(void);
 #define HYPERVISOR_VIRT_START mk_unsigned_long(__HYPERVISOR_VIRT_START)
 #endif
 
-#ifndef machine_to_phys_mapping
-#define machine_to_phys_mapping ((unsigned long *)HYPERVISOR_VIRT_START)
-#endif
+#define MACH2PHYS_VIRT_START	mk_unsigned_long(__MACH2PHYS_VIRT_START)
+#define MACH2PHYS_VIRT_END	mk_unsigned_long(__MACH2PHYS_VIRT_END)
+#define MACH2PHYS_NR_ENTRIES	((MACH2PHYS_VIRT_END-MACH2PHYS_VIRT_START)>>__MACH2PHYS_SHIFT)
 
 /* Maximum number of virtual CPUs in multi-processor guests. */
 #define MAX_VIRT_CPUS 32
diff --git a/arch/x86/include/asm/xen/interface_32.h b/arch/x86/include/asm/xen/interface_32.h
index 42a7e004ae5c..8413688b2571 100644
--- a/arch/x86/include/asm/xen/interface_32.h
+++ b/arch/x86/include/asm/xen/interface_32.h
@@ -32,6 +32,11 @@
 /* And the trap vector is... */
 #define TRAP_INSTR "int $0x82"
 
+#define __MACH2PHYS_VIRT_START	0xF5800000
+#define __MACH2PHYS_VIRT_END	0xF6800000
+
+#define __MACH2PHYS_SHIFT	2
+
 /*
 * Virtual addresses beyond this are not modifiable by guest OSes. The
 * machine->physical mapping table starts at this address, read-only.
diff --git a/arch/x86/include/asm/xen/interface_64.h b/arch/x86/include/asm/xen/interface_64.h
index 100d2662b97c..839a4811cf98 100644
--- a/arch/x86/include/asm/xen/interface_64.h
+++ b/arch/x86/include/asm/xen/interface_64.h
@@ -39,18 +39,7 @@
 #define __HYPERVISOR_VIRT_END   0xFFFF880000000000
 #define __MACH2PHYS_VIRT_START  0xFFFF800000000000
 #define __MACH2PHYS_VIRT_END    0xFFFF804000000000
-
-#ifndef HYPERVISOR_VIRT_START
-#define HYPERVISOR_VIRT_START mk_unsigned_long(__HYPERVISOR_VIRT_START)
-#define HYPERVISOR_VIRT_END   mk_unsigned_long(__HYPERVISOR_VIRT_END)
-#endif
-
-#define MACH2PHYS_VIRT_START  mk_unsigned_long(__MACH2PHYS_VIRT_START)
-#define MACH2PHYS_VIRT_END    mk_unsigned_long(__MACH2PHYS_VIRT_END)
-#define MACH2PHYS_NR_ENTRIES  ((MACH2PHYS_VIRT_END-MACH2PHYS_VIRT_START)>>3)
-#ifndef machine_to_phys_mapping
-#define machine_to_phys_mapping ((unsigned long *)HYPERVISOR_VIRT_START)
-#endif
+#define __MACH2PHYS_SHIFT       3
 
 /*
 * int HYPERVISOR_set_segment_base(unsigned int which, unsigned long base)
diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h
index bf5f7d32bd08..8760cc60a21c 100644
--- a/arch/x86/include/asm/xen/page.h
+++ b/arch/x86/include/asm/xen/page.h
@@ -5,6 +5,7 @@
 #include <linux/types.h>
 #include <linux/spinlock.h>
 #include <linux/pfn.h>
+#include <linux/mm.h>
 
 #include <asm/uaccess.h>
 #include <asm/page.h>
@@ -35,16 +36,25 @@ typedef struct xpaddr {
 #define MAX_DOMAIN_PAGES						\
     ((unsigned long)((u64)CONFIG_XEN_MAX_DOMAIN_MEMORY * 1024 * 1024 * 1024 / PAGE_SIZE))
 
+extern unsigned long *machine_to_phys_mapping;
+extern unsigned int   machine_to_phys_order;
 
 extern unsigned long get_phys_to_machine(unsigned long pfn);
-extern void set_phys_to_machine(unsigned long pfn, unsigned long mfn);
+extern bool set_phys_to_machine(unsigned long pfn, unsigned long mfn);
 
 static inline unsigned long pfn_to_mfn(unsigned long pfn)
 {
+	unsigned long mfn;
+
	if (xen_feature(XENFEAT_auto_translated_physmap))
		return pfn;
 
-	return get_phys_to_machine(pfn) & ~FOREIGN_FRAME_BIT;
+	mfn = get_phys_to_machine(pfn);
+
+	if (mfn != INVALID_P2M_ENTRY)
+		mfn &= ~FOREIGN_FRAME_BIT;
+
+	return mfn;
 }
 
 static inline int phys_to_machine_mapping_valid(unsigned long pfn)
@@ -62,10 +72,8 @@ static inline unsigned long mfn_to_pfn(unsigned long mfn)
	if (xen_feature(XENFEAT_auto_translated_physmap))
		return mfn;
 
-#if 0
	if (unlikely((mfn >> machine_to_phys_order) != 0))
-		return max_mapnr;
-#endif
+		return ~0;
 
	pfn = 0;
	/*
@@ -159,6 +167,7 @@ static inline pte_t __pte_ma(pteval_t x)
 
 #define pgd_val_ma(x)	((x).pgd)
 
+void xen_set_domain_pte(pte_t *ptep, pte_t pteval, unsigned domid);
 
 xmaddr_t arbitrary_virt_to_machine(void *address);
 unsigned long arbitrary_virt_to_mfn(void *vaddr);
diff --git a/arch/x86/include/asm/xen/pci.h b/arch/x86/include/asm/xen/pci.h
new file mode 100644
index 000000000000..2329b3eaf8d3
--- /dev/null
+++ b/arch/x86/include/asm/xen/pci.h
@@ -0,0 +1,65 @@
+#ifndef _ASM_X86_XEN_PCI_H
+#define _ASM_X86_XEN_PCI_H
+
+#if defined(CONFIG_PCI_XEN)
+extern int __init pci_xen_init(void);
+extern int __init pci_xen_hvm_init(void);
+#define pci_xen 1
+#else
+#define pci_xen 0
+#define pci_xen_init (0)
+static inline int pci_xen_hvm_init(void)
+{
+	return -1;
+}
+#endif
+#if defined(CONFIG_XEN_DOM0)
+void __init xen_setup_pirqs(void);
+#else
+static inline void __init xen_setup_pirqs(void)
+{
+}
+#endif
+
+#if defined(CONFIG_PCI_MSI)
+#if defined(CONFIG_PCI_XEN)
+/* The drivers/pci/xen-pcifront.c sets this structure to
+ * its own functions.
+ */
+struct xen_pci_frontend_ops {
+	int (*enable_msi)(struct pci_dev *dev, int **vectors);
+	void (*disable_msi)(struct pci_dev *dev);
+	int (*enable_msix)(struct pci_dev *dev, int **vectors, int nvec);
+	void (*disable_msix)(struct pci_dev *dev);
+};
+
+extern struct xen_pci_frontend_ops *xen_pci_frontend;
+
+static inline int xen_pci_frontend_enable_msi(struct pci_dev *dev,
+					      int **vectors)
+{
+	if (xen_pci_frontend && xen_pci_frontend->enable_msi)
+		return xen_pci_frontend->enable_msi(dev, vectors);
+	return -ENODEV;
+}
+static inline void xen_pci_frontend_disable_msi(struct pci_dev *dev)
+{
+	if (xen_pci_frontend && xen_pci_frontend->disable_msi)
+		xen_pci_frontend->disable_msi(dev);
+}
+static inline int xen_pci_frontend_enable_msix(struct pci_dev *dev,
+					       int **vectors, int nvec)
+{
+	if (xen_pci_frontend && xen_pci_frontend->enable_msix)
+		return xen_pci_frontend->enable_msix(dev, vectors, nvec);
+	return -ENODEV;
+}
+static inline void xen_pci_frontend_disable_msix(struct pci_dev *dev)
+{
+	if (xen_pci_frontend && xen_pci_frontend->disable_msix)
+		xen_pci_frontend->disable_msix(dev);
+}
+#endif /* CONFIG_PCI_XEN */
+#endif /* CONFIG_PCI_MSI */
+
+#endif	/* _ASM_X86_XEN_PCI_H */
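The xen_pci_frontend_ops hooks above are filled in by the PCI frontend driver; a skeletal registration sketch (callback bodies omitted and names hypothetical, showing only the ops-pointer wiring):

static int my_enable_msi(struct pci_dev *dev, int **vectors)
{
	return -ENODEV;		/* real driver talks to the backend here */
}

static struct xen_pci_frontend_ops my_frontend_ops = {
	.enable_msi = my_enable_msi,
	/* .disable_msi, .enable_msix, .disable_msix likewise */
};

static void __init my_frontend_register(void)
{
	xen_pci_frontend = &my_frontend_ops;
}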
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 2c833d8c4141..1e994754d323 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -36,7 +36,6 @@ obj-y			+= traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
 obj-y			+= time.o ioport.o ldt.o dumpstack.o
 obj-y			+= setup.o x86_init.o i8259.o irqinit.o jump_label.o
 obj-$(CONFIG_IRQ_WORK)  += irq_work.o
-obj-$(CONFIG_X86_VISWS)	+= visws_quirks.o
 obj-$(CONFIG_X86_32)	+= probe_roms_32.o
 obj-$(CONFIG_X86_32)	+= sys_i386_32.o i386_ksyms_32.o
 obj-$(CONFIG_X86_64)	+= sys_x86_64.o x8664_ksyms_64.o
@@ -46,6 +45,7 @@ obj-y			+= pci-dma.o quirks.o i8237.o topology.o kdebugfs.o
 obj-y			+= alternative.o i8253.o pci-nommu.o hw_breakpoint.o
 obj-y			+= tsc.o io_delay.o rtc.o
 obj-y			+= pci-iommu_table.o
+obj-y			+= resource.o
 
 obj-$(CONFIG_X86_TRAMPOLINE)	+= trampoline.o
 obj-y				+= process.o
@@ -58,7 +58,6 @@ obj-$(CONFIG_INTEL_TXT)		+= tboot.o
 obj-$(CONFIG_STACKTRACE)	+= stacktrace.o
 obj-y				+= cpu/
 obj-y				+= acpi/
-obj-$(CONFIG_SFI)		+= sfi.o
 obj-y				+= reboot.o
 obj-$(CONFIG_MCA)		+= mca_32.o
 obj-$(CONFIG_X86_MSR)		+= msr.o
@@ -82,7 +81,6 @@ obj-$(CONFIG_KEXEC)		+= relocate_kernel_$(BITS).o crash.o
 obj-$(CONFIG_CRASH_DUMP)	+= crash_dump_$(BITS).o
 obj-$(CONFIG_KPROBES)		+= kprobes.o
 obj-$(CONFIG_MODULES)		+= module.o
-obj-$(CONFIG_EFI) 		+= efi.o efi_$(BITS).o efi_stub_$(BITS).o
 obj-$(CONFIG_DOUBLEFAULT) 	+= doublefault_32.o
 obj-$(CONFIG_KGDB)		+= kgdb.o
 obj-$(CONFIG_VM86)		+= vm86_32.o
@@ -104,14 +102,6 @@ obj-$(CONFIG_PARAVIRT_CLOCK)	+= pvclock.o
 
 obj-$(CONFIG_PCSPKR_PLATFORM)	+= pcspeaker.o
 
-obj-$(CONFIG_SCx200)		+= scx200.o
-scx200-y			+= scx200_32.o
-
-obj-$(CONFIG_OLPC)		+= olpc.o
-obj-$(CONFIG_OLPC_XO1)		+= olpc-xo1.o
-obj-$(CONFIG_OLPC_OPENFIRMWARE)	+= olpc_ofw.o
-obj-$(CONFIG_X86_MRST)		+= mrst.o
-
 microcode-y				:= microcode_core.o
 microcode-$(CONFIG_MICROCODE_INTEL)	+= microcode_intel.o
 microcode-$(CONFIG_MICROCODE_AMD)	+= microcode_amd.o
@@ -124,7 +114,6 @@ obj-$(CONFIG_SWIOTLB)			+= pci-swiotlb.o
 ###
 # 64 bit specific files
 ifeq ($(CONFIG_X86_64),y)
-	obj-$(CONFIG_X86_UV)		+= tlb_uv.o bios_uv.o uv_irq.o uv_sysfs.o uv_time.o
	obj-$(CONFIG_AUDIT)		+= audit_64.o
 
	obj-$(CONFIG_GART_IOMMU)	+= pci-gart_64.o aperture_64.o
*/ - if (acpi_irq_model == ACPI_IRQ_MODEL_PIC) { - if (trigger == ACPI_LEVEL_SENSITIVE) - eisa_set_level_irq(gsi); - } + if (trigger == ACPI_LEVEL_SENSITIVE) + eisa_set_level_irq(gsi); #endif + return gsi; +} + +static int acpi_register_gsi_ioapic(struct device *dev, u32 gsi, + int trigger, int polarity) +{ #ifdef CONFIG_X86_IO_APIC - if (acpi_irq_model == ACPI_IRQ_MODEL_IOAPIC) { - plat_gsi = mp_register_gsi(dev, gsi, trigger, polarity); - } + gsi = mp_register_gsi(dev, gsi, trigger, polarity); #endif + + return gsi; +} + +int (*__acpi_register_gsi)(struct device *dev, u32 gsi, + int trigger, int polarity) = acpi_register_gsi_pic; + +/* + * success: return IRQ number (>=0) + * failure: return < 0 + */ +int acpi_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity) +{ + unsigned int irq; + unsigned int plat_gsi = gsi; + + plat_gsi = (*__acpi_register_gsi)(dev, gsi, trigger, polarity); irq = gsi_to_irq(plat_gsi); return irq; } +void __init acpi_set_irq_model_pic(void) +{ + acpi_irq_model = ACPI_IRQ_MODEL_PIC; + __acpi_register_gsi = acpi_register_gsi_pic; + acpi_ioapic = 0; +} + +void __init acpi_set_irq_model_ioapic(void) +{ + acpi_irq_model = ACPI_IRQ_MODEL_IOAPIC; + __acpi_register_gsi = acpi_register_gsi_ioapic; + acpi_ioapic = 1; +} + /* * ACPI based hotplug support for CPU */ @@ -883,13 +915,13 @@ static int __init acpi_parse_madt_lapic_entries(void) acpi_register_lapic_address(acpi_lapic_addr); count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_SAPIC, - acpi_parse_sapic, MAX_APICS); + acpi_parse_sapic, MAX_LOCAL_APIC); if (!count) { x2count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_X2APIC, - acpi_parse_x2apic, MAX_APICS); + acpi_parse_x2apic, MAX_LOCAL_APIC); count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC, - acpi_parse_lapic, MAX_APICS); + acpi_parse_lapic, MAX_LOCAL_APIC); } if (!count && !x2count) { printk(KERN_ERR PREFIX "No LAPIC entries present\n"); @@ -1259,8 +1291,7 @@ static void __init acpi_process_madt(void) */ error = acpi_parse_madt_ioapic_entries(); if (!error) { - acpi_irq_model = ACPI_IRQ_MODEL_IOAPIC; - acpi_ioapic = 1; + acpi_set_irq_model_ioapic(); smp_found_config = 1; } diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index 74a847835bab..69fd72aa5594 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -15,7 +15,6 @@ #ifdef CONFIG_X86_32 #include <asm/pgtable.h> -#include <asm/pgtable_32.h> #endif #include "realmode/wakeup.h" diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index a36bb90aef53..553d0b0d639b 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -591,17 +591,21 @@ static atomic_t stop_machine_first; static int wrote_text; struct text_poke_params { - void *addr; - const void *opcode; - size_t len; + struct text_poke_param *params; + int nparams; }; static int __kprobes stop_machine_text_poke(void *data) { struct text_poke_params *tpp = data; + struct text_poke_param *p; + int i; if (atomic_dec_and_test(&stop_machine_first)) { - text_poke(tpp->addr, tpp->opcode, tpp->len); + for (i = 0; i < tpp->nparams; i++) { + p = &tpp->params[i]; + text_poke(p->addr, p->opcode, p->len); + } smp_wmb(); /* Make sure other cpus see that this has run */ wrote_text = 1; } else { @@ -610,8 +614,12 @@ static int __kprobes stop_machine_text_poke(void *data) smp_mb(); /* Load wrote_text before following execution */ } - flush_icache_range((unsigned long)tpp->addr, - (unsigned long)tpp->addr + tpp->len); + for (i = 0; i < tpp->nparams; i++) { + 
p = &tpp->params[i]; + flush_icache_range((unsigned long)p->addr, + (unsigned long)p->addr + p->len); + } + return 0; } @@ -631,78 +639,62 @@ static int __kprobes stop_machine_text_poke(void *data) void *__kprobes text_poke_smp(void *addr, const void *opcode, size_t len) { struct text_poke_params tpp; + struct text_poke_param p; - tpp.addr = addr; - tpp.opcode = opcode; - tpp.len = len; + p.addr = addr; + p.opcode = opcode; + p.len = len; + tpp.params = &p; + tpp.nparams = 1; atomic_set(&stop_machine_first, 1); wrote_text = 0; /* Use __stop_machine() because the caller already got online_cpus. */ - __stop_machine(stop_machine_text_poke, (void *)&tpp, NULL); + __stop_machine(stop_machine_text_poke, (void *)&tpp, cpu_online_mask); return addr; } +/** + * text_poke_smp_batch - Update instructions on a live kernel on SMP + * @params: an array of text_poke parameters + * @n: the number of elements in params. + * + * Modify multi-byte instruction by using stop_machine() on SMP. Since the + * stop_machine() is heavy task, it is better to aggregate text_poke requests + * and do it once if possible. + * + * Note: Must be called under get_online_cpus() and text_mutex. + */ +void __kprobes text_poke_smp_batch(struct text_poke_param *params, int n) +{ + struct text_poke_params tpp = {.params = params, .nparams = n}; + + atomic_set(&stop_machine_first, 1); + wrote_text = 0; + stop_machine(stop_machine_text_poke, (void *)&tpp, NULL); +} + #if defined(CONFIG_DYNAMIC_FTRACE) || defined(HAVE_JUMP_LABEL) -unsigned char ideal_nop5[IDEAL_NOP_SIZE_5]; +#ifdef CONFIG_X86_64 +unsigned char ideal_nop5[5] = { 0x66, 0x66, 0x66, 0x66, 0x90 }; +#else +unsigned char ideal_nop5[5] = { 0x3e, 0x8d, 0x74, 0x26, 0x00 }; +#endif void __init arch_init_ideal_nop5(void) { - extern const unsigned char ftrace_test_p6nop[]; - extern const unsigned char ftrace_test_nop5[]; - extern const unsigned char ftrace_test_jmp[]; - int faulted = 0; - /* - * There is no good nop for all x86 archs. - * We will default to using the P6_NOP5, but first we - * will test to make sure that the nop will actually - * work on this CPU. If it faults, we will then - * go to a lesser efficient 5 byte nop. If that fails - * we then just use a jmp as our nop. This isn't the most - * efficient nop, but we can not use a multi part nop - * since we would then risk being preempted in the middle - * of that nop, and if we enabled tracing then, it might - * cause a system crash. + * There is no good nop for all x86 archs. This selection + * algorithm should be unified with the one in find_nop_table(), + * but this should be good enough for now. * - * TODO: check the cpuid to determine the best nop. + * For cases other than the ones below, use the safe (as in + * always functional) defaults above. 
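
[Annotation, not part of the patch] text_poke_smp_batch(), added above, exists so a caller with many patch sites can pay for a single stop_machine() rendezvous instead of one per site. A sketch of the intended usage under the locking rules its kerneldoc states (get_online_cpus() and text_mutex held); the sites and opcode buffers are hypothetical caller-owned data:

/* Sketch: batch two 5-byte patches under one stop_machine(). */
static void example_patch_two_sites(void *site1, const void *buf1,
				    void *site2, const void *buf2)
{
	struct text_poke_param reqs[2] = {
		{ .addr = site1, .opcode = buf1, .len = 5 },
		{ .addr = site2, .opcode = buf2, .len = 5 },
	};

	get_online_cpus();
	mutex_lock(&text_mutex);
	text_poke_smp_batch(reqs, 2);	/* one rendezvous, two sites */
	mutex_unlock(&text_mutex);
	put_online_cpus();
}
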
*/ - asm volatile ( - "ftrace_test_jmp:" - "jmp ftrace_test_p6nop\n" - "nop\n" - "nop\n" - "nop\n" /* 2 byte jmp + 3 bytes */ - "ftrace_test_p6nop:" - P6_NOP5 - "jmp 1f\n" - "ftrace_test_nop5:" - ".byte 0x66,0x66,0x66,0x66,0x90\n" - "1:" - ".section .fixup, \"ax\"\n" - "2: movl $1, %0\n" - " jmp ftrace_test_nop5\n" - "3: movl $2, %0\n" - " jmp 1b\n" - ".previous\n" - _ASM_EXTABLE(ftrace_test_p6nop, 2b) - _ASM_EXTABLE(ftrace_test_nop5, 3b) - : "=r"(faulted) : "0" (faulted)); - - switch (faulted) { - case 0: - pr_info("converting mcount calls to 0f 1f 44 00 00\n"); - memcpy(ideal_nop5, ftrace_test_p6nop, IDEAL_NOP_SIZE_5); - break; - case 1: - pr_info("converting mcount calls to 66 66 66 66 90\n"); - memcpy(ideal_nop5, ftrace_test_nop5, IDEAL_NOP_SIZE_5); - break; - case 2: - pr_info("converting mcount calls to jmp . + 5\n"); - memcpy(ideal_nop5, ftrace_test_jmp, IDEAL_NOP_SIZE_5); - break; - } - +#ifdef CONFIG_X86_64 + /* Don't use these on 32 bits due to broken virtualizers */ + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) + memcpy(ideal_nop5, p6_nops[5], 5); +#endif } #endif diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index 8f6463d8ed0d..affacb5e0065 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -12,95 +12,116 @@ static u32 *flush_words; -struct pci_device_id k8_nb_ids[] = { +struct pci_device_id amd_nb_misc_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_MISC) }, {} }; -EXPORT_SYMBOL(k8_nb_ids); +EXPORT_SYMBOL(amd_nb_misc_ids); -struct k8_northbridge_info k8_northbridges; -EXPORT_SYMBOL(k8_northbridges); +struct amd_northbridge_info amd_northbridges; +EXPORT_SYMBOL(amd_northbridges); -static struct pci_dev *next_k8_northbridge(struct pci_dev *dev) +static struct pci_dev *next_northbridge(struct pci_dev *dev, + struct pci_device_id *ids) { do { dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev); if (!dev) break; - } while (!pci_match_id(&k8_nb_ids[0], dev)); + } while (!pci_match_id(ids, dev)); return dev; } -int cache_k8_northbridges(void) +int amd_cache_northbridges(void) { - int i; - struct pci_dev *dev; + int i = 0; + struct amd_northbridge *nb; + struct pci_dev *misc; - if (k8_northbridges.num) + if (amd_nb_num()) return 0; - dev = NULL; - while ((dev = next_k8_northbridge(dev)) != NULL) - k8_northbridges.num++; + misc = NULL; + while ((misc = next_northbridge(misc, amd_nb_misc_ids)) != NULL) + i++; - /* some CPU families (e.g. family 0x11) do not support GART */ - if (boot_cpu_data.x86 == 0xf || boot_cpu_data.x86 == 0x10 || - boot_cpu_data.x86 == 0x15) - k8_northbridges.gart_supported = 1; + if (i == 0) + return 0; - k8_northbridges.nb_misc = kmalloc((k8_northbridges.num + 1) * - sizeof(void *), GFP_KERNEL); - if (!k8_northbridges.nb_misc) + nb = kzalloc(i * sizeof(struct amd_northbridge), GFP_KERNEL); + if (!nb) return -ENOMEM; - if (!k8_northbridges.num) { - k8_northbridges.nb_misc[0] = NULL; - return 0; - } + amd_northbridges.nb = nb; + amd_northbridges.num = i; - if (k8_northbridges.gart_supported) { - flush_words = kmalloc(k8_northbridges.num * sizeof(u32), - GFP_KERNEL); - if (!flush_words) { - kfree(k8_northbridges.nb_misc); - return -ENOMEM; - } - } + misc = NULL; + for (i = 0; i != amd_nb_num(); i++) { + node_to_amd_nb(i)->misc = misc = + next_northbridge(misc, amd_nb_misc_ids); + } + + /* some CPU families (e.g. 
family 0x11) do not support GART */ + if (boot_cpu_data.x86 == 0xf || boot_cpu_data.x86 == 0x10 || + boot_cpu_data.x86 == 0x15) + amd_northbridges.flags |= AMD_NB_GART; + + /* + * Some CPU families support L3 Cache Index Disable. There are some + * limitations because of E382 and E388 on family 0x10. + */ + if (boot_cpu_data.x86 == 0x10 && + boot_cpu_data.x86_model >= 0x8 && + (boot_cpu_data.x86_model > 0x9 || + boot_cpu_data.x86_mask >= 0x1)) + amd_northbridges.flags |= AMD_NB_L3_INDEX_DISABLE; - dev = NULL; - i = 0; - while ((dev = next_k8_northbridge(dev)) != NULL) { - k8_northbridges.nb_misc[i] = dev; - if (k8_northbridges.gart_supported) - pci_read_config_dword(dev, 0x9c, &flush_words[i++]); - } - k8_northbridges.nb_misc[i] = NULL; return 0; } -EXPORT_SYMBOL_GPL(cache_k8_northbridges); +EXPORT_SYMBOL_GPL(amd_cache_northbridges); /* Ignores subdevice/subvendor but as far as I can figure out they're useless anyways */ -int __init early_is_k8_nb(u32 device) +int __init early_is_amd_nb(u32 device) { struct pci_device_id *id; u32 vendor = device & 0xffff; device >>= 16; - for (id = k8_nb_ids; id->vendor; id++) + for (id = amd_nb_misc_ids; id->vendor; id++) if (vendor == id->vendor && device == id->device) return 1; return 0; } -void k8_flush_garts(void) +int amd_cache_gart(void) +{ + int i; + + if (!amd_nb_has_feature(AMD_NB_GART)) + return 0; + + flush_words = kmalloc(amd_nb_num() * sizeof(u32), GFP_KERNEL); + if (!flush_words) { + amd_northbridges.flags &= ~AMD_NB_GART; + return -ENOMEM; + } + + for (i = 0; i != amd_nb_num(); i++) + pci_read_config_dword(node_to_amd_nb(i)->misc, 0x9c, + &flush_words[i]); + + return 0; +} + +void amd_flush_garts(void) { int flushed, i; unsigned long flags; static DEFINE_SPINLOCK(gart_lock); - if (!k8_northbridges.gart_supported) + if (!amd_nb_has_feature(AMD_NB_GART)) return; /* Avoid races between AGP and IOMMU. In theory it's not needed @@ -109,16 +130,16 @@ void k8_flush_garts(void) that it doesn't matter to serialize more. 
-AK */ spin_lock_irqsave(&gart_lock, flags); flushed = 0; - for (i = 0; i < k8_northbridges.num; i++) { - pci_write_config_dword(k8_northbridges.nb_misc[i], 0x9c, - flush_words[i]|1); + for (i = 0; i < amd_nb_num(); i++) { + pci_write_config_dword(node_to_amd_nb(i)->misc, 0x9c, + flush_words[i] | 1); flushed++; } - for (i = 0; i < k8_northbridges.num; i++) { + for (i = 0; i < amd_nb_num(); i++) { u32 w; /* Make sure the hardware actually executed the flush*/ for (;;) { - pci_read_config_dword(k8_northbridges.nb_misc[i], + pci_read_config_dword(node_to_amd_nb(i)->misc, 0x9c, &w); if (!(w & 1)) break; @@ -129,19 +150,23 @@ void k8_flush_garts(void) if (!flushed) printk("nothing to flush?\n"); } -EXPORT_SYMBOL_GPL(k8_flush_garts); +EXPORT_SYMBOL_GPL(amd_flush_garts); -static __init int init_k8_nbs(void) +static __init int init_amd_nbs(void) { int err = 0; - err = cache_k8_northbridges(); + err = amd_cache_northbridges(); if (err < 0) - printk(KERN_NOTICE "K8 NB: Cannot enumerate AMD northbridges.\n"); + printk(KERN_NOTICE "AMD NB: Cannot enumerate AMD northbridges.\n"); + + if (amd_cache_gart() < 0) + printk(KERN_NOTICE "AMD NB: Cannot initialize GART flush words, " + "GART support disabled.\n"); return err; } /* This has to go after the PCI subsystem */ -fs_initcall(init_k8_nbs); +fs_initcall(init_amd_nbs); diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index b3a16e8f0703..dcd7c83e1659 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -206,7 +206,7 @@ static u32 __init read_agp(int bus, int slot, int func, int cap, u32 *order) * Do an PCI bus scan by hand because we're running before the PCI * subsystem. * - * All K8 AGP bridges are AGPv3 compliant, so we can do this scan + * All AMD AGP bridges are AGPv3 compliant, so we can do this scan * generically. It's probably overkill to always scan all slots because * the AGP bridges should be always an own bus on the HT hierarchy, * but do it here for future safety. 
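
[Annotation, not part of the patch] With the k8_* to amd_* conversion above, callers stop indexing a bare nb_misc array and instead go through the accessors this patch introduces: amd_nb_num(), node_to_amd_nb() and amd_nb_has_feature(). A minimal consumer sketch mirroring the amd_flush_garts() loop above; the header location is assumed:

#include <asm/amd_nb.h>		/* assumed home of the new helpers */

static void example_walk_northbridges(void)
{
	int i;
	u32 val;

	if (!amd_nb_has_feature(AMD_NB_GART))
		return;		/* e.g. family 0x11 has no GART */

	for (i = 0; i < amd_nb_num(); i++)
		/* the misc device is function 3 of each northbridge */
		pci_read_config_dword(node_to_amd_nb(i)->misc, 0x9c, &val);
}
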
@@ -303,7 +303,7 @@ void __init early_gart_iommu_check(void) dev_limit = bus_dev_ranges[i].dev_limit; for (slot = dev_base; slot < dev_limit; slot++) { - if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00))) + if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00))) continue; ctl = read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL); @@ -358,7 +358,7 @@ void __init early_gart_iommu_check(void) dev_limit = bus_dev_ranges[i].dev_limit; for (slot = dev_base; slot < dev_limit; slot++) { - if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00))) + if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00))) continue; ctl = read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL); @@ -400,7 +400,7 @@ int __init gart_iommu_hole_init(void) dev_limit = bus_dev_ranges[i].dev_limit; for (slot = dev_base; slot < dev_limit; slot++) { - if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00))) + if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00))) continue; iommu_detected = 1; @@ -518,7 +518,7 @@ out: dev_base = bus_dev_ranges[i].dev_base; dev_limit = bus_dev_ranges[i].dev_limit; for (slot = dev_base; slot < dev_limit; slot++) { - if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00))) + if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00))) continue; write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, ctl); diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile index 910f20b457c4..3966b564ea47 100644 --- a/arch/x86/kernel/apic/Makefile +++ b/arch/x86/kernel/apic/Makefile @@ -3,10 +3,7 @@ # obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_noop.o probe_$(BITS).o ipi.o -ifneq ($(CONFIG_HARDLOCKUP_DETECTOR),y) -obj-$(CONFIG_X86_LOCAL_APIC) += nmi.o -endif -obj-$(CONFIG_HARDLOCKUP_DETECTOR) += hw_nmi.o +obj-y += hw_nmi.o obj-$(CONFIG_X86_IO_APIC) += io_apic.o obj-$(CONFIG_SMP) += ipi.o diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index cb1304856a5c..879999a5230f 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -31,7 +31,6 @@ #include <linux/init.h> #include <linux/cpu.h> #include <linux/dmi.h> -#include <linux/nmi.h> #include <linux/smp.h> #include <linux/mm.h> @@ -52,7 +51,6 @@ #include <asm/mce.h> #include <asm/kvm_para.h> #include <asm/tsc.h> -#include <asm/atomic.h> unsigned int num_processors; @@ -801,11 +799,7 @@ void __init setup_boot_APIC_clock(void) * PIT/HPET going. Otherwise register lapic as a dummy * device. */ - if (nmi_watchdog != NMI_IO_APIC) - lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY; - else - pr_warning("APIC timer registered as dummy," - " due to nmi_watchdog=%d!\n", nmi_watchdog); + lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY; /* Setup the lapic or request the broadcast */ setup_APIC_timer(); @@ -1389,8 +1383,15 @@ void __cpuinit end_local_APIC_setup(void) } #endif - setup_apic_nmi_watchdog(NULL); apic_pm_activate(); + + /* + * Now that local APIC setup is completed for BP, configure the fault + * handling for interrupt remapping. 
+ */ + if (!smp_processor_id() && intr_remapping_enabled) + enable_drhd_fault_handling(); + } #ifdef CONFIG_X86_X2APIC @@ -1532,13 +1533,60 @@ static int __init detect_init_APIC(void) return 0; } #else + +static int apic_verify(void) +{ + u32 features, h, l; + + /* + * The APIC feature bit should now be enabled + * in `cpuid' + */ + features = cpuid_edx(1); + if (!(features & (1 << X86_FEATURE_APIC))) { + pr_warning("Could not enable APIC!\n"); + return -1; + } + set_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); + mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; + + /* The BIOS may have set up the APIC at some other address */ + rdmsr(MSR_IA32_APICBASE, l, h); + if (l & MSR_IA32_APICBASE_ENABLE) + mp_lapic_addr = l & MSR_IA32_APICBASE_BASE; + + pr_info("Found and enabled local APIC!\n"); + return 0; +} + +int apic_force_enable(void) +{ + u32 h, l; + + if (disable_apic) + return -1; + + /* + * Some BIOSes disable the local APIC in the APIC_BASE + * MSR. This can only be done in software for Intel P6 or later + * and AMD K7 (Model > 1) or later. + */ + rdmsr(MSR_IA32_APICBASE, l, h); + if (!(l & MSR_IA32_APICBASE_ENABLE)) { + pr_info("Local APIC disabled by BIOS -- reenabling.\n"); + l &= ~MSR_IA32_APICBASE_BASE; + l |= MSR_IA32_APICBASE_ENABLE | APIC_DEFAULT_PHYS_BASE; + wrmsr(MSR_IA32_APICBASE, l, h); + enabled_via_apicbase = 1; + } + return apic_verify(); +} + /* * Detect and initialize APIC */ static int __init detect_init_APIC(void) { - u32 h, l, features; - /* Disabled by kernel option? */ if (disable_apic) return -1; @@ -1568,38 +1616,12 @@ static int __init detect_init_APIC(void) "you can enable it with \"lapic\"\n"); return -1; } - /* - * Some BIOSes disable the local APIC in the APIC_BASE - * MSR. This can only be done in software for Intel P6 or later - * and AMD K7 (Model > 1) or later. - */ - rdmsr(MSR_IA32_APICBASE, l, h); - if (!(l & MSR_IA32_APICBASE_ENABLE)) { - pr_info("Local APIC disabled by BIOS -- reenabling.\n"); - l &= ~MSR_IA32_APICBASE_BASE; - l |= MSR_IA32_APICBASE_ENABLE | APIC_DEFAULT_PHYS_BASE; - wrmsr(MSR_IA32_APICBASE, l, h); - enabled_via_apicbase = 1; - } - } - /* - * The APIC feature bit should now be enabled - * in `cpuid' - */ - features = cpuid_edx(1); - if (!(features & (1 << X86_FEATURE_APIC))) { - pr_warning("Could not enable APIC!\n"); - return -1; + if (apic_force_enable()) + return -1; + } else { + if (apic_verify()) + return -1; } - set_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); - mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; - - /* The BIOS may have set up the APIC at some other address */ - rdmsr(MSR_IA32_APICBASE, l, h); - if (l & MSR_IA32_APICBASE_ENABLE) - mp_lapic_addr = l & MSR_IA32_APICBASE_BASE; - - pr_info("Found and enabled local APIC!\n"); apic_pm_activate(); @@ -1687,7 +1709,7 @@ void __init init_apic_mappings(void) * This initializes the IO-APIC and APIC hardware if this is * a UP kernel. 
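
[Annotation, not part of the patch] The refactor above pulls the APIC_BASE MSR dance out of detect_init_APIC() into apic_verify() and apic_force_enable() so other probe paths can reuse it. A sketch of the resulting calling convention; the caller name is illustrative:

/* Sketch: a probe path that insists on a working local APIC. */
static int __init example_bringup_lapic(void)
{
	if (apic_force_enable())
		return -ENODEV;	/* BIOS left it off and the
				 * APIC_BASE enable bit could
				 * not be re-armed */

	/* apic_verify() has set mp_lapic_addr from the MSR by now */
	return 0;
}
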
*/ -int apic_version[MAX_APICS]; +int apic_version[MAX_LOCAL_APIC]; int __init APIC_init_uniprocessor(void) { @@ -1752,17 +1774,10 @@ int __init APIC_init_uniprocessor(void) setup_IO_APIC(); else { nr_ioapics = 0; - localise_nmi_watchdog(); } -#else - localise_nmi_watchdog(); #endif x86_init.timers.setup_percpu_clockev(); -#ifdef CONFIG_X86_64 - check_nmi_watchdog(); -#endif - return 0; } diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c index cefd6942f0e9..72ec29e1ae06 100644 --- a/arch/x86/kernel/apic/hw_nmi.c +++ b/arch/x86/kernel/apic/hw_nmi.c @@ -17,19 +17,31 @@ #include <linux/nmi.h> #include <linux/module.h> -/* For reliability, we're prepared to waste bits here. */ -static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly; - +#ifdef CONFIG_HARDLOCKUP_DETECTOR u64 hw_nmi_get_sample_period(void) { return (u64)(cpu_khz) * 1000 * 60; } +#endif + +#ifdef arch_trigger_all_cpu_backtrace +/* For reliability, we're prepared to waste bits here. */ +static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly; + +/* "in progress" flag of arch_trigger_all_cpu_backtrace */ +static unsigned long backtrace_flag; -#ifdef ARCH_HAS_NMI_WATCHDOG void arch_trigger_all_cpu_backtrace(void) { int i; + if (test_and_set_bit(0, &backtrace_flag)) + /* + * If there is already a trigger_all_cpu_backtrace() in progress + * (backtrace_flag == 1), don't output double cpu dump infos. + */ + return; + cpumask_copy(to_cpumask(backtrace_mask), cpu_online_mask); printk(KERN_INFO "sending NMI to all CPUs:\n"); @@ -41,6 +53,9 @@ void arch_trigger_all_cpu_backtrace(void) break; mdelay(1); } + + clear_bit(0, &backtrace_flag); + smp_mb__after_clear_bit(); } static int __kprobes @@ -49,7 +64,7 @@ arch_trigger_all_cpu_backtrace_handler(struct notifier_block *self, { struct die_args *args = __args; struct pt_regs *regs; - int cpu = smp_processor_id(); + int cpu; switch (cmd) { case DIE_NMI: @@ -61,6 +76,7 @@ arch_trigger_all_cpu_backtrace_handler(struct notifier_block *self, } regs = args->regs; + cpu = smp_processor_id(); if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) { static arch_spinlock_t lock = __ARCH_SPIN_LOCK_UNLOCKED; @@ -90,18 +106,3 @@ static int __init register_trigger_all_cpu_backtrace(void) } early_initcall(register_trigger_all_cpu_backtrace); #endif - -/* STUB calls to mimic old nmi_watchdog behaviour */ -#if defined(CONFIG_X86_LOCAL_APIC) -unsigned int nmi_watchdog = NMI_NONE; -EXPORT_SYMBOL(nmi_watchdog); -void acpi_nmi_enable(void) { return; } -void acpi_nmi_disable(void) { return; } -#endif -atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */ -EXPORT_SYMBOL(nmi_active); -int unknown_nmi_panic; -void cpu_nmi_set_wd_enabled(void) { return; } -void stop_apic_nmi_watchdog(void *unused) { return; } -void setup_apic_nmi_watchdog(void *unused) { return; } -int __init check_nmi_watchdog(void) { return 0; } diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 8ae808d110f4..f6cd5b410770 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -54,7 +54,6 @@ #include <asm/dma.h> #include <asm/timer.h> #include <asm/i8259.h> -#include <asm/nmi.h> #include <asm/msidef.h> #include <asm/hypertransport.h> #include <asm/setup.h> @@ -1934,8 +1933,7 @@ void disable_IO_APIC(void) * * by Matt Domsch <Matt_Domsch@dell.com> Tue Dec 21 12:25:05 CST 1999 */ - -void __init setup_ioapic_ids_from_mpc(void) +void __init setup_ioapic_ids_from_mpc_nocheck(void) { union IO_APIC_reg_00 reg_00; physid_mask_t phys_id_present_map; @@ 
-1944,15 +1942,6 @@ void __init setup_ioapic_ids_from_mpc(void) unsigned char old_id; unsigned long flags; - if (acpi_ioapic) - return; - /* - * Don't check I/O APIC IDs for xAPIC systems. They have - * no meaning without the serial APIC bus. - */ - if (!(boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) - || APIC_XAPIC(apic_version[boot_cpu_physical_apicid])) - return; /* * This is broken; anything with a real cpu count has to * circumvent this idiocy regardless. @@ -2006,7 +1995,6 @@ void __init setup_ioapic_ids_from_mpc(void) physids_or(phys_id_present_map, phys_id_present_map, tmp); } - /* * We need to adjust the IRQ routing table * if the ID changed. @@ -2042,6 +2030,21 @@ void __init setup_ioapic_ids_from_mpc(void) apic_printk(APIC_VERBOSE, " ok.\n"); } } + +void __init setup_ioapic_ids_from_mpc(void) +{ + + if (acpi_ioapic) + return; + /* + * Don't check I/O APIC IDs for xAPIC systems. They have + * no meaning without the serial APIC bus. + */ + if (!(boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) + || APIC_XAPIC(apic_version[boot_cpu_physical_apicid])) + return; + setup_ioapic_ids_from_mpc_nocheck(); +} #endif int no_timer_check __initdata; @@ -2430,13 +2433,12 @@ static void ack_apic_level(struct irq_data *data) { struct irq_cfg *cfg = data->chip_data; int i, do_unmask_irq = 0, irq = data->irq; - struct irq_desc *desc = irq_to_desc(irq); unsigned long v; irq_complete_move(cfg); #ifdef CONFIG_GENERIC_PENDING_IRQ /* If we are moving the irq we need to mask it */ - if (unlikely(desc->status & IRQ_MOVE_PENDING)) { + if (unlikely(irq_to_desc(irq)->status & IRQ_MOVE_PENDING)) { do_unmask_irq = 1; mask_ioapic(cfg); } @@ -2643,24 +2645,6 @@ static void lapic_register_intr(int irq) "edge"); } -static void __init setup_nmi(void) -{ - /* - * Dirty trick to enable the NMI watchdog ... - * We put the 8259A master into AEOI mode and - * unmask on all local APICs LVT0 as NMI. - * - * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire') - * is from Maciej W. Rozycki - so we do not have to EOI from - * the NMI handler or the timer interrupt. - */ - apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ..."); - - enable_NMI_through_LVT0(); - - apic_printk(APIC_VERBOSE, " done.\n"); -} - /* * This looks a bit hackish but it's about the only one way of sending * a few INTA cycles to 8259As and any associated glue logic. ICR does @@ -2766,15 +2750,6 @@ static inline void __init check_timer(void) */ apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); legacy_pic->init(1); -#ifdef CONFIG_X86_32 - { - unsigned int ver; - - ver = apic_read(APIC_LVR); - ver = GET_APIC_VERSION(ver); - timer_ack = (nmi_watchdog == NMI_IO_APIC && !APIC_INTEGRATED(ver)); - } -#endif pin1 = find_isa_irq_pin(0, mp_INT); apic1 = find_isa_irq_apic(0, mp_INT); @@ -2822,10 +2797,6 @@ static inline void __init check_timer(void) unmask_ioapic(cfg); } if (timer_irq_works()) { - if (nmi_watchdog == NMI_IO_APIC) { - setup_nmi(); - legacy_pic->unmask(0); - } if (disable_timer_pin_1 > 0) clear_IO_APIC_pin(0, pin1); goto out; @@ -2851,11 +2822,6 @@ static inline void __init check_timer(void) if (timer_irq_works()) { apic_printk(APIC_QUIET, KERN_INFO "....... works.\n"); timer_through_8259 = 1; - if (nmi_watchdog == NMI_IO_APIC) { - legacy_pic->mask(0); - setup_nmi(); - legacy_pic->unmask(0); - } goto out; } /* @@ -2867,15 +2833,6 @@ static inline void __init check_timer(void) apic_printk(APIC_QUIET, KERN_INFO "....... 
failed.\n"); } - if (nmi_watchdog == NMI_IO_APIC) { - apic_printk(APIC_QUIET, KERN_WARNING "timer doesn't work " - "through the IO-APIC - disabling NMI Watchdog!\n"); - nmi_watchdog = NMI_NONE; - } -#ifdef CONFIG_X86_32 - timer_ack = 0; -#endif - apic_printk(APIC_QUIET, KERN_INFO "...trying to set up timer as Virtual Wire IRQ...\n"); @@ -3109,7 +3066,7 @@ void destroy_irq(unsigned int irq) irq_set_status_flags(irq, IRQ_NOREQUEST|IRQ_NOPROBE); - if (intr_remapping_enabled) + if (irq_remapped(cfg)) free_irte(irq); raw_spin_lock_irqsave(&vector_lock, flags); __clear_irq_vector(irq, cfg); @@ -3331,7 +3288,7 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq) return 0; } -int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) +int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) { int node, ret, sub_handle, index = 0; unsigned int irq, irq_want; @@ -3389,7 +3346,7 @@ error: return ret; } -void arch_teardown_msi_irq(unsigned int irq) +void native_teardown_msi_irq(unsigned int irq) { destroy_irq(irq); } @@ -3413,6 +3370,7 @@ dmar_msi_set_affinity(struct irq_data *data, const struct cpumask *mask, msg.data |= MSI_DATA_VECTOR(cfg->vector); msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; msg.address_lo |= MSI_ADDR_DEST_ID(dest); + msg.address_hi = MSI_ADDR_BASE_HI | MSI_ADDR_EXT_DEST_ID(dest); dmar_msi_write(irq, &msg); @@ -3639,7 +3597,7 @@ int __init io_apic_get_redir_entries (int ioapic) return reg_01.bits.entries + 1; } -void __init probe_nr_irqs_gsi(void) +static void __init probe_nr_irqs_gsi(void) { int nr; @@ -3650,6 +3608,11 @@ void __init probe_nr_irqs_gsi(void) printk(KERN_DEBUG "nr_irqs_gsi: %d\n", nr_irqs_gsi); } +int get_nr_irqs_gsi(void) +{ + return nr_irqs_gsi; +} + #ifdef CONFIG_SPARSE_IRQ int __init arch_probe_nr_irqs(void) { @@ -3951,7 +3914,7 @@ static struct resource * __init ioapic_setup_resources(int nr_ioapics) return res; } -void __init ioapic_init_mappings(void) +void __init ioapic_and_gsi_init(void) { unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0; struct resource *ioapic_res; @@ -3989,6 +3952,8 @@ fake_ioapic_page: ioapic_res->end = ioapic_phys + IO_APIC_SLOT_SIZE - 1; ioapic_res++; } + + probe_nr_irqs_gsi(); } void __init ioapic_insert_resources(void) @@ -4098,7 +4063,8 @@ void __init pre_init_apic_IRQ0(void) printk(KERN_INFO "Early APIC setup for system timer0\n"); #ifndef CONFIG_SMP - phys_cpu_present_map = physid_mask_of_physid(boot_cpu_physical_apicid); + physid_set_mask_of_physid(boot_cpu_physical_apicid, + &phys_cpu_present_map); #endif /* Make sure the irq descriptor is set up */ cfg = alloc_irq_and_cfg_at(0, 0); diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c deleted file mode 100644 index c90041ccb742..000000000000 --- a/arch/x86/kernel/apic/nmi.c +++ /dev/null @@ -1,567 +0,0 @@ -/* - * NMI watchdog support on APIC systems - * - * Started by Ingo Molnar <mingo@redhat.com> - * - * Fixes: - * Mikael Pettersson : AMD K7 support for local APIC NMI watchdog. - * Mikael Pettersson : Power Management for local APIC NMI watchdog. - * Mikael Pettersson : Pentium 4 support for local APIC NMI watchdog. - * Pavel Machek and - * Mikael Pettersson : PM converted to driver model. Disable/enable API. 
- */ - -#include <asm/apic.h> - -#include <linux/nmi.h> -#include <linux/mm.h> -#include <linux/delay.h> -#include <linux/interrupt.h> -#include <linux/module.h> -#include <linux/slab.h> -#include <linux/sysdev.h> -#include <linux/sysctl.h> -#include <linux/percpu.h> -#include <linux/kprobes.h> -#include <linux/cpumask.h> -#include <linux/kernel_stat.h> -#include <linux/kdebug.h> -#include <linux/smp.h> - -#include <asm/i8259.h> -#include <asm/io_apic.h> -#include <asm/proto.h> -#include <asm/timer.h> - -#include <asm/mce.h> - -#include <asm/mach_traps.h> - -int unknown_nmi_panic; -int nmi_watchdog_enabled; - -/* For reliability, we're prepared to waste bits here. */ -static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly; - -/* nmi_active: - * >0: the lapic NMI watchdog is active, but can be disabled - * <0: the lapic NMI watchdog has not been set up, and cannot - * be enabled - * 0: the lapic NMI watchdog is disabled, but can be enabled - */ -atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */ -EXPORT_SYMBOL(nmi_active); - -unsigned int nmi_watchdog = NMI_NONE; -EXPORT_SYMBOL(nmi_watchdog); - -static int panic_on_timeout; - -static unsigned int nmi_hz = HZ; -static DEFINE_PER_CPU(short, wd_enabled); -static int endflag __initdata; - -static inline unsigned int get_nmi_count(int cpu) -{ - return per_cpu(irq_stat, cpu).__nmi_count; -} - -static inline int mce_in_progress(void) -{ -#if defined(CONFIG_X86_MCE) - return atomic_read(&mce_entry) > 0; -#endif - return 0; -} - -/* - * Take the local apic timer and PIT/HPET into account. We don't - * know which one is active, when we have highres/dyntick on - */ -static inline unsigned int get_timer_irqs(int cpu) -{ - return per_cpu(irq_stat, cpu).apic_timer_irqs + - per_cpu(irq_stat, cpu).irq0_irqs; -} - -#ifdef CONFIG_SMP -/* - * The performance counters used by NMI_LOCAL_APIC don't trigger when - * the CPU is idle. To make sure the NMI watchdog really ticks on all - * CPUs during the test make them busy. - */ -static __init void nmi_cpu_busy(void *data) -{ - local_irq_enable_in_hardirq(); - /* - * Intentionally don't use cpu_relax here. This is - * to make sure that the performance counter really ticks, - * even if there is a simulator or similar that catches the - * pause instruction. On a real HT machine this is fine because - * all other CPUs are busy with "useless" delay loops and don't - * care if they get somewhat less cycles. - */ - while (endflag == 0) - mb(); -} -#endif - -static void report_broken_nmi(int cpu, unsigned int *prev_nmi_count) -{ - printk(KERN_CONT "\n"); - - printk(KERN_WARNING - "WARNING: CPU#%d: NMI appears to be stuck (%d->%d)!\n", - cpu, prev_nmi_count[cpu], get_nmi_count(cpu)); - - printk(KERN_WARNING - "Please report this to bugzilla.kernel.org,\n"); - printk(KERN_WARNING - "and attach the output of the 'dmesg' command.\n"); - - per_cpu(wd_enabled, cpu) = 0; - atomic_dec(&nmi_active); -} - -static void __acpi_nmi_disable(void *__unused) -{ - apic_write(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED); -} - -int __init check_nmi_watchdog(void) -{ - unsigned int *prev_nmi_count; - int cpu; - - if (!nmi_watchdog_active() || !atomic_read(&nmi_active)) - return 0; - - prev_nmi_count = kmalloc(nr_cpu_ids * sizeof(int), GFP_KERNEL); - if (!prev_nmi_count) - goto error; - - printk(KERN_INFO "Testing NMI watchdog ... 
"); - -#ifdef CONFIG_SMP - if (nmi_watchdog == NMI_LOCAL_APIC) - smp_call_function(nmi_cpu_busy, (void *)&endflag, 0); -#endif - - for_each_possible_cpu(cpu) - prev_nmi_count[cpu] = get_nmi_count(cpu); - local_irq_enable(); - mdelay((20 * 1000) / nmi_hz); /* wait 20 ticks */ - - for_each_online_cpu(cpu) { - if (!per_cpu(wd_enabled, cpu)) - continue; - if (get_nmi_count(cpu) - prev_nmi_count[cpu] <= 5) - report_broken_nmi(cpu, prev_nmi_count); - } - endflag = 1; - if (!atomic_read(&nmi_active)) { - kfree(prev_nmi_count); - atomic_set(&nmi_active, -1); - goto error; - } - printk("OK.\n"); - - /* - * now that we know it works we can reduce NMI frequency to - * something more reasonable; makes a difference in some configs - */ - if (nmi_watchdog == NMI_LOCAL_APIC) - nmi_hz = lapic_adjust_nmi_hz(1); - - kfree(prev_nmi_count); - return 0; -error: - if (nmi_watchdog == NMI_IO_APIC) { - if (!timer_through_8259) - legacy_pic->mask(0); - on_each_cpu(__acpi_nmi_disable, NULL, 1); - } - -#ifdef CONFIG_X86_32 - timer_ack = 0; -#endif - return -1; -} - -static int __init setup_nmi_watchdog(char *str) -{ - unsigned int nmi; - - if (!strncmp(str, "panic", 5)) { - panic_on_timeout = 1; - str = strchr(str, ','); - if (!str) - return 1; - ++str; - } - - if (!strncmp(str, "lapic", 5)) - nmi_watchdog = NMI_LOCAL_APIC; - else if (!strncmp(str, "ioapic", 6)) - nmi_watchdog = NMI_IO_APIC; - else { - get_option(&str, &nmi); - if (nmi >= NMI_INVALID) - return 0; - nmi_watchdog = nmi; - } - - return 1; -} -__setup("nmi_watchdog=", setup_nmi_watchdog); - -/* - * Suspend/resume support - */ -#ifdef CONFIG_PM - -static int nmi_pm_active; /* nmi_active before suspend */ - -static int lapic_nmi_suspend(struct sys_device *dev, pm_message_t state) -{ - /* only CPU0 goes here, other CPUs should be offline */ - nmi_pm_active = atomic_read(&nmi_active); - stop_apic_nmi_watchdog(NULL); - BUG_ON(atomic_read(&nmi_active) != 0); - return 0; -} - -static int lapic_nmi_resume(struct sys_device *dev) -{ - /* only CPU0 goes here, other CPUs should be offline */ - if (nmi_pm_active > 0) { - setup_apic_nmi_watchdog(NULL); - touch_nmi_watchdog(); - } - return 0; -} - -static struct sysdev_class nmi_sysclass = { - .name = "lapic_nmi", - .resume = lapic_nmi_resume, - .suspend = lapic_nmi_suspend, -}; - -static struct sys_device device_lapic_nmi = { - .id = 0, - .cls = &nmi_sysclass, -}; - -static int __init init_lapic_nmi_sysfs(void) -{ - int error; - - /* - * should really be a BUG_ON but b/c this is an - * init call, it just doesn't work. 
-dcz - */ - if (nmi_watchdog != NMI_LOCAL_APIC) - return 0; - - if (atomic_read(&nmi_active) < 0) - return 0; - - error = sysdev_class_register(&nmi_sysclass); - if (!error) - error = sysdev_register(&device_lapic_nmi); - return error; -} - -/* must come after the local APIC's device_initcall() */ -late_initcall(init_lapic_nmi_sysfs); - -#endif /* CONFIG_PM */ - -static void __acpi_nmi_enable(void *__unused) -{ - apic_write(APIC_LVT0, APIC_DM_NMI); -} - -/* - * Enable timer based NMIs on all CPUs: - */ -void acpi_nmi_enable(void) -{ - if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC) - on_each_cpu(__acpi_nmi_enable, NULL, 1); -} - -/* - * Disable timer based NMIs on all CPUs: - */ -void acpi_nmi_disable(void) -{ - if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC) - on_each_cpu(__acpi_nmi_disable, NULL, 1); -} - -/* - * This function is called as soon the LAPIC NMI watchdog driver has everything - * in place and it's ready to check if the NMIs belong to the NMI watchdog - */ -void cpu_nmi_set_wd_enabled(void) -{ - __get_cpu_var(wd_enabled) = 1; -} - -void setup_apic_nmi_watchdog(void *unused) -{ - if (__get_cpu_var(wd_enabled)) - return; - - /* cheap hack to support suspend/resume */ - /* if cpu0 is not active neither should the other cpus */ - if (smp_processor_id() != 0 && atomic_read(&nmi_active) <= 0) - return; - - switch (nmi_watchdog) { - case NMI_LOCAL_APIC: - if (lapic_watchdog_init(nmi_hz) < 0) { - __get_cpu_var(wd_enabled) = 0; - return; - } - /* FALL THROUGH */ - case NMI_IO_APIC: - __get_cpu_var(wd_enabled) = 1; - atomic_inc(&nmi_active); - } -} - -void stop_apic_nmi_watchdog(void *unused) -{ - /* only support LOCAL and IO APICs for now */ - if (!nmi_watchdog_active()) - return; - if (__get_cpu_var(wd_enabled) == 0) - return; - if (nmi_watchdog == NMI_LOCAL_APIC) - lapic_watchdog_stop(); - else - __acpi_nmi_disable(NULL); - __get_cpu_var(wd_enabled) = 0; - atomic_dec(&nmi_active); -} - -/* - * the best way to detect whether a CPU has a 'hard lockup' problem - * is to check it's local APIC timer IRQ counts. If they are not - * changing then that CPU has some problem. - * - * as these watchdog NMI IRQs are generated on every CPU, we only - * have to check the current processor. - * - * since NMIs don't listen to _any_ locks, we have to be extremely - * careful not to rely on unsafe variables. The printk might lock - * up though, so we have to break up any console locks first ... - * [when there will be more tty-related locks, break them up here too!] - */ - -static DEFINE_PER_CPU(unsigned, last_irq_sum); -static DEFINE_PER_CPU(long, alert_counter); -static DEFINE_PER_CPU(int, nmi_touch); - -void touch_nmi_watchdog(void) -{ - if (nmi_watchdog_active()) { - unsigned cpu; - - /* - * Tell other CPUs to reset their alert counters. We cannot - * do it ourselves because the alert count increase is not - * atomic. - */ - for_each_present_cpu(cpu) { - if (per_cpu(nmi_touch, cpu) != 1) - per_cpu(nmi_touch, cpu) = 1; - } - } - - /* - * Tickle the softlockup detector too: - */ - touch_softlockup_watchdog(); -} -EXPORT_SYMBOL(touch_nmi_watchdog); - -notrace __kprobes int -nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) -{ - /* - * Since current_thread_info()-> is always on the stack, and we - * always switch the stack NMI-atomically, it's safe to use - * smp_processor_id(). 
- */ - unsigned int sum; - int touched = 0; - int cpu = smp_processor_id(); - int rc = 0; - - sum = get_timer_irqs(cpu); - - if (__get_cpu_var(nmi_touch)) { - __get_cpu_var(nmi_touch) = 0; - touched = 1; - } - - /* We can be called before check_nmi_watchdog, hence NULL check. */ - if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) { - static DEFINE_RAW_SPINLOCK(lock); /* Serialise the printks */ - - raw_spin_lock(&lock); - printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu); - show_regs(regs); - dump_stack(); - raw_spin_unlock(&lock); - cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask)); - - rc = 1; - } - - /* Could check oops_in_progress here too, but it's safer not to */ - if (mce_in_progress()) - touched = 1; - - /* if the none of the timers isn't firing, this cpu isn't doing much */ - if (!touched && __get_cpu_var(last_irq_sum) == sum) { - /* - * Ayiee, looks like this CPU is stuck ... - * wait a few IRQs (5 seconds) before doing the oops ... - */ - __this_cpu_inc(alert_counter); - if (__this_cpu_read(alert_counter) == 5 * nmi_hz) - /* - * die_nmi will return ONLY if NOTIFY_STOP happens.. - */ - die_nmi("BUG: NMI Watchdog detected LOCKUP", - regs, panic_on_timeout); - } else { - __get_cpu_var(last_irq_sum) = sum; - __this_cpu_write(alert_counter, 0); - } - - /* see if the nmi watchdog went off */ - if (!__get_cpu_var(wd_enabled)) - return rc; - switch (nmi_watchdog) { - case NMI_LOCAL_APIC: - rc |= lapic_wd_event(nmi_hz); - break; - case NMI_IO_APIC: - /* - * don't know how to accurately check for this. - * just assume it was a watchdog timer interrupt - * This matches the old behaviour. - */ - rc = 1; - break; - } - return rc; -} - -#ifdef CONFIG_SYSCTL - -static void enable_ioapic_nmi_watchdog_single(void *unused) -{ - __get_cpu_var(wd_enabled) = 1; - atomic_inc(&nmi_active); - __acpi_nmi_enable(NULL); -} - -static void enable_ioapic_nmi_watchdog(void) -{ - on_each_cpu(enable_ioapic_nmi_watchdog_single, NULL, 1); - touch_nmi_watchdog(); -} - -static void disable_ioapic_nmi_watchdog(void) -{ - on_each_cpu(stop_apic_nmi_watchdog, NULL, 1); -} - -static int __init setup_unknown_nmi_panic(char *str) -{ - unknown_nmi_panic = 1; - return 1; -} -__setup("unknown_nmi_panic", setup_unknown_nmi_panic); - -static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu) -{ - unsigned char reason = get_nmi_reason(); - char buf[64]; - - sprintf(buf, "NMI received for unknown reason %02x\n", reason); - die_nmi(buf, regs, 1); /* Always panic here */ - return 0; -} - -/* - * proc handler for /proc/sys/kernel/nmi - */ -int proc_nmi_enabled(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) -{ - int old_state; - - nmi_watchdog_enabled = (atomic_read(&nmi_active) > 0) ? 
1 : 0; - old_state = nmi_watchdog_enabled; - proc_dointvec(table, write, buffer, length, ppos); - if (!!old_state == !!nmi_watchdog_enabled) - return 0; - - if (atomic_read(&nmi_active) < 0 || !nmi_watchdog_active()) { - printk(KERN_WARNING - "NMI watchdog is permanently disabled\n"); - return -EIO; - } - - if (nmi_watchdog == NMI_LOCAL_APIC) { - if (nmi_watchdog_enabled) - enable_lapic_nmi_watchdog(); - else - disable_lapic_nmi_watchdog(); - } else if (nmi_watchdog == NMI_IO_APIC) { - if (nmi_watchdog_enabled) - enable_ioapic_nmi_watchdog(); - else - disable_ioapic_nmi_watchdog(); - } else { - printk(KERN_WARNING - "NMI watchdog doesn't know what hardware to touch\n"); - return -EIO; - } - return 0; -} - -#endif /* CONFIG_SYSCTL */ - -int do_nmi_callback(struct pt_regs *regs, int cpu) -{ -#ifdef CONFIG_SYSCTL - if (unknown_nmi_panic) - return unknown_nmi_panic_callback(regs, cpu); -#endif - return 0; -} - -void arch_trigger_all_cpu_backtrace(void) -{ - int i; - - cpumask_copy(to_cpumask(backtrace_mask), cpu_online_mask); - - printk(KERN_INFO "sending NMI to all CPUs:\n"); - apic->send_IPI_all(NMI_VECTOR); - - /* Wait for up to 10 seconds for all CPUs to do the backtrace */ - for (i = 0; i < 10 * 1000; i++) { - if (cpumask_empty(to_cpumask(backtrace_mask))) - break; - mdelay(1); - } -} diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c index f9e4e6a54073..d8c4a6feb286 100644 --- a/arch/x86/kernel/apic/probe_64.c +++ b/arch/x86/kernel/apic/probe_64.c @@ -79,13 +79,6 @@ void __init default_setup_apic_routing(void) /* need to update phys_pkg_id */ apic->phys_pkg_id = apicid_phys_pkg_id; } - - /* - * Now that apic routing model is selected, configure the - * fault handling for intr remapping. - */ - if (intr_remapping_enabled) - enable_drhd_fault_handling(); } /* Same for both flat and physical. */ diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index f744f54cb248..c1c52c341f40 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -5,7 +5,7 @@ * * SGI UV APIC functions (note: not an Intel compatible APIC) * - * Copyright (C) 2007-2009 Silicon Graphics, Inc. All rights reserved. + * Copyright (C) 2007-2010 Silicon Graphics, Inc. All rights reserved. */ #include <linux/cpumask.h> #include <linux/hardirq.h> @@ -41,8 +41,11 @@ DEFINE_PER_CPU(int, x2apic_extra_bits); static enum uv_system_type uv_system_type; static u64 gru_start_paddr, gru_end_paddr; +static union uvh_apicid uvh_apicid; int uv_min_hub_revision_id; EXPORT_SYMBOL_GPL(uv_min_hub_revision_id); +unsigned int uv_apicid_hibits; +EXPORT_SYMBOL_GPL(uv_apicid_hibits); static DEFINE_SPINLOCK(uv_nmi_lock); static inline bool is_GRU_range(u64 start, u64 end) @@ -70,12 +73,44 @@ static int early_get_nodeid(void) return node_id.s.node_id; } +static void __init early_get_apic_pnode_shift(void) +{ + unsigned long *mmr; + + mmr = early_ioremap(UV_LOCAL_MMR_BASE | UVH_APICID, sizeof(*mmr)); + uvh_apicid.v = *mmr; + early_iounmap(mmr, sizeof(*mmr)); + if (!uvh_apicid.v) + /* + * Old bios, use default value + */ + uvh_apicid.s.pnode_shift = UV_APIC_PNODE_SHIFT; +} + +/* + * Add an extra bit as dictated by bios to the destination apicid of + * interrupts potentially passing through the UV HUB. This prevents + * a deadlock between interrupts and IO port operations. 
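
[Annotation, not part of the patch] The UV hunks below establish an invariant: any APIC destination that may travel through the UV hub must be OR-ed with the BIOS-dictated uv_apicid_hibits, as uv_cpu_mask_to_apicid() and uv_wakeup_secondary() now do. Condensed into a sketch with an illustrative helper name:

/* Sketch: composing a UV interrupt destination. */
static unsigned int example_uv_dest_apicid(int cpu)
{
	return per_cpu(x86_cpu_to_apicid, cpu) | uv_apicid_hibits;
}
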
+ */ +static void __init uv_set_apicid_hibit(void) +{ + union uvh_lb_target_physical_apic_id_mask_u apicid_mask; + unsigned long *mmr; + + mmr = early_ioremap(UV_LOCAL_MMR_BASE | + UVH_LB_TARGET_PHYSICAL_APIC_ID_MASK, sizeof(*mmr)); + apicid_mask.v = *mmr; + early_iounmap(mmr, sizeof(*mmr)); + uv_apicid_hibits = apicid_mask.s.bit_enables & UV_APICID_HIBIT_MASK; +} + static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id) { int nodeid; if (!strcmp(oem_id, "SGI")) { nodeid = early_get_nodeid(); + early_get_apic_pnode_shift(); x86_platform.is_untracked_pat_range = uv_is_untracked_pat_range; x86_platform.nmi_init = uv_nmi_init; if (!strcmp(oem_table_id, "UVL")) @@ -84,8 +119,9 @@ static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id) uv_system_type = UV_X2APIC; else if (!strcmp(oem_table_id, "UVH")) { __get_cpu_var(x2apic_extra_bits) = - nodeid << (UV_APIC_PNODE_SHIFT - 1); + nodeid << (uvh_apicid.s.pnode_shift - 1); uv_system_type = UV_NON_UNIQUE_APIC; + uv_set_apicid_hibit(); return 1; } } @@ -139,6 +175,7 @@ static int __cpuinit uv_wakeup_secondary(int phys_apicid, unsigned long start_ri int pnode; pnode = uv_apicid_to_pnode(phys_apicid); + phys_apicid |= uv_apicid_hibits; val = (1UL << UVH_IPI_INT_SEND_SHFT) | (phys_apicid << UVH_IPI_INT_APIC_ID_SHFT) | ((start_rip << UVH_IPI_INT_VECTOR_SHFT) >> 12) | @@ -220,7 +257,7 @@ static unsigned int uv_cpu_mask_to_apicid(const struct cpumask *cpumask) int cpu = cpumask_first(cpumask); if ((unsigned)cpu < nr_cpu_ids) - return per_cpu(x86_cpu_to_apicid, cpu); + return per_cpu(x86_cpu_to_apicid, cpu) | uv_apicid_hibits; else return BAD_APICID; } @@ -239,7 +276,7 @@ uv_cpu_mask_to_apicid_and(const struct cpumask *cpumask, if (cpumask_test_cpu(cpu, cpu_online_mask)) break; } - return per_cpu(x86_cpu_to_apicid, cpu); + return per_cpu(x86_cpu_to_apicid, cpu) | uv_apicid_hibits; } static unsigned int x2apic_get_apic_id(unsigned long x) @@ -363,14 +400,14 @@ struct redir_addr { #define DEST_SHIFT UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_SHFT static __initdata struct redir_addr redir_addrs[] = { - {UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR, UVH_SI_ALIAS0_OVERLAY_CONFIG}, - {UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR, UVH_SI_ALIAS1_OVERLAY_CONFIG}, - {UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR, UVH_SI_ALIAS2_OVERLAY_CONFIG}, + {UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR, UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR}, + {UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR, UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR}, + {UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR, UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR}, }; static __init void get_lowmem_redirect(unsigned long *base, unsigned long *size) { - union uvh_si_alias0_overlay_config_u alias; + union uvh_rh_gam_alias210_overlay_config_2_mmr_u alias; union uvh_rh_gam_alias210_redirect_config_2_mmr_u redirect; int i; @@ -644,7 +681,7 @@ void uv_nmi_init(void) void __init uv_system_init(void) { - union uvh_si_addr_map_config_u m_n_config; + union uvh_rh_gam_config_mmr_u m_n_config; union uvh_node_id_u node_id; unsigned long gnode_upper, lowmem_redir_base, lowmem_redir_size; int bytes, nid, cpu, lcpu, pnode, blade, i, j, m_val, n_val; @@ -654,7 +691,7 @@ void __init uv_system_init(void) map_low_mmrs(); - m_n_config.v = uv_read_local_mmr(UVH_SI_ADDR_MAP_CONFIG); + m_n_config.v = uv_read_local_mmr(UVH_RH_GAM_CONFIG_MMR ); m_val = m_n_config.s.m_skt; n_val = m_n_config.s.n_skt; mmr_base = @@ -716,6 +753,10 @@ void __init uv_system_init(void) int apicid = per_cpu(x86_cpu_to_apicid, cpu); nid = 
cpu_to_node(cpu); + /* + * apic_pnode_shift must be set before calling uv_apicid_to_pnode(); + */ + uv_cpu_hub_info(cpu)->apic_pnode_shift = uvh_apicid.s.pnode_shift; pnode = uv_apicid_to_pnode(apicid); blade = boot_pnode_to_blade(pnode); lcpu = uv_blade_info[blade].nr_possible_cpus; diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 4b68bda30938..1d59834396bd 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -894,7 +894,6 @@ void __init identify_boot_cpu(void) #else vgetcpu_set_mode(); #endif - init_hw_perf_events(); } void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c index cd8da247dda1..a2baafb2fe6d 100644 --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c @@ -701,6 +701,7 @@ static int acpi_cpufreq_cpu_exit(struct cpufreq_policy *policy) per_cpu(acfreq_data, policy->cpu) = NULL; acpi_processor_unregister_performance(data->acpi_data, policy->cpu); + kfree(data->freq_table); kfree(data); } diff --git a/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c b/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c index 733093d60436..141abebc4516 100644 --- a/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c +++ b/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c @@ -393,7 +393,7 @@ static struct cpufreq_driver nforce2_driver = { * Detects nForce2 A2 and C1 stepping * */ -static unsigned int nforce2_detect_chipset(void) +static int nforce2_detect_chipset(void) { nforce2_dev = pci_get_subsys(PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_NFORCE2, diff --git a/arch/x86/kernel/cpu/cpufreq/longrun.c b/arch/x86/kernel/cpu/cpufreq/longrun.c index fc09f142d94d..d9f51367666b 100644 --- a/arch/x86/kernel/cpu/cpufreq/longrun.c +++ b/arch/x86/kernel/cpu/cpufreq/longrun.c @@ -35,7 +35,7 @@ static unsigned int longrun_low_freq, longrun_high_freq; * Reads the current LongRun policy by access to MSR_TMTA_LONGRUN_FLAGS * and MSR_TMTA_LONGRUN_CTRL */ -static void __init longrun_get_policy(struct cpufreq_policy *policy) +static void __cpuinit longrun_get_policy(struct cpufreq_policy *policy) { u32 msr_lo, msr_hi; @@ -165,7 +165,7 @@ static unsigned int longrun_get(unsigned int cpu) * TMTA rules: * performance_pctg = (target_freq - low_freq)/(high_freq - low_freq) */ -static unsigned int __cpuinit longrun_determine_freqs(unsigned int *low_freq, +static int __cpuinit longrun_determine_freqs(unsigned int *low_freq, unsigned int *high_freq) { u32 msr_lo, msr_hi; diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 12cd823c8d03..9ecf81f9b90f 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -149,8 +149,7 @@ union _cpuid4_leaf_ecx { }; struct amd_l3_cache { - struct pci_dev *dev; - bool can_disable; + struct amd_northbridge *nb; unsigned indices; u8 subcaches[4]; }; @@ -311,14 +310,12 @@ struct _cache_attr { /* * L3 cache descriptors */ -static struct amd_l3_cache **__cpuinitdata l3_caches; - static void __cpuinit amd_calc_l3_indices(struct amd_l3_cache *l3) { unsigned int sc0, sc1, sc2, sc3; u32 val = 0; - pci_read_config_dword(l3->dev, 0x1C4, &val); + pci_read_config_dword(l3->nb->misc, 0x1C4, &val); /* calculate subcache sizes */ l3->subcaches[0] = sc0 = !(val & BIT(0)); @@ -327,49 +324,17 @@ static void __cpuinit amd_calc_l3_indices(struct amd_l3_cache *l3) l3->subcaches[3] = sc3 = !(val & BIT(12)) + !(val & BIT(13)); l3->indices = 
(max(max(max(sc0, sc1), sc2), sc3) << 10) - 1; + l3->indices = (max(max3(sc0, sc1, sc2), sc3) << 10) - 1; } -static struct amd_l3_cache * __cpuinit amd_init_l3_cache(int node) -{ - struct amd_l3_cache *l3; - struct pci_dev *dev = node_to_k8_nb_misc(node); - - l3 = kzalloc(sizeof(struct amd_l3_cache), GFP_ATOMIC); - if (!l3) { - printk(KERN_WARNING "Error allocating L3 struct\n"); - return NULL; - } - - l3->dev = dev; - - amd_calc_l3_indices(l3); - - return l3; -} - -static void __cpuinit amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf, - int index) +static void __cpuinit amd_init_l3_cache(struct _cpuid4_info_regs *this_leaf, + int index) { + static struct amd_l3_cache *__cpuinitdata l3_caches; int node; - if (boot_cpu_data.x86 != 0x10) - return; - - if (index < 3) - return; - - /* see errata #382 and #388 */ - if (boot_cpu_data.x86_model < 0x8) - return; - - if ((boot_cpu_data.x86_model == 0x8 || - boot_cpu_data.x86_model == 0x9) - && - boot_cpu_data.x86_mask < 0x1) - return; - - /* not in virtualized environments */ - if (k8_northbridges.num == 0) + /* only for L3, and not in virtualized environments */ + if (index < 3 || amd_nb_num() == 0) return; /* @@ -377,7 +342,7 @@ static void __cpuinit amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf, * never freed but this is done only on shutdown so it doesn't matter. */ if (!l3_caches) { - int size = k8_northbridges.num * sizeof(struct amd_l3_cache *); + int size = amd_nb_num() * sizeof(struct amd_l3_cache); l3_caches = kzalloc(size, GFP_ATOMIC); if (!l3_caches) @@ -386,14 +351,12 @@ static void __cpuinit amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf, node = amd_get_nb_id(smp_processor_id()); - if (!l3_caches[node]) { - l3_caches[node] = amd_init_l3_cache(node); - l3_caches[node]->can_disable = true; + if (!l3_caches[node].nb) { + l3_caches[node].nb = node_to_amd_nb(node); + amd_calc_l3_indices(&l3_caches[node]); } - WARN_ON(!l3_caches[node]); - - this_leaf->l3 = l3_caches[node]; + this_leaf->l3 = &l3_caches[node]; } /* @@ -407,7 +370,7 @@ int amd_get_l3_disable_slot(struct amd_l3_cache *l3, unsigned slot) { unsigned int reg = 0; - pci_read_config_dword(l3->dev, 0x1BC + slot * 4, ®); + pci_read_config_dword(l3->nb->misc, 0x1BC + slot * 4, ®); /* check whether this slot is activated already */ if (reg & (3UL << 30)) @@ -421,7 +384,8 @@ static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf, { int index; - if (!this_leaf->l3 || !this_leaf->l3->can_disable) + if (!this_leaf->l3 || + !amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) return -EINVAL; index = amd_get_l3_disable_slot(this_leaf->l3, slot); @@ -456,7 +420,7 @@ static void amd_l3_disable_index(struct amd_l3_cache *l3, int cpu, if (!l3->subcaches[i]) continue; - pci_write_config_dword(l3->dev, 0x1BC + slot * 4, reg); + pci_write_config_dword(l3->nb->misc, 0x1BC + slot * 4, reg); /* * We need to WBINVD on a core on the node containing the L3 @@ -466,7 +430,7 @@ static void amd_l3_disable_index(struct amd_l3_cache *l3, int cpu, wbinvd_on_cpu(cpu); reg |= BIT(31); - pci_write_config_dword(l3->dev, 0x1BC + slot * 4, reg); + pci_write_config_dword(l3->nb->misc, 0x1BC + slot * 4, reg); } } @@ -523,7 +487,8 @@ static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf, if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (!this_leaf->l3 || !this_leaf->l3->can_disable) + if (!this_leaf->l3 || + !amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) return -EINVAL; cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map)); @@ -544,7 +509,7 @@ static ssize_t 
store_cache_disable(struct _cpuid4_info *this_leaf, #define STORE_CACHE_DISABLE(slot) \ static ssize_t \ store_cache_disable_##slot(struct _cpuid4_info *this_leaf, \ - const char *buf, size_t count) \ + const char *buf, size_t count) \ { \ return store_cache_disable(this_leaf, buf, count, slot); \ } @@ -557,10 +522,7 @@ static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644, show_cache_disable_1, store_cache_disable_1); #else /* CONFIG_AMD_NB */ -static void __cpuinit -amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf, int index) -{ -}; +#define amd_init_l3_cache(x, y) #endif /* CONFIG_AMD_NB */ static int @@ -574,7 +536,7 @@ __cpuinit cpuid4_cache_lookup_regs(int index, if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { amd_cpuid4(index, &eax, &ebx, &ecx); - amd_check_l3_disable(this_leaf, index); + amd_init_l3_cache(this_leaf, index); } else { cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx); } @@ -982,30 +944,48 @@ define_one_ro(size); define_one_ro(shared_cpu_map); define_one_ro(shared_cpu_list); -#define DEFAULT_SYSFS_CACHE_ATTRS \ - &type.attr, \ - &level.attr, \ - &coherency_line_size.attr, \ - &physical_line_partition.attr, \ - &ways_of_associativity.attr, \ - &number_of_sets.attr, \ - &size.attr, \ - &shared_cpu_map.attr, \ - &shared_cpu_list.attr - static struct attribute *default_attrs[] = { - DEFAULT_SYSFS_CACHE_ATTRS, + &type.attr, + &level.attr, + &coherency_line_size.attr, + &physical_line_partition.attr, + &ways_of_associativity.attr, + &number_of_sets.attr, + &size.attr, + &shared_cpu_map.attr, + &shared_cpu_list.attr, NULL }; -static struct attribute *default_l3_attrs[] = { - DEFAULT_SYSFS_CACHE_ATTRS, #ifdef CONFIG_AMD_NB - &cache_disable_0.attr, - &cache_disable_1.attr, +static struct attribute ** __cpuinit amd_l3_attrs(void) +{ + static struct attribute **attrs; + int n; + + if (attrs) + return attrs; + + n = sizeof (default_attrs) / sizeof (struct attribute *); + + if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) + n += 2; + + attrs = kzalloc(n * sizeof (struct attribute *), GFP_KERNEL); + if (attrs == NULL) + return attrs = default_attrs; + + for (n = 0; default_attrs[n]; n++) + attrs[n] = default_attrs[n]; + + if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) { + attrs[n++] = &cache_disable_0.attr; + attrs[n++] = &cache_disable_1.attr; + } + + return attrs; +} #endif - NULL -}; static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf) { @@ -1116,11 +1096,11 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev) this_leaf = CPUID4_INFO_IDX(cpu, i); - if (this_leaf->l3 && this_leaf->l3->can_disable) - ktype_cache.default_attrs = default_l3_attrs; - else - ktype_cache.default_attrs = default_attrs; - + ktype_cache.default_attrs = default_attrs; +#ifdef CONFIG_AMD_NB + if (this_leaf->l3) + ktype_cache.default_attrs = amd_l3_attrs(); +#endif retval = kobject_init_and_add(&(this_object->kobj), &ktype_cache, per_cpu(ici_cache_kobject, cpu), diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index fe73c1844a9a..0a360d146596 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -49,7 +49,6 @@ static unsigned long copy_from_user_nmi(void *to, const void __user *from, unsigned long n) { unsigned long offset, addr = (unsigned long)from; - int type = in_nmi() ? 
KM_NMI : KM_IRQ0; unsigned long size, len = 0; struct page *page; void *map; @@ -63,9 +62,9 @@ copy_from_user_nmi(void *to, const void __user *from, unsigned long n) offset = addr & (PAGE_SIZE - 1); size = min(PAGE_SIZE - offset, n - len); - map = kmap_atomic(page, type); + map = kmap_atomic(page); memcpy(to, map+offset, size); - kunmap_atomic(map, type); + kunmap_atomic(map); put_page(page); len += size; @@ -238,6 +237,7 @@ struct x86_pmu { * Intel DebugStore bits */ int bts, pebs; + int bts_active, pebs_active; int pebs_record_size; void (*drain_pebs)(struct pt_regs *regs); struct event_constraint *pebs_constraints; @@ -330,9 +330,6 @@ static bool reserve_pmc_hardware(void) { int i; - if (nmi_watchdog == NMI_LOCAL_APIC) - disable_lapic_nmi_watchdog(); - for (i = 0; i < x86_pmu.num_counters; i++) { if (!reserve_perfctr_nmi(x86_pmu.perfctr + i)) goto perfctr_fail; @@ -355,9 +352,6 @@ perfctr_fail: for (i--; i >= 0; i--) release_perfctr_nmi(x86_pmu.perfctr + i); - if (nmi_watchdog == NMI_LOCAL_APIC) - enable_lapic_nmi_watchdog(); - return false; } @@ -369,9 +363,6 @@ static void release_pmc_hardware(void) release_perfctr_nmi(x86_pmu.perfctr + i); release_evntsel_nmi(x86_pmu.eventsel + i); } - - if (nmi_watchdog == NMI_LOCAL_APIC) - enable_lapic_nmi_watchdog(); } #else @@ -381,7 +372,59 @@ static void release_pmc_hardware(void) {} #endif -static int reserve_ds_buffers(void); +static bool check_hw_exists(void) +{ + u64 val, val_new = 0; + int i, reg, ret = 0; + + /* + * Check to see if the BIOS enabled any of the counters, if so + * complain and bail. + */ + for (i = 0; i < x86_pmu.num_counters; i++) { + reg = x86_pmu.eventsel + i; + ret = rdmsrl_safe(reg, &val); + if (ret) + goto msr_fail; + if (val & ARCH_PERFMON_EVENTSEL_ENABLE) + goto bios_fail; + } + + if (x86_pmu.num_counters_fixed) { + reg = MSR_ARCH_PERFMON_FIXED_CTR_CTRL; + ret = rdmsrl_safe(reg, &val); + if (ret) + goto msr_fail; + for (i = 0; i < x86_pmu.num_counters_fixed; i++) { + if (val & (0x03 << i*4)) + goto bios_fail; + } + } + + /* + * Now write a value and read it back to see if it matches, + * this is needed to detect certain hardware emulators (qemu/kvm) + * that don't trap on the MSR access and always return 0s. + */ + val = 0xabcdUL; + ret = checking_wrmsrl(x86_pmu.perfctr, val); + ret |= rdmsrl_safe(x86_pmu.perfctr, &val_new); + if (ret || val != val_new) + goto msr_fail; + + return true; + +bios_fail: + printk(KERN_CONT "Broken BIOS detected, using software events only.\n"); + printk(KERN_ERR FW_BUG "the BIOS has corrupted hw-PMU resources (MSR %x is %Lx)\n", reg, val); + return false; + +msr_fail: + printk(KERN_CONT "Broken PMU hardware detected, using software events only.\n"); + return false; +} + +static void reserve_ds_buffers(void); static void release_ds_buffers(void); static void hw_perf_event_destroy(struct perf_event *event) @@ -437,7 +480,7 @@ static int x86_setup_perfctr(struct perf_event *event) struct hw_perf_event *hwc = &event->hw; u64 config; - if (!hwc->sample_period) { + if (!is_sampling_event(event)) { hwc->sample_period = x86_pmu.max_period; hwc->last_period = hwc->sample_period; local64_set(&hwc->period_left, hwc->sample_period); @@ -478,7 +521,7 @@ static int x86_setup_perfctr(struct perf_event *event) if ((attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS) && (hwc->sample_period == 1)) { /* BTS is not supported by this architecture. */ - if (!x86_pmu.bts) + if (!x86_pmu.bts_active) return -EOPNOTSUPP; /* BTS is currently only allowed for user-mode. 
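 * (Aside: the test just above keys off bts_active rather than the bare
 * bts capability bit: the CPU may support BTS while reserve_ds_buffers()
 * failed to allocate its buffer, in which case bts_active stays 0 and
 * the event has to be rejected.)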
*/ @@ -497,12 +540,13 @@ static int x86_pmu_hw_config(struct perf_event *event) int precise = 0; /* Support for constant skid */ - if (x86_pmu.pebs) + if (x86_pmu.pebs_active) { precise++; - /* Support for IP fixup */ - if (x86_pmu.lbr_nr) - precise++; + /* Support for IP fixup */ + if (x86_pmu.lbr_nr) + precise++; + } if (event->attr.precise_ip > precise) return -EOPNOTSUPP; @@ -544,11 +588,8 @@ static int __x86_pmu_event_init(struct perf_event *event) if (atomic_read(&active_events) == 0) { if (!reserve_pmc_hardware()) err = -EBUSY; - else { - err = reserve_ds_buffers(); - if (err) - release_pmc_hardware(); - } + else + reserve_ds_buffers(); } if (!err) atomic_inc(&active_events); @@ -1350,7 +1391,7 @@ static void __init pmu_check_apic(void) pr_info("no hardware sampling interrupt available.\n"); } -void __init init_hw_perf_events(void) +int __init init_hw_perf_events(void) { struct event_constraint *c; int err; @@ -1365,15 +1406,19 @@ void __init init_hw_perf_events(void) err = amd_pmu_init(); break; default: - return; + return 0; } if (err != 0) { pr_cont("no PMU driver, software events only.\n"); - return; + return 0; } pmu_check_apic(); + /* sanity check that the hardware exists or is emulated */ + if (!check_hw_exists()) + return 0; + pr_cont("%s PMU driver.\n", x86_pmu.name); if (x86_pmu.quirks) @@ -1420,9 +1465,12 @@ void __init init_hw_perf_events(void) pr_info("... fixed-purpose events: %d\n", x86_pmu.num_counters_fixed); pr_info("... event mask: %016Lx\n", x86_pmu.intel_ctrl); - perf_pmu_register(&pmu); + perf_pmu_register(&pmu, "cpu", PERF_TYPE_RAW); perf_cpu_notifier(x86_pmu_notifier); + + return 0; } +early_initcall(init_hw_perf_events); static inline void x86_pmu_read(struct perf_event *event) { @@ -1668,7 +1716,7 @@ perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs) perf_callchain_store(entry, regs->ip); - dump_trace(NULL, regs, NULL, regs->bp, &backtrace_ops, entry); + dump_trace(NULL, regs, NULL, &backtrace_ops, entry); } #ifdef CONFIG_COMPAT diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c index 46d58448c3af..67e2202a6039 100644 --- a/arch/x86/kernel/cpu/perf_event_amd.c +++ b/arch/x86/kernel/cpu/perf_event_amd.c @@ -1,7 +1,5 @@ #ifdef CONFIG_CPU_SUP_AMD -static DEFINE_RAW_SPINLOCK(amd_nb_lock); - static __initconst const u64 amd_hw_cache_event_ids [PERF_COUNT_HW_CACHE_MAX] [PERF_COUNT_HW_CACHE_OP_MAX] @@ -275,17 +273,17 @@ done: return &emptyconstraint; } -static struct amd_nb *amd_alloc_nb(int cpu, int nb_id) +static struct amd_nb *amd_alloc_nb(int cpu) { struct amd_nb *nb; int i; - nb = kmalloc(sizeof(struct amd_nb), GFP_KERNEL); + nb = kmalloc_node(sizeof(struct amd_nb), GFP_KERNEL | __GFP_ZERO, + cpu_to_node(cpu)); if (!nb) return NULL; - memset(nb, 0, sizeof(*nb)); - nb->nb_id = nb_id; + nb->nb_id = -1; /* * initialize all possible NB constraints @@ -306,7 +304,7 @@ static int amd_pmu_cpu_prepare(int cpu) if (boot_cpu_data.x86_max_cores < 2) return NOTIFY_OK; - cpuc->amd_nb = amd_alloc_nb(cpu, -1); + cpuc->amd_nb = amd_alloc_nb(cpu); if (!cpuc->amd_nb) return NOTIFY_BAD; @@ -325,8 +323,6 @@ static void amd_pmu_cpu_starting(int cpu) nb_id = amd_get_nb_id(cpu); WARN_ON_ONCE(nb_id == BAD_APICID); - raw_spin_lock(&amd_nb_lock); - for_each_online_cpu(i) { nb = per_cpu(cpu_hw_events, i).amd_nb; if (WARN_ON_ONCE(!nb)) @@ -341,8 +337,6 @@ static void amd_pmu_cpu_starting(int cpu) cpuc->amd_nb->nb_id = nb_id; cpuc->amd_nb->refcnt++; - - raw_spin_unlock(&amd_nb_lock); } static void amd_pmu_cpu_dead(int 
cpu) @@ -354,8 +348,6 @@ static void amd_pmu_cpu_dead(int cpu) cpuhw = &per_cpu(cpu_hw_events, cpu); - raw_spin_lock(&amd_nb_lock); - if (cpuhw->amd_nb) { struct amd_nb *nb = cpuhw->amd_nb; @@ -364,8 +356,6 @@ static void amd_pmu_cpu_dead(int cpu) cpuhw->amd_nb = NULL; } - - raw_spin_unlock(&amd_nb_lock); } static __initconst const struct x86_pmu amd_pmu = { diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index c8f5c088cad1..24e390e40f2e 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -816,6 +816,32 @@ static int intel_pmu_hw_config(struct perf_event *event) if (ret) return ret; + if (event->attr.precise_ip && + (event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) { + /* + * Use an alternative encoding for CPU_CLK_UNHALTED.THREAD_P + * (0x003c) so that we can use it with PEBS. + * + * The regular CPU_CLK_UNHALTED.THREAD_P event (0x003c) isn't + * PEBS capable. However we can use INST_RETIRED.ANY_P + * (0x00c0), which is a PEBS capable event, to get the same + * count. + * + * INST_RETIRED.ANY_P counts the number of cycles that retires + * CNTMASK instructions. By setting CNTMASK to a value (16) + * larger than the maximum number of instructions that can be + * retired per cycle (4) and then inverting the condition, we + * count all cycles that retire 16 or less instructions, which + * is every cycle. + * + * Thereby we gain a PEBS capable cycle counter. + */ + u64 alt_config = 0x108000c0; /* INST_RETIRED.TOTAL_CYCLES */ + + alt_config |= (event->hw.config & ~X86_RAW_EVENT_MASK); + event->hw.config = alt_config; + } + if (event->attr.type != PERF_TYPE_RAW) return 0; diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index 4977f9c400e5..b7dcd9f2b8a0 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -74,6 +74,107 @@ static void fini_debug_store_on_cpu(int cpu) wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0); } +static int alloc_pebs_buffer(int cpu) +{ + struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; + int node = cpu_to_node(cpu); + int max, thresh = 1; /* always use a single PEBS record */ + void *buffer; + + if (!x86_pmu.pebs) + return 0; + + buffer = kmalloc_node(PEBS_BUFFER_SIZE, GFP_KERNEL | __GFP_ZERO, node); + if (unlikely(!buffer)) + return -ENOMEM; + + max = PEBS_BUFFER_SIZE / x86_pmu.pebs_record_size; + + ds->pebs_buffer_base = (u64)(unsigned long)buffer; + ds->pebs_index = ds->pebs_buffer_base; + ds->pebs_absolute_maximum = ds->pebs_buffer_base + + max * x86_pmu.pebs_record_size; + + ds->pebs_interrupt_threshold = ds->pebs_buffer_base + + thresh * x86_pmu.pebs_record_size; + + return 0; +} + +static void release_pebs_buffer(int cpu) +{ + struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; + + if (!ds || !x86_pmu.pebs) + return; + + kfree((void *)(unsigned long)ds->pebs_buffer_base); + ds->pebs_buffer_base = 0; +} + +static int alloc_bts_buffer(int cpu) +{ + struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; + int node = cpu_to_node(cpu); + int max, thresh; + void *buffer; + + if (!x86_pmu.bts) + return 0; + + buffer = kmalloc_node(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_ZERO, node); + if (unlikely(!buffer)) + return -ENOMEM; + + max = BTS_BUFFER_SIZE / BTS_RECORD_SIZE; + thresh = max / 16; + + ds->bts_buffer_base = (u64)(unsigned long)buffer; + ds->bts_index = ds->bts_buffer_base; + ds->bts_absolute_maximum = ds->bts_buffer_base + + max * BTS_RECORD_SIZE; + 
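/*
 * (Aside on the sizing here: max is the number of BTS_RECORD_SIZE
 * records that fit in the buffer and thresh = max / 16, so the
 * interrupt threshold set next sits max/16 records below the top of
 * the buffer, i.e. the BTS interrupt fires once the buffer is roughly
 * 15/16 full.)
 */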
ds->bts_interrupt_threshold = ds->bts_absolute_maximum - + thresh * BTS_RECORD_SIZE; + + return 0; +} + +static void release_bts_buffer(int cpu) +{ + struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; + + if (!ds || !x86_pmu.bts) + return; + + kfree((void *)(unsigned long)ds->bts_buffer_base); + ds->bts_buffer_base = 0; +} + +static int alloc_ds_buffer(int cpu) +{ + int node = cpu_to_node(cpu); + struct debug_store *ds; + + ds = kmalloc_node(sizeof(*ds), GFP_KERNEL | __GFP_ZERO, node); + if (unlikely(!ds)) + return -ENOMEM; + + per_cpu(cpu_hw_events, cpu).ds = ds; + + return 0; +} + +static void release_ds_buffer(int cpu) +{ + struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; + + if (!ds) + return; + + per_cpu(cpu_hw_events, cpu).ds = NULL; + kfree(ds); +} + static void release_ds_buffers(void) { int cpu; @@ -82,93 +183,77 @@ static void release_ds_buffers(void) return; get_online_cpus(); - for_each_online_cpu(cpu) fini_debug_store_on_cpu(cpu); for_each_possible_cpu(cpu) { - struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; - - if (!ds) - continue; - - per_cpu(cpu_hw_events, cpu).ds = NULL; - - kfree((void *)(unsigned long)ds->pebs_buffer_base); - kfree((void *)(unsigned long)ds->bts_buffer_base); - kfree(ds); + release_pebs_buffer(cpu); + release_bts_buffer(cpu); + release_ds_buffer(cpu); } - put_online_cpus(); } -static int reserve_ds_buffers(void) +static void reserve_ds_buffers(void) { - int cpu, err = 0; + int bts_err = 0, pebs_err = 0; + int cpu; + + x86_pmu.bts_active = 0; + x86_pmu.pebs_active = 0; if (!x86_pmu.bts && !x86_pmu.pebs) - return 0; + return; + + if (!x86_pmu.bts) + bts_err = 1; + + if (!x86_pmu.pebs) + pebs_err = 1; get_online_cpus(); for_each_possible_cpu(cpu) { - struct debug_store *ds; - void *buffer; - int max, thresh; + if (alloc_ds_buffer(cpu)) { + bts_err = 1; + pebs_err = 1; + } + + if (!bts_err && alloc_bts_buffer(cpu)) + bts_err = 1; - err = -ENOMEM; - ds = kzalloc(sizeof(*ds), GFP_KERNEL); - if (unlikely(!ds)) + if (!pebs_err && alloc_pebs_buffer(cpu)) + pebs_err = 1; + + if (bts_err && pebs_err) break; - per_cpu(cpu_hw_events, cpu).ds = ds; - - if (x86_pmu.bts) { - buffer = kzalloc(BTS_BUFFER_SIZE, GFP_KERNEL); - if (unlikely(!buffer)) - break; - - max = BTS_BUFFER_SIZE / BTS_RECORD_SIZE; - thresh = max / 16; - - ds->bts_buffer_base = (u64)(unsigned long)buffer; - ds->bts_index = ds->bts_buffer_base; - ds->bts_absolute_maximum = ds->bts_buffer_base + - max * BTS_RECORD_SIZE; - ds->bts_interrupt_threshold = ds->bts_absolute_maximum - - thresh * BTS_RECORD_SIZE; - } + } - if (x86_pmu.pebs) { - buffer = kzalloc(PEBS_BUFFER_SIZE, GFP_KERNEL); - if (unlikely(!buffer)) - break; - - max = PEBS_BUFFER_SIZE / x86_pmu.pebs_record_size; - - ds->pebs_buffer_base = (u64)(unsigned long)buffer; - ds->pebs_index = ds->pebs_buffer_base; - ds->pebs_absolute_maximum = ds->pebs_buffer_base + - max * x86_pmu.pebs_record_size; - /* - * Always use single record PEBS - */ - ds->pebs_interrupt_threshold = ds->pebs_buffer_base + - x86_pmu.pebs_record_size; - } + if (bts_err) { + for_each_possible_cpu(cpu) + release_bts_buffer(cpu); + } - err = 0; + if (pebs_err) { + for_each_possible_cpu(cpu) + release_pebs_buffer(cpu); } - if (err) - release_ds_buffers(); - else { + if (bts_err && pebs_err) { + for_each_possible_cpu(cpu) + release_ds_buffer(cpu); + } else { + if (x86_pmu.bts && !bts_err) + x86_pmu.bts_active = 1; + + if (x86_pmu.pebs && !pebs_err) + x86_pmu.pebs_active = 1; + for_each_online_cpu(cpu) init_debug_store_on_cpu(cpu); } put_online_cpus(); - - return 
err; } /* @@ -233,7 +318,7 @@ static int intel_pmu_drain_bts_buffer(void) if (!event) return 0; - if (!ds) + if (!x86_pmu.bts_active) return 0; at = (struct bts_record *)(unsigned long)ds->bts_buffer_base; @@ -503,7 +588,7 @@ static void intel_pmu_drain_pebs_core(struct pt_regs *iregs) struct pebs_record_core *at, *top; int n; - if (!ds || !x86_pmu.pebs) + if (!x86_pmu.pebs_active) return; at = (struct pebs_record_core *)(unsigned long)ds->pebs_buffer_base; @@ -545,7 +630,7 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs) u64 status = 0; int bit, n; - if (!ds || !x86_pmu.pebs) + if (!x86_pmu.pebs_active) return; at = (struct pebs_record_nhm *)(unsigned long)ds->pebs_buffer_base; @@ -630,9 +715,8 @@ static void intel_ds_init(void) #else /* CONFIG_CPU_SUP_INTEL */ -static int reserve_ds_buffers(void) +static void reserve_ds_buffers(void) { - return 0; } static void release_ds_buffers(void) diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c index d9f4ff8fcd69..d5a236615501 100644 --- a/arch/x86/kernel/cpu/perfctr-watchdog.c +++ b/arch/x86/kernel/cpu/perfctr-watchdog.c @@ -16,32 +16,12 @@ #include <linux/kernel.h> #include <linux/bitops.h> #include <linux/smp.h> -#include <linux/nmi.h> +#include <asm/nmi.h> #include <linux/kprobes.h> #include <asm/apic.h> #include <asm/perf_event.h> -struct nmi_watchdog_ctlblk { - unsigned int cccr_msr; - unsigned int perfctr_msr; /* the MSR to reset in NMI handler */ - unsigned int evntsel_msr; /* the MSR to select the events to handle */ -}; - -/* Interface defining a CPU specific perfctr watchdog */ -struct wd_ops { - int (*reserve)(void); - void (*unreserve)(void); - int (*setup)(unsigned nmi_hz); - void (*rearm)(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz); - void (*stop)(void); - unsigned perfctr; - unsigned evntsel; - u64 checkbit; -}; - -static const struct wd_ops *wd_ops; - /* * this number is calculated from Intel's MSR_P4_CRU_ESCR5 register and it's * offset from MSR_P4_BSU_ESCR0. @@ -60,8 +40,6 @@ static const struct wd_ops *wd_ops; static DECLARE_BITMAP(perfctr_nmi_owner, NMI_MAX_COUNTER_BITS); static DECLARE_BITMAP(evntsel_nmi_owner, NMI_MAX_COUNTER_BITS); -static DEFINE_PER_CPU(struct nmi_watchdog_ctlblk, nmi_watchdog_ctlblk); - /* converts an msr to an appropriate reservation bit */ static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr) { @@ -172,623 +150,3 @@ void release_evntsel_nmi(unsigned int msr) clear_bit(counter, evntsel_nmi_owner); } EXPORT_SYMBOL(release_evntsel_nmi); - -void disable_lapic_nmi_watchdog(void) -{ - BUG_ON(nmi_watchdog != NMI_LOCAL_APIC); - - if (atomic_read(&nmi_active) <= 0) - return; - - on_each_cpu(stop_apic_nmi_watchdog, NULL, 1); - - if (wd_ops) - wd_ops->unreserve(); - - BUG_ON(atomic_read(&nmi_active) != 0); -} - -void enable_lapic_nmi_watchdog(void) -{ - BUG_ON(nmi_watchdog != NMI_LOCAL_APIC); - - /* are we already enabled */ - if (atomic_read(&nmi_active) != 0) - return; - - /* are we lapic aware */ - if (!wd_ops) - return; - if (!wd_ops->reserve()) { - printk(KERN_ERR "NMI watchdog: cannot reserve perfctrs\n"); - return; - } - - on_each_cpu(setup_apic_nmi_watchdog, NULL, 1); - touch_nmi_watchdog(); -} - -/* - * Activate the NMI watchdog via the local APIC. - */ - -static unsigned int adjust_for_32bit_ctr(unsigned int hz) -{ - u64 counter_val; - unsigned int retval = hz; - - /* - * On Intel CPUs with P6/ARCH_PERFMON only 32 bits in the counter - * are writable, with higher bits sign extending from bit 31. 
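 * (Worked example, assuming a 3 GHz CPU: cpu_khz * 1000 = 3e9 exceeds
 * 0x7fffffff, so an nmi_hz of 1 is raised to 3e9 / 0x7fffffff + 1 = 2
 * and the counter is then programmed with -(3e9 / 2), which fits in
 * the writable 31 bits.)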
- * So, we can only program the counter with 31 bit values and - * 32nd bit should be 1, for 33.. to be 1. - * Find the appropriate nmi_hz - */ - counter_val = (u64)cpu_khz * 1000; - do_div(counter_val, retval); - if (counter_val > 0x7fffffffULL) { - u64 count = (u64)cpu_khz * 1000; - do_div(count, 0x7fffffffUL); - retval = count + 1; - } - return retval; -} - -static void write_watchdog_counter(unsigned int perfctr_msr, - const char *descr, unsigned nmi_hz) -{ - u64 count = (u64)cpu_khz * 1000; - - do_div(count, nmi_hz); - if (descr) - pr_debug("setting %s to -0x%08Lx\n", descr, count); - wrmsrl(perfctr_msr, 0 - count); -} - -static void write_watchdog_counter32(unsigned int perfctr_msr, - const char *descr, unsigned nmi_hz) -{ - u64 count = (u64)cpu_khz * 1000; - - do_div(count, nmi_hz); - if (descr) - pr_debug("setting %s to -0x%08Lx\n", descr, count); - wrmsr(perfctr_msr, (u32)(-count), 0); -} - -/* - * AMD K7/K8/Family10h/Family11h support. - * AMD keeps this interface nicely stable so there is not much variety - */ -#define K7_EVNTSEL_ENABLE (1 << 22) -#define K7_EVNTSEL_INT (1 << 20) -#define K7_EVNTSEL_OS (1 << 17) -#define K7_EVNTSEL_USR (1 << 16) -#define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING 0x76 -#define K7_NMI_EVENT K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING - -static int setup_k7_watchdog(unsigned nmi_hz) -{ - unsigned int perfctr_msr, evntsel_msr; - unsigned int evntsel; - struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); - - perfctr_msr = wd_ops->perfctr; - evntsel_msr = wd_ops->evntsel; - - wrmsrl(perfctr_msr, 0UL); - - evntsel = K7_EVNTSEL_INT - | K7_EVNTSEL_OS - | K7_EVNTSEL_USR - | K7_NMI_EVENT; - - /* setup the timer */ - wrmsr(evntsel_msr, evntsel, 0); - write_watchdog_counter(perfctr_msr, "K7_PERFCTR0", nmi_hz); - - /* initialize the wd struct before enabling */ - wd->perfctr_msr = perfctr_msr; - wd->evntsel_msr = evntsel_msr; - wd->cccr_msr = 0; /* unused */ - - /* ok, everything is initialized, announce that we're set */ - cpu_nmi_set_wd_enabled(); - - apic_write(APIC_LVTPC, APIC_DM_NMI); - evntsel |= K7_EVNTSEL_ENABLE; - wrmsr(evntsel_msr, evntsel, 0); - - return 1; -} - -static void single_msr_stop_watchdog(void) -{ - struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); - - wrmsr(wd->evntsel_msr, 0, 0); -} - -static int single_msr_reserve(void) -{ - if (!reserve_perfctr_nmi(wd_ops->perfctr)) - return 0; - - if (!reserve_evntsel_nmi(wd_ops->evntsel)) { - release_perfctr_nmi(wd_ops->perfctr); - return 0; - } - return 1; -} - -static void single_msr_unreserve(void) -{ - release_evntsel_nmi(wd_ops->evntsel); - release_perfctr_nmi(wd_ops->perfctr); -} - -static void __kprobes -single_msr_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz) -{ - /* start the cycle over again */ - write_watchdog_counter(wd->perfctr_msr, NULL, nmi_hz); -} - -static const struct wd_ops k7_wd_ops = { - .reserve = single_msr_reserve, - .unreserve = single_msr_unreserve, - .setup = setup_k7_watchdog, - .rearm = single_msr_rearm, - .stop = single_msr_stop_watchdog, - .perfctr = MSR_K7_PERFCTR0, - .evntsel = MSR_K7_EVNTSEL0, - .checkbit = 1ULL << 47, -}; - -/* - * Intel Model 6 (PPro+,P2,P3,P-M,Core1) - */ -#define P6_EVNTSEL0_ENABLE (1 << 22) -#define P6_EVNTSEL_INT (1 << 20) -#define P6_EVNTSEL_OS (1 << 17) -#define P6_EVNTSEL_USR (1 << 16) -#define P6_EVENT_CPU_CLOCKS_NOT_HALTED 0x79 -#define P6_NMI_EVENT P6_EVENT_CPU_CLOCKS_NOT_HALTED - -static int setup_p6_watchdog(unsigned nmi_hz) -{ - unsigned int perfctr_msr, evntsel_msr; - unsigned int evntsel; - struct 
nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); - - perfctr_msr = wd_ops->perfctr; - evntsel_msr = wd_ops->evntsel; - - /* KVM doesn't implement this MSR */ - if (wrmsr_safe(perfctr_msr, 0, 0) < 0) - return 0; - - evntsel = P6_EVNTSEL_INT - | P6_EVNTSEL_OS - | P6_EVNTSEL_USR - | P6_NMI_EVENT; - - /* setup the timer */ - wrmsr(evntsel_msr, evntsel, 0); - nmi_hz = adjust_for_32bit_ctr(nmi_hz); - write_watchdog_counter32(perfctr_msr, "P6_PERFCTR0", nmi_hz); - - /* initialize the wd struct before enabling */ - wd->perfctr_msr = perfctr_msr; - wd->evntsel_msr = evntsel_msr; - wd->cccr_msr = 0; /* unused */ - - /* ok, everything is initialized, announce that we're set */ - cpu_nmi_set_wd_enabled(); - - apic_write(APIC_LVTPC, APIC_DM_NMI); - evntsel |= P6_EVNTSEL0_ENABLE; - wrmsr(evntsel_msr, evntsel, 0); - - return 1; -} - -static void __kprobes p6_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz) -{ - /* - * P6 based Pentium M need to re-unmask - * the apic vector but it doesn't hurt - * other P6 variant. - * ArchPerfom/Core Duo also needs this - */ - apic_write(APIC_LVTPC, APIC_DM_NMI); - - /* P6/ARCH_PERFMON has 32 bit counter write */ - write_watchdog_counter32(wd->perfctr_msr, NULL, nmi_hz); -} - -static const struct wd_ops p6_wd_ops = { - .reserve = single_msr_reserve, - .unreserve = single_msr_unreserve, - .setup = setup_p6_watchdog, - .rearm = p6_rearm, - .stop = single_msr_stop_watchdog, - .perfctr = MSR_P6_PERFCTR0, - .evntsel = MSR_P6_EVNTSEL0, - .checkbit = 1ULL << 39, -}; - -/* - * Intel P4 performance counters. - * By far the most complicated of all. - */ -#define MSR_P4_MISC_ENABLE_PERF_AVAIL (1 << 7) -#define P4_ESCR_EVENT_SELECT(N) ((N) << 25) -#define P4_ESCR_OS (1 << 3) -#define P4_ESCR_USR (1 << 2) -#define P4_CCCR_OVF_PMI0 (1 << 26) -#define P4_CCCR_OVF_PMI1 (1 << 27) -#define P4_CCCR_THRESHOLD(N) ((N) << 20) -#define P4_CCCR_COMPLEMENT (1 << 19) -#define P4_CCCR_COMPARE (1 << 18) -#define P4_CCCR_REQUIRED (3 << 16) -#define P4_CCCR_ESCR_SELECT(N) ((N) << 13) -#define P4_CCCR_ENABLE (1 << 12) -#define P4_CCCR_OVF (1 << 31) - -#define P4_CONTROLS 18 -static unsigned int p4_controls[18] = { - MSR_P4_BPU_CCCR0, - MSR_P4_BPU_CCCR1, - MSR_P4_BPU_CCCR2, - MSR_P4_BPU_CCCR3, - MSR_P4_MS_CCCR0, - MSR_P4_MS_CCCR1, - MSR_P4_MS_CCCR2, - MSR_P4_MS_CCCR3, - MSR_P4_FLAME_CCCR0, - MSR_P4_FLAME_CCCR1, - MSR_P4_FLAME_CCCR2, - MSR_P4_FLAME_CCCR3, - MSR_P4_IQ_CCCR0, - MSR_P4_IQ_CCCR1, - MSR_P4_IQ_CCCR2, - MSR_P4_IQ_CCCR3, - MSR_P4_IQ_CCCR4, - MSR_P4_IQ_CCCR5, -}; -/* - * Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter - * CRU_ESCR0 (with any non-null event selector) through a complemented - * max threshold. 
[IA32-Vol3, Section 14.9.9] - */ -static int setup_p4_watchdog(unsigned nmi_hz) -{ - unsigned int perfctr_msr, evntsel_msr, cccr_msr; - unsigned int evntsel, cccr_val; - unsigned int misc_enable, dummy; - unsigned int ht_num; - struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); - - rdmsr(MSR_IA32_MISC_ENABLE, misc_enable, dummy); - if (!(misc_enable & MSR_P4_MISC_ENABLE_PERF_AVAIL)) - return 0; - -#ifdef CONFIG_SMP - /* detect which hyperthread we are on */ - if (smp_num_siblings == 2) { - unsigned int ebx, apicid; - - ebx = cpuid_ebx(1); - apicid = (ebx >> 24) & 0xff; - ht_num = apicid & 1; - } else -#endif - ht_num = 0; - - /* - * performance counters are shared resources - * assign each hyperthread its own set - * (re-use the ESCR0 register, seems safe - * and keeps the cccr_val the same) - */ - if (!ht_num) { - /* logical cpu 0 */ - perfctr_msr = MSR_P4_IQ_PERFCTR0; - evntsel_msr = MSR_P4_CRU_ESCR0; - cccr_msr = MSR_P4_IQ_CCCR0; - cccr_val = P4_CCCR_OVF_PMI0 | P4_CCCR_ESCR_SELECT(4); - - /* - * If we're on the kdump kernel or other situation, we may - * still have other performance counter registers set to - * interrupt and they'll keep interrupting forever because - * of the P4_CCCR_OVF quirk. So we need to ACK all the - * pending interrupts and disable all the registers here, - * before reenabling the NMI delivery. Refer to p4_rearm() - * about the P4_CCCR_OVF quirk. - */ - if (reset_devices) { - unsigned int low, high; - int i; - - for (i = 0; i < P4_CONTROLS; i++) { - rdmsr(p4_controls[i], low, high); - low &= ~(P4_CCCR_ENABLE | P4_CCCR_OVF); - wrmsr(p4_controls[i], low, high); - } - } - } else { - /* logical cpu 1 */ - perfctr_msr = MSR_P4_IQ_PERFCTR1; - evntsel_msr = MSR_P4_CRU_ESCR0; - cccr_msr = MSR_P4_IQ_CCCR1; - - /* Pentium 4 D processors don't support P4_CCCR_OVF_PMI1 */ - if (boot_cpu_data.x86_model == 4 && boot_cpu_data.x86_mask == 4) - cccr_val = P4_CCCR_OVF_PMI0; - else - cccr_val = P4_CCCR_OVF_PMI1; - cccr_val |= P4_CCCR_ESCR_SELECT(4); - } - - evntsel = P4_ESCR_EVENT_SELECT(0x3F) - | P4_ESCR_OS - | P4_ESCR_USR; - - cccr_val |= P4_CCCR_THRESHOLD(15) - | P4_CCCR_COMPLEMENT - | P4_CCCR_COMPARE - | P4_CCCR_REQUIRED; - - wrmsr(evntsel_msr, evntsel, 0); - wrmsr(cccr_msr, cccr_val, 0); - write_watchdog_counter(perfctr_msr, "P4_IQ_COUNTER0", nmi_hz); - - wd->perfctr_msr = perfctr_msr; - wd->evntsel_msr = evntsel_msr; - wd->cccr_msr = cccr_msr; - - /* ok, everything is initialized, announce that we're set */ - cpu_nmi_set_wd_enabled(); - - apic_write(APIC_LVTPC, APIC_DM_NMI); - cccr_val |= P4_CCCR_ENABLE; - wrmsr(cccr_msr, cccr_val, 0); - return 1; -} - -static void stop_p4_watchdog(void) -{ - struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); - wrmsr(wd->cccr_msr, 0, 0); - wrmsr(wd->evntsel_msr, 0, 0); -} - -static int p4_reserve(void) -{ - if (!reserve_perfctr_nmi(MSR_P4_IQ_PERFCTR0)) - return 0; -#ifdef CONFIG_SMP - if (smp_num_siblings > 1 && !reserve_perfctr_nmi(MSR_P4_IQ_PERFCTR1)) - goto fail1; -#endif - if (!reserve_evntsel_nmi(MSR_P4_CRU_ESCR0)) - goto fail2; - /* RED-PEN why is ESCR1 not reserved here? 
*/ - return 1; - fail2: -#ifdef CONFIG_SMP - if (smp_num_siblings > 1) - release_perfctr_nmi(MSR_P4_IQ_PERFCTR1); - fail1: -#endif - release_perfctr_nmi(MSR_P4_IQ_PERFCTR0); - return 0; -} - -static void p4_unreserve(void) -{ -#ifdef CONFIG_SMP - if (smp_num_siblings > 1) - release_perfctr_nmi(MSR_P4_IQ_PERFCTR1); -#endif - release_evntsel_nmi(MSR_P4_CRU_ESCR0); - release_perfctr_nmi(MSR_P4_IQ_PERFCTR0); -} - -static void __kprobes p4_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz) -{ - unsigned dummy; - /* - * P4 quirks: - * - An overflown perfctr will assert its interrupt - * until the OVF flag in its CCCR is cleared. - * - LVTPC is masked on interrupt and must be - * unmasked by the LVTPC handler. - */ - rdmsrl(wd->cccr_msr, dummy); - dummy &= ~P4_CCCR_OVF; - wrmsrl(wd->cccr_msr, dummy); - apic_write(APIC_LVTPC, APIC_DM_NMI); - /* start the cycle over again */ - write_watchdog_counter(wd->perfctr_msr, NULL, nmi_hz); -} - -static const struct wd_ops p4_wd_ops = { - .reserve = p4_reserve, - .unreserve = p4_unreserve, - .setup = setup_p4_watchdog, - .rearm = p4_rearm, - .stop = stop_p4_watchdog, - /* RED-PEN this is wrong for the other sibling */ - .perfctr = MSR_P4_BPU_PERFCTR0, - .evntsel = MSR_P4_BSU_ESCR0, - .checkbit = 1ULL << 39, -}; - -/* - * Watchdog using the Intel architected PerfMon. - * Used for Core2 and hopefully all future Intel CPUs. - */ -#define ARCH_PERFMON_NMI_EVENT_SEL ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL -#define ARCH_PERFMON_NMI_EVENT_UMASK ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK - -static struct wd_ops intel_arch_wd_ops; - -static int setup_intel_arch_watchdog(unsigned nmi_hz) -{ - unsigned int ebx; - union cpuid10_eax eax; - unsigned int unused; - unsigned int perfctr_msr, evntsel_msr; - unsigned int evntsel; - struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); - - /* - * Check whether the Architectural PerfMon supports - * Unhalted Core Cycles Event or not. - * NOTE: Corresponding bit = 0 in ebx indicates event present. 
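 * (Aside: leaf 0xA reports the number of valid ebx bits in
 * eax[31:24] (mask_length), so the test below bails out both when the
 * UnHalted Core Cycles index lies outside that length and when its
 * ebx bit reads 1, i.e. when the event is absent.)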
- */ - cpuid(10, &(eax.full), &ebx, &unused, &unused); - if ((eax.split.mask_length < - (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX+1)) || - (ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT)) - return 0; - - perfctr_msr = wd_ops->perfctr; - evntsel_msr = wd_ops->evntsel; - - wrmsrl(perfctr_msr, 0UL); - - evntsel = ARCH_PERFMON_EVENTSEL_INT - | ARCH_PERFMON_EVENTSEL_OS - | ARCH_PERFMON_EVENTSEL_USR - | ARCH_PERFMON_NMI_EVENT_SEL - | ARCH_PERFMON_NMI_EVENT_UMASK; - - /* setup the timer */ - wrmsr(evntsel_msr, evntsel, 0); - nmi_hz = adjust_for_32bit_ctr(nmi_hz); - write_watchdog_counter32(perfctr_msr, "INTEL_ARCH_PERFCTR0", nmi_hz); - - wd->perfctr_msr = perfctr_msr; - wd->evntsel_msr = evntsel_msr; - wd->cccr_msr = 0; /* unused */ - - /* ok, everything is initialized, announce that we're set */ - cpu_nmi_set_wd_enabled(); - - apic_write(APIC_LVTPC, APIC_DM_NMI); - evntsel |= ARCH_PERFMON_EVENTSEL_ENABLE; - wrmsr(evntsel_msr, evntsel, 0); - intel_arch_wd_ops.checkbit = 1ULL << (eax.split.bit_width - 1); - return 1; -} - -static struct wd_ops intel_arch_wd_ops __read_mostly = { - .reserve = single_msr_reserve, - .unreserve = single_msr_unreserve, - .setup = setup_intel_arch_watchdog, - .rearm = p6_rearm, - .stop = single_msr_stop_watchdog, - .perfctr = MSR_ARCH_PERFMON_PERFCTR1, - .evntsel = MSR_ARCH_PERFMON_EVENTSEL1, -}; - -static void probe_nmi_watchdog(void) -{ - switch (boot_cpu_data.x86_vendor) { - case X86_VENDOR_AMD: - if (boot_cpu_data.x86 == 6 || - (boot_cpu_data.x86 >= 0xf && boot_cpu_data.x86 <= 0x15)) - wd_ops = &k7_wd_ops; - return; - case X86_VENDOR_INTEL: - /* Work around where perfctr1 doesn't have a working enable - * bit as described in the following errata: - * AE49 Core Duo and Intel Core Solo 65 nm - * AN49 Intel Pentium Dual-Core - * AF49 Dual-Core Intel Xeon Processor LV - */ - if ((boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 14) || - ((boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 15 && - boot_cpu_data.x86_mask == 4))) { - intel_arch_wd_ops.perfctr = MSR_ARCH_PERFMON_PERFCTR0; - intel_arch_wd_ops.evntsel = MSR_ARCH_PERFMON_EVENTSEL0; - } - if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { - wd_ops = &intel_arch_wd_ops; - break; - } - switch (boot_cpu_data.x86) { - case 6: - if (boot_cpu_data.x86_model > 13) - return; - - wd_ops = &p6_wd_ops; - break; - case 15: - wd_ops = &p4_wd_ops; - break; - default: - return; - } - break; - } -} - -/* Interface to nmi.c */ - -int lapic_watchdog_init(unsigned nmi_hz) -{ - if (!wd_ops) { - probe_nmi_watchdog(); - if (!wd_ops) { - printk(KERN_INFO "NMI watchdog: CPU not supported\n"); - return -1; - } - - if (!wd_ops->reserve()) { - printk(KERN_ERR - "NMI watchdog: cannot reserve perfctrs\n"); - return -1; - } - } - - if (!(wd_ops->setup(nmi_hz))) { - printk(KERN_ERR "Cannot setup NMI watchdog on CPU %d\n", - raw_smp_processor_id()); - return -1; - } - - return 0; -} - -void lapic_watchdog_stop(void) -{ - if (wd_ops) - wd_ops->stop(); -} - -unsigned lapic_adjust_nmi_hz(unsigned hz) -{ - struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); - if (wd->perfctr_msr == MSR_P6_PERFCTR0 || - wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR1) - hz = adjust_for_32bit_ctr(hz); - return hz; -} - -int __kprobes lapic_wd_event(unsigned nmi_hz) -{ - struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); - u64 ctr; - - rdmsrl(wd->perfctr_msr, ctr); - if (ctr & wd_ops->checkbit) /* perfctr still running? 
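 * (Aside: checkbit is the highest implemented counter bit, 1ULL << 47
 * on K7 and 1ULL << 39 on P6/P4 per the wd_ops above; the counter is
 * programmed with a negative value and counts up, so while this bit is
 * still set no overflow has happened and the NMI was not ours.)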
*/ - return 0; - - wd_ops->rearm(wd, nmi_hz); - return 1; -} diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c index 1b7b31ab7d86..212a6a42527c 100644 --- a/arch/x86/kernel/cpuid.c +++ b/arch/x86/kernel/cpuid.c @@ -33,7 +33,6 @@ #include <linux/init.h> #include <linux/poll.h> #include <linux/smp.h> -#include <linux/smp_lock.h> #include <linux/major.h> #include <linux/fs.h> #include <linux/device.h> diff --git a/arch/x86/kernel/crash_dump_32.c b/arch/x86/kernel/crash_dump_32.c index 67414550c3cc..d5cd13945d5a 100644 --- a/arch/x86/kernel/crash_dump_32.c +++ b/arch/x86/kernel/crash_dump_32.c @@ -61,7 +61,7 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf, if (!is_crashed_pfn_valid(pfn)) return -EFAULT; - vaddr = kmap_atomic_pfn(pfn, KM_PTE0); + vaddr = kmap_atomic_pfn(pfn); if (!userbuf) { memcpy(buf, (vaddr + offset), csize); diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 6e8752c1bd52..8474c998cbd4 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -175,21 +175,21 @@ static const struct stacktrace_ops print_trace_ops = { void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, unsigned long bp, char *log_lvl) + unsigned long *stack, char *log_lvl) { printk("%sCall Trace:\n", log_lvl); - dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl); + dump_trace(task, regs, stack, &print_trace_ops, log_lvl); } void show_trace(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, unsigned long bp) + unsigned long *stack) { - show_trace_log_lvl(task, regs, stack, bp, ""); + show_trace_log_lvl(task, regs, stack, ""); } void show_stack(struct task_struct *task, unsigned long *sp) { - show_stack_log_lvl(task, NULL, sp, 0, ""); + show_stack_log_lvl(task, NULL, sp, ""); } /* @@ -210,7 +210,7 @@ void dump_stack(void) init_utsname()->release, (int)strcspn(init_utsname()->version, " "), init_utsname()->version); - show_trace(NULL, NULL, &stack, bp); + show_trace(NULL, NULL, &stack); } EXPORT_SYMBOL(dump_stack); diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index 0f6376ffa2d9..74cc1eda384b 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -17,11 +17,12 @@ #include <asm/stacktrace.h> -void dump_trace(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, unsigned long bp, +void dump_trace(struct task_struct *task, + struct pt_regs *regs, unsigned long *stack, const struct stacktrace_ops *ops, void *data) { int graph = 0; + unsigned long bp; if (!task) task = current; @@ -34,18 +35,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, stack = (unsigned long *)task->thread.sp; } -#ifdef CONFIG_FRAME_POINTER - if (!bp) { - if (task == current) { - /* Grab bp right from our regs */ - get_bp(bp); - } else { - /* bp is the last reg pushed by switch_to */ - bp = *(unsigned long *) task->thread.sp; - } - } -#endif - + bp = stack_frame(task, regs); for (;;) { struct thread_info *context; @@ -65,7 +55,7 @@ EXPORT_SYMBOL(dump_trace); void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *sp, unsigned long bp, char *log_lvl) + unsigned long *sp, char *log_lvl) { unsigned long *stack; int i; @@ -82,12 +72,12 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, if (kstack_end(stack)) break; if (i && ((i % STACKSLOTS_PER_LINE) == 0)) - printk("\n%s", log_lvl); - printk(" %08lx", *stack++); + printk(KERN_CONT "\n"); + printk(KERN_CONT " 
%08lx", *stack++); touch_nmi_watchdog(); } - printk("\n"); - show_trace_log_lvl(task, regs, sp, bp, log_lvl); + printk(KERN_CONT "\n"); + show_trace_log_lvl(task, regs, sp, log_lvl); } @@ -112,8 +102,7 @@ void show_registers(struct pt_regs *regs) u8 *ip; printk(KERN_EMERG "Stack:\n"); - show_stack_log_lvl(NULL, regs, ®s->sp, - 0, KERN_EMERG); + show_stack_log_lvl(NULL, regs, ®s->sp, KERN_EMERG); printk(KERN_EMERG "Code: "); diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 57a21f11c791..64101335de19 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -139,8 +139,8 @@ fixup_bp_irq_link(unsigned long bp, unsigned long *stack, * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack */ -void dump_trace(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, unsigned long bp, +void dump_trace(struct task_struct *task, + struct pt_regs *regs, unsigned long *stack, const struct stacktrace_ops *ops, void *data) { const unsigned cpu = get_cpu(); @@ -149,6 +149,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, unsigned used = 0; struct thread_info *tinfo; int graph = 0; + unsigned long bp; if (!task) task = current; @@ -160,18 +161,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, stack = (unsigned long *)task->thread.sp; } -#ifdef CONFIG_FRAME_POINTER - if (!bp) { - if (task == current) { - /* Grab bp right from our regs */ - get_bp(bp); - } else { - /* bp is the last reg pushed by switch_to */ - bp = *(unsigned long *) task->thread.sp; - } - } -#endif - + bp = stack_frame(task, regs); /* * Print function call entries in all stacks, starting at the * current stack address. If the stacks consist of nested @@ -235,7 +225,7 @@ EXPORT_SYMBOL(dump_trace); void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *sp, unsigned long bp, char *log_lvl) + unsigned long *sp, char *log_lvl) { unsigned long *irq_stack_end; unsigned long *irq_stack; @@ -265,21 +255,21 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, if (stack >= irq_stack && stack <= irq_stack_end) { if (stack == irq_stack_end) { stack = (unsigned long *) (irq_stack_end[-1]); - printk(" <EOI> "); + printk(KERN_CONT " <EOI> "); } } else { if (((long) stack & (THREAD_SIZE-1)) == 0) break; } if (i && ((i % STACKSLOTS_PER_LINE) == 0)) - printk("\n%s", log_lvl); - printk(" %016lx", *stack++); + printk(KERN_CONT "\n"); + printk(KERN_CONT " %016lx", *stack++); touch_nmi_watchdog(); } preempt_enable(); - printk("\n"); - show_trace_log_lvl(task, regs, sp, bp, log_lvl); + printk(KERN_CONT "\n"); + show_trace_log_lvl(task, regs, sp, log_lvl); } void show_registers(struct pt_regs *regs) @@ -308,7 +298,7 @@ void show_registers(struct pt_regs *regs) printk(KERN_EMERG "Stack:\n"); show_stack_log_lvl(NULL, regs, (unsigned long *)sp, - regs->bp, KERN_EMERG); + KERN_EMERG); printk(KERN_EMERG "Code: "); diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 59e175e89599..591e60104278 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -395,7 +395,7 @@ sysenter_past_esp: * A tiny bit of offset fixup is necessary - 4*4 means the 4 words * pushed above; +8 corresponds to copy_thread's esp0 setting. 
*/ - pushl_cfi (TI_sysenter_return-THREAD_SIZE_asm+8+4*4)(%esp) + pushl_cfi ((TI_sysenter_return)-THREAD_SIZE_asm+8+4*4)(%esp) CFI_REL_OFFSET eip, 0 pushl_cfi %eax diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index fe2690d71c0c..e3ba417e8697 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -295,6 +295,7 @@ ENDPROC(native_usergs_sysret64) .endm /* save partial stack frame */ + .pushsection .kprobes.text, "ax" ENTRY(save_args) XCPT_FRAME cld @@ -334,6 +335,7 @@ ENTRY(save_args) ret CFI_ENDPROC END(save_args) + .popsection ENTRY(save_rest) PARTIAL_FRAME 1 REST_SKIP+8 diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index bcece91dd311..5707fc8a7a4b 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -60,16 +60,18 @@ #define PAGE_TABLE_SIZE(pages) ((pages) / PTRS_PER_PGD) #endif +/* Number of possible pages in the lowmem region */ +LOWMEM_PAGES = (((1<<32) - __PAGE_OFFSET) >> PAGE_SHIFT) + /* Enough space to fit pagetables for the low memory linear map */ -MAPPING_BEYOND_END = \ - PAGE_TABLE_SIZE(((1<<32) - __PAGE_OFFSET) >> PAGE_SHIFT) << PAGE_SHIFT +MAPPING_BEYOND_END = PAGE_TABLE_SIZE(LOWMEM_PAGES) << PAGE_SHIFT /* * Worst-case size of the kernel mapping we need to make: - * the worst-case size of the kernel itself, plus the extra we need - * to map for the linear map. + * a relocatable kernel can live anywhere in lowmem, so we need to be able + * to map all of lowmem. */ -KERNEL_PAGES = (KERNEL_IMAGE_SIZE + MAPPING_BEYOND_END)>>PAGE_SHIFT +KERNEL_PAGES = LOWMEM_PAGES INIT_MAP_SIZE = PAGE_TABLE_SIZE(KERNEL_PAGES) * PAGE_SIZE_asm RESERVE_BRK(pagetables, INIT_MAP_SIZE) @@ -314,6 +316,10 @@ ENTRY(startup_32_smp) subl $0x80000001, %eax cmpl $(0x8000ffff-0x80000001), %eax ja 6f + + /* Clear bogus XD_DISABLE bits */ + call verify_cpu + mov $0x80000001, %eax cpuid /* Execute Disable bit supported? */ @@ -609,6 +615,8 @@ ignore_int: #endif iret +#include "verify_cpu.S" + __REFDATA .align 4 ENTRY(initial_code) @@ -620,13 +628,13 @@ ENTRY(initial_code) __PAGE_ALIGNED_BSS .align PAGE_SIZE_asm #ifdef CONFIG_X86_PAE -initial_pg_pmd: +ENTRY(initial_pg_pmd) .fill 1024*KPMDS,4,0 #else ENTRY(initial_page_table) .fill 1024,4,0 #endif -initial_pg_fixmap: +ENTRY(initial_pg_fixmap) .fill 1024,4,0 ENTRY(empty_zero_page) .fill 4096,1,0 diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index aff0b3c27509..4ff5968f12d2 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -27,6 +27,9 @@ #define HPET_DEV_FSB_CAP 0x1000 #define HPET_DEV_PERI_CAP 0x2000 +#define HPET_MIN_CYCLES 128 +#define HPET_MIN_PROG_DELTA (HPET_MIN_CYCLES + (HPET_MIN_CYCLES >> 1)) + #define EVT_TO_HPET_DEV(evt) container_of(evt, struct hpet_dev, evt) /* @@ -299,8 +302,9 @@ static void hpet_legacy_clockevent_register(void) /* Calculate the min / max delta */ hpet_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFFFF, &hpet_clockevent); - /* 5 usec minimum reprogramming delta. */ - hpet_clockevent.min_delta_ns = 5000; + /* Setup minimum reprogramming delta. */ + hpet_clockevent.min_delta_ns = clockevent_delta2ns(HPET_MIN_PROG_DELTA, + &hpet_clockevent); /* * Start hpet with the boot cpu mask and make it @@ -393,22 +397,24 @@ static int hpet_next_event(unsigned long delta, * the wraparound into account) nor a simple count down event * mode. Further the write to the comparator register is * delayed internally up to two HPET clock cycles in certain - * chipsets (ATI, ICH9,10). 
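 * (Aside: with HPET_MIN_CYCLES = 128, the ETIME decision below only
 * requires the comparator to end up at least 128 cycles ahead of the
 * counter after the write, while the clockevents core is handed the
 * larger margin of 1.5 * 128 = 192 cycles via HPET_MIN_PROG_DELTA.)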
We worked around that by reading - * back the compare register, but that required another - * workaround for ICH9,10 chips where the first readout after - * write can return the old stale value. We already have a - * minimum delta of 5us enforced, but a NMI or SMI hitting + * chipsets (ATI, ICH9,10). Some newer AMD chipsets have even + * longer delays. We worked around that by reading back the + * compare register, but that required another workaround for + * ICH9,10 chips where the first readout after write can + * return the old stale value. We already had a minimum + * programming delta of 5us enforced, but a NMI or SMI hitting * between the counter readout and the comparator write can * move us behind that point easily. Now instead of reading * the compare register back several times, we make the ETIME * decision based on the following: Return ETIME if the - * counter value after the write is less than 8 HPET cycles + * counter value after the write is less than HPET_MIN_CYCLES * away from the event or if the counter is already ahead of - * the event. + * the event. The minimum programming delta for the generic + * clockevents code is set to 1.5 * HPET_MIN_CYCLES. */ res = (s32)(cnt - hpet_readl(HPET_COUNTER)); - return res < 8 ? -ETIME : 0; + return res < HPET_MIN_CYCLES ? -ETIME : 0; } static void hpet_legacy_set_mode(enum clock_event_mode mode, @@ -713,7 +719,7 @@ static int hpet_cpuhp_notify(struct notifier_block *n, switch (action & 0xf) { case CPU_ONLINE: - INIT_DELAYED_WORK_ON_STACK(&work.work, hpet_work); + INIT_DELAYED_WORK_ONSTACK(&work.work, hpet_work); init_completion(&work.complete); /* FIXME: add schedule_work_on() */ schedule_delayed_work_on(cpu, &work.work, 0); diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c index ff15c9dcc25d..42c594254507 100644 --- a/arch/x86/kernel/hw_breakpoint.c +++ b/arch/x86/kernel/hw_breakpoint.c @@ -433,6 +433,10 @@ static int __kprobes hw_breakpoint_handler(struct die_args *args) dr6_p = (unsigned long *)ERR_PTR(args->err); dr6 = *dr6_p; + /* If it's a single step, TRAP bits are random */ + if (dr6 & DR_STEP) + return NOTIFY_DONE; + /* Do an early return if no trap bits are set in DR6 */ if ((dr6 & DR_TRAP_BITS) == 0) return NOTIFY_DONE; diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 50fbbe60e507..96656f207751 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -17,6 +17,7 @@ #include <linux/delay.h> #include <linux/uaccess.h> #include <linux/percpu.h> +#include <linux/mm.h> #include <asm/apic.h> @@ -60,9 +61,6 @@ union irq_ctx { static DEFINE_PER_CPU(union irq_ctx *, hardirq_ctx); static DEFINE_PER_CPU(union irq_ctx *, softirq_ctx); -static DEFINE_PER_CPU_MULTIPAGE_ALIGNED(union irq_ctx, hardirq_stack, THREAD_SIZE); -static DEFINE_PER_CPU_MULTIPAGE_ALIGNED(union irq_ctx, softirq_stack, THREAD_SIZE); - static void call_on_stack(void *func, void *stack) { asm volatile("xchgl %%ebx,%%esp \n" @@ -128,7 +126,9 @@ void __cpuinit irq_ctx_init(int cpu) if (per_cpu(hardirq_ctx, cpu)) return; - irqctx = &per_cpu(hardirq_stack, cpu); + irqctx = page_address(alloc_pages_node(cpu_to_node(cpu), + THREAD_FLAGS, + THREAD_ORDER)); irqctx->tinfo.task = NULL; irqctx->tinfo.exec_domain = NULL; irqctx->tinfo.cpu = cpu; @@ -137,7 +137,9 @@ void __cpuinit irq_ctx_init(int cpu) per_cpu(hardirq_ctx, cpu) = irqctx; - irqctx = &per_cpu(softirq_stack, cpu); + irqctx = page_address(alloc_pages_node(cpu_to_node(cpu), + THREAD_FLAGS, + THREAD_ORDER)); irqctx->tinfo.task = NULL; 
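/*
 * (Aside: each of the two alloc_pages_node() calls in this hunk
 * returns 2^THREAD_ORDER pages, i.e. THREAD_SIZE bytes, taken from
 * the CPU's home node; they replace the static per-CPU hardirq_stack
 * and softirq_stack arrays removed above.)
 */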
irqctx->tinfo.exec_domain = NULL; irqctx->tinfo.cpu = cpu; @@ -150,11 +152,6 @@ void __cpuinit irq_ctx_init(int cpu) cpu, per_cpu(hardirq_ctx, cpu), per_cpu(softirq_ctx, cpu)); } -void irq_ctx_exit(int cpu) -{ - per_cpu(hardirq_ctx, cpu) = NULL; -} - asmlinkage void do_softirq(void) { unsigned long flags; diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index d81cfebb848f..cd21b654dec6 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -315,14 +315,18 @@ static void kgdb_remove_all_hw_break(void) if (!breakinfo[i].enabled) continue; bp = *per_cpu_ptr(breakinfo[i].pev, cpu); - if (bp->attr.disabled == 1) + if (!bp->attr.disabled) { + arch_uninstall_hw_breakpoint(bp); + bp->attr.disabled = 1; continue; + } if (dbg_is_early) early_dr7 &= ~encode_dr7(i, breakinfo[i].len, breakinfo[i].type); - else - arch_uninstall_hw_breakpoint(bp); - bp->attr.disabled = 1; + else if (hw_break_release_slot(i)) + printk(KERN_ERR "KGDB: hw bpt remove failed %lx\n", + breakinfo[i].addr); + breakinfo[i].enabled = 0; } } @@ -387,7 +391,7 @@ kgdb_set_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype) * disable hardware debugging while it is processing gdb packets or * handling exception. */ -void kgdb_disable_hw_debug(struct pt_regs *regs) +static void kgdb_disable_hw_debug(struct pt_regs *regs) { int i; int cpu = raw_smp_processor_id(); @@ -724,6 +728,7 @@ struct kgdb_arch arch_kgdb_ops = { .flags = KGDB_HW_BREAKPOINT, .set_hw_breakpoint = kgdb_set_hw_break, .remove_hw_breakpoint = kgdb_remove_hw_break, + .disable_hw_break = kgdb_disable_hw_debug, .remove_all_hw_break = kgdb_remove_all_hw_break, .correct_hw_break = kgdb_correct_hw_break, }; diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index 1cbd54c0df99..5940282bd2f9 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c @@ -1184,6 +1184,10 @@ static void __kprobes optimized_callback(struct optimized_kprobe *op, { struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); + /* This is possible if op is under delayed unoptimizing */ + if (kprobe_disabled(&op->kp)) + return; + preempt_disable(); if (kprobe_running()) { kprobes_inc_nmissed_count(&op->kp); @@ -1401,10 +1405,16 @@ int __kprobes arch_prepare_optimized_kprobe(struct optimized_kprobe *op) return 0; } -/* Replace a breakpoint (int3) with a relative jump. */ -int __kprobes arch_optimize_kprobe(struct optimized_kprobe *op) +#define MAX_OPTIMIZE_PROBES 256 +static struct text_poke_param *jump_poke_params; +static struct jump_poke_buffer { + u8 buf[RELATIVEJUMP_SIZE]; +} *jump_poke_bufs; + +static void __kprobes setup_optimize_kprobe(struct text_poke_param *tprm, + u8 *insn_buf, + struct optimized_kprobe *op) { - unsigned char jmp_code[RELATIVEJUMP_SIZE]; s32 rel = (s32)((long)op->optinsn.insn - ((long)op->kp.addr + RELATIVEJUMP_SIZE)); @@ -1412,16 +1422,79 @@ int __kprobes arch_optimize_kprobe(struct optimized_kprobe *op) memcpy(op->optinsn.copied_insn, op->kp.addr + INT3_SIZE, RELATIVE_ADDR_SIZE); - jmp_code[0] = RELATIVEJUMP_OPCODE; - *(s32 *)(&jmp_code[1]) = rel; + insn_buf[0] = RELATIVEJUMP_OPCODE; + *(s32 *)(&insn_buf[1]) = rel; + + tprm->addr = op->kp.addr; + tprm->opcode = insn_buf; + tprm->len = RELATIVEJUMP_SIZE; +} + +/* + * Replace breakpoints (int3) with relative jumps. + * Caller must call with locking kprobe_mutex and text_mutex. 
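 * (Aside: batching is what this rework buys: text_poke_smp()
 * synchronizes all CPUs on every call, so collecting up to
 * MAX_OPTIMIZE_PROBES (256) sites in jump_poke_params and flushing
 * them through one text_poke_smp_batch() pays that cost once per
 * batch rather than once per probe.)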
+ */ +void __kprobes arch_optimize_kprobes(struct list_head *oplist) +{ + struct optimized_kprobe *op, *tmp; + int c = 0; + + list_for_each_entry_safe(op, tmp, oplist, list) { + WARN_ON(kprobe_disabled(&op->kp)); + /* Setup param */ + setup_optimize_kprobe(&jump_poke_params[c], + jump_poke_bufs[c].buf, op); + list_del_init(&op->list); + if (++c >= MAX_OPTIMIZE_PROBES) + break; + } /* * text_poke_smp doesn't support NMI/MCE code modifying. * However, since kprobes itself also doesn't support NMI/MCE * code probing, it's not a problem. */ - text_poke_smp(op->kp.addr, jmp_code, RELATIVEJUMP_SIZE); - return 0; + text_poke_smp_batch(jump_poke_params, c); +} + +static void __kprobes setup_unoptimize_kprobe(struct text_poke_param *tprm, + u8 *insn_buf, + struct optimized_kprobe *op) +{ + /* Set int3 to first byte for kprobes */ + insn_buf[0] = BREAKPOINT_INSTRUCTION; + memcpy(insn_buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE); + + tprm->addr = op->kp.addr; + tprm->opcode = insn_buf; + tprm->len = RELATIVEJUMP_SIZE; +} + +/* + * Recover original instructions and breakpoints from relative jumps. + * Caller must call with locking kprobe_mutex. + */ +extern void arch_unoptimize_kprobes(struct list_head *oplist, + struct list_head *done_list) +{ + struct optimized_kprobe *op, *tmp; + int c = 0; + + list_for_each_entry_safe(op, tmp, oplist, list) { + /* Setup param */ + setup_unoptimize_kprobe(&jump_poke_params[c], + jump_poke_bufs[c].buf, op); + list_move(&op->list, done_list); + if (++c >= MAX_OPTIMIZE_PROBES) + break; + } + + /* + * text_poke_smp doesn't support NMI/MCE code modifying. + * However, since kprobes itself also doesn't support NMI/MCE + * code probing, it's not a problem. + */ + text_poke_smp_batch(jump_poke_params, c); } /* Replace a relative jump with a breakpoint (int3). 
*/ @@ -1453,11 +1526,35 @@ static int __kprobes setup_detour_execution(struct kprobe *p, } return 0; } + +static int __kprobes init_poke_params(void) +{ + /* Allocate code buffer and parameter array */ + jump_poke_bufs = kmalloc(sizeof(struct jump_poke_buffer) * + MAX_OPTIMIZE_PROBES, GFP_KERNEL); + if (!jump_poke_bufs) + return -ENOMEM; + + jump_poke_params = kmalloc(sizeof(struct text_poke_param) * + MAX_OPTIMIZE_PROBES, GFP_KERNEL); + if (!jump_poke_params) { + kfree(jump_poke_bufs); + jump_poke_bufs = NULL; + return -ENOMEM; + } + + return 0; +} +#else /* !CONFIG_OPTPROBES */ +static int __kprobes init_poke_params(void) +{ + return 0; +} #endif int __init arch_init_kprobes(void) { - return 0; + return init_poke_params(); } int __kprobes arch_trampoline_kprobe(struct kprobe *p) diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index e1af7c055c7d..ce0cb4721c9a 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -212,7 +212,7 @@ static int install_equiv_cpu_table(const u8 *buf) return 0; } - equiv_cpu_table = (struct equiv_cpu_entry *) vmalloc(size); + equiv_cpu_table = vmalloc(size); if (!equiv_cpu_table) { pr_err("failed to allocate equivalent CPU table\n"); return 0; diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c index dcb65cc0a053..1a1b606d3e92 100644 --- a/arch/x86/kernel/microcode_intel.c +++ b/arch/x86/kernel/microcode_intel.c @@ -364,8 +364,7 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size, /* For performance reasons, reuse mc area when possible */ if (!mc || mc_size > curr_mc_size) { - if (mc) - vfree(mc); + vfree(mc); mc = vmalloc(mc_size); if (!mc) break; @@ -374,13 +373,11 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size, if (get_ucode_data(mc, ucode_ptr, mc_size) || microcode_sanity_check(mc) < 0) { - vfree(mc); break; } if (get_matching_microcode(&uci->cpu_sig, mc, new_rev)) { - if (new_mc) - vfree(new_mc); + vfree(new_mc); new_rev = mc_header.rev; new_mc = mc; mc = NULL; /* trigger new vmalloc */ @@ -390,12 +387,10 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size, leftover -= mc_size; } - if (mc) - vfree(mc); + vfree(mc); if (leftover) { - if (new_mc) - vfree(new_mc); + vfree(new_mc); state = UCODE_ERROR; goto out; } @@ -405,8 +400,7 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size, goto out; } - if (uci->mc) - vfree(uci->mc); + vfree(uci->mc); uci->mc = (struct microcode_intel *)new_mc; pr_debug("CPU%d found a matching microcode update with version 0x%x (current=0x%x)\n", diff --git a/arch/x86/kernel/mmconf-fam10h_64.c b/arch/x86/kernel/mmconf-fam10h_64.c index 71825806cd44..ac861b8348e2 100644 --- a/arch/x86/kernel/mmconf-fam10h_64.c +++ b/arch/x86/kernel/mmconf-fam10h_64.c @@ -25,7 +25,6 @@ struct pci_hostbridge_probe { }; static u64 __cpuinitdata fam10h_pci_mmconf_base; -static int __cpuinitdata fam10h_pci_mmconf_base_status; static struct pci_hostbridge_probe pci_probes[] __cpuinitdata = { { 0, 0x18, PCI_VENDOR_ID_AMD, 0x1200 }, @@ -44,10 +43,12 @@ static int __cpuinit cmp_range(const void *x1, const void *x2) return start1 - start2; } -/*[47:0] */ -/* need to avoid (0xfd<<32) and (0xfe<<32), ht used space */ +#define MMCONF_UNIT (1ULL << FAM10H_MMIO_CONF_BASE_SHIFT) +#define MMCONF_MASK (~(MMCONF_UNIT - 1)) +#define MMCONF_SIZE (MMCONF_UNIT << 8) +/* need to avoid (0xfd<<32), (0xfe<<32), and (0xff<<32), ht used space */ #define 
FAM10H_PCI_MMCONF_BASE (0xfcULL<<32) -#define BASE_VALID(b) ((b != (0xfdULL << 32)) && (b != (0xfeULL << 32))) +#define BASE_VALID(b) ((b) + MMCONF_SIZE <= (0xfdULL<<32) || (b) >= (1ULL<<40)) static void __cpuinit get_fam10h_pci_mmconf_base(void) { int i; @@ -64,12 +65,11 @@ static void __cpuinit get_fam10h_pci_mmconf_base(void) struct range range[8]; /* only try to get setting from BSP */ - /* -1 or 1 */ - if (fam10h_pci_mmconf_base_status) + if (fam10h_pci_mmconf_base) return; if (!early_pci_allowed()) - goto fail; + return; found = 0; for (i = 0; i < ARRAY_SIZE(pci_probes); i++) { @@ -91,7 +91,7 @@ static void __cpuinit get_fam10h_pci_mmconf_base(void) } if (!found) - goto fail; + return; /* SYS_CFG */ address = MSR_K8_SYSCFG; @@ -99,16 +99,16 @@ static void __cpuinit get_fam10h_pci_mmconf_base(void) /* TOP_MEM2 is not enabled? */ if (!(val & (1<<21))) { - tom2 = 0; + tom2 = 1ULL << 32; } else { /* TOP_MEM2 */ address = MSR_K8_TOP_MEM2; rdmsrl(address, val); - tom2 = val & (0xffffULL<<32); + tom2 = max(val & 0xffffff800000ULL, 1ULL << 32); } if (base <= tom2) - base = tom2 + (1ULL<<32); + base = (tom2 + 2 * MMCONF_UNIT - 1) & MMCONF_MASK; /* * need to check if the range is in the high mmio range that is @@ -123,11 +123,11 @@ static void __cpuinit get_fam10h_pci_mmconf_base(void) if (!(reg & 3)) continue; - start = (((u64)reg) << 8) & (0xffULL << 32); /* 39:16 on 31:8*/ + start = (u64)(reg & 0xffffff00) << 8; /* 39:16 on 31:8*/ reg = read_pci_config(bus, slot, 1, 0x84 + (i << 3)); - end = (((u64)reg) << 8) & (0xffULL << 32); /* 39:16 on 31:8*/ + end = ((u64)(reg & 0xffffff00) << 8) | 0xffff; /* 39:16 on 31:8*/ - if (!end) + if (end < tom2) continue; range[hi_mmio_num].start = start; @@ -143,32 +143,27 @@ static void __cpuinit get_fam10h_pci_mmconf_base(void) if (range[hi_mmio_num - 1].end < base) goto out; - if (range[0].start > base) + if (range[0].start > base + MMCONF_SIZE) goto out; /* need to find one window */ - base = range[0].start - (1ULL << 32); + base = (range[0].start & MMCONF_MASK) - MMCONF_UNIT; if ((base > tom2) && BASE_VALID(base)) goto out; - base = range[hi_mmio_num - 1].end + (1ULL << 32); - if ((base > tom2) && BASE_VALID(base)) + base = (range[hi_mmio_num - 1].end + MMCONF_UNIT) & MMCONF_MASK; + if (BASE_VALID(base)) goto out; /* need to find window between ranges */ - if (hi_mmio_num > 1) - for (i = 0; i < hi_mmio_num - 1; i++) { - if (range[i + 1].start > (range[i].end + (1ULL << 32))) { - base = range[i].end + (1ULL << 32); - if ((base > tom2) && BASE_VALID(base)) - goto out; - } + for (i = 1; i < hi_mmio_num; i++) { + base = (range[i - 1].end + MMCONF_UNIT) & MMCONF_MASK; + val = range[i].start & MMCONF_MASK; + if (val >= base + MMCONF_SIZE && BASE_VALID(base)) + goto out; } - -fail: - fam10h_pci_mmconf_base_status = -1; return; + out: fam10h_pci_mmconf_base = base; - fam10h_pci_mmconf_base_status = 1; } void __cpuinit fam10h_check_enable_mmcfg(void) @@ -190,11 +185,10 @@ void __cpuinit fam10h_check_enable_mmcfg(void) /* only trust the one handle 256 buses, if acpi=off */ if (!acpi_pci_disabled || busnbits >= 8) { - u64 base; - base = val & (0xffffULL << 32); - if (fam10h_pci_mmconf_base_status <= 0) { + u64 base = val & MMCONF_MASK; + + if (!fam10h_pci_mmconf_base) { fam10h_pci_mmconf_base = base; - fam10h_pci_mmconf_base_status = 1; return; } else if (fam10h_pci_mmconf_base == base) return; @@ -206,8 +200,10 @@ void __cpuinit fam10h_check_enable_mmcfg(void) * with 256 buses */ get_fam10h_pci_mmconf_base(); - if (fam10h_pci_mmconf_base_status <= 0) + if 
(!fam10h_pci_mmconf_base) { + pci_probe &= ~PCI_CHECK_ENABLE_AMD_MMCONF; return; + } printk(KERN_INFO "Enable MMCONFIG on AMD Family 10h\n"); val &= ~((FAM10H_MMIO_CONF_BASE_MASK<<FAM10H_MMIO_CONF_BASE_SHIFT) | @@ -217,13 +213,13 @@ void __cpuinit fam10h_check_enable_mmcfg(void) wrmsrl(address, val); } -static int __devinit set_check_enable_amd_mmconf(const struct dmi_system_id *d) +static int __init set_check_enable_amd_mmconf(const struct dmi_system_id *d) { pci_probe |= PCI_CHECK_ENABLE_AMD_MMCONF; return 0; } -static const struct dmi_system_id __cpuinitconst mmconf_dmi_table[] = { +static const struct dmi_system_id __initconst mmconf_dmi_table[] = { { .callback = set_check_enable_amd_mmconf, .ident = "Sun Microsystems Machine", @@ -234,7 +230,8 @@ static const struct dmi_system_id __cpuinitconst mmconf_dmi_table[] = { {} }; -void __cpuinit check_enable_amd_mmconf_dmi(void) +/* Called from a __cpuinit function, but only on the BSP. */ +void __ref check_enable_amd_mmconf_dmi(void) { dmi_check_system(mmconf_dmi_table); } diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index 7bf2dc4c8f70..12fcbe2c143e 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c @@ -30,7 +30,6 @@ #include <linux/init.h> #include <linux/poll.h> #include <linux/smp.h> -#include <linux/smp_lock.h> #include <linux/major.h> #include <linux/fs.h> #include <linux/device.h> diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index ba0f0ca9f280..c01ffa5b9b87 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -143,7 +143,7 @@ static void flush_gart(void) spin_lock_irqsave(&iommu_bitmap_lock, flags); if (need_flush) { - k8_flush_garts(); + amd_flush_garts(); need_flush = false; } spin_unlock_irqrestore(&iommu_bitmap_lock, flags); @@ -561,17 +561,17 @@ static void enable_gart_translations(void) { int i; - if (!k8_northbridges.gart_supported) + if (!amd_nb_has_feature(AMD_NB_GART)) return; - for (i = 0; i < k8_northbridges.num; i++) { - struct pci_dev *dev = k8_northbridges.nb_misc[i]; + for (i = 0; i < amd_nb_num(); i++) { + struct pci_dev *dev = node_to_amd_nb(i)->misc; enable_gart_translation(dev, __pa(agp_gatt_table)); } /* Flush the GART-TLB to remove stale entries */ - k8_flush_garts(); + amd_flush_garts(); } /* @@ -596,13 +596,13 @@ static void gart_fixup_northbridges(struct sys_device *dev) if (!fix_up_north_bridges) return; - if (!k8_northbridges.gart_supported) + if (!amd_nb_has_feature(AMD_NB_GART)) return; pr_info("PCI-DMA: Restoring GART aperture settings\n"); - for (i = 0; i < k8_northbridges.num; i++) { - struct pci_dev *dev = k8_northbridges.nb_misc[i]; + for (i = 0; i < amd_nb_num(); i++) { + struct pci_dev *dev = node_to_amd_nb(i)->misc; /* * Don't enable translations just yet. That is the next @@ -644,7 +644,7 @@ static struct sys_device device_gart = { * Private Northbridge GATT initialization in case we cannot use the * AGP driver for some reason. 
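The pci-gart_64.c hunks above switch from the old k8_northbridges globals to the amd_nb accessors this series introduces. A minimal sketch of the new iteration idiom, using only the helpers visible in the hunks (amd_nb_has_feature(), amd_nb_num(), node_to_amd_nb(), amd_flush_garts()); the loop body is illustrative, not the driver code:

    #include <asm/amd_nb.h>

    static void walk_gart_northbridges(void)
    {
            int i;

            if (!amd_nb_has_feature(AMD_NB_GART))
                    return;                 /* no GART on this system */

            for (i = 0; i < amd_nb_num(); i++) {
                    struct pci_dev *misc = node_to_amd_nb(i)->misc;

                    /* program the per-node GART registers through 'misc' */
                    (void)misc;
            }
            amd_flush_garts();              /* drop stale GART TLB entries */
    }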
*/ -static __init int init_k8_gatt(struct agp_kern_info *info) +static __init int init_amd_gatt(struct agp_kern_info *info) { unsigned aper_size, gatt_size, new_aper_size; unsigned aper_base, new_aper_base; @@ -656,8 +656,8 @@ static __init int init_k8_gatt(struct agp_kern_info *info) aper_size = aper_base = info->aper_size = 0; dev = NULL; - for (i = 0; i < k8_northbridges.num; i++) { - dev = k8_northbridges.nb_misc[i]; + for (i = 0; i < amd_nb_num(); i++) { + dev = node_to_amd_nb(i)->misc; new_aper_base = read_aperture(dev, &new_aper_size); if (!new_aper_base) goto nommu; @@ -725,13 +725,13 @@ static void gart_iommu_shutdown(void) if (!no_agp) return; - if (!k8_northbridges.gart_supported) + if (!amd_nb_has_feature(AMD_NB_GART)) return; - for (i = 0; i < k8_northbridges.num; i++) { + for (i = 0; i < amd_nb_num(); i++) { u32 ctl; - dev = k8_northbridges.nb_misc[i]; + dev = node_to_amd_nb(i)->misc; pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &ctl); ctl &= ~GARTEN; @@ -749,14 +749,14 @@ int __init gart_iommu_init(void) unsigned long scratch; long i; - if (!k8_northbridges.gart_supported) + if (!amd_nb_has_feature(AMD_NB_GART)) return 0; #ifndef CONFIG_AGP_AMD64 no_agp = 1; #else /* Makefile puts PCI initialization via subsys_initcall first. */ - /* Add other K8 AGP bridge drivers here */ + /* Add other AMD AGP bridge drivers here */ no_agp = no_agp || (agp_amd64_init() < 0) || (agp_copy_info(agp_bridge, &info) < 0); @@ -765,7 +765,7 @@ int __init gart_iommu_init(void) if (no_iommu || (!force_iommu && max_pfn <= MAX_DMA32_PFN) || !gart_iommu_aperture || - (no_agp && init_k8_gatt(&info) < 0)) { + (no_agp && init_amd_gatt(&info) < 0)) { if (max_pfn > MAX_DMA32_PFN) { pr_warning("More than 4GB of memory but GART IOMMU not available.\n"); pr_warning("falling back to iommu=soft.\n"); diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 57d1868a86aa..c852041bfc3d 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -91,8 +91,7 @@ void exit_thread(void) void show_regs(struct pt_regs *regs) { show_registers(regs); - show_trace(NULL, regs, (unsigned long *)kernel_stack_pointer(regs), - regs->bp); + show_trace(NULL, regs, (unsigned long *)kernel_stack_pointer(regs)); } void show_regs_common(void) @@ -374,6 +373,7 @@ void default_idle(void) { if (hlt_use_halt()) { trace_power_start(POWER_CSTATE, 1, smp_processor_id()); + trace_cpu_idle(1, smp_processor_id()); current_thread_info()->status &= ~TS_POLLING; /* * TS_POLLING-cleared state must be visible before we @@ -444,6 +444,7 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait); void mwait_idle_with_hints(unsigned long ax, unsigned long cx) { trace_power_start(POWER_CSTATE, (ax>>4)+1, smp_processor_id()); + trace_cpu_idle((ax>>4)+1, smp_processor_id()); if (!need_resched()) { if (cpu_has(¤t_cpu_data, X86_FEATURE_CLFLUSH_MONITOR)) clflush((void *)¤t_thread_info()->flags); @@ -460,6 +461,7 @@ static void mwait_idle(void) { if (!need_resched()) { trace_power_start(POWER_CSTATE, 1, smp_processor_id()); + trace_cpu_idle(1, smp_processor_id()); if (cpu_has(¤t_cpu_data, X86_FEATURE_CLFLUSH_MONITOR)) clflush((void *)¤t_thread_info()->flags); @@ -481,10 +483,12 @@ static void mwait_idle(void) static void poll_idle(void) { trace_power_start(POWER_CSTATE, 0, smp_processor_id()); + trace_cpu_idle(0, smp_processor_id()); local_irq_enable(); while (!need_resched()) cpu_relax(); - trace_power_end(0); + trace_power_end(smp_processor_id()); + trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id()); } /* diff --git a/arch/x86/kernel/process_32.c 
b/arch/x86/kernel/process_32.c index 96586c3cbbbf..4b9befa0e347 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -113,8 +113,8 @@ void cpu_idle(void) stop_critical_timings(); pm_idle(); start_critical_timings(); - trace_power_end(smp_processor_id()); + trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id()); } tick_nohz_restart_sched_tick(); preempt_enable_no_resched(); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index b3d7a3a04f38..4c818a738396 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -142,6 +142,8 @@ void cpu_idle(void) start_critical_timings(); trace_power_end(smp_processor_id()); + trace_cpu_idle(PWR_EVENT_EXIT, + smp_processor_id()); /* In many cases the interrupt that ended idle has already called exit_idle. But some idle diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 70c4872cd8aa..45892dc4b72a 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -801,7 +801,8 @@ void ptrace_disable(struct task_struct *child) static const struct user_regset_view user_x86_32_view; /* Initialized below. */ #endif -long arch_ptrace(struct task_struct *child, long request, long addr, long data) +long arch_ptrace(struct task_struct *child, long request, + unsigned long addr, unsigned long data) { int ret; unsigned long __user *datap = (unsigned long __user *)data; @@ -812,8 +813,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data) unsigned long tmp; ret = -EIO; - if ((addr & (sizeof(data) - 1)) || addr < 0 || - addr >= sizeof(struct user)) + if ((addr & (sizeof(data) - 1)) || addr >= sizeof(struct user)) break; tmp = 0; /* Default return condition */ @@ -830,8 +830,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data) case PTRACE_POKEUSR: /* write the word at location addr in the USER area */ ret = -EIO; - if ((addr & (sizeof(data) - 1)) || addr < 0 || - addr >= sizeof(struct user)) + if ((addr & (sizeof(data) - 1)) || addr >= sizeof(struct user)) break; if (addr < sizeof(struct user_regs_struct)) @@ -888,17 +887,17 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data) #if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION case PTRACE_GET_THREAD_AREA: - if (addr < 0) + if ((int) addr < 0) return -EIO; ret = do_get_thread_area(child, addr, - (struct user_desc __user *) data); + (struct user_desc __user *)data); break; case PTRACE_SET_THREAD_AREA: - if (addr < 0) + if ((int) addr < 0) return -EIO; ret = do_set_thread_area(child, addr, - (struct user_desc __user *) data, 0); + (struct user_desc __user *)data, 0); break; #endif diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c index bab3b9e6f66d..42eb3300dfc6 100644 --- a/arch/x86/kernel/pvclock.c +++ b/arch/x86/kernel/pvclock.c @@ -41,44 +41,6 @@ void pvclock_set_flags(u8 flags) valid_flags = flags; } -/* - * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction, - * yielding a 64-bit result. 
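The comment above belongs to scale_delta(), deleted in the lines that follow; the helper computes ((delta << shift) * mul_frac) >> 32 with a wide intermediate product, and in this series it appears to move to a shared pvclock header rather than go away (an assumption worth checking against the full series). On a compiler with 128-bit integers, the x86-64 asm variant is equivalent to this sketch:

    #include <stdint.h>

    /* Same math as the mul/shrd asm: full product, keep bits [95:32]. */
    static inline uint64_t scale_delta_sketch(uint64_t delta, uint32_t mul_frac,
                                              int shift)
    {
            if (shift < 0)
                    delta >>= -shift;
            else
                    delta <<= shift;

            return (uint64_t)(((unsigned __int128)delta * mul_frac) >> 32);
    }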
- */ -static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift) -{ - u64 product; -#ifdef __i386__ - u32 tmp1, tmp2; -#endif - - if (shift < 0) - delta >>= -shift; - else - delta <<= shift; - -#ifdef __i386__ - __asm__ ( - "mul %5 ; " - "mov %4,%%eax ; " - "mov %%edx,%4 ; " - "mul %5 ; " - "xor %5,%5 ; " - "add %4,%%eax ; " - "adc %5,%%edx ; " - : "=A" (product), "=r" (tmp1), "=r" (tmp2) - : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) ); -#elif defined(__x86_64__) - __asm__ ( - "mul %%rdx ; shrd $32,%%rdx,%%rax" - : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) ); -#else -#error implement me! -#endif - - return product; -} - static u64 pvclock_get_nsec_offset(struct pvclock_shadow_time *shadow) { u64 delta = native_read_tsc() - shadow->tsc_timestamp; @@ -121,6 +83,11 @@ unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src) static atomic64_t last_value = ATOMIC64_INIT(0); +void pvclock_resume(void) +{ + atomic64_set(&last_value, 0); +} + cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src) { struct pvclock_shadow_time shadow; diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index f7f53dcd3e0a..c495aa8d4815 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -635,7 +635,7 @@ void native_machine_shutdown(void) /* O.K Now that I'm on the appropriate processor, * stop all of the others. */ - smp_send_stop(); + stop_other_cpus(); #endif lapic_shutdown(); diff --git a/arch/x86/kernel/resource.c b/arch/x86/kernel/resource.c new file mode 100644 index 000000000000..2a26819bb6a8 --- /dev/null +++ b/arch/x86/kernel/resource.c @@ -0,0 +1,48 @@ +#include <linux/ioport.h> +#include <asm/e820.h> + +static void resource_clip(struct resource *res, resource_size_t start, + resource_size_t end) +{ + resource_size_t low = 0, high = 0; + + if (res->end < start || res->start > end) + return; /* no conflict */ + + if (res->start < start) + low = start - res->start; + + if (res->end > end) + high = res->end - end; + + /* Keep the area above or below the conflict, whichever is larger */ + if (low > high) + res->end = start - 1; + else + res->start = end + 1; +} + +static void remove_e820_regions(struct resource *avail) +{ + int i; + struct e820entry *entry; + + for (i = 0; i < e820.nr_map; i++) { + entry = &e820.map[i]; + + resource_clip(avail, entry->addr, + entry->addr + entry->size - 1); + } +} + +void arch_remove_reservations(struct resource *avail) +{ + /* Trim out BIOS areas (low 1MB and high 2MB) and E820 regions */ + if (avail->flags & IORESOURCE_MEM) { + if (avail->start < BIOS_END) + avail->start = BIOS_END; + resource_clip(avail, BIOS_ROM_BASE, BIOS_ROM_END); + + remove_e820_regions(avail); + } +} diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 95a32746fbf9..d3cfe26c0252 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -501,7 +501,18 @@ static inline unsigned long long get_total_mem(void) return total << PAGE_SHIFT; } -#define DEFAULT_BZIMAGE_ADDR_MAX 0x37FFFFFF +/* + * Keep the crash kernel below this limit. On 32 bits earlier kernels + * would limit the kernel to the low 512 MiB due to mapping restrictions. + * On 64 bits, kexec-tools currently limits us to 896 MiB; increase this + * limit once kexec-tools are fixed. 
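resource_clip() in the new arch/x86/kernel/resource.c above trims an available window against one conflicting range, keeping whichever fragment is larger. A self-contained userspace rendering of the same logic, with hypothetical numbers:

    #include <stdio.h>
    #include <stdint.h>

    struct res { uint64_t start, end; };

    static void clip(struct res *r, uint64_t start, uint64_t end)
    {
            uint64_t low = 0, high = 0;

            if (r->end < start || r->start > end)
                    return;                 /* no conflict */
            if (r->start < start)
                    low = start - r->start; /* fragment below the conflict */
            if (r->end > end)
                    high = r->end - end;    /* fragment above the conflict */
            if (low > high)
                    r->end = start - 1;     /* keep the lower fragment */
            else
                    r->start = end + 1;     /* keep the upper fragment */
    }

    int main(void)
    {
            struct res r = { 0xa0000, 0xfffff };

            /* cut a 32 KiB conflict out of the middle: the 224 KiB fragment
             * above it wins over the 128 KiB fragment below it */
            clip(&r, 0xc0000, 0xc7fff);
            printf("%#llx-%#llx\n", (unsigned long long)r.start,
                   (unsigned long long)r.end);      /* 0xc8000-0xfffff */
            return 0;
    }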
+ */ +#ifdef CONFIG_X86_32 +# define CRASH_KERNEL_ADDR_MAX (512 << 20) +#else +# define CRASH_KERNEL_ADDR_MAX (896 << 20) +#endif + static void __init reserve_crashkernel(void) { unsigned long long total_mem; @@ -520,10 +531,10 @@ static void __init reserve_crashkernel(void) const unsigned long long alignment = 16<<20; /* 16M */ /* - * kexec want bzImage is below DEFAULT_BZIMAGE_ADDR_MAX + * kexec wants the bzImage below CRASH_KERNEL_ADDR_MAX */ crash_base = memblock_find_in_range(alignment, - DEFAULT_BZIMAGE_ADDR_MAX, crash_size, alignment); + CRASH_KERNEL_ADDR_MAX, crash_size, alignment); if (crash_base == MEMBLOCK_ERROR) { pr_info("crashkernel reservation failed - No suitable area found.\n"); @@ -694,7 +705,7 @@ static u64 __init get_max_mapped(void) void __init setup_arch(char **cmdline_p) { int acpi = 0; - int k8 = 0; + int amd = 0; unsigned long flags; #ifdef CONFIG_X86_32 @@ -769,6 +780,7 @@ void __init setup_arch(char **cmdline_p) x86_init.oem.arch_setup(); + iomem_resource.end = (1ULL << boot_cpu_data.x86_phys_bits) - 1; setup_memory_map(); parse_setup_data(); /* update the e820_saved too */ @@ -979,12 +991,12 @@ void __init setup_arch(char **cmdline_p) acpi = acpi_numa_init(); #endif -#ifdef CONFIG_K8_NUMA +#ifdef CONFIG_AMD_NUMA if (!acpi) - k8 = !k8_numa_init(0, max_pfn); + amd = !amd_numa_init(0, max_pfn); #endif - initmem_init(0, max_pfn, acpi, k8); + initmem_init(0, max_pfn, acpi, amd); memblock_find_dma_reserve(); dma32_reserve_bootmem(); @@ -1033,10 +1045,7 @@ void __init setup_arch(char **cmdline_p) #endif init_apic_mappings(); - ioapic_init_mappings(); - - /* need to wait for io_apic is mapped */ - probe_nr_irqs_gsi(); + ioapic_and_gsi_init(); kvm_guest_init(); diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index d801210945d6..513deac7228d 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -159,10 +159,10 @@ asmlinkage void smp_reboot_interrupt(void) irq_exit(); } -static void native_smp_send_stop(void) +static void native_stop_other_cpus(int wait) { unsigned long flags; - unsigned long wait; + unsigned long timeout; if (reboot_force) return; @@ -179,9 +179,12 @@ static void native_smp_send_stop(void) if (num_online_cpus() > 1) { apic->send_IPI_allbutself(REBOOT_VECTOR); - /* Don't wait longer than a second */ - wait = USEC_PER_SEC; - while (num_online_cpus() > 1 && wait--) + /* + * Don't wait longer than a second if the caller + * didn't ask us to wait. + */ + timeout = USEC_PER_SEC; + while (num_online_cpus() > 1 && (wait || timeout--)) udelay(1); } @@ -227,7 +230,7 @@ struct smp_ops smp_ops = { .smp_prepare_cpus = native_smp_prepare_cpus, .smp_cpus_done = native_smp_cpus_done, - .smp_send_stop = native_smp_send_stop, + .stop_other_cpus = native_stop_other_cpus, .smp_send_reschedule = native_smp_send_reschedule, .cpu_up = native_cpu_up, diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 6af118511b4a..68f61ac632e1 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -281,6 +281,13 @@ static void __cpuinit smp_callin(void) */ smp_store_cpu_info(cpuid); + /* + * This must be done before setting cpu_online_mask + * or calling notify_cpu_starting. 
+ */ + set_cpu_sibling_map(raw_smp_processor_id()); + wmb(); + notify_cpu_starting(cpuid); /* @@ -316,16 +323,6 @@ notrace static void __cpuinit start_secondary(void *unused) */ check_tsc_sync_target(); - if (nmi_watchdog == NMI_IO_APIC) { - legacy_pic->mask(0); - enable_NMI_through_LVT0(); - legacy_pic->unmask(0); - } - - /* This must be done before setting cpu_online_mask */ - set_cpu_sibling_map(raw_smp_processor_id()); - wmb(); - /* * We need to hold call_lock, so there is no inconsistency * between the time smp_call_function() determines number of @@ -747,7 +744,7 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu) .done = COMPLETION_INITIALIZER_ONSTACK(c_idle.done), }; - INIT_WORK_ON_STACK(&c_idle.work, do_fork_idle); + INIT_WORK_ONSTACK(&c_idle.work, do_fork_idle); alternatives_smp_switch(1); @@ -1061,8 +1058,6 @@ static int __init smp_sanity_check(unsigned max_cpus) printk(KERN_INFO "SMP mode deactivated.\n"); smpboot_clear_io_apic(); - localise_nmi_watchdog(); - connect_bsp_APIC(); setup_local_APIC(); end_local_APIC_setup(); @@ -1196,7 +1191,6 @@ void __init native_smp_cpus_done(unsigned int max_cpus) #ifdef CONFIG_X86_IO_APIC setup_ioapic_dest(); #endif - check_nmi_watchdog(); mtrr_aps_init(); } @@ -1341,8 +1335,6 @@ int native_cpu_disable(void) if (cpu == 0) return -EBUSY; - if (nmi_watchdog == NMI_LOCAL_APIC) - stop_apic_nmi_watchdog(NULL); clear_local_APIC(); cpu_disable_common(); @@ -1373,7 +1365,6 @@ void play_dead_common(void) { idle_task_exit(); reset_lazy_tlbstate(); - irq_ctx_exit(raw_smp_processor_id()); c1e_remove_cpu(raw_smp_processor_id()); mb(); diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index b53c525368a7..938c8e10a19a 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -73,22 +73,22 @@ static const struct stacktrace_ops save_stack_ops_nosched = { */ void save_stack_trace(struct stack_trace *trace) { - dump_trace(current, NULL, NULL, 0, &save_stack_ops, trace); + dump_trace(current, NULL, NULL, &save_stack_ops, trace); if (trace->nr_entries < trace->max_entries) trace->entries[trace->nr_entries++] = ULONG_MAX; } EXPORT_SYMBOL_GPL(save_stack_trace); -void save_stack_trace_bp(struct stack_trace *trace, unsigned long bp) +void save_stack_trace_regs(struct stack_trace *trace, struct pt_regs *regs) { - dump_trace(current, NULL, NULL, bp, &save_stack_ops, trace); + dump_trace(current, regs, NULL, &save_stack_ops, trace); if (trace->nr_entries < trace->max_entries) trace->entries[trace->nr_entries++] = ULONG_MAX; } void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) { - dump_trace(tsk, NULL, NULL, 0, &save_stack_ops_nosched, trace); + dump_trace(tsk, NULL, NULL, &save_stack_ops_nosched, trace); if (trace->nr_entries < trace->max_entries) trace->entries[trace->nr_entries++] = ULONG_MAX; } diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c index fb5cc5e14cfa..25a28a245937 100644 --- a/arch/x86/kernel/time.c +++ b/arch/x86/kernel/time.c @@ -22,10 +22,6 @@ #include <asm/hpet.h> #include <asm/time.h> -#if defined(CONFIG_X86_32) && defined(CONFIG_X86_IO_APIC) -int timer_ack; -#endif - #ifdef CONFIG_X86_64 volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES; #endif @@ -63,20 +59,6 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id) /* Keep nmi watchdog up to date */ inc_irq_stat(irq0_irqs); - /* Optimized out for !IO_APIC and x86_64 */ - if (timer_ack) { - /* - * Subtle, when I/O APICs are used we have to ack timer IRQ - * manually to deassert NMI lines 
for the watchdog if run - * on an 82489DX-based system. - */ - raw_spin_lock(&i8259A_lock); - outb(0x0c, PIC_MASTER_OCW3); - /* Ack the IRQ; AEOI will end it automatically. */ - inb(PIC_MASTER_POLL); - raw_spin_unlock(&i8259A_lock); - } - global_clock_event->event_handler(global_clock_event); /* MCA bus quirk: Acknowledge irq0 by setting bit 7 in port 0x61 */ diff --git a/arch/x86/kernel/trampoline_64.S b/arch/x86/kernel/trampoline_64.S index 3af2dff58b21..075d130efcf9 100644 --- a/arch/x86/kernel/trampoline_64.S +++ b/arch/x86/kernel/trampoline_64.S @@ -127,7 +127,7 @@ startup_64: no_longmode: hlt jmp no_longmode -#include "verify_cpu_64.S" +#include "verify_cpu.S" # Careful these need to be in the same 64K segment as the above; tidt: diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index cb838ca42c96..c76aaca5694d 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -83,6 +83,8 @@ EXPORT_SYMBOL_GPL(used_vectors); static int ignore_nmis; +int unknown_nmi_panic; + static inline void conditional_sti(struct pt_regs *regs) { if (regs->flags & X86_EFLAGS_IF) @@ -300,6 +302,13 @@ gp_in_kernel: die("general protection fault", regs, error_code); } +static int __init setup_unknown_nmi_panic(char *str) +{ + unknown_nmi_panic = 1; + return 1; +} +__setup("unknown_nmi_panic", setup_unknown_nmi_panic); + static notrace __kprobes void mem_parity_error(unsigned char reason, struct pt_regs *regs) { @@ -342,9 +351,11 @@ io_check_error(unsigned char reason, struct pt_regs *regs) reason = (reason & 0xf) | 8; outb(reason, 0x61); - i = 2000; - while (--i) - udelay(1000); + i = 20000; + while (--i) { + touch_nmi_watchdog(); + udelay(100); + } reason &= ~8; outb(reason, 0x61); @@ -371,7 +382,7 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs) reason, smp_processor_id()); printk(KERN_EMERG "Do you have a strange power saving mode enabled?\n"); - if (panic_on_unrecovered_nmi) + if (unknown_nmi_panic || panic_on_unrecovered_nmi) panic("NMI: Not continuing"); printk(KERN_EMERG "Dazed and confused, but trying to continue\n"); @@ -397,20 +408,8 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs) if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP) return; - -#ifndef CONFIG_LOCKUP_DETECTOR - /* - * Ok, so this is none of the documented NMI sources, - * so it must be the NMI watchdog. - */ - if (nmi_watchdog_tick(regs, reason)) - return; - if (!do_nmi_callback(regs, cpu)) -#endif /* !CONFIG_LOCKUP_DETECTOR */ - unknown_nmi_error(reason, regs); -#else - unknown_nmi_error(reason, regs); #endif + unknown_nmi_error(reason, regs); return; } @@ -446,14 +445,12 @@ do_nmi(struct pt_regs *regs, long error_code) void stop_nmi(void) { - acpi_nmi_disable(); ignore_nmis++; } void restart_nmi(void) { ignore_nmis--; - acpi_nmi_enable(); } /* May run on IST stack. */ diff --git a/arch/x86/kernel/verify_cpu_64.S b/arch/x86/kernel/verify_cpu.S index 56a8c2a867d9..0edefc19a113 100644 --- a/arch/x86/kernel/verify_cpu_64.S +++ b/arch/x86/kernel/verify_cpu.S @@ -7,6 +7,7 @@ * Copyright (c) 2007 Andi Kleen (ak@suse.de) * Copyright (c) 2007 Eric Biederman (ebiederm@xmission.com) * Copyright (c) 2007 Vivek Goyal (vgoyal@in.ibm.com) + * Copyright (c) 2010 Kees Cook (kees.cook@canonical.com) * * This source code is licensed under the GNU General Public License, * Version 2. See the file COPYING for more details. @@ -14,18 +15,17 @@ * This is a common code for verification whether CPU supports * long mode and SSE or not. 
It is not called directly instead this * file is included at various places and compiled in that context. - * Following are the current usage. + * This file is expected to run in 32bit code. Currently: * - * This file is included by both 16bit and 32bit code. + * arch/x86/boot/compressed/head_64.S: Boot cpu verification + * arch/x86/kernel/trampoline_64.S: secondary processor verfication + * arch/x86/kernel/head_32.S: processor startup * - * arch/x86_64/boot/setup.S : Boot cpu verification (16bit) - * arch/x86_64/boot/compressed/head.S: Boot cpu verification (32bit) - * arch/x86_64/kernel/trampoline.S: secondary processor verfication (16bit) - * arch/x86_64/kernel/acpi/wakeup.S:Verfication at resume (16bit) - * - * verify_cpu, returns the status of cpu check in register %eax. + * verify_cpu, returns the status of longmode and SSE in register %eax. * 0: Success 1: Failure * + * On Intel, the XD_DISABLE flag will be cleared as a side-effect. + * * The caller needs to check for the error code and take the action * appropriately. Either display a message or halt. */ @@ -62,8 +62,41 @@ verify_cpu: cmpl $0x444d4163,%ecx jnz verify_cpu_noamd mov $1,%di # cpu is from AMD + jmp verify_cpu_check verify_cpu_noamd: + cmpl $0x756e6547,%ebx # GenuineIntel? + jnz verify_cpu_check + cmpl $0x49656e69,%edx + jnz verify_cpu_check + cmpl $0x6c65746e,%ecx + jnz verify_cpu_check + + # only call IA32_MISC_ENABLE when: + # family > 6 || (family == 6 && model >= 0xd) + movl $0x1, %eax # check CPU family and model + cpuid + movl %eax, %ecx + + andl $0x0ff00f00, %eax # mask family and extended family + shrl $8, %eax + cmpl $6, %eax + ja verify_cpu_clear_xd # family > 6, ok + jb verify_cpu_check # family < 6, skip + + andl $0x000f00f0, %ecx # mask model and extended model + shrl $4, %ecx + cmpl $0xd, %ecx + jb verify_cpu_check # family == 6, model < 0xd, skip + +verify_cpu_clear_xd: + movl $MSR_IA32_MISC_ENABLE, %ecx + rdmsr + btrl $2, %edx # clear MSR_IA32_MISC_ENABLE_XD_DISABLE + jnc verify_cpu_check # only write MSR if bit was changed + wrmsr + +verify_cpu_check: movl $0x1,%eax # Does the cpu have what it takes cpuid andl $REQUIRED_MASK0,%edx diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index cd6da6bf3eca..ceb2911aa439 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -6,10 +6,12 @@ #include <linux/init.h> #include <linux/ioport.h> #include <linux/module.h> +#include <linux/pci.h> #include <asm/bios_ebda.h> #include <asm/paravirt.h> #include <asm/pci_x86.h> +#include <asm/pci.h> #include <asm/mpspec.h> #include <asm/setup.h> #include <asm/apic.h> @@ -99,3 +101,8 @@ struct x86_platform_ops x86_platform = { }; EXPORT_SYMBOL_GPL(x86_platform); +struct x86_msi_ops x86_msi = { + .setup_msi_irqs = native_setup_msi_irqs, + .teardown_msi_irq = native_teardown_msi_irq, + .teardown_msi_irqs = default_teardown_msi_irqs, +}; diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index 9c253bd65e24..547128546cc3 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -394,7 +394,8 @@ static void __init setup_xstate_init(void) * Setup init_xstate_buf to represent the init state of * all the features managed by the xsave */ - init_xstate_buf = alloc_bootmem(xstate_size); + init_xstate_buf = alloc_bootmem_align(xstate_size, + __alignof__(struct xsave_struct)); init_xstate_buf->i387.mxcsr = MXCSR_DEFAULT; clts(); diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index f628234fbeca..3cece05e4ac4 100644 --- a/arch/x86/kvm/i8259.c +++ 
b/arch/x86/kvm/i8259.c @@ -575,6 +575,8 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm) s->pics[1].elcr_mask = 0xde; s->pics[0].pics_state = s; s->pics[1].pics_state = s; + s->pics[0].isr_ack = 0xff; + s->pics[1].isr_ack = 0xff; /* * Initialize PIO device diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 908ea5464a51..fbb04aee8301 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -720,7 +720,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) } } -static void set_spte_track_bits(u64 *sptep, u64 new_spte) +static int set_spte_track_bits(u64 *sptep, u64 new_spte) { pfn_t pfn; u64 old_spte = *sptep; @@ -731,19 +731,20 @@ static void set_spte_track_bits(u64 *sptep, u64 new_spte) old_spte = __xchg_spte(sptep, new_spte); if (!is_rmap_spte(old_spte)) - return; + return 0; pfn = spte_to_pfn(old_spte); if (!shadow_accessed_mask || old_spte & shadow_accessed_mask) kvm_set_pfn_accessed(pfn); if (!shadow_dirty_mask || (old_spte & shadow_dirty_mask)) kvm_set_pfn_dirty(pfn); + return 1; } static void drop_spte(struct kvm *kvm, u64 *sptep, u64 new_spte) { - set_spte_track_bits(sptep, new_spte); - rmap_remove(kvm, sptep); + if (set_spte_track_bits(sptep, new_spte)) + rmap_remove(kvm, sptep); } static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte) @@ -2393,7 +2394,8 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) ASSERT(!VALID_PAGE(root)); spin_lock(&vcpu->kvm->mmu_lock); kvm_mmu_free_some_pages(vcpu); - sp = kvm_mmu_get_page(vcpu, i << 30, i << 30, + sp = kvm_mmu_get_page(vcpu, i << (30 - PAGE_SHIFT), + i << 30, PT32_ROOT_LEVEL, 1, ACC_ALL, NULL); root = __pa(sp->spt); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 82e144a4e514..b81a9b7c2ca4 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -3395,6 +3395,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip; load_host_msrs(vcpu); + kvm_load_ldt(ldt_selector); loadsegment(fs, fs_selector); #ifdef CONFIG_X86_64 load_gs_index(gs_selector); @@ -3402,7 +3403,6 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) #else loadsegment(gs, gs_selector); #endif - kvm_load_ldt(ldt_selector); reload_tss(vcpu); @@ -3494,6 +3494,10 @@ static void svm_cpuid_update(struct kvm_vcpu *vcpu) static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) { switch (func) { + case 0x00000001: + /* Mask out xsave bit as long as it is not supported by SVM */ + entry->ecx &= ~(bit(X86_FEATURE_XSAVE)); + break; case 0x80000001: if (nested) entry->ecx |= (1 << 2); /* Set SVM bit */ diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 8da0e45ff7c9..81fcbe9515c5 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -821,10 +821,9 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu) #endif #ifdef CONFIG_X86_64 - if (is_long_mode(&vmx->vcpu)) { - rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base); + rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base); + if (is_long_mode(&vmx->vcpu)) wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); - } #endif for (i = 0; i < vmx->save_nmsrs; ++i) kvm_set_shared_msr(vmx->guest_msrs[i].index, @@ -839,23 +838,23 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx) ++vmx->vcpu.stat.host_state_reload; vmx->host_state.loaded = 0; - if (vmx->host_state.fs_reload_needed) - loadsegment(fs, vmx->host_state.fs_sel); +#ifdef CONFIG_X86_64 + if (is_long_mode(&vmx->vcpu)) + rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); +#endif if (vmx->host_state.gs_ldt_reload_needed) { 
kvm_load_ldt(vmx->host_state.ldt_sel); #ifdef CONFIG_X86_64 load_gs_index(vmx->host_state.gs_sel); - wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gs); #else loadsegment(gs, vmx->host_state.gs_sel); #endif } + if (vmx->host_state.fs_reload_needed) + loadsegment(fs, vmx->host_state.fs_sel); reload_tss(); #ifdef CONFIG_X86_64 - if (is_long_mode(&vmx->vcpu)) { - rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); - wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base); - } + wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base); #endif if (current_thread_info()->status & TS_USEDFPU) clts(); @@ -4228,11 +4227,6 @@ static int vmx_get_lpage_level(void) return PT_PDPE_LEVEL; } -static inline u32 bit(int bitno) -{ - return 1 << (bitno & 31); -} - static void vmx_cpuid_update(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2288ad829b32..b989e1f1e5d3 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -155,11 +155,6 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { u64 __read_mostly host_xcr0; -static inline u32 bit(int bitno) -{ - return 1 << (bitno & 31); -} - static void kvm_on_user_return(struct user_return_notifier *urn) { unsigned slot; @@ -2560,6 +2555,7 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, !kvm_exception_is_soft(vcpu->arch.exception.nr); events->exception.nr = vcpu->arch.exception.nr; events->exception.has_error_code = vcpu->arch.exception.has_error_code; + events->exception.pad = 0; events->exception.error_code = vcpu->arch.exception.error_code; events->interrupt.injected = @@ -2573,12 +2569,14 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, events->nmi.injected = vcpu->arch.nmi_injected; events->nmi.pending = vcpu->arch.nmi_pending; events->nmi.masked = kvm_x86_ops->get_nmi_mask(vcpu); + events->nmi.pad = 0; events->sipi_vector = vcpu->arch.sipi_vector; events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING | KVM_VCPUEVENT_VALID_SIPI_VECTOR | KVM_VCPUEVENT_VALID_SHADOW); + memset(&events->reserved, 0, sizeof(events->reserved)); } static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, @@ -2623,6 +2621,7 @@ static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu, dbgregs->dr6 = vcpu->arch.dr6; dbgregs->dr7 = vcpu->arch.dr7; dbgregs->flags = 0; + memset(&dbgregs->reserved, 0, sizeof(dbgregs->reserved)); } static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu, @@ -3106,6 +3105,7 @@ static int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps) sizeof(ps->channels)); ps->flags = kvm->arch.vpit->pit_state.flags; mutex_unlock(&kvm->arch.vpit->pit_state.lock); + memset(&ps->reserved, 0, sizeof(ps->reserved)); return r; } @@ -3169,10 +3169,6 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_memslots *slots, *old_slots; unsigned long *dirty_bitmap; - spin_lock(&kvm->mmu_lock); - kvm_mmu_slot_remove_write_access(kvm, log->slot); - spin_unlock(&kvm->mmu_lock); - r = -ENOMEM; dirty_bitmap = vmalloc(n); if (!dirty_bitmap) @@ -3194,6 +3190,10 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, dirty_bitmap = old_slots->memslots[log->slot].dirty_bitmap; kfree(old_slots); + spin_lock(&kvm->mmu_lock); + kvm_mmu_slot_remove_write_access(kvm, log->slot); + spin_unlock(&kvm->mmu_lock); + r = -EFAULT; if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n)) { vfree(dirty_bitmap); @@ -3486,6 +3486,7 @@ long kvm_arch_vm_ioctl(struct file *filp, user_ns.clock = kvm->arch.kvmclock_offset + now_ns; 
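The pad/reserved zeroing added in the kvm hunks around here (vcpu_events, debugregs, pit_state2, clock_data) closes small infoleaks: these structs live on the kernel stack, field stores leave padding and reserved members undefined, and copy_to_user() would ship the stale bytes to user space. The pattern, on a hypothetical struct:

    #include <string.h>

    struct example_ioctl_data {
            unsigned long long clock;
            unsigned int flags;
            unsigned int pad[9];    /* reserved; must read back as zero */
    };

    static void fill_example(struct example_ioctl_data *out,
                             unsigned long long now)
    {
            out->clock = now;
            out->flags = 0;
            /* anything not stored explicitly keeps old stack contents,
             * so zero the reserved tail before copying to user space */
            memset(&out->pad, 0, sizeof(out->pad));
    }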
local_irq_enable(); user_ns.flags = 0; + memset(&user_ns.pad, 0, sizeof(user_ns.pad)); r = -EFAULT; if (copy_to_user(argp, &user_ns, sizeof(user_ns))) @@ -3972,8 +3973,10 @@ int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu) return X86EMUL_CONTINUE; if (kvm_x86_ops->has_wbinvd_exit()) { + preempt_disable(); smp_call_function_many(vcpu->arch.wbinvd_dirty_mask, wbinvd_ipi, NULL, 1); + preempt_enable(); cpumask_clear(vcpu->arch.wbinvd_dirty_mask); } wbinvd(); @@ -4561,9 +4564,11 @@ static void kvm_timer_init(void) #ifdef CONFIG_CPU_FREQ struct cpufreq_policy policy; memset(&policy, 0, sizeof(policy)); - cpufreq_get_policy(&policy, get_cpu()); + cpu = get_cpu(); + cpufreq_get_policy(&policy, cpu); if (policy.cpuinfo.max_freq) max_tsc_khz = policy.cpuinfo.max_freq; + put_cpu(); #endif cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block, CPUFREQ_TRANSITION_NOTIFIER); @@ -5514,6 +5519,8 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4; kvm_x86_ops->set_cr4(vcpu, sregs->cr4); + if (sregs->cr4 & X86_CR4_OSXSAVE) + update_cpuid(vcpu); if (!is_long_mode(vcpu) && is_pae(vcpu)) { load_pdptrs(vcpu, vcpu->arch.walk_mmu, vcpu->arch.cr3); mmu_reset_needed = 1; diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 2cea414489f3..c600da830ce0 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -70,6 +70,11 @@ static inline int is_paging(struct kvm_vcpu *vcpu) return kvm_read_cr0_bits(vcpu, X86_CR0_PG); } +static inline u32 bit(int bitno) +{ + return 1 << (bitno & 31); +} + void kvm_before_handle_nmi(struct kvm_vcpu *vcpu); void kvm_after_handle_nmi(struct kvm_vcpu *vcpu); int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq); diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 73b1e1a1f489..4996cf5f73a0 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -531,7 +531,10 @@ static void lguest_write_cr3(unsigned long cr3) { lguest_data.pgdir = cr3; lazy_hcall1(LHCALL_NEW_PGTABLE, cr3); - cr3_changed = true; + + /* These two page tables are simple, linear, and used during boot */ + if (cr3 != __pa(swapper_pg_dir) && cr3 != __pa(initial_page_table)) + cr3_changed = true; } static unsigned long lguest_read_cr3(void) @@ -703,9 +706,9 @@ static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval) * to forget all of them. Fortunately, this is very rare. * * ... except in early boot when the kernel sets up the initial pagetables, - * which makes booting astonishingly slow: 1.83 seconds! So we don't even tell - * the Host anything changed until we've done the first page table switch, - * which brings boot back to 0.25 seconds. + * which makes booting astonishingly slow: 48 seconds! So we don't even tell + * the Host anything changed until we've done the first real page table switch, + * which brings boot back to 4.3 seconds. */ static void lguest_set_pte(pte_t *ptep, pte_t pteval) { @@ -1002,7 +1005,7 @@ static void lguest_time_init(void) clockevents_register_device(&lguest_clockevent); /* Finally, we unblock the timer interrupt. */ - enable_lguest_irq(0); + clear_bit(0, lguest_data.blocked_interrupts); } /* @@ -1349,9 +1352,6 @@ __init void lguest_init(void) */ switch_to_new_gdt(0); - /* We actually boot with all memory mapped, but let's say 128MB. 
*/ - max_pfn_mapped = (128*1024*1024) >> PAGE_SHIFT; - /* * The Host<->Guest Switcher lives at the top of our address space, and * the Host told us how big it is when we made LGUEST_INIT hypercall: diff --git a/arch/x86/lguest/i386_head.S b/arch/x86/lguest/i386_head.S index 4f420c2f2d55..e7d5382ef263 100644 --- a/arch/x86/lguest/i386_head.S +++ b/arch/x86/lguest/i386_head.S @@ -4,6 +4,7 @@ #include <asm/asm-offsets.h> #include <asm/thread_info.h> #include <asm/processor-flags.h> +#include <asm/pgtable.h> /*G:020 * Our story starts with the kernel booting into startup_32 in @@ -37,9 +38,113 @@ ENTRY(lguest_entry) /* Set up the initial stack so we can run C code. */ movl $(init_thread_union+THREAD_SIZE),%esp + call init_pagetables + /* Jumps are relative: we're running __PAGE_OFFSET too low. */ jmp lguest_init+__PAGE_OFFSET +/* + * Initialize page tables. This creates a PDE and a set of page + * tables, which are located immediately beyond __brk_base. The variable + * _brk_end is set up to point to the first "safe" location. + * Mappings are created both at virtual address 0 (identity mapping) + * and PAGE_OFFSET for up to _end. + * + * FIXME: This code is taken verbatim from arch/x86/kernel/head_32.S: they + * don't have a stack at this point, so we can't just use call and ret. + */ +init_pagetables: +#if PTRS_PER_PMD > 1 +#define PAGE_TABLE_SIZE(pages) (((pages) / PTRS_PER_PMD) + PTRS_PER_PGD) +#else +#define PAGE_TABLE_SIZE(pages) ((pages) / PTRS_PER_PGD) +#endif +#define pa(X) ((X) - __PAGE_OFFSET) + +/* Enough space to fit pagetables for the low memory linear map */ +MAPPING_BEYOND_END = \ + PAGE_TABLE_SIZE(((1<<32) - __PAGE_OFFSET) >> PAGE_SHIFT) << PAGE_SHIFT +#ifdef CONFIG_X86_PAE + + /* + * In PAE mode initial_page_table is statically defined to contain + * enough entries to cover the VMSPLIT option (that is the top 1, 2 or 3 + * entries). The identity mapping is handled by pointing two PGD entries + * to the first kernel PMD. + * + * Note the upper half of each PMD or PTE are always zero at this stage. + */ + +#define KPMDS (((-__PAGE_OFFSET) >> 30) & 3) /* Number of kernel PMDs */ + + xorl %ebx,%ebx /* %ebx is kept at zero */ + + movl $pa(__brk_base), %edi + movl $pa(initial_pg_pmd), %edx + movl $PTE_IDENT_ATTR, %eax +10: + leal PDE_IDENT_ATTR(%edi),%ecx /* Create PMD entry */ + movl %ecx,(%edx) /* Store PMD entry */ + /* Upper half already zero */ + addl $8,%edx + movl $512,%ecx +11: + stosl + xchgl %eax,%ebx + stosl + xchgl %eax,%ebx + addl $0x1000,%eax + loop 11b + + /* + * End condition: we must map up to the end + MAPPING_BEYOND_END. + */ + movl $pa(_end) + MAPPING_BEYOND_END + PTE_IDENT_ATTR, %ebp + cmpl %ebp,%eax + jb 10b +1: + addl $__PAGE_OFFSET, %edi + movl %edi, pa(_brk_end) + shrl $12, %eax + movl %eax, pa(max_pfn_mapped) + + /* Do early initialization of the fixmap area */ + movl $pa(initial_pg_fixmap)+PDE_IDENT_ATTR,%eax + movl %eax,pa(initial_pg_pmd+0x1000*KPMDS-8) +#else /* Not PAE */ + +page_pde_offset = (__PAGE_OFFSET >> 20); + + movl $pa(__brk_base), %edi + movl $pa(initial_page_table), %edx + movl $PTE_IDENT_ATTR, %eax +10: + leal PDE_IDENT_ATTR(%edi),%ecx /* Create PDE entry */ + movl %ecx,(%edx) /* Store identity PDE entry */ + movl %ecx,page_pde_offset(%edx) /* Store kernel PDE entry */ + addl $4,%edx + movl $1024, %ecx +11: + stosl + addl $0x1000,%eax + loop 11b + /* + * End condition: we must map up to the end + MAPPING_BEYOND_END. 
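The PAGE_TABLE_SIZE()/MAPPING_BEYOND_END arithmetic above sizes the early page tables for the low-memory linear map. A worked example in C, assuming the common 3G/1G split (__PAGE_OFFSET = 0xC0000000), 4 KiB pages, and the non-PAE case with 1024 entries per table:

    #include <stdio.h>

    int main(void)
    {
            unsigned long long lowmem = (1ULL << 32) - 0xc0000000ULL; /* 1 GiB */
            unsigned long long pages  = lowmem >> 12;                 /* 262144 */
            unsigned long long tables = pages / 1024;                 /* 256 */

            /* 256 page tables * 4 KiB = 1 MiB reserved beyond _end */
            printf("%llu pages, %llu tables, %llu KiB\n",
                   pages, tables, (tables << 12) >> 10);
            return 0;
    }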
+ */ + movl $pa(_end) + MAPPING_BEYOND_END + PTE_IDENT_ATTR, %ebp + cmpl %ebp,%eax + jb 10b + addl $__PAGE_OFFSET, %edi + movl %edi, pa(_brk_end) + shrl $12, %eax + movl %eax, pa(max_pfn_mapped) + + /* Do early initialization of the fixmap area */ + movl $pa(initial_pg_fixmap)+PDE_IDENT_ATTR,%eax + movl %eax,pa(initial_page_table+0xffc) +#endif + ret + /*G:055 * We create a macro which puts the assembler code between lgstart_ and lgend_ * markers. These templates are put in the .text section: they can't be diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 55543397a8a7..09df2f9a3d69 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -23,7 +23,7 @@ mmiotrace-y := kmmio.o pf_in.o mmio-mod.o obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o obj-$(CONFIG_NUMA) += numa.o numa_$(BITS).o -obj-$(CONFIG_K8_NUMA) += k8topology_64.o +obj-$(CONFIG_AMD_NUMA) += amdtopology_64.o obj-$(CONFIG_ACPI_NUMA) += srat_$(BITS).o obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o diff --git a/arch/x86/mm/k8topology_64.c b/arch/x86/mm/amdtopology_64.c index 804a3b6c6e14..51fae9cfdecb 100644 --- a/arch/x86/mm/k8topology_64.c +++ b/arch/x86/mm/amdtopology_64.c @@ -1,8 +1,8 @@ /* - * AMD K8 NUMA support. + * AMD NUMA support. * Discover the memory map and associated nodes. * - * This version reads it directly from the K8 northbridge. + * This version reads it directly from the AMD northbridge. * * Copyright 2002,2003 Andi Kleen, SuSE Labs. */ @@ -57,7 +57,7 @@ static __init void early_get_boot_cpu_id(void) { /* * need to get the APIC ID of the BSP so can use that to - * create apicid_to_node in k8_scan_nodes() + * create apicid_to_node in amd_scan_nodes() */ #ifdef CONFIG_X86_MPPARSE /* @@ -69,7 +69,7 @@ static __init void early_get_boot_cpu_id(void) early_init_lapic_mapping(); } -int __init k8_get_nodes(struct bootnode *physnodes) +int __init amd_get_nodes(struct bootnode *physnodes) { int i; int ret = 0; @@ -82,7 +82,7 @@ int __init k8_get_nodes(struct bootnode *physnodes) return ret; } -int __init k8_numa_init(unsigned long start_pfn, unsigned long end_pfn) +int __init amd_numa_init(unsigned long start_pfn, unsigned long end_pfn) { unsigned long start = PFN_PHYS(start_pfn); unsigned long end = PFN_PHYS(end_pfn); @@ -194,7 +194,7 @@ int __init k8_numa_init(unsigned long start_pfn, unsigned long end_pfn) return 0; } -int __init k8_scan_nodes(void) +int __init amd_scan_nodes(void) { unsigned int bits; unsigned int cores; diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 79b0b372d2d0..7d90ceb882a4 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -11,6 +11,7 @@ #include <linux/kprobes.h> /* __kprobes, ... */ #include <linux/mmiotrace.h> /* kmmio_handler, ... */ #include <linux/perf_event.h> /* perf_sw_event */ +#include <linux/hugetlb.h> /* hstate_index_to_shift */ #include <asm/traps.h> /* dotraplinkage, ... */ #include <asm/pgalloc.h> /* pgd_*(), ... */ @@ -160,15 +161,20 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr) static void force_sig_info_fault(int si_signo, int si_code, unsigned long address, - struct task_struct *tsk) + struct task_struct *tsk, int fault) { + unsigned lsb = 0; siginfo_t info; info.si_signo = si_signo; info.si_errno = 0; info.si_code = si_code; info.si_addr = (void __user *)address; - info.si_addr_lsb = si_code == BUS_MCEERR_AR ? 
PAGE_SHIFT : 0; + if (fault & VM_FAULT_HWPOISON_LARGE) + lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault)); + if (fault & VM_FAULT_HWPOISON) + lsb = PAGE_SHIFT; + info.si_addr_lsb = lsb; force_sig_info(si_signo, &info, tsk); } @@ -722,7 +728,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, tsk->thread.error_code = error_code | (address >= TASK_SIZE); tsk->thread.trap_no = 14; - force_sig_info_fault(SIGSEGV, si_code, address, tsk); + force_sig_info_fault(SIGSEGV, si_code, address, tsk, 0); return; } @@ -807,14 +813,14 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, tsk->thread.trap_no = 14; #ifdef CONFIG_MEMORY_FAILURE - if (fault & VM_FAULT_HWPOISON) { + if (fault & (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) { printk(KERN_ERR "MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n", tsk->comm, tsk->pid, address); code = BUS_MCEERR_AR; } #endif - force_sig_info_fault(SIGBUS, code, address, tsk); + force_sig_info_fault(SIGBUS, code, address, tsk, fault); } static noinline void @@ -824,7 +830,8 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code, if (fault & VM_FAULT_OOM) { out_of_memory(regs, error_code, address); } else { - if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON)) + if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON| + VM_FAULT_HWPOISON_LARGE)) do_sigbus(regs, error_code, address, fault); else BUG(); @@ -912,9 +919,9 @@ spurious_fault(unsigned long error_code, unsigned long address) int show_unhandled_signals = 1; static inline int -access_error(unsigned long error_code, int write, struct vm_area_struct *vma) +access_error(unsigned long error_code, struct vm_area_struct *vma) { - if (write) { + if (error_code & PF_WRITE) { /* write, present and write, not present: */ if (unlikely(!(vma->vm_flags & VM_WRITE))) return 1; @@ -949,8 +956,10 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code) struct task_struct *tsk; unsigned long address; struct mm_struct *mm; - int write; int fault; + int write = error_code & PF_WRITE; + unsigned int flags = FAULT_FLAG_ALLOW_RETRY | + (write ? FAULT_FLAG_WRITE : 0); tsk = current; mm = tsk->mm; @@ -1061,6 +1070,7 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code) bad_area_nosemaphore(regs, error_code, address); return; } +retry: down_read(&mm->mmap_sem); } else { /* @@ -1104,9 +1114,7 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code) * we can handle it.. */ good_area: - write = error_code & PF_WRITE; - - if (unlikely(access_error(error_code, write, vma))) { + if (unlikely(access_error(error_code, vma))) { bad_area_access_error(regs, error_code, address); return; } @@ -1116,21 +1124,34 @@ good_area: * make sure we exit gracefully rather than endlessly redo * the fault: */ - fault = handle_mm_fault(mm, vma, address, write ? FAULT_FLAG_WRITE : 0); + fault = handle_mm_fault(mm, vma, address, flags); if (unlikely(fault & VM_FAULT_ERROR)) { mm_fault_error(regs, error_code, address, fault); return; } - if (fault & VM_FAULT_MAJOR) { - tsk->maj_flt++; - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0, - regs, address); - } else { - tsk->min_flt++; - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0, - regs, address); + /* + * Major/minor page fault accounting is only done on the + * initial attempt. If we go through a retry, it is extremely + * likely that the page will be found in page cache at that point. 
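The accounting comment above is tied to the new retry protocol: the first pass sets FAULT_FLAG_ALLOW_RETRY, so handle_mm_fault() may drop mmap_sem while it waits for I/O and return VM_FAULT_RETRY; the second pass clears the flag so the fault cannot loop forever. A simplified sketch of the control flow (not the literal kernel code):

    unsigned int flags = FAULT_FLAG_ALLOW_RETRY | (write ? FAULT_FLAG_WRITE : 0);
    int fault;

    retry:
            down_read(&mm->mmap_sem);
            /* ... look up and validate the vma ... */
            fault = handle_mm_fault(mm, vma, address, flags);
            if ((flags & FAULT_FLAG_ALLOW_RETRY) && (fault & VM_FAULT_RETRY)) {
                    /* handle_mm_fault() already released mmap_sem; retry
                     * once with ALLOW_RETRY cleared to rule out livelock */
                    flags &= ~FAULT_FLAG_ALLOW_RETRY;
                    goto retry;
            }
            /* ... accounting, up_read(), error paths ... */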
+ */ + if (flags & FAULT_FLAG_ALLOW_RETRY) { + if (fault & VM_FAULT_MAJOR) { + tsk->maj_flt++; + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0, + regs, address); + } else { + tsk->min_flt++; + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0, + regs, address); + } + if (fault & VM_FAULT_RETRY) { + /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk + * of starvation. */ + flags &= ~FAULT_FLAG_ALLOW_RETRY; + goto retry; + } } check_v8086_mode(regs, address, tsk); diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c index 5e8fa12ef861..b49962662101 100644 --- a/arch/x86/mm/highmem_32.c +++ b/arch/x86/mm/highmem_32.c @@ -9,6 +9,7 @@ void *kmap(struct page *page) return page_address(page); return kmap_high(page); } +EXPORT_SYMBOL(kmap); void kunmap(struct page *page) { @@ -18,6 +19,7 @@ void kunmap(struct page *page) return; kunmap_high(page); } +EXPORT_SYMBOL(kunmap); /* * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because @@ -27,10 +29,10 @@ void kunmap(struct page *page) * However when holding an atomic kmap it is not legal to sleep, so atomic * kmaps are appropriate for short, tight code paths only. */ -void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot) +void *kmap_atomic_prot(struct page *page, pgprot_t prot) { - enum fixed_addresses idx; unsigned long vaddr; + int idx, type; /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */ pagefault_disable(); @@ -38,8 +40,7 @@ void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot) if (!PageHighMem(page)) return page_address(page); - debug_kmap_atomic(type); - + type = kmap_atomic_idx_push(); idx = type + KM_TYPE_NR*smp_processor_id(); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); BUG_ON(!pte_none(*(kmap_pte-idx))); @@ -47,44 +48,57 @@ void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot) return (void *)vaddr; } +EXPORT_SYMBOL(kmap_atomic_prot); + +void *__kmap_atomic(struct page *page) +{ + return kmap_atomic_prot(page, kmap_prot); +} +EXPORT_SYMBOL(__kmap_atomic); -void *kmap_atomic(struct page *page, enum km_type type) +/* + * This is the same as kmap_atomic() but can map memory that doesn't + * have a struct page associated with it. + */ +void *kmap_atomic_pfn(unsigned long pfn) { - return kmap_atomic_prot(page, type, kmap_prot); + return kmap_atomic_prot_pfn(pfn, kmap_prot); } +EXPORT_SYMBOL_GPL(kmap_atomic_pfn); -void kunmap_atomic_notypecheck(void *kvaddr, enum km_type type) +void __kunmap_atomic(void *kvaddr) { unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; - enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id(); - - /* - * Force other mappings to Oops if they'll try to access this pte - * without first remap it. Keeping stale mappings around is a bad idea - * also, in case the page changes cacheability attributes or becomes - * a protected page in a hypervisor. - */ - if (vaddr == __fix_to_virt(FIX_KMAP_BEGIN+idx)) + + if (vaddr >= __fix_to_virt(FIX_KMAP_END) && + vaddr <= __fix_to_virt(FIX_KMAP_BEGIN)) { + int idx, type; + + type = kmap_atomic_idx(); + idx = type + KM_TYPE_NR * smp_processor_id(); + +#ifdef CONFIG_DEBUG_HIGHMEM + WARN_ON_ONCE(vaddr != __fix_to_virt(FIX_KMAP_BEGIN + idx)); +#endif + /* + * Force other mappings to Oops if they'll try to access this + * pte without first remapping it. Keeping stale mappings around + * is a bad idea also, in case the page changes cacheability + * attributes or becomes a protected page in a hypervisor. 
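With the km_type argument gone, atomic kmap slots are handed out from a small per-CPU stack (kmap_atomic_idx_push()/kmap_atomic_idx_pop() above), so callers no longer name a slot but must unmap in strict LIFO order. The calling convention becomes, in sketch form (assuming the generic one-argument kmap_atomic()/kunmap_atomic() wrappers from the same series):

    /* strictly nested: the last mapping taken is the first one dropped */
    void *src = kmap_atomic(src_page);
    void *dst = kmap_atomic(dst_page);      /* pushes the next free slot */

    memcpy(dst, src, PAGE_SIZE);

    kunmap_atomic(dst);                     /* pops in reverse order */
    kunmap_atomic(src);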
+ */ kpte_clear_flush(kmap_pte-idx, vaddr); - else { + kmap_atomic_idx_pop(); + } #ifdef CONFIG_DEBUG_HIGHMEM + else { BUG_ON(vaddr < PAGE_OFFSET); BUG_ON(vaddr >= (unsigned long)high_memory); -#endif } +#endif pagefault_enable(); } - -/* - * This is the same as kmap_atomic() but can map memory that doesn't - * have a struct page associated with it. - */ -void *kmap_atomic_pfn(unsigned long pfn, enum km_type type) -{ - return kmap_atomic_prot_pfn(pfn, type, kmap_prot); -} -EXPORT_SYMBOL_GPL(kmap_atomic_pfn); /* temporarily in use by i915 GEM until vmap */ +EXPORT_SYMBOL(__kunmap_atomic); struct page *kmap_atomic_to_page(void *ptr) { @@ -98,12 +112,6 @@ struct page *kmap_atomic_to_page(void *ptr) pte = kmap_pte - (idx - FIX_KMAP_BEGIN); return pte_page(*pte); } - -EXPORT_SYMBOL(kmap); -EXPORT_SYMBOL(kunmap); -EXPORT_SYMBOL(kmap_atomic); -EXPORT_SYMBOL(kunmap_atomic_notypecheck); -EXPORT_SYMBOL(kmap_atomic_prot); EXPORT_SYMBOL(kmap_atomic_to_page); void __init set_highmem_pages_init(void) diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 84346200e783..71a59296af80 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -51,7 +51,6 @@ #include <asm/numa.h> #include <asm/cacheflush.h> #include <asm/init.h> -#include <linux/bootmem.h> static int __init parse_direct_gbpages_off(char *arg) { diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c index 72fc70cf6184..7b179b499fa3 100644 --- a/arch/x86/mm/iomap_32.c +++ b/arch/x86/mm/iomap_32.c @@ -48,21 +48,20 @@ int iomap_create_wc(resource_size_t base, unsigned long size, pgprot_t *prot) } EXPORT_SYMBOL_GPL(iomap_create_wc); -void -iomap_free(resource_size_t base, unsigned long size) +void iomap_free(resource_size_t base, unsigned long size) { io_free_memtype(base, base + size); } EXPORT_SYMBOL_GPL(iomap_free); -void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot) +void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot) { - enum fixed_addresses idx; unsigned long vaddr; + int idx, type; pagefault_disable(); - debug_kmap_atomic(type); + type = kmap_atomic_idx_push(); idx = type + KM_TYPE_NR * smp_processor_id(); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); set_pte(kmap_pte - idx, pfn_pte(pfn, prot)); @@ -72,10 +71,10 @@ void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot) } /* - * Map 'pfn' using fixed map 'type' and protections 'prot' + * Map 'pfn' using protections 'prot' */ void __iomem * -iomap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot) +iomap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot) { /* * For non-PAT systems, promote PAGE_KERNEL_WC to PAGE_KERNEL_UC_MINUS. @@ -86,24 +85,34 @@ iomap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot) if (!pat_enabled && pgprot_val(prot) == pgprot_val(PAGE_KERNEL_WC)) prot = PAGE_KERNEL_UC_MINUS; - return (void __force __iomem *) kmap_atomic_prot_pfn(pfn, type, prot); + return (void __force __iomem *) kmap_atomic_prot_pfn(pfn, prot); } EXPORT_SYMBOL_GPL(iomap_atomic_prot_pfn); void -iounmap_atomic(void __iomem *kvaddr, enum km_type type) +iounmap_atomic(void __iomem *kvaddr) { unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; - enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id(); - /* - * Force other mappings to Oops if they'll try to access this pte - * without first remap it. Keeping stale mappings around is a bad idea - * also, in case the page changes cacheability attributes or becomes - * a protected page in a hypervisor. 
- */ - if (vaddr == __fix_to_virt(FIX_KMAP_BEGIN+idx)) + if (vaddr >= __fix_to_virt(FIX_KMAP_END) && + vaddr <= __fix_to_virt(FIX_KMAP_BEGIN)) { + int idx, type; + + type = kmap_atomic_idx(); + idx = type + KM_TYPE_NR * smp_processor_id(); + +#ifdef CONFIG_DEBUG_HIGHMEM + WARN_ON_ONCE(vaddr != __fix_to_virt(FIX_KMAP_BEGIN + idx)); +#endif + /* + * Force other mappings to Oops if they'll try to access this + * pte without first remapping it. Keeping stale mappings around + * is a bad idea also, in case the page changes cacheability + * attributes or becomes a protected page in a hypervisor. + */ kpte_clear_flush(kmap_pte-idx, vaddr); + kmap_atomic_idx_pop(); + } pagefault_enable(); } diff --git a/arch/x86/mm/kmemcheck/error.c b/arch/x86/mm/kmemcheck/error.c index af3b6c8a436f..704a37cedddb 100644 --- a/arch/x86/mm/kmemcheck/error.c +++ b/arch/x86/mm/kmemcheck/error.c @@ -185,7 +185,7 @@ void kmemcheck_error_save(enum kmemcheck_shadow state, e->trace.entries = e->trace_entries; e->trace.max_entries = ARRAY_SIZE(e->trace_entries); e->trace.skip = 0; - save_stack_trace_bp(&e->trace, regs->bp); + save_stack_trace_regs(&e->trace, regs); /* Round address down to nearest 16 bytes */ shadow_copy = kmemcheck_shadow_lookup(address diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 60f498511dd6..7762a517d69d 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -178,11 +178,8 @@ static void * __init early_node_mem(int nodeid, unsigned long start, /* extend the search scope */ end = max_pfn_mapped << PAGE_SHIFT; - if (end > (MAX_DMA32_PFN<<PAGE_SHIFT)) - start = MAX_DMA32_PFN<<PAGE_SHIFT; - else - start = MAX_DMA_PFN<<PAGE_SHIFT; - mem = memblock_x86_find_in_range_node(nodeid, start, end, size, align); + start = MAX_DMA_PFN << PAGE_SHIFT; + mem = memblock_find_in_range(start, end, size, align); if (mem != MEMBLOCK_ERROR) return __va(mem); @@ -267,7 +264,7 @@ static struct bootnode physnodes[MAX_NUMNODES] __initdata; static char *cmdline __initdata; static int __init setup_physnodes(unsigned long start, unsigned long end, - int acpi, int k8) + int acpi, int amd) { int nr_nodes = 0; int ret = 0; @@ -277,13 +274,13 @@ static int __init setup_physnodes(unsigned long start, unsigned long end, if (acpi) nr_nodes = acpi_get_nodes(physnodes); #endif -#ifdef CONFIG_K8_NUMA - if (k8) - nr_nodes = k8_get_nodes(physnodes); +#ifdef CONFIG_AMD_NUMA + if (amd) + nr_nodes = amd_get_nodes(physnodes); #endif /* * Basic sanity checking on the physical node map: there may be errors - * if the SRAT or K8 incorrectly reported the topology or the mem= + * if the SRAT or AMD code incorrectly reported the topology or the mem= * kernel parameter is used. */ for (i = 0; i < nr_nodes; i++) { @@ -552,7 +549,7 @@ static int __init split_nodes_size_interleave(u64 addr, u64 max_addr, u64 size) * numa=fake command-line option. */ static int __init numa_emulation(unsigned long start_pfn, - unsigned long last_pfn, int acpi, int k8) + unsigned long last_pfn, int acpi, int amd) { u64 addr = start_pfn << PAGE_SHIFT; u64 max_addr = last_pfn << PAGE_SHIFT; @@ -560,7 +557,7 @@ static int __init numa_emulation(unsigned long start_pfn, int num_nodes; int i; - num_phys_nodes = setup_physnodes(addr, max_addr, acpi, k8); + num_phys_nodes = setup_physnodes(addr, max_addr, acpi, amd); /* * If the numa=fake command-line contains a 'M' or 'G', it represents * the fixed node size. 
Otherwise, if it is just a single number N, @@ -605,7 +602,7 @@ static int __init numa_emulation(unsigned long start_pfn, #endif /* CONFIG_NUMA_EMU */ void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn, - int acpi, int k8) + int acpi, int amd) { int i; @@ -613,7 +610,7 @@ void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn, nodes_clear(node_online_map); #ifdef CONFIG_NUMA_EMU - if (cmdline && !numa_emulation(start_pfn, last_pfn, acpi, k8)) + if (cmdline && !numa_emulation(start_pfn, last_pfn, acpi, amd)) return; nodes_clear(node_possible_map); nodes_clear(node_online_map); @@ -627,8 +624,8 @@ void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn, nodes_clear(node_online_map); #endif -#ifdef CONFIG_K8_NUMA - if (!numa_off && k8 && !k8_scan_nodes()) +#ifdef CONFIG_AMD_NUMA + if (!numa_off && amd && !amd_scan_nodes()) return; nodes_clear(node_possible_map); nodes_clear(node_online_map); diff --git a/arch/x86/mm/setup_nx.c b/arch/x86/mm/setup_nx.c index a3250aa34086..410531d3c292 100644 --- a/arch/x86/mm/setup_nx.c +++ b/arch/x86/mm/setup_nx.c @@ -41,7 +41,7 @@ void __init x86_report_nx(void) { if (!cpu_has_nx) { printk(KERN_NOTICE "Notice: NX (Execute Disable) protection " - "missing in CPU or disabled in BIOS!\n"); + "missing in CPU!\n"); } else { #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) if (disable_nx) { diff --git a/arch/x86/mm/srat_32.c b/arch/x86/mm/srat_32.c index a17dffd136c1..f16434568a51 100644 --- a/arch/x86/mm/srat_32.c +++ b/arch/x86/mm/srat_32.c @@ -92,6 +92,7 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *cpu_affinity) /* mark this node as "seen" in node bitmap */ BMAP_SET(pxm_bitmap, cpu_affinity->proximity_domain_lo); + /* don't need to check apic_id here, because it is always 8 bits */ apicid_to_pxm[cpu_affinity->apic_id] = cpu_affinity->proximity_domain_lo; printk(KERN_DEBUG "CPU %02x in proximity domain %02x\n", diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c index a35cb9d8b060..171a0aacb99a 100644 --- a/arch/x86/mm/srat_64.c +++ b/arch/x86/mm/srat_64.c @@ -134,6 +134,10 @@ acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa) } apic_id = pa->apic_id; + if (apic_id >= MAX_LOCAL_APIC) { + printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u skipped apicid that is too big\n", pxm, apic_id, node); + return; + } apicid_to_node[apic_id] = node; node_set(node, cpu_nodes_parsed); acpi_numa = 1; @@ -168,6 +172,12 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa) apic_id = (pa->apic_id << 8) | pa->local_sapic_eid; else apic_id = pa->apic_id; + + if (apic_id >= MAX_LOCAL_APIC) { + printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%02x -> Node %u skipped apicid that is too big\n", pxm, apic_id, node); + return; + } + apicid_to_node[apic_id] = node; node_set(node, cpu_nodes_parsed); acpi_numa = 1; diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 49358481c733..6acc724d5d8f 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -223,7 +223,7 @@ void native_flush_tlb_others(const struct cpumask *cpumask, static void __cpuinit calculate_tlb_offset(void) { - int cpu, node, nr_node_vecs; + int cpu, node, nr_node_vecs, idx = 0; /* * we are changing tlb_vector_offset for each CPU in runtime, but this * will not cause inconsistency, as the write is atomic under X86. 
we @@ -239,7 +239,7 @@ static void __cpuinit calculate_tlb_offset(void) nr_node_vecs = NUM_INVALIDATE_TLB_VECTORS/nr_online_nodes; for_each_online_node(node) { - int node_offset = (node % NUM_INVALIDATE_TLB_VECTORS) * + int node_offset = (idx % NUM_INVALIDATE_TLB_VECTORS) * nr_node_vecs; int cpu_offset = 0; for_each_cpu(cpu, cpumask_of_node(node)) { @@ -248,10 +248,11 @@ static void __cpuinit calculate_tlb_offset(void) cpu_offset++; cpu_offset = cpu_offset % nr_node_vecs; } + idx++; } } -static int tlb_cpuhp_notify(struct notifier_block *n, +static int __cpuinit tlb_cpuhp_notify(struct notifier_block *n, unsigned long action, void *hcpu) { switch (action & 0xf) { diff --git a/arch/x86/oprofile/backtrace.c b/arch/x86/oprofile/backtrace.c index 2d49d4e19a36..72cbec14d783 100644 --- a/arch/x86/oprofile/backtrace.c +++ b/arch/x86/oprofile/backtrace.c @@ -126,7 +126,7 @@ x86_backtrace(struct pt_regs * const regs, unsigned int depth) if (!user_mode_vm(regs)) { unsigned long stack = kernel_stack_pointer(regs); if (depth) - dump_trace(NULL, regs, (unsigned long *)stack, 0, + dump_trace(NULL, regs, (unsigned long *)stack, &backtrace_ops, &depth); return; } diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index bd1489c3ce09..358c8b9c96a7 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -726,6 +726,15 @@ int __init op_nmi_init(struct oprofile_operations *ops) case 0x11: cpu_type = "x86-64/family11h"; break; + case 0x12: + cpu_type = "x86-64/family12h"; + break; + case 0x14: + cpu_type = "x86-64/family14h"; + break; + case 0x15: + cpu_type = "x86-64/family15h"; + break; default: return -ENODEV; } diff --git a/arch/x86/oprofile/nmi_timer_int.c b/arch/x86/oprofile/nmi_timer_int.c index e3ecb71b5790..0636dd93cef8 100644 --- a/arch/x86/oprofile/nmi_timer_int.c +++ b/arch/x86/oprofile/nmi_timer_int.c @@ -58,9 +58,6 @@ static void timer_stop(void) int __init op_nmi_timer_init(struct oprofile_operations *ops) { - if ((nmi_watchdog != NMI_IO_APIC) || (atomic_read(&nmi_active) <= 0)) - return -ENODEV; - ops->start = timer_start; ops->stop = timer_stop; ops->cpu_type = "timer"; diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index 08de2545bc68..c3b8e24f2b16 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -29,11 +29,12 @@ #include "op_x86_model.h" #include "op_counter.h" -#define NUM_COUNTERS 4 +#define NUM_COUNTERS 4 +#define NUM_COUNTERS_F15H 6 #ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX -#define NUM_VIRT_COUNTERS 32 +#define NUM_VIRT_COUNTERS 32 #else -#define NUM_VIRT_COUNTERS NUM_COUNTERS +#define NUM_VIRT_COUNTERS 0 #endif #define OP_EVENT_MASK 0x0FFF @@ -41,24 +42,32 @@ #define MSR_AMD_EVENTSEL_RESERVED ((0xFFFFFCF0ULL<<32)|(1ULL<<21)) -static unsigned long reset_value[NUM_VIRT_COUNTERS]; +static int num_counters; +static unsigned long reset_value[OP_MAX_COUNTER]; #define IBS_FETCH_SIZE 6 #define IBS_OP_SIZE 12 static u32 ibs_caps; -struct op_ibs_config { +struct ibs_config { unsigned long op_enabled; unsigned long fetch_enabled; unsigned long max_cnt_fetch; unsigned long max_cnt_op; unsigned long rand_en; unsigned long dispatched_ops; + unsigned long branch_target; }; -static struct op_ibs_config ibs_config; -static u64 ibs_op_ctl; +struct ibs_state { + u64 ibs_op_ctl; + int branch_target; + unsigned long sample_size; +}; + +static struct ibs_config ibs_config; +static struct ibs_state ibs_state; /* * IBS cpuid feature detection @@ -71,8 +80,16 @@ static u64 ibs_op_ctl; * bit 0 is used 
to indicate the existence of IBS. */ #define IBS_CAPS_AVAIL (1U<<0) +#define IBS_CAPS_FETCHSAM (1U<<1) +#define IBS_CAPS_OPSAM (1U<<2) #define IBS_CAPS_RDWROPCNT (1U<<3) #define IBS_CAPS_OPCNT (1U<<4) +#define IBS_CAPS_BRNTRGT (1U<<5) +#define IBS_CAPS_OPCNTEXT (1U<<6) + +#define IBS_CAPS_DEFAULT (IBS_CAPS_AVAIL \ + | IBS_CAPS_FETCHSAM \ + | IBS_CAPS_OPSAM) /* * IBS APIC setup @@ -99,12 +116,12 @@ static u32 get_ibs_caps(void) /* check IBS cpuid feature flags */ max_level = cpuid_eax(0x80000000); if (max_level < IBS_CPUID_FEATURES) - return IBS_CAPS_AVAIL; + return IBS_CAPS_DEFAULT; ibs_caps = cpuid_eax(IBS_CPUID_FEATURES); if (!(ibs_caps & IBS_CAPS_AVAIL)) /* cpuid flags not valid */ - return IBS_CAPS_AVAIL; + return IBS_CAPS_DEFAULT; return ibs_caps; } @@ -197,8 +214,8 @@ op_amd_handle_ibs(struct pt_regs * const regs, rdmsrl(MSR_AMD64_IBSOPCTL, ctl); if (ctl & IBS_OP_VAL) { rdmsrl(MSR_AMD64_IBSOPRIP, val); - oprofile_write_reserve(&entry, regs, val, - IBS_OP_CODE, IBS_OP_SIZE); + oprofile_write_reserve(&entry, regs, val, IBS_OP_CODE, + ibs_state.sample_size); oprofile_add_data64(&entry, val); rdmsrl(MSR_AMD64_IBSOPDATA, val); oprofile_add_data64(&entry, val); @@ -210,10 +227,14 @@ op_amd_handle_ibs(struct pt_regs * const regs, oprofile_add_data64(&entry, val); rdmsrl(MSR_AMD64_IBSDCPHYSAD, val); oprofile_add_data64(&entry, val); + if (ibs_state.branch_target) { + rdmsrl(MSR_AMD64_IBSBRTARGET, val); + oprofile_add_data(&entry, (unsigned long)val); + } oprofile_write_commit(&entry); /* reenable the IRQ */ - ctl = op_amd_randomize_ibs_op(ibs_op_ctl); + ctl = op_amd_randomize_ibs_op(ibs_state.ibs_op_ctl); wrmsrl(MSR_AMD64_IBSOPCTL, ctl); } } @@ -226,21 +247,32 @@ static inline void op_amd_start_ibs(void) if (!ibs_caps) return; + memset(&ibs_state, 0, sizeof(ibs_state)); + + /* + * Note: Since the max count settings may be out of range we + * write back the actually used values so that userland can + * read them. + */ + if (ibs_config.fetch_enabled) { - val = (ibs_config.max_cnt_fetch >> 4) & IBS_FETCH_MAX_CNT; + val = ibs_config.max_cnt_fetch >> 4; + val = min(val, IBS_FETCH_MAX_CNT); + ibs_config.max_cnt_fetch = val << 4; val |= ibs_config.rand_en ? IBS_FETCH_RAND_EN : 0; val |= IBS_FETCH_ENABLE; wrmsrl(MSR_AMD64_IBSFETCHCTL, val); } if (ibs_config.op_enabled) { - ibs_op_ctl = ibs_config.max_cnt_op >> 4; + val = ibs_config.max_cnt_op >> 4; if (!(ibs_caps & IBS_CAPS_RDWROPCNT)) { /* * IbsOpCurCnt not supported. See * op_amd_randomize_ibs_op() for details. */ - ibs_op_ctl = clamp(ibs_op_ctl, 0x0081ULL, 0xFF80ULL); + val = clamp(val, 0x0081ULL, 0xFF80ULL); + ibs_config.max_cnt_op = val << 4; } else { /* * The start value is randomized with a @@ -248,13 +280,24 @@ static inline void op_amd_start_ibs(void) * with the half of the randomized range. Also * avoid underflows. */ - ibs_op_ctl = min(ibs_op_ctl + IBS_RANDOM_MAXCNT_OFFSET, - IBS_OP_MAX_CNT); + val += IBS_RANDOM_MAXCNT_OFFSET; + if (ibs_caps & IBS_CAPS_OPCNTEXT) + val = min(val, IBS_OP_MAX_CNT_EXT); + else + val = min(val, IBS_OP_MAX_CNT); + ibs_config.max_cnt_op = + (val - IBS_RANDOM_MAXCNT_OFFSET) << 4; + } + val = ((val & ~IBS_OP_MAX_CNT) << 4) | (val & IBS_OP_MAX_CNT); + val |= ibs_config.dispatched_ops ? 
IBS_OP_CNT_CTL : 0; + val |= IBS_OP_ENABLE; + ibs_state.ibs_op_ctl = val; + ibs_state.sample_size = IBS_OP_SIZE; + if (ibs_config.branch_target) { + ibs_state.branch_target = 1; + ibs_state.sample_size++; } - if (ibs_caps & IBS_CAPS_OPCNT && ibs_config.dispatched_ops) - ibs_op_ctl |= IBS_OP_CNT_CTL; - ibs_op_ctl |= IBS_OP_ENABLE; - val = op_amd_randomize_ibs_op(ibs_op_ctl); + val = op_amd_randomize_ibs_op(ibs_state.ibs_op_ctl); wrmsrl(MSR_AMD64_IBSOPCTL, val); } } @@ -281,29 +324,25 @@ static inline int eilvt_is_available(int offset) static inline int ibs_eilvt_valid(void) { - u64 val; int offset; + u64 val; rdmsrl(MSR_AMD64_IBSCTL, val); + offset = val & IBSCTL_LVT_OFFSET_MASK; + if (!(val & IBSCTL_LVT_OFFSET_VALID)) { - pr_err(FW_BUG "cpu %d, invalid IBS " - "interrupt offset %d (MSR%08X=0x%016llx)", - smp_processor_id(), offset, - MSR_AMD64_IBSCTL, val); + pr_err(FW_BUG "cpu %d, invalid IBS interrupt offset %d (MSR%08X=0x%016llx)\n", + smp_processor_id(), offset, MSR_AMD64_IBSCTL, val); return 0; } - offset = val & IBSCTL_LVT_OFFSET_MASK; - - if (eilvt_is_available(offset)) - return !0; - - pr_err(FW_BUG "cpu %d, IBS interrupt offset %d " - "not available (MSR%08X=0x%016llx)", - smp_processor_id(), offset, - MSR_AMD64_IBSCTL, val); + if (!eilvt_is_available(offset)) { + pr_err(FW_BUG "cpu %d, IBS interrupt offset %d not available (MSR%08X=0x%016llx)\n", + smp_processor_id(), offset, MSR_AMD64_IBSCTL, val); + return 0; + } - return 0; + return 1; } static inline int get_ibs_offset(void) @@ -350,7 +389,7 @@ static void op_mux_switch_ctrl(struct op_x86_model_spec const *model, int i; /* enable active counters */ - for (i = 0; i < NUM_COUNTERS; ++i) { + for (i = 0; i < num_counters; ++i) { int virt = op_x86_phys_to_virt(i); if (!reset_value[virt]) continue; @@ -369,7 +408,7 @@ static void op_amd_shutdown(struct op_msrs const * const msrs) { int i; - for (i = 0; i < NUM_COUNTERS; ++i) { + for (i = 0; i < num_counters; ++i) { if (!msrs->counters[i].addr) continue; release_perfctr_nmi(MSR_K7_PERFCTR0 + i); @@ -381,7 +420,7 @@ static int op_amd_fill_in_addresses(struct op_msrs * const msrs) { int i; - for (i = 0; i < NUM_COUNTERS; i++) { + for (i = 0; i < num_counters; i++) { if (!reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i)) goto fail; if (!reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i)) { @@ -389,8 +428,13 @@ static int op_amd_fill_in_addresses(struct op_msrs * const msrs) goto fail; } /* both registers must be reserved */ - msrs->counters[i].addr = MSR_K7_PERFCTR0 + i; - msrs->controls[i].addr = MSR_K7_EVNTSEL0 + i; + if (num_counters == NUM_COUNTERS_F15H) { + msrs->counters[i].addr = MSR_F15H_PERF_CTR + (i << 1); + msrs->controls[i].addr = MSR_F15H_PERF_CTL + (i << 1); + } else { + msrs->controls[i].addr = MSR_K7_EVNTSEL0 + i; + msrs->counters[i].addr = MSR_K7_PERFCTR0 + i; + } continue; fail: if (!counter_config[i].enabled) @@ -410,7 +454,7 @@ static void op_amd_setup_ctrs(struct op_x86_model_spec const *model, int i; /* setup reset_value */ - for (i = 0; i < NUM_VIRT_COUNTERS; ++i) { + for (i = 0; i < OP_MAX_COUNTER; ++i) { if (counter_config[i].enabled && msrs->counters[op_x86_virt_to_phys(i)].addr) reset_value[i] = counter_config[i].count; @@ -419,7 +463,7 @@ static void op_amd_setup_ctrs(struct op_x86_model_spec const *model, } /* clear all counters */ - for (i = 0; i < NUM_COUNTERS; ++i) { + for (i = 0; i < num_counters; ++i) { if (!msrs->controls[i].addr) continue; rdmsrl(msrs->controls[i].addr, val); @@ -435,7 +479,7 @@ static void op_amd_setup_ctrs(struct op_x86_model_spec const *model, } /* 
enable active counters */ - for (i = 0; i < NUM_COUNTERS; ++i) { + for (i = 0; i < num_counters; ++i) { int virt = op_x86_phys_to_virt(i); if (!reset_value[virt]) continue; @@ -466,7 +510,7 @@ static int op_amd_check_ctrs(struct pt_regs * const regs, u64 val; int i; - for (i = 0; i < NUM_COUNTERS; ++i) { + for (i = 0; i < num_counters; ++i) { int virt = op_x86_phys_to_virt(i); if (!reset_value[virt]) continue; @@ -489,7 +533,7 @@ static void op_amd_start(struct op_msrs const * const msrs) u64 val; int i; - for (i = 0; i < NUM_COUNTERS; ++i) { + for (i = 0; i < num_counters; ++i) { if (!reset_value[op_x86_phys_to_virt(i)]) continue; rdmsrl(msrs->controls[i].addr, val); @@ -509,7 +553,7 @@ static void op_amd_stop(struct op_msrs const * const msrs) * Subtle: stop on all counters to avoid race with setting our * pm callback */ - for (i = 0; i < NUM_COUNTERS; ++i) { + for (i = 0; i < num_counters; ++i) { if (!reset_value[op_x86_phys_to_virt(i)]) continue; rdmsrl(msrs->controls[i].addr, val); @@ -594,21 +638,29 @@ static int __init_ibs_nmi(void) return 0; } -/* initialize the APIC for the IBS interrupts if available */ +/* + * check and reserve APIC extended interrupt LVT offset for IBS if + * available + * + * init_ibs() performs implicitly cpu-local operations, so pin this + * thread to its current CPU + */ + static void init_ibs(void) { - ibs_caps = get_ibs_caps(); + preempt_disable(); + ibs_caps = get_ibs_caps(); if (!ibs_caps) - return; + goto out; - if (__init_ibs_nmi()) { + if (__init_ibs_nmi() < 0) ibs_caps = 0; - return; - } + else + printk(KERN_INFO "oprofile: AMD IBS detected (0x%08x)\n", ibs_caps); - printk(KERN_INFO "oprofile: AMD IBS detected (0x%08x)\n", - (unsigned)ibs_caps); +out: + preempt_enable(); } static int (*create_arch_files)(struct super_block *sb, struct dentry *root); @@ -631,44 +683,60 @@ static int setup_ibs_files(struct super_block *sb, struct dentry *root) /* model specific files */ /* setup some reasonable defaults */ + memset(&ibs_config, 0, sizeof(ibs_config)); ibs_config.max_cnt_fetch = 250000; - ibs_config.fetch_enabled = 0; ibs_config.max_cnt_op = 250000; - ibs_config.op_enabled = 0; - ibs_config.dispatched_ops = 0; - - dir = oprofilefs_mkdir(sb, root, "ibs_fetch"); - oprofilefs_create_ulong(sb, dir, "enable", - &ibs_config.fetch_enabled); - oprofilefs_create_ulong(sb, dir, "max_count", - &ibs_config.max_cnt_fetch); - oprofilefs_create_ulong(sb, dir, "rand_enable", - &ibs_config.rand_en); - - dir = oprofilefs_mkdir(sb, root, "ibs_op"); - oprofilefs_create_ulong(sb, dir, "enable", - &ibs_config.op_enabled); - oprofilefs_create_ulong(sb, dir, "max_count", - &ibs_config.max_cnt_op); - if (ibs_caps & IBS_CAPS_OPCNT) - oprofilefs_create_ulong(sb, dir, "dispatched_ops", - &ibs_config.dispatched_ops); + + if (ibs_caps & IBS_CAPS_FETCHSAM) { + dir = oprofilefs_mkdir(sb, root, "ibs_fetch"); + oprofilefs_create_ulong(sb, dir, "enable", + &ibs_config.fetch_enabled); + oprofilefs_create_ulong(sb, dir, "max_count", + &ibs_config.max_cnt_fetch); + oprofilefs_create_ulong(sb, dir, "rand_enable", + &ibs_config.rand_en); + } + + if (ibs_caps & IBS_CAPS_OPSAM) { + dir = oprofilefs_mkdir(sb, root, "ibs_op"); + oprofilefs_create_ulong(sb, dir, "enable", + &ibs_config.op_enabled); + oprofilefs_create_ulong(sb, dir, "max_count", + &ibs_config.max_cnt_op); + if (ibs_caps & IBS_CAPS_OPCNT) + oprofilefs_create_ulong(sb, dir, "dispatched_ops", + &ibs_config.dispatched_ops); + if (ibs_caps & IBS_CAPS_BRNTRGT) + oprofilefs_create_ulong(sb, dir, "branch_target", 
&ibs_config.branch_target); + } return 0; } +struct op_x86_model_spec op_amd_spec; + static int op_amd_init(struct oprofile_operations *ops) { init_ibs(); create_arch_files = ops->create_files; ops->create_files = setup_ibs_files; + + if (boot_cpu_data.x86 == 0x15) { + num_counters = NUM_COUNTERS_F15H; + } else { + num_counters = NUM_COUNTERS; + } + + op_amd_spec.num_counters = num_counters; + op_amd_spec.num_controls = num_counters; + op_amd_spec.num_virt_counters = max(num_counters, NUM_VIRT_COUNTERS); + return 0; } struct op_x86_model_spec op_amd_spec = { - .num_counters = NUM_COUNTERS, - .num_controls = NUM_COUNTERS, - .num_virt_counters = NUM_VIRT_COUNTERS, + /* num_counters/num_controls filled in at runtime */ .reserved = MSR_AMD_EVENTSEL_RESERVED, .event_mask = OP_EVENT_MASK, .init = op_amd_init, diff --git a/arch/x86/oprofile/op_model_p4.c b/arch/x86/oprofile/op_model_p4.c index 182558dd5515..9fadec074142 100644 --- a/arch/x86/oprofile/op_model_p4.c +++ b/arch/x86/oprofile/op_model_p4.c @@ -11,7 +11,7 @@ #include <linux/oprofile.h> #include <linux/smp.h> #include <linux/ptrace.h> -#include <linux/nmi.h> +#include <asm/nmi.h> #include <asm/msr.h> #include <asm/fixmap.h> #include <asm/apic.h> diff --git a/arch/x86/pci/Makefile b/arch/x86/pci/Makefile index a0207a7fdf39..effd96e33f16 100644 --- a/arch/x86/pci/Makefile +++ b/arch/x86/pci/Makefile @@ -4,6 +4,7 @@ obj-$(CONFIG_PCI_BIOS) += pcbios.o obj-$(CONFIG_PCI_MMCONFIG) += mmconfig_$(BITS).o direct.o mmconfig-shared.o obj-$(CONFIG_PCI_DIRECT) += direct.o obj-$(CONFIG_PCI_OLPC) += olpc.o +obj-$(CONFIG_PCI_XEN) += xen.o obj-y += fixup.o obj-$(CONFIG_ACPI) += acpi.o diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index 15466c096ba5..0972315c3860 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -138,7 +138,6 @@ setup_resource(struct acpi_resource *acpi_res, void *data) struct acpi_resource_address64 addr; acpi_status status; unsigned long flags; - struct resource *root, *conflict; u64 start, end; status = resource_to_addr(acpi_res, &addr); @@ -146,12 +145,10 @@ setup_resource(struct acpi_resource *acpi_res, void *data) return AE_OK; if (addr.resource_type == ACPI_MEMORY_RANGE) { - root = &iomem_resource; flags = IORESOURCE_MEM; if (addr.info.mem.caching == ACPI_PREFETCHABLE_MEMORY) flags |= IORESOURCE_PREFETCH; } else if (addr.resource_type == ACPI_IO_RANGE) { - root = &ioport_resource; flags = IORESOURCE_IO; } else return AE_OK; @@ -172,25 +169,90 @@ setup_resource(struct acpi_resource *acpi_res, void *data) return AE_OK; } - conflict = insert_resource_conflict(root, res); - if (conflict) { - dev_err(&info->bridge->dev, - "address space collision: host bridge window %pR " - "conflicts with %s %pR\n", - res, conflict->name, conflict); - } else { - pci_bus_add_resource(info->bus, res, 0); - info->res_num++; - if (addr.translation_offset) - dev_info(&info->bridge->dev, "host bridge window %pR " - "(PCI address [%#llx-%#llx])\n", - res, res->start - addr.translation_offset, - res->end - addr.translation_offset); + info->res_num++; + if (addr.translation_offset) + dev_info(&info->bridge->dev, "host bridge window %pR " + "(PCI address [%#llx-%#llx])\n", + res, res->start - addr.translation_offset, + res->end - addr.translation_offset); + else + dev_info(&info->bridge->dev, "host bridge window %pR\n", res); + + return AE_OK; +} + +static bool resource_contains(struct resource *res, resource_size_t point) +{ + if (res->start <= point && point <= res->end) + return true; + return false; +} + +static void 
coalesce_windows(struct pci_root_info *info, int type) +{ + int i, j; + struct resource *res1, *res2; + + for (i = 0; i < info->res_num; i++) { + res1 = &info->res[i]; + if (!(res1->flags & type)) + continue; + + for (j = i + 1; j < info->res_num; j++) { + res2 = &info->res[j]; + if (!(res2->flags & type)) + continue; + + /* + * I don't like throwing away windows because then + * our resources no longer match the ACPI _CRS, but + * the kernel resource tree doesn't allow overlaps. + */ + if (resource_contains(res1, res2->start) || + resource_contains(res1, res2->end) || + resource_contains(res2, res1->start) || + resource_contains(res2, res1->end)) { + res1->start = min(res1->start, res2->start); + res1->end = max(res1->end, res2->end); + dev_info(&info->bridge->dev, + "host bridge window expanded to %pR; %pR ignored\n", + res1, res2); + res2->flags = 0; + } + } + } +} + +static void add_resources(struct pci_root_info *info) +{ + int i; + struct resource *res, *root, *conflict; + + if (!pci_use_crs) + return; + + coalesce_windows(info, IORESOURCE_MEM); + coalesce_windows(info, IORESOURCE_IO); + + for (i = 0; i < info->res_num; i++) { + res = &info->res[i]; + + if (res->flags & IORESOURCE_MEM) + root = &iomem_resource; + else if (res->flags & IORESOURCE_IO) + root = &ioport_resource; else - dev_info(&info->bridge->dev, - "host bridge window %pR\n", res); + continue; + + conflict = insert_resource_conflict(root, res); + if (conflict) + dev_err(&info->bridge->dev, + "address space collision: host bridge window %pR " + "conflicts with %s %pR\n", + res, conflict->name, conflict); + else + pci_bus_add_resource(info->bus, res, 0); } - return AE_OK; } static void @@ -224,6 +286,7 @@ get_current_resources(struct acpi_device *device, int busnum, acpi_walk_resources(device->handle, METHOD_NAME__CRS, setup_resource, &info); + add_resources(&info); return; name_alloc_fail: diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index a0772af64efb..f7c8a399978c 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -421,16 +421,10 @@ struct pci_bus * __devinit pcibios_scan_root(int busnum) return bus; } - -int __init pcibios_init(void) +void __init pcibios_set_cache_line_size(void) { struct cpuinfo_x86 *c = &boot_cpu_data; - if (!raw_pci_ops) { - printk(KERN_WARNING "PCI: System does not support PCI\n"); - return 0; - } - /* * Set PCI cacheline size to that of the CPU if the CPU has reported it. * (For older CPUs that don't support cpuid, we set it to 32 bytes @@ -445,7 +439,16 @@ int __init pcibios_init(void) pci_dfl_cache_line_size = 32 >> 2; printk(KERN_DEBUG "PCI: Unknown cacheline size. 
Setting to 32 bytes\n"); } +} + +int __init pcibios_init(void) +{ + if (!raw_pci_ops) { + printk(KERN_WARNING "PCI: System does not support PCI\n"); + return 0; + } + + pcibios_set_cache_line_size(); pcibios_resource_survey(); if (pci_bf_sort >= pci_force_bf) diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c index 55253095be84..b1805b78842f 100644 --- a/arch/x86/pci/i386.c +++ b/arch/x86/pci/i386.c @@ -72,9 +72,6 @@ pcibios_align_resource(void *data, const struct resource *res, return start; if (start & 0x300) start = (start + 0x3ff) & ~0x3ff; - } else if (res->flags & IORESOURCE_MEM) { - if (start < BIOS_END) - start = BIOS_END; } return start; } @@ -311,6 +308,8 @@ int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma, */ prot |= _PAGE_CACHE_UC_MINUS; + prot |= _PAGE_IOMAP; /* creating a mapping for IO */ + vma->vm_page_prot = __pgprot(prot); if (io_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c index f547ee05f715..9f9bfb705cf9 100644 --- a/arch/x86/pci/irq.c +++ b/arch/x86/pci/irq.c @@ -584,27 +584,28 @@ static __init int intel_router_probe(struct irq_router *r, struct pci_dev *route case PCI_DEVICE_ID_INTEL_ICH9_3: case PCI_DEVICE_ID_INTEL_ICH9_4: case PCI_DEVICE_ID_INTEL_ICH9_5: - case PCI_DEVICE_ID_INTEL_TOLAPAI_0: + case PCI_DEVICE_ID_INTEL_EP80579_0: case PCI_DEVICE_ID_INTEL_ICH10_0: case PCI_DEVICE_ID_INTEL_ICH10_1: case PCI_DEVICE_ID_INTEL_ICH10_2: case PCI_DEVICE_ID_INTEL_ICH10_3: + case PCI_DEVICE_ID_INTEL_PATSBURG_LPC: r->name = "PIIX/ICH"; r->get = pirq_piix_get; r->set = pirq_piix_set; return 1; } - if ((device >= PCI_DEVICE_ID_INTEL_PCH_LPC_MIN) && - (device <= PCI_DEVICE_ID_INTEL_PCH_LPC_MAX)) { + if ((device >= PCI_DEVICE_ID_INTEL_5_3400_SERIES_LPC_MIN) && + (device <= PCI_DEVICE_ID_INTEL_5_3400_SERIES_LPC_MAX)) { r->name = "PIIX/ICH"; r->get = pirq_piix_get; r->set = pirq_piix_set; return 1; } - if ((device >= PCI_DEVICE_ID_INTEL_CPT_LPC_MIN) && - (device <= PCI_DEVICE_ID_INTEL_CPT_LPC_MAX)) { + if ((device >= PCI_DEVICE_ID_INTEL_COUGARPOINT_LPC_MIN) && + (device <= PCI_DEVICE_ID_INTEL_COUGARPOINT_LPC_MAX)) { r->name = "PIIX/ICH"; r->get = pirq_piix_get; r->set = pirq_piix_set; return 1; } diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c index a918553ebc75..e282886616a0 100644 --- a/arch/x86/pci/mmconfig-shared.c +++ b/arch/x86/pci/mmconfig-shared.c @@ -65,7 +65,6 @@ static __init struct pci_mmcfg_region *pci_mmconfig_add(int segment, int start, int end, u64 addr) { struct pci_mmcfg_region *new; - int num_buses; struct resource *res; if (addr == 0) @@ -82,10 +81,9 @@ static __init struct pci_mmcfg_region *pci_mmconfig_add(int segment, int start, list_add_sorted(new); - num_buses = end - start + 1; res = &new->res; res->start = addr + PCI_MMCFG_BUS_OFFSET(start); - res->end = addr + PCI_MMCFG_BUS_OFFSET(num_buses) - 1; + res->end = addr + PCI_MMCFG_BUS_OFFSET(end + 1) - 1; res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; snprintf(new->name, PCI_MMCFG_RESOURCE_NAME_LEN, "PCI MMCONFIG %04x [bus %02x-%02x]", segment, start, end); diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c new file mode 100644 index 000000000000..25cd4a07d09f --- /dev/null +++ b/arch/x86/pci/xen.c @@ -0,0 +1,429 @@ +/* + * Xen PCI Frontend Stub - puts some "dummy" functions into the Linux + * x86 PCI core to support the Xen PCI Frontend + * + * Author: Ryan Wilson <hap9@epoch.ncsc.mil> + */ +#include <linux/module.h> +#include <linux/init.h> +#include <linux/pci.h> +#include <linux/acpi.h> + +#include 
<linux/io.h> +#include <asm/io_apic.h> +#include <asm/pci_x86.h> + +#include <asm/xen/hypervisor.h> + +#include <xen/features.h> +#include <xen/events.h> +#include <asm/xen/pci.h> + +#ifdef CONFIG_ACPI +static int xen_hvm_register_pirq(u32 gsi, int triggering) +{ + int rc, irq; + struct physdev_map_pirq map_irq; + int shareable = 0; + char *name; + + if (!xen_hvm_domain()) + return -1; + + map_irq.domid = DOMID_SELF; + map_irq.type = MAP_PIRQ_TYPE_GSI; + map_irq.index = gsi; + map_irq.pirq = -1; + + rc = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq); + if (rc) { + printk(KERN_WARNING "xen map irq failed %d\n", rc); + return -1; + } + + if (triggering == ACPI_EDGE_SENSITIVE) { + shareable = 0; + name = "ioapic-edge"; + } else { + shareable = 1; + name = "ioapic-level"; + } + + irq = xen_map_pirq_gsi(map_irq.pirq, gsi, shareable, name); + + printk(KERN_DEBUG "xen: --> irq=%d, pirq=%d\n", irq, map_irq.pirq); + + return irq; +} + +static int acpi_register_gsi_xen_hvm(struct device *dev, u32 gsi, + int trigger, int polarity) +{ + return xen_hvm_register_pirq(gsi, trigger); +} +#endif + +#if defined(CONFIG_PCI_MSI) +#include <linux/msi.h> +#include <asm/msidef.h> + +struct xen_pci_frontend_ops *xen_pci_frontend; +EXPORT_SYMBOL_GPL(xen_pci_frontend); + +#define XEN_PIRQ_MSI_DATA (MSI_DATA_TRIGGER_EDGE | \ + MSI_DATA_LEVEL_ASSERT | (3 << 8) | MSI_DATA_VECTOR(0)) + +static void xen_msi_compose_msg(struct pci_dev *pdev, unsigned int pirq, + struct msi_msg *msg) +{ + /* We set vector == 0 to tell the hypervisor we don't care about it, + * but we want a pirq setup instead. + * We use the dest_id field to pass the pirq that we want. */ + msg->address_hi = MSI_ADDR_BASE_HI | MSI_ADDR_EXT_DEST_ID(pirq); + msg->address_lo = + MSI_ADDR_BASE_LO | + MSI_ADDR_DEST_MODE_PHYSICAL | + MSI_ADDR_REDIRECTION_CPU | + MSI_ADDR_DEST_ID(pirq); + + msg->data = XEN_PIRQ_MSI_DATA; +} + +static int xen_hvm_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) +{ + int irq, pirq, ret = 0; + struct msi_desc *msidesc; + struct msi_msg msg; + + list_for_each_entry(msidesc, &dev->msi_list, list) { + __read_msi_msg(msidesc, &msg); + pirq = MSI_ADDR_EXT_DEST_ID(msg.address_hi) | + ((msg.address_lo >> MSI_ADDR_DEST_ID_SHIFT) & 0xff); + if (xen_irq_from_pirq(pirq) >= 0 && msg.data == XEN_PIRQ_MSI_DATA) { + xen_allocate_pirq_msi((type == PCI_CAP_ID_MSIX) ? + "msi-x" : "msi", &irq, &pirq, XEN_ALLOC_IRQ); + if (irq < 0) + goto error; + ret = set_irq_msi(irq, msidesc); + if (ret < 0) + goto error_while; + printk(KERN_DEBUG "xen: msi already setup: msi --> irq=%d" + " pirq=%d\n", irq, pirq); + return 0; + } + xen_allocate_pirq_msi((type == PCI_CAP_ID_MSIX) ? 
+ "msi-x" : "msi", &irq, &pirq, (XEN_ALLOC_IRQ | XEN_ALLOC_PIRQ)); + if (irq < 0 || pirq < 0) + goto error; + printk(KERN_DEBUG "xen: msi --> irq=%d, pirq=%d\n", irq, pirq); + xen_msi_compose_msg(dev, pirq, &msg); + ret = set_irq_msi(irq, msidesc); + if (ret < 0) + goto error_while; + write_msi_msg(irq, &msg); + } + return 0; + +error_while: + unbind_from_irqhandler(irq, NULL); +error: + if (ret == -ENODEV) + dev_err(&dev->dev, "Xen PCI frontend has not registered" \ + " MSI/MSI-X support!\n"); + + return ret; +} + +/* + * For MSI interrupts we have to use the drivers/xen/events.c functions + * to allocate an irq_desc and set up the right irq. + */ + +static int xen_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) +{ + int irq, ret, i; + struct msi_desc *msidesc; + int *v; + + v = kzalloc(sizeof(int) * max(1, nvec), GFP_KERNEL); + if (!v) + return -ENOMEM; + + if (type == PCI_CAP_ID_MSIX) + ret = xen_pci_frontend_enable_msix(dev, &v, nvec); + else + ret = xen_pci_frontend_enable_msi(dev, &v); + if (ret) + goto error; + i = 0; + list_for_each_entry(msidesc, &dev->msi_list, list) { + irq = xen_allocate_pirq(v[i], 0, /* not shareable */ + (type == PCI_CAP_ID_MSIX) ? + "pcifront-msi-x" : "pcifront-msi"); + if (irq < 0) { + ret = -1; + goto free; + } + + ret = set_irq_msi(irq, msidesc); + if (ret) + goto error_while; + i++; + } + kfree(v); + return 0; + +error_while: + unbind_from_irqhandler(irq, NULL); +error: + if (ret == -ENODEV) + dev_err(&dev->dev, "Xen PCI frontend has not registered" \ + " MSI/MSI-X support!\n"); +free: + kfree(v); + return ret; +} + +static void xen_teardown_msi_irqs(struct pci_dev *dev) +{ + struct msi_desc *msidesc; + + msidesc = list_entry(dev->msi_list.next, struct msi_desc, list); + if (msidesc->msi_attrib.is_msix) + xen_pci_frontend_disable_msix(dev); + else + xen_pci_frontend_disable_msi(dev); +} + +static void xen_teardown_msi_irq(unsigned int irq) +{ + xen_destroy_irq(irq); +} + +static int xen_initdom_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) +{ + int irq, ret; + struct msi_desc *msidesc; + + list_for_each_entry(msidesc, &dev->msi_list, list) { + irq = xen_create_msi_irq(dev, msidesc, type); + if (irq < 0) + return -1; + + ret = set_irq_msi(irq, msidesc); + if (ret) + goto error; + } + return 0; + +error: + xen_destroy_irq(irq); + return ret; +} +#endif + +static int xen_pcifront_enable_irq(struct pci_dev *dev) +{ + int rc; + int share = 1; + + dev_info(&dev->dev, "Xen PCI enabling IRQ: %d\n", dev->irq); + + if (dev->irq < 0) + return -EINVAL; + + if (dev->irq < NR_IRQS_LEGACY) + share = 0; + + rc = xen_allocate_pirq(dev->irq, share, "pcifront"); + if (rc < 0) { + dev_warn(&dev->dev, "Xen PCI IRQ: %d, failed to register:%d\n", + dev->irq, rc); + return rc; + } + return 0; +} + +int __init pci_xen_init(void) +{ + if (!xen_pv_domain() || xen_initial_domain()) + return -ENODEV; + + printk(KERN_INFO "PCI: setting up Xen PCI frontend stub\n"); + + pcibios_set_cache_line_size(); + + pcibios_enable_irq = xen_pcifront_enable_irq; + pcibios_disable_irq = NULL; + +#ifdef CONFIG_ACPI + /* Keep ACPI out of the picture */ + acpi_noirq = 1; +#endif + +#ifdef CONFIG_PCI_MSI + x86_msi.setup_msi_irqs = xen_setup_msi_irqs; + x86_msi.teardown_msi_irq = xen_teardown_msi_irq; + x86_msi.teardown_msi_irqs = xen_teardown_msi_irqs; +#endif + return 0; +} + +int __init pci_xen_hvm_init(void) +{ + if (!xen_feature(XENFEAT_hvm_pirqs)) + return 0; + +#ifdef CONFIG_ACPI + /* + * We don't want to change the actual ACPI delivery model, + * just how GSIs get registered. 
+ */ + __acpi_register_gsi = acpi_register_gsi_xen_hvm; +#endif + +#ifdef CONFIG_PCI_MSI + x86_msi.setup_msi_irqs = xen_hvm_setup_msi_irqs; + x86_msi.teardown_msi_irq = xen_teardown_msi_irq; +#endif + return 0; +} + +#ifdef CONFIG_XEN_DOM0 +static int xen_register_pirq(u32 gsi, int triggering) +{ + int rc, irq; + struct physdev_map_pirq map_irq; + int shareable = 0; + char *name; + + if (!xen_pv_domain()) + return -1; + + if (triggering == ACPI_EDGE_SENSITIVE) { + shareable = 0; + name = "ioapic-edge"; + } else { + shareable = 1; + name = "ioapic-level"; + } + + irq = xen_allocate_pirq(gsi, shareable, name); + + printk(KERN_DEBUG "xen: --> irq=%d\n", irq); + + if (irq < 0) + goto out; + + map_irq.domid = DOMID_SELF; + map_irq.type = MAP_PIRQ_TYPE_GSI; + map_irq.index = gsi; + map_irq.pirq = irq; + + rc = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq); + if (rc) { + printk(KERN_WARNING "xen map irq failed %d\n", rc); + return -1; + } + +out: + return irq; +} + +static int xen_register_gsi(u32 gsi, int triggering, int polarity) +{ + int rc, irq; + struct physdev_setup_gsi setup_gsi; + + if (!xen_pv_domain()) + return -1; + + printk(KERN_DEBUG "xen: registering gsi %u triggering %d polarity %d\n", + gsi, triggering, polarity); + + irq = xen_register_pirq(gsi, triggering); + + setup_gsi.gsi = gsi; + setup_gsi.triggering = (triggering == ACPI_EDGE_SENSITIVE ? 0 : 1); + setup_gsi.polarity = (polarity == ACPI_ACTIVE_HIGH ? 0 : 1); + + rc = HYPERVISOR_physdev_op(PHYSDEVOP_setup_gsi, &setup_gsi); + if (rc == -EEXIST) + printk(KERN_INFO "Already setup the GSI :%d\n", gsi); + else if (rc) { + printk(KERN_ERR "Failed to setup GSI :%d, err_code:%d\n", + gsi, rc); + } + + return irq; +} + +static __init void xen_setup_acpi_sci(void) +{ + int rc; + int trigger, polarity; + int gsi = acpi_sci_override_gsi; + + if (!gsi) + return; + + rc = acpi_get_override_irq(gsi, &trigger, &polarity); + if (rc) { + printk(KERN_WARNING "xen: acpi_get_override_irq failed for acpi" + " sci, rc=%d\n", rc); + return; + } + trigger = trigger ? ACPI_LEVEL_SENSITIVE : ACPI_EDGE_SENSITIVE; + polarity = polarity ? ACPI_ACTIVE_LOW : ACPI_ACTIVE_HIGH; + + printk(KERN_INFO "xen: sci override: global_irq=%d trigger=%d " + "polarity=%d\n", gsi, trigger, polarity); + + gsi = xen_register_gsi(gsi, trigger, polarity); + printk(KERN_INFO "xen: acpi sci %d\n", gsi); + + return; +} + +static int acpi_register_gsi_xen(struct device *dev, u32 gsi, + int trigger, int polarity) +{ + return xen_register_gsi(gsi, trigger, polarity); +} + +static int __init pci_xen_initial_domain(void) +{ +#ifdef CONFIG_PCI_MSI + x86_msi.setup_msi_irqs = xen_initdom_setup_msi_irqs; + x86_msi.teardown_msi_irq = xen_teardown_msi_irq; +#endif + xen_setup_acpi_sci(); + __acpi_register_gsi = acpi_register_gsi_xen; + + return 0; +} + +void __init xen_setup_pirqs(void) +{ + int irq; + + pci_xen_initial_domain(); + + if (0 == nr_ioapics) { + for (irq = 0; irq < NR_IRQS_LEGACY; irq++) + xen_allocate_pirq(irq, 0, "xt-pic"); + return; + } + + /* Pre-allocate legacy irqs */ + for (irq = 0; irq < NR_IRQS_LEGACY; irq++) { + int trigger, polarity; + + if (acpi_get_override_irq(irq, &trigger, &polarity) == -1) + continue; + + xen_register_pirq(irq, + trigger ? 
ACPI_LEVEL_SENSITIVE : ACPI_EDGE_SENSITIVE); + } +} +#endif diff --git a/arch/x86/platform/Makefile b/arch/x86/platform/Makefile new file mode 100644 index 000000000000..7bf70b812fa2 --- /dev/null +++ b/arch/x86/platform/Makefile @@ -0,0 +1,8 @@ +# Platform specific code goes here +obj-y += efi/ +obj-y += mrst/ +obj-y += olpc/ +obj-y += scx200/ +obj-y += sfi/ +obj-y += visws/ +obj-y += uv/ diff --git a/arch/x86/platform/efi/Makefile b/arch/x86/platform/efi/Makefile new file mode 100644 index 000000000000..73b8be0f3675 --- /dev/null +++ b/arch/x86/platform/efi/Makefile @@ -0,0 +1 @@ +obj-$(CONFIG_EFI) += efi.o efi_$(BITS).o efi_stub_$(BITS).o diff --git a/arch/x86/kernel/efi.c b/arch/x86/platform/efi/efi.c index 0fe27d7c6258..0fe27d7c6258 100644 --- a/arch/x86/kernel/efi.c +++ b/arch/x86/platform/efi/efi.c diff --git a/arch/x86/kernel/efi_32.c b/arch/x86/platform/efi/efi_32.c index 5cab48ee61a4..5cab48ee61a4 100644 --- a/arch/x86/kernel/efi_32.c +++ b/arch/x86/platform/efi/efi_32.c diff --git a/arch/x86/kernel/efi_64.c b/arch/x86/platform/efi/efi_64.c index ac0621a7ac3d..ac0621a7ac3d 100644 --- a/arch/x86/kernel/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c diff --git a/arch/x86/kernel/efi_stub_32.S b/arch/x86/platform/efi/efi_stub_32.S index fbe66e626c09..fbe66e626c09 100644 --- a/arch/x86/kernel/efi_stub_32.S +++ b/arch/x86/platform/efi/efi_stub_32.S diff --git a/arch/x86/kernel/efi_stub_64.S b/arch/x86/platform/efi/efi_stub_64.S index 4c07ccab8146..4c07ccab8146 100644 --- a/arch/x86/kernel/efi_stub_64.S +++ b/arch/x86/platform/efi/efi_stub_64.S diff --git a/arch/x86/platform/mrst/Makefile b/arch/x86/platform/mrst/Makefile new file mode 100644 index 000000000000..efbbc552fa95 --- /dev/null +++ b/arch/x86/platform/mrst/Makefile @@ -0,0 +1 @@ +obj-$(CONFIG_X86_MRST) += mrst.o diff --git a/arch/x86/kernel/mrst.c b/arch/x86/platform/mrst/mrst.c index 79ae68154e87..79ae68154e87 100644 --- a/arch/x86/kernel/mrst.c +++ b/arch/x86/platform/mrst/mrst.c diff --git a/arch/x86/platform/olpc/Makefile b/arch/x86/platform/olpc/Makefile new file mode 100644 index 000000000000..c31b8fcb5a86 --- /dev/null +++ b/arch/x86/platform/olpc/Makefile @@ -0,0 +1,3 @@ +obj-$(CONFIG_OLPC) += olpc.o +obj-$(CONFIG_OLPC_XO1) += olpc-xo1.o +obj-$(CONFIG_OLPC_OPENFIRMWARE) += olpc_ofw.o diff --git a/arch/x86/kernel/olpc-xo1.c b/arch/x86/platform/olpc/olpc-xo1.c index f5442c03abc3..f5442c03abc3 100644 --- a/arch/x86/kernel/olpc-xo1.c +++ b/arch/x86/platform/olpc/olpc-xo1.c diff --git a/arch/x86/kernel/olpc.c b/arch/x86/platform/olpc/olpc.c index edaf3fe8dc5e..edaf3fe8dc5e 100644 --- a/arch/x86/kernel/olpc.c +++ b/arch/x86/platform/olpc/olpc.c diff --git a/arch/x86/kernel/olpc_ofw.c b/arch/x86/platform/olpc/olpc_ofw.c index 787320464379..787320464379 100644 --- a/arch/x86/kernel/olpc_ofw.c +++ b/arch/x86/platform/olpc/olpc_ofw.c diff --git a/arch/x86/platform/scx200/Makefile b/arch/x86/platform/scx200/Makefile new file mode 100644 index 000000000000..762b4c7f4314 --- /dev/null +++ b/arch/x86/platform/scx200/Makefile @@ -0,0 +1,2 @@ +obj-$(CONFIG_SCx200) += scx200.o +scx200-y += scx200_32.o diff --git a/arch/x86/kernel/scx200_32.c b/arch/x86/platform/scx200/scx200_32.c index 7e004acbe526..7e004acbe526 100644 --- a/arch/x86/kernel/scx200_32.c +++ b/arch/x86/platform/scx200/scx200_32.c diff --git a/arch/x86/platform/sfi/Makefile b/arch/x86/platform/sfi/Makefile new file mode 100644 index 000000000000..cc5db1168a5e --- /dev/null +++ b/arch/x86/platform/sfi/Makefile @@ -0,0 +1 @@ +obj-$(CONFIG_SFI) += sfi.o diff --git 
a/arch/x86/kernel/sfi.c b/arch/x86/platform/sfi/sfi.c index dd4c281ffe57..ca54875ac795 100644 --- a/arch/x86/kernel/sfi.c +++ b/arch/x86/platform/sfi/sfi.c @@ -48,9 +48,9 @@ static void __init mp_sfi_register_lapic_address(unsigned long address) /* All CPUs enumerated by SFI must be present and enabled */ static void __cpuinit mp_sfi_register_lapic(u8 id) { - if (MAX_APICS - id <= 0) { + if (MAX_LOCAL_APIC - id <= 0) { pr_warning("Processor #%d invalid (max %d)\n", - id, MAX_APICS); + id, MAX_LOCAL_APIC); return; } diff --git a/arch/x86/platform/uv/Makefile b/arch/x86/platform/uv/Makefile new file mode 100644 index 000000000000..6c40995fefb8 --- /dev/null +++ b/arch/x86/platform/uv/Makefile @@ -0,0 +1 @@ +obj-$(CONFIG_X86_UV) += tlb_uv.o bios_uv.o uv_irq.o uv_sysfs.o uv_time.o diff --git a/arch/x86/kernel/bios_uv.c b/arch/x86/platform/uv/bios_uv.c index 8bc57baaa9ad..8bc57baaa9ad 100644 --- a/arch/x86/kernel/bios_uv.c +++ b/arch/x86/platform/uv/bios_uv.c diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c index 20ea20a39e2a..ba9caa808a9c 100644 --- a/arch/x86/kernel/tlb_uv.c +++ b/arch/x86/platform/uv/tlb_uv.c @@ -1343,8 +1343,8 @@ uv_activation_descriptor_init(int node, int pnode) * each bau_desc is 64 bytes; there are 8 (UV_ITEMS_PER_DESCRIPTOR) * per cpu; and up to 32 (UV_ADP_SIZE) cpu's per uvhub */ - bau_desc = (struct bau_desc *)kmalloc_node(sizeof(struct bau_desc)* - UV_ADP_SIZE*UV_ITEMS_PER_DESCRIPTOR, GFP_KERNEL, node); + bau_desc = kmalloc_node(sizeof(struct bau_desc) * UV_ADP_SIZE + * UV_ITEMS_PER_DESCRIPTOR, GFP_KERNEL, node); BUG_ON(!bau_desc); pa = uv_gpa(bau_desc); /* need the real nasid*/ @@ -1402,9 +1402,9 @@ uv_payload_queue_init(int node, int pnode) struct bau_payload_queue_entry *pqp_malloc; struct bau_control *bcp; - pqp = (struct bau_payload_queue_entry *) kmalloc_node( - (DEST_Q_SIZE + 1) * sizeof(struct bau_payload_queue_entry), - GFP_KERNEL, node); + pqp = kmalloc_node((DEST_Q_SIZE + 1) + * sizeof(struct bau_payload_queue_entry), + GFP_KERNEL, node); BUG_ON(!pqp); pqp_malloc = pqp; @@ -1455,7 +1455,7 @@ static void __init uv_init_uvhub(int uvhub, int vector) * the below initialization can't be in firmware because the * messaging IRQ will be determined by the OS */ - apicid = uvhub_to_first_apicid(uvhub); + apicid = uvhub_to_first_apicid(uvhub) | uv_apicid_hibits; uv_write_global_mmr64(pnode, UVH_BAU_DATA_CONFIG, ((apicid << 32) | vector)); } @@ -1520,8 +1520,7 @@ static void __init uv_init_per_cpu(int nuvhubs) timeout_us = calculate_destination_timeout(); - uvhub_descs = (struct uvhub_desc *) - kmalloc(nuvhubs * sizeof(struct uvhub_desc), GFP_KERNEL); + uvhub_descs = kmalloc(nuvhubs * sizeof(struct uvhub_desc), GFP_KERNEL); memset(uvhub_descs, 0, nuvhubs * sizeof(struct uvhub_desc)); uvhub_mask = kzalloc((nuvhubs+7)/8, GFP_KERNEL); for_each_present_cpu(cpu) { diff --git a/arch/x86/kernel/uv_irq.c b/arch/x86/platform/uv/uv_irq.c index 7b24460917d5..7b24460917d5 100644 --- a/arch/x86/kernel/uv_irq.c +++ b/arch/x86/platform/uv/uv_irq.c diff --git a/arch/x86/kernel/uv_sysfs.c b/arch/x86/platform/uv/uv_sysfs.c index 309c70fb7759..309c70fb7759 100644 --- a/arch/x86/kernel/uv_sysfs.c +++ b/arch/x86/platform/uv/uv_sysfs.c diff --git a/arch/x86/kernel/uv_time.c b/arch/x86/platform/uv/uv_time.c index 56e421bc379b..9daf5d1af9f1 100644 --- a/arch/x86/kernel/uv_time.c +++ b/arch/x86/platform/uv/uv_time.c @@ -89,6 +89,7 @@ static void uv_rtc_send_IPI(int cpu) apicid = cpu_physical_id(cpu); pnode = uv_apicid_to_pnode(apicid); + apicid |= uv_apicid_hibits; val = 
(1UL << UVH_IPI_INT_SEND_SHFT) | (apicid << UVH_IPI_INT_APIC_ID_SHFT) | (X86_PLATFORM_IPI_VECTOR << UVH_IPI_INT_VECTOR_SHFT); @@ -107,6 +108,7 @@ static int uv_intr_pending(int pnode) static int uv_setup_intr(int cpu, u64 expires) { u64 val; + unsigned long apicid = cpu_physical_id(cpu) | uv_apicid_hibits; int pnode = uv_cpu_to_pnode(cpu); uv_write_global_mmr64(pnode, UVH_RTC1_INT_CONFIG, @@ -117,7 +119,7 @@ static int uv_setup_intr(int cpu, u64 expires) UVH_EVENT_OCCURRED0_RTC1_MASK); val = (X86_PLATFORM_IPI_VECTOR << UVH_RTC1_INT_CONFIG_VECTOR_SHFT) | - ((u64)cpu_physical_id(cpu) << UVH_RTC1_INT_CONFIG_APIC_ID_SHFT); + ((u64)apicid << UVH_RTC1_INT_CONFIG_APIC_ID_SHFT); /* Set configuration */ uv_write_global_mmr64(pnode, UVH_RTC1_INT_CONFIG, val); diff --git a/arch/x86/platform/visws/Makefile b/arch/x86/platform/visws/Makefile new file mode 100644 index 000000000000..91bc17ab2fd5 --- /dev/null +++ b/arch/x86/platform/visws/Makefile @@ -0,0 +1 @@ +obj-$(CONFIG_X86_VISWS) += visws_quirks.o diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/platform/visws/visws_quirks.c index 3371bd053b89..632037671746 100644 --- a/arch/x86/kernel/visws_quirks.c +++ b/arch/x86/platform/visws/visws_quirks.c @@ -171,7 +171,7 @@ static void __init MP_processor_info(struct mpc_cpu *m) ver = m->apicver; if ((ver >= 0x14 && m->apicid >= 0xff) || m->apicid >= 0xf) { printk(KERN_ERR "Processor #%d INVALID. (Max ID: %d).\n", - m->apicid, MAX_APICS); + m->apicid, MAX_LOCAL_APIC); return; } diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile index 4a2afa1bac51..b6552b189bcd 100644 --- a/arch/x86/vdso/Makefile +++ b/arch/x86/vdso/Makefile @@ -25,7 +25,7 @@ targets += vdso.so vdso.so.dbg vdso.lds $(vobjs-y) export CPPFLAGS_vdso.lds += -P -C -VDSO_LDFLAGS_vdso.lds = -m elf_x86_64 -Wl,-soname=linux-vdso.so.1 \ +VDSO_LDFLAGS_vdso.lds = -m64 -Wl,-soname=linux-vdso.so.1 \ -Wl,-z,max-page-size=4096 -Wl,-z,common-page-size=4096 $(obj)/vdso.o: $(src)/vdso.S $(obj)/vdso.so @@ -69,7 +69,7 @@ vdso32.so-$(VDSO32-y) += sysenter vdso32-images = $(vdso32.so-y:%=vdso32-%.so) CPPFLAGS_vdso32.lds = $(CPPFLAGS_vdso.lds) -VDSO_LDFLAGS_vdso32.lds = -m elf_i386 -Wl,-soname=linux-gate.so.1 +VDSO_LDFLAGS_vdso32.lds = -m32 -Wl,-soname=linux-gate.so.1 # This makes sure the $(obj) subdirectory exists even though vdso32/ # is not a kbuild sub-make subdirectory. diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index 68128a1b401a..5b54892e4bc3 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig @@ -13,21 +13,28 @@ config XEN kernel to boot in a paravirtualized environment under the Xen hypervisor. +config XEN_DOM0 + def_bool y + depends on XEN && PCI_XEN && SWIOTLB_XEN + depends on X86_LOCAL_APIC && X86_IO_APIC && ACPI && PCI + +# Dummy symbol since people have come to rely on the PRIVILEGED_GUEST +# name in tools. +config XEN_PRIVILEGED_GUEST + def_bool XEN_DOM0 + config XEN_PVHVM def_bool y depends on XEN depends on X86_LOCAL_APIC config XEN_MAX_DOMAIN_MEMORY - int "Maximum allowed size of a domain in gigabytes" - default 8 if X86_32 - default 32 if X86_64 + int + default 128 depends on XEN help - The pseudo-physical to machine address array is sized - according to the maximum possible memory size of a Xen - domain. This array uses 1 page per gigabyte, so there's no - need to be too stingy here. + This only affects the sizing of some bss arrays, the unused + portions of which are freed. 
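The new fixed default is easy to sanity-check against the help text above. On a 64-bit build (4 KiB pages, 8-byte pointers) one mid-level page of the p2m tree introduced later in this series covers 512 * 512 guest pages, so the RESERVE_BRK(p2m_mid*, ...) reservations that are sized from MAX_DOMAIN_PAGES stay small even at the 128 GB ceiling. A minimal userspace C sketch of the arithmetic follows; the constant names mirror the kernel's, but the program itself is illustrative only, not kernel code:

#include <stdio.h>

#define GB (1UL << 30)
#define PAGE_SIZE 4096UL
/* 512 pointers per page on 64-bit; it would be 1024 on a 32-bit build */
#define PTRS_PER_PAGE (PAGE_SIZE / sizeof(unsigned long))

int main(void)
{
	unsigned long max_mem = 128 * GB;	/* CONFIG_XEN_MAX_DOMAIN_MEMORY */
	unsigned long max_domain_pages = max_mem / PAGE_SIZE;
	/* one mid page covers PTRS_PER_PAGE leaves of PTRS_PER_PAGE pfns each */
	unsigned long mid_pages =
		max_domain_pages / (PTRS_PER_PAGE * PTRS_PER_PAGE);

	printf("MAX_DOMAIN_PAGES      = %lu\n", max_domain_pages);	/* 33554432 */
	printf("pages per mid array   = %lu (%lu KiB)\n",
	       mid_pages, mid_pages * PAGE_SIZE / 1024);		/* 128, 512 */
	printf("p2m_mid + p2m_mid_mfn = %lu KiB\n",
	       2 * mid_pages * PAGE_SIZE / 1024);			/* 1024 */
	return 0;
}

So the fixed 128 GB bound costs roughly 1 MiB of brk for the two mid-level arrays, and the unused portions are freed, which is why the option no longer needs to be user-tunable.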
config XEN_SAVE_RESTORE bool diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 63b83ceebd1a..44dcad43989d 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -46,6 +46,7 @@ #include <asm/paravirt.h> #include <asm/apic.h> #include <asm/page.h> +#include <asm/xen/pci.h> #include <asm/xen/hypercall.h> #include <asm/xen/hypervisor.h> #include <asm/fixmap.h> @@ -59,7 +60,6 @@ #include <asm/pgtable.h> #include <asm/tlbflush.h> #include <asm/reboot.h> -#include <asm/setup.h> #include <asm/stackprotector.h> #include <asm/hypervisor.h> @@ -75,6 +75,11 @@ DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info); enum xen_domain_type xen_domain_type = XEN_NATIVE; EXPORT_SYMBOL_GPL(xen_domain_type); +unsigned long *machine_to_phys_mapping = (void *)MACH2PHYS_VIRT_START; +EXPORT_SYMBOL(machine_to_phys_mapping); +unsigned int machine_to_phys_order; +EXPORT_SYMBOL(machine_to_phys_order); + struct start_info *xen_start_info; EXPORT_SYMBOL_GPL(xen_start_info); @@ -136,9 +141,6 @@ static void xen_vcpu_setup(int cpu) info.mfn = arbitrary_virt_to_mfn(vcpup); info.offset = offset_in_page(vcpup); - printk(KERN_DEBUG "trying to map vcpu_info %d at %p, mfn %llx, offset %d\n", - cpu, vcpup, info.mfn, info.offset); - /* Check to see if the hypervisor will put the vcpu_info structure where we want it, which allows direct access via a percpu-variable. */ @@ -152,9 +154,6 @@ static void xen_vcpu_setup(int cpu) /* This cpu is using the registered vcpu info, even if later ones fail to. */ per_cpu(xen_vcpu, cpu) = vcpup; - - printk(KERN_DEBUG "cpu %d using vcpu_info at %p\n", - cpu, vcpup); } } @@ -243,6 +242,7 @@ static __init void xen_init_cpuid_mask(void) cpuid_leaf1_edx_mask = ~((1 << X86_FEATURE_MCE) | /* disable MCE */ (1 << X86_FEATURE_MCA) | /* disable MCA */ + (1 << X86_FEATURE_MTRR) | /* disable MTRR */ (1 << X86_FEATURE_ACC)); /* thermal monitoring */ if (!xen_initial_domain()) @@ -836,6 +836,11 @@ static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high) Xen console noise. 
*/ break; + case MSR_IA32_CR_PAT: + if (smp_processor_id() == 0) + xen_set_pat(((u64)high << 32) | low); + break; + default: ret = native_write_msr_safe(msr, low, high); } @@ -874,8 +879,6 @@ void xen_setup_vcpu_info_placement(void) /* xen_vcpu_setup managed to place the vcpu_info within the percpu area for all cpus, so make use of it */ if (have_vcpu_info_placement) { - printk(KERN_INFO "Xen: using vcpu_info placement\n"); - pv_irq_ops.save_fl = __PV_IS_CALLEE_SAVE(xen_save_fl_direct); pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(xen_restore_fl_direct); pv_irq_ops.irq_disable = __PV_IS_CALLEE_SAVE(xen_irq_disable_direct); @@ -1018,10 +1021,6 @@ static void xen_reboot(int reason) { struct sched_shutdown r = { .reason = reason }; -#ifdef CONFIG_SMP - smp_send_stop(); -#endif - if (HYPERVISOR_sched_op(SCHEDOP_shutdown, &r)) BUG(); } @@ -1092,6 +1091,8 @@ static void __init xen_setup_stackprotector(void) /* First C function to be called on Xen boot */ asmlinkage void __init xen_start_kernel(void) { + struct physdev_set_iopl set_iopl; + int rc; pgd_t *pgd; if (!xen_start_info) @@ -1099,6 +1100,8 @@ asmlinkage void __init xen_start_kernel(void) xen_domain_type = XEN_PV_DOMAIN; + xen_setup_machphys_mapping(); + /* Install Xen paravirt ops */ pv_info = xen_info; pv_init_ops = xen_init_ops; @@ -1188,8 +1191,10 @@ asmlinkage void __init xen_start_kernel(void) xen_raw_console_write("mapping kernel into physical memory\n"); pgd = xen_setup_kernel_pagetable(pgd, xen_start_info->nr_pages); + xen_ident_map_ISA(); - init_mm.pgd = pgd; + /* Allocate and initialize top and mid mfn levels for p2m structure */ + xen_build_mfn_list_list(); /* keep using Xen gdt for now; no urgent need to change it */ @@ -1200,10 +1205,18 @@ asmlinkage void __init xen_start_kernel(void) #else pv_info.kernel_rpl = 0; #endif - /* set the limit of our address space */ xen_reserve_top(); + /* We used to do this in xen_arch_setup, but that is too late on AMD + * where early_cpu_init (run before ->arch_setup()) calls early_amd_init + * which pokes the 0xcf8 port. + */ + set_iopl.iopl = 1; + rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl); + if (rc != 0) + xen_raw_printk("physdev_op failed %d\n", rc); + #ifdef CONFIG_X86_32 /* set up basic CPUID stuff */ cpu_detect(&new_cpu_data); @@ -1223,6 +1236,8 @@ asmlinkage void __init xen_start_kernel(void) add_preferred_console("xenboot", 0, NULL); add_preferred_console("tty", 0, NULL); add_preferred_console("hvc", 0, NULL); + if (pci_xen) + x86_init.pci.arch_init = pci_xen_init; } else { /* Make sure ACS will be enabled */ pci_request_acs(); diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index f72d18c69221..44924e551fde 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -57,6 +57,7 @@ #include <asm/linkage.h> #include <asm/page.h> #include <asm/init.h> +#include <asm/pat.h> #include <asm/xen/hypercall.h> #include <asm/xen/hypervisor.h> @@ -140,7 +141,8 @@ static inline void check_zero(void) * large enough to allocate page table pages to allocate the rest. * Each page can map 2MB. 
*/ -static pte_t level1_ident_pgt[PTRS_PER_PTE * 4] __page_aligned_bss; +#define LEVEL1_IDENT_ENTRIES (PTRS_PER_PTE * 4) +static RESERVE_BRK_ARRAY(pte_t, level1_ident_pgt, LEVEL1_IDENT_ENTRIES); #ifdef CONFIG_X86_64 /* l3 pud for userspace vsyscall mapping */ @@ -171,49 +173,182 @@ DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */ */ #define USER_LIMIT ((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK) +/* + * Xen leaves the responsibility for maintaining p2m mappings to the + * guests themselves, but it must also access and update the p2m array + * during suspend/resume when all the pages are reallocated. + * + * The p2m table is logically a flat array, but we implement it as a + * three-level tree to allow the address space to be sparse. + * + * Xen + * | + * p2m_top p2m_top_mfn + * / \ / \ + * p2m_mid p2m_mid p2m_mid_mfn p2m_mid_mfn + * / \ / \ / / + * p2m p2m p2m p2m p2m p2m p2m ... + * + * The p2m_mid_mfn pages are mapped by p2m_top_mfn_p. + * + * The p2m_top and p2m_top_mfn levels are limited to 1 page, so the + * maximum representable pseudo-physical address space is: + * P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE pages + * + * P2M_PER_PAGE depends on the architecture, as a mfn is always + * unsigned long (8 bytes on 64-bit, 4 bytes on 32), leading to + * 512 and 1024 entries respectively. + */ -#define P2M_ENTRIES_PER_PAGE (PAGE_SIZE / sizeof(unsigned long)) -#define TOP_ENTRIES (MAX_DOMAIN_PAGES / P2M_ENTRIES_PER_PAGE) +unsigned long xen_max_p2m_pfn __read_mostly; -/* Placeholder for holes in the address space */ -static unsigned long p2m_missing[P2M_ENTRIES_PER_PAGE] __page_aligned_data = - { [ 0 ... P2M_ENTRIES_PER_PAGE-1 ] = ~0UL }; +#define P2M_PER_PAGE (PAGE_SIZE / sizeof(unsigned long)) +#define P2M_MID_PER_PAGE (PAGE_SIZE / sizeof(unsigned long *)) +#define P2M_TOP_PER_PAGE (PAGE_SIZE / sizeof(unsigned long **)) - /* Array of pointers to pages containing p2m entries */ -static unsigned long *p2m_top[TOP_ENTRIES] __page_aligned_data = - { [ 0 ... 
TOP_ENTRIES - 1] = &p2m_missing[0] }; +#define MAX_P2M_PFN (P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE) -/* Arrays of p2m arrays expressed in mfns used for save/restore */ -static unsigned long p2m_top_mfn[TOP_ENTRIES] __page_aligned_bss; +/* Placeholders for holes in the address space */ +static RESERVE_BRK_ARRAY(unsigned long, p2m_missing, P2M_PER_PAGE); +static RESERVE_BRK_ARRAY(unsigned long *, p2m_mid_missing, P2M_MID_PER_PAGE); +static RESERVE_BRK_ARRAY(unsigned long, p2m_mid_missing_mfn, P2M_MID_PER_PAGE); -static unsigned long p2m_top_mfn_list[TOP_ENTRIES / P2M_ENTRIES_PER_PAGE] - __page_aligned_bss; +static RESERVE_BRK_ARRAY(unsigned long **, p2m_top, P2M_TOP_PER_PAGE); +static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn, P2M_TOP_PER_PAGE); +static RESERVE_BRK_ARRAY(unsigned long *, p2m_top_mfn_p, P2M_TOP_PER_PAGE); + +RESERVE_BRK(p2m_mid, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE))); +RESERVE_BRK(p2m_mid_mfn, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE))); static inline unsigned p2m_top_index(unsigned long pfn) { - BUG_ON(pfn >= MAX_DOMAIN_PAGES); - return pfn / P2M_ENTRIES_PER_PAGE; + BUG_ON(pfn >= MAX_P2M_PFN); + return pfn / (P2M_MID_PER_PAGE * P2M_PER_PAGE); +} + +static inline unsigned p2m_mid_index(unsigned long pfn) +{ + return (pfn / P2M_PER_PAGE) % P2M_MID_PER_PAGE; } static inline unsigned p2m_index(unsigned long pfn) { - return pfn % P2M_ENTRIES_PER_PAGE; + return pfn % P2M_PER_PAGE; } -/* Build the parallel p2m_top_mfn structures */ +static void p2m_top_init(unsigned long ***top) +{ + unsigned i; + + for (i = 0; i < P2M_TOP_PER_PAGE; i++) + top[i] = p2m_mid_missing; +} + +static void p2m_top_mfn_init(unsigned long *top) +{ + unsigned i; + + for (i = 0; i < P2M_TOP_PER_PAGE; i++) + top[i] = virt_to_mfn(p2m_mid_missing_mfn); +} + +static void p2m_top_mfn_p_init(unsigned long **top) +{ + unsigned i; + + for (i = 0; i < P2M_TOP_PER_PAGE; i++) + top[i] = p2m_mid_missing_mfn; +} + +static void p2m_mid_init(unsigned long **mid) +{ + unsigned i; + + for (i = 0; i < P2M_MID_PER_PAGE; i++) + mid[i] = p2m_missing; +} + +static void p2m_mid_mfn_init(unsigned long *mid) +{ + unsigned i; + + for (i = 0; i < P2M_MID_PER_PAGE; i++) + mid[i] = virt_to_mfn(p2m_missing); +} + +static void p2m_init(unsigned long *p2m) +{ + unsigned i; + + for (i = 0; i < P2M_MID_PER_PAGE; i++) + p2m[i] = INVALID_P2M_ENTRY; +} + +/* + * Build the parallel p2m_top_mfn and p2m_mid_mfn structures + * + * This is called both at boot time, and after resuming from suspend: + * - At boot time we're called very early, and must use extend_brk() + * to allocate memory. + * + * - After resume we're called from within stop_machine, but the mfn + * tree should already be completely allocated. 
+ */ void xen_build_mfn_list_list(void) { - unsigned pfn, idx; + unsigned long pfn; - for (pfn = 0; pfn < MAX_DOMAIN_PAGES; pfn += P2M_ENTRIES_PER_PAGE) { - unsigned topidx = p2m_top_index(pfn); + /* Pre-initialize p2m_top_mfn to be completely missing */ + if (p2m_top_mfn == NULL) { + p2m_mid_missing_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE); + p2m_mid_mfn_init(p2m_mid_missing_mfn); - p2m_top_mfn[topidx] = virt_to_mfn(p2m_top[topidx]); + p2m_top_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE); + p2m_top_mfn_p_init(p2m_top_mfn_p); + + p2m_top_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE); + p2m_top_mfn_init(p2m_top_mfn); + } else { + /* Reinitialise, mfn's all change after migration */ + p2m_mid_mfn_init(p2m_mid_missing_mfn); } - for (idx = 0; idx < ARRAY_SIZE(p2m_top_mfn_list); idx++) { - unsigned topidx = idx * P2M_ENTRIES_PER_PAGE; - p2m_top_mfn_list[idx] = virt_to_mfn(&p2m_top_mfn[topidx]); + for (pfn = 0; pfn < xen_max_p2m_pfn; pfn += P2M_PER_PAGE) { + unsigned topidx = p2m_top_index(pfn); + unsigned mididx = p2m_mid_index(pfn); + unsigned long **mid; + unsigned long *mid_mfn_p; + + mid = p2m_top[topidx]; + mid_mfn_p = p2m_top_mfn_p[topidx]; + + /* Don't bother allocating any mfn mid levels if + * they're just missing, just update the stored mfn, + * since all could have changed over a migrate. + */ + if (mid == p2m_mid_missing) { + BUG_ON(mididx); + BUG_ON(mid_mfn_p != p2m_mid_missing_mfn); + p2m_top_mfn[topidx] = virt_to_mfn(p2m_mid_missing_mfn); + pfn += (P2M_MID_PER_PAGE - 1) * P2M_PER_PAGE; + continue; + } + + if (mid_mfn_p == p2m_mid_missing_mfn) { + /* + * XXX boot-time only! We should never find + * missing parts of the mfn tree after + * runtime. extend_brk() will BUG if we call + * it too late. + */ + mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE); + p2m_mid_mfn_init(mid_mfn_p); + + p2m_top_mfn_p[topidx] = mid_mfn_p; + } + + p2m_top_mfn[topidx] = virt_to_mfn(mid_mfn_p); + mid_mfn_p[mididx] = virt_to_mfn(mid[mididx]); } } @@ -222,8 +357,8 @@ void xen_setup_mfn_list_list(void) BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info); HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list = - virt_to_mfn(p2m_top_mfn_list); - HYPERVISOR_shared_info->arch.max_pfn = xen_start_info->nr_pages; + virt_to_mfn(p2m_top_mfn); + HYPERVISOR_shared_info->arch.max_pfn = xen_max_p2m_pfn; } /* Set up p2m_top to point to the domain-builder provided p2m pages */ @@ -231,98 +366,176 @@ void __init xen_build_dynamic_phys_to_machine(void) { unsigned long *mfn_list = (unsigned long *)xen_start_info->mfn_list; unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages); - unsigned pfn; + unsigned long pfn; + + xen_max_p2m_pfn = max_pfn; - for (pfn = 0; pfn < max_pfn; pfn += P2M_ENTRIES_PER_PAGE) { + p2m_missing = extend_brk(PAGE_SIZE, PAGE_SIZE); + p2m_init(p2m_missing); + + p2m_mid_missing = extend_brk(PAGE_SIZE, PAGE_SIZE); + p2m_mid_init(p2m_mid_missing); + + p2m_top = extend_brk(PAGE_SIZE, PAGE_SIZE); + p2m_top_init(p2m_top); + + /* + * The domain builder gives us a pre-constructed p2m array in + * mfn_list for all the pages initially given to us, so we just + * need to graft that into our tree structure. 
+ */ + for (pfn = 0; pfn < max_pfn; pfn += P2M_PER_PAGE) { unsigned topidx = p2m_top_index(pfn); + unsigned mididx = p2m_mid_index(pfn); - p2m_top[topidx] = &mfn_list[pfn]; - } + if (p2m_top[topidx] == p2m_mid_missing) { + unsigned long **mid = extend_brk(PAGE_SIZE, PAGE_SIZE); + p2m_mid_init(mid); - xen_build_mfn_list_list(); + p2m_top[topidx] = mid; + } + + p2m_top[topidx][mididx] = &mfn_list[pfn]; + } } unsigned long get_phys_to_machine(unsigned long pfn) { - unsigned topidx, idx; + unsigned topidx, mididx, idx; - if (unlikely(pfn >= MAX_DOMAIN_PAGES)) + if (unlikely(pfn >= MAX_P2M_PFN)) return INVALID_P2M_ENTRY; topidx = p2m_top_index(pfn); + mididx = p2m_mid_index(pfn); idx = p2m_index(pfn); - return p2m_top[topidx][idx]; + + return p2m_top[topidx][mididx][idx]; } EXPORT_SYMBOL_GPL(get_phys_to_machine); -/* install a new p2m_top page */ -bool install_p2mtop_page(unsigned long pfn, unsigned long *p) +static void *alloc_p2m_page(void) { - unsigned topidx = p2m_top_index(pfn); - unsigned long **pfnp, *mfnp; - unsigned i; + return (void *)__get_free_page(GFP_KERNEL | __GFP_REPEAT); +} - pfnp = &p2m_top[topidx]; - mfnp = &p2m_top_mfn[topidx]; +static void free_p2m_page(void *p) +{ + free_page((unsigned long)p); +} + +/* + * Fully allocate the p2m structure for a given pfn. We need to check + * that both the top and mid levels are allocated, and make sure the + * parallel mfn tree is kept in sync. We may race with other cpus, so + * the new pages are installed with cmpxchg; if we lose the race then + * simply free the page we allocated and use the one that's there. + */ +static bool alloc_p2m(unsigned long pfn) +{ + unsigned topidx, mididx; + unsigned long ***top_p, **mid; + unsigned long *top_mfn_p, *mid_mfn; - for (i = 0; i < P2M_ENTRIES_PER_PAGE; i++) - p[i] = INVALID_P2M_ENTRY; + topidx = p2m_top_index(pfn); + mididx = p2m_mid_index(pfn); - if (cmpxchg(pfnp, p2m_missing, p) == p2m_missing) { - *mfnp = virt_to_mfn(p); - return true; + top_p = &p2m_top[topidx]; + mid = *top_p; + + if (mid == p2m_mid_missing) { + /* Mid level is missing, allocate a new one */ + mid = alloc_p2m_page(); + if (!mid) + return false; + + p2m_mid_init(mid); + + if (cmpxchg(top_p, p2m_mid_missing, mid) != p2m_mid_missing) + free_p2m_page(mid); } - return false; -} + top_mfn_p = &p2m_top_mfn[topidx]; + mid_mfn = p2m_top_mfn_p[topidx]; -static void alloc_p2m(unsigned long pfn) -{ - unsigned long *p; + BUG_ON(virt_to_mfn(mid_mfn) != *top_mfn_p); + + if (mid_mfn == p2m_mid_missing_mfn) { + /* Separately check the mid mfn level */ + unsigned long missing_mfn; + unsigned long mid_mfn_mfn; + + mid_mfn = alloc_p2m_page(); + if (!mid_mfn) + return false; + + p2m_mid_mfn_init(mid_mfn); + + missing_mfn = virt_to_mfn(p2m_mid_missing_mfn); + mid_mfn_mfn = virt_to_mfn(mid_mfn); + if (cmpxchg(top_mfn_p, missing_mfn, mid_mfn_mfn) != missing_mfn) + free_p2m_page(mid_mfn); + else + p2m_top_mfn_p[topidx] = mid_mfn; + } + + if (p2m_top[topidx][mididx] == p2m_missing) { + /* p2m leaf page is missing */ + unsigned long *p2m; + + p2m = alloc_p2m_page(); + if (!p2m) + return false; - p = (void *)__get_free_page(GFP_KERNEL | __GFP_NOFAIL); - BUG_ON(p == NULL); + p2m_init(p2m); - if (!install_p2mtop_page(pfn, p)) - free_page((unsigned long)p); + if (cmpxchg(&mid[mididx], p2m_missing, p2m) != p2m_missing) + free_p2m_page(p2m); + else + mid_mfn[mididx] = virt_to_mfn(p2m); + } + + return true; } /* Try to install p2m mapping; fail if intermediate bits missing */ bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn) { - unsigned 
topidx, idx; + unsigned topidx, mididx, idx; - if (unlikely(pfn >= MAX_DOMAIN_PAGES)) { + if (unlikely(pfn >= MAX_P2M_PFN)) { BUG_ON(mfn != INVALID_P2M_ENTRY); return true; } topidx = p2m_top_index(pfn); - if (p2m_top[topidx] == p2m_missing) { - if (mfn == INVALID_P2M_ENTRY) - return true; - return false; - } - + mididx = p2m_mid_index(pfn); idx = p2m_index(pfn); - p2m_top[topidx][idx] = mfn; + + if (p2m_top[topidx][mididx] == p2m_missing) + return mfn == INVALID_P2M_ENTRY; + + p2m_top[topidx][mididx][idx] = mfn; return true; } -void set_phys_to_machine(unsigned long pfn, unsigned long mfn) +bool set_phys_to_machine(unsigned long pfn, unsigned long mfn) { if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) { BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY); - return; + return true; } if (unlikely(!__set_phys_to_machine(pfn, mfn))) { - alloc_p2m(pfn); + if (!alloc_p2m(pfn)) + return false; if (!__set_phys_to_machine(pfn, mfn)) - BUG(); + return false; } + + return true; } unsigned long arbitrary_virt_to_mfn(void *vaddr) @@ -399,7 +612,7 @@ static bool xen_iomap_pte(pte_t pte) return pte_flags(pte) & _PAGE_IOMAP; } -static void xen_set_iomap_pte(pte_t *ptep, pte_t pteval) +void xen_set_domain_pte(pte_t *ptep, pte_t pteval, unsigned domid) { struct multicall_space mcs; struct mmu_update *u; @@ -411,10 +624,16 @@ static void xen_set_iomap_pte(pte_t *ptep, pte_t pteval) u->ptr = arbitrary_virt_to_machine(ptep).maddr; u->val = pte_val_ma(pteval); - MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_IO); + MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, domid); xen_mc_issue(PARAVIRT_LAZY_MMU); } +EXPORT_SYMBOL_GPL(xen_set_domain_pte); + +static void xen_set_iomap_pte(pte_t *ptep, pte_t pteval) +{ + xen_set_domain_pte(ptep, pteval, DOMID_IO); +} static void xen_extend_mmu_update(const struct mmu_update *update) { @@ -561,7 +780,20 @@ static pteval_t pte_pfn_to_mfn(pteval_t val) if (val & _PAGE_PRESENT) { unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT; pteval_t flags = val & PTE_FLAGS_MASK; - val = ((pteval_t)pfn_to_mfn(pfn) << PAGE_SHIFT) | flags; + unsigned long mfn = pfn_to_mfn(pfn); + + /* + * If there's no mfn for the pfn, then just create an + * empty non-present pte. Unfortunately this loses + * information about the original pfn, so + * pte_mfn_to_pfn is asymmetric. + */ + if (unlikely(mfn == INVALID_P2M_ENTRY)) { + mfn = 0; + flags = 0; + } + + val = ((pteval_t)mfn << PAGE_SHIFT) | flags; } return val; @@ -583,10 +815,18 @@ static pteval_t iomap_pte(pteval_t val) pteval_t xen_pte_val(pte_t pte) { - if (xen_initial_domain() && (pte.pte & _PAGE_IOMAP)) - return pte.pte; + pteval_t pteval = pte.pte; + + /* If this is a WC pte, convert back from Xen WC to Linux WC */ + if ((pteval & (_PAGE_PAT | _PAGE_PCD | _PAGE_PWT)) == _PAGE_PAT) { + WARN_ON(!pat_enabled); + pteval = (pteval & ~_PAGE_PAT) | _PAGE_PWT; + } - return pte_mfn_to_pfn(pte.pte); + if (xen_initial_domain() && (pteval & _PAGE_IOMAP)) + return pteval; + + return pte_mfn_to_pfn(pteval); } PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val); @@ -596,10 +836,48 @@ pgdval_t xen_pgd_val(pgd_t pgd) } PV_CALLEE_SAVE_REGS_THUNK(xen_pgd_val); +/* + * Xen's PAT setup is part of its ABI, though I assume entries 6 & 7 + * are reserved for now, to correspond to the Intel-reserved PAT + * types. 
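/*
 * [Illustrative sketch by the editor, not part of the patch.] The
 * lockless install-with-cmpxchg pattern that alloc_p2m() above applies
 * at each level, reduced to a single level with hypothetical names.
 * Losing the race is harmless: free our copy and keep whichever page
 * won.
 */
static bool install_level_sketch(unsigned long **slot,
				 unsigned long *placeholder)
{
	unsigned long *page = (unsigned long *)__get_free_page(GFP_KERNEL);

	if (!page)
		return false;

	p2m_init(page);		/* every entry starts as INVALID_P2M_ENTRY */

	if (cmpxchg(slot, placeholder, page) != placeholder)
		free_page((unsigned long)page);	/* another cpu beat us to it */

	return true;
}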
+ * + * We expect Linux's PAT set as follows: + * + * Idx PTE flags Linux Xen Default + * 0 WB WB WB + * 1 PWT WC WT WT + * 2 PCD UC- UC- UC- + * 3 PCD PWT UC UC UC + * 4 PAT WB WC WB + * 5 PAT PWT WC WP WT + * 6 PAT PCD UC- UC UC- + * 7 PAT PCD PWT UC UC UC + */ + +void xen_set_pat(u64 pat) +{ + /* We expect Linux to use a PAT setting of + * UC UC- WC WB (ignoring the PAT flag) */ + WARN_ON(pat != 0x0007010600070106ull); +} + pte_t xen_make_pte(pteval_t pte) { phys_addr_t addr = (pte & PTE_PFN_MASK); + /* If Linux is trying to set a WC pte, then map to the Xen WC. + * If _PAGE_PAT is set, then it probably means it is really + * _PAGE_PSE, so avoid fiddling with the PAT mapping and hope + * things work out OK... + * + * (We should never see kernel mappings with _PAGE_PSE set, + * but we could see hugetlbfs mappings, I think.) + */ + if (pat_enabled && !WARN_ON(pte & _PAGE_PAT)) { + if ((pte & (_PAGE_PCD | _PAGE_PWT)) == _PAGE_PWT) + pte = (pte & ~(_PAGE_PCD | _PAGE_PWT)) | _PAGE_PAT; + } + /* * Unprivileged domains are allowed to do IOMAPpings for * PCI passthrough, but not map ISA space. The ISA @@ -1697,6 +1975,7 @@ static void *m2v(phys_addr_t maddr) return __ka(m2p(maddr)); } +/* Set the page permissions on identity-mapped pages */ static void set_page_prot(void *addr, pgprot_t prot) { unsigned long pfn = __pa(addr) >> PAGE_SHIFT; @@ -1712,6 +1991,9 @@ static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn) unsigned ident_pte; unsigned long pfn; + level1_ident_pgt = extend_brk(sizeof(pte_t) * LEVEL1_IDENT_ENTRIES, + PAGE_SIZE); + ident_pte = 0; pfn = 0; for (pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) { @@ -1722,7 +2004,7 @@ static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn) pte_page = m2v(pmd[pmdidx].pmd); else { /* Check for free pte pages */ - if (ident_pte == ARRAY_SIZE(level1_ident_pgt)) + if (ident_pte == LEVEL1_IDENT_ENTRIES) break; pte_page = &level1_ident_pgt[ident_pte]; @@ -1752,6 +2034,20 @@ static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn) set_page_prot(pmd, PAGE_KERNEL_RO); } +void __init xen_setup_machphys_mapping(void) +{ + struct xen_machphys_mapping mapping; + unsigned long machine_to_phys_nr_ents; + + if (HYPERVISOR_memory_op(XENMEM_machphys_mapping, &mapping) == 0) { + machine_to_phys_mapping = (unsigned long *)mapping.v_start; + machine_to_phys_nr_ents = mapping.max_mfn + 1; + } else { + machine_to_phys_nr_ents = MACH2PHYS_NR_ENTRIES; + } + machine_to_phys_order = fls(machine_to_phys_nr_ents - 1); +} + #ifdef CONFIG_X86_64 static void convert_pfn_mfn(void *v) { @@ -1837,45 +2133,88 @@ __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, return pgd; } #else /* !CONFIG_X86_64 */ -static pmd_t level2_kernel_pgt[PTRS_PER_PMD] __page_aligned_bss; +static RESERVE_BRK_ARRAY(pmd_t, initial_kernel_pmd, PTRS_PER_PMD); +static RESERVE_BRK_ARRAY(pmd_t, swapper_kernel_pmd, PTRS_PER_PMD); + +static __init void xen_write_cr3_init(unsigned long cr3) +{ + unsigned long pfn = PFN_DOWN(__pa(swapper_pg_dir)); + + BUG_ON(read_cr3() != __pa(initial_page_table)); + BUG_ON(cr3 != __pa(swapper_pg_dir)); + + /* + * We are switching to swapper_pg_dir for the first time (from + * initial_page_table) and therefore need to mark that page + * read-only and then pin it. + * + * Xen disallows sharing of kernel PMDs for PAE + * guests. Therefore we must copy the kernel PMD from + * initial_page_table into a new kernel PMD to be used in + * swapper_pg_dir.
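/*
 * [Illustrative sketch by the editor, not part of the patch.] Decoding
 * the PAT MSR value that xen_set_pat() above sanity-checks. Each byte
 * selects the memory type for one PAT index (0x00 UC, 0x01 WC, 0x04 WT,
 * 0x05 WP, 0x06 WB, 0x07 UC-), so 0x0007010600070106 reads, from index
 * 0 to 7: WB WC UC- UC WB WC UC- UC -- WC in slot 1, as in the table
 * above. Hypothetical helper name.
 */
static const char *pat_type_sketch(u64 pat, unsigned idx)
{
	switch ((pat >> (idx * 8)) & 0xff) {
	case 0x00: return "UC";
	case 0x01: return "WC";
	case 0x04: return "WT";
	case 0x05: return "WP";
	case 0x06: return "WB";
	case 0x07: return "UC-";
	default: return "reserved";
	}
}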
+ */ + swapper_kernel_pmd = + extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE); + memcpy(swapper_kernel_pmd, initial_kernel_pmd, + sizeof(pmd_t) * PTRS_PER_PMD); + swapper_pg_dir[KERNEL_PGD_BOUNDARY] = + __pgd(__pa(swapper_kernel_pmd) | _PAGE_PRESENT); + set_page_prot(swapper_kernel_pmd, PAGE_KERNEL_RO); + + set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO); + xen_write_cr3(cr3); + pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, pfn); + + pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, + PFN_DOWN(__pa(initial_page_table))); + set_page_prot(initial_page_table, PAGE_KERNEL); + set_page_prot(initial_kernel_pmd, PAGE_KERNEL); + + pv_mmu_ops.write_cr3 = &xen_write_cr3; +} __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) { pmd_t *kernel_pmd; + initial_kernel_pmd = + extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE); + max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->pt_base) + xen_start_info->nr_pt_frames * PAGE_SIZE + 512*1024); kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd); - memcpy(level2_kernel_pgt, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD); + memcpy(initial_kernel_pmd, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD); - xen_map_identity_early(level2_kernel_pgt, max_pfn); + xen_map_identity_early(initial_kernel_pmd, max_pfn); - memcpy(swapper_pg_dir, pgd, sizeof(pgd_t) * PTRS_PER_PGD); - set_pgd(&swapper_pg_dir[KERNEL_PGD_BOUNDARY], - __pgd(__pa(level2_kernel_pgt) | _PAGE_PRESENT)); + memcpy(initial_page_table, pgd, sizeof(pgd_t) * PTRS_PER_PGD); + initial_page_table[KERNEL_PGD_BOUNDARY] = + __pgd(__pa(initial_kernel_pmd) | _PAGE_PRESENT); - set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO); - set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO); + set_page_prot(initial_kernel_pmd, PAGE_KERNEL_RO); + set_page_prot(initial_page_table, PAGE_KERNEL_RO); set_page_prot(empty_zero_page, PAGE_KERNEL_RO); pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd))); - xen_write_cr3(__pa(swapper_pg_dir)); - - pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(swapper_pg_dir))); + pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, + PFN_DOWN(__pa(initial_page_table))); + xen_write_cr3(__pa(initial_page_table)); memblock_x86_reserve_range(__pa(xen_start_info->pt_base), __pa(xen_start_info->pt_base + xen_start_info->nr_pt_frames * PAGE_SIZE), "XEN PAGETABLES"); - return swapper_pg_dir; + return initial_page_table; } #endif /* CONFIG_X86_64 */ +static unsigned char dummy_mapping[PAGE_SIZE] __page_aligned_bss; + static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) { pte_t pte; @@ -1896,15 +2235,28 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) #else case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE: #endif -#ifdef CONFIG_X86_LOCAL_APIC - case FIX_APIC_BASE: /* maps dummy local APIC */ -#endif case FIX_TEXT_POKE0: case FIX_TEXT_POKE1: /* All local page mappings */ pte = pfn_pte(phys, prot); break; +#ifdef CONFIG_X86_LOCAL_APIC + case FIX_APIC_BASE: /* maps dummy local APIC */ + pte = pfn_pte(PFN_DOWN(__pa(dummy_mapping)), PAGE_KERNEL); + break; +#endif + +#ifdef CONFIG_X86_IO_APIC + case FIX_IO_APIC_BASE_0 ... FIX_IO_APIC_BASE_END: + /* + * We just don't map the IO APIC - all access is via + * hypercalls. Keep the address in the pte for reference. 
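/*
 * [Illustrative sketch by the editor, not part of the patch.] The shape
 * of the one-shot hook used by xen_write_cr3_init() above: the first
 * cr3 write runs a slow _init variant that does the one-time pin/unpin
 * work, then repoints the ops table at the fast path so it is never
 * taken again. Hypothetical types and names.
 */
struct cr3_ops_sketch {
	void (*write_cr3)(unsigned long cr3);
};

static struct cr3_ops_sketch cr3_ops;	/* tentative; initialized below */

static void write_cr3_fast(unsigned long cr3)
{
	/* the normal (hypercall-backed) cr3 write */
}

static void write_cr3_first(unsigned long cr3)
{
	/* one-time work: pin the new top level, unpin the boot pagetable */
	write_cr3_fast(cr3);
	cr3_ops.write_cr3 = write_cr3_fast;	/* slow path never runs again */
}

static struct cr3_ops_sketch cr3_ops = { .write_cr3 = write_cr3_first };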
+ */ + pte = pfn_pte(PFN_DOWN(__pa(dummy_mapping)), PAGE_KERNEL); + break; +#endif + case FIX_PARAVIRT_BOOTMAP: /* This is an MFN, but it isn't an IO mapping from the IO domain */ @@ -1929,6 +2281,29 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) #endif } +__init void xen_ident_map_ISA(void) +{ + unsigned long pa; + + /* + * If we're dom0, then linear map the ISA machine addresses into + * the kernel's address space. + */ + if (!xen_initial_domain()) + return; + + xen_raw_printk("Xen: setup ISA identity maps\n"); + + for (pa = ISA_START_ADDRESS; pa < ISA_END_ADDRESS; pa += PAGE_SIZE) { + pte_t pte = mfn_pte(PFN_DOWN(pa), PAGE_KERNEL_IO); + + if (HYPERVISOR_update_va_mapping(PAGE_OFFSET + pa, pte, 0)) + BUG(); + } + + xen_flush_tlb(); +} + static __init void xen_post_allocator_init(void) { pv_mmu_ops.set_pte = xen_set_pte; @@ -1968,7 +2343,11 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = { .write_cr2 = xen_write_cr2, .read_cr3 = xen_read_cr3, +#ifdef CONFIG_X86_32 + .write_cr3 = xen_write_cr3_init, +#else .write_cr3 = xen_write_cr3, +#endif .flush_tlb_user = xen_flush_tlb, .flush_tlb_kernel = xen_flush_tlb, @@ -2036,7 +2415,7 @@ void __init xen_init_mmu_ops(void) x86_init.paging.pagetable_setup_done = xen_pagetable_setup_done; pv_mmu_ops = xen_mmu_ops; - vmap_lazy_unmap = false; + memset(dummy_mapping, 0xff, PAGE_SIZE); } /* Protected by xen_reservation_lock. */ @@ -2269,6 +2648,73 @@ void __init xen_hvm_init_mmu_ops(void) } #endif +#define REMAP_BATCH_SIZE 16 + +struct remap_data { + unsigned long mfn; + pgprot_t prot; + struct mmu_update *mmu_update; +}; + +static int remap_area_mfn_pte_fn(pte_t *ptep, pgtable_t token, + unsigned long addr, void *data) +{ + struct remap_data *rmd = data; + pte_t pte = pte_mkspecial(pfn_pte(rmd->mfn++, rmd->prot)); + + rmd->mmu_update->ptr = arbitrary_virt_to_machine(ptep).maddr; + rmd->mmu_update->val = pte_val_ma(pte); + rmd->mmu_update++; + + return 0; +} + +int xen_remap_domain_mfn_range(struct vm_area_struct *vma, + unsigned long addr, + unsigned long mfn, int nr, + pgprot_t prot, unsigned domid) +{ + struct remap_data rmd; + struct mmu_update mmu_update[REMAP_BATCH_SIZE]; + int batch; + unsigned long range; + int err = 0; + + prot = __pgprot(pgprot_val(prot) | _PAGE_IOMAP); + + BUG_ON(!((vma->vm_flags & (VM_PFNMAP | VM_RESERVED | VM_IO)) == + (VM_PFNMAP | VM_RESERVED | VM_IO))); + + rmd.mfn = mfn; + rmd.prot = prot; + + while (nr) { + batch = min(REMAP_BATCH_SIZE, nr); + range = (unsigned long)batch << PAGE_SHIFT; + + rmd.mmu_update = mmu_update; + err = apply_to_page_range(vma->vm_mm, addr, range, + remap_area_mfn_pte_fn, &rmd); + if (err) + goto out; + + err = -EFAULT; + if (HYPERVISOR_mmu_update(mmu_update, batch, NULL, domid) < 0) + goto out; + + nr -= batch; + addr += range; + } + + err = 0; +out: + + flush_tlb_all(); + + return err; +} +EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_range); + #ifdef CONFIG_XEN_DEBUG_FS static struct dentry *d_mmu_debug; diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h index fa938c4aa2f7..537bb9aab777 100644 --- a/arch/x86/xen/mmu.h +++ b/arch/x86/xen/mmu.h @@ -12,7 +12,6 @@ enum pt_level { bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn); -bool install_p2mtop_page(unsigned long pfn, unsigned long *p); void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); diff --git a/arch/x86/xen/pci-swiotlb-xen.c b/arch/x86/xen/pci-swiotlb-xen.c index 22471001b74c..bfd0632fe65e 100644 --- a/arch/x86/xen/pci-swiotlb-xen.c +++ 
b/arch/x86/xen/pci-swiotlb-xen.c @@ -1,6 +1,7 @@ /* Glue code to lib/swiotlb-xen.c */ #include <linux/dma-mapping.h> +#include <linux/pci.h> #include <xen/swiotlb-xen.h> #include <asm/xen/hypervisor.h> @@ -55,6 +56,9 @@ void __init pci_xen_swiotlb_init(void) if (xen_swiotlb) { xen_swiotlb_init(1); dma_ops = &xen_swiotlb_dma_ops; + + /* Make sure ACS will be enabled */ + pci_request_acs(); } } IOMMU_INIT_FINISH(pci_xen_swiotlb_detect, diff --git a/arch/x86/xen/platform-pci-unplug.c b/arch/x86/xen/platform-pci-unplug.c index 0f456386cce5..25c52f94a27c 100644 --- a/arch/x86/xen/platform-pci-unplug.c +++ b/arch/x86/xen/platform-pci-unplug.c @@ -68,7 +68,7 @@ static int __init check_platform_magic(void) return 0; } -void __init xen_unplug_emulated_devices(void) +void xen_unplug_emulated_devices(void) { int r; diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 9729c903404b..b5a7f928234b 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -18,10 +18,11 @@ #include <asm/xen/hypervisor.h> #include <asm/xen/hypercall.h> +#include <xen/xen.h> #include <xen/page.h> #include <xen/interface/callback.h> -#include <xen/interface/physdev.h> #include <xen/interface/memory.h> +#include <xen/interface/physdev.h> #include <xen/features.h> #include "xen-ops.h" @@ -34,6 +35,39 @@ extern void xen_sysenter_target(void); extern void xen_syscall_target(void); extern void xen_syscall32_target(void); +/* Amount of extra memory space we add to the e820 ranges */ +phys_addr_t xen_extra_mem_start, xen_extra_mem_size; + +/* + * The maximum amount of extra memory compared to the base size. The + * main scaling factor is the size of struct page. At extreme ratios + * of base:extra, all the base memory can be filled with page + * structures for the extra memory, leaving no space for anything + * else. + * + * 10x seems like a reasonable balance between scaling flexibility and + * leaving a practically usable system. + */ +#define EXTRA_MEM_RATIO (10) + +static __init void xen_add_extra_mem(unsigned long pages) +{ + u64 size = (u64)pages * PAGE_SIZE; + u64 extra_start = xen_extra_mem_start + xen_extra_mem_size; + + if (!pages) + return; + + e820_add_region(extra_start, size, E820_RAM); + sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); + + memblock_x86_reserve_range(extra_start, extra_start + size, "XEN EXTRA"); + + xen_extra_mem_size += size; + + xen_max_p2m_pfn = PFN_DOWN(extra_start + size); +} + static unsigned long __init xen_release_chunk(phys_addr_t start_addr, phys_addr_t end_addr) { @@ -83,16 +117,18 @@ static unsigned long __init xen_return_unused_memory(unsigned long max_pfn, const struct e820map *e820) { phys_addr_t max_addr = PFN_PHYS(max_pfn); - phys_addr_t last_end = 0; + phys_addr_t last_end = ISA_END_ADDRESS; unsigned long released = 0; int i; + /* Free any unused memory above the low 1Mbyte. */ for (i = 0; i < e820->nr_map && last_end < max_addr; i++) { phys_addr_t end = e820->map[i].addr; end = min(max_addr, end); - released += xen_release_chunk(last_end, end); - last_end = e820->map[i].addr + e820->map[i].size; + if (last_end < end) + released += xen_release_chunk(last_end, end); + last_end = max(last_end, e820->map[i].addr + e820->map[i].size); } if (last_end < max_addr) @@ -105,21 +141,72 @@ static unsigned long __init xen_return_unused_memory(unsigned long max_pfn, /** * machine_specific_memory_setup - Hook for machine specific memory setup. 
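/*
 * [Illustrative sketch by the editor, not part of the patch.] The gap
 * walk that xen_return_unused_memory() above performs, reduced to its
 * core: every range between the end of one E820 entry and the start of
 * the next (above the low-1MB floor passed in as last_end) is handed to
 * the release callback. Hypothetical types and names.
 */
struct region_sketch {
	u64 addr, size;
};

static unsigned long walk_e820_gaps(const struct region_sketch *map, int nr,
				    u64 last_end, u64 max_addr,
				    unsigned long (*release)(u64 s, u64 e))
{
	unsigned long released = 0;
	int i;

	for (i = 0; i < nr && last_end < max_addr; i++) {
		u64 end = min(max_addr, map[i].addr);

		if (last_end < end)
			released += release(last_end, end);	/* found a hole */

		last_end = max(last_end, map[i].addr + map[i].size);
	}

	if (last_end < max_addr)
		released += release(last_end, max_addr);	/* tail hole */

	return released;
}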
**/ - char * __init xen_memory_setup(void) { + static struct e820entry map[E820MAX] __initdata; + unsigned long max_pfn = xen_start_info->nr_pages; + unsigned long long mem_end; + int rc; + struct xen_memory_map memmap; + unsigned long extra_pages = 0; + unsigned long extra_limit; + int i; + int op; max_pfn = min(MAX_DOMAIN_PAGES, max_pfn); + mem_end = PFN_PHYS(max_pfn); + + memmap.nr_entries = E820MAX; + set_xen_guest_handle(memmap.buffer, map); + + op = xen_initial_domain() ? + XENMEM_machine_memory_map : + XENMEM_memory_map; + rc = HYPERVISOR_memory_op(op, &memmap); + if (rc == -ENOSYS) { + BUG_ON(xen_initial_domain()); + memmap.nr_entries = 1; + map[0].addr = 0ULL; + map[0].size = mem_end; + /* 8MB slack (to balance backend allocations). */ + map[0].size += 8ULL << 20; + map[0].type = E820_RAM; + rc = 0; + } + BUG_ON(rc); e820.nr_map = 0; + xen_extra_mem_start = mem_end; + for (i = 0; i < memmap.nr_entries; i++) { + unsigned long long end = map[i].addr + map[i].size; + + if (map[i].type == E820_RAM && end > mem_end) { + /* RAM off the end - may be partially included */ + u64 delta = min(map[i].size, end - mem_end); - e820_add_region(0, PFN_PHYS((u64)max_pfn), E820_RAM); + map[i].size -= delta; + end -= delta; + + extra_pages += PFN_DOWN(delta); + } + + if (map[i].size > 0 && end > xen_extra_mem_start) + xen_extra_mem_start = end; + + /* Add region if any remains */ + if (map[i].size > 0) + e820_add_region(map[i].addr, map[i].size, map[i].type); + } /* - * Even though this is normal, usable memory under Xen, reserve - * ISA memory anyway because too many things think they can poke + * In domU, the ISA region is normal, usable memory, but we + * reserve ISA memory anyway because too many things poke * about in there. + * + * In Dom0, the host E820 information can leave gaps in the + * ISA range, which would cause us to release those pages. To + * avoid this, we unconditionally reserve them here. */ e820_add_region(ISA_START_ADDRESS, ISA_END_ADDRESS - ISA_START_ADDRESS, E820_RESERVED); @@ -136,23 +223,30 @@ char * __init xen_memory_setup(void) sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); - xen_return_unused_memory(xen_start_info->nr_pages, &e820); + extra_pages += xen_return_unused_memory(xen_start_info->nr_pages, &e820); - return "Xen"; -} + /* + * Clamp the amount of extra memory to an EXTRA_MEM_RATIO + * factor of the base size. On non-highmem systems, the base + * size is the full initial memory allocation; on highmem it + * is limited to the max size of lowmem, so that it doesn't + * get completely filled. + * + * In principle there could be a problem in lowmem systems if + * the initial memory is also very large with respect to + * lowmem, but we won't try to deal with that here.
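/*
 * [Illustrative sketch by the editor, not part of the patch.] The clamp
 * computed just below, run with made-up numbers: a 1GiB domain
 * (max_pfn = 0x40000 pages of 4KiB) whose E820 leaves 20GiB unpopulated
 * keeps at most 10x its base size, so it ends up with 9GiB of extra
 * (balloonable) space rather than 20GiB. Standalone userspace C for the
 * non-highmem case.
 */
#include <stdio.h>

#define EXTRA_MEM_RATIO_SKETCH 10

int main(void)
{
	unsigned long max_pfn = 0x40000;	/* 1GiB base allocation */
	unsigned long extra_pages = 0x500000;	/* 20GiB of unpopulated E820 */
	unsigned long extra_limit = EXTRA_MEM_RATIO_SKETCH * max_pfn;

	if (extra_limit > max_pfn + extra_pages)
		extra_limit = max_pfn + extra_pages;

	extra_pages = extra_limit >= max_pfn ? extra_limit - max_pfn : 0;

	/* prints extra_pages = 0x240000, i.e. 9GiB of extra memory */
	printf("extra_pages = %#lx\n", extra_pages);
	return 0;
}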
+ */ + extra_limit = min(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)), + max_pfn + extra_pages); -static void xen_idle(void) -{ - local_irq_disable(); - - if (need_resched()) - local_irq_enable(); - else { - current_thread_info()->status &= ~TS_POLLING; - smp_mb__after_clear_bit(); - safe_halt(); - current_thread_info()->status |= TS_POLLING; - } + if (extra_limit >= max_pfn) + extra_pages = extra_limit - max_pfn; + else + extra_pages = 0; + + xen_add_extra_mem(extra_pages); + + return "Xen"; } /* @@ -224,9 +318,6 @@ void __cpuinit xen_enable_syscall(void) void __init xen_arch_setup(void) { - struct physdev_set_iopl set_iopl; - int rc; - xen_panic_handler_init(); HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments); @@ -243,11 +334,6 @@ void __init xen_arch_setup(void) xen_enable_sysenter(); xen_enable_syscall(); - set_iopl.iopl = 1; - rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl); - if (rc != 0) - printk(KERN_INFO "physdev_op failed %d\n", rc); - #ifdef CONFIG_ACPI if (!(xen_start_info->flags & SIF_INITDOMAIN)) { printk(KERN_INFO "ACPI in unprivileged domain disabled\n"); @@ -259,9 +345,11 @@ void __init xen_arch_setup(void) MAX_GUEST_CMDLINE > COMMAND_LINE_SIZE ? COMMAND_LINE_SIZE : MAX_GUEST_CMDLINE); - pm_idle = xen_idle; - - paravirt_disable_iospace(); + /* Set up idle, making sure it calls safe_halt() pvop */ +#ifdef CONFIG_X86_32 + boot_cpu_data.hlt_works_ok = 1; +#endif + pm_idle = default_idle; fiddle_vdso(); } diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 25f232b18a82..72a4c7959045 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -28,6 +28,7 @@ #include <asm/xen/interface.h> #include <asm/xen/hypercall.h> +#include <xen/xen.h> #include <xen/page.h> #include <xen/events.h> @@ -156,11 +157,35 @@ static void __init xen_fill_possible_map(void) { int i, rc; + if (xen_initial_domain()) + return; + + for (i = 0; i < nr_cpu_ids; i++) { + rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL); + if (rc >= 0) { + num_processors++; + set_cpu_possible(i, true); + } + } +} + +static void __init xen_filter_cpu_maps(void) +{ + int i, rc; + + if (!xen_initial_domain()) + return; + + num_processors = 0; + disabled_cpus = 0; for (i = 0; i < nr_cpu_ids; i++) { rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL); if (rc >= 0) { num_processors++; set_cpu_possible(i, true); + } else { + set_cpu_possible(i, false); + set_cpu_present(i, false); } } } @@ -174,6 +199,7 @@ static void __init xen_smp_prepare_boot_cpu(void) old memory can be recycled */ make_lowmem_page_readwrite(xen_initial_gdt); + xen_filter_cpu_maps(); xen_setup_vcpu_info_placement(); } @@ -400,9 +426,9 @@ static void stop_self(void *v) BUG(); } -static void xen_smp_send_stop(void) +static void xen_stop_other_cpus(int wait) { - smp_call_function(stop_self, NULL, 0); + smp_call_function(stop_self, NULL, wait); } static void xen_smp_send_reschedule(int cpu) @@ -470,7 +496,7 @@ static const struct smp_ops xen_smp_ops __initdata = { .cpu_disable = xen_cpu_disable, .play_dead = xen_play_dead, - .smp_send_stop = xen_smp_send_stop, + .stop_other_cpus = xen_stop_other_cpus, .smp_send_reschedule = xen_smp_send_reschedule, .send_call_func_ipi = xen_smp_send_call_function_ipi, diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c index 1d789d56877c..9bbd63a129b5 100644 --- a/arch/x86/xen/suspend.c +++ b/arch/x86/xen/suspend.c @@ -31,6 +31,7 @@ void xen_hvm_post_suspend(int suspend_cancelled) int cpu; xen_hvm_init_shared_info(); xen_callback_vector(); + xen_unplug_emulated_devices(); if 
(xen_feature(XENFEAT_hvm_safe_pvclock)) { for_each_online_cpu(cpu) { xen_setup_runstate_info(cpu); diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index b2bb5aa3b054..5da5e53fb94c 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -426,6 +426,8 @@ void xen_timer_resume(void) { int cpu; + pvclock_resume(); + if (xen_clockevent != &xen_vcpuop_clockevent) return; diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 7c8ab86163e9..9d41bf985757 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -30,6 +30,9 @@ void xen_setup_machphys_mapping(void); pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn); void xen_ident_map_ISA(void); void xen_reserve_top(void); +extern unsigned long xen_max_p2m_pfn; + +void xen_set_pat(u64); char * __init xen_memory_setup(void); void __init xen_arch_setup(void); @@ -40,7 +43,7 @@ void xen_vcpu_restore(void); void xen_callback_vector(void); void xen_hvm_init_shared_info(void); -void __init xen_unplug_emulated_devices(void); +void xen_unplug_emulated_devices(void); void __init xen_build_dynamic_phys_to_machine(void);
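/*
 * [Illustrative sketch by the editor, not part of the patch.] What the
 * new bool return of set_phys_to_machine() means for callers: the p2m
 * leaf for a pfn can now fail to allocate (no more __GFP_NOFAIL), so a
 * caller has to back out instead of assuming success. Hypothetical
 * caller.
 */
static int map_one_pfn_sketch(unsigned long pfn, unsigned long mfn)
{
	if (!set_phys_to_machine(pfn, mfn))
		return -ENOMEM;	/* could not allocate a p2m leaf page */

	return 0;
}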