Diffstat (limited to 'arch/x86'): 142 files changed, 5563 insertions, 2095 deletions
diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 1f2521434554..b0c5276861ec 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -49,6 +49,9 @@ else KBUILD_AFLAGS += -m64 KBUILD_CFLAGS += -m64 + # Use -mpreferred-stack-boundary=3 if supported. + KBUILD_CFLAGS += $(call cc-option,-mno-sse -mpreferred-stack-boundary=3) + # FIXME - should be integrated in Makefile.cpu (Makefile_32.cpu) cflags-$(CONFIG_MK8) += $(call cc-option,-march=k8) cflags-$(CONFIG_MPSC) += $(call cc-option,-march=nocona) diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S index be6d9e365a80..3470624d7835 100644 --- a/arch/x86/crypto/aesni-intel_asm.S +++ b/arch/x86/crypto/aesni-intel_asm.S @@ -2460,10 +2460,12 @@ ENTRY(aesni_cbc_dec) pxor IN3, STATE4 movaps IN4, IV #else - pxor (INP), STATE2 - pxor 0x10(INP), STATE3 pxor IN1, STATE4 movaps IN2, IV + movups (INP), IN1 + pxor IN1, STATE2 + movups 0x10(INP), IN2 + pxor IN2, STATE3 #endif movups STATE1, (OUTP) movups STATE2, 0x10(OUTP) diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index daeca56211e3..673ac9b63d6b 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -38,7 +38,7 @@ int copy_siginfo_to_user32(compat_siginfo_t __user *to, siginfo_t *from) { int err = 0; - bool ia32 = is_ia32_task(); + bool ia32 = test_thread_flag(TIF_IA32); if (!access_ok(VERIFY_WRITE, to, sizeof(compat_siginfo_t))) return -EFAULT; diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 49331bedc158..70780689599a 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -75,23 +75,54 @@ static inline int alternatives_text_reserved(void *start, void *end) } #endif /* CONFIG_SMP */ +#define OLDINSTR(oldinstr) "661:\n\t" oldinstr "\n662:\n" + +#define b_replacement(number) "663"#number +#define e_replacement(number) "664"#number + +#define alt_slen "662b-661b" +#define alt_rlen(number) e_replacement(number)"f-"b_replacement(number)"f" + +#define ALTINSTR_ENTRY(feature, number) \ + " .long 661b - .\n" /* label */ \ + " .long " b_replacement(number)"f - .\n" /* new instruction */ \ + " .word " __stringify(feature) "\n" /* feature bit */ \ + " .byte " alt_slen "\n" /* source len */ \ + " .byte " alt_rlen(number) "\n" /* replacement len */ + +#define DISCARD_ENTRY(number) /* rlen <= slen */ \ + " .byte 0xff + (" alt_rlen(number) ") - (" alt_slen ")\n" + +#define ALTINSTR_REPLACEMENT(newinstr, feature, number) /* replacement */ \ + b_replacement(number)":\n\t" newinstr "\n" e_replacement(number) ":\n\t" + /* alternative assembly primitive: */ #define ALTERNATIVE(oldinstr, newinstr, feature) \ - \ - "661:\n\t" oldinstr "\n662:\n" \ - ".section .altinstructions,\"a\"\n" \ - " .long 661b - .\n" /* label */ \ - " .long 663f - .\n" /* new instruction */ \ - " .word " __stringify(feature) "\n" /* feature bit */ \ - " .byte 662b-661b\n" /* sourcelen */ \ - " .byte 664f-663f\n" /* replacementlen */ \ - ".previous\n" \ - ".section .discard,\"aw\",@progbits\n" \ - " .byte 0xff + (664f-663f) - (662b-661b)\n" /* rlen <= slen */ \ - ".previous\n" \ - ".section .altinstr_replacement, \"ax\"\n" \ - "663:\n\t" newinstr "\n664:\n" /* replacement */ \ - ".previous" + OLDINSTR(oldinstr) \ + ".section .altinstructions,\"a\"\n" \ + ALTINSTR_ENTRY(feature, 1) \ + ".previous\n" \ + ".section .discard,\"aw\",@progbits\n" \ + DISCARD_ENTRY(1) \ + ".previous\n" \ + ".section .altinstr_replacement, \"ax\"\n" \ + ALTINSTR_REPLACEMENT(newinstr, feature, 1) \ + ".previous" 
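The ALTERNATIVE() macro rewritten above is the single-feature form of x86 boot-time instruction patching; the hunk factors it into the OLDINSTR/ALTINSTR_ENTRY/DISCARD_ENTRY/ALTINSTR_REPLACEMENT helpers so that the two-feature ALTERNATIVE_2() and alternative_call_2() variants added next can be built from the same pieces (copy_user_generic() in uaccess_64.h later in this diff is the alternative_call_2() user visible here). A minimal sketch of how callers typically consume ALTERNATIVE() in inline asm, not taken from this patch, assuming <asm/alternative.h> and <asm/cpufeature.h> are included and using a hypothetical helper name:

static inline void example_mb(void)
{
	/*
	 * Illustration only, patterned on the classic x86-32 mb():
	 * at boot, apply_alternatives() overwrites the "lock; addl"
	 * fallback with "mfence" on CPUs advertising X86_FEATURE_XMM2
	 * (SSE2); older CPUs keep the original instruction.
	 */
	asm volatile(ALTERNATIVE("lock; addl $0,0(%%esp)", "mfence",
				 X86_FEATURE_XMM2) ::: "memory");
}

With ALTERNATIVE_2() a second replacement and feature bit are appended; since the second alt_instr entry is applied after the first, feature2 takes precedence when both are set, which matches the "If CPU has feature2, function2 is used" comment on alternative_call_2() below.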
+ +#define ALTERNATIVE_2(oldinstr, newinstr1, feature1, newinstr2, feature2)\ + OLDINSTR(oldinstr) \ + ".section .altinstructions,\"a\"\n" \ + ALTINSTR_ENTRY(feature1, 1) \ + ALTINSTR_ENTRY(feature2, 2) \ + ".previous\n" \ + ".section .discard,\"aw\",@progbits\n" \ + DISCARD_ENTRY(1) \ + DISCARD_ENTRY(2) \ + ".previous\n" \ + ".section .altinstr_replacement, \"ax\"\n" \ + ALTINSTR_REPLACEMENT(newinstr1, feature1, 1) \ + ALTINSTR_REPLACEMENT(newinstr2, feature2, 2) \ + ".previous" /* * This must be included *after* the definition of ALTERNATIVE due to @@ -140,6 +171,19 @@ static inline int alternatives_text_reserved(void *start, void *end) : output : [old] "i" (oldfunc), [new] "i" (newfunc), ## input) /* + * Like alternative_call, but there are two features and respective functions. + * If CPU has feature2, function2 is used. + * Otherwise, if CPU has feature1, function1 is used. + * Otherwise, old function is used. + */ +#define alternative_call_2(oldfunc, newfunc1, feature1, newfunc2, feature2, \ + output, input...) \ + asm volatile (ALTERNATIVE_2("call %P[old]", "call %P[new1]", feature1,\ + "call %P[new2]", feature2) \ + : output : [old] "i" (oldfunc), [new1] "i" (newfunc1), \ + [new2] "i" (newfunc2), ## input) + +/* * use this macro(s) if you need more than one output parameter * in alternative_io */ diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h index 49ad773f4b9f..b3341e9cd8fd 100644 --- a/arch/x86/include/asm/amd_nb.h +++ b/arch/x86/include/asm/amd_nb.h @@ -26,10 +26,31 @@ struct amd_l3_cache { u8 subcaches[4]; }; +struct threshold_block { + unsigned int block; + unsigned int bank; + unsigned int cpu; + u32 address; + u16 interrupt_enable; + bool interrupt_capable; + u16 threshold_limit; + struct kobject kobj; + struct list_head miscj; +}; + +struct threshold_bank { + struct kobject *kobj; + struct threshold_block *blocks; + + /* initialized to the number of CPUs on the node sharing this bank */ + atomic_t cpus; +}; + struct amd_northbridge { struct pci_dev *misc; struct pci_dev *link; struct amd_l3_cache l3_cache; + struct threshold_bank *bank4; }; struct amd_northbridge_info { diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index eaff4790ed96..3ea51a84a0e4 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -306,7 +306,8 @@ struct apic { unsigned long (*check_apicid_used)(physid_mask_t *map, int apicid); unsigned long (*check_apicid_present)(int apicid); - void (*vector_allocation_domain)(int cpu, struct cpumask *retmask); + void (*vector_allocation_domain)(int cpu, struct cpumask *retmask, + const struct cpumask *mask); void (*init_apic_ldr)(void); void (*ioapic_phys_id_map)(physid_mask_t *phys_map, physid_mask_t *retmap); @@ -331,9 +332,9 @@ struct apic { unsigned long (*set_apic_id)(unsigned int id); unsigned long apic_id_mask; - unsigned int (*cpu_mask_to_apicid)(const struct cpumask *cpumask); - unsigned int (*cpu_mask_to_apicid_and)(const struct cpumask *cpumask, - const struct cpumask *andmask); + int (*cpu_mask_to_apicid_and)(const struct cpumask *cpumask, + const struct cpumask *andmask, + unsigned int *apicid); /* ipi */ void (*send_IPI_mask)(const struct cpumask *mask, int vector); @@ -464,6 +465,8 @@ static inline u32 safe_apic_wait_icr_idle(void) return apic->safe_wait_icr_idle(); } +extern void __init apic_set_eoi_write(void (*eoi_write)(u32 reg, u32 v)); + #else /* CONFIG_X86_LOCAL_APIC */ static inline u32 apic_read(u32 reg) { return 0; } @@ -473,6 +476,7 @@ static inline u64 
apic_icr_read(void) { return 0; } static inline void apic_icr_write(u32 low, u32 high) { } static inline void apic_wait_icr_idle(void) { } static inline u32 safe_apic_wait_icr_idle(void) { return 0; } +static inline void apic_set_eoi_write(void (*eoi_write)(u32 reg, u32 v)) {} #endif /* CONFIG_X86_LOCAL_APIC */ @@ -537,6 +541,11 @@ static inline const struct cpumask *default_target_cpus(void) #endif } +static inline const struct cpumask *online_target_cpus(void) +{ + return cpu_online_mask; +} + DECLARE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid); @@ -586,21 +595,50 @@ static inline int default_phys_pkg_id(int cpuid_apic, int index_msb) #endif -static inline unsigned int -default_cpu_mask_to_apicid(const struct cpumask *cpumask) +static inline int +flat_cpu_mask_to_apicid_and(const struct cpumask *cpumask, + const struct cpumask *andmask, + unsigned int *apicid) { - return cpumask_bits(cpumask)[0] & APIC_ALL_CPUS; + unsigned long cpu_mask = cpumask_bits(cpumask)[0] & + cpumask_bits(andmask)[0] & + cpumask_bits(cpu_online_mask)[0] & + APIC_ALL_CPUS; + + if (likely(cpu_mask)) { + *apicid = (unsigned int)cpu_mask; + return 0; + } else { + return -EINVAL; + } } -static inline unsigned int +extern int default_cpu_mask_to_apicid_and(const struct cpumask *cpumask, - const struct cpumask *andmask) + const struct cpumask *andmask, + unsigned int *apicid); + +static inline void +flat_vector_allocation_domain(int cpu, struct cpumask *retmask, + const struct cpumask *mask) { - unsigned long mask1 = cpumask_bits(cpumask)[0]; - unsigned long mask2 = cpumask_bits(andmask)[0]; - unsigned long mask3 = cpumask_bits(cpu_online_mask)[0]; + /* Careful. Some cpus do not strictly honor the set of cpus + * specified in the interrupt destination when using lowest + * priority interrupt delivery mode. + * + * In particular there was a hyperthreading cpu observed to + * deliver interrupts to the wrong hyperthread when only one + * hyperthread was specified in the interrupt desitination. + */ + cpumask_clear(retmask); + cpumask_bits(retmask)[0] = APIC_ALL_CPUS; +} - return (unsigned int)(mask1 & mask2 & mask3); +static inline void +default_vector_allocation_domain(int cpu, struct cpumask *retmask, + const struct cpumask *mask) +{ + cpumask_copy(retmask, cpumask_of(cpu)); } static inline unsigned long default_check_apicid_used(physid_mask_t *map, int apicid) diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h index a6983b277220..72f5009deb5a 100644 --- a/arch/x86/include/asm/bitops.h +++ b/arch/x86/include/asm/bitops.h @@ -264,6 +264,13 @@ static inline int test_and_clear_bit(int nr, volatile unsigned long *addr) * This operation is non-atomic and can be reordered. * If two examples of this operation race, one can appear to succeed * but actually fail. You must protect multiple accesses with a lock. + * + * Note: the operation is performed atomically with respect to + * the local CPU, but not other CPUs. Portable code should not + * rely on this behaviour. 
+ * KVM relies on this behaviour on x86 for modifying memory that is also + * accessed from a hypervisor on the same CPU if running in a VM: don't change + * this without also updating arch/x86/kernel/kvm.c */ static inline int __test_and_clear_bit(int nr, volatile unsigned long *addr) { diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 340ee49961a6..f91e80f4f180 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -176,7 +176,7 @@ #define X86_FEATURE_XSAVEOPT (7*32+ 4) /* Optimized Xsave */ #define X86_FEATURE_PLN (7*32+ 5) /* Intel Power Limit Notification */ #define X86_FEATURE_PTS (7*32+ 6) /* Intel Package Thermal Status */ -#define X86_FEATURE_DTS (7*32+ 7) /* Digital Thermal Sensor */ +#define X86_FEATURE_DTHERM (7*32+ 7) /* Digital Thermal Sensor */ #define X86_FEATURE_HW_PSTATE (7*32+ 8) /* AMD HW-PState */ /* Virtualization flags: Linux defined, word 8 */ diff --git a/arch/x86/include/asm/emergency-restart.h b/arch/x86/include/asm/emergency-restart.h index cc70c1c78ca4..75ce3f47d204 100644 --- a/arch/x86/include/asm/emergency-restart.h +++ b/arch/x86/include/asm/emergency-restart.h @@ -4,9 +4,7 @@ enum reboot_type { BOOT_TRIPLE = 't', BOOT_KBD = 'k', -#ifdef CONFIG_X86_32 BOOT_BIOS = 'b', -#endif BOOT_ACPI = 'a', BOOT_EFI = 'e', BOOT_CF9 = 'p', diff --git a/arch/x86/include/asm/floppy.h b/arch/x86/include/asm/floppy.h index dbe82a5c5eac..d3d74698dce9 100644 --- a/arch/x86/include/asm/floppy.h +++ b/arch/x86/include/asm/floppy.h @@ -99,7 +99,7 @@ static irqreturn_t floppy_hardint(int irq, void *dev_id) virtual_dma_residue += virtual_dma_count; virtual_dma_count = 0; #ifdef TRACE_FLPY_INT - printk("count=%x, residue=%x calls=%d bytes=%d dma_wait=%d\n", + printk(KERN_DEBUG "count=%x, residue=%x calls=%d bytes=%d dma_wait=%d\n", virtual_dma_count, virtual_dma_residue, calls, bytes, dma_wait); calls = 0; diff --git a/arch/x86/include/asm/hypervisor.h b/arch/x86/include/asm/hypervisor.h index 7a15153c675d..b518c7509933 100644 --- a/arch/x86/include/asm/hypervisor.h +++ b/arch/x86/include/asm/hypervisor.h @@ -49,6 +49,7 @@ extern const struct hypervisor_x86 *x86_hyper; extern const struct hypervisor_x86 x86_hyper_vmware; extern const struct hypervisor_x86 x86_hyper_ms_hyperv; extern const struct hypervisor_x86 x86_hyper_xen_hvm; +extern const struct hypervisor_x86 x86_hyper_kvm; static inline bool hypervisor_x2apic_available(void) { diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h index e7d1c194d272..246617efd67f 100644 --- a/arch/x86/include/asm/kvm.h +++ b/arch/x86/include/asm/kvm.h @@ -12,6 +12,7 @@ /* Select x86 specific features in <linux/kvm.h> */ #define __KVM_HAVE_PIT #define __KVM_HAVE_IOAPIC +#define __KVM_HAVE_IRQ_LINE #define __KVM_HAVE_DEVICE_ASSIGNMENT #define __KVM_HAVE_MSI #define __KVM_HAVE_USER_NMI diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 1ac46c22dd50..c764f43b71c5 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -192,8 +192,8 @@ struct x86_emulate_ops { struct x86_instruction_info *info, enum x86_intercept_stage stage); - bool (*get_cpuid)(struct x86_emulate_ctxt *ctxt, - u32 *eax, u32 *ebx, u32 *ecx, u32 *edx); + void (*get_cpuid)(struct x86_emulate_ctxt *ctxt, + u32 *eax, u32 *ebx, u32 *ecx, u32 *edx); }; typedef u32 __attribute__((vector_size(16))) sse128_t; @@ -280,9 +280,9 @@ struct x86_emulate_ctxt { u8 modrm_seg; bool rip_relative; unsigned long _eip; + struct operand memop; 
/* Fields above regs are cleared together. */ unsigned long regs[NR_VCPU_REGS]; - struct operand memop; struct operand *memopp; struct fetch_cache fetch; struct read_cache io_read; diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index db7c1f2709a2..09155d64cf7e 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -48,12 +48,13 @@ #define CR3_PAE_RESERVED_BITS ((X86_CR3_PWT | X86_CR3_PCD) - 1) #define CR3_NONPAE_RESERVED_BITS ((PAGE_SIZE-1) & ~(X86_CR3_PWT | X86_CR3_PCD)) +#define CR3_PCID_ENABLED_RESERVED_BITS 0xFFFFFF0000000000ULL #define CR3_L_MODE_RESERVED_BITS (CR3_NONPAE_RESERVED_BITS | \ 0xFFFFFF0000000000ULL) #define CR4_RESERVED_BITS \ (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\ | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \ - | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR \ + | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR | X86_CR4_PCIDE \ | X86_CR4_OSXSAVE | X86_CR4_SMEP | X86_CR4_RDWRGSFS \ | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE)) @@ -175,6 +176,13 @@ enum { /* apic attention bits */ #define KVM_APIC_CHECK_VAPIC 0 +/* + * The following bit is set with PV-EOI, unset on EOI. + * We detect PV-EOI changes by guest by comparing + * this bit with PV-EOI in guest memory. + * See the implementation in apic_update_pv_eoi. + */ +#define KVM_APIC_PV_EOI_PENDING 1 /* * We don't want allocation failures within the mmu code, so we preallocate @@ -313,8 +321,8 @@ struct kvm_pmu { u64 counter_bitmask[2]; u64 global_ctrl_mask; u8 version; - struct kvm_pmc gp_counters[X86_PMC_MAX_GENERIC]; - struct kvm_pmc fixed_counters[X86_PMC_MAX_FIXED]; + struct kvm_pmc gp_counters[INTEL_PMC_MAX_GENERIC]; + struct kvm_pmc fixed_counters[INTEL_PMC_MAX_FIXED]; struct irq_work irq_work; u64 reprogram_pmi; }; @@ -484,6 +492,11 @@ struct kvm_vcpu_arch { u64 length; u64 status; } osvw; + + struct { + u64 msr_val; + struct gfn_to_hva_cache data; + } pv_eoi; }; struct kvm_lpage_info { @@ -661,6 +674,7 @@ struct kvm_x86_ops { u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio); int (*get_lpage_level)(void); bool (*rdtscp_supported)(void); + bool (*invpcid_supported)(void); void (*adjust_tsc_offset)(struct kvm_vcpu *vcpu, s64 adjustment, bool host); void (*set_tdp_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3); @@ -802,7 +816,20 @@ int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault); bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl); -int kvm_pic_set_irq(void *opaque, int irq, int level); +static inline int __kvm_irq_line_state(unsigned long *irq_state, + int irq_source_id, int level) +{ + /* Logical OR for level trig interrupt */ + if (level) + __set_bit(irq_source_id, irq_state); + else + __clear_bit(irq_source_id, irq_state); + + return !!(*irq_state); +} + +int kvm_pic_set_irq(struct kvm_pic *pic, int irq, int irq_source_id, int level); +void kvm_pic_clear_all(struct kvm_pic *pic, int irq_source_id); void kvm_inject_nmi(struct kvm_vcpu *vcpu); diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h index 63ab1661d00e..2f7712e08b1e 100644 --- a/arch/x86/include/asm/kvm_para.h +++ b/arch/x86/include/asm/kvm_para.h @@ -22,6 +22,7 @@ #define KVM_FEATURE_CLOCKSOURCE2 3 #define KVM_FEATURE_ASYNC_PF 4 #define KVM_FEATURE_STEAL_TIME 5 +#define KVM_FEATURE_PV_EOI 6 /* The last 8 bits are used to indicate how to interpret the flags field * in pvclock structure. 
If no bits are set, all flags are ignored. @@ -37,6 +38,7 @@ #define MSR_KVM_SYSTEM_TIME_NEW 0x4b564d01 #define MSR_KVM_ASYNC_PF_EN 0x4b564d02 #define MSR_KVM_STEAL_TIME 0x4b564d03 +#define MSR_KVM_PV_EOI_EN 0x4b564d04 struct kvm_steal_time { __u64 steal; @@ -89,6 +91,11 @@ struct kvm_vcpu_pv_apf_data { __u32 enabled; }; +#define KVM_PV_EOI_BIT 0 +#define KVM_PV_EOI_MASK (0x1 << KVM_PV_EOI_BIT) +#define KVM_PV_EOI_ENABLED KVM_PV_EOI_MASK +#define KVM_PV_EOI_DISABLED 0x0 + #ifdef __KERNEL__ #include <asm/processor.h> diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h index 084ef95274cd..813ed103f45e 100644 --- a/arch/x86/include/asm/msr.h +++ b/arch/x86/include/asm/msr.h @@ -115,8 +115,8 @@ notrace static inline int native_write_msr_safe(unsigned int msr, extern unsigned long long native_read_tsc(void); -extern int native_rdmsr_safe_regs(u32 regs[8]); -extern int native_wrmsr_safe_regs(u32 regs[8]); +extern int rdmsr_safe_regs(u32 regs[8]); +extern int wrmsr_safe_regs(u32 regs[8]); static __always_inline unsigned long long __native_read_tsc(void) { @@ -187,43 +187,6 @@ static inline int rdmsrl_safe(unsigned msr, unsigned long long *p) return err; } -static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p) -{ - u32 gprs[8] = { 0 }; - int err; - - gprs[1] = msr; - gprs[7] = 0x9c5a203a; - - err = native_rdmsr_safe_regs(gprs); - - *p = gprs[0] | ((u64)gprs[2] << 32); - - return err; -} - -static inline int wrmsrl_amd_safe(unsigned msr, unsigned long long val) -{ - u32 gprs[8] = { 0 }; - - gprs[0] = (u32)val; - gprs[1] = msr; - gprs[2] = val >> 32; - gprs[7] = 0x9c5a203a; - - return native_wrmsr_safe_regs(gprs); -} - -static inline int rdmsr_safe_regs(u32 regs[8]) -{ - return native_rdmsr_safe_regs(regs); -} - -static inline int wrmsr_safe_regs(u32 regs[8]) -{ - return native_wrmsr_safe_regs(regs); -} - #define rdtscl(low) \ ((low) = (u32)__native_read_tsc()) @@ -237,6 +200,8 @@ do { \ (high) = (u32)(_l >> 32); \ } while (0) +#define rdpmcl(counter, val) ((val) = native_read_pmc(counter)) + #define rdtscp(low, high, aux) \ do { \ unsigned long long _val = native_read_tscp(&(aux)); \ @@ -248,8 +213,7 @@ do { \ #endif /* !CONFIG_PARAVIRT */ - -#define checking_wrmsrl(msr, val) wrmsr_safe((msr), (u32)(val), \ +#define wrmsrl_safe(msr, val) wrmsr_safe((msr), (u32)(val), \ (u32)((val) >> 32)) #define write_tsc(val1, val2) wrmsr(MSR_IA32_TSC, (val1), (val2)) diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h index dc580c42851c..c0fa356e90de 100644 --- a/arch/x86/include/asm/nmi.h +++ b/arch/x86/include/asm/nmi.h @@ -44,28 +44,14 @@ struct nmiaction { const char *name; }; -#define register_nmi_handler(t, fn, fg, n) \ +#define register_nmi_handler(t, fn, fg, n, init...) \ ({ \ - static struct nmiaction fn##_na = { \ + static struct nmiaction init fn##_na = { \ .handler = (fn), \ .name = (n), \ .flags = (fg), \ }; \ - __register_nmi_handler((t), &fn##_na); \ -}) - -/* - * For special handlers that register/unregister in the - * init section only. This should be considered rare. 
- */ -#define register_nmi_handler_initonly(t, fn, fg, n) \ -({ \ - static struct nmiaction fn##_na __initdata = { \ - .handler = (fn), \ - .name = (n), \ - .flags = (fg), \ - }; \ - __register_nmi_handler((t), &fn##_na); \ + __register_nmi_handler((t), &fn##_na); \ }) int __register_nmi_handler(unsigned int, struct nmiaction *); diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 6cbbabf52707..0b47ddb6f00b 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -128,21 +128,11 @@ static inline u64 paravirt_read_msr(unsigned msr, int *err) return PVOP_CALL2(u64, pv_cpu_ops.read_msr, msr, err); } -static inline int paravirt_rdmsr_regs(u32 *regs) -{ - return PVOP_CALL1(int, pv_cpu_ops.rdmsr_regs, regs); -} - static inline int paravirt_write_msr(unsigned msr, unsigned low, unsigned high) { return PVOP_CALL3(int, pv_cpu_ops.write_msr, msr, low, high); } -static inline int paravirt_wrmsr_regs(u32 *regs) -{ - return PVOP_CALL1(int, pv_cpu_ops.wrmsr_regs, regs); -} - /* These should all do BUG_ON(_err), but our headers are too tangled. */ #define rdmsr(msr, val1, val2) \ do { \ @@ -176,9 +166,6 @@ do { \ _err; \ }) -#define rdmsr_safe_regs(regs) paravirt_rdmsr_regs(regs) -#define wrmsr_safe_regs(regs) paravirt_wrmsr_regs(regs) - static inline int rdmsrl_safe(unsigned msr, unsigned long long *p) { int err; @@ -186,32 +173,6 @@ static inline int rdmsrl_safe(unsigned msr, unsigned long long *p) *p = paravirt_read_msr(msr, &err); return err; } -static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p) -{ - u32 gprs[8] = { 0 }; - int err; - - gprs[1] = msr; - gprs[7] = 0x9c5a203a; - - err = paravirt_rdmsr_regs(gprs); - - *p = gprs[0] | ((u64)gprs[2] << 32); - - return err; -} - -static inline int wrmsrl_amd_safe(unsigned msr, unsigned long long val) -{ - u32 gprs[8] = { 0 }; - - gprs[0] = (u32)val; - gprs[1] = msr; - gprs[2] = val >> 32; - gprs[7] = 0x9c5a203a; - - return paravirt_wrmsr_regs(gprs); -} static inline u64 paravirt_read_tsc(void) { @@ -252,6 +213,8 @@ do { \ high = _l >> 32; \ } while (0) +#define rdpmcl(counter, val) ((val) = paravirt_read_pmc(counter)) + static inline unsigned long long paravirt_rdtscp(unsigned int *aux) { return PVOP_CALL1(u64, pv_cpu_ops.read_tscp, aux); diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 8e8b9a4987ee..8613cbb7ba41 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -153,9 +153,7 @@ struct pv_cpu_ops { /* MSR, PMC and TSR operations. err = 0/-EFAULT. wrmsr returns 0/-EFAULT. */ u64 (*read_msr)(unsigned int msr, int *err); - int (*rdmsr_regs)(u32 *regs); int (*write_msr)(unsigned int msr, unsigned low, unsigned high); - int (*wrmsr_regs)(u32 *regs); u64 (*read_tsc)(void); u64 (*read_pmc)(int counter); diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h index b3a531746026..5ad24a89b19b 100644 --- a/arch/x86/include/asm/pci_x86.h +++ b/arch/x86/include/asm/pci_x86.h @@ -7,9 +7,13 @@ #undef DEBUG #ifdef DEBUG -#define DBG(x...) printk(x) +#define DBG(fmt, ...) printk(fmt, ##__VA_ARGS__) #else -#define DBG(x...) +#define DBG(fmt, ...) 
\ +do { \ + if (0) \ + printk(fmt, ##__VA_ARGS__); \ +} while (0) #endif #define PCI_PROBE_BIOS 0x0001 diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 588f52ea810e..c78f14a0df00 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -5,11 +5,10 @@ * Performance event hw details: */ -#define X86_PMC_MAX_GENERIC 32 -#define X86_PMC_MAX_FIXED 3 +#define INTEL_PMC_MAX_GENERIC 32 +#define INTEL_PMC_MAX_FIXED 3 +#define INTEL_PMC_IDX_FIXED 32 -#define X86_PMC_IDX_GENERIC 0 -#define X86_PMC_IDX_FIXED 32 #define X86_PMC_IDX_MAX 64 #define MSR_ARCH_PERFMON_PERFCTR0 0xc1 @@ -48,8 +47,7 @@ (X86_RAW_EVENT_MASK | \ AMD64_EVENTSEL_EVENT) #define AMD64_NUM_COUNTERS 4 -#define AMD64_NUM_COUNTERS_F15H 6 -#define AMD64_NUM_COUNTERS_MAX AMD64_NUM_COUNTERS_F15H +#define AMD64_NUM_COUNTERS_CORE 6 #define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL 0x3c #define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK (0x00 << 8) @@ -121,16 +119,16 @@ struct x86_pmu_capability { /* Instr_Retired.Any: */ #define MSR_ARCH_PERFMON_FIXED_CTR0 0x309 -#define X86_PMC_IDX_FIXED_INSTRUCTIONS (X86_PMC_IDX_FIXED + 0) +#define INTEL_PMC_IDX_FIXED_INSTRUCTIONS (INTEL_PMC_IDX_FIXED + 0) /* CPU_CLK_Unhalted.Core: */ #define MSR_ARCH_PERFMON_FIXED_CTR1 0x30a -#define X86_PMC_IDX_FIXED_CPU_CYCLES (X86_PMC_IDX_FIXED + 1) +#define INTEL_PMC_IDX_FIXED_CPU_CYCLES (INTEL_PMC_IDX_FIXED + 1) /* CPU_CLK_Unhalted.Ref: */ #define MSR_ARCH_PERFMON_FIXED_CTR2 0x30b -#define X86_PMC_IDX_FIXED_REF_CYCLES (X86_PMC_IDX_FIXED + 2) -#define X86_PMC_MSK_FIXED_REF_CYCLES (1ULL << X86_PMC_IDX_FIXED_REF_CYCLES) +#define INTEL_PMC_IDX_FIXED_REF_CYCLES (INTEL_PMC_IDX_FIXED + 2) +#define INTEL_PMC_MSK_FIXED_REF_CYCLES (1ULL << INTEL_PMC_IDX_FIXED_REF_CYCLES) /* * We model BTS tracing as another fixed-mode PMC. @@ -139,7 +137,7 @@ struct x86_pmu_capability { * values are used by actual fixed events and higher values are used * to indicate other overflow conditions in the PERF_GLOBAL_STATUS msr. 
*/ -#define X86_PMC_IDX_FIXED_BTS (X86_PMC_IDX_FIXED + 16) +#define INTEL_PMC_IDX_FIXED_BTS (INTEL_PMC_IDX_FIXED + 16) /* * IBS cpuid feature detection @@ -234,6 +232,7 @@ struct perf_guest_switch_msr { extern struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr); extern void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap); +extern void perf_check_microcode(void); #else static inline perf_guest_switch_msr *perf_guest_get_msrs(int *nr) { @@ -247,6 +246,7 @@ static inline void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap) } static inline void perf_events_lapic_init(void) { } +static inline void perf_check_microcode(void) { } #endif #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD) diff --git a/arch/x86/include/asm/pgtable-2level.h b/arch/x86/include/asm/pgtable-2level.h index 98391db840c6..f2b489cf1602 100644 --- a/arch/x86/include/asm/pgtable-2level.h +++ b/arch/x86/include/asm/pgtable-2level.h @@ -2,9 +2,9 @@ #define _ASM_X86_PGTABLE_2LEVEL_H #define pte_ERROR(e) \ - printk("%s:%d: bad pte %08lx.\n", __FILE__, __LINE__, (e).pte_low) + pr_err("%s:%d: bad pte %08lx\n", __FILE__, __LINE__, (e).pte_low) #define pgd_ERROR(e) \ - printk("%s:%d: bad pgd %08lx.\n", __FILE__, __LINE__, pgd_val(e)) + pr_err("%s:%d: bad pgd %08lx\n", __FILE__, __LINE__, pgd_val(e)) /* * Certain architectures need to do special things when PTEs diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h index 43876f16caf1..4cc9f2b7cdc3 100644 --- a/arch/x86/include/asm/pgtable-3level.h +++ b/arch/x86/include/asm/pgtable-3level.h @@ -9,13 +9,13 @@ */ #define pte_ERROR(e) \ - printk("%s:%d: bad pte %p(%08lx%08lx).\n", \ + pr_err("%s:%d: bad pte %p(%08lx%08lx)\n", \ __FILE__, __LINE__, &(e), (e).pte_high, (e).pte_low) #define pmd_ERROR(e) \ - printk("%s:%d: bad pmd %p(%016Lx).\n", \ + pr_err("%s:%d: bad pmd %p(%016Lx)\n", \ __FILE__, __LINE__, &(e), pmd_val(e)) #define pgd_ERROR(e) \ - printk("%s:%d: bad pgd %p(%016Lx).\n", \ + pr_err("%s:%d: bad pgd %p(%016Lx)\n", \ __FILE__, __LINE__, &(e), pgd_val(e)) /* Rules for using set_pte: the pte being assigned *must* be @@ -47,16 +47,26 @@ static inline void native_set_pte(pte_t *ptep, pte_t pte) * they can run pmd_offset_map_lock or pmd_trans_huge or other pmd * operations. * - * Without THP if the mmap_sem is hold for reading, the - * pmd can only transition from null to not null while pmd_read_atomic runs. - * So there's no need of literally reading it atomically. + * Without THP if the mmap_sem is hold for reading, the pmd can only + * transition from null to not null while pmd_read_atomic runs. So + * we can always return atomic pmd values with this function. * * With THP if the mmap_sem is hold for reading, the pmd can become - * THP or null or point to a pte (and in turn become "stable") at any - * time under pmd_read_atomic, so it's mandatory to read it atomically - * with cmpxchg8b. + * trans_huge or none or point to a pte (and in turn become "stable") + * at any time under pmd_read_atomic. We could read it really + * atomically here with a atomic64_read for the THP enabled case (and + * it would be a whole lot simpler), but to avoid using cmpxchg8b we + * only return an atomic pmdval if the low part of the pmdval is later + * found stable (i.e. pointing to a pte). And we're returning a none + * pmdval if the low part of the pmd is none. 
In some cases the high + * and low part of the pmdval returned may not be consistent if THP is + * enabled (the low part may point to previously mapped hugepage, + * while the high part may point to a more recently mapped hugepage), + * but pmd_none_or_trans_huge_or_clear_bad() only needs the low part + * of the pmd to be read atomically to decide if the pmd is unstable + * or not, with the only exception of when the low part of the pmd is + * zero in which case we return a none pmd. */ -#ifndef CONFIG_TRANSPARENT_HUGEPAGE static inline pmd_t pmd_read_atomic(pmd_t *pmdp) { pmdval_t ret; @@ -74,12 +84,6 @@ static inline pmd_t pmd_read_atomic(pmd_t *pmdp) return (pmd_t) { ret }; } -#else /* CONFIG_TRANSPARENT_HUGEPAGE */ -static inline pmd_t pmd_read_atomic(pmd_t *pmdp) -{ - return (pmd_t) { atomic64_read((atomic64_t *)pmdp) }; -} -#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte) { diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index 975f709e09ae..8251be02301e 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -26,16 +26,16 @@ extern pgd_t init_level4_pgt[]; extern void paging_init(void); #define pte_ERROR(e) \ - printk("%s:%d: bad pte %p(%016lx).\n", \ + pr_err("%s:%d: bad pte %p(%016lx)\n", \ __FILE__, __LINE__, &(e), pte_val(e)) #define pmd_ERROR(e) \ - printk("%s:%d: bad pmd %p(%016lx).\n", \ + pr_err("%s:%d: bad pmd %p(%016lx)\n", \ __FILE__, __LINE__, &(e), pmd_val(e)) #define pud_ERROR(e) \ - printk("%s:%d: bad pud %p(%016lx).\n", \ + pr_err("%s:%d: bad pud %p(%016lx)\n", \ __FILE__, __LINE__, &(e), pud_val(e)) #define pgd_ERROR(e) \ - printk("%s:%d: bad pgd %p(%016lx).\n", \ + pr_err("%s:%d: bad pgd %p(%016lx)\n", \ __FILE__, __LINE__, &(e), pgd_val(e)) struct mm_struct; diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h index f8ab3eaad128..aea1d1d848c7 100644 --- a/arch/x86/include/asm/processor-flags.h +++ b/arch/x86/include/asm/processor-flags.h @@ -44,6 +44,7 @@ */ #define X86_CR3_PWT 0x00000008 /* Page Write Through */ #define X86_CR3_PCD 0x00000010 /* Page Cache Disable */ +#define X86_CR3_PCID_MASK 0x00000fff /* PCID Mask */ /* * Intel CPU features in CR4 @@ -61,6 +62,7 @@ #define X86_CR4_OSXMMEXCPT 0x00000400 /* enable unmasked SSE exceptions */ #define X86_CR4_VMXE 0x00002000 /* enable VMX virtualization */ #define X86_CR4_RDWRGSFS 0x00010000 /* enable RDWRGSFS support */ +#define X86_CR4_PCIDE 0x00020000 /* enable PCID support */ #define X86_CR4_OSXSAVE 0x00040000 /* enable xsave and xrestore */ #define X86_CR4_SMEP 0x00100000 /* enable SMEP support */ diff --git a/arch/x86/include/asm/realmode.h b/arch/x86/include/asm/realmode.h index fce3f4ae5bd6..fe1ec5bcd846 100644 --- a/arch/x86/include/asm/realmode.h +++ b/arch/x86/include/asm/realmode.h @@ -21,8 +21,9 @@ struct real_mode_header { u32 wakeup_header; #endif /* APM/BIOS reboot */ -#ifdef CONFIG_X86_32 u32 machine_real_restart_asm; +#ifdef CONFIG_X86_64 + u32 machine_real_restart_seg; #endif }; diff --git a/arch/x86/include/asm/reboot.h b/arch/x86/include/asm/reboot.h index 92f297069e87..a82c4f1b4d83 100644 --- a/arch/x86/include/asm/reboot.h +++ b/arch/x86/include/asm/reboot.h @@ -18,8 +18,8 @@ extern struct machine_ops machine_ops; void native_machine_crash_shutdown(struct pt_regs *regs); void native_machine_shutdown(void); -void machine_real_restart(unsigned int type); -/* These must match dispatch_table in reboot_32.S */ +void __noreturn 
machine_real_restart(unsigned int type); +/* These must match dispatch in arch/x86/realmore/rm/reboot.S */ #define MRR_BIOS 0 #define MRR_APM 1 diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index f48394513c37..2ffa95dc2333 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -169,11 +169,6 @@ void x86_idle_thread_init(unsigned int cpu, struct task_struct *idle); void smp_store_cpu_info(int id); #define cpu_physical_id(cpu) per_cpu(x86_cpu_to_apicid, cpu) -/* We don't mark CPUs online until __cpu_up(), so we need another measure */ -static inline int num_booting_cpus(void) -{ - return cpumask_weight(cpu_callout_mask); -} #else /* !CONFIG_SMP */ #define wbinvd_on_cpu(cpu) wbinvd() static inline int wbinvd_on_all_cpus(void) diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h index 8e796fbbf9c6..d8def8b3dba0 100644 --- a/arch/x86/include/asm/uaccess_64.h +++ b/arch/x86/include/asm/uaccess_64.h @@ -17,6 +17,8 @@ /* Handles exceptions in both to and from, but doesn't do access_ok */ __must_check unsigned long +copy_user_enhanced_fast_string(void *to, const void *from, unsigned len); +__must_check unsigned long copy_user_generic_string(void *to, const void *from, unsigned len); __must_check unsigned long copy_user_generic_unrolled(void *to, const void *from, unsigned len); @@ -26,9 +28,16 @@ copy_user_generic(void *to, const void *from, unsigned len) { unsigned ret; - alternative_call(copy_user_generic_unrolled, + /* + * If CPU has ERMS feature, use copy_user_enhanced_fast_string. + * Otherwise, if CPU has rep_good feature, use copy_user_generic_string. + * Otherwise, use copy_user_generic_unrolled. + */ + alternative_call_2(copy_user_generic_unrolled, copy_user_generic_string, X86_FEATURE_REP_GOOD, + copy_user_enhanced_fast_string, + X86_FEATURE_ERMS, ASM_OUTPUT2("=a" (ret), "=D" (to), "=S" (from), "=d" (len)), "1" (to), "2" (from), "3" (len) diff --git a/arch/x86/include/asm/uprobes.h b/arch/x86/include/asm/uprobes.h index 1e9bed14f7ae..f3971bbcd1de 100644 --- a/arch/x86/include/asm/uprobes.h +++ b/arch/x86/include/asm/uprobes.h @@ -48,7 +48,7 @@ struct arch_uprobe_task { #endif }; -extern int arch_uprobe_analyze_insn(struct arch_uprobe *aup, struct mm_struct *mm); +extern int arch_uprobe_analyze_insn(struct arch_uprobe *aup, struct mm_struct *mm, unsigned long addr); extern int arch_uprobe_pre_xol(struct arch_uprobe *aup, struct pt_regs *regs); extern int arch_uprobe_post_xol(struct arch_uprobe *aup, struct pt_regs *regs); extern bool arch_uprobe_xol_was_trapped(struct task_struct *tsk); diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h index 6149b476d9df..a06983cdc125 100644 --- a/arch/x86/include/asm/uv/uv_bau.h +++ b/arch/x86/include/asm/uv/uv_bau.h @@ -140,6 +140,9 @@ #define IPI_RESET_LIMIT 1 /* after this # consecutive successes, bump up the throttle if it was lowered */ #define COMPLETE_THRESHOLD 5 +/* after this # of giveups (fall back to kernel IPI's) disable the use of + the BAU for a period of time */ +#define GIVEUP_LIMIT 100 #define UV_LB_SUBNODEID 0x10 @@ -166,7 +169,6 @@ #define FLUSH_RETRY_TIMEOUT 2 #define FLUSH_GIVEUP 3 #define FLUSH_COMPLETE 4 -#define FLUSH_RETRY_BUSYBUG 5 /* * tuning the action when the numalink network is extremely delayed @@ -175,7 +177,7 @@ microseconds */ #define CONGESTED_REPS 10 /* long delays averaged over this many broadcasts */ -#define CONGESTED_PERIOD 30 /* time for the bau to be +#define DISABLED_PERIOD 10 /* time for the bau to be 
disabled, in seconds */ /* see msg_type: */ #define MSG_NOOP 0 @@ -520,6 +522,12 @@ struct ptc_stats { unsigned long s_uv2_wars; /* uv2 workaround, perm. busy */ unsigned long s_uv2_wars_hw; /* uv2 workaround, hiwater */ unsigned long s_uv2_war_waits; /* uv2 workaround, long waits */ + unsigned long s_overipilimit; /* over the ipi reset limit */ + unsigned long s_giveuplimit; /* disables, over giveup limit*/ + unsigned long s_enters; /* entries to the driver */ + unsigned long s_ipifordisabled; /* fall back to IPI; disabled */ + unsigned long s_plugged; /* plugged by h/w bug*/ + unsigned long s_congested; /* giveup on long wait */ /* destination statistics */ unsigned long d_alltlb; /* times all tlb's on this cpu were flushed */ @@ -586,8 +594,8 @@ struct bau_control { int timeout_tries; int ipi_attempts; int conseccompletes; - int baudisabled; - int set_bau_off; + short nobau; + short baudisabled; short cpu; short osnode; short uvhub_cpu; @@ -596,14 +604,16 @@ struct bau_control { short cpus_in_socket; short cpus_in_uvhub; short partition_base_pnode; - short using_desc; /* an index, like uvhub_cpu */ - unsigned int inuse_map; + short busy; /* all were busy (war) */ unsigned short message_number; unsigned short uvhub_quiesce; short socket_acknowledge_count[DEST_Q_SIZE]; cycles_t send_message; + cycles_t period_end; + cycles_t period_time; spinlock_t uvhub_lock; spinlock_t queue_lock; + spinlock_t disable_lock; /* tunables */ int max_concurr; int max_concurr_const; @@ -614,9 +624,9 @@ struct bau_control { int complete_threshold; int cong_response_us; int cong_reps; - int cong_period; - unsigned long clocks_per_100_usec; - cycles_t period_time; + cycles_t disabled_period; + int period_giveups; + int giveup_limit; long period_requests; struct hub_and_pnode *thp; }; diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 31f180c21ce9..74fcb963595b 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -60,6 +60,7 @@ #define SECONDARY_EXEC_WBINVD_EXITING 0x00000040 #define SECONDARY_EXEC_UNRESTRICTED_GUEST 0x00000080 #define SECONDARY_EXEC_PAUSE_LOOP_EXITING 0x00000400 +#define SECONDARY_EXEC_ENABLE_INVPCID 0x00001000 #define PIN_BASED_EXT_INTR_MASK 0x00000001 @@ -281,6 +282,7 @@ enum vmcs_field { #define EXIT_REASON_EPT_MISCONFIG 49 #define EXIT_REASON_WBINVD 54 #define EXIT_REASON_XSETBV 55 +#define EXIT_REASON_INVPCID 58 /* * Interruption-information format @@ -404,6 +406,7 @@ enum vmcs_field { #define VMX_EPTP_WB_BIT (1ull << 14) #define VMX_EPT_2MB_PAGE_BIT (1ull << 16) #define VMX_EPT_1GB_PAGE_BIT (1ull << 17) +#define VMX_EPT_AD_BIT (1ull << 21) #define VMX_EPT_EXTENT_INDIVIDUAL_BIT (1ull << 24) #define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25) #define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26) @@ -415,11 +418,14 @@ enum vmcs_field { #define VMX_EPT_MAX_GAW 0x4 #define VMX_EPT_MT_EPTE_SHIFT 3 #define VMX_EPT_GAW_EPTP_SHIFT 3 +#define VMX_EPT_AD_ENABLE_BIT (1ull << 6) #define VMX_EPT_DEFAULT_MT 0x6ull #define VMX_EPT_READABLE_MASK 0x1ull #define VMX_EPT_WRITABLE_MASK 0x2ull #define VMX_EPT_EXECUTABLE_MASK 0x4ull #define VMX_EPT_IPAT_BIT (1ull << 6) +#define VMX_EPT_ACCESS_BIT (1ull << 8) +#define VMX_EPT_DIRTY_BIT (1ull << 9) #define VMX_EPT_IDENTITY_PAGETABLE_ADDR 0xfffbc000ul diff --git a/arch/x86/include/asm/x2apic.h b/arch/x86/include/asm/x2apic.h index 92e54abf89e0..f90f0a587c66 100644 --- a/arch/x86/include/asm/x2apic.h +++ b/arch/x86/include/asm/x2apic.h @@ -9,15 +9,6 @@ #include <asm/ipi.h> #include <linux/cpumask.h> -/* - * Need to use more than 
cpu 0, because we need more vectors - * when MSI-X are used. - */ -static const struct cpumask *x2apic_target_cpus(void) -{ - return cpu_online_mask; -} - static int x2apic_apic_id_valid(int apicid) { return 1; @@ -28,15 +19,6 @@ static int x2apic_apic_id_registered(void) return 1; } -/* - * For now each logical cpu is in its own vector allocation domain. - */ -static void x2apic_vector_allocation_domain(int cpu, struct cpumask *retmask) -{ - cpumask_clear(retmask); - cpumask_set_cpu(cpu, retmask); -} - static void __x2apic_send_IPI_dest(unsigned int apicid, int vector, unsigned int dest) { diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index c090af10ac7d..38155f667144 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -156,7 +156,6 @@ struct x86_cpuinit_ops { /** * struct x86_platform_ops - platform specific runtime functions * @calibrate_tsc: calibrate TSC - * @wallclock_init: init the wallclock device * @get_wallclock: get time from HW clock like RTC etc. * @set_wallclock: set time back to HW clock * @is_untracked_pat_range exclude from PAT logic @@ -164,10 +163,10 @@ struct x86_cpuinit_ops { * @i8042_detect pre-detect if i8042 controller exists * @save_sched_clock_state: save state for sched_clock() on suspend * @restore_sched_clock_state: restore state for sched_clock() on resume + * @apic_post_init: adjust apic if neeeded */ struct x86_platform_ops { unsigned long (*calibrate_tsc)(void); - void (*wallclock_init)(void); unsigned long (*get_wallclock)(void); int (*set_wallclock)(unsigned long nowtime); void (*iommu_shutdown)(void); @@ -177,6 +176,7 @@ struct x86_platform_ops { int (*i8042_detect)(void); void (*save_sched_clock_state)(void); void (*restore_sched_clock_state)(void); + void (*apic_post_init)(void); }; struct pci_dev; diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h index 5728852fb90f..59c226d120cd 100644 --- a/arch/x86/include/asm/xen/hypercall.h +++ b/arch/x86/include/asm/xen/hypercall.h @@ -48,6 +48,7 @@ #include <xen/interface/sched.h> #include <xen/interface/physdev.h> #include <xen/interface/platform.h> +#include <xen/interface/xen-mca.h> /* * The hypercall asms have to meet several constraints: @@ -302,6 +303,13 @@ HYPERVISOR_set_timer_op(u64 timeout) } static inline int +HYPERVISOR_mca(struct xen_mc *mc_op) +{ + mc_op->interface_version = XEN_MCA_INTERFACE_VERSION; + return _hypercall1(int, mca, mc_op); +} + +static inline int HYPERVISOR_dom0_op(struct xen_platform_op *platform_op) { platform_op->interface_version = XENPF_INTERFACE_VERSION; diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 8afb69319815..b2297e58c6ed 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -422,12 +422,14 @@ acpi_parse_int_src_ovr(struct acpi_subtable_header * header, return 0; } - if (intsrc->source_irq == 0 && intsrc->global_irq == 2) { + if (intsrc->source_irq == 0) { if (acpi_skip_timer_override) { - printk(PREFIX "BIOS IRQ0 pin2 override ignored.\n"); + printk(PREFIX "BIOS IRQ0 override ignored.\n"); return 0; } - if (acpi_fix_pin2_polarity && (intsrc->inti_flags & ACPI_MADT_POLARITY_MASK)) { + + if ((intsrc->global_irq == 2) && acpi_fix_pin2_polarity + && (intsrc->inti_flags & ACPI_MADT_POLARITY_MASK)) { intsrc->inti_flags &= ~ACPI_MADT_POLARITY_MASK; printk(PREFIX "BIOS IRQ0 pin2 override: forcing polarity to high active.\n"); } @@ -1334,17 +1336,12 @@ static int __init dmi_disable_acpi(const struct dmi_system_id *d) } /* - 
* Force ignoring BIOS IRQ0 pin2 override + * Force ignoring BIOS IRQ0 override */ static int __init dmi_ignore_irq0_timer_override(const struct dmi_system_id *d) { - /* - * The ati_ixp4x0_rev() early PCI quirk should have set - * the acpi_skip_timer_override flag already: - */ if (!acpi_skip_timer_override) { - WARN(1, KERN_ERR "ati_ixp4x0 quirk not complete.\n"); - pr_notice("%s detected: Ignoring BIOS IRQ0 pin2 override\n", + pr_notice("%s detected: Ignoring BIOS IRQ0 override\n", d->ident); acpi_skip_timer_override = 1; } @@ -1438,7 +1435,7 @@ static struct dmi_system_id __initdata acpi_dmi_table_late[] = { * is enabled. This input is incorrectly designated the * ISA IRQ 0 via an interrupt source override even though * it is wired to the output of the master 8259A and INTIN0 - * is not connected at all. Force ignoring BIOS IRQ0 pin2 + * is not connected at all. Force ignoring BIOS IRQ0 * override in that cases. */ { @@ -1473,6 +1470,14 @@ static struct dmi_system_id __initdata acpi_dmi_table_late[] = { DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq 6715b"), }, }, + { + .callback = dmi_ignore_irq0_timer_override, + .ident = "FUJITSU SIEMENS", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "FUJITSU SIEMENS"), + DMI_MATCH(DMI_PRODUCT_NAME, "AMILO PRO V2030"), + }, + }, {} }; diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 1f84794f0759..931280ff8299 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -1,3 +1,5 @@ +#define pr_fmt(fmt) "SMP alternatives: " fmt + #include <linux/module.h> #include <linux/sched.h> #include <linux/mutex.h> @@ -63,8 +65,11 @@ static int __init setup_noreplace_paravirt(char *str) __setup("noreplace-paravirt", setup_noreplace_paravirt); #endif -#define DPRINTK(fmt, args...) if (debug_alternative) \ - printk(KERN_DEBUG fmt, args) +#define DPRINTK(fmt, ...) 
\ +do { \ + if (debug_alternative) \ + printk(KERN_DEBUG fmt, ##__VA_ARGS__); \ +} while (0) /* * Each GENERIC_NOPX is of X bytes, and defined as an array of bytes @@ -428,7 +433,7 @@ void alternatives_smp_switch(int smp) * If this still occurs then you should see a hang * or crash shortly after this line: */ - printk("lockdep: fixing up alternatives.\n"); + pr_info("lockdep: fixing up alternatives\n"); #endif if (noreplace_smp || smp_alt_once || skip_smp_alternatives) @@ -444,14 +449,14 @@ void alternatives_smp_switch(int smp) if (smp == smp_mode) { /* nothing */ } else if (smp) { - printk(KERN_INFO "SMP alternatives: switching to SMP code\n"); + pr_info("switching to SMP code\n"); clear_cpu_cap(&boot_cpu_data, X86_FEATURE_UP); clear_cpu_cap(&cpu_data(0), X86_FEATURE_UP); list_for_each_entry(mod, &smp_alt_modules, next) alternatives_smp_lock(mod->locks, mod->locks_end, mod->text, mod->text_end); } else { - printk(KERN_INFO "SMP alternatives: switching to UP code\n"); + pr_info("switching to UP code\n"); set_cpu_cap(&boot_cpu_data, X86_FEATURE_UP); set_cpu_cap(&cpu_data(0), X86_FEATURE_UP); list_for_each_entry(mod, &smp_alt_modules, next) @@ -546,7 +551,7 @@ void __init alternative_instructions(void) #ifdef CONFIG_SMP if (smp_alt_once) { if (1 == num_possible_cpus()) { - printk(KERN_INFO "SMP alternatives: switching to UP code\n"); + pr_info("switching to UP code\n"); set_cpu_cap(&boot_cpu_data, X86_FEATURE_UP); set_cpu_cap(&cpu_data(0), X86_FEATURE_UP); @@ -664,7 +669,7 @@ static int __kprobes stop_machine_text_poke(void *data) struct text_poke_param *p; int i; - if (atomic_dec_and_test(&stop_machine_first)) { + if (atomic_xchg(&stop_machine_first, 0)) { for (i = 0; i < tpp->nparams; i++) { p = &tpp->params[i]; text_poke(p->addr, p->opcode, p->len); diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index be16854591cc..aadf3359e2a7 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -2,6 +2,9 @@ * Shared support code for AMD K8 northbridges and derivates. * Copyright 2006 Andi Kleen, SUSE Labs. Subject to GPLv2. 
*/ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/types.h> #include <linux/slab.h> #include <linux/init.h> @@ -16,6 +19,7 @@ const struct pci_device_id amd_nb_misc_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F3) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M10H_F3) }, {} }; EXPORT_SYMBOL(amd_nb_misc_ids); @@ -258,7 +262,7 @@ void amd_flush_garts(void) } spin_unlock_irqrestore(&gart_lock, flags); if (!flushed) - printk("nothing to flush?\n"); + pr_notice("nothing to flush?\n"); } EXPORT_SYMBOL_GPL(amd_flush_garts); @@ -269,11 +273,10 @@ static __init int init_amd_nbs(void) err = amd_cache_northbridges(); if (err < 0) - printk(KERN_NOTICE "AMD NB: Cannot enumerate AMD northbridges.\n"); + pr_notice("Cannot enumerate AMD northbridges\n"); if (amd_cache_gart() < 0) - printk(KERN_NOTICE "AMD NB: Cannot initialize GART flush words, " - "GART support disabled.\n"); + pr_notice("Cannot initialize GART flush words, GART support disabled\n"); return err; } diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 39a222e094af..98e24131ff3a 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -2123,6 +2123,42 @@ void default_init_apic_ldr(void) apic_write(APIC_LDR, val); } +int default_cpu_mask_to_apicid_and(const struct cpumask *cpumask, + const struct cpumask *andmask, + unsigned int *apicid) +{ + unsigned int cpu; + + for_each_cpu_and(cpu, cpumask, andmask) { + if (cpumask_test_cpu(cpu, cpu_online_mask)) + break; + } + + if (likely(cpu < nr_cpu_ids)) { + *apicid = per_cpu(x86_cpu_to_apicid, cpu); + return 0; + } + + return -EINVAL; +} + +/* + * Override the generic EOI implementation with an optimized version. + * Only called during early boot when only one CPU is active and with + * interrupts disabled, so we know this does not race with actual APIC driver + * use. + */ +void __init apic_set_eoi_write(void (*eoi_write)(u32 reg, u32 v)) +{ + struct apic **drv; + + for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) { + /* Should happen once for each apic */ + WARN_ON((*drv)->eoi_write == eoi_write); + (*drv)->eoi_write = eoi_write; + } +} + /* * Power management */ diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index 0e881c46e8c8..00c77cf78e9e 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -36,25 +36,6 @@ static int flat_acpi_madt_oem_check(char *oem_id, char *oem_table_id) return 1; } -static const struct cpumask *flat_target_cpus(void) -{ - return cpu_online_mask; -} - -static void flat_vector_allocation_domain(int cpu, struct cpumask *retmask) -{ - /* Careful. Some cpus do not strictly honor the set of cpus - * specified in the interrupt destination when using lowest - * priority interrupt delivery mode. - * - * In particular there was a hyperthreading cpu observed to - * deliver interrupts to the wrong hyperthread when only one - * hyperthread was specified in the interrupt desitination. - */ - cpumask_clear(retmask); - cpumask_bits(retmask)[0] = APIC_ALL_CPUS; -} - /* * Set up the logical destination ID. 
* @@ -92,7 +73,7 @@ static void flat_send_IPI_mask(const struct cpumask *cpumask, int vector) } static void - flat_send_IPI_mask_allbutself(const struct cpumask *cpumask, int vector) +flat_send_IPI_mask_allbutself(const struct cpumask *cpumask, int vector) { unsigned long mask = cpumask_bits(cpumask)[0]; int cpu = smp_processor_id(); @@ -186,7 +167,7 @@ static struct apic apic_flat = { .irq_delivery_mode = dest_LowestPrio, .irq_dest_mode = 1, /* logical */ - .target_cpus = flat_target_cpus, + .target_cpus = online_target_cpus, .disable_esr = 0, .dest_logical = APIC_DEST_LOGICAL, .check_apicid_used = NULL, @@ -210,8 +191,7 @@ static struct apic apic_flat = { .set_apic_id = set_apic_id, .apic_id_mask = 0xFFu << 24, - .cpu_mask_to_apicid = default_cpu_mask_to_apicid, - .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and, + .cpu_mask_to_apicid_and = flat_cpu_mask_to_apicid_and, .send_IPI_mask = flat_send_IPI_mask, .send_IPI_mask_allbutself = flat_send_IPI_mask_allbutself, @@ -262,17 +242,6 @@ static int physflat_acpi_madt_oem_check(char *oem_id, char *oem_table_id) return 0; } -static const struct cpumask *physflat_target_cpus(void) -{ - return cpu_online_mask; -} - -static void physflat_vector_allocation_domain(int cpu, struct cpumask *retmask) -{ - cpumask_clear(retmask); - cpumask_set_cpu(cpu, retmask); -} - static void physflat_send_IPI_mask(const struct cpumask *cpumask, int vector) { default_send_IPI_mask_sequence_phys(cpumask, vector); @@ -294,38 +263,6 @@ static void physflat_send_IPI_all(int vector) physflat_send_IPI_mask(cpu_online_mask, vector); } -static unsigned int physflat_cpu_mask_to_apicid(const struct cpumask *cpumask) -{ - int cpu; - - /* - * We're using fixed IRQ delivery, can only return one phys APIC ID. - * May as well be the first. - */ - cpu = cpumask_first(cpumask); - if ((unsigned)cpu < nr_cpu_ids) - return per_cpu(x86_cpu_to_apicid, cpu); - else - return BAD_APICID; -} - -static unsigned int -physflat_cpu_mask_to_apicid_and(const struct cpumask *cpumask, - const struct cpumask *andmask) -{ - int cpu; - - /* - * We're using fixed IRQ delivery, can only return one phys APIC ID. - * May as well be the first. 
- */ - for_each_cpu_and(cpu, cpumask, andmask) { - if (cpumask_test_cpu(cpu, cpu_online_mask)) - break; - } - return per_cpu(x86_cpu_to_apicid, cpu); -} - static int physflat_probe(void) { if (apic == &apic_physflat || num_possible_cpus() > 8) @@ -345,13 +282,13 @@ static struct apic apic_physflat = { .irq_delivery_mode = dest_Fixed, .irq_dest_mode = 0, /* physical */ - .target_cpus = physflat_target_cpus, + .target_cpus = online_target_cpus, .disable_esr = 0, .dest_logical = 0, .check_apicid_used = NULL, .check_apicid_present = NULL, - .vector_allocation_domain = physflat_vector_allocation_domain, + .vector_allocation_domain = default_vector_allocation_domain, /* not needed, but shouldn't hurt: */ .init_apic_ldr = flat_init_apic_ldr, @@ -370,8 +307,7 @@ static struct apic apic_physflat = { .set_apic_id = set_apic_id, .apic_id_mask = 0xFFu << 24, - .cpu_mask_to_apicid = physflat_cpu_mask_to_apicid, - .cpu_mask_to_apicid_and = physflat_cpu_mask_to_apicid_and, + .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and, .send_IPI_mask = physflat_send_IPI_mask, .send_IPI_mask_allbutself = physflat_send_IPI_mask_allbutself, diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c index a6e4c6e06c08..e145f28b4099 100644 --- a/arch/x86/kernel/apic/apic_noop.c +++ b/arch/x86/kernel/apic/apic_noop.c @@ -100,12 +100,12 @@ static unsigned long noop_check_apicid_present(int bit) return physid_isset(bit, phys_cpu_present_map); } -static void noop_vector_allocation_domain(int cpu, struct cpumask *retmask) +static void noop_vector_allocation_domain(int cpu, struct cpumask *retmask, + const struct cpumask *mask) { if (cpu != 0) pr_warning("APIC: Vector allocated for non-BSP cpu\n"); - cpumask_clear(retmask); - cpumask_set_cpu(cpu, retmask); + cpumask_copy(retmask, cpumask_of(cpu)); } static u32 noop_apic_read(u32 reg) @@ -159,8 +159,7 @@ struct apic apic_noop = { .set_apic_id = NULL, .apic_id_mask = 0x0F << 24, - .cpu_mask_to_apicid = default_cpu_mask_to_apicid, - .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and, + .cpu_mask_to_apicid_and = flat_cpu_mask_to_apicid_and, .send_IPI_mask = noop_send_IPI_mask, .send_IPI_mask_allbutself = noop_send_IPI_mask_allbutself, diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c index 6ec6d5d297c3..bc552cff2578 100644 --- a/arch/x86/kernel/apic/apic_numachip.c +++ b/arch/x86/kernel/apic/apic_numachip.c @@ -72,17 +72,6 @@ static int numachip_phys_pkg_id(int initial_apic_id, int index_msb) return initial_apic_id >> index_msb; } -static const struct cpumask *numachip_target_cpus(void) -{ - return cpu_online_mask; -} - -static void numachip_vector_allocation_domain(int cpu, struct cpumask *retmask) -{ - cpumask_clear(retmask); - cpumask_set_cpu(cpu, retmask); -} - static int __cpuinit numachip_wakeup_secondary(int phys_apicid, unsigned long start_rip) { union numachip_csr_g3_ext_irq_gen int_gen; @@ -157,38 +146,6 @@ static void numachip_send_IPI_self(int vector) __default_send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL); } -static unsigned int numachip_cpu_mask_to_apicid(const struct cpumask *cpumask) -{ - int cpu; - - /* - * We're using fixed IRQ delivery, can only return one phys APIC ID. - * May as well be the first. 
- */ - cpu = cpumask_first(cpumask); - if (likely((unsigned)cpu < nr_cpu_ids)) - return per_cpu(x86_cpu_to_apicid, cpu); - - return BAD_APICID; -} - -static unsigned int -numachip_cpu_mask_to_apicid_and(const struct cpumask *cpumask, - const struct cpumask *andmask) -{ - int cpu; - - /* - * We're using fixed IRQ delivery, can only return one phys APIC ID. - * May as well be the first. - */ - for_each_cpu_and(cpu, cpumask, andmask) { - if (cpumask_test_cpu(cpu, cpu_online_mask)) - break; - } - return per_cpu(x86_cpu_to_apicid, cpu); -} - static int __init numachip_probe(void) { return apic == &apic_numachip; @@ -253,13 +210,13 @@ static struct apic apic_numachip __refconst = { .irq_delivery_mode = dest_Fixed, .irq_dest_mode = 0, /* physical */ - .target_cpus = numachip_target_cpus, + .target_cpus = online_target_cpus, .disable_esr = 0, .dest_logical = 0, .check_apicid_used = NULL, .check_apicid_present = NULL, - .vector_allocation_domain = numachip_vector_allocation_domain, + .vector_allocation_domain = default_vector_allocation_domain, .init_apic_ldr = flat_init_apic_ldr, .ioapic_phys_id_map = NULL, @@ -277,8 +234,7 @@ static struct apic apic_numachip __refconst = { .set_apic_id = set_apic_id, .apic_id_mask = 0xffU << 24, - .cpu_mask_to_apicid = numachip_cpu_mask_to_apicid, - .cpu_mask_to_apicid_and = numachip_cpu_mask_to_apicid_and, + .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and, .send_IPI_mask = numachip_send_IPI_mask, .send_IPI_mask_allbutself = numachip_send_IPI_mask_allbutself, diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c index 31fbdbfbf960..d50e3640d5ae 100644 --- a/arch/x86/kernel/apic/bigsmp_32.c +++ b/arch/x86/kernel/apic/bigsmp_32.c @@ -26,15 +26,6 @@ static int bigsmp_apic_id_registered(void) return 1; } -static const struct cpumask *bigsmp_target_cpus(void) -{ -#ifdef CONFIG_SMP - return cpu_online_mask; -#else - return cpumask_of(0); -#endif -} - static unsigned long bigsmp_check_apicid_used(physid_mask_t *map, int apicid) { return 0; @@ -105,32 +96,6 @@ static int bigsmp_check_phys_apicid_present(int phys_apicid) return 1; } -/* As we are using single CPU as destination, pick only one CPU here */ -static unsigned int bigsmp_cpu_mask_to_apicid(const struct cpumask *cpumask) -{ - int cpu = cpumask_first(cpumask); - - if (cpu < nr_cpu_ids) - return cpu_physical_id(cpu); - return BAD_APICID; -} - -static unsigned int bigsmp_cpu_mask_to_apicid_and(const struct cpumask *cpumask, - const struct cpumask *andmask) -{ - int cpu; - - /* - * We're using fixed IRQ delivery, can only return one phys APIC ID. - * May as well be the first. 
- */ - for_each_cpu_and(cpu, cpumask, andmask) { - if (cpumask_test_cpu(cpu, cpu_online_mask)) - return cpu_physical_id(cpu); - } - return BAD_APICID; -} - static int bigsmp_phys_pkg_id(int cpuid_apic, int index_msb) { return cpuid_apic >> index_msb; @@ -177,12 +142,6 @@ static const struct dmi_system_id bigsmp_dmi_table[] = { { } /* NULL entry stops DMI scanning */ }; -static void bigsmp_vector_allocation_domain(int cpu, struct cpumask *retmask) -{ - cpumask_clear(retmask); - cpumask_set_cpu(cpu, retmask); -} - static int probe_bigsmp(void) { if (def_to_bigsmp) @@ -205,13 +164,13 @@ static struct apic apic_bigsmp = { /* phys delivery to target CPU: */ .irq_dest_mode = 0, - .target_cpus = bigsmp_target_cpus, + .target_cpus = default_target_cpus, .disable_esr = 1, .dest_logical = 0, .check_apicid_used = bigsmp_check_apicid_used, .check_apicid_present = bigsmp_check_apicid_present, - .vector_allocation_domain = bigsmp_vector_allocation_domain, + .vector_allocation_domain = default_vector_allocation_domain, .init_apic_ldr = bigsmp_init_apic_ldr, .ioapic_phys_id_map = bigsmp_ioapic_phys_id_map, @@ -229,8 +188,7 @@ static struct apic apic_bigsmp = { .set_apic_id = NULL, .apic_id_mask = 0xFF << 24, - .cpu_mask_to_apicid = bigsmp_cpu_mask_to_apicid, - .cpu_mask_to_apicid_and = bigsmp_cpu_mask_to_apicid_and, + .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and, .send_IPI_mask = bigsmp_send_IPI_mask, .send_IPI_mask_allbutself = NULL, diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c index db4ab1be3c79..0874799a98c6 100644 --- a/arch/x86/kernel/apic/es7000_32.c +++ b/arch/x86/kernel/apic/es7000_32.c @@ -394,21 +394,6 @@ static void es7000_enable_apic_mode(void) WARN(1, "Command failed, status = %x\n", mip_status); } -static void es7000_vector_allocation_domain(int cpu, struct cpumask *retmask) -{ - /* Careful. Some cpus do not strictly honor the set of cpus - * specified in the interrupt destination when using lowest - * priority interrupt delivery mode. - * - * In particular there was a hyperthreading cpu observed to - * deliver interrupts to the wrong hyperthread when only one - * hyperthread was specified in the interrupt desitination. - */ - cpumask_clear(retmask); - cpumask_bits(retmask)[0] = APIC_ALL_CPUS; -} - - static void es7000_wait_for_init_deassert(atomic_t *deassert) { while (!atomic_read(deassert)) @@ -540,45 +525,49 @@ static int es7000_check_phys_apicid_present(int cpu_physical_apicid) return 1; } -static unsigned int es7000_cpu_mask_to_apicid(const struct cpumask *cpumask) +static inline int +es7000_cpu_mask_to_apicid(const struct cpumask *cpumask, unsigned int *dest_id) { unsigned int round = 0; - int cpu, uninitialized_var(apicid); + unsigned int cpu, uninitialized_var(apicid); /* * The cpus in the mask must all be on the apic cluster. 
*/ - for_each_cpu(cpu, cpumask) { + for_each_cpu_and(cpu, cpumask, cpu_online_mask) { int new_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu); if (round && APIC_CLUSTER(apicid) != APIC_CLUSTER(new_apicid)) { WARN(1, "Not a valid mask!"); - return BAD_APICID; + return -EINVAL; } - apicid = new_apicid; + apicid |= new_apicid; round++; } - return apicid; + if (!round) + return -EINVAL; + *dest_id = apicid; + return 0; } -static unsigned int +static int es7000_cpu_mask_to_apicid_and(const struct cpumask *inmask, - const struct cpumask *andmask) + const struct cpumask *andmask, + unsigned int *apicid) { - int apicid = early_per_cpu(x86_cpu_to_logical_apicid, 0); cpumask_var_t cpumask; + *apicid = early_per_cpu(x86_cpu_to_logical_apicid, 0); if (!alloc_cpumask_var(&cpumask, GFP_ATOMIC)) - return apicid; + return 0; cpumask_and(cpumask, inmask, andmask); - cpumask_and(cpumask, cpumask, cpu_online_mask); - apicid = es7000_cpu_mask_to_apicid(cpumask); + es7000_cpu_mask_to_apicid(cpumask, apicid); free_cpumask_var(cpumask); - return apicid; + return 0; } static int es7000_phys_pkg_id(int cpuid_apic, int index_msb) @@ -638,7 +627,7 @@ static struct apic __refdata apic_es7000_cluster = { .check_apicid_used = es7000_check_apicid_used, .check_apicid_present = es7000_check_apicid_present, - .vector_allocation_domain = es7000_vector_allocation_domain, + .vector_allocation_domain = flat_vector_allocation_domain, .init_apic_ldr = es7000_init_apic_ldr_cluster, .ioapic_phys_id_map = es7000_ioapic_phys_id_map, @@ -656,7 +645,6 @@ static struct apic __refdata apic_es7000_cluster = { .set_apic_id = NULL, .apic_id_mask = 0xFF << 24, - .cpu_mask_to_apicid = es7000_cpu_mask_to_apicid, .cpu_mask_to_apicid_and = es7000_cpu_mask_to_apicid_and, .send_IPI_mask = es7000_send_IPI_mask, @@ -705,7 +693,7 @@ static struct apic __refdata apic_es7000 = { .check_apicid_used = es7000_check_apicid_used, .check_apicid_present = es7000_check_apicid_present, - .vector_allocation_domain = es7000_vector_allocation_domain, + .vector_allocation_domain = flat_vector_allocation_domain, .init_apic_ldr = es7000_init_apic_ldr, .ioapic_phys_id_map = es7000_ioapic_phys_id_map, @@ -723,7 +711,6 @@ static struct apic __refdata apic_es7000 = { .set_apic_id = NULL, .apic_id_mask = 0xFF << 24, - .cpu_mask_to_apicid = es7000_cpu_mask_to_apicid, .cpu_mask_to_apicid_and = es7000_cpu_mask_to_apicid_and, .send_IPI_mask = es7000_send_IPI_mask, diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 5f0ff597437c..406eee784684 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -448,8 +448,8 @@ static int __add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pi entry = alloc_irq_pin_list(node); if (!entry) { - printk(KERN_ERR "can not alloc irq_pin_list (%d,%d,%d)\n", - node, apic, pin); + pr_err("can not alloc irq_pin_list (%d,%d,%d)\n", + node, apic, pin); return -ENOMEM; } entry->apic = apic; @@ -661,7 +661,7 @@ static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) ioapic_mask_entry(apic, pin); entry = ioapic_read_entry(apic, pin); if (entry.irr) - printk(KERN_ERR "Unable to reset IRR for apic: %d, pin :%d\n", + pr_err("Unable to reset IRR for apic: %d, pin :%d\n", mpc_ioapic_id(apic), pin); } @@ -895,7 +895,7 @@ static int irq_polarity(int idx) } case 2: /* reserved */ { - printk(KERN_WARNING "broken BIOS!!\n"); + pr_warn("broken BIOS!!\n"); polarity = 1; break; } @@ -906,7 +906,7 @@ static int irq_polarity(int idx) } default: /* invalid */ { - 
printk(KERN_WARNING "broken BIOS!!\n"); + pr_warn("broken BIOS!!\n"); polarity = 1; break; } @@ -948,7 +948,7 @@ static int irq_trigger(int idx) } default: { - printk(KERN_WARNING "broken BIOS!!\n"); + pr_warn("broken BIOS!!\n"); trigger = 1; break; } @@ -962,7 +962,7 @@ static int irq_trigger(int idx) } case 2: /* reserved */ { - printk(KERN_WARNING "broken BIOS!!\n"); + pr_warn("broken BIOS!!\n"); trigger = 1; break; } @@ -973,7 +973,7 @@ static int irq_trigger(int idx) } default: /* invalid */ { - printk(KERN_WARNING "broken BIOS!!\n"); + pr_warn("broken BIOS!!\n"); trigger = 0; break; } @@ -991,7 +991,7 @@ static int pin_2_irq(int idx, int apic, int pin) * Debugging check, we are in big trouble if this message pops up! */ if (mp_irqs[idx].dstirq != pin) - printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n"); + pr_err("broken BIOS or MPTABLE parser, ayiee!!\n"); if (test_bit(bus, mp_bus_not_pci)) { irq = mp_irqs[idx].srcbusirq; @@ -1112,8 +1112,7 @@ __assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask) * 0x80, because int 0x80 is hm, kind of importantish. ;) */ static int current_vector = FIRST_EXTERNAL_VECTOR + VECTOR_OFFSET_START; - static int current_offset = VECTOR_OFFSET_START % 8; - unsigned int old_vector; + static int current_offset = VECTOR_OFFSET_START % 16; int cpu, err; cpumask_var_t tmp_mask; @@ -1123,35 +1122,45 @@ __assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask) if (!alloc_cpumask_var(&tmp_mask, GFP_ATOMIC)) return -ENOMEM; - old_vector = cfg->vector; - if (old_vector) { - cpumask_and(tmp_mask, mask, cpu_online_mask); - cpumask_and(tmp_mask, cfg->domain, tmp_mask); - if (!cpumask_empty(tmp_mask)) { - free_cpumask_var(tmp_mask); - return 0; - } - } - /* Only try and allocate irqs on cpus that are present */ err = -ENOSPC; - for_each_cpu_and(cpu, mask, cpu_online_mask) { - int new_cpu; - int vector, offset; + cpumask_clear(cfg->old_domain); + cpu = cpumask_first_and(mask, cpu_online_mask); + while (cpu < nr_cpu_ids) { + int new_cpu, vector, offset; - apic->vector_allocation_domain(cpu, tmp_mask); + apic->vector_allocation_domain(cpu, tmp_mask, mask); + + if (cpumask_subset(tmp_mask, cfg->domain)) { + err = 0; + if (cpumask_equal(tmp_mask, cfg->domain)) + break; + /* + * New cpumask using the vector is a proper subset of + * the current in use mask. So cleanup the vector + * allocation for the members that are not used anymore. + */ + cpumask_andnot(cfg->old_domain, cfg->domain, tmp_mask); + cfg->move_in_progress = 1; + cpumask_and(cfg->domain, cfg->domain, tmp_mask); + break; + } vector = current_vector; offset = current_offset; next: - vector += 8; + vector += 16; if (vector >= first_system_vector) { - /* If out of vectors on large boxen, must share them. */ - offset = (offset + 1) % 8; + offset = (offset + 1) % 16; vector = FIRST_EXTERNAL_VECTOR + offset; } - if (unlikely(current_vector == vector)) + + if (unlikely(current_vector == vector)) { + cpumask_or(cfg->old_domain, cfg->old_domain, tmp_mask); + cpumask_andnot(tmp_mask, mask, cfg->old_domain); + cpu = cpumask_first_and(tmp_mask, cpu_online_mask); continue; + } if (test_bit(vector, used_vectors)) goto next; @@ -1162,7 +1171,7 @@ next: /* Found one! 
*/ current_vector = vector; current_offset = offset; - if (old_vector) { + if (cfg->vector) { cfg->move_in_progress = 1; cpumask_copy(cfg->old_domain, cfg->domain); } @@ -1346,18 +1355,18 @@ static void setup_ioapic_irq(unsigned int irq, struct irq_cfg *cfg, if (!IO_APIC_IRQ(irq)) return; - /* - * For legacy irqs, cfg->domain starts with cpu 0 for legacy - * controllers like 8259. Now that IO-APIC can handle this irq, update - * the cfg->domain. - */ - if (irq < legacy_pic->nr_legacy_irqs && cpumask_test_cpu(0, cfg->domain)) - apic->vector_allocation_domain(0, cfg->domain); if (assign_irq_vector(irq, cfg, apic->target_cpus())) return; - dest = apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus()); + if (apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus(), + &dest)) { + pr_warn("Failed to obtain apicid for ioapic %d, pin %d\n", + mpc_ioapic_id(attr->ioapic), attr->ioapic_pin); + __clear_irq_vector(irq, cfg); + + return; + } apic_printk(APIC_VERBOSE,KERN_DEBUG "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> " @@ -1366,7 +1375,7 @@ static void setup_ioapic_irq(unsigned int irq, struct irq_cfg *cfg, cfg->vector, irq, attr->trigger, attr->polarity, dest); if (setup_ioapic_entry(irq, &entry, dest, cfg->vector, attr)) { - pr_warn("Failed to setup ioapic entry for ioapic %d, pin %d\n", + pr_warn("Failed to setup ioapic entry for ioapic %d, pin %d\n", mpc_ioapic_id(attr->ioapic), attr->ioapic_pin); __clear_irq_vector(irq, cfg); @@ -1469,9 +1478,10 @@ void setup_IO_APIC_irq_extra(u32 gsi) * Set up the timer pin, possibly with the 8259A-master behind. */ static void __init setup_timer_IRQ0_pin(unsigned int ioapic_idx, - unsigned int pin, int vector) + unsigned int pin, int vector) { struct IO_APIC_route_entry entry; + unsigned int dest; if (irq_remapping_enabled) return; @@ -1482,9 +1492,13 @@ static void __init setup_timer_IRQ0_pin(unsigned int ioapic_idx, * We use logical delivery to get the timer IRQ * to the first CPU. */ + if (unlikely(apic->cpu_mask_to_apicid_and(apic->target_cpus(), + apic->target_cpus(), &dest))) + dest = BAD_APICID; + entry.dest_mode = apic->irq_dest_mode; entry.mask = 0; /* don't mask IRQ for edge */ - entry.dest = apic->cpu_mask_to_apicid(apic->target_cpus()); + entry.dest = dest; entry.delivery_mode = apic->irq_delivery_mode; entry.polarity = 0; entry.trigger = 0; @@ -1521,7 +1535,6 @@ __apicdebuginit(void) print_IO_APIC(int ioapic_idx) reg_03.raw = io_apic_read(ioapic_idx, 3); raw_spin_unlock_irqrestore(&ioapic_lock, flags); - printk("\n"); printk(KERN_DEBUG "IO APIC #%d......\n", mpc_ioapic_id(ioapic_idx)); printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw); printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID); @@ -1578,7 +1591,7 @@ __apicdebuginit(void) print_IO_APIC(int ioapic_idx) i, ir_entry->index ); - printk("%1d %1d %1d %1d %1d " + pr_cont("%1d %1d %1d %1d %1d " "%1d %1d %X %02X\n", ir_entry->format, ir_entry->mask, @@ -1598,7 +1611,7 @@ __apicdebuginit(void) print_IO_APIC(int ioapic_idx) i, entry.dest ); - printk("%1d %1d %1d %1d %1d " + pr_cont("%1d %1d %1d %1d %1d " "%1d %1d %02X\n", entry.mask, entry.trigger, @@ -1651,8 +1664,8 @@ __apicdebuginit(void) print_IO_APICs(void) continue; printk(KERN_DEBUG "IRQ%d ", irq); for_each_irq_pin(entry, cfg->irq_2_pin) - printk("-> %d:%d", entry->apic, entry->pin); - printk("\n"); + pr_cont("-> %d:%d", entry->apic, entry->pin); + pr_cont("\n"); } printk(KERN_INFO ".................................... 
done.\n"); @@ -1665,9 +1678,9 @@ __apicdebuginit(void) print_APIC_field(int base) printk(KERN_DEBUG); for (i = 0; i < 8; i++) - printk(KERN_CONT "%08x", apic_read(base + i*0x10)); + pr_cont("%08x", apic_read(base + i*0x10)); - printk(KERN_CONT "\n"); + pr_cont("\n"); } __apicdebuginit(void) print_local_APIC(void *dummy) @@ -1769,7 +1782,7 @@ __apicdebuginit(void) print_local_APIC(void *dummy) printk(KERN_DEBUG "... APIC EILVT%d: %08x\n", i, v); } } - printk("\n"); + pr_cont("\n"); } __apicdebuginit(void) print_local_APICs(int maxcpu) @@ -2065,7 +2078,7 @@ void __init setup_ioapic_ids_from_mpc_nocheck(void) reg_00.raw = io_apic_read(ioapic_idx, 0); raw_spin_unlock_irqrestore(&ioapic_lock, flags); if (reg_00.bits.ID != mpc_ioapic_id(ioapic_idx)) - printk("could not set ID!\n"); + pr_cont("could not set ID!\n"); else apic_printk(APIC_VERBOSE, " ok.\n"); } @@ -2210,71 +2223,6 @@ void send_cleanup_vector(struct irq_cfg *cfg) cfg->move_in_progress = 0; } -static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq_cfg *cfg) -{ - int apic, pin; - struct irq_pin_list *entry; - u8 vector = cfg->vector; - - for_each_irq_pin(entry, cfg->irq_2_pin) { - unsigned int reg; - - apic = entry->apic; - pin = entry->pin; - /* - * With interrupt-remapping, destination information comes - * from interrupt-remapping table entry. - */ - if (!irq_remapped(cfg)) - io_apic_write(apic, 0x11 + pin*2, dest); - reg = io_apic_read(apic, 0x10 + pin*2); - reg &= ~IO_APIC_REDIR_VECTOR_MASK; - reg |= vector; - io_apic_modify(apic, 0x10 + pin*2, reg); - } -} - -/* - * Either sets data->affinity to a valid value, and returns - * ->cpu_mask_to_apicid of that in dest_id, or returns -1 and - * leaves data->affinity untouched. - */ -int __ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask, - unsigned int *dest_id) -{ - struct irq_cfg *cfg = data->chip_data; - - if (!cpumask_intersects(mask, cpu_online_mask)) - return -1; - - if (assign_irq_vector(data->irq, data->chip_data, mask)) - return -1; - - cpumask_copy(data->affinity, mask); - - *dest_id = apic->cpu_mask_to_apicid_and(mask, cfg->domain); - return 0; -} - -static int -ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask, - bool force) -{ - unsigned int dest, irq = data->irq; - unsigned long flags; - int ret; - - raw_spin_lock_irqsave(&ioapic_lock, flags); - ret = __ioapic_set_affinity(data, mask, &dest); - if (!ret) { - /* Only the high 8 bits are valid. */ - dest = SET_APIC_LOGICAL_ID(dest); - __target_IO_APIC_irq(irq, dest, data->chip_data); - } - raw_spin_unlock_irqrestore(&ioapic_lock, flags); - return ret; -} - asmlinkage void smp_irq_move_cleanup_interrupt(void) { unsigned vector, me; @@ -2362,6 +2310,87 @@ void irq_force_complete_move(int irq) static inline void irq_complete_move(struct irq_cfg *cfg) { } #endif +static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq_cfg *cfg) +{ + int apic, pin; + struct irq_pin_list *entry; + u8 vector = cfg->vector; + + for_each_irq_pin(entry, cfg->irq_2_pin) { + unsigned int reg; + + apic = entry->apic; + pin = entry->pin; + /* + * With interrupt-remapping, destination information comes + * from interrupt-remapping table entry. 
+ */ + if (!irq_remapped(cfg)) + io_apic_write(apic, 0x11 + pin*2, dest); + reg = io_apic_read(apic, 0x10 + pin*2); + reg &= ~IO_APIC_REDIR_VECTOR_MASK; + reg |= vector; + io_apic_modify(apic, 0x10 + pin*2, reg); + } +} + +/* + * Either sets data->affinity to a valid value, and returns + * ->cpu_mask_to_apicid of that in dest_id, or returns -1 and + * leaves data->affinity untouched. + */ +int __ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask, + unsigned int *dest_id) +{ + struct irq_cfg *cfg = data->chip_data; + unsigned int irq = data->irq; + int err; + + if (!config_enabled(CONFIG_SMP)) + return -1; + + if (!cpumask_intersects(mask, cpu_online_mask)) + return -EINVAL; + + err = assign_irq_vector(irq, cfg, mask); + if (err) + return err; + + err = apic->cpu_mask_to_apicid_and(mask, cfg->domain, dest_id); + if (err) { + if (assign_irq_vector(irq, cfg, data->affinity)) + pr_err("Failed to recover vector for irq %d\n", irq); + return err; + } + + cpumask_copy(data->affinity, mask); + + return 0; +} + +static int +ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask, + bool force) +{ + unsigned int dest, irq = data->irq; + unsigned long flags; + int ret; + + if (!config_enabled(CONFIG_SMP)) + return -1; + + raw_spin_lock_irqsave(&ioapic_lock, flags); + ret = __ioapic_set_affinity(data, mask, &dest); + if (!ret) { + /* Only the high 8 bits are valid. */ + dest = SET_APIC_LOGICAL_ID(dest); + __target_IO_APIC_irq(irq, dest, data->chip_data); + ret = IRQ_SET_MASK_OK_NOCOPY; + } + raw_spin_unlock_irqrestore(&ioapic_lock, flags); + return ret; +} + static void ack_apic_edge(struct irq_data *data) { irq_complete_move(data->chip_data); @@ -2541,9 +2570,7 @@ static void irq_remap_modify_chip_defaults(struct irq_chip *chip) chip->irq_ack = ir_ack_apic_edge; chip->irq_eoi = ir_ack_apic_level; -#ifdef CONFIG_SMP chip->irq_set_affinity = set_remapped_irq_affinity; -#endif } #endif /* CONFIG_IRQ_REMAP */ @@ -2554,9 +2581,7 @@ static struct irq_chip ioapic_chip __read_mostly = { .irq_unmask = unmask_ioapic_irq, .irq_ack = ack_apic_edge, .irq_eoi = ack_apic_level, -#ifdef CONFIG_SMP .irq_set_affinity = ioapic_set_affinity, -#endif .irq_retrigger = ioapic_retrigger_irq, }; @@ -3038,7 +3063,10 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, if (err) return err; - dest = apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus()); + err = apic->cpu_mask_to_apicid_and(cfg->domain, + apic->target_cpus(), &dest); + if (err) + return err; if (irq_remapped(cfg)) { compose_remapped_msi_msg(pdev, irq, dest, msg, hpet_id); @@ -3072,7 +3100,6 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, return err; } -#ifdef CONFIG_SMP static int msi_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force) { @@ -3092,9 +3119,8 @@ msi_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force) __write_msi_msg(data->msi_desc, &msg); - return 0; + return IRQ_SET_MASK_OK_NOCOPY; } -#endif /* CONFIG_SMP */ /* * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices, @@ -3105,9 +3131,7 @@ static struct irq_chip msi_chip = { .irq_unmask = unmask_msi_irq, .irq_mask = mask_msi_irq, .irq_ack = ack_apic_edge, -#ifdef CONFIG_SMP .irq_set_affinity = msi_set_affinity, -#endif .irq_retrigger = ioapic_retrigger_irq, }; @@ -3192,7 +3216,6 @@ void native_teardown_msi_irq(unsigned int irq) } #ifdef CONFIG_DMAR_TABLE -#ifdef CONFIG_SMP static int dmar_msi_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force) @@ -3214,19 
+3237,15 @@ dmar_msi_set_affinity(struct irq_data *data, const struct cpumask *mask, dmar_msi_write(irq, &msg); - return 0; + return IRQ_SET_MASK_OK_NOCOPY; } -#endif /* CONFIG_SMP */ - static struct irq_chip dmar_msi_type = { .name = "DMAR_MSI", .irq_unmask = dmar_msi_unmask, .irq_mask = dmar_msi_mask, .irq_ack = ack_apic_edge, -#ifdef CONFIG_SMP .irq_set_affinity = dmar_msi_set_affinity, -#endif .irq_retrigger = ioapic_retrigger_irq, }; @@ -3247,7 +3266,6 @@ int arch_setup_dmar_msi(unsigned int irq) #ifdef CONFIG_HPET_TIMER -#ifdef CONFIG_SMP static int hpet_msi_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force) { @@ -3267,19 +3285,15 @@ static int hpet_msi_set_affinity(struct irq_data *data, hpet_msi_write(data->handler_data, &msg); - return 0; + return IRQ_SET_MASK_OK_NOCOPY; } -#endif /* CONFIG_SMP */ - static struct irq_chip hpet_msi_type = { .name = "HPET_MSI", .irq_unmask = hpet_msi_unmask, .irq_mask = hpet_msi_mask, .irq_ack = ack_apic_edge, -#ifdef CONFIG_SMP .irq_set_affinity = hpet_msi_set_affinity, -#endif .irq_retrigger = ioapic_retrigger_irq, }; @@ -3314,8 +3328,6 @@ int arch_setup_hpet_msi(unsigned int irq, unsigned int id) */ #ifdef CONFIG_HT_IRQ -#ifdef CONFIG_SMP - static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector) { struct ht_irq_msg msg; @@ -3340,25 +3352,23 @@ ht_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force) return -1; target_ht_irq(data->irq, dest, cfg->vector); - return 0; + return IRQ_SET_MASK_OK_NOCOPY; } -#endif - static struct irq_chip ht_irq_chip = { .name = "PCI-HT", .irq_mask = mask_ht_irq, .irq_unmask = unmask_ht_irq, .irq_ack = ack_apic_edge, -#ifdef CONFIG_SMP .irq_set_affinity = ht_set_affinity, -#endif .irq_retrigger = ioapic_retrigger_irq, }; int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) { struct irq_cfg *cfg; + struct ht_irq_msg msg; + unsigned dest; int err; if (disable_apic) @@ -3366,36 +3376,37 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) cfg = irq_cfg(irq); err = assign_irq_vector(irq, cfg, apic->target_cpus()); - if (!err) { - struct ht_irq_msg msg; - unsigned dest; + if (err) + return err; + + err = apic->cpu_mask_to_apicid_and(cfg->domain, + apic->target_cpus(), &dest); + if (err) + return err; - dest = apic->cpu_mask_to_apicid_and(cfg->domain, - apic->target_cpus()); + msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest); - msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest); + msg.address_lo = + HT_IRQ_LOW_BASE | + HT_IRQ_LOW_DEST_ID(dest) | + HT_IRQ_LOW_VECTOR(cfg->vector) | + ((apic->irq_dest_mode == 0) ? + HT_IRQ_LOW_DM_PHYSICAL : + HT_IRQ_LOW_DM_LOGICAL) | + HT_IRQ_LOW_RQEOI_EDGE | + ((apic->irq_delivery_mode != dest_LowestPrio) ? + HT_IRQ_LOW_MT_FIXED : + HT_IRQ_LOW_MT_ARBITRATED) | + HT_IRQ_LOW_IRQ_MASKED; - msg.address_lo = - HT_IRQ_LOW_BASE | - HT_IRQ_LOW_DEST_ID(dest) | - HT_IRQ_LOW_VECTOR(cfg->vector) | - ((apic->irq_dest_mode == 0) ? - HT_IRQ_LOW_DM_PHYSICAL : - HT_IRQ_LOW_DM_LOGICAL) | - HT_IRQ_LOW_RQEOI_EDGE | - ((apic->irq_delivery_mode != dest_LowestPrio) ? 
- HT_IRQ_LOW_MT_FIXED : - HT_IRQ_LOW_MT_ARBITRATED) | - HT_IRQ_LOW_IRQ_MASKED; + write_ht_irq_msg(irq, &msg); - write_ht_irq_msg(irq, &msg); + irq_set_chip_and_handler_name(irq, &ht_irq_chip, + handle_edge_irq, "edge"); - irq_set_chip_and_handler_name(irq, &ht_irq_chip, - handle_edge_irq, "edge"); + dev_printk(KERN_DEBUG, &dev->dev, "irq %d for HT\n", irq); - dev_printk(KERN_DEBUG, &dev->dev, "irq %d for HT\n", irq); - } - return err; + return 0; } #endif /* CONFIG_HT_IRQ */ @@ -3563,7 +3574,8 @@ static int __init io_apic_get_unique_id(int ioapic, int apic_id) /* Sanity check */ if (reg_00.bits.ID != apic_id) { - printk("IOAPIC[%d]: Unable to change apic_id!\n", ioapic); + pr_err("IOAPIC[%d]: Unable to change apic_id!\n", + ioapic); return -1; } } diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c index f00a68cca37a..d661ee95cabf 100644 --- a/arch/x86/kernel/apic/numaq_32.c +++ b/arch/x86/kernel/apic/numaq_32.c @@ -406,16 +406,13 @@ static inline int numaq_check_phys_apicid_present(int phys_apicid) * We use physical apicids here, not logical, so just return the default * physical broadcast to stop people from breaking us */ -static unsigned int numaq_cpu_mask_to_apicid(const struct cpumask *cpumask) -{ - return 0x0F; -} - -static inline unsigned int +static int numaq_cpu_mask_to_apicid_and(const struct cpumask *cpumask, - const struct cpumask *andmask) + const struct cpumask *andmask, + unsigned int *apicid) { - return 0x0F; + *apicid = 0x0F; + return 0; } /* No NUMA-Q box has a HT CPU, but it can't hurt to use the default code. */ @@ -441,20 +438,6 @@ static int probe_numaq(void) return found_numaq; } -static void numaq_vector_allocation_domain(int cpu, struct cpumask *retmask) -{ - /* Careful. Some cpus do not strictly honor the set of cpus - * specified in the interrupt destination when using lowest - * priority interrupt delivery mode. - * - * In particular there was a hyperthreading cpu observed to - * deliver interrupts to the wrong hyperthread when only one - * hyperthread was specified in the interrupt desitination. - */ - cpumask_clear(retmask); - cpumask_bits(retmask)[0] = APIC_ALL_CPUS; -} - static void numaq_setup_portio_remap(void) { int num_quads = num_online_nodes(); @@ -491,7 +474,7 @@ static struct apic __refdata apic_numaq = { .check_apicid_used = numaq_check_apicid_used, .check_apicid_present = numaq_check_apicid_present, - .vector_allocation_domain = numaq_vector_allocation_domain, + .vector_allocation_domain = flat_vector_allocation_domain, .init_apic_ldr = numaq_init_apic_ldr, .ioapic_phys_id_map = numaq_ioapic_phys_id_map, @@ -509,7 +492,6 @@ static struct apic __refdata apic_numaq = { .set_apic_id = NULL, .apic_id_mask = 0x0F << 24, - .cpu_mask_to_apicid = numaq_cpu_mask_to_apicid, .cpu_mask_to_apicid_and = numaq_cpu_mask_to_apicid_and, .send_IPI_mask = numaq_send_IPI_mask, diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index 1b291da09e60..eb35ef9ee63f 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -66,21 +66,6 @@ static void setup_apic_flat_routing(void) #endif } -static void default_vector_allocation_domain(int cpu, struct cpumask *retmask) -{ - /* - * Careful. Some cpus do not strictly honor the set of cpus - * specified in the interrupt destination when using lowest - * priority interrupt delivery mode. 
- * - * In particular there was a hyperthreading cpu observed to - * deliver interrupts to the wrong hyperthread when only one - * hyperthread was specified in the interrupt desitination. - */ - cpumask_clear(retmask); - cpumask_bits(retmask)[0] = APIC_ALL_CPUS; -} - /* should be called last. */ static int probe_default(void) { @@ -105,7 +90,7 @@ static struct apic apic_default = { .check_apicid_used = default_check_apicid_used, .check_apicid_present = default_check_apicid_present, - .vector_allocation_domain = default_vector_allocation_domain, + .vector_allocation_domain = flat_vector_allocation_domain, .init_apic_ldr = default_init_apic_ldr, .ioapic_phys_id_map = default_ioapic_phys_id_map, @@ -123,8 +108,7 @@ static struct apic apic_default = { .set_apic_id = NULL, .apic_id_mask = 0x0F << 24, - .cpu_mask_to_apicid = default_cpu_mask_to_apicid, - .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and, + .cpu_mask_to_apicid_and = flat_cpu_mask_to_apicid_and, .send_IPI_mask = default_send_IPI_mask_logical, .send_IPI_mask_allbutself = default_send_IPI_mask_allbutself_logical, @@ -208,6 +192,9 @@ void __init default_setup_apic_routing(void) if (apic->setup_apic_routing) apic->setup_apic_routing(); + + if (x86_platform.apic_post_init) + x86_platform.apic_post_init(); } void __init generic_apic_probe(void) diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c index 3fe986698929..1793dba7a741 100644 --- a/arch/x86/kernel/apic/probe_64.c +++ b/arch/x86/kernel/apic/probe_64.c @@ -23,11 +23,6 @@ #include <asm/ipi.h> #include <asm/setup.h> -static int apicid_phys_pkg_id(int initial_apic_id, int index_msb) -{ - return hard_smp_processor_id() >> index_msb; -} - /* * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode. */ @@ -48,10 +43,8 @@ void __init default_setup_apic_routing(void) } } - if (is_vsmp_box()) { - /* need to update phys_pkg_id */ - apic->phys_pkg_id = apicid_phys_pkg_id; - } + if (x86_platform.apic_post_init) + x86_platform.apic_post_init(); } /* Same for both flat and physical. */ diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c index 659897c00755..77c95c0e1bf7 100644 --- a/arch/x86/kernel/apic/summit_32.c +++ b/arch/x86/kernel/apic/summit_32.c @@ -26,6 +26,8 @@ * */ +#define pr_fmt(fmt) "summit: %s: " fmt, __func__ + #include <linux/mm.h> #include <linux/init.h> #include <asm/io.h> @@ -235,8 +237,8 @@ static int summit_apic_id_registered(void) static void summit_setup_apic_routing(void) { - printk("Enabling APIC mode: Summit. Using %d I/O APICs\n", - nr_ioapics); + pr_info("Enabling APIC mode: Summit. Using %d I/O APICs\n", + nr_ioapics); } static int summit_cpu_present_to_apicid(int mps_cpu) @@ -263,43 +265,48 @@ static int summit_check_phys_apicid_present(int physical_apicid) return 1; } -static unsigned int summit_cpu_mask_to_apicid(const struct cpumask *cpumask) +static inline int +summit_cpu_mask_to_apicid(const struct cpumask *cpumask, unsigned int *dest_id) { unsigned int round = 0; - int cpu, apicid = 0; + unsigned int cpu, apicid = 0; /* * The cpus in the mask must all be on the apic cluster. 
*/ - for_each_cpu(cpu, cpumask) { + for_each_cpu_and(cpu, cpumask, cpu_online_mask) { int new_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu); if (round && APIC_CLUSTER(apicid) != APIC_CLUSTER(new_apicid)) { - printk("%s: Not a valid mask!\n", __func__); - return BAD_APICID; + pr_err("Not a valid mask!\n"); + return -EINVAL; } apicid |= new_apicid; round++; } - return apicid; + if (!round) + return -EINVAL; + *dest_id = apicid; + return 0; } -static unsigned int summit_cpu_mask_to_apicid_and(const struct cpumask *inmask, - const struct cpumask *andmask) +static int +summit_cpu_mask_to_apicid_and(const struct cpumask *inmask, + const struct cpumask *andmask, + unsigned int *apicid) { - int apicid = early_per_cpu(x86_cpu_to_logical_apicid, 0); cpumask_var_t cpumask; + *apicid = early_per_cpu(x86_cpu_to_logical_apicid, 0); if (!alloc_cpumask_var(&cpumask, GFP_ATOMIC)) - return apicid; + return 0; cpumask_and(cpumask, inmask, andmask); - cpumask_and(cpumask, cpumask, cpu_online_mask); - apicid = summit_cpu_mask_to_apicid(cpumask); + summit_cpu_mask_to_apicid(cpumask, apicid); free_cpumask_var(cpumask); - return apicid; + return 0; } /* @@ -320,20 +327,6 @@ static int probe_summit(void) return 0; } -static void summit_vector_allocation_domain(int cpu, struct cpumask *retmask) -{ - /* Careful. Some cpus do not strictly honor the set of cpus - * specified in the interrupt destination when using lowest - * priority interrupt delivery mode. - * - * In particular there was a hyperthreading cpu observed to - * deliver interrupts to the wrong hyperthread when only one - * hyperthread was specified in the interrupt desitination. - */ - cpumask_clear(retmask); - cpumask_bits(retmask)[0] = APIC_ALL_CPUS; -} - #ifdef CONFIG_X86_SUMMIT_NUMA static struct rio_table_hdr *rio_table_hdr; static struct scal_detail *scal_devs[MAX_NUMNODES]; @@ -355,7 +348,7 @@ static int setup_pci_node_map_for_wpeg(int wpeg_num, int last_bus) } } if (i == rio_table_hdr->num_rio_dev) { - printk(KERN_ERR "%s: Couldn't find owner Cyclone for Winnipeg!\n", __func__); + pr_err("Couldn't find owner Cyclone for Winnipeg!\n"); return last_bus; } @@ -366,7 +359,7 @@ static int setup_pci_node_map_for_wpeg(int wpeg_num, int last_bus) } } if (i == rio_table_hdr->num_scal_dev) { - printk(KERN_ERR "%s: Couldn't find owner Twister for Cyclone!\n", __func__); + pr_err("Couldn't find owner Twister for Cyclone!\n"); return last_bus; } @@ -396,7 +389,7 @@ static int setup_pci_node_map_for_wpeg(int wpeg_num, int last_bus) num_buses = 9; break; default: - printk(KERN_INFO "%s: Unsupported Winnipeg type!\n", __func__); + pr_info("Unsupported Winnipeg type!\n"); return last_bus; } @@ -411,13 +404,15 @@ static int build_detail_arrays(void) int i, scal_detail_size, rio_detail_size; if (rio_table_hdr->num_scal_dev > MAX_NUMNODES) { - printk(KERN_WARNING "%s: MAX_NUMNODES too low! Defined as %d, but system has %d nodes.\n", __func__, MAX_NUMNODES, rio_table_hdr->num_scal_dev); + pr_warn("MAX_NUMNODES too low! 
Defined as %d, but system has %d nodes\n", + MAX_NUMNODES, rio_table_hdr->num_scal_dev); return 0; } switch (rio_table_hdr->version) { default: - printk(KERN_WARNING "%s: Invalid Rio Grande Table Version: %d\n", __func__, rio_table_hdr->version); + pr_warn("Invalid Rio Grande Table Version: %d\n", + rio_table_hdr->version); return 0; case 2: scal_detail_size = 11; @@ -462,7 +457,7 @@ void setup_summit(void) offset = *((unsigned short *)(ptr + offset)); } if (!rio_table_hdr) { - printk(KERN_ERR "%s: Unable to locate Rio Grande Table in EBDA - bailing!\n", __func__); + pr_err("Unable to locate Rio Grande Table in EBDA - bailing!\n"); return; } @@ -509,7 +504,7 @@ static struct apic apic_summit = { .check_apicid_used = summit_check_apicid_used, .check_apicid_present = summit_check_apicid_present, - .vector_allocation_domain = summit_vector_allocation_domain, + .vector_allocation_domain = flat_vector_allocation_domain, .init_apic_ldr = summit_init_apic_ldr, .ioapic_phys_id_map = summit_ioapic_phys_id_map, @@ -527,7 +522,6 @@ static struct apic apic_summit = { .set_apic_id = NULL, .apic_id_mask = 0xFF << 24, - .cpu_mask_to_apicid = summit_cpu_mask_to_apicid, .cpu_mask_to_apicid_and = summit_cpu_mask_to_apicid_and, .send_IPI_mask = summit_send_IPI_mask, diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index ff35cff0e1a7..c88baa4ff0e5 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -81,7 +81,7 @@ static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector) } static void - x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, int vector) +x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, int vector) { __x2apic_send_IPI_mask(mask, vector, APIC_DEST_ALLBUT); } @@ -96,36 +96,37 @@ static void x2apic_send_IPI_all(int vector) __x2apic_send_IPI_mask(cpu_online_mask, vector, APIC_DEST_ALLINC); } -static unsigned int x2apic_cpu_mask_to_apicid(const struct cpumask *cpumask) +static int +x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask, + const struct cpumask *andmask, + unsigned int *apicid) { - /* - * We're using fixed IRQ delivery, can only return one logical APIC ID. - * May as well be the first. - */ - int cpu = cpumask_first(cpumask); + u32 dest = 0; + u16 cluster; + int i; - if ((unsigned)cpu < nr_cpu_ids) - return per_cpu(x86_cpu_to_logical_apicid, cpu); - else - return BAD_APICID; -} + for_each_cpu_and(i, cpumask, andmask) { + if (!cpumask_test_cpu(i, cpu_online_mask)) + continue; + dest = per_cpu(x86_cpu_to_logical_apicid, i); + cluster = x2apic_cluster(i); + break; + } -static unsigned int -x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask, - const struct cpumask *andmask) -{ - int cpu; + if (!dest) + return -EINVAL; - /* - * We're using fixed IRQ delivery, can only return one logical APIC ID. - * May as well be the first. - */ - for_each_cpu_and(cpu, cpumask, andmask) { - if (cpumask_test_cpu(cpu, cpu_online_mask)) - break; + for_each_cpu_and(i, cpumask, andmask) { + if (!cpumask_test_cpu(i, cpu_online_mask)) + continue; + if (cluster != x2apic_cluster(i)) + continue; + dest |= per_cpu(x86_cpu_to_logical_apicid, i); } - return per_cpu(x86_cpu_to_logical_apicid, cpu); + *apicid = dest; + + return 0; } static void init_x2apic_ldr(void) @@ -208,6 +209,32 @@ static int x2apic_cluster_probe(void) return 0; } +static const struct cpumask *x2apic_cluster_target_cpus(void) +{ + return cpu_all_mask; +} + +/* + * Each x2apic cluster is an allocation domain. 
+ */ +static void cluster_vector_allocation_domain(int cpu, struct cpumask *retmask, + const struct cpumask *mask) +{ + /* + * To minimize vector pressure, default case of boot, device bringup + * etc will use a single cpu for the interrupt destination. + * + * On explicit migration requests coming from irqbalance etc, + * interrupts will be routed to the x2apic cluster (cluster-id + * derived from the first cpu in the mask) members specified + * in the mask. + */ + if (mask == x2apic_cluster_target_cpus()) + cpumask_copy(retmask, cpumask_of(cpu)); + else + cpumask_and(retmask, mask, per_cpu(cpus_in_cluster, cpu)); +} + static struct apic apic_x2apic_cluster = { .name = "cluster x2apic", @@ -219,13 +246,13 @@ static struct apic apic_x2apic_cluster = { .irq_delivery_mode = dest_LowestPrio, .irq_dest_mode = 1, /* logical */ - .target_cpus = x2apic_target_cpus, + .target_cpus = x2apic_cluster_target_cpus, .disable_esr = 0, .dest_logical = APIC_DEST_LOGICAL, .check_apicid_used = NULL, .check_apicid_present = NULL, - .vector_allocation_domain = x2apic_vector_allocation_domain, + .vector_allocation_domain = cluster_vector_allocation_domain, .init_apic_ldr = init_x2apic_ldr, .ioapic_phys_id_map = NULL, @@ -243,7 +270,6 @@ static struct apic apic_x2apic_cluster = { .set_apic_id = x2apic_set_apic_id, .apic_id_mask = 0xFFFFFFFFu, - .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid, .cpu_mask_to_apicid_and = x2apic_cpu_mask_to_apicid_and, .send_IPI_mask = x2apic_send_IPI_mask, diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c index c17e982db275..e03a1e180e81 100644 --- a/arch/x86/kernel/apic/x2apic_phys.c +++ b/arch/x86/kernel/apic/x2apic_phys.c @@ -76,38 +76,6 @@ static void x2apic_send_IPI_all(int vector) __x2apic_send_IPI_mask(cpu_online_mask, vector, APIC_DEST_ALLINC); } -static unsigned int x2apic_cpu_mask_to_apicid(const struct cpumask *cpumask) -{ - /* - * We're using fixed IRQ delivery, can only return one phys APIC ID. - * May as well be the first. - */ - int cpu = cpumask_first(cpumask); - - if ((unsigned)cpu < nr_cpu_ids) - return per_cpu(x86_cpu_to_apicid, cpu); - else - return BAD_APICID; -} - -static unsigned int -x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask, - const struct cpumask *andmask) -{ - int cpu; - - /* - * We're using fixed IRQ delivery, can only return one phys APIC ID. - * May as well be the first. 
- */ - for_each_cpu_and(cpu, cpumask, andmask) { - if (cpumask_test_cpu(cpu, cpu_online_mask)) - break; - } - - return per_cpu(x86_cpu_to_apicid, cpu); -} - static void init_x2apic_ldr(void) { } @@ -131,13 +99,13 @@ static struct apic apic_x2apic_phys = { .irq_delivery_mode = dest_Fixed, .irq_dest_mode = 0, /* physical */ - .target_cpus = x2apic_target_cpus, + .target_cpus = online_target_cpus, .disable_esr = 0, .dest_logical = 0, .check_apicid_used = NULL, .check_apicid_present = NULL, - .vector_allocation_domain = x2apic_vector_allocation_domain, + .vector_allocation_domain = default_vector_allocation_domain, .init_apic_ldr = init_x2apic_ldr, .ioapic_phys_id_map = NULL, @@ -155,8 +123,7 @@ static struct apic apic_x2apic_phys = { .set_apic_id = x2apic_set_apic_id, .apic_id_mask = 0xFFFFFFFFu, - .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid, - .cpu_mask_to_apicid_and = x2apic_cpu_mask_to_apicid_and, + .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and, .send_IPI_mask = x2apic_send_IPI_mask, .send_IPI_mask_allbutself = x2apic_send_IPI_mask_allbutself, diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index c6d03f7a4401..8cfade9510a4 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -185,17 +185,6 @@ EXPORT_SYMBOL_GPL(uv_possible_blades); unsigned long sn_rtc_cycles_per_second; EXPORT_SYMBOL(sn_rtc_cycles_per_second); -static const struct cpumask *uv_target_cpus(void) -{ - return cpu_online_mask; -} - -static void uv_vector_allocation_domain(int cpu, struct cpumask *retmask) -{ - cpumask_clear(retmask); - cpumask_set_cpu(cpu, retmask); -} - static int __cpuinit uv_wakeup_secondary(int phys_apicid, unsigned long start_rip) { #ifdef CONFIG_SMP @@ -280,25 +269,12 @@ static void uv_init_apic_ldr(void) { } -static unsigned int uv_cpu_mask_to_apicid(const struct cpumask *cpumask) -{ - /* - * We're using fixed IRQ delivery, can only return one phys APIC ID. - * May as well be the first. - */ - int cpu = cpumask_first(cpumask); - - if ((unsigned)cpu < nr_cpu_ids) - return per_cpu(x86_cpu_to_apicid, cpu) | uv_apicid_hibits; - else - return BAD_APICID; -} - -static unsigned int +static int uv_cpu_mask_to_apicid_and(const struct cpumask *cpumask, - const struct cpumask *andmask) + const struct cpumask *andmask, + unsigned int *apicid) { - int cpu; + int unsigned cpu; /* * We're using fixed IRQ delivery, can only return one phys APIC ID. 
@@ -308,7 +284,13 @@ uv_cpu_mask_to_apicid_and(const struct cpumask *cpumask, if (cpumask_test_cpu(cpu, cpu_online_mask)) break; } - return per_cpu(x86_cpu_to_apicid, cpu) | uv_apicid_hibits; + + if (likely(cpu < nr_cpu_ids)) { + *apicid = per_cpu(x86_cpu_to_apicid, cpu) | uv_apicid_hibits; + return 0; + } + + return -EINVAL; } static unsigned int x2apic_get_apic_id(unsigned long x) @@ -362,13 +344,13 @@ static struct apic __refdata apic_x2apic_uv_x = { .irq_delivery_mode = dest_Fixed, .irq_dest_mode = 0, /* physical */ - .target_cpus = uv_target_cpus, + .target_cpus = online_target_cpus, .disable_esr = 0, .dest_logical = APIC_DEST_LOGICAL, .check_apicid_used = NULL, .check_apicid_present = NULL, - .vector_allocation_domain = uv_vector_allocation_domain, + .vector_allocation_domain = default_vector_allocation_domain, .init_apic_ldr = uv_init_apic_ldr, .ioapic_phys_id_map = NULL, @@ -386,7 +368,6 @@ static struct apic __refdata apic_x2apic_uv_x = { .set_apic_id = set_apic_id, .apic_id_mask = 0xFFFFFFFFu, - .cpu_mask_to_apicid = uv_cpu_mask_to_apicid, .cpu_mask_to_apicid_and = uv_cpu_mask_to_apicid_and, .send_IPI_mask = uv_send_IPI_mask, diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 07b0c0db466c..d65464e43503 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -201,6 +201,8 @@ * http://www.microsoft.com/whdc/archive/amp_12.mspx] */ +#define pr_fmt(fmt) "apm: " fmt + #include <linux/module.h> #include <linux/poll.h> @@ -485,11 +487,11 @@ static void apm_error(char *str, int err) if (error_table[i].key == err) break; if (i < ERROR_COUNT) - printk(KERN_NOTICE "apm: %s: %s\n", str, error_table[i].msg); + pr_notice("%s: %s\n", str, error_table[i].msg); else if (err < 0) - printk(KERN_NOTICE "apm: %s: linux error code %i\n", str, err); + pr_notice("%s: linux error code %i\n", str, err); else - printk(KERN_NOTICE "apm: %s: unknown error code %#2.2x\n", + pr_notice("%s: unknown error code %#2.2x\n", str, err); } @@ -1184,7 +1186,7 @@ static void queue_event(apm_event_t event, struct apm_user *sender) static int notified; if (notified++ == 0) - printk(KERN_ERR "apm: an event queue overflowed\n"); + pr_err("an event queue overflowed\n"); if (++as->event_tail >= APM_MAX_EVENTS) as->event_tail = 0; } @@ -1447,7 +1449,7 @@ static void apm_mainloop(void) static int check_apm_user(struct apm_user *as, const char *func) { if (as == NULL || as->magic != APM_BIOS_MAGIC) { - printk(KERN_ERR "apm: %s passed bad filp\n", func); + pr_err("%s passed bad filp\n", func); return 1; } return 0; @@ -1586,7 +1588,7 @@ static int do_release(struct inode *inode, struct file *filp) as1 = as1->next) ; if (as1 == NULL) - printk(KERN_ERR "apm: filp not in user list\n"); + pr_err("filp not in user list\n"); else as1->next = as->next; } @@ -1600,11 +1602,9 @@ static int do_open(struct inode *inode, struct file *filp) struct apm_user *as; as = kmalloc(sizeof(*as), GFP_KERNEL); - if (as == NULL) { - printk(KERN_ERR "apm: cannot allocate struct of size %d bytes\n", - sizeof(*as)); + if (as == NULL) return -ENOMEM; - } + as->magic = APM_BIOS_MAGIC; as->event_tail = as->event_head = 0; as->suspends_pending = as->standbys_pending = 0; @@ -2313,16 +2313,16 @@ static int __init apm_init(void) } if (apm_info.disabled) { - printk(KERN_NOTICE "apm: disabled on user request.\n"); + pr_notice("disabled on user request.\n"); return -ENODEV; } if ((num_online_cpus() > 1) && !power_off && !smp) { - printk(KERN_NOTICE "apm: disabled - APM is not SMP safe.\n"); + pr_notice("disabled - APM is not SMP 
safe.\n"); apm_info.disabled = 1; return -ENODEV; } if (!acpi_disabled) { - printk(KERN_NOTICE "apm: overridden by ACPI.\n"); + pr_notice("overridden by ACPI.\n"); apm_info.disabled = 1; return -ENODEV; } @@ -2356,8 +2356,7 @@ static int __init apm_init(void) kapmd_task = kthread_create(apm, NULL, "kapmd"); if (IS_ERR(kapmd_task)) { - printk(KERN_ERR "apm: disabled - Unable to start kernel " - "thread.\n"); + pr_err("disabled - Unable to start kernel thread\n"); err = PTR_ERR(kapmd_task); kapmd_task = NULL; remove_proc_entry("apm", NULL); diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 6ab6aa2fdfdd..bac4c3804cc7 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -32,7 +32,9 @@ obj-$(CONFIG_PERF_EVENTS) += perf_event.o ifdef CONFIG_PERF_EVENTS obj-$(CONFIG_CPU_SUP_AMD) += perf_event_amd.o -obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_p6.o perf_event_p4.o perf_event_intel_lbr.o perf_event_intel_ds.o perf_event_intel.o +obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_p6.o perf_event_p4.o +obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_lbr.o perf_event_intel_ds.o perf_event_intel.o +obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_uncore.o endif obj-$(CONFIG_X86_MCE) += mcheck/ diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 146bb6218eec..9d92e19039f0 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -19,6 +19,39 @@ #include "cpu.h" +static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p) +{ + struct cpuinfo_x86 *c = &cpu_data(smp_processor_id()); + u32 gprs[8] = { 0 }; + int err; + + WARN_ONCE((c->x86 != 0xf), "%s should only be used on K8!\n", __func__); + + gprs[1] = msr; + gprs[7] = 0x9c5a203a; + + err = rdmsr_safe_regs(gprs); + + *p = gprs[0] | ((u64)gprs[2] << 32); + + return err; +} + +static inline int wrmsrl_amd_safe(unsigned msr, unsigned long long val) +{ + struct cpuinfo_x86 *c = &cpu_data(smp_processor_id()); + u32 gprs[8] = { 0 }; + + WARN_ONCE((c->x86 != 0xf), "%s should only be used on K8!\n", __func__); + + gprs[0] = (u32)val; + gprs[1] = msr; + gprs[2] = val >> 32; + gprs[7] = 0x9c5a203a; + + return wrmsr_safe_regs(gprs); +} + #ifdef CONFIG_X86_32 /* * B step AMD K6 before B 9730xxxx have hardware bugs that can cause @@ -586,9 +619,9 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) !cpu_has(c, X86_FEATURE_TOPOEXT)) { u64 val; - if (!rdmsrl_amd_safe(0xc0011005, &val)) { + if (!rdmsrl_safe(0xc0011005, &val)) { val |= 1ULL << 54; - wrmsrl_amd_safe(0xc0011005, val); + wrmsrl_safe(0xc0011005, val); rdmsrl(0xc0011005, val); if (val & (1ULL << 54)) { set_cpu_cap(c, X86_FEATURE_TOPOEXT); @@ -679,7 +712,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) err = rdmsrl_safe(MSR_AMD64_MCx_MASK(4), &mask); if (err == 0) { mask |= (1 << 10); - checking_wrmsrl(MSR_AMD64_MCx_MASK(4), mask); + wrmsrl_safe(MSR_AMD64_MCx_MASK(4), mask); } } diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 46674fbb62ba..c97bb7b5a9f8 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -55,8 +55,8 @@ static void __init check_fpu(void) if (!boot_cpu_data.hard_math) { #ifndef CONFIG_MATH_EMULATION - printk(KERN_EMERG "No coprocessor found and no math emulation present.\n"); - printk(KERN_EMERG "Giving up.\n"); + pr_emerg("No coprocessor found and no math emulation present\n"); + pr_emerg("Giving up\n"); for (;;) ; #endif return; @@ -86,7 +86,7 @@ static void __init check_fpu(void) boot_cpu_data.fdiv_bug = fdiv_bug; if 
(boot_cpu_data.fdiv_bug) - printk(KERN_WARNING "Hmm, FPU with FDIV bug.\n"); + pr_warn("Hmm, FPU with FDIV bug\n"); } static void __init check_hlt(void) @@ -94,16 +94,16 @@ static void __init check_hlt(void) if (boot_cpu_data.x86 >= 5 || paravirt_enabled()) return; - printk(KERN_INFO "Checking 'hlt' instruction... "); + pr_info("Checking 'hlt' instruction... "); if (!boot_cpu_data.hlt_works_ok) { - printk("disabled\n"); + pr_cont("disabled\n"); return; } halt(); halt(); halt(); halt(); - printk(KERN_CONT "OK.\n"); + pr_cont("OK\n"); } /* @@ -116,7 +116,7 @@ static void __init check_popad(void) #ifndef CONFIG_X86_POPAD_OK int res, inp = (int) &res; - printk(KERN_INFO "Checking for popad bug... "); + pr_info("Checking for popad bug... "); __asm__ __volatile__( "movl $12345678,%%eax; movl $0,%%edi; pusha; popa; movl (%%edx,%%edi),%%ecx " : "=&a" (res) @@ -127,9 +127,9 @@ static void __init check_popad(void) * CPU hard. Too bad. */ if (res != 12345678) - printk(KERN_CONT "Buggy.\n"); + pr_cont("Buggy\n"); else - printk(KERN_CONT "OK.\n"); + pr_cont("OK\n"); #endif } @@ -161,7 +161,7 @@ void __init check_bugs(void) { identify_boot_cpu(); #ifndef CONFIG_SMP - printk(KERN_INFO "CPU: "); + pr_info("CPU: "); print_cpu_info(&boot_cpu_data); #endif check_config(); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 6b9333b429ba..5bbc082c47ad 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -947,7 +947,7 @@ static void __cpuinit __print_cpu_msr(void) index_max = msr_range_array[i].max; for (index = index_min; index < index_max; index++) { - if (rdmsrl_amd_safe(index, &val)) + if (rdmsrl_safe(index, &val)) continue; printk(KERN_INFO " MSR%08x: %016llx\n", index, val); } diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c index 755f64fb0743..a8f8fa9769d6 100644 --- a/arch/x86/kernel/cpu/hypervisor.c +++ b/arch/x86/kernel/cpu/hypervisor.c @@ -37,6 +37,9 @@ static const __initconst struct hypervisor_x86 * const hypervisors[] = #endif &x86_hyper_vmware, &x86_hyper_ms_hyperv, +#ifdef CONFIG_KVM_GUEST + &x86_hyper_kvm, +#endif }; const struct hypervisor_x86 *x86_hyper; diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index da27c5d2168a..5e095f873e3e 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -7,6 +7,9 @@ * Copyright 2008 Intel Corporation * Author: Andi Kleen */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/thread_info.h> #include <linux/capability.h> #include <linux/miscdevice.h> @@ -57,8 +60,6 @@ static DEFINE_MUTEX(mce_chrdev_read_mutex); int mce_disabled __read_mostly; -#define MISC_MCELOG_MINOR 227 - #define SPINUNIT 100 /* 100ns */ atomic_t mce_entry; @@ -210,7 +211,7 @@ static void drain_mcelog_buffer(void) cpu_relax(); if (!m->finished && retries >= 4) { - pr_err("MCE: skipping error being logged currently!\n"); + pr_err("skipping error being logged currently!\n"); break; } } @@ -1167,8 +1168,9 @@ int memory_failure(unsigned long pfn, int vector, int flags) { /* mce_severity() should not hand us an ACTION_REQUIRED error */ BUG_ON(flags & MF_ACTION_REQUIRED); - printk(KERN_ERR "Uncorrected memory error in page 0x%lx ignored\n" - "Rebuild kernel with CONFIG_MEMORY_FAILURE=y for smarter handling\n", pfn); + pr_err("Uncorrected memory error in page 0x%lx ignored\n" + "Rebuild kernel with CONFIG_MEMORY_FAILURE=y for smarter handling\n", + pfn); return 0; } @@ -1186,6 +1188,7 @@ void mce_notify_process(void) { unsigned long 
pfn; struct mce_info *mi = mce_find_info(); + int flags = MF_ACTION_REQUIRED; if (!mi) mce_panic("Lost physical address for unconsumed uncorrectable error", NULL, NULL); @@ -1200,8 +1203,9 @@ void mce_notify_process(void) * doomed. We still need to mark the page as poisoned and alert any * other users of the page. */ - if (memory_failure(pfn, MCE_VECTOR, MF_ACTION_REQUIRED) < 0 || - mi->restartable == 0) { + if (!mi->restartable) + flags |= MF_MUST_KILL; + if (memory_failure(pfn, MCE_VECTOR, flags) < 0) { pr_err("Memory error not recovered"); force_sig(SIGBUS, current); } @@ -1358,11 +1362,10 @@ static int __cpuinit __mcheck_cpu_cap_init(void) b = cap & MCG_BANKCNT_MASK; if (!banks) - printk(KERN_INFO "mce: CPU supports %d MCE banks\n", b); + pr_info("CPU supports %d MCE banks\n", b); if (b > MAX_NR_BANKS) { - printk(KERN_WARNING - "MCE: Using only %u machine check banks out of %u\n", + pr_warn("Using only %u machine check banks out of %u\n", MAX_NR_BANKS, b); b = MAX_NR_BANKS; } @@ -1419,7 +1422,7 @@ static void __mcheck_cpu_init_generic(void) static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) { if (c->x86_vendor == X86_VENDOR_UNKNOWN) { - pr_info("MCE: unknown CPU type - not enabling MCE support.\n"); + pr_info("unknown CPU type - not enabling MCE support\n"); return -EOPNOTSUPP; } @@ -1574,7 +1577,7 @@ static void __mcheck_cpu_init_timer(void) /* Handle unconfigured int18 (should never happen) */ static void unexpected_machine_check(struct pt_regs *regs, long error_code) { - printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n", + pr_err("CPU#%d: Unexpected int18 (Machine Check)\n", smp_processor_id()); } @@ -1893,8 +1896,7 @@ static int __init mcheck_enable(char *str) get_option(&str, &monarch_timeout); } } else { - printk(KERN_INFO "mce argument %s ignored. Please use /sys\n", - str); + pr_info("mce argument %s ignored. Please use /sys\n", str); return 0; } return 1; @@ -2342,7 +2344,7 @@ static __init int mcheck_init_device(void) return err; } -device_initcall(mcheck_init_device); +device_initcall_sync(mcheck_init_device); /* * Old style boot options parsing. Only for compatibility. diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index f4873a64f46d..c4e916d77378 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -1,15 +1,17 @@ /* - * (c) 2005, 2006 Advanced Micro Devices, Inc. + * (c) 2005-2012 Advanced Micro Devices, Inc. * Your use of this code is subject to the terms and conditions of the * GNU general public license version 2. See "COPYING" or * http://www.gnu.org/licenses/gpl.html * * Written by Jacob Shin - AMD, Inc. 
* - * Support : jacob.shin@amd.com + * Support: borislav.petkov@amd.com * * April 2006 * - added support for AMD Family 0x10 processors + * May 2012 + * - major scrubbing * * All MC4_MISCi registers are shared between multi-cores */ @@ -25,6 +27,7 @@ #include <linux/cpu.h> #include <linux/smp.h> +#include <asm/amd_nb.h> #include <asm/apic.h> #include <asm/idle.h> #include <asm/mce.h> @@ -45,23 +48,15 @@ #define MASK_BLKPTR_LO 0xFF000000 #define MCG_XBLK_ADDR 0xC0000400 -struct threshold_block { - unsigned int block; - unsigned int bank; - unsigned int cpu; - u32 address; - u16 interrupt_enable; - bool interrupt_capable; - u16 threshold_limit; - struct kobject kobj; - struct list_head miscj; +static const char * const th_names[] = { + "load_store", + "insn_fetch", + "combined_unit", + "", + "northbridge", + "execution_unit", }; -struct threshold_bank { - struct kobject *kobj; - struct threshold_block *blocks; - cpumask_var_t cpus; -}; static DEFINE_PER_CPU(struct threshold_bank * [NR_BANKS], threshold_banks); static unsigned char shared_bank[NR_BANKS] = { @@ -84,6 +79,26 @@ struct thresh_restart { u16 old_limit; }; +static const char * const bank4_names(struct threshold_block *b) +{ + switch (b->address) { + /* MSR4_MISC0 */ + case 0x00000413: + return "dram"; + + case 0xc0000408: + return "ht_links"; + + case 0xc0000409: + return "l3_cache"; + + default: + WARN(1, "Funny MSR: 0x%08x\n", b->address); + return ""; + } +}; + + static bool lvt_interrupt_supported(unsigned int bank, u32 msr_high_bits) { /* @@ -224,8 +239,6 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) if (!block) per_cpu(bank_map, cpu) |= (1 << bank); - if (shared_bank[bank] && c->cpu_core_id) - break; memset(&b, 0, sizeof(b)); b.cpu = cpu; @@ -326,7 +339,7 @@ struct threshold_attr { #define SHOW_FIELDS(name) \ static ssize_t show_ ## name(struct threshold_block *b, char *buf) \ { \ - return sprintf(buf, "%lx\n", (unsigned long) b->name); \ + return sprintf(buf, "%lu\n", (unsigned long) b->name); \ } SHOW_FIELDS(interrupt_enable) SHOW_FIELDS(threshold_limit) @@ -377,38 +390,21 @@ store_threshold_limit(struct threshold_block *b, const char *buf, size_t size) return size; } -struct threshold_block_cross_cpu { - struct threshold_block *tb; - long retval; -}; - -static void local_error_count_handler(void *_tbcc) -{ - struct threshold_block_cross_cpu *tbcc = _tbcc; - struct threshold_block *b = tbcc->tb; - u32 low, high; - - rdmsr(b->address, low, high); - tbcc->retval = (high & 0xFFF) - (THRESHOLD_MAX - b->threshold_limit); -} - static ssize_t show_error_count(struct threshold_block *b, char *buf) { - struct threshold_block_cross_cpu tbcc = { .tb = b, }; + u32 lo, hi; - smp_call_function_single(b->cpu, local_error_count_handler, &tbcc, 1); - return sprintf(buf, "%lx\n", tbcc.retval); -} - -static ssize_t store_error_count(struct threshold_block *b, - const char *buf, size_t count) -{ - struct thresh_restart tr = { .b = b, .reset = 1, .old_limit = 0 }; + rdmsr_on_cpu(b->cpu, b->address, &lo, &hi); - smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1); - return 1; + return sprintf(buf, "%u\n", ((hi & THRESHOLD_MAX) - + (THRESHOLD_MAX - b->threshold_limit))); } +static struct threshold_attr error_count = { + .attr = {.name = __stringify(error_count), .mode = 0444 }, + .show = show_error_count, +}; + #define RW_ATTR(val) \ static struct threshold_attr val = { \ .attr = {.name = __stringify(val), .mode = 0644 }, \ @@ -418,7 +414,6 @@ static struct threshold_attr val = { \ RW_ATTR(interrupt_enable); 
RW_ATTR(threshold_limit); -RW_ATTR(error_count); static struct attribute *default_attrs[] = { &threshold_limit.attr, @@ -517,7 +512,7 @@ static __cpuinit int allocate_threshold_blocks(unsigned int cpu, err = kobject_init_and_add(&b->kobj, &threshold_ktype, per_cpu(threshold_banks, cpu)[bank]->kobj, - "misc%i", block); + (bank == 4 ? bank4_names(b) : th_names[bank])); if (err) goto out_free; recurse: @@ -548,98 +543,91 @@ out_free: return err; } -static __cpuinit long -local_allocate_threshold_blocks(int cpu, unsigned int bank) +static __cpuinit int __threshold_add_blocks(struct threshold_bank *b) { - return allocate_threshold_blocks(cpu, bank, 0, - MSR_IA32_MC0_MISC + bank * 4); + struct list_head *head = &b->blocks->miscj; + struct threshold_block *pos = NULL; + struct threshold_block *tmp = NULL; + int err = 0; + + err = kobject_add(&b->blocks->kobj, b->kobj, b->blocks->kobj.name); + if (err) + return err; + + list_for_each_entry_safe(pos, tmp, head, miscj) { + + err = kobject_add(&pos->kobj, b->kobj, pos->kobj.name); + if (err) { + list_for_each_entry_safe_reverse(pos, tmp, head, miscj) + kobject_del(&pos->kobj); + + return err; + } + } + return err; } -/* symlinks sibling shared banks to first core. first core owns dir/files. */ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) { - int i, err = 0; - struct threshold_bank *b = NULL; struct device *dev = per_cpu(mce_device, cpu); - char name[32]; + struct amd_northbridge *nb = NULL; + struct threshold_bank *b = NULL; + const char *name = th_names[bank]; + int err = 0; - sprintf(name, "threshold_bank%i", bank); + if (shared_bank[bank]) { -#ifdef CONFIG_SMP - if (cpu_data(cpu).cpu_core_id && shared_bank[bank]) { /* symlink */ - i = cpumask_first(cpu_llc_shared_mask(cpu)); + nb = node_to_amd_nb(amd_get_nb_id(cpu)); + WARN_ON(!nb); - /* first core not up yet */ - if (cpu_data(i).cpu_core_id) - goto out; + /* threshold descriptor already initialized on this node? 
*/ + if (nb->bank4) { + /* yes, use it */ + b = nb->bank4; + err = kobject_add(b->kobj, &dev->kobj, name); + if (err) + goto out; - /* already linked */ - if (per_cpu(threshold_banks, cpu)[bank]) - goto out; + per_cpu(threshold_banks, cpu)[bank] = b; + atomic_inc(&b->cpus); - b = per_cpu(threshold_banks, i)[bank]; + err = __threshold_add_blocks(b); - if (!b) goto out; - - err = sysfs_create_link(&dev->kobj, b->kobj, name); - if (err) - goto out; - - cpumask_copy(b->cpus, cpu_llc_shared_mask(cpu)); - per_cpu(threshold_banks, cpu)[bank] = b; - - goto out; + } } -#endif b = kzalloc(sizeof(struct threshold_bank), GFP_KERNEL); if (!b) { err = -ENOMEM; goto out; } - if (!zalloc_cpumask_var(&b->cpus, GFP_KERNEL)) { - kfree(b); - err = -ENOMEM; - goto out; - } b->kobj = kobject_create_and_add(name, &dev->kobj); - if (!b->kobj) + if (!b->kobj) { + err = -EINVAL; goto out_free; - -#ifndef CONFIG_SMP - cpumask_setall(b->cpus); -#else - cpumask_set_cpu(cpu, b->cpus); -#endif + } per_cpu(threshold_banks, cpu)[bank] = b; - err = local_allocate_threshold_blocks(cpu, bank); - if (err) - goto out_free; - - for_each_cpu(i, b->cpus) { - if (i == cpu) - continue; - - dev = per_cpu(mce_device, i); - if (dev) - err = sysfs_create_link(&dev->kobj,b->kobj, name); - if (err) - goto out; + if (shared_bank[bank]) { + atomic_set(&b->cpus, 1); - per_cpu(threshold_banks, i)[bank] = b; + /* nb is already initialized, see above */ + WARN_ON(nb->bank4); + nb->bank4 = b; } - goto out; + err = allocate_threshold_blocks(cpu, bank, 0, + MSR_IA32_MC0_MISC + bank * 4); + if (!err) + goto out; -out_free: - per_cpu(threshold_banks, cpu)[bank] = NULL; - free_cpumask_var(b->cpus); + out_free: kfree(b); -out: + + out: return err; } @@ -660,12 +648,6 @@ static __cpuinit int threshold_create_device(unsigned int cpu) return err; } -/* - * let's be hotplug friendly. - * in case of multiple core processors, the first core always takes ownership - * of shared sysfs dir/files, and rest of the cores will be symlinked to it. - */ - static void deallocate_threshold_block(unsigned int cpu, unsigned int bank) { @@ -686,41 +668,42 @@ static void deallocate_threshold_block(unsigned int cpu, per_cpu(threshold_banks, cpu)[bank]->blocks = NULL; } +static void __threshold_remove_blocks(struct threshold_bank *b) +{ + struct threshold_block *pos = NULL; + struct threshold_block *tmp = NULL; + + kobject_del(b->kobj); + + list_for_each_entry_safe(pos, tmp, &b->blocks->miscj, miscj) + kobject_del(&pos->kobj); +} + static void threshold_remove_bank(unsigned int cpu, int bank) { + struct amd_northbridge *nb; struct threshold_bank *b; - struct device *dev; - char name[32]; - int i = 0; b = per_cpu(threshold_banks, cpu)[bank]; if (!b) return; + if (!b->blocks) goto free_out; - sprintf(name, "threshold_bank%i", bank); - -#ifdef CONFIG_SMP - /* sibling symlink */ - if (shared_bank[bank] && b->blocks->cpu != cpu) { - dev = per_cpu(mce_device, cpu); - sysfs_remove_link(&dev->kobj, name); - per_cpu(threshold_banks, cpu)[bank] = NULL; - - return; - } -#endif - - /* remove all sibling symlinks before unregistering */ - for_each_cpu(i, b->cpus) { - if (i == cpu) - continue; - - dev = per_cpu(mce_device, i); - if (dev) - sysfs_remove_link(&dev->kobj, name); - per_cpu(threshold_banks, i)[bank] = NULL; + if (shared_bank[bank]) { + if (!atomic_dec_and_test(&b->cpus)) { + __threshold_remove_blocks(b); + per_cpu(threshold_banks, cpu)[bank] = NULL; + return; + } else { + /* + * the last CPU on this node using the shared bank is + * going away, remove that bank now. 
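+			 * (the block kobjects and the bank itself are then torn
+			 * down by the common deallocate_threshold_block() /
+			 * kobject_put() path further below)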
+ */ + nb = node_to_amd_nb(amd_get_nb_id(cpu)); + nb->bank4 = NULL; + } } deallocate_threshold_block(cpu, bank); @@ -728,7 +711,6 @@ static void threshold_remove_bank(unsigned int cpu, int bank) free_out: kobject_del(b->kobj); kobject_put(b->kobj); - free_cpumask_var(b->cpus); kfree(b); per_cpu(threshold_banks, cpu)[bank] = NULL; } @@ -777,4 +759,24 @@ static __init int threshold_init_device(void) return 0; } -device_initcall(threshold_init_device); +/* + * there are 3 funcs which need to be _initcalled in a logic sequence: + * 1. xen_late_init_mcelog + * 2. mcheck_init_device + * 3. threshold_init_device + * + * xen_late_init_mcelog must register xen_mce_chrdev_device before + * native mce_chrdev_device registration if running under xen platform; + * + * mcheck_init_device should be inited before threshold_init_device to + * initialize mce_device, otherwise a NULL ptr dereference will cause panic. + * + * so we use following _initcalls + * 1. device_initcall(xen_late_init_mcelog); + * 2. device_initcall_sync(mcheck_init_device); + * 3. late_initcall(threshold_init_device); + * + * when running under xen, the initcall order is 1,2,3; + * on baremetal, we skip 1 and we do only 2 and 3. + */ +late_initcall(threshold_init_device); diff --git a/arch/x86/kernel/cpu/mkcapflags.pl b/arch/x86/kernel/cpu/mkcapflags.pl index dfea390e1608..c7b3fe2d72e0 100644 --- a/arch/x86/kernel/cpu/mkcapflags.pl +++ b/arch/x86/kernel/cpu/mkcapflags.pl @@ -1,4 +1,4 @@ -#!/usr/bin/perl +#!/usr/bin/perl -w # # Generate the x86_cap_flags[] array from include/asm-x86/cpufeature.h # @@ -11,22 +11,35 @@ open(OUT, "> $out\0") or die "$0: cannot create: $out: $!\n"; print OUT "#include <asm/cpufeature.h>\n\n"; print OUT "const char * const x86_cap_flags[NCAPINTS*32] = {\n"; +%features = (); +$err = 0; + while (defined($line = <IN>)) { if ($line =~ /^\s*\#\s*define\s+(X86_FEATURE_(\S+))\s+(.*)$/) { $macro = $1; - $feature = $2; + $feature = "\L$2"; $tail = $3; if ($tail =~ /\/\*\s*\"([^"]*)\".*\*\//) { - $feature = $1; + $feature = "\L$1"; } - if ($feature ne '') { - printf OUT "\t%-32s = \"%s\",\n", - "[$macro]", "\L$feature"; + next if ($feature eq ''); + + if ($features{$feature}++) { + print STDERR "$in: duplicate feature name: $feature\n"; + $err++; } + printf OUT "\t%-32s = \"%s\",\n", "[$macro]", $feature; } } print OUT "};\n"; close(IN); close(OUT); + +if ($err) { + unlink($out); + exit(1); +} + +exit(0); diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c index bdda2e6c673b..35ffda5d0727 100644 --- a/arch/x86/kernel/cpu/mtrr/cleanup.c +++ b/arch/x86/kernel/cpu/mtrr/cleanup.c @@ -258,11 +258,11 @@ range_to_mtrr(unsigned int reg, unsigned long range_startk, /* Compute the maximum size with which we can make a range: */ if (range_startk) - max_align = ffs(range_startk) - 1; + max_align = __ffs(range_startk); else - max_align = 32; + max_align = BITS_PER_LONG - 1; - align = fls(range_sizek) - 1; + align = __fls(range_sizek); if (align > max_align) align = max_align; diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 75772ae6c65f..e9fe907cd249 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -361,11 +361,7 @@ static void __init print_mtrr_state(void) } pr_debug("MTRR variable ranges %sabled:\n", mtrr_state.enabled & 2 ? 
"en" : "dis"); - if (size_or_mask & 0xffffffffUL) - high_width = ffs(size_or_mask & 0xffffffffUL) - 1; - else - high_width = ffs(size_or_mask>>32) + 32 - 1; - high_width = (high_width - (32 - PAGE_SHIFT) + 3) / 4; + high_width = (__ffs64(size_or_mask) - (32 - PAGE_SHIFT) + 3) / 4; for (i = 0; i < num_var_ranges; ++i) { if (mtrr_state.var_ranges[i].mask_lo & (1 << 11)) diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index c4706cf9c011..29557aa06dda 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -35,17 +35,6 @@ #include "perf_event.h" -#if 0 -#undef wrmsrl -#define wrmsrl(msr, val) \ -do { \ - trace_printk("wrmsrl(%lx, %lx)\n", (unsigned long)(msr),\ - (unsigned long)(val)); \ - native_write_msr((msr), (u32)((u64)(val)), \ - (u32)((u64)(val) >> 32)); \ -} while (0) -#endif - struct x86_pmu x86_pmu __read_mostly; DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = { @@ -74,7 +63,7 @@ u64 x86_perf_event_update(struct perf_event *event) int idx = hwc->idx; s64 delta; - if (idx == X86_PMC_IDX_FIXED_BTS) + if (idx == INTEL_PMC_IDX_FIXED_BTS) return 0; /* @@ -86,7 +75,7 @@ u64 x86_perf_event_update(struct perf_event *event) */ again: prev_raw_count = local64_read(&hwc->prev_count); - rdmsrl(hwc->event_base, new_raw_count); + rdpmcl(hwc->event_base_rdpmc, new_raw_count); if (local64_cmpxchg(&hwc->prev_count, prev_raw_count, new_raw_count) != prev_raw_count) @@ -189,7 +178,7 @@ static void release_pmc_hardware(void) {} static bool check_hw_exists(void) { - u64 val, val_new = 0; + u64 val, val_new = ~0; int i, reg, ret = 0; /* @@ -222,8 +211,9 @@ static bool check_hw_exists(void) * that don't trap on the MSR access and always return 0s. */ val = 0xabcdUL; - ret = checking_wrmsrl(x86_pmu_event_addr(0), val); - ret |= rdmsrl_safe(x86_pmu_event_addr(0), &val_new); + reg = x86_pmu_event_addr(0); + ret = wrmsrl_safe(reg, val); + ret |= rdmsrl_safe(reg, &val_new); if (ret || val != val_new) goto msr_fail; @@ -240,6 +230,7 @@ bios_fail: msr_fail: printk(KERN_CONT "Broken PMU hardware detected, using software events only.\n"); + printk(KERN_ERR "Failed to access perfctr msr (MSR %x is %Lx)\n", reg, val_new); return false; } @@ -388,7 +379,7 @@ int x86_pmu_hw_config(struct perf_event *event) int precise = 0; /* Support for constant skid */ - if (x86_pmu.pebs_active) { + if (x86_pmu.pebs_active && !x86_pmu.pebs_broken) { precise++; /* Support for IP fixup */ @@ -637,8 +628,8 @@ static bool __perf_sched_find_counter(struct perf_sched *sched) c = sched->constraints[sched->state.event]; /* Prefer fixed purpose counters */ - if (x86_pmu.num_counters_fixed) { - idx = X86_PMC_IDX_FIXED; + if (c->idxmsk64 & (~0ULL << INTEL_PMC_IDX_FIXED)) { + idx = INTEL_PMC_IDX_FIXED; for_each_set_bit_from(idx, c->idxmsk, X86_PMC_IDX_MAX) { if (!__test_and_set_bit(idx, sched->state.used)) goto done; @@ -646,7 +637,7 @@ static bool __perf_sched_find_counter(struct perf_sched *sched) } /* Grab the first unused counter starting with idx */ idx = sched->state.counter; - for_each_set_bit_from(idx, c->idxmsk, X86_PMC_IDX_FIXED) { + for_each_set_bit_from(idx, c->idxmsk, INTEL_PMC_IDX_FIXED) { if (!__test_and_set_bit(idx, sched->state.used)) goto done; } @@ -704,8 +695,8 @@ static bool perf_sched_next_event(struct perf_sched *sched) /* * Assign a counter for each event. 
*/ -static int perf_assign_events(struct event_constraint **constraints, int n, - int wmin, int wmax, int *assign) +int perf_assign_events(struct event_constraint **constraints, int n, + int wmin, int wmax, int *assign) { struct perf_sched sched; @@ -824,15 +815,17 @@ static inline void x86_assign_hw_event(struct perf_event *event, hwc->last_cpu = smp_processor_id(); hwc->last_tag = ++cpuc->tags[i]; - if (hwc->idx == X86_PMC_IDX_FIXED_BTS) { + if (hwc->idx == INTEL_PMC_IDX_FIXED_BTS) { hwc->config_base = 0; hwc->event_base = 0; - } else if (hwc->idx >= X86_PMC_IDX_FIXED) { + } else if (hwc->idx >= INTEL_PMC_IDX_FIXED) { hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL; - hwc->event_base = MSR_ARCH_PERFMON_FIXED_CTR0 + (hwc->idx - X86_PMC_IDX_FIXED); + hwc->event_base = MSR_ARCH_PERFMON_FIXED_CTR0 + (hwc->idx - INTEL_PMC_IDX_FIXED); + hwc->event_base_rdpmc = (hwc->idx - INTEL_PMC_IDX_FIXED) | 1<<30; } else { hwc->config_base = x86_pmu_config_addr(hwc->idx); hwc->event_base = x86_pmu_event_addr(hwc->idx); + hwc->event_base_rdpmc = hwc->idx; } } @@ -930,7 +923,7 @@ int x86_perf_event_set_period(struct perf_event *event) s64 period = hwc->sample_period; int ret = 0, idx = hwc->idx; - if (idx == X86_PMC_IDX_FIXED_BTS) + if (idx == INTEL_PMC_IDX_FIXED_BTS) return 0; /* @@ -1316,7 +1309,6 @@ static struct attribute_group x86_pmu_format_group = { static int __init init_hw_perf_events(void) { struct x86_pmu_quirk *quirk; - struct event_constraint *c; int err; pr_info("Performance Events: "); @@ -1347,21 +1339,8 @@ static int __init init_hw_perf_events(void) for (quirk = x86_pmu.quirks; quirk; quirk = quirk->next) quirk->func(); - if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) { - WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!", - x86_pmu.num_counters, X86_PMC_MAX_GENERIC); - x86_pmu.num_counters = X86_PMC_MAX_GENERIC; - } - x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1; - - if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) { - WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!", - x86_pmu.num_counters_fixed, X86_PMC_MAX_FIXED); - x86_pmu.num_counters_fixed = X86_PMC_MAX_FIXED; - } - - x86_pmu.intel_ctrl |= - ((1LL << x86_pmu.num_counters_fixed)-1) << X86_PMC_IDX_FIXED; + if (!x86_pmu.intel_ctrl) + x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1; perf_events_lapic_init(); register_nmi_handler(NMI_LOCAL, perf_event_nmi_handler, 0, "PMI"); @@ -1370,22 +1349,6 @@ static int __init init_hw_perf_events(void) __EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_counters) - 1, 0, x86_pmu.num_counters, 0); - if (x86_pmu.event_constraints) { - /* - * event on fixed counter2 (REF_CYCLES) only works on this - * counter, so do not extend mask to generic counters - */ - for_each_event_constraint(c, x86_pmu.event_constraints) { - if (c->cmask != X86_RAW_EVENT_MASK - || c->idxmsk64 == X86_PMC_MSK_FIXED_REF_CYCLES) { - continue; - } - - c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1; - c->weight += x86_pmu.num_counters; - } - } - x86_pmu.attr_rdpmc = 1; /* enable userspace RDPMC usage by default */ x86_pmu_format_group.attrs = x86_pmu.format_attrs; @@ -1620,8 +1583,8 @@ static int x86_pmu_event_idx(struct perf_event *event) if (!x86_pmu.attr_rdpmc) return 0; - if (x86_pmu.num_counters_fixed && idx >= X86_PMC_IDX_FIXED) { - idx -= X86_PMC_IDX_FIXED; + if (x86_pmu.num_counters_fixed && idx >= INTEL_PMC_IDX_FIXED) { + idx -= INTEL_PMC_IDX_FIXED; idx |= 1 << 30; } @@ -1649,7 +1612,12 @@ static ssize_t set_attr_rdpmc(struct device *cdev, struct device_attribute *attr, const char *buf, 
size_t count) { - unsigned long val = simple_strtoul(buf, NULL, 0); + unsigned long val; + ssize_t ret; + + ret = kstrtoul(buf, 0, &val); + if (ret) + return ret; if (!!val != !!x86_pmu.attr_rdpmc) { x86_pmu.attr_rdpmc = !!val; @@ -1682,13 +1650,20 @@ static void x86_pmu_flush_branch_stack(void) x86_pmu.flush_branch_stack(); } +void perf_check_microcode(void) +{ + if (x86_pmu.check_microcode) + x86_pmu.check_microcode(); +} +EXPORT_SYMBOL_GPL(perf_check_microcode); + static struct pmu pmu = { .pmu_enable = x86_pmu_enable, .pmu_disable = x86_pmu_disable, - .attr_groups = x86_pmu_attr_groups, + .attr_groups = x86_pmu_attr_groups, - .event_init = x86_pmu_event_init, + .event_init = x86_pmu_event_init, .add = x86_pmu_add, .del = x86_pmu_del, @@ -1696,11 +1671,11 @@ static struct pmu pmu = { .stop = x86_pmu_stop, .read = x86_pmu_read, - .start_txn = x86_pmu_start_txn, - .cancel_txn = x86_pmu_cancel_txn, - .commit_txn = x86_pmu_commit_txn, + .start_txn = x86_pmu_start_txn, + .cancel_txn = x86_pmu_cancel_txn, + .commit_txn = x86_pmu_commit_txn, - .event_idx = x86_pmu_event_idx, + .event_idx = x86_pmu_event_idx, .flush_branch_stack = x86_pmu_flush_branch_stack, }; @@ -1863,7 +1838,7 @@ unsigned long perf_misc_flags(struct pt_regs *regs) else misc |= PERF_RECORD_MISC_GUEST_KERNEL; } else { - if (user_mode(regs)) + if (!kernel_ip(regs->ip)) misc |= PERF_RECORD_MISC_USER; else misc |= PERF_RECORD_MISC_KERNEL; diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index 7241e2fc3c17..a15df4be151f 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -14,6 +14,18 @@ #include <linux/perf_event.h> +#if 0 +#undef wrmsrl +#define wrmsrl(msr, val) \ +do { \ + unsigned int _msr = (msr); \ + u64 _val = (val); \ + trace_printk("wrmsrl(%x, %Lx)\n", (unsigned int)(_msr), \ + (unsigned long long)(_val)); \ + native_write_msr((_msr), (u32)(_val), (u32)(_val >> 32)); \ +} while (0) +#endif + /* * | NHM/WSM | SNB | * register ------------------------------- @@ -57,7 +69,7 @@ struct amd_nb { }; /* The maximal number of PEBS events: */ -#define MAX_PEBS_EVENTS 4 +#define MAX_PEBS_EVENTS 8 /* * A debug store configuration. 
@@ -349,6 +361,8 @@ struct x86_pmu { void (*cpu_starting)(int cpu); void (*cpu_dying)(int cpu); void (*cpu_dead)(int cpu); + + void (*check_microcode)(void); void (*flush_branch_stack)(void); /* @@ -360,12 +374,16 @@ struct x86_pmu { /* * Intel DebugStore bits */ - int bts, pebs; - int bts_active, pebs_active; + int bts :1, + bts_active :1, + pebs :1, + pebs_active :1, + pebs_broken :1; int pebs_record_size; void (*drain_pebs)(struct pt_regs *regs); struct event_constraint *pebs_constraints; void (*pebs_aliases)(struct perf_event *event); + int max_pebs_events; /* * Intel LBR @@ -468,6 +486,8 @@ static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc, void x86_pmu_enable_all(int added); +int perf_assign_events(struct event_constraint **constraints, int n, + int wmin, int wmax, int *assign); int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign); void x86_pmu_stop(struct perf_event *event, int flags); diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c index 11a4eb9131d5..4528ae7b6ec4 100644 --- a/arch/x86/kernel/cpu/perf_event_amd.c +++ b/arch/x86/kernel/cpu/perf_event_amd.c @@ -366,7 +366,7 @@ static void amd_pmu_cpu_starting(int cpu) cpuc->perf_ctr_virt_mask = AMD_PERFMON_EVENTSEL_HOSTONLY; - if (boot_cpu_data.x86_max_cores < 2 || boot_cpu_data.x86 == 0x15) + if (boot_cpu_data.x86_max_cores < 2) return; nb_id = amd_get_nb_id(cpu); @@ -422,35 +422,6 @@ static struct attribute *amd_format_attr[] = { NULL, }; -static __initconst const struct x86_pmu amd_pmu = { - .name = "AMD", - .handle_irq = x86_pmu_handle_irq, - .disable_all = x86_pmu_disable_all, - .enable_all = x86_pmu_enable_all, - .enable = x86_pmu_enable_event, - .disable = x86_pmu_disable_event, - .hw_config = amd_pmu_hw_config, - .schedule_events = x86_schedule_events, - .eventsel = MSR_K7_EVNTSEL0, - .perfctr = MSR_K7_PERFCTR0, - .event_map = amd_pmu_event_map, - .max_events = ARRAY_SIZE(amd_perfmon_event_map), - .num_counters = AMD64_NUM_COUNTERS, - .cntval_bits = 48, - .cntval_mask = (1ULL << 48) - 1, - .apic = 1, - /* use highest bit to detect overflow */ - .max_period = (1ULL << 47) - 1, - .get_event_constraints = amd_get_event_constraints, - .put_event_constraints = amd_put_event_constraints, - - .format_attrs = amd_format_attr, - - .cpu_prepare = amd_pmu_cpu_prepare, - .cpu_starting = amd_pmu_cpu_starting, - .cpu_dead = amd_pmu_cpu_dead, -}; - /* AMD Family 15h */ #define AMD_EVENT_TYPE_MASK 0x000000F0ULL @@ -597,8 +568,8 @@ amd_get_event_constraints_f15h(struct cpu_hw_events *cpuc, struct perf_event *ev } } -static __initconst const struct x86_pmu amd_pmu_f15h = { - .name = "AMD Family 15h", +static __initconst const struct x86_pmu amd_pmu = { + .name = "AMD", .handle_irq = x86_pmu_handle_irq, .disable_all = x86_pmu_disable_all, .enable_all = x86_pmu_enable_all, @@ -606,50 +577,68 @@ static __initconst const struct x86_pmu amd_pmu_f15h = { .disable = x86_pmu_disable_event, .hw_config = amd_pmu_hw_config, .schedule_events = x86_schedule_events, - .eventsel = MSR_F15H_PERF_CTL, - .perfctr = MSR_F15H_PERF_CTR, + .eventsel = MSR_K7_EVNTSEL0, + .perfctr = MSR_K7_PERFCTR0, .event_map = amd_pmu_event_map, .max_events = ARRAY_SIZE(amd_perfmon_event_map), - .num_counters = AMD64_NUM_COUNTERS_F15H, + .num_counters = AMD64_NUM_COUNTERS, .cntval_bits = 48, .cntval_mask = (1ULL << 48) - 1, .apic = 1, /* use highest bit to detect overflow */ .max_period = (1ULL << 47) - 1, - .get_event_constraints = amd_get_event_constraints_f15h, - /* nortbridge counters not yet 
implemented: */ -#if 0 + .get_event_constraints = amd_get_event_constraints, .put_event_constraints = amd_put_event_constraints, + .format_attrs = amd_format_attr, + .cpu_prepare = amd_pmu_cpu_prepare, - .cpu_dead = amd_pmu_cpu_dead, -#endif .cpu_starting = amd_pmu_cpu_starting, - .format_attrs = amd_format_attr, + .cpu_dead = amd_pmu_cpu_dead, }; +static int setup_event_constraints(void) +{ + if (boot_cpu_data.x86 >= 0x15) + x86_pmu.get_event_constraints = amd_get_event_constraints_f15h; + return 0; +} + +static int setup_perfctr_core(void) +{ + if (!cpu_has_perfctr_core) { + WARN(x86_pmu.get_event_constraints == amd_get_event_constraints_f15h, + KERN_ERR "Odd, counter constraints enabled but no core perfctrs detected!"); + return -ENODEV; + } + + WARN(x86_pmu.get_event_constraints == amd_get_event_constraints, + KERN_ERR "hw perf events core counters need constraints handler!"); + + /* + * If core performance counter extensions exists, we must use + * MSR_F15H_PERF_CTL/MSR_F15H_PERF_CTR msrs. See also + * x86_pmu_addr_offset(). + */ + x86_pmu.eventsel = MSR_F15H_PERF_CTL; + x86_pmu.perfctr = MSR_F15H_PERF_CTR; + x86_pmu.num_counters = AMD64_NUM_COUNTERS_CORE; + + printk(KERN_INFO "perf: AMD core performance counters detected\n"); + + return 0; +} + __init int amd_pmu_init(void) { /* Performance-monitoring supported from K7 and later: */ if (boot_cpu_data.x86 < 6) return -ENODEV; - /* - * If core performance counter extensions exists, it must be - * family 15h, otherwise fail. See x86_pmu_addr_offset(). - */ - switch (boot_cpu_data.x86) { - case 0x15: - if (!cpu_has_perfctr_core) - return -ENODEV; - x86_pmu = amd_pmu_f15h; - break; - default: - if (cpu_has_perfctr_core) - return -ENODEV; - x86_pmu = amd_pmu; - break; - } + x86_pmu = amd_pmu; + + setup_event_constraints(); + setup_perfctr_core(); /* Events are common for all AMDs */ memcpy(hw_cache_event_ids, amd_hw_cache_event_ids, diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 187c294bc658..7a8b9d0abcaa 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -5,6 +5,8 @@ * among events on a single PMU. 
*/ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/stddef.h> #include <linux/types.h> #include <linux/init.h> @@ -21,14 +23,14 @@ */ static u64 intel_perfmon_event_map[PERF_COUNT_HW_MAX] __read_mostly = { - [PERF_COUNT_HW_CPU_CYCLES] = 0x003c, - [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, - [PERF_COUNT_HW_CACHE_REFERENCES] = 0x4f2e, - [PERF_COUNT_HW_CACHE_MISSES] = 0x412e, - [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, - [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, - [PERF_COUNT_HW_BUS_CYCLES] = 0x013c, - [PERF_COUNT_HW_REF_CPU_CYCLES] = 0x0300, /* pseudo-encoding */ + [PERF_COUNT_HW_CPU_CYCLES] = 0x003c, + [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, + [PERF_COUNT_HW_CACHE_REFERENCES] = 0x4f2e, + [PERF_COUNT_HW_CACHE_MISSES] = 0x412e, + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, + [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, + [PERF_COUNT_HW_BUS_CYCLES] = 0x013c, + [PERF_COUNT_HW_REF_CPU_CYCLES] = 0x0300, /* pseudo-encoding */ }; static struct event_constraint intel_core_event_constraints[] __read_mostly = @@ -747,7 +749,7 @@ static void intel_pmu_disable_all(void) wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); - if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) + if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask)) intel_pmu_disable_bts(); intel_pmu_pebs_disable_all(); @@ -763,9 +765,9 @@ static void intel_pmu_enable_all(int added) wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_guest_mask); - if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) { + if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask)) { struct perf_event *event = - cpuc->events[X86_PMC_IDX_FIXED_BTS]; + cpuc->events[INTEL_PMC_IDX_FIXED_BTS]; if (WARN_ON_ONCE(!event)) return; @@ -871,7 +873,7 @@ static inline void intel_pmu_ack_status(u64 ack) static void intel_pmu_disable_fixed(struct hw_perf_event *hwc) { - int idx = hwc->idx - X86_PMC_IDX_FIXED; + int idx = hwc->idx - INTEL_PMC_IDX_FIXED; u64 ctrl_val, mask; mask = 0xfULL << (idx * 4); @@ -886,7 +888,7 @@ static void intel_pmu_disable_event(struct perf_event *event) struct hw_perf_event *hwc = &event->hw; struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); - if (unlikely(hwc->idx == X86_PMC_IDX_FIXED_BTS)) { + if (unlikely(hwc->idx == INTEL_PMC_IDX_FIXED_BTS)) { intel_pmu_disable_bts(); intel_pmu_drain_bts_buffer(); return; @@ -915,7 +917,7 @@ static void intel_pmu_disable_event(struct perf_event *event) static void intel_pmu_enable_fixed(struct hw_perf_event *hwc) { - int idx = hwc->idx - X86_PMC_IDX_FIXED; + int idx = hwc->idx - INTEL_PMC_IDX_FIXED; u64 ctrl_val, bits, mask; /* @@ -949,7 +951,7 @@ static void intel_pmu_enable_event(struct perf_event *event) struct hw_perf_event *hwc = &event->hw; struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); - if (unlikely(hwc->idx == X86_PMC_IDX_FIXED_BTS)) { + if (unlikely(hwc->idx == INTEL_PMC_IDX_FIXED_BTS)) { if (!__this_cpu_read(cpu_hw_events.enabled)) return; @@ -1000,14 +1002,14 @@ static void intel_pmu_reset(void) local_irq_save(flags); - printk("clearing PMU state on CPU#%d\n", smp_processor_id()); + pr_info("clearing PMU state on CPU#%d\n", smp_processor_id()); for (idx = 0; idx < x86_pmu.num_counters; idx++) { - checking_wrmsrl(x86_pmu_config_addr(idx), 0ull); - checking_wrmsrl(x86_pmu_event_addr(idx), 0ull); + wrmsrl_safe(x86_pmu_config_addr(idx), 0ull); + wrmsrl_safe(x86_pmu_event_addr(idx), 0ull); } for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) - checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull); + wrmsrl_safe(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 
0ull); if (ds) ds->bts_index = ds->bts_buffer_base; @@ -1707,16 +1709,61 @@ static __init void intel_clovertown_quirk(void) * But taken together it might just make sense to not enable PEBS on * these chips. */ - printk(KERN_WARNING "PEBS disabled due to CPU errata.\n"); + pr_warn("PEBS disabled due to CPU errata\n"); x86_pmu.pebs = 0; x86_pmu.pebs_constraints = NULL; } +static int intel_snb_pebs_broken(int cpu) +{ + u32 rev = UINT_MAX; /* default to broken for unknown models */ + + switch (cpu_data(cpu).x86_model) { + case 42: /* SNB */ + rev = 0x28; + break; + + case 45: /* SNB-EP */ + switch (cpu_data(cpu).x86_mask) { + case 6: rev = 0x618; break; + case 7: rev = 0x70c; break; + } + } + + return (cpu_data(cpu).microcode < rev); +} + +static void intel_snb_check_microcode(void) +{ + int pebs_broken = 0; + int cpu; + + get_online_cpus(); + for_each_online_cpu(cpu) { + if ((pebs_broken = intel_snb_pebs_broken(cpu))) + break; + } + put_online_cpus(); + + if (pebs_broken == x86_pmu.pebs_broken) + return; + + /* + * Serialized by the microcode lock.. + */ + if (x86_pmu.pebs_broken) { + pr_info("PEBS enabled due to microcode update\n"); + x86_pmu.pebs_broken = 0; + } else { + pr_info("PEBS disabled due to CPU errata, please upgrade microcode\n"); + x86_pmu.pebs_broken = 1; + } +} + static __init void intel_sandybridge_quirk(void) { - printk(KERN_WARNING "PEBS disabled due to CPU errata.\n"); - x86_pmu.pebs = 0; - x86_pmu.pebs_constraints = NULL; + x86_pmu.check_microcode = intel_snb_check_microcode; + intel_snb_check_microcode(); } static const struct { int id; char *name; } intel_arch_events_map[] __initconst = { @@ -1736,8 +1783,8 @@ static __init void intel_arch_events_quirk(void) /* disable event that reported as not presend by cpuid */ for_each_set_bit(bit, x86_pmu.events_mask, ARRAY_SIZE(intel_arch_events_map)) { intel_perfmon_event_map[intel_arch_events_map[bit].id] = 0; - printk(KERN_WARNING "CPUID marked event: \'%s\' unavailable\n", - intel_arch_events_map[bit].name); + pr_warn("CPUID marked event: \'%s\' unavailable\n", + intel_arch_events_map[bit].name); } } @@ -1756,7 +1803,7 @@ static __init void intel_nehalem_quirk(void) intel_perfmon_event_map[PERF_COUNT_HW_BRANCH_MISSES] = 0x7f89; ebx.split.no_branch_misses_retired = 0; x86_pmu.events_maskl = ebx.full; - printk(KERN_INFO "CPU erratum AAJ80 worked around\n"); + pr_info("CPU erratum AAJ80 worked around\n"); } } @@ -1765,6 +1812,7 @@ __init int intel_pmu_init(void) union cpuid10_edx edx; union cpuid10_eax eax; union cpuid10_ebx ebx; + struct event_constraint *c; unsigned int unused; int version; @@ -1800,6 +1848,8 @@ __init int intel_pmu_init(void) x86_pmu.events_maskl = ebx.full; x86_pmu.events_mask_len = eax.split.mask_length; + x86_pmu.max_pebs_events = min_t(unsigned, MAX_PEBS_EVENTS, x86_pmu.num_counters); + /* * Quirk: v2 perfmon does not report fixed-purpose events, so * assume at least 3 events: @@ -1951,5 +2001,37 @@ __init int intel_pmu_init(void) } } + if (x86_pmu.num_counters > INTEL_PMC_MAX_GENERIC) { + WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!", + x86_pmu.num_counters, INTEL_PMC_MAX_GENERIC); + x86_pmu.num_counters = INTEL_PMC_MAX_GENERIC; + } + x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1; + + if (x86_pmu.num_counters_fixed > INTEL_PMC_MAX_FIXED) { + WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!", + x86_pmu.num_counters_fixed, INTEL_PMC_MAX_FIXED); + x86_pmu.num_counters_fixed = INTEL_PMC_MAX_FIXED; + } + + x86_pmu.intel_ctrl |= + ((1LL << x86_pmu.num_counters_fixed)-1) << 
INTEL_PMC_IDX_FIXED; + + if (x86_pmu.event_constraints) { + /* + * event on fixed counter2 (REF_CYCLES) only works on this + * counter, so do not extend mask to generic counters + */ + for_each_event_constraint(c, x86_pmu.event_constraints) { + if (c->cmask != X86_RAW_EVENT_MASK + || c->idxmsk64 == INTEL_PMC_MSK_FIXED_REF_CYCLES) { + continue; + } + + c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1; + c->weight += x86_pmu.num_counters; + } + } + return 0; } diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index 35e2192df9f4..629ae0b7ad90 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -248,7 +248,7 @@ void reserve_ds_buffers(void) */ struct event_constraint bts_constraint = - EVENT_CONSTRAINT(0, 1ULL << X86_PMC_IDX_FIXED_BTS, 0); + EVENT_CONSTRAINT(0, 1ULL << INTEL_PMC_IDX_FIXED_BTS, 0); void intel_pmu_enable_bts(u64 config) { @@ -295,7 +295,7 @@ int intel_pmu_drain_bts_buffer(void) u64 to; u64 flags; }; - struct perf_event *event = cpuc->events[X86_PMC_IDX_FIXED_BTS]; + struct perf_event *event = cpuc->events[INTEL_PMC_IDX_FIXED_BTS]; struct bts_record *at, *top; struct perf_output_handle handle; struct perf_event_header header; @@ -620,7 +620,7 @@ static void intel_pmu_drain_pebs_core(struct pt_regs *iregs) * Should not happen, we program the threshold at 1 and do not * set a reset value. */ - WARN_ON_ONCE(n > 1); + WARN_ONCE(n > 1, "bad leftover pebs %d\n", n); at += n - 1; __intel_pmu_pebs_event(event, iregs, at); @@ -651,10 +651,10 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs) * Should not happen, we program the threshold at 1 and do not * set a reset value. */ - WARN_ON_ONCE(n > MAX_PEBS_EVENTS); + WARN_ONCE(n > x86_pmu.max_pebs_events, "Unexpected number of pebs records %d\n", n); for ( ; at < top; at++) { - for_each_set_bit(bit, (unsigned long *)&at->status, MAX_PEBS_EVENTS) { + for_each_set_bit(bit, (unsigned long *)&at->status, x86_pmu.max_pebs_events) { event = cpuc->events[bit]; if (!test_bit(bit, cpuc->active_mask)) continue; @@ -670,7 +670,7 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs) break; } - if (!event || bit >= MAX_PEBS_EVENTS) + if (!event || bit >= x86_pmu.max_pebs_events) continue; __intel_pmu_pebs_event(event, iregs, at); diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c new file mode 100644 index 000000000000..19faffc60886 --- /dev/null +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c @@ -0,0 +1,1850 @@ +#include "perf_event_intel_uncore.h" + +static struct intel_uncore_type *empty_uncore[] = { NULL, }; +static struct intel_uncore_type **msr_uncores = empty_uncore; +static struct intel_uncore_type **pci_uncores = empty_uncore; +/* pci bus to socket mapping */ +static int pcibus_to_physid[256] = { [0 ... 
255] = -1, }; + +static DEFINE_RAW_SPINLOCK(uncore_box_lock); + +/* mask of cpus that collect uncore events */ +static cpumask_t uncore_cpu_mask; + +/* constraint for the fixed counter */ +static struct event_constraint constraint_fixed = + EVENT_CONSTRAINT(~0ULL, 1 << UNCORE_PMC_IDX_FIXED, ~0ULL); +static struct event_constraint constraint_empty = + EVENT_CONSTRAINT(0, 0, 0); + +DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7"); +DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15"); +DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18"); +DEFINE_UNCORE_FORMAT_ATTR(tid_en, tid_en, "config:19"); +DEFINE_UNCORE_FORMAT_ATTR(inv, inv, "config:23"); +DEFINE_UNCORE_FORMAT_ATTR(cmask5, cmask, "config:24-28"); +DEFINE_UNCORE_FORMAT_ATTR(cmask8, cmask, "config:24-31"); +DEFINE_UNCORE_FORMAT_ATTR(thresh8, thresh, "config:24-31"); +DEFINE_UNCORE_FORMAT_ATTR(thresh5, thresh, "config:24-28"); +DEFINE_UNCORE_FORMAT_ATTR(occ_sel, occ_sel, "config:14-15"); +DEFINE_UNCORE_FORMAT_ATTR(occ_invert, occ_invert, "config:30"); +DEFINE_UNCORE_FORMAT_ATTR(occ_edge, occ_edge, "config:14-51"); +DEFINE_UNCORE_FORMAT_ATTR(filter_tid, filter_tid, "config1:0-4"); +DEFINE_UNCORE_FORMAT_ATTR(filter_nid, filter_nid, "config1:10-17"); +DEFINE_UNCORE_FORMAT_ATTR(filter_state, filter_state, "config1:18-22"); +DEFINE_UNCORE_FORMAT_ATTR(filter_opc, filter_opc, "config1:23-31"); +DEFINE_UNCORE_FORMAT_ATTR(filter_brand0, filter_brand0, "config1:0-7"); +DEFINE_UNCORE_FORMAT_ATTR(filter_brand1, filter_brand1, "config1:8-15"); +DEFINE_UNCORE_FORMAT_ATTR(filter_brand2, filter_brand2, "config1:16-23"); +DEFINE_UNCORE_FORMAT_ATTR(filter_brand3, filter_brand3, "config1:24-31"); + +/* Sandy Bridge-EP uncore support */ +static struct intel_uncore_type snbep_uncore_cbox; +static struct intel_uncore_type snbep_uncore_pcu; + +static void snbep_uncore_pci_disable_box(struct intel_uncore_box *box) +{ + struct pci_dev *pdev = box->pci_dev; + int box_ctl = uncore_pci_box_ctl(box); + u32 config; + + pci_read_config_dword(pdev, box_ctl, &config); + config |= SNBEP_PMON_BOX_CTL_FRZ; + pci_write_config_dword(pdev, box_ctl, config); +} + +static void snbep_uncore_pci_enable_box(struct intel_uncore_box *box) +{ + struct pci_dev *pdev = box->pci_dev; + int box_ctl = uncore_pci_box_ctl(box); + u32 config; + + pci_read_config_dword(pdev, box_ctl, &config); + config &= ~SNBEP_PMON_BOX_CTL_FRZ; + pci_write_config_dword(pdev, box_ctl, config); +} + +static void snbep_uncore_pci_enable_event(struct intel_uncore_box *box, + struct perf_event *event) +{ + struct pci_dev *pdev = box->pci_dev; + struct hw_perf_event *hwc = &event->hw; + + pci_write_config_dword(pdev, hwc->config_base, hwc->config | + SNBEP_PMON_CTL_EN); +} + +static void snbep_uncore_pci_disable_event(struct intel_uncore_box *box, + struct perf_event *event) +{ + struct pci_dev *pdev = box->pci_dev; + struct hw_perf_event *hwc = &event->hw; + + pci_write_config_dword(pdev, hwc->config_base, hwc->config); +} + +static u64 snbep_uncore_pci_read_counter(struct intel_uncore_box *box, + struct perf_event *event) +{ + struct pci_dev *pdev = box->pci_dev; + struct hw_perf_event *hwc = &event->hw; + u64 count; + + pci_read_config_dword(pdev, hwc->event_base, (u32 *)&count); + pci_read_config_dword(pdev, hwc->event_base + 4, (u32 *)&count + 1); + return count; +} + +static void snbep_uncore_pci_init_box(struct intel_uncore_box *box) +{ + struct pci_dev *pdev = box->pci_dev; + pci_write_config_dword(pdev, SNBEP_PCI_PMON_BOX_CTL, + SNBEP_PMON_BOX_CTL_INT); +} + +static void 
snbep_uncore_msr_disable_box(struct intel_uncore_box *box) +{ + u64 config; + unsigned msr; + + msr = uncore_msr_box_ctl(box); + if (msr) { + rdmsrl(msr, config); + config |= SNBEP_PMON_BOX_CTL_FRZ; + wrmsrl(msr, config); + return; + } +} + +static void snbep_uncore_msr_enable_box(struct intel_uncore_box *box) +{ + u64 config; + unsigned msr; + + msr = uncore_msr_box_ctl(box); + if (msr) { + rdmsrl(msr, config); + config &= ~SNBEP_PMON_BOX_CTL_FRZ; + wrmsrl(msr, config); + return; + } +} + +static void snbep_uncore_msr_enable_event(struct intel_uncore_box *box, + struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + + if (reg1->idx != EXTRA_REG_NONE) + wrmsrl(reg1->reg, reg1->config); + + wrmsrl(hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN); +} + +static void snbep_uncore_msr_disable_event(struct intel_uncore_box *box, + struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + wrmsrl(hwc->config_base, hwc->config); +} + +static u64 snbep_uncore_msr_read_counter(struct intel_uncore_box *box, + struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + u64 count; + + rdmsrl(hwc->event_base, count); + return count; +} + +static void snbep_uncore_msr_init_box(struct intel_uncore_box *box) +{ + unsigned msr = uncore_msr_box_ctl(box); + if (msr) + wrmsrl(msr, SNBEP_PMON_BOX_CTL_INT); +} + +static struct event_constraint * +snbep_uncore_get_constraint(struct intel_uncore_box *box, + struct perf_event *event) +{ + struct intel_uncore_extra_reg *er; + struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; + unsigned long flags; + bool ok = false; + + if (reg1->idx == EXTRA_REG_NONE || (box->phys_id >= 0 && reg1->alloc)) + return NULL; + + er = &box->shared_regs[reg1->idx]; + raw_spin_lock_irqsave(&er->lock, flags); + if (!atomic_read(&er->ref) || er->config1 == reg1->config) { + atomic_inc(&er->ref); + er->config1 = reg1->config; + ok = true; + } + raw_spin_unlock_irqrestore(&er->lock, flags); + + if (ok) { + if (box->phys_id >= 0) + reg1->alloc = 1; + return NULL; + } + return &constraint_empty; +} + +static void snbep_uncore_put_constraint(struct intel_uncore_box *box, + struct perf_event *event) +{ + struct intel_uncore_extra_reg *er; + struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; + + if (box->phys_id < 0 || !reg1->alloc) + return; + + er = &box->shared_regs[reg1->idx]; + atomic_dec(&er->ref); + reg1->alloc = 0; +} + +static int snbep_uncore_hw_config(struct intel_uncore_box *box, + struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + + if (box->pmu->type == &snbep_uncore_cbox) { + reg1->reg = SNBEP_C0_MSR_PMON_BOX_FILTER + + SNBEP_CBO_MSR_OFFSET * box->pmu->pmu_idx; + reg1->config = event->attr.config1 & + SNBEP_CB0_MSR_PMON_BOX_FILTER_MASK; + } else if (box->pmu->type == &snbep_uncore_pcu) { + reg1->reg = SNBEP_PCU_MSR_PMON_BOX_FILTER; + reg1->config = event->attr.config1 & + SNBEP_PCU_MSR_PMON_BOX_FILTER_MASK; + } else { + return 0; + } + reg1->idx = 0; + return 0; +} + +static struct attribute *snbep_uncore_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_thresh8.attr, + NULL, +}; + +static struct attribute *snbep_uncore_ubox_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_thresh5.attr, + NULL, +}; + +static struct 
attribute *snbep_uncore_cbox_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_tid_en.attr, + &format_attr_inv.attr, + &format_attr_thresh8.attr, + &format_attr_filter_tid.attr, + &format_attr_filter_nid.attr, + &format_attr_filter_state.attr, + &format_attr_filter_opc.attr, + NULL, +}; + +static struct attribute *snbep_uncore_pcu_formats_attr[] = { + &format_attr_event.attr, + &format_attr_occ_sel.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_thresh5.attr, + &format_attr_occ_invert.attr, + &format_attr_occ_edge.attr, + &format_attr_filter_brand0.attr, + &format_attr_filter_brand1.attr, + &format_attr_filter_brand2.attr, + &format_attr_filter_brand3.attr, + NULL, +}; + +static struct uncore_event_desc snbep_uncore_imc_events[] = { + INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0x00"), + INTEL_UNCORE_EVENT_DESC(cas_count_read, "event=0x04,umask=0x03"), + INTEL_UNCORE_EVENT_DESC(cas_count_write, "event=0x04,umask=0x0c"), + { /* end: all zeroes */ }, +}; + +static struct uncore_event_desc snbep_uncore_qpi_events[] = { + INTEL_UNCORE_EVENT_DESC(clockticks, "event=0x14"), + INTEL_UNCORE_EVENT_DESC(txl_flits_active, "event=0x00,umask=0x06"), + INTEL_UNCORE_EVENT_DESC(drs_data, "event=0x02,umask=0x08"), + INTEL_UNCORE_EVENT_DESC(ncb_data, "event=0x03,umask=0x04"), + { /* end: all zeroes */ }, +}; + +static struct attribute_group snbep_uncore_format_group = { + .name = "format", + .attrs = snbep_uncore_formats_attr, +}; + +static struct attribute_group snbep_uncore_ubox_format_group = { + .name = "format", + .attrs = snbep_uncore_ubox_formats_attr, +}; + +static struct attribute_group snbep_uncore_cbox_format_group = { + .name = "format", + .attrs = snbep_uncore_cbox_formats_attr, +}; + +static struct attribute_group snbep_uncore_pcu_format_group = { + .name = "format", + .attrs = snbep_uncore_pcu_formats_attr, +}; + +static struct intel_uncore_ops snbep_uncore_msr_ops = { + .init_box = snbep_uncore_msr_init_box, + .disable_box = snbep_uncore_msr_disable_box, + .enable_box = snbep_uncore_msr_enable_box, + .disable_event = snbep_uncore_msr_disable_event, + .enable_event = snbep_uncore_msr_enable_event, + .read_counter = snbep_uncore_msr_read_counter, + .get_constraint = snbep_uncore_get_constraint, + .put_constraint = snbep_uncore_put_constraint, + .hw_config = snbep_uncore_hw_config, +}; + +static struct intel_uncore_ops snbep_uncore_pci_ops = { + .init_box = snbep_uncore_pci_init_box, + .disable_box = snbep_uncore_pci_disable_box, + .enable_box = snbep_uncore_pci_enable_box, + .disable_event = snbep_uncore_pci_disable_event, + .enable_event = snbep_uncore_pci_enable_event, + .read_counter = snbep_uncore_pci_read_counter, +}; + +static struct event_constraint snbep_uncore_cbox_constraints[] = { + UNCORE_EVENT_CONSTRAINT(0x01, 0x1), + UNCORE_EVENT_CONSTRAINT(0x02, 0x3), + UNCORE_EVENT_CONSTRAINT(0x04, 0x3), + UNCORE_EVENT_CONSTRAINT(0x05, 0x3), + UNCORE_EVENT_CONSTRAINT(0x07, 0x3), + UNCORE_EVENT_CONSTRAINT(0x11, 0x1), + UNCORE_EVENT_CONSTRAINT(0x12, 0x3), + UNCORE_EVENT_CONSTRAINT(0x13, 0x3), + UNCORE_EVENT_CONSTRAINT(0x1b, 0xc), + UNCORE_EVENT_CONSTRAINT(0x1c, 0xc), + UNCORE_EVENT_CONSTRAINT(0x1d, 0xc), + UNCORE_EVENT_CONSTRAINT(0x1e, 0xc), + EVENT_CONSTRAINT_OVERLAP(0x1f, 0xe, 0xff), + UNCORE_EVENT_CONSTRAINT(0x21, 0x3), + UNCORE_EVENT_CONSTRAINT(0x23, 0x3), + UNCORE_EVENT_CONSTRAINT(0x31, 0x3), + UNCORE_EVENT_CONSTRAINT(0x32, 0x3), + UNCORE_EVENT_CONSTRAINT(0x33, 0x3), + 
UNCORE_EVENT_CONSTRAINT(0x34, 0x3), + UNCORE_EVENT_CONSTRAINT(0x35, 0x3), + UNCORE_EVENT_CONSTRAINT(0x36, 0x1), + UNCORE_EVENT_CONSTRAINT(0x37, 0x3), + UNCORE_EVENT_CONSTRAINT(0x38, 0x3), + UNCORE_EVENT_CONSTRAINT(0x39, 0x3), + UNCORE_EVENT_CONSTRAINT(0x3b, 0x1), + EVENT_CONSTRAINT_END +}; + +static struct event_constraint snbep_uncore_r2pcie_constraints[] = { + UNCORE_EVENT_CONSTRAINT(0x10, 0x3), + UNCORE_EVENT_CONSTRAINT(0x11, 0x3), + UNCORE_EVENT_CONSTRAINT(0x12, 0x1), + UNCORE_EVENT_CONSTRAINT(0x23, 0x3), + UNCORE_EVENT_CONSTRAINT(0x24, 0x3), + UNCORE_EVENT_CONSTRAINT(0x25, 0x3), + UNCORE_EVENT_CONSTRAINT(0x26, 0x3), + UNCORE_EVENT_CONSTRAINT(0x32, 0x3), + UNCORE_EVENT_CONSTRAINT(0x33, 0x3), + UNCORE_EVENT_CONSTRAINT(0x34, 0x3), + EVENT_CONSTRAINT_END +}; + +static struct event_constraint snbep_uncore_r3qpi_constraints[] = { + UNCORE_EVENT_CONSTRAINT(0x10, 0x3), + UNCORE_EVENT_CONSTRAINT(0x11, 0x3), + UNCORE_EVENT_CONSTRAINT(0x12, 0x3), + UNCORE_EVENT_CONSTRAINT(0x13, 0x1), + UNCORE_EVENT_CONSTRAINT(0x20, 0x3), + UNCORE_EVENT_CONSTRAINT(0x21, 0x3), + UNCORE_EVENT_CONSTRAINT(0x22, 0x3), + UNCORE_EVENT_CONSTRAINT(0x23, 0x3), + UNCORE_EVENT_CONSTRAINT(0x24, 0x3), + UNCORE_EVENT_CONSTRAINT(0x25, 0x3), + UNCORE_EVENT_CONSTRAINT(0x26, 0x3), + UNCORE_EVENT_CONSTRAINT(0x30, 0x3), + UNCORE_EVENT_CONSTRAINT(0x31, 0x3), + UNCORE_EVENT_CONSTRAINT(0x32, 0x3), + UNCORE_EVENT_CONSTRAINT(0x33, 0x3), + UNCORE_EVENT_CONSTRAINT(0x34, 0x3), + UNCORE_EVENT_CONSTRAINT(0x36, 0x3), + UNCORE_EVENT_CONSTRAINT(0x37, 0x3), + EVENT_CONSTRAINT_END +}; + +static struct intel_uncore_type snbep_uncore_ubox = { + .name = "ubox", + .num_counters = 2, + .num_boxes = 1, + .perf_ctr_bits = 44, + .fixed_ctr_bits = 48, + .perf_ctr = SNBEP_U_MSR_PMON_CTR0, + .event_ctl = SNBEP_U_MSR_PMON_CTL0, + .event_mask = SNBEP_U_MSR_PMON_RAW_EVENT_MASK, + .fixed_ctr = SNBEP_U_MSR_PMON_UCLK_FIXED_CTR, + .fixed_ctl = SNBEP_U_MSR_PMON_UCLK_FIXED_CTL, + .ops = &snbep_uncore_msr_ops, + .format_group = &snbep_uncore_ubox_format_group, +}; + +static struct intel_uncore_type snbep_uncore_cbox = { + .name = "cbox", + .num_counters = 4, + .num_boxes = 8, + .perf_ctr_bits = 44, + .event_ctl = SNBEP_C0_MSR_PMON_CTL0, + .perf_ctr = SNBEP_C0_MSR_PMON_CTR0, + .event_mask = SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK, + .box_ctl = SNBEP_C0_MSR_PMON_BOX_CTL, + .msr_offset = SNBEP_CBO_MSR_OFFSET, + .num_shared_regs = 1, + .constraints = snbep_uncore_cbox_constraints, + .ops = &snbep_uncore_msr_ops, + .format_group = &snbep_uncore_cbox_format_group, +}; + +static struct intel_uncore_type snbep_uncore_pcu = { + .name = "pcu", + .num_counters = 4, + .num_boxes = 1, + .perf_ctr_bits = 48, + .perf_ctr = SNBEP_PCU_MSR_PMON_CTR0, + .event_ctl = SNBEP_PCU_MSR_PMON_CTL0, + .event_mask = SNBEP_PCU_MSR_PMON_RAW_EVENT_MASK, + .box_ctl = SNBEP_PCU_MSR_PMON_BOX_CTL, + .num_shared_regs = 1, + .ops = &snbep_uncore_msr_ops, + .format_group = &snbep_uncore_pcu_format_group, +}; + +static struct intel_uncore_type *snbep_msr_uncores[] = { + &snbep_uncore_ubox, + &snbep_uncore_cbox, + &snbep_uncore_pcu, + NULL, +}; + +#define SNBEP_UNCORE_PCI_COMMON_INIT() \ + .perf_ctr = SNBEP_PCI_PMON_CTR0, \ + .event_ctl = SNBEP_PCI_PMON_CTL0, \ + .event_mask = SNBEP_PMON_RAW_EVENT_MASK, \ + .box_ctl = SNBEP_PCI_PMON_BOX_CTL, \ + .ops = &snbep_uncore_pci_ops, \ + .format_group = &snbep_uncore_format_group + +static struct intel_uncore_type snbep_uncore_ha = { + .name = "ha", + .num_counters = 4, + .num_boxes = 1, + .perf_ctr_bits = 48, + SNBEP_UNCORE_PCI_COMMON_INIT(), +}; + +static struct 
intel_uncore_type snbep_uncore_imc = { + .name = "imc", + .num_counters = 4, + .num_boxes = 4, + .perf_ctr_bits = 48, + .fixed_ctr_bits = 48, + .fixed_ctr = SNBEP_MC_CHy_PCI_PMON_FIXED_CTR, + .fixed_ctl = SNBEP_MC_CHy_PCI_PMON_FIXED_CTL, + .event_descs = snbep_uncore_imc_events, + SNBEP_UNCORE_PCI_COMMON_INIT(), +}; + +static struct intel_uncore_type snbep_uncore_qpi = { + .name = "qpi", + .num_counters = 4, + .num_boxes = 2, + .perf_ctr_bits = 48, + .event_descs = snbep_uncore_qpi_events, + SNBEP_UNCORE_PCI_COMMON_INIT(), +}; + + +static struct intel_uncore_type snbep_uncore_r2pcie = { + .name = "r2pcie", + .num_counters = 4, + .num_boxes = 1, + .perf_ctr_bits = 44, + .constraints = snbep_uncore_r2pcie_constraints, + SNBEP_UNCORE_PCI_COMMON_INIT(), +}; + +static struct intel_uncore_type snbep_uncore_r3qpi = { + .name = "r3qpi", + .num_counters = 3, + .num_boxes = 2, + .perf_ctr_bits = 44, + .constraints = snbep_uncore_r3qpi_constraints, + SNBEP_UNCORE_PCI_COMMON_INIT(), +}; + +static struct intel_uncore_type *snbep_pci_uncores[] = { + &snbep_uncore_ha, + &snbep_uncore_imc, + &snbep_uncore_qpi, + &snbep_uncore_r2pcie, + &snbep_uncore_r3qpi, + NULL, +}; + +static DEFINE_PCI_DEVICE_TABLE(snbep_uncore_pci_ids) = { + { /* Home Agent */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_HA), + .driver_data = (unsigned long)&snbep_uncore_ha, + }, + { /* MC Channel 0 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC0), + .driver_data = (unsigned long)&snbep_uncore_imc, + }, + { /* MC Channel 1 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC1), + .driver_data = (unsigned long)&snbep_uncore_imc, + }, + { /* MC Channel 2 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC2), + .driver_data = (unsigned long)&snbep_uncore_imc, + }, + { /* MC Channel 3 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC3), + .driver_data = (unsigned long)&snbep_uncore_imc, + }, + { /* QPI Port 0 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_QPI0), + .driver_data = (unsigned long)&snbep_uncore_qpi, + }, + { /* QPI Port 1 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_QPI1), + .driver_data = (unsigned long)&snbep_uncore_qpi, + }, + { /* P2PCIe */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R2PCIE), + .driver_data = (unsigned long)&snbep_uncore_r2pcie, + }, + { /* R3QPI Link 0 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R3QPI0), + .driver_data = (unsigned long)&snbep_uncore_r3qpi, + }, + { /* R3QPI Link 1 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R3QPI1), + .driver_data = (unsigned long)&snbep_uncore_r3qpi, + }, + { /* end: all zeroes */ } +}; + +static struct pci_driver snbep_uncore_pci_driver = { + .name = "snbep_uncore", + .id_table = snbep_uncore_pci_ids, +}; + +/* + * build pci bus to socket mapping + */ +static void snbep_pci2phy_map_init(void) +{ + struct pci_dev *ubox_dev = NULL; + int i, bus, nodeid; + u32 config; + + while (1) { + /* find the UBOX device */ + ubox_dev = pci_get_device(PCI_VENDOR_ID_INTEL, + PCI_DEVICE_ID_INTEL_JAKETOWN_UBOX, + ubox_dev); + if (!ubox_dev) + break; + bus = ubox_dev->bus->number; + /* get the Node ID of the local register */ + pci_read_config_dword(ubox_dev, 0x40, &config); + nodeid = config; + /* get the Node ID mapping */ + pci_read_config_dword(ubox_dev, 0x54, &config); + /* + * every three bits in the Node ID mapping register maps + * to a particular node. 
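+		 * For example, if the local Node ID read from offset 0x40
+		 * is 1 and bits 5:3 of the mapping register also decode to 1,
+		 * this bus is assigned to physical package (socket) 1.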
+ */ + for (i = 0; i < 8; i++) { + if (nodeid == ((config >> (3 * i)) & 0x7)) { + pcibus_to_physid[bus] = i; + break; + } + } + }; + return; +} +/* end of Sandy Bridge-EP uncore support */ + + +/* Sandy Bridge uncore support */ +static void snb_uncore_msr_enable_event(struct intel_uncore_box *box, + struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + if (hwc->idx < UNCORE_PMC_IDX_FIXED) + wrmsrl(hwc->config_base, hwc->config | SNB_UNC_CTL_EN); + else + wrmsrl(hwc->config_base, SNB_UNC_CTL_EN); +} + +static void snb_uncore_msr_disable_event(struct intel_uncore_box *box, + struct perf_event *event) +{ + wrmsrl(event->hw.config_base, 0); +} + +static u64 snb_uncore_msr_read_counter(struct intel_uncore_box *box, + struct perf_event *event) +{ + u64 count; + rdmsrl(event->hw.event_base, count); + return count; +} + +static void snb_uncore_msr_init_box(struct intel_uncore_box *box) +{ + if (box->pmu->pmu_idx == 0) { + wrmsrl(SNB_UNC_PERF_GLOBAL_CTL, + SNB_UNC_GLOBAL_CTL_EN | SNB_UNC_GLOBAL_CTL_CORE_ALL); + } +} + +static struct attribute *snb_uncore_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_cmask5.attr, + NULL, +}; + +static struct attribute_group snb_uncore_format_group = { + .name = "format", + .attrs = snb_uncore_formats_attr, +}; + +static struct intel_uncore_ops snb_uncore_msr_ops = { + .init_box = snb_uncore_msr_init_box, + .disable_event = snb_uncore_msr_disable_event, + .enable_event = snb_uncore_msr_enable_event, + .read_counter = snb_uncore_msr_read_counter, +}; + +static struct event_constraint snb_uncore_cbox_constraints[] = { + UNCORE_EVENT_CONSTRAINT(0x80, 0x1), + UNCORE_EVENT_CONSTRAINT(0x83, 0x1), + EVENT_CONSTRAINT_END +}; + +static struct intel_uncore_type snb_uncore_cbox = { + .name = "cbox", + .num_counters = 2, + .num_boxes = 4, + .perf_ctr_bits = 44, + .fixed_ctr_bits = 48, + .perf_ctr = SNB_UNC_CBO_0_PER_CTR0, + .event_ctl = SNB_UNC_CBO_0_PERFEVTSEL0, + .fixed_ctr = SNB_UNC_FIXED_CTR, + .fixed_ctl = SNB_UNC_FIXED_CTR_CTRL, + .single_fixed = 1, + .event_mask = SNB_UNC_RAW_EVENT_MASK, + .msr_offset = SNB_UNC_CBO_MSR_OFFSET, + .constraints = snb_uncore_cbox_constraints, + .ops = &snb_uncore_msr_ops, + .format_group = &snb_uncore_format_group, +}; + +static struct intel_uncore_type *snb_msr_uncores[] = { + &snb_uncore_cbox, + NULL, +}; +/* end of Sandy Bridge uncore support */ + +/* Nehalem uncore support */ +static void nhm_uncore_msr_disable_box(struct intel_uncore_box *box) +{ + wrmsrl(NHM_UNC_PERF_GLOBAL_CTL, 0); +} + +static void nhm_uncore_msr_enable_box(struct intel_uncore_box *box) +{ + wrmsrl(NHM_UNC_PERF_GLOBAL_CTL, + NHM_UNC_GLOBAL_CTL_EN_PC_ALL | NHM_UNC_GLOBAL_CTL_EN_FC); +} + +static void nhm_uncore_msr_enable_event(struct intel_uncore_box *box, + struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + if (hwc->idx < UNCORE_PMC_IDX_FIXED) + wrmsrl(hwc->config_base, hwc->config | SNB_UNC_CTL_EN); + else + wrmsrl(hwc->config_base, NHM_UNC_FIXED_CTR_CTL_EN); +} + +static struct attribute *nhm_uncore_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_cmask8.attr, + NULL, +}; + +static struct attribute_group nhm_uncore_format_group = { + .name = "format", + .attrs = nhm_uncore_formats_attr, +}; + +static struct uncore_event_desc nhm_uncore_events[] = { + INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0x00"), + 
INTEL_UNCORE_EVENT_DESC(qmc_writes_full_any, "event=0x2f,umask=0x0f"), + INTEL_UNCORE_EVENT_DESC(qmc_normal_reads_any, "event=0x2c,umask=0x0f"), + INTEL_UNCORE_EVENT_DESC(qhl_request_ioh_reads, "event=0x20,umask=0x01"), + INTEL_UNCORE_EVENT_DESC(qhl_request_ioh_writes, "event=0x20,umask=0x02"), + INTEL_UNCORE_EVENT_DESC(qhl_request_remote_reads, "event=0x20,umask=0x04"), + INTEL_UNCORE_EVENT_DESC(qhl_request_remote_writes, "event=0x20,umask=0x08"), + INTEL_UNCORE_EVENT_DESC(qhl_request_local_reads, "event=0x20,umask=0x10"), + INTEL_UNCORE_EVENT_DESC(qhl_request_local_writes, "event=0x20,umask=0x20"), + { /* end: all zeroes */ }, +}; + +static struct intel_uncore_ops nhm_uncore_msr_ops = { + .disable_box = nhm_uncore_msr_disable_box, + .enable_box = nhm_uncore_msr_enable_box, + .disable_event = snb_uncore_msr_disable_event, + .enable_event = nhm_uncore_msr_enable_event, + .read_counter = snb_uncore_msr_read_counter, +}; + +static struct intel_uncore_type nhm_uncore = { + .name = "", + .num_counters = 8, + .num_boxes = 1, + .perf_ctr_bits = 48, + .fixed_ctr_bits = 48, + .event_ctl = NHM_UNC_PERFEVTSEL0, + .perf_ctr = NHM_UNC_UNCORE_PMC0, + .fixed_ctr = NHM_UNC_FIXED_CTR, + .fixed_ctl = NHM_UNC_FIXED_CTR_CTRL, + .event_mask = NHM_UNC_RAW_EVENT_MASK, + .event_descs = nhm_uncore_events, + .ops = &nhm_uncore_msr_ops, + .format_group = &nhm_uncore_format_group, +}; + +static struct intel_uncore_type *nhm_msr_uncores[] = { + &nhm_uncore, + NULL, +}; +/* end of Nehalem uncore support */ + +static void uncore_assign_hw_event(struct intel_uncore_box *box, + struct perf_event *event, int idx) +{ + struct hw_perf_event *hwc = &event->hw; + + hwc->idx = idx; + hwc->last_tag = ++box->tags[idx]; + + if (hwc->idx == UNCORE_PMC_IDX_FIXED) { + hwc->event_base = uncore_fixed_ctr(box); + hwc->config_base = uncore_fixed_ctl(box); + return; + } + + hwc->config_base = uncore_event_ctl(box, hwc->idx); + hwc->event_base = uncore_perf_ctr(box, hwc->idx); +} + +static void uncore_perf_event_update(struct intel_uncore_box *box, + struct perf_event *event) +{ + u64 prev_count, new_count, delta; + int shift; + + if (event->hw.idx >= UNCORE_PMC_IDX_FIXED) + shift = 64 - uncore_fixed_ctr_bits(box); + else + shift = 64 - uncore_perf_ctr_bits(box); + + /* the hrtimer might modify the previous event value */ +again: + prev_count = local64_read(&event->hw.prev_count); + new_count = uncore_read_counter(box, event); + if (local64_xchg(&event->hw.prev_count, new_count) != prev_count) + goto again; + + delta = (new_count << shift) - (prev_count << shift); + delta >>= shift; + + local64_add(delta, &event->count); +} + +/* + * The overflow interrupt is unavailable for SandyBridge-EP, is broken + * for SandyBridge. So we use hrtimer to periodically poll the counter + * to avoid overflow. 
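+ * (As a rough illustration: a 44-bit counter incrementing at 4 GHz
+ * wraps after about 2^44 / 4e9 ~= 4400 seconds, so the poll period
+ * only has to stay comfortably below that to catch every wrap.)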
+ */ +static enum hrtimer_restart uncore_pmu_hrtimer(struct hrtimer *hrtimer) +{ + struct intel_uncore_box *box; + unsigned long flags; + int bit; + + box = container_of(hrtimer, struct intel_uncore_box, hrtimer); + if (!box->n_active || box->cpu != smp_processor_id()) + return HRTIMER_NORESTART; + /* + * disable local interrupt to prevent uncore_pmu_event_start/stop + * to interrupt the update process + */ + local_irq_save(flags); + + for_each_set_bit(bit, box->active_mask, UNCORE_PMC_IDX_MAX) + uncore_perf_event_update(box, box->events[bit]); + + local_irq_restore(flags); + + hrtimer_forward_now(hrtimer, ns_to_ktime(UNCORE_PMU_HRTIMER_INTERVAL)); + return HRTIMER_RESTART; +} + +static void uncore_pmu_start_hrtimer(struct intel_uncore_box *box) +{ + __hrtimer_start_range_ns(&box->hrtimer, + ns_to_ktime(UNCORE_PMU_HRTIMER_INTERVAL), 0, + HRTIMER_MODE_REL_PINNED, 0); +} + +static void uncore_pmu_cancel_hrtimer(struct intel_uncore_box *box) +{ + hrtimer_cancel(&box->hrtimer); +} + +static void uncore_pmu_init_hrtimer(struct intel_uncore_box *box) +{ + hrtimer_init(&box->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + box->hrtimer.function = uncore_pmu_hrtimer; +} + +struct intel_uncore_box *uncore_alloc_box(struct intel_uncore_type *type, + int cpu) +{ + struct intel_uncore_box *box; + int i, size; + + size = sizeof(*box) + type->num_shared_regs * + sizeof(struct intel_uncore_extra_reg); + + box = kmalloc_node(size, GFP_KERNEL | __GFP_ZERO, cpu_to_node(cpu)); + if (!box) + return NULL; + + for (i = 0; i < type->num_shared_regs; i++) + raw_spin_lock_init(&box->shared_regs[i].lock); + + uncore_pmu_init_hrtimer(box); + atomic_set(&box->refcnt, 1); + box->cpu = -1; + box->phys_id = -1; + + return box; +} + +static struct intel_uncore_box * +uncore_pmu_to_box(struct intel_uncore_pmu *pmu, int cpu) +{ + static struct intel_uncore_box *box; + + box = *per_cpu_ptr(pmu->box, cpu); + if (box) + return box; + + raw_spin_lock(&uncore_box_lock); + list_for_each_entry(box, &pmu->box_list, list) { + if (box->phys_id == topology_physical_package_id(cpu)) { + atomic_inc(&box->refcnt); + *per_cpu_ptr(pmu->box, cpu) = box; + break; + } + } + raw_spin_unlock(&uncore_box_lock); + + return *per_cpu_ptr(pmu->box, cpu); +} + +static struct intel_uncore_pmu *uncore_event_to_pmu(struct perf_event *event) +{ + return container_of(event->pmu, struct intel_uncore_pmu, pmu); +} + +static struct intel_uncore_box *uncore_event_to_box(struct perf_event *event) +{ + /* + * perf core schedules event on the basis of cpu, uncore events are + * collected by one of the cpus inside a physical package. 
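
Since none of these uncore boxes can raise an overflow interrupt, uncore_pmu_hrtimer() above simply re-reads every active counter once per UNCORE_PMU_HRTIMER_INTERVAL and re-arms itself. The same polling pattern in isolation, as a small module sketch (simplified: the real driver pins the timer to the collecting CPU, and read_hw_counter() here is only a placeholder for the rdmsrl()-based read):

#include <linux/module.h>
#include <linux/hrtimer.h>
#include <linux/ktime.h>

#define POLL_INTERVAL_NS        (60ULL * NSEC_PER_SEC)  /* like UNCORE_PMU_HRTIMER_INTERVAL */

static struct hrtimer poll_timer;

static u64 read_hw_counter(void)
{
        return 0;               /* placeholder for the real counter read */
}

static enum hrtimer_restart poll_fn(struct hrtimer *t)
{
        /* sample often enough that a 44/48-bit counter cannot wrap twice */
        pr_info("counter now %llu\n", read_hw_counter());
        hrtimer_forward_now(t, ns_to_ktime(POLL_INTERVAL_NS));
        return HRTIMER_RESTART;
}

static int __init poll_init(void)
{
        hrtimer_init(&poll_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
        poll_timer.function = poll_fn;
        hrtimer_start(&poll_timer, ns_to_ktime(POLL_INTERVAL_NS), HRTIMER_MODE_REL);
        return 0;
}

static void __exit poll_exit(void)
{
        hrtimer_cancel(&poll_timer);
}

module_init(poll_init);
module_exit(poll_exit);
MODULE_LICENSE("GPL");
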
+ */ + return uncore_pmu_to_box(uncore_event_to_pmu(event), + smp_processor_id()); +} + +static int uncore_collect_events(struct intel_uncore_box *box, + struct perf_event *leader, bool dogrp) +{ + struct perf_event *event; + int n, max_count; + + max_count = box->pmu->type->num_counters; + if (box->pmu->type->fixed_ctl) + max_count++; + + if (box->n_events >= max_count) + return -EINVAL; + + n = box->n_events; + box->event_list[n] = leader; + n++; + if (!dogrp) + return n; + + list_for_each_entry(event, &leader->sibling_list, group_entry) { + if (event->state <= PERF_EVENT_STATE_OFF) + continue; + + if (n >= max_count) + return -EINVAL; + + box->event_list[n] = event; + n++; + } + return n; +} + +static struct event_constraint * +uncore_get_event_constraint(struct intel_uncore_box *box, + struct perf_event *event) +{ + struct intel_uncore_type *type = box->pmu->type; + struct event_constraint *c; + + if (type->ops->get_constraint) { + c = type->ops->get_constraint(box, event); + if (c) + return c; + } + + if (event->hw.config == ~0ULL) + return &constraint_fixed; + + if (type->constraints) { + for_each_event_constraint(c, type->constraints) { + if ((event->hw.config & c->cmask) == c->code) + return c; + } + } + + return &type->unconstrainted; +} + +static void uncore_put_event_constraint(struct intel_uncore_box *box, + struct perf_event *event) +{ + if (box->pmu->type->ops->put_constraint) + box->pmu->type->ops->put_constraint(box, event); +} + +static int uncore_assign_events(struct intel_uncore_box *box, + int assign[], int n) +{ + unsigned long used_mask[BITS_TO_LONGS(UNCORE_PMC_IDX_MAX)]; + struct event_constraint *c, *constraints[UNCORE_PMC_IDX_MAX]; + int i, wmin, wmax, ret = 0; + struct hw_perf_event *hwc; + + bitmap_zero(used_mask, UNCORE_PMC_IDX_MAX); + + for (i = 0, wmin = UNCORE_PMC_IDX_MAX, wmax = 0; i < n; i++) { + c = uncore_get_event_constraint(box, box->event_list[i]); + constraints[i] = c; + wmin = min(wmin, c->weight); + wmax = max(wmax, c->weight); + } + + /* fastpath, try to reuse previous register */ + for (i = 0; i < n; i++) { + hwc = &box->event_list[i]->hw; + c = constraints[i]; + + /* never assigned */ + if (hwc->idx == -1) + break; + + /* constraint still honored */ + if (!test_bit(hwc->idx, c->idxmsk)) + break; + + /* not already used */ + if (test_bit(hwc->idx, used_mask)) + break; + + __set_bit(hwc->idx, used_mask); + if (assign) + assign[i] = hwc->idx; + } + /* slow path */ + if (i != n) + ret = perf_assign_events(constraints, n, wmin, wmax, assign); + + if (!assign || ret) { + for (i = 0; i < n; i++) + uncore_put_event_constraint(box, box->event_list[i]); + } + return ret ? 
-EINVAL : 0; +} + +static void uncore_pmu_event_start(struct perf_event *event, int flags) +{ + struct intel_uncore_box *box = uncore_event_to_box(event); + int idx = event->hw.idx; + + if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED))) + return; + + if (WARN_ON_ONCE(idx == -1 || idx >= UNCORE_PMC_IDX_MAX)) + return; + + event->hw.state = 0; + box->events[idx] = event; + box->n_active++; + __set_bit(idx, box->active_mask); + + local64_set(&event->hw.prev_count, uncore_read_counter(box, event)); + uncore_enable_event(box, event); + + if (box->n_active == 1) { + uncore_enable_box(box); + uncore_pmu_start_hrtimer(box); + } +} + +static void uncore_pmu_event_stop(struct perf_event *event, int flags) +{ + struct intel_uncore_box *box = uncore_event_to_box(event); + struct hw_perf_event *hwc = &event->hw; + + if (__test_and_clear_bit(hwc->idx, box->active_mask)) { + uncore_disable_event(box, event); + box->n_active--; + box->events[hwc->idx] = NULL; + WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED); + hwc->state |= PERF_HES_STOPPED; + + if (box->n_active == 0) { + uncore_disable_box(box); + uncore_pmu_cancel_hrtimer(box); + } + } + + if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) { + /* + * Drain the remaining delta count out of a event + * that we are disabling: + */ + uncore_perf_event_update(box, event); + hwc->state |= PERF_HES_UPTODATE; + } +} + +static int uncore_pmu_event_add(struct perf_event *event, int flags) +{ + struct intel_uncore_box *box = uncore_event_to_box(event); + struct hw_perf_event *hwc = &event->hw; + int assign[UNCORE_PMC_IDX_MAX]; + int i, n, ret; + + if (!box) + return -ENODEV; + + ret = n = uncore_collect_events(box, event, false); + if (ret < 0) + return ret; + + hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED; + if (!(flags & PERF_EF_START)) + hwc->state |= PERF_HES_ARCH; + + ret = uncore_assign_events(box, assign, n); + if (ret) + return ret; + + /* save events moving to new counters */ + for (i = 0; i < box->n_events; i++) { + event = box->event_list[i]; + hwc = &event->hw; + + if (hwc->idx == assign[i] && + hwc->last_tag == box->tags[assign[i]]) + continue; + /* + * Ensure we don't accidentally enable a stopped + * counter simply because we rescheduled. 
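
uncore_get_event_constraint() and uncore_assign_events() above reuse the core-PMU constraint scheme: an event's config is matched against each table entry under the entry's cmask, and the entry's counter mask says where the event may be scheduled; anything that matches no entry falls back to the unconstrained mask. A stand-alone model of that lookup for the Sandy Bridge C-box table shown earlier (event 0x35 is just an arbitrary unconstrained example):

#include <stdio.h>
#include <stdint.h>

struct constraint {
        uint64_t code;          /* event select value to match */
        uint64_t cmask;         /* which config bits take part in the match */
        unsigned int idxmsk;    /* counters the event may occupy */
};

/* mirrors snb_uncore_cbox_constraints: events 0x80 and 0x83 only on counter 0 */
static const struct constraint cbox_constraints[] = {
        { 0x80, 0xff, 0x1 },
        { 0x83, 0xff, 0x1 },
};

static unsigned int allowed_counters(uint64_t config, unsigned int all_mask)
{
        size_t i;

        for (i = 0; i < sizeof(cbox_constraints) / sizeof(cbox_constraints[0]); i++)
                if ((config & cbox_constraints[i].cmask) == cbox_constraints[i].code)
                        return cbox_constraints[i].idxmsk;

        return all_mask;        /* unconstrained: any of the box's counters */
}

int main(void)
{
        printf("event 0x80 -> counter mask 0x%x\n", allowed_counters(0x80, 0x3));
        printf("event 0x35 -> counter mask 0x%x\n", allowed_counters(0x35, 0x3));
        return 0;
}
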
+ */ + if (hwc->state & PERF_HES_STOPPED) + hwc->state |= PERF_HES_ARCH; + + uncore_pmu_event_stop(event, PERF_EF_UPDATE); + } + + /* reprogram moved events into new counters */ + for (i = 0; i < n; i++) { + event = box->event_list[i]; + hwc = &event->hw; + + if (hwc->idx != assign[i] || + hwc->last_tag != box->tags[assign[i]]) + uncore_assign_hw_event(box, event, assign[i]); + else if (i < box->n_events) + continue; + + if (hwc->state & PERF_HES_ARCH) + continue; + + uncore_pmu_event_start(event, 0); + } + box->n_events = n; + + return 0; +} + +static void uncore_pmu_event_del(struct perf_event *event, int flags) +{ + struct intel_uncore_box *box = uncore_event_to_box(event); + int i; + + uncore_pmu_event_stop(event, PERF_EF_UPDATE); + + for (i = 0; i < box->n_events; i++) { + if (event == box->event_list[i]) { + uncore_put_event_constraint(box, event); + + while (++i < box->n_events) + box->event_list[i - 1] = box->event_list[i]; + + --box->n_events; + break; + } + } + + event->hw.idx = -1; + event->hw.last_tag = ~0ULL; +} + +static void uncore_pmu_event_read(struct perf_event *event) +{ + struct intel_uncore_box *box = uncore_event_to_box(event); + uncore_perf_event_update(box, event); +} + +/* + * validation ensures the group can be loaded onto the + * PMU if it was the only group available. + */ +static int uncore_validate_group(struct intel_uncore_pmu *pmu, + struct perf_event *event) +{ + struct perf_event *leader = event->group_leader; + struct intel_uncore_box *fake_box; + int ret = -EINVAL, n; + + fake_box = uncore_alloc_box(pmu->type, smp_processor_id()); + if (!fake_box) + return -ENOMEM; + + fake_box->pmu = pmu; + /* + * the event is not yet connected with its + * siblings therefore we must first collect + * existing siblings, then add the new event + * before we can simulate the scheduling + */ + n = uncore_collect_events(fake_box, leader, true); + if (n < 0) + goto out; + + fake_box->n_events = n; + n = uncore_collect_events(fake_box, event, false); + if (n < 0) + goto out; + + fake_box->n_events = n; + + ret = uncore_assign_events(fake_box, NULL, n); +out: + kfree(fake_box); + return ret; +} + +int uncore_pmu_event_init(struct perf_event *event) +{ + struct intel_uncore_pmu *pmu; + struct intel_uncore_box *box; + struct hw_perf_event *hwc = &event->hw; + int ret; + + if (event->attr.type != event->pmu->type) + return -ENOENT; + + pmu = uncore_event_to_pmu(event); + /* no device found for this pmu */ + if (pmu->func_id < 0) + return -ENOENT; + + /* + * Uncore PMU does measure at all privilege level all the time. + * So it doesn't make sense to specify any exclude bits. 
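
From user space, uncore_pmu_event_init() above only accepts counting-mode, CPU-bound events: attr.type must be the dynamic type this PMU registered with, cpu must name a CPU (the driver then redirects the event to the package's collecting CPU), and the exclude_* bits and sample_period have to stay zero. A sketch of such an open; the type number and config value are placeholders that would normally be read from /sys/bus/event_source/devices/uncore_*/type and the format/events directories:

#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

static int perf_event_open(struct perf_event_attr *attr, pid_t pid,
                           int cpu, int group_fd, unsigned long flags)
{
        return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}

int main(void)
{
        struct perf_event_attr attr;
        uint64_t count;
        int fd;

        memset(&attr, 0, sizeof(attr));
        attr.size = sizeof(attr);
        attr.type = 22;         /* placeholder: cat /sys/bus/event_source/devices/uncore_cbox_0/type */
        attr.config = 0x0111;   /* placeholder: event=0x11, umask=0x01 */
        /* counting only: no sample_period, no exclude_* bits */

        fd = perf_event_open(&attr, -1 /* all tasks */, 0 /* cpu 0 */, -1, 0);
        if (fd < 0) {
                perror("perf_event_open");
                return 1;
        }
        sleep(1);
        if (read(fd, &count, sizeof(count)) == sizeof(count))
                printf("uncore count: %llu\n", (unsigned long long)count);
        close(fd);
        return 0;
}

A sufficiently recent perf tool can express the same request symbolically, e.g. perf stat -a -e 'uncore_cbox_0/event=0x11,umask=0x01/', using the format and events attribute groups this driver registers.
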
+ */ + if (event->attr.exclude_user || event->attr.exclude_kernel || + event->attr.exclude_hv || event->attr.exclude_idle) + return -EINVAL; + + /* Sampling not supported yet */ + if (hwc->sample_period) + return -EINVAL; + + /* + * Place all uncore events for a particular physical package + * onto a single cpu + */ + if (event->cpu < 0) + return -EINVAL; + box = uncore_pmu_to_box(pmu, event->cpu); + if (!box || box->cpu < 0) + return -EINVAL; + event->cpu = box->cpu; + + event->hw.idx = -1; + event->hw.last_tag = ~0ULL; + event->hw.extra_reg.idx = EXTRA_REG_NONE; + + if (event->attr.config == UNCORE_FIXED_EVENT) { + /* no fixed counter */ + if (!pmu->type->fixed_ctl) + return -EINVAL; + /* + * if there is only one fixed counter, only the first pmu + * can access the fixed counter + */ + if (pmu->type->single_fixed && pmu->pmu_idx > 0) + return -EINVAL; + hwc->config = ~0ULL; + } else { + hwc->config = event->attr.config & pmu->type->event_mask; + if (pmu->type->ops->hw_config) { + ret = pmu->type->ops->hw_config(box, event); + if (ret) + return ret; + } + } + + if (event->group_leader != event) + ret = uncore_validate_group(pmu, event); + else + ret = 0; + + return ret; +} + +static int __init uncore_pmu_register(struct intel_uncore_pmu *pmu) +{ + int ret; + + pmu->pmu = (struct pmu) { + .attr_groups = pmu->type->attr_groups, + .task_ctx_nr = perf_invalid_context, + .event_init = uncore_pmu_event_init, + .add = uncore_pmu_event_add, + .del = uncore_pmu_event_del, + .start = uncore_pmu_event_start, + .stop = uncore_pmu_event_stop, + .read = uncore_pmu_event_read, + }; + + if (pmu->type->num_boxes == 1) { + if (strlen(pmu->type->name) > 0) + sprintf(pmu->name, "uncore_%s", pmu->type->name); + else + sprintf(pmu->name, "uncore"); + } else { + sprintf(pmu->name, "uncore_%s_%d", pmu->type->name, + pmu->pmu_idx); + } + + ret = perf_pmu_register(&pmu->pmu, pmu->name, -1); + return ret; +} + +static void __init uncore_type_exit(struct intel_uncore_type *type) +{ + int i; + + for (i = 0; i < type->num_boxes; i++) + free_percpu(type->pmus[i].box); + kfree(type->pmus); + type->pmus = NULL; + kfree(type->attr_groups[1]); + type->attr_groups[1] = NULL; +} + +static void uncore_types_exit(struct intel_uncore_type **types) +{ + int i; + for (i = 0; types[i]; i++) + uncore_type_exit(types[i]); +} + +static int __init uncore_type_init(struct intel_uncore_type *type) +{ + struct intel_uncore_pmu *pmus; + struct attribute_group *events_group; + struct attribute **attrs; + int i, j; + + pmus = kzalloc(sizeof(*pmus) * type->num_boxes, GFP_KERNEL); + if (!pmus) + return -ENOMEM; + + type->unconstrainted = (struct event_constraint) + __EVENT_CONSTRAINT(0, (1ULL << type->num_counters) - 1, + 0, type->num_counters, 0); + + for (i = 0; i < type->num_boxes; i++) { + pmus[i].func_id = -1; + pmus[i].pmu_idx = i; + pmus[i].type = type; + INIT_LIST_HEAD(&pmus[i].box_list); + pmus[i].box = alloc_percpu(struct intel_uncore_box *); + if (!pmus[i].box) + goto fail; + } + + if (type->event_descs) { + i = 0; + while (type->event_descs[i].attr.attr.name) + i++; + + events_group = kzalloc(sizeof(struct attribute *) * (i + 1) + + sizeof(*events_group), GFP_KERNEL); + if (!events_group) + goto fail; + + attrs = (struct attribute **)(events_group + 1); + events_group->name = "events"; + events_group->attrs = attrs; + + for (j = 0; j < i; j++) + attrs[j] = &type->event_descs[j].attr.attr; + + type->attr_groups[1] = events_group; + } + + type->pmus = pmus; + return 0; +fail: + uncore_type_exit(type); + return -ENOMEM; +} + +static 
int __init uncore_types_init(struct intel_uncore_type **types) +{ + int i, ret; + + for (i = 0; types[i]; i++) { + ret = uncore_type_init(types[i]); + if (ret) + goto fail; + } + return 0; +fail: + while (--i >= 0) + uncore_type_exit(types[i]); + return ret; +} + +static struct pci_driver *uncore_pci_driver; +static bool pcidrv_registered; + +/* + * add a pci uncore device + */ +static int __devinit uncore_pci_add(struct intel_uncore_type *type, + struct pci_dev *pdev) +{ + struct intel_uncore_pmu *pmu; + struct intel_uncore_box *box; + int i, phys_id; + + phys_id = pcibus_to_physid[pdev->bus->number]; + if (phys_id < 0) + return -ENODEV; + + box = uncore_alloc_box(type, 0); + if (!box) + return -ENOMEM; + + /* + * for performance monitoring unit with multiple boxes, + * each box has a different function id. + */ + for (i = 0; i < type->num_boxes; i++) { + pmu = &type->pmus[i]; + if (pmu->func_id == pdev->devfn) + break; + if (pmu->func_id < 0) { + pmu->func_id = pdev->devfn; + break; + } + pmu = NULL; + } + + if (!pmu) { + kfree(box); + return -EINVAL; + } + + box->phys_id = phys_id; + box->pci_dev = pdev; + box->pmu = pmu; + uncore_box_init(box); + pci_set_drvdata(pdev, box); + + raw_spin_lock(&uncore_box_lock); + list_add_tail(&box->list, &pmu->box_list); + raw_spin_unlock(&uncore_box_lock); + + return 0; +} + +static void uncore_pci_remove(struct pci_dev *pdev) +{ + struct intel_uncore_box *box = pci_get_drvdata(pdev); + struct intel_uncore_pmu *pmu = box->pmu; + int cpu, phys_id = pcibus_to_physid[pdev->bus->number]; + + if (WARN_ON_ONCE(phys_id != box->phys_id)) + return; + + raw_spin_lock(&uncore_box_lock); + list_del(&box->list); + raw_spin_unlock(&uncore_box_lock); + + for_each_possible_cpu(cpu) { + if (*per_cpu_ptr(pmu->box, cpu) == box) { + *per_cpu_ptr(pmu->box, cpu) = NULL; + atomic_dec(&box->refcnt); + } + } + + WARN_ON_ONCE(atomic_read(&box->refcnt) != 1); + kfree(box); +} + +static int __devinit uncore_pci_probe(struct pci_dev *pdev, + const struct pci_device_id *id) +{ + struct intel_uncore_type *type; + + type = (struct intel_uncore_type *)id->driver_data; + return uncore_pci_add(type, pdev); +} + +static int __init uncore_pci_init(void) +{ + int ret; + + switch (boot_cpu_data.x86_model) { + case 45: /* Sandy Bridge-EP */ + pci_uncores = snbep_pci_uncores; + uncore_pci_driver = &snbep_uncore_pci_driver; + snbep_pci2phy_map_init(); + break; + default: + return 0; + } + + ret = uncore_types_init(pci_uncores); + if (ret) + return ret; + + uncore_pci_driver->probe = uncore_pci_probe; + uncore_pci_driver->remove = uncore_pci_remove; + + ret = pci_register_driver(uncore_pci_driver); + if (ret == 0) + pcidrv_registered = true; + else + uncore_types_exit(pci_uncores); + + return ret; +} + +static void __init uncore_pci_exit(void) +{ + if (pcidrv_registered) { + pcidrv_registered = false; + pci_unregister_driver(uncore_pci_driver); + uncore_types_exit(pci_uncores); + } +} + +static void __cpuinit uncore_cpu_dying(int cpu) +{ + struct intel_uncore_type *type; + struct intel_uncore_pmu *pmu; + struct intel_uncore_box *box; + int i, j; + + for (i = 0; msr_uncores[i]; i++) { + type = msr_uncores[i]; + for (j = 0; j < type->num_boxes; j++) { + pmu = &type->pmus[j]; + box = *per_cpu_ptr(pmu->box, cpu); + *per_cpu_ptr(pmu->box, cpu) = NULL; + if (box && atomic_dec_and_test(&box->refcnt)) + kfree(box); + } + } +} + +static int __cpuinit uncore_cpu_starting(int cpu) +{ + struct intel_uncore_type *type; + struct intel_uncore_pmu *pmu; + struct intel_uncore_box *box, *exist; + int i, j, k, 
phys_id; + + phys_id = topology_physical_package_id(cpu); + + for (i = 0; msr_uncores[i]; i++) { + type = msr_uncores[i]; + for (j = 0; j < type->num_boxes; j++) { + pmu = &type->pmus[j]; + box = *per_cpu_ptr(pmu->box, cpu); + /* called by uncore_cpu_init? */ + if (box && box->phys_id >= 0) { + uncore_box_init(box); + continue; + } + + for_each_online_cpu(k) { + exist = *per_cpu_ptr(pmu->box, k); + if (exist && exist->phys_id == phys_id) { + atomic_inc(&exist->refcnt); + *per_cpu_ptr(pmu->box, cpu) = exist; + kfree(box); + box = NULL; + break; + } + } + + if (box) { + box->phys_id = phys_id; + uncore_box_init(box); + } + } + } + return 0; +} + +static int __cpuinit uncore_cpu_prepare(int cpu, int phys_id) +{ + struct intel_uncore_type *type; + struct intel_uncore_pmu *pmu; + struct intel_uncore_box *box; + int i, j; + + for (i = 0; msr_uncores[i]; i++) { + type = msr_uncores[i]; + for (j = 0; j < type->num_boxes; j++) { + pmu = &type->pmus[j]; + if (pmu->func_id < 0) + pmu->func_id = j; + + box = uncore_alloc_box(type, cpu); + if (!box) + return -ENOMEM; + + box->pmu = pmu; + box->phys_id = phys_id; + *per_cpu_ptr(pmu->box, cpu) = box; + } + } + return 0; +} + +static void __cpuinit uncore_change_context(struct intel_uncore_type **uncores, + int old_cpu, int new_cpu) +{ + struct intel_uncore_type *type; + struct intel_uncore_pmu *pmu; + struct intel_uncore_box *box; + int i, j; + + for (i = 0; uncores[i]; i++) { + type = uncores[i]; + for (j = 0; j < type->num_boxes; j++) { + pmu = &type->pmus[j]; + if (old_cpu < 0) + box = uncore_pmu_to_box(pmu, new_cpu); + else + box = uncore_pmu_to_box(pmu, old_cpu); + if (!box) + continue; + + if (old_cpu < 0) { + WARN_ON_ONCE(box->cpu != -1); + box->cpu = new_cpu; + continue; + } + + WARN_ON_ONCE(box->cpu != old_cpu); + if (new_cpu >= 0) { + uncore_pmu_cancel_hrtimer(box); + perf_pmu_migrate_context(&pmu->pmu, + old_cpu, new_cpu); + box->cpu = new_cpu; + } else { + box->cpu = -1; + } + } + } +} + +static void __cpuinit uncore_event_exit_cpu(int cpu) +{ + int i, phys_id, target; + + /* if exiting cpu is used for collecting uncore events */ + if (!cpumask_test_and_clear_cpu(cpu, &uncore_cpu_mask)) + return; + + /* find a new cpu to collect uncore events */ + phys_id = topology_physical_package_id(cpu); + target = -1; + for_each_online_cpu(i) { + if (i == cpu) + continue; + if (phys_id == topology_physical_package_id(i)) { + target = i; + break; + } + } + + /* migrate uncore events to the new cpu */ + if (target >= 0) + cpumask_set_cpu(target, &uncore_cpu_mask); + + uncore_change_context(msr_uncores, cpu, target); + uncore_change_context(pci_uncores, cpu, target); +} + +static void __cpuinit uncore_event_init_cpu(int cpu) +{ + int i, phys_id; + + phys_id = topology_physical_package_id(cpu); + for_each_cpu(i, &uncore_cpu_mask) { + if (phys_id == topology_physical_package_id(i)) + return; + } + + cpumask_set_cpu(cpu, &uncore_cpu_mask); + + uncore_change_context(msr_uncores, -1, cpu); + uncore_change_context(pci_uncores, -1, cpu); +} + +static int __cpuinit uncore_cpu_notifier(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + unsigned int cpu = (long)hcpu; + + /* allocate/free data structure for uncore box */ + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_UP_PREPARE: + uncore_cpu_prepare(cpu, -1); + break; + case CPU_STARTING: + uncore_cpu_starting(cpu); + break; + case CPU_UP_CANCELED: + case CPU_DYING: + uncore_cpu_dying(cpu); + break; + default: + break; + } + + /* select the cpu that collects uncore events */ + switch (action & 
~CPU_TASKS_FROZEN) { + case CPU_DOWN_FAILED: + case CPU_STARTING: + uncore_event_init_cpu(cpu); + break; + case CPU_DOWN_PREPARE: + uncore_event_exit_cpu(cpu); + break; + default: + break; + } + + return NOTIFY_OK; +} + +static struct notifier_block uncore_cpu_nb __cpuinitdata = { + .notifier_call = uncore_cpu_notifier, + /* + * to migrate uncore events, our notifier should be executed + * before perf core's notifier. + */ + .priority = CPU_PRI_PERF + 1, +}; + +static void __init uncore_cpu_setup(void *dummy) +{ + uncore_cpu_starting(smp_processor_id()); +} + +static int __init uncore_cpu_init(void) +{ + int ret, cpu, max_cores; + + max_cores = boot_cpu_data.x86_max_cores; + switch (boot_cpu_data.x86_model) { + case 26: /* Nehalem */ + case 30: + case 37: /* Westmere */ + case 44: + msr_uncores = nhm_msr_uncores; + break; + case 42: /* Sandy Bridge */ + if (snb_uncore_cbox.num_boxes > max_cores) + snb_uncore_cbox.num_boxes = max_cores; + msr_uncores = snb_msr_uncores; + break; + case 45: /* Sandy Birdge-EP */ + if (snbep_uncore_cbox.num_boxes > max_cores) + snbep_uncore_cbox.num_boxes = max_cores; + msr_uncores = snbep_msr_uncores; + break; + default: + return 0; + } + + ret = uncore_types_init(msr_uncores); + if (ret) + return ret; + + get_online_cpus(); + + for_each_online_cpu(cpu) { + int i, phys_id = topology_physical_package_id(cpu); + + for_each_cpu(i, &uncore_cpu_mask) { + if (phys_id == topology_physical_package_id(i)) { + phys_id = -1; + break; + } + } + if (phys_id < 0) + continue; + + uncore_cpu_prepare(cpu, phys_id); + uncore_event_init_cpu(cpu); + } + on_each_cpu(uncore_cpu_setup, NULL, 1); + + register_cpu_notifier(&uncore_cpu_nb); + + put_online_cpus(); + + return 0; +} + +static int __init uncore_pmus_register(void) +{ + struct intel_uncore_pmu *pmu; + struct intel_uncore_type *type; + int i, j; + + for (i = 0; msr_uncores[i]; i++) { + type = msr_uncores[i]; + for (j = 0; j < type->num_boxes; j++) { + pmu = &type->pmus[j]; + uncore_pmu_register(pmu); + } + } + + for (i = 0; pci_uncores[i]; i++) { + type = pci_uncores[i]; + for (j = 0; j < type->num_boxes; j++) { + pmu = &type->pmus[j]; + uncore_pmu_register(pmu); + } + } + + return 0; +} + +static int __init intel_uncore_init(void) +{ + int ret; + + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) + return -ENODEV; + + ret = uncore_pci_init(); + if (ret) + goto fail; + ret = uncore_cpu_init(); + if (ret) { + uncore_pci_exit(); + goto fail; + } + + uncore_pmus_register(); + return 0; +fail: + return ret; +} +device_initcall(intel_uncore_init); diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.h b/arch/x86/kernel/cpu/perf_event_intel_uncore.h new file mode 100644 index 000000000000..b13e9ea81def --- /dev/null +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.h @@ -0,0 +1,424 @@ +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/pci.h> +#include <linux/perf_event.h> +#include "perf_event.h" + +#define UNCORE_PMU_NAME_LEN 32 +#define UNCORE_BOX_HASH_SIZE 8 + +#define UNCORE_PMU_HRTIMER_INTERVAL (60 * NSEC_PER_SEC) + +#define UNCORE_FIXED_EVENT 0xff +#define UNCORE_PMC_IDX_MAX_GENERIC 8 +#define UNCORE_PMC_IDX_FIXED UNCORE_PMC_IDX_MAX_GENERIC +#define UNCORE_PMC_IDX_MAX (UNCORE_PMC_IDX_FIXED + 1) + +#define UNCORE_EVENT_CONSTRAINT(c, n) EVENT_CONSTRAINT(c, n, 0xff) + +/* SNB event control */ +#define SNB_UNC_CTL_EV_SEL_MASK 0x000000ff +#define SNB_UNC_CTL_UMASK_MASK 0x0000ff00 +#define SNB_UNC_CTL_EDGE_DET (1 << 18) +#define SNB_UNC_CTL_EN (1 << 22) +#define SNB_UNC_CTL_INVERT (1 << 23) +#define 
SNB_UNC_CTL_CMASK_MASK 0x1f000000 +#define NHM_UNC_CTL_CMASK_MASK 0xff000000 +#define NHM_UNC_FIXED_CTR_CTL_EN (1 << 0) + +#define SNB_UNC_RAW_EVENT_MASK (SNB_UNC_CTL_EV_SEL_MASK | \ + SNB_UNC_CTL_UMASK_MASK | \ + SNB_UNC_CTL_EDGE_DET | \ + SNB_UNC_CTL_INVERT | \ + SNB_UNC_CTL_CMASK_MASK) + +#define NHM_UNC_RAW_EVENT_MASK (SNB_UNC_CTL_EV_SEL_MASK | \ + SNB_UNC_CTL_UMASK_MASK | \ + SNB_UNC_CTL_EDGE_DET | \ + SNB_UNC_CTL_INVERT | \ + NHM_UNC_CTL_CMASK_MASK) + +/* SNB global control register */ +#define SNB_UNC_PERF_GLOBAL_CTL 0x391 +#define SNB_UNC_FIXED_CTR_CTRL 0x394 +#define SNB_UNC_FIXED_CTR 0x395 + +/* SNB uncore global control */ +#define SNB_UNC_GLOBAL_CTL_CORE_ALL ((1 << 4) - 1) +#define SNB_UNC_GLOBAL_CTL_EN (1 << 29) + +/* SNB Cbo register */ +#define SNB_UNC_CBO_0_PERFEVTSEL0 0x700 +#define SNB_UNC_CBO_0_PER_CTR0 0x706 +#define SNB_UNC_CBO_MSR_OFFSET 0x10 + +/* NHM global control register */ +#define NHM_UNC_PERF_GLOBAL_CTL 0x391 +#define NHM_UNC_FIXED_CTR 0x394 +#define NHM_UNC_FIXED_CTR_CTRL 0x395 + +/* NHM uncore global control */ +#define NHM_UNC_GLOBAL_CTL_EN_PC_ALL ((1ULL << 8) - 1) +#define NHM_UNC_GLOBAL_CTL_EN_FC (1ULL << 32) + +/* NHM uncore register */ +#define NHM_UNC_PERFEVTSEL0 0x3c0 +#define NHM_UNC_UNCORE_PMC0 0x3b0 + +/* SNB-EP Box level control */ +#define SNBEP_PMON_BOX_CTL_RST_CTRL (1 << 0) +#define SNBEP_PMON_BOX_CTL_RST_CTRS (1 << 1) +#define SNBEP_PMON_BOX_CTL_FRZ (1 << 8) +#define SNBEP_PMON_BOX_CTL_FRZ_EN (1 << 16) +#define SNBEP_PMON_BOX_CTL_INT (SNBEP_PMON_BOX_CTL_RST_CTRL | \ + SNBEP_PMON_BOX_CTL_RST_CTRS | \ + SNBEP_PMON_BOX_CTL_FRZ_EN) +/* SNB-EP event control */ +#define SNBEP_PMON_CTL_EV_SEL_MASK 0x000000ff +#define SNBEP_PMON_CTL_UMASK_MASK 0x0000ff00 +#define SNBEP_PMON_CTL_RST (1 << 17) +#define SNBEP_PMON_CTL_EDGE_DET (1 << 18) +#define SNBEP_PMON_CTL_EV_SEL_EXT (1 << 21) /* only for QPI */ +#define SNBEP_PMON_CTL_EN (1 << 22) +#define SNBEP_PMON_CTL_INVERT (1 << 23) +#define SNBEP_PMON_CTL_TRESH_MASK 0xff000000 +#define SNBEP_PMON_RAW_EVENT_MASK (SNBEP_PMON_CTL_EV_SEL_MASK | \ + SNBEP_PMON_CTL_UMASK_MASK | \ + SNBEP_PMON_CTL_EDGE_DET | \ + SNBEP_PMON_CTL_INVERT | \ + SNBEP_PMON_CTL_TRESH_MASK) + +/* SNB-EP Ubox event control */ +#define SNBEP_U_MSR_PMON_CTL_TRESH_MASK 0x1f000000 +#define SNBEP_U_MSR_PMON_RAW_EVENT_MASK \ + (SNBEP_PMON_CTL_EV_SEL_MASK | \ + SNBEP_PMON_CTL_UMASK_MASK | \ + SNBEP_PMON_CTL_EDGE_DET | \ + SNBEP_PMON_CTL_INVERT | \ + SNBEP_U_MSR_PMON_CTL_TRESH_MASK) + +#define SNBEP_CBO_PMON_CTL_TID_EN (1 << 19) +#define SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK (SNBEP_PMON_RAW_EVENT_MASK | \ + SNBEP_CBO_PMON_CTL_TID_EN) + +/* SNB-EP PCU event control */ +#define SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK 0x0000c000 +#define SNBEP_PCU_MSR_PMON_CTL_TRESH_MASK 0x1f000000 +#define SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT (1 << 30) +#define SNBEP_PCU_MSR_PMON_CTL_OCC_EDGE_DET (1 << 31) +#define SNBEP_PCU_MSR_PMON_RAW_EVENT_MASK \ + (SNBEP_PMON_CTL_EV_SEL_MASK | \ + SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK | \ + SNBEP_PMON_CTL_EDGE_DET | \ + SNBEP_PMON_CTL_INVERT | \ + SNBEP_PCU_MSR_PMON_CTL_TRESH_MASK | \ + SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT | \ + SNBEP_PCU_MSR_PMON_CTL_OCC_EDGE_DET) + +/* SNB-EP pci control register */ +#define SNBEP_PCI_PMON_BOX_CTL 0xf4 +#define SNBEP_PCI_PMON_CTL0 0xd8 +/* SNB-EP pci counter register */ +#define SNBEP_PCI_PMON_CTR0 0xa0 + +/* SNB-EP home agent register */ +#define SNBEP_HA_PCI_PMON_BOX_ADDRMATCH0 0x40 +#define SNBEP_HA_PCI_PMON_BOX_ADDRMATCH1 0x44 +#define SNBEP_HA_PCI_PMON_BOX_OPCODEMATCH 0x48 +/* SNB-EP memory controller 
register */ +#define SNBEP_MC_CHy_PCI_PMON_FIXED_CTL 0xf0 +#define SNBEP_MC_CHy_PCI_PMON_FIXED_CTR 0xd0 +/* SNB-EP QPI register */ +#define SNBEP_Q_Py_PCI_PMON_PKT_MATCH0 0x228 +#define SNBEP_Q_Py_PCI_PMON_PKT_MATCH1 0x22c +#define SNBEP_Q_Py_PCI_PMON_PKT_MASK0 0x238 +#define SNBEP_Q_Py_PCI_PMON_PKT_MASK1 0x23c + +/* SNB-EP Ubox register */ +#define SNBEP_U_MSR_PMON_CTR0 0xc16 +#define SNBEP_U_MSR_PMON_CTL0 0xc10 + +#define SNBEP_U_MSR_PMON_UCLK_FIXED_CTL 0xc08 +#define SNBEP_U_MSR_PMON_UCLK_FIXED_CTR 0xc09 + +/* SNB-EP Cbo register */ +#define SNBEP_C0_MSR_PMON_CTR0 0xd16 +#define SNBEP_C0_MSR_PMON_CTL0 0xd10 +#define SNBEP_C0_MSR_PMON_BOX_CTL 0xd04 +#define SNBEP_C0_MSR_PMON_BOX_FILTER 0xd14 +#define SNBEP_CB0_MSR_PMON_BOX_FILTER_MASK 0xfffffc1f +#define SNBEP_CBO_MSR_OFFSET 0x20 + +/* SNB-EP PCU register */ +#define SNBEP_PCU_MSR_PMON_CTR0 0xc36 +#define SNBEP_PCU_MSR_PMON_CTL0 0xc30 +#define SNBEP_PCU_MSR_PMON_BOX_CTL 0xc24 +#define SNBEP_PCU_MSR_PMON_BOX_FILTER 0xc34 +#define SNBEP_PCU_MSR_PMON_BOX_FILTER_MASK 0xffffffff +#define SNBEP_PCU_MSR_CORE_C3_CTR 0x3fc +#define SNBEP_PCU_MSR_CORE_C6_CTR 0x3fd + +struct intel_uncore_ops; +struct intel_uncore_pmu; +struct intel_uncore_box; +struct uncore_event_desc; + +struct intel_uncore_type { + const char *name; + int num_counters; + int num_boxes; + int perf_ctr_bits; + int fixed_ctr_bits; + unsigned perf_ctr; + unsigned event_ctl; + unsigned event_mask; + unsigned fixed_ctr; + unsigned fixed_ctl; + unsigned box_ctl; + unsigned msr_offset; + unsigned num_shared_regs:8; + unsigned single_fixed:1; + struct event_constraint unconstrainted; + struct event_constraint *constraints; + struct intel_uncore_pmu *pmus; + struct intel_uncore_ops *ops; + struct uncore_event_desc *event_descs; + const struct attribute_group *attr_groups[3]; +}; + +#define format_group attr_groups[0] + +struct intel_uncore_ops { + void (*init_box)(struct intel_uncore_box *); + void (*disable_box)(struct intel_uncore_box *); + void (*enable_box)(struct intel_uncore_box *); + void (*disable_event)(struct intel_uncore_box *, struct perf_event *); + void (*enable_event)(struct intel_uncore_box *, struct perf_event *); + u64 (*read_counter)(struct intel_uncore_box *, struct perf_event *); + int (*hw_config)(struct intel_uncore_box *, struct perf_event *); + struct event_constraint *(*get_constraint)(struct intel_uncore_box *, + struct perf_event *); + void (*put_constraint)(struct intel_uncore_box *, struct perf_event *); +}; + +struct intel_uncore_pmu { + struct pmu pmu; + char name[UNCORE_PMU_NAME_LEN]; + int pmu_idx; + int func_id; + struct intel_uncore_type *type; + struct intel_uncore_box ** __percpu box; + struct list_head box_list; +}; + +struct intel_uncore_extra_reg { + raw_spinlock_t lock; + u64 config1; + atomic_t ref; +}; + +struct intel_uncore_box { + int phys_id; + int n_active; /* number of active events */ + int n_events; + int cpu; /* cpu to collect events */ + unsigned long flags; + atomic_t refcnt; + struct perf_event *events[UNCORE_PMC_IDX_MAX]; + struct perf_event *event_list[UNCORE_PMC_IDX_MAX]; + unsigned long active_mask[BITS_TO_LONGS(UNCORE_PMC_IDX_MAX)]; + u64 tags[UNCORE_PMC_IDX_MAX]; + struct pci_dev *pci_dev; + struct intel_uncore_pmu *pmu; + struct hrtimer hrtimer; + struct list_head list; + struct intel_uncore_extra_reg shared_regs[0]; +}; + +#define UNCORE_BOX_FLAG_INITIATED 0 + +struct uncore_event_desc { + struct kobj_attribute attr; + const char *config; +}; + +#define INTEL_UNCORE_EVENT_DESC(_name, _config) \ +{ \ + .attr = __ATTR(_name, 
0444, uncore_event_show, NULL), \ + .config = _config, \ +} + +#define DEFINE_UNCORE_FORMAT_ATTR(_var, _name, _format) \ +static ssize_t __uncore_##_var##_show(struct kobject *kobj, \ + struct kobj_attribute *attr, \ + char *page) \ +{ \ + BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE); \ + return sprintf(page, _format "\n"); \ +} \ +static struct kobj_attribute format_attr_##_var = \ + __ATTR(_name, 0444, __uncore_##_var##_show, NULL) + + +static ssize_t uncore_event_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct uncore_event_desc *event = + container_of(attr, struct uncore_event_desc, attr); + return sprintf(buf, "%s", event->config); +} + +static inline unsigned uncore_pci_box_ctl(struct intel_uncore_box *box) +{ + return box->pmu->type->box_ctl; +} + +static inline unsigned uncore_pci_fixed_ctl(struct intel_uncore_box *box) +{ + return box->pmu->type->fixed_ctl; +} + +static inline unsigned uncore_pci_fixed_ctr(struct intel_uncore_box *box) +{ + return box->pmu->type->fixed_ctr; +} + +static inline +unsigned uncore_pci_event_ctl(struct intel_uncore_box *box, int idx) +{ + return idx * 4 + box->pmu->type->event_ctl; +} + +static inline +unsigned uncore_pci_perf_ctr(struct intel_uncore_box *box, int idx) +{ + return idx * 8 + box->pmu->type->perf_ctr; +} + +static inline +unsigned uncore_msr_box_ctl(struct intel_uncore_box *box) +{ + if (!box->pmu->type->box_ctl) + return 0; + return box->pmu->type->box_ctl + + box->pmu->type->msr_offset * box->pmu->pmu_idx; +} + +static inline +unsigned uncore_msr_fixed_ctl(struct intel_uncore_box *box) +{ + if (!box->pmu->type->fixed_ctl) + return 0; + return box->pmu->type->fixed_ctl + + box->pmu->type->msr_offset * box->pmu->pmu_idx; +} + +static inline +unsigned uncore_msr_fixed_ctr(struct intel_uncore_box *box) +{ + return box->pmu->type->fixed_ctr + + box->pmu->type->msr_offset * box->pmu->pmu_idx; +} + +static inline +unsigned uncore_msr_event_ctl(struct intel_uncore_box *box, int idx) +{ + return idx + box->pmu->type->event_ctl + + box->pmu->type->msr_offset * box->pmu->pmu_idx; +} + +static inline +unsigned uncore_msr_perf_ctr(struct intel_uncore_box *box, int idx) +{ + return idx + box->pmu->type->perf_ctr + + box->pmu->type->msr_offset * box->pmu->pmu_idx; +} + +static inline +unsigned uncore_fixed_ctl(struct intel_uncore_box *box) +{ + if (box->pci_dev) + return uncore_pci_fixed_ctl(box); + else + return uncore_msr_fixed_ctl(box); +} + +static inline +unsigned uncore_fixed_ctr(struct intel_uncore_box *box) +{ + if (box->pci_dev) + return uncore_pci_fixed_ctr(box); + else + return uncore_msr_fixed_ctr(box); +} + +static inline +unsigned uncore_event_ctl(struct intel_uncore_box *box, int idx) +{ + if (box->pci_dev) + return uncore_pci_event_ctl(box, idx); + else + return uncore_msr_event_ctl(box, idx); +} + +static inline +unsigned uncore_perf_ctr(struct intel_uncore_box *box, int idx) +{ + if (box->pci_dev) + return uncore_pci_perf_ctr(box, idx); + else + return uncore_msr_perf_ctr(box, idx); +} + +static inline int uncore_perf_ctr_bits(struct intel_uncore_box *box) +{ + return box->pmu->type->perf_ctr_bits; +} + +static inline int uncore_fixed_ctr_bits(struct intel_uncore_box *box) +{ + return box->pmu->type->fixed_ctr_bits; +} + +static inline int uncore_num_counters(struct intel_uncore_box *box) +{ + return box->pmu->type->num_counters; +} + +static inline void uncore_disable_box(struct intel_uncore_box *box) +{ + if (box->pmu->type->ops->disable_box) + box->pmu->type->ops->disable_box(box); +} + +static 
inline void uncore_enable_box(struct intel_uncore_box *box) +{ + if (box->pmu->type->ops->enable_box) + box->pmu->type->ops->enable_box(box); +} + +static inline void uncore_disable_event(struct intel_uncore_box *box, + struct perf_event *event) +{ + box->pmu->type->ops->disable_event(box, event); +} + +static inline void uncore_enable_event(struct intel_uncore_box *box, + struct perf_event *event) +{ + box->pmu->type->ops->enable_event(box, event); +} + +static inline u64 uncore_read_counter(struct intel_uncore_box *box, + struct perf_event *event) +{ + return box->pmu->type->ops->read_counter(box, event); +} + +static inline void uncore_box_init(struct intel_uncore_box *box) +{ + if (!test_and_set_bit(UNCORE_BOX_FLAG_INITIATED, &box->flags)) { + if (box->pmu->type->ops->init_box) + box->pmu->type->ops->init_box(box); + } +} diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c index 47124a73dd73..92c7e39a079f 100644 --- a/arch/x86/kernel/cpu/perf_event_p4.c +++ b/arch/x86/kernel/cpu/perf_event_p4.c @@ -895,8 +895,8 @@ static void p4_pmu_disable_pebs(void) * So at moment let leave metrics turned on forever -- it's * ok for now but need to be revisited! * - * (void)checking_wrmsrl(MSR_IA32_PEBS_ENABLE, (u64)0); - * (void)checking_wrmsrl(MSR_P4_PEBS_MATRIX_VERT, (u64)0); + * (void)wrmsrl_safe(MSR_IA32_PEBS_ENABLE, (u64)0); + * (void)wrmsrl_safe(MSR_P4_PEBS_MATRIX_VERT, (u64)0); */ } @@ -909,7 +909,7 @@ static inline void p4_pmu_disable_event(struct perf_event *event) * state we need to clear P4_CCCR_OVF, otherwise interrupt get * asserted again and again */ - (void)checking_wrmsrl(hwc->config_base, + (void)wrmsrl_safe(hwc->config_base, (u64)(p4_config_unpack_cccr(hwc->config)) & ~P4_CCCR_ENABLE & ~P4_CCCR_OVF & ~P4_CCCR_RESERVED); } @@ -943,8 +943,8 @@ static void p4_pmu_enable_pebs(u64 config) bind = &p4_pebs_bind_map[idx]; - (void)checking_wrmsrl(MSR_IA32_PEBS_ENABLE, (u64)bind->metric_pebs); - (void)checking_wrmsrl(MSR_P4_PEBS_MATRIX_VERT, (u64)bind->metric_vert); + (void)wrmsrl_safe(MSR_IA32_PEBS_ENABLE, (u64)bind->metric_pebs); + (void)wrmsrl_safe(MSR_P4_PEBS_MATRIX_VERT, (u64)bind->metric_vert); } static void p4_pmu_enable_event(struct perf_event *event) @@ -978,8 +978,8 @@ static void p4_pmu_enable_event(struct perf_event *event) */ p4_pmu_enable_pebs(hwc->config); - (void)checking_wrmsrl(escr_addr, escr_conf); - (void)checking_wrmsrl(hwc->config_base, + (void)wrmsrl_safe(escr_addr, escr_conf); + (void)wrmsrl_safe(hwc->config_base, (cccr & ~P4_CCCR_RESERVED) | P4_CCCR_ENABLE); } @@ -1325,7 +1325,7 @@ __init int p4_pmu_init(void) unsigned int low, high; /* If we get stripped -- indexing fails */ - BUILD_BUG_ON(ARCH_P4_MAX_CCCR > X86_PMC_MAX_GENERIC); + BUILD_BUG_ON(ARCH_P4_MAX_CCCR > INTEL_PMC_MAX_GENERIC); rdmsr(MSR_IA32_MISC_ENABLE, low, high); if (!(low & (1 << 7))) { diff --git a/arch/x86/kernel/cpu/perf_event_p6.c b/arch/x86/kernel/cpu/perf_event_p6.c index 32bcfc7dd230..e4dd0f7a0453 100644 --- a/arch/x86/kernel/cpu/perf_event_p6.c +++ b/arch/x86/kernel/cpu/perf_event_p6.c @@ -71,7 +71,7 @@ p6_pmu_disable_event(struct perf_event *event) if (cpuc->enabled) val |= ARCH_PERFMON_EVENTSEL_ENABLE; - (void)checking_wrmsrl(hwc->config_base, val); + (void)wrmsrl_safe(hwc->config_base, val); } static void p6_pmu_enable_event(struct perf_event *event) @@ -84,7 +84,7 @@ static void p6_pmu_enable_event(struct perf_event *event) if (cpuc->enabled) val |= ARCH_PERFMON_EVENTSEL_ENABLE; - (void)checking_wrmsrl(hwc->config_base, val); + 
(void)wrmsrl_safe(hwc->config_base, val); } PMU_FORMAT_ATTR(event, "config:0-7" ); diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index addf9e82a7f2..ee8e9abc859f 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -31,7 +31,7 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c) const struct cpuid_bit *cb; static const struct cpuid_bit __cpuinitconst cpuid_bits[] = { - { X86_FEATURE_DTS, CR_EAX, 0, 0x00000006, 0 }, + { X86_FEATURE_DTHERM, CR_EAX, 0, 0x00000006, 0 }, { X86_FEATURE_IDA, CR_EAX, 1, 0x00000006, 0 }, { X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006, 0 }, { X86_FEATURE_PLN, CR_EAX, 4, 0x00000006, 0 }, diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 571246d81edf..ae42418bc50f 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -27,8 +27,8 @@ static int die_counter; void printk_address(unsigned long address, int reliable) { - printk(" [<%p>] %s%pB\n", (void *) address, - reliable ? "" : "? ", (void *) address); + pr_cont(" [<%p>] %s%pB\n", + (void *)address, reliable ? "" : "? ", (void *)address); } #ifdef CONFIG_FUNCTION_GRAPH_TRACER @@ -271,6 +271,7 @@ int __kprobes __die(const char *str, struct pt_regs *regs, long err) current->thread.trap_nr, SIGSEGV) == NOTIFY_STOP) return 1; + print_modules(); show_regs(regs); #ifdef CONFIG_X86_32 if (user_mode_vm(regs)) { diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index e0b1d783daab..1038a417ea53 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -73,11 +73,11 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, if (kstack_end(stack)) break; if (i && ((i % STACKSLOTS_PER_LINE) == 0)) - printk(KERN_CONT "\n"); - printk(KERN_CONT " %08lx", *stack++); + pr_cont("\n"); + pr_cont(" %08lx", *stack++); touch_nmi_watchdog(); } - printk(KERN_CONT "\n"); + pr_cont("\n"); show_trace_log_lvl(task, regs, sp, bp, log_lvl); } @@ -86,12 +86,11 @@ void show_regs(struct pt_regs *regs) { int i; - print_modules(); __show_regs(regs, !user_mode_vm(regs)); - printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)\n", - TASK_COMM_LEN, current->comm, task_pid_nr(current), - current_thread_info(), current, task_thread_info(current)); + pr_emerg("Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)\n", + TASK_COMM_LEN, current->comm, task_pid_nr(current), + current_thread_info(), current, task_thread_info(current)); /* * When in-kernel, we also print out the stack and code at the * time of the fault.. 
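
The printk() changes in these dumpstack hunks lean on pr_cont() to continue the record opened by the preceding pr_emerg()/printk() rather than spelling out KERN_CONT every time. The same pattern as a self-contained module sketch (the byte values are arbitrary):

#include <linux/module.h>
#include <linux/kernel.h>

static int __init hexline_init(void)
{
        static const unsigned char code[] = { 0x55, 0x48, 0x89, 0xe5 };
        int i;

        /* one logical line built from several calls: the first call sets the
         * log level, the pr_cont() calls append without starting a new record */
        pr_emerg("Code:");
        for (i = 0; i < ARRAY_SIZE(code); i++)
                pr_cont(" %02x", code[i]);
        pr_cont("\n");

        return 0;
}

static void __exit hexline_exit(void)
{
}

module_init(hexline_init);
module_exit(hexline_exit);
MODULE_LICENSE("GPL");
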
@@ -102,10 +101,10 @@ void show_regs(struct pt_regs *regs) unsigned char c; u8 *ip; - printk(KERN_EMERG "Stack:\n"); + pr_emerg("Stack:\n"); show_stack_log_lvl(NULL, regs, &regs->sp, 0, KERN_EMERG); - printk(KERN_EMERG "Code: "); + pr_emerg("Code:"); ip = (u8 *)regs->ip - code_prologue; if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) { @@ -116,16 +115,16 @@ void show_regs(struct pt_regs *regs) for (i = 0; i < code_len; i++, ip++) { if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) { - printk(KERN_CONT " Bad EIP value."); + pr_cont(" Bad EIP value."); break; } if (ip == (u8 *)regs->ip) - printk(KERN_CONT "<%02x> ", c); + pr_cont(" <%02x>", c); else - printk(KERN_CONT "%02x ", c); + pr_cont(" %02x", c); } } - printk(KERN_CONT "\n"); + pr_cont("\n"); } int is_valid_bugaddr(unsigned long ip) diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 791b76122aa8..b653675d5288 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -228,20 +228,20 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, if (stack >= irq_stack && stack <= irq_stack_end) { if (stack == irq_stack_end) { stack = (unsigned long *) (irq_stack_end[-1]); - printk(KERN_CONT " <EOI> "); + pr_cont(" <EOI> "); } } else { if (((long) stack & (THREAD_SIZE-1)) == 0) break; } if (i && ((i % STACKSLOTS_PER_LINE) == 0)) - printk(KERN_CONT "\n"); - printk(KERN_CONT " %016lx", *stack++); + pr_cont("\n"); + pr_cont(" %016lx", *stack++); touch_nmi_watchdog(); } preempt_enable(); - printk(KERN_CONT "\n"); + pr_cont("\n"); show_trace_log_lvl(task, regs, sp, bp, log_lvl); } @@ -254,10 +254,9 @@ void show_regs(struct pt_regs *regs) sp = regs->sp; printk("CPU %d ", cpu); - print_modules(); __show_regs(regs, 1); - printk("Process %s (pid: %d, threadinfo %p, task %p)\n", - cur->comm, cur->pid, task_thread_info(cur), cur); + printk(KERN_DEFAULT "Process %s (pid: %d, threadinfo %p, task %p)\n", + cur->comm, cur->pid, task_thread_info(cur), cur); /* * When in-kernel, we also print out the stack and code at the @@ -284,16 +283,16 @@ void show_regs(struct pt_regs *regs) for (i = 0; i < code_len; i++, ip++) { if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) { - printk(KERN_CONT " Bad RIP value."); + pr_cont(" Bad RIP value."); break; } if (ip == (u8 *)regs->ip) - printk(KERN_CONT "<%02x> ", c); + pr_cont("<%02x> ", c); else - printk(KERN_CONT "%02x ", c); + pr_cont("%02x ", c); } } - printk(KERN_CONT "\n"); + pr_cont("\n"); } int is_valid_bugaddr(unsigned long ip) diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 7d65133b51be..111f6bbd8b38 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1758,10 +1758,30 @@ end_repeat_nmi: */ call save_paranoid DEFAULT_FRAME 0 + + /* + * Save off the CR2 register. If we take a page fault in the NMI then + * it could corrupt the CR2 value. If the NMI preempts a page fault + * handler before it was able to read the CR2 register, and then the + * NMI itself takes a page fault, the page fault that was preempted + * will read the information from the NMI page fault and not the + * origin fault. Save it off and restore it if it changes. + * Use the r12 callee-saved register. + */ + movq %cr2, %r12 + /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */ movq %rsp,%rdi movq $-1,%rsi call do_nmi + + /* Did the NMI take a page fault? Restore cr2 if it did */ + movq %cr2, %rcx + cmpq %rcx, %r12 + je 1f + movq %r12, %cr2 +1: + testl %ebx,%ebx /* swapgs needed?
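
The new entry_64.S lines bracket the do_nmi() call with a CR2 save in %r12 and a conditional restore; the 32-bit path gets an equivalent guard in C in nmi.c later in this patch. Condensed into a pair of helpers, the idea is roughly this (a sketch only, not the exact kernel code):

#include <linux/percpu.h>
#include <asm/special_insns.h>          /* read_cr2(), write_cr2() */

static DEFINE_PER_CPU(unsigned long, saved_cr2);

/* call before invoking the NMI handler */
static void nmi_cr2_save(void)
{
        __this_cpu_write(saved_cr2, read_cr2());
}

/* call after the handler: if the NMI itself took a page fault, CR2 now
 * belongs to that fault, so put the interrupted fault's address back */
static void nmi_cr2_restore(void)
{
        if (unlikely(__this_cpu_read(saved_cr2) != read_cr2()))
                write_cr2(__this_cpu_read(saved_cr2));
}
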
*/ jnz nmi_restore nmi_swapgs: diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 3dafc6003b7c..1f5f1d5d2a02 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -294,9 +294,9 @@ void fixup_irqs(void) raw_spin_unlock(&desc->lock); if (break_affinity && set_affinity) - printk("Broke affinity for irq %i\n", irq); + pr_notice("Broke affinity for irq %i\n", irq); else if (!set_affinity) - printk("Cannot set affinity for irq %i\n", irq); + pr_notice("Cannot set affinity for irq %i\n", irq); } /* diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index 8bfb6146f753..3f61904365cf 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -444,12 +444,12 @@ void kgdb_roundup_cpus(unsigned long flags) /** * kgdb_arch_handle_exception - Handle architecture specific GDB packets. - * @vector: The error vector of the exception that happened. + * @e_vector: The error vector of the exception that happened. * @signo: The signal number of the exception that happened. * @err_code: The error code of the exception that happened. - * @remcom_in_buffer: The buffer of the packet we have read. - * @remcom_out_buffer: The buffer of %BUFMAX bytes to write a packet into. - * @regs: The &struct pt_regs of the current process. + * @remcomInBuffer: The buffer of the packet we have read. + * @remcomOutBuffer: The buffer of %BUFMAX bytes to write a packet into. + * @linux_regs: The &struct pt_regs of the current process. * * This function MUST handle the 'c' and 's' command packets, * as well packets to set / remove a hardware breakpoint, if used. diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index e554e5ad2fe8..c1d61ee4b4f1 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -39,6 +39,9 @@ #include <asm/desc.h> #include <asm/tlbflush.h> #include <asm/idle.h> +#include <asm/apic.h> +#include <asm/apicdef.h> +#include <asm/hypervisor.h> static int kvmapf = 1; @@ -283,6 +286,22 @@ static void kvm_register_steal_time(void) cpu, __pa(st)); } +static DEFINE_PER_CPU(unsigned long, kvm_apic_eoi) = KVM_PV_EOI_DISABLED; + +static void kvm_guest_apic_eoi_write(u32 reg, u32 val) +{ + /** + * This relies on __test_and_clear_bit to modify the memory + * in a way that is atomic with respect to the local CPU. + * The hypervisor only accesses this memory from the local CPU so + * there's no need for lock or memory barriers. + * An optimization barrier is implied in apic write. + */ + if (__test_and_clear_bit(KVM_PV_EOI_BIT, &__get_cpu_var(kvm_apic_eoi))) + return; + apic_write(APIC_EOI, APIC_EOI_ACK); +} + void __cpuinit kvm_guest_cpu_init(void) { if (!kvm_para_available()) @@ -300,11 +319,20 @@ void __cpuinit kvm_guest_cpu_init(void) smp_processor_id()); } + if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) { + unsigned long pa; + /* Size alignment is implied but just to make it explicit. */ + BUILD_BUG_ON(__alignof__(kvm_apic_eoi) < 4); + __get_cpu_var(kvm_apic_eoi) = 0; + pa = __pa(&__get_cpu_var(kvm_apic_eoi)) | KVM_MSR_ENABLED; + wrmsrl(MSR_KVM_PV_EOI_EN, pa); + } + if (has_steal_clock) kvm_register_steal_time(); } -static void kvm_pv_disable_apf(void *unused) +static void kvm_pv_disable_apf(void) { if (!__get_cpu_var(apf_reason).enabled) return; @@ -316,11 +344,23 @@ static void kvm_pv_disable_apf(void *unused) smp_processor_id()); } +static void kvm_pv_guest_cpu_reboot(void *unused) +{ + /* + * We disable PV EOI before we load a new kernel by kexec, + * since MSR_KVM_PV_EOI_EN stores a pointer into old kernel's memory. + * New kernel can re-enable when it boots. 
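
The comment above is about the lifetime of the PV EOI area: MSR_KVM_PV_EOI_EN holds a physical pointer into the current kernel's per-cpu data, so it must be cleared before kexec or CPU offline and re-armed by every CPU that comes up. A condensed sketch of that register/unregister pair (the helper names are illustrative; the real code is kvm_guest_cpu_init() and kvm_pv_guest_cpu_reboot() in this patch):

#include <linux/percpu.h>
#include <asm/msr.h>
#include <asm/page.h>
#include <asm/kvm_para.h>       /* KVM_FEATURE_PV_EOI, MSR_KVM_PV_EOI_EN, KVM_MSR_ENABLED */

static DEFINE_PER_CPU(unsigned long, pv_eoi_word);

static void pv_eoi_register_this_cpu(void)
{
        if (!kvm_para_has_feature(KVM_FEATURE_PV_EOI))
                return;
        __get_cpu_var(pv_eoi_word) = 0;
        /* hand the hypervisor the physical address of this CPU's flag word */
        wrmsrl(MSR_KVM_PV_EOI_EN,
               __pa(&__get_cpu_var(pv_eoi_word)) | KVM_MSR_ENABLED);
}

static void pv_eoi_unregister_this_cpu(void)
{
        /* the registered address is meaningless to the next (kexec'ed) kernel */
        if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
                wrmsrl(MSR_KVM_PV_EOI_EN, 0);
}
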
+ */ + if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) + wrmsrl(MSR_KVM_PV_EOI_EN, 0); + kvm_pv_disable_apf(); +} + static int kvm_pv_reboot_notify(struct notifier_block *nb, unsigned long code, void *unused) { if (code == SYS_RESTART) - on_each_cpu(kvm_pv_disable_apf, NULL, 1); + on_each_cpu(kvm_pv_guest_cpu_reboot, NULL, 1); return NOTIFY_DONE; } @@ -371,7 +411,9 @@ static void __cpuinit kvm_guest_cpu_online(void *dummy) static void kvm_guest_cpu_offline(void *dummy) { kvm_disable_steal_time(); - kvm_pv_disable_apf(NULL); + if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) + wrmsrl(MSR_KVM_PV_EOI_EN, 0); + kvm_pv_disable_apf(); apf_task_wake_all(); } @@ -424,6 +466,9 @@ void __init kvm_guest_init(void) pv_time_ops.steal_clock = kvm_steal_clock; } + if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) + apic_set_eoi_write(kvm_guest_apic_eoi_write); + #ifdef CONFIG_SMP smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu; register_cpu_notifier(&kvm_cpu_notifier); @@ -432,6 +477,19 @@ void __init kvm_guest_init(void) #endif } +static bool __init kvm_detect(void) +{ + if (!kvm_para_available()) + return false; + return true; +} + +const struct hypervisor_x86 x86_hyper_kvm __refconst = { + .name = "KVM", + .detect = kvm_detect, +}; +EXPORT_SYMBOL_GPL(x86_hyper_kvm); + static __init int activate_jump_labels(void) { if (has_steal_clock) { diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 086eb58c6e80..f1b42b3a186c 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -120,11 +120,6 @@ bool kvm_check_and_clear_guest_paused(void) bool ret = false; struct pvclock_vcpu_time_info *src; - /* - * per_cpu() is safe here because this function is only called from - * timer functions where preemption is already disabled. - */ - WARN_ON(!in_atomic()); src = &__get_cpu_var(hv_clock); if ((src->flags & PVCLOCK_GUEST_STOPPED) != 0) { __this_cpu_and(hv_clock.flags, ~PVCLOCK_GUEST_STOPPED); diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c index fbdfc6917180..4873e62db6a1 100644 --- a/arch/x86/kernel/microcode_core.c +++ b/arch/x86/kernel/microcode_core.c @@ -87,6 +87,7 @@ #include <asm/microcode.h> #include <asm/processor.h> #include <asm/cpu_device_id.h> +#include <asm/perf_event.h> MODULE_DESCRIPTION("Microcode Update Driver"); MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>"); @@ -277,7 +278,6 @@ static int reload_for_cpu(int cpu) struct ucode_cpu_info *uci = ucode_cpu_info + cpu; int err = 0; - mutex_lock(&microcode_mutex); if (uci->valid) { enum ucode_state ustate; @@ -288,7 +288,6 @@ static int reload_for_cpu(int cpu) if (ustate == UCODE_ERROR) err = -EINVAL; } - mutex_unlock(&microcode_mutex); return err; } @@ -298,19 +297,31 @@ static ssize_t reload_store(struct device *dev, const char *buf, size_t size) { unsigned long val; - int cpu = dev->id; - ssize_t ret = 0; + int cpu; + ssize_t ret = 0, tmp_ret; ret = kstrtoul(buf, 0, &val); if (ret) return ret; - if (val == 1) { - get_online_cpus(); - if (cpu_online(cpu)) - ret = reload_for_cpu(cpu); - put_online_cpus(); + if (val != 1) + return size; + + get_online_cpus(); + mutex_lock(&microcode_mutex); + for_each_online_cpu(cpu) { + tmp_ret = reload_for_cpu(cpu); + if (tmp_ret != 0) + pr_warn("Error reloading microcode on CPU %d\n", cpu); + + /* save retval of the first encountered reload error */ + if (!ret) + ret = tmp_ret; } + if (!ret) + perf_check_microcode(); + mutex_unlock(&microcode_mutex); + put_online_cpus(); if (!ret) ret = size; @@ -339,7 +350,6 @@ static DEVICE_ATTR(version, 0400,
version_show, NULL); static DEVICE_ATTR(processor_flags, 0400, pf_show, NULL); static struct attribute *mc_default_attrs[] = { - &dev_attr_reload.attr, &dev_attr_version.attr, &dev_attr_processor_flags.attr, NULL @@ -504,7 +514,7 @@ static struct notifier_block __refdata mc_cpu_notifier = { #ifdef MODULE /* Autoload on Intel and AMD systems */ -static const struct x86_cpu_id microcode_id[] = { +static const struct x86_cpu_id __initconst microcode_id[] = { #ifdef CONFIG_MICROCODE_INTEL { X86_VENDOR_INTEL, X86_FAMILY_ANY, X86_MODEL_ANY, }, #endif @@ -516,6 +526,16 @@ static const struct x86_cpu_id microcode_id[] = { MODULE_DEVICE_TABLE(x86cpu, microcode_id); #endif +static struct attribute *cpu_root_microcode_attrs[] = { + &dev_attr_reload.attr, + NULL +}; + +static struct attribute_group cpu_root_microcode_group = { + .name = "microcode", + .attrs = cpu_root_microcode_attrs, +}; + static int __init microcode_init(void) { struct cpuinfo_x86 *c = &cpu_data(0); @@ -540,16 +560,25 @@ static int __init microcode_init(void) mutex_lock(&microcode_mutex); error = subsys_interface_register(&mc_cpu_interface); - + if (!error) + perf_check_microcode(); mutex_unlock(&microcode_mutex); put_online_cpus(); if (error) goto out_pdev; + error = sysfs_create_group(&cpu_subsys.dev_root->kobj, + &cpu_root_microcode_group); + + if (error) { + pr_err("Error creating microcode group!\n"); + goto out_driver; + } + error = microcode_dev_init(); if (error) - goto out_driver; + goto out_ucode_group; register_syscore_ops(&mc_syscore_ops); register_hotcpu_notifier(&mc_cpu_notifier); @@ -559,7 +588,11 @@ static int __init microcode_init(void) return 0; -out_driver: + out_ucode_group: + sysfs_remove_group(&cpu_subsys.dev_root->kobj, + &cpu_root_microcode_group); + + out_driver: get_online_cpus(); mutex_lock(&microcode_mutex); @@ -568,7 +601,7 @@ out_driver: mutex_unlock(&microcode_mutex); put_online_cpus(); -out_pdev: + out_pdev: platform_device_unregister(microcode_pdev); return error; @@ -584,6 +617,9 @@ static void __exit microcode_exit(void) unregister_hotcpu_notifier(&mc_cpu_notifier); unregister_syscore_ops(&mc_syscore_ops); + sysfs_remove_group(&cpu_subsys.dev_root->kobj, + &cpu_root_microcode_group); + get_online_cpus(); mutex_lock(&microcode_mutex); diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index f21fd94ac897..202494d2ec6e 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -15,6 +15,9 @@ along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/moduleloader.h> #include <linux/elf.h> #include <linux/vmalloc.h> @@ -30,9 +33,14 @@ #include <asm/pgtable.h> #if 0 -#define DEBUGP printk +#define DEBUGP(fmt, ...) \ + printk(KERN_DEBUG fmt, ##__VA_ARGS__) #else -#define DEBUGP(fmt...) +#define DEBUGP(fmt, ...)
\ +do { \ + if (0) \ + printk(KERN_DEBUG fmt, ##__VA_ARGS__); \ +} while (0) #endif void *module_alloc(unsigned long size) @@ -56,8 +64,8 @@ int apply_relocate(Elf32_Shdr *sechdrs, Elf32_Sym *sym; uint32_t *location; - DEBUGP("Applying relocate section %u to %u\n", relsec, - sechdrs[relsec].sh_info); + DEBUGP("Applying relocate section %u to %u\n", + relsec, sechdrs[relsec].sh_info); for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) { /* This is where to make the change */ location = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr @@ -77,7 +85,7 @@ int apply_relocate(Elf32_Shdr *sechdrs, *location += sym->st_value - (uint32_t)location; break; default: - printk(KERN_ERR "module %s: Unknown relocation: %u\n", + pr_err("%s: Unknown relocation: %u\n", me->name, ELF32_R_TYPE(rel[i].r_info)); return -ENOEXEC; } @@ -97,8 +105,8 @@ int apply_relocate_add(Elf64_Shdr *sechdrs, void *loc; u64 val; - DEBUGP("Applying relocate section %u to %u\n", relsec, - sechdrs[relsec].sh_info); + DEBUGP("Applying relocate section %u to %u\n", + relsec, sechdrs[relsec].sh_info); for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) { /* This is where to make the change */ loc = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr @@ -110,8 +118,8 @@ int apply_relocate_add(Elf64_Shdr *sechdrs, + ELF64_R_SYM(rel[i].r_info); DEBUGP("type %d st_value %Lx r_addend %Lx loc %Lx\n", - (int)ELF64_R_TYPE(rel[i].r_info), - sym->st_value, rel[i].r_addend, (u64)loc); + (int)ELF64_R_TYPE(rel[i].r_info), + sym->st_value, rel[i].r_addend, (u64)loc); val = sym->st_value + rel[i].r_addend; @@ -140,7 +148,7 @@ int apply_relocate_add(Elf64_Shdr *sechdrs, #endif break; default: - printk(KERN_ERR "module %s: Unknown rela relocation: %llu\n", + pr_err("%s: Unknown rela relocation: %llu\n", me->name, ELF64_R_TYPE(rel[i].r_info)); return -ENOEXEC; } @@ -148,9 +156,9 @@ int apply_relocate_add(Elf64_Shdr *sechdrs, return 0; overflow: - printk(KERN_ERR "overflow in relocation type %d val %Lx\n", + pr_err("overflow in relocation type %d val %Lx\n", (int)ELF64_R_TYPE(rel[i].r_info), val); - printk(KERN_ERR "`%s' likely not compiled with -mcmodel=kernel\n", + pr_err("`%s' likely not compiled with -mcmodel=kernel\n", me->name); return -ENOEXEC; } diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index a0b2f84457be..f84f5c57de35 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -365,8 +365,9 @@ static __kprobes void default_do_nmi(struct pt_regs *regs) #ifdef CONFIG_X86_32 /* * For i386, NMIs use the same stack as the kernel, and we can - * add a workaround to the iret problem in C. Simply have 3 states - * the NMI can be in. + * add a workaround to the iret problem in C (preventing nested + * NMIs if an NMI takes a trap). Simply have 3 states the NMI + * can be in: * * 1) not running * 2) executing @@ -383,32 +384,50 @@ static __kprobes void default_do_nmi(struct pt_regs *regs) * If an NMI hits a breakpoint that executes an iret, another * NMI can preempt it. We do not want to allow this new NMI * to run, but we want to execute it when the first one finishes. - * We set the state to "latched", and the first NMI will perform - * an cmpxchg on the state, and if it doesn't successfully - * reset the state to "not running" it will restart the next - * NMI. + * We set the state to "latched", and the exit of the first NMI will + * perform a dec_return, if the result is zero (NOT_RUNNING), then + * it will simply exit the NMI handler. 
If not, the dec_return + * would have set the state to NMI_EXECUTING (what we want it to + * be when we are running). In this case, we simply jump back + * to rerun the NMI handler again, and restart the 'latched' NMI. + * + * No trap (breakpoint or page fault) should be hit before nmi_restart, + * thus there is no race between the first check of state for NOT_RUNNING + * and setting it to NMI_EXECUTING. The HW will prevent nested NMIs + * at this point. + * + * In case the NMI takes a page fault, we need to save off the CR2 + * because the NMI could have preempted another page fault and corrupt + * the CR2 that is about to be read. As nested NMIs must be restarted + * and they can not take breakpoints or page faults, the update of the + * CR2 must be done before converting the nmi state back to NOT_RUNNING. + * Otherwise, there would be a race of another nested NMI coming in + * after setting state to NOT_RUNNING but before updating the nmi_cr2. */ enum nmi_states { - NMI_NOT_RUNNING, + NMI_NOT_RUNNING = 0, NMI_EXECUTING, NMI_LATCHED, }; static DEFINE_PER_CPU(enum nmi_states, nmi_state); +static DEFINE_PER_CPU(unsigned long, nmi_cr2); #define nmi_nesting_preprocess(regs) \ do { \ - if (__get_cpu_var(nmi_state) != NMI_NOT_RUNNING) { \ - __get_cpu_var(nmi_state) = NMI_LATCHED; \ + if (this_cpu_read(nmi_state) != NMI_NOT_RUNNING) { \ + this_cpu_write(nmi_state, NMI_LATCHED); \ return; \ } \ - nmi_restart: \ - __get_cpu_var(nmi_state) = NMI_EXECUTING; \ - } while (0) + this_cpu_write(nmi_state, NMI_EXECUTING); \ + this_cpu_write(nmi_cr2, read_cr2()); \ + } while (0); \ + nmi_restart: #define nmi_nesting_postprocess() \ do { \ - if (cmpxchg(&__get_cpu_var(nmi_state), \ - NMI_EXECUTING, NMI_NOT_RUNNING) != NMI_EXECUTING) \ + if (unlikely(this_cpu_read(nmi_cr2) != read_cr2())) \ + write_cr2(this_cpu_read(nmi_cr2)); \ + if (this_cpu_dec_return(nmi_state)) \ goto nmi_restart; \ } while (0) #else /* x86_64 */ diff --git a/arch/x86/kernel/nmi_selftest.c b/arch/x86/kernel/nmi_selftest.c index 149b8d9c6ad4..6d9582ec0324 100644 --- a/arch/x86/kernel/nmi_selftest.c +++ b/arch/x86/kernel/nmi_selftest.c @@ -42,7 +42,8 @@ static int __init nmi_unk_cb(unsigned int val, struct pt_regs *regs) static void __init init_nmi_testsuite(void) { /* trap all the unknown NMIs we may generate */ - register_nmi_handler_initonly(NMI_UNKNOWN, nmi_unk_cb, 0, "nmi_selftest_unk"); + register_nmi_handler(NMI_UNKNOWN, nmi_unk_cb, 0, "nmi_selftest_unk", + __initdata); } static void __init cleanup_nmi_testsuite(void) @@ -64,8 +65,8 @@ static void __init test_nmi_ipi(struct cpumask *mask) { unsigned long timeout; - if (register_nmi_handler_initonly(NMI_LOCAL, test_nmi_ipi_callback, - NMI_FLAG_FIRST, "nmi_selftest")) { + if (register_nmi_handler(NMI_LOCAL, test_nmi_ipi_callback, + NMI_FLAG_FIRST, "nmi_selftest", __initdata)) { nmi_fail = FAILURE; return; } diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 9ce885996fd7..17fff18a1031 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -352,9 +352,7 @@ struct pv_cpu_ops pv_cpu_ops = { #endif .wbinvd = native_wbinvd, .read_msr = native_read_msr_safe, - .rdmsr_regs = native_rdmsr_safe_regs, .write_msr = native_write_msr_safe, - .wrmsr_regs = native_wrmsr_safe_regs, .read_tsc = native_read_tsc, .read_pmc = native_read_pmc, .read_tscp = native_read_tscp, diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index b72838bae64a..299d49302e7d 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ 
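The dec_return exit trick described in the comment above is easier to see with the concrete values: NMI_EXECUTING is 1 and NMI_LATCHED is 2, so decrementing on the way out either reaches 0 (really done) or lands back on EXECUTING (a nested NMI was latched, rerun the handler). A rough single-threaded user-space model of that flow (a plain variable instead of per-CPU state, helper names invented):

#include <stdio.h>

enum nmi_states { NMI_NOT_RUNNING = 0, NMI_EXECUTING, NMI_LATCHED };

static int nmi_state = NMI_NOT_RUNNING;
static int second_nmi_pending = 1;      /* pretend one more NMI arrives mid-handler */

static void nmi(void)
{
        if (nmi_state != NMI_NOT_RUNNING) {
                nmi_state = NMI_LATCHED;        /* nested: ask the running handler to redo the work */
                return;
        }
        nmi_state = NMI_EXECUTING;
restart:
        printf("NMI work\n");
        if (second_nmi_pending) {               /* a second NMI preempts us right here */
                second_nmi_pending = 0;
                nmi();
        }
        if (--nmi_state)        /* LATCHED(2)->EXECUTING(1): go again; EXECUTING(1)->0: done */
                goto restart;
}

int main(void)
{
        nmi();          /* prints "NMI work" twice, once per NMI */
        return 0;
}

In the kernel the state and the saved CR2 are per-CPU (this_cpu_read/this_cpu_write), and the nmi_restart label sits outside the macro so the jump re-enters the real handler body.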
b/arch/x86/kernel/pci-calgary_64.c @@ -22,6 +22,8 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ +#define pr_fmt(fmt) "Calgary: " fmt + #include <linux/kernel.h> #include <linux/init.h> #include <linux/types.h> @@ -245,7 +247,7 @@ static unsigned long iommu_range_alloc(struct device *dev, offset = iommu_area_alloc(tbl->it_map, tbl->it_size, 0, npages, 0, boundary_size, 0); if (offset == ~0UL) { - printk(KERN_WARNING "Calgary: IOMMU full.\n"); + pr_warn("IOMMU full\n"); spin_unlock_irqrestore(&tbl->it_lock, flags); if (panic_on_overflow) panic("Calgary: fix the allocator.\n"); @@ -271,8 +273,8 @@ static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl, entry = iommu_range_alloc(dev, tbl, npages); if (unlikely(entry == DMA_ERROR_CODE)) { - printk(KERN_WARNING "Calgary: failed to allocate %u pages in " - "iommu %p\n", npages, tbl); + pr_warn("failed to allocate %u pages in iommu %p\n", + npages, tbl); return DMA_ERROR_CODE; } @@ -561,8 +563,7 @@ static void calgary_tce_cache_blast(struct iommu_table *tbl) i++; } while ((val & 0xff) != 0xff && i < 100); if (i == 100) - printk(KERN_WARNING "Calgary: PCI bus not quiesced, " - "continuing anyway\n"); + pr_warn("PCI bus not quiesced, continuing anyway\n"); /* invalidate TCE cache */ target = calgary_reg(bbar, tar_offset(tbl->it_busno)); @@ -604,8 +605,7 @@ begin: i++; } while ((val64 & 0xff) != 0xff && i < 100); if (i == 100) - printk(KERN_WARNING "CalIOC2: PCI bus not quiesced, " - "continuing anyway\n"); + pr_warn("CalIOC2: PCI bus not quiesced, continuing anyway\n"); /* 3. poll Page Migration DEBUG for SoftStopFault */ target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_DEBUG); @@ -617,8 +617,7 @@ begin: if (++count < 100) goto begin; else { - printk(KERN_WARNING "CalIOC2: too many SoftStopFaults, " - "aborting TCE cache flush sequence!\n"); + pr_warn("CalIOC2: too many SoftStopFaults, aborting TCE cache flush sequence!\n"); return; /* pray for the best */ } } @@ -840,8 +839,8 @@ static void calgary_dump_error_regs(struct iommu_table *tbl) plssr = be32_to_cpu(readl(target)); /* If no error, the agent ID in the CSR is not valid */ - printk(KERN_EMERG "Calgary: DMA error on Calgary PHB 0x%x, " - "0x%08x@CSR 0x%08x@PLSSR\n", tbl->it_busno, csr, plssr); + pr_emerg("DMA error on Calgary PHB 0x%x, 0x%08x@CSR 0x%08x@PLSSR\n", + tbl->it_busno, csr, plssr); } static void calioc2_dump_error_regs(struct iommu_table *tbl) @@ -867,22 +866,21 @@ static void calioc2_dump_error_regs(struct iommu_table *tbl) target = calgary_reg(bbar, phboff | 0x800); mck = be32_to_cpu(readl(target)); - printk(KERN_EMERG "Calgary: DMA error on CalIOC2 PHB 0x%x\n", - tbl->it_busno); + pr_emerg("DMA error on CalIOC2 PHB 0x%x\n", tbl->it_busno); - printk(KERN_EMERG "Calgary: 0x%08x@CSR 0x%08x@PLSSR 0x%08x@CSMR 0x%08x@MCK\n", - csr, plssr, csmr, mck); + pr_emerg("0x%08x@CSR 0x%08x@PLSSR 0x%08x@CSMR 0x%08x@MCK\n", + csr, plssr, csmr, mck); /* dump rest of error regs */ - printk(KERN_EMERG "Calgary: "); + pr_emerg(""); for (i = 0; i < ARRAY_SIZE(errregs); i++) { /* err regs are at 0x810 - 0x870 */ erroff = (0x810 + (i * 0x10)); target = calgary_reg(bbar, phboff | erroff); errregs[i] = be32_to_cpu(readl(target)); - printk("0x%08x@0x%lx ", errregs[i], erroff); + pr_cont("0x%08x@0x%lx ", errregs[i], erroff); } - printk("\n"); + pr_cont("\n"); /* root complex status */ target = calgary_reg(bbar, phboff | PHB_ROOT_COMPLEX_STATUS); diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 62c9457ccd2f..c0f420f76cd3 
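The printk conversions in this file lean on the pr_fmt() define added above: the kernel's pr_warn()/pr_emerg() wrappers splice pr_fmt(fmt) into the format string, so one define at the top of the file prefixes every message. A small user-space approximation (printf standing in for printk):

#include <stdio.h>

#define pr_fmt(fmt) "Calgary: " fmt             /* per-file prefix, defined once */

/* crude stand-ins for the kernel's wrappers */
#define pr_warn(fmt, ...)  printf("W: " pr_fmt(fmt), ##__VA_ARGS__)
#define pr_emerg(fmt, ...) printf("E: " pr_fmt(fmt), ##__VA_ARGS__)

int main(void)
{
        pr_warn("IOMMU full\n");                        /* -> "W: Calgary: IOMMU full" */
        pr_emerg("DMA error on Calgary PHB 0x%x\n", 3); /* -> "E: Calgary: DMA error ..." */
        return 0;
}

pr_cont() deliberately bypasses the prefix, which is why the error-register dump above still uses it for the continuation pieces.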
100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -100,7 +100,7 @@ void *dma_generic_alloc_coherent(struct device *dev, size_t size, struct dma_attrs *attrs) { unsigned long dma_mask; - struct page *page = NULL; + struct page *page; unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT; dma_addr_t addr; @@ -108,6 +108,7 @@ void *dma_generic_alloc_coherent(struct device *dev, size_t size, flag |= __GFP_ZERO; again: + page = NULL; if (!(flag & GFP_ATOMIC)) page = dma_alloc_from_contiguous(dev, count, get_order(size)); if (!page) diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 735279e54e59..ef6a8456f719 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -1,3 +1,5 @@ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/errno.h> #include <linux/kernel.h> #include <linux/mm.h> @@ -145,16 +147,14 @@ void show_regs_common(void) /* Board Name is optional */ board = dmi_get_system_info(DMI_BOARD_NAME); - printk(KERN_CONT "\n"); - printk(KERN_DEFAULT "Pid: %d, comm: %.20s %s %s %.*s", - current->pid, current->comm, print_tainted(), - init_utsname()->release, - (int)strcspn(init_utsname()->version, " "), - init_utsname()->version); - printk(KERN_CONT " %s %s", vendor, product); - if (board) - printk(KERN_CONT "/%s", board); - printk(KERN_CONT "\n"); + printk(KERN_DEFAULT "Pid: %d, comm: %.20s %s %s %.*s %s %s%s%s\n", + current->pid, current->comm, print_tainted(), + init_utsname()->release, + (int)strcspn(init_utsname()->version, " "), + init_utsname()->version, + vendor, product, + board ? "/" : "", + board ? board : ""); } void flush_thread(void) @@ -645,7 +645,7 @@ static void amd_e400_idle(void) amd_e400_c1e_detected = true; if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC)) mark_tsc_unstable("TSC halt in AMD C1E"); - printk(KERN_INFO "System has AMD C1E enabled\n"); + pr_info("System has AMD C1E enabled\n"); } } @@ -659,8 +659,7 @@ static void amd_e400_idle(void) */ clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_FORCE, &cpu); - printk(KERN_INFO "Switch to broadcast mode on CPU%d\n", - cpu); + pr_info("Switch to broadcast mode on CPU%d\n", cpu); } clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu); @@ -681,8 +680,7 @@ void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) { #ifdef CONFIG_SMP if (pm_idle == poll_idle && smp_num_siblings > 1) { - printk_once(KERN_WARNING "WARNING: polling idle and HT enabled," - " performance may degrade.\n"); + pr_warn_once("WARNING: polling idle and HT enabled, performance may degrade\n"); } #endif if (pm_idle) @@ -692,11 +690,11 @@ void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) /* * One CPU supports mwait => All CPUs supports mwait */ - printk(KERN_INFO "using mwait in idle threads.\n"); + pr_info("using mwait in idle threads\n"); pm_idle = mwait_idle; } else if (cpu_has_amd_erratum(amd_erratum_400)) { /* E400: APIC timer interrupt does not wake up CPU from C1e */ - printk(KERN_INFO "using AMD E400 aware idle routine\n"); + pr_info("using AMD E400 aware idle routine\n"); pm_idle = amd_e400_idle; } else pm_idle = default_idle; @@ -715,7 +713,7 @@ static int __init idle_setup(char *str) return -EINVAL; if (!strcmp(str, "poll")) { - printk("using polling idle threads.\n"); + pr_info("using polling idle threads\n"); pm_idle = poll_idle; boot_option_idle_override = IDLE_POLL; } else if (!strcmp(str, "mwait")) { diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 61cdf7fdf099..0a980c9d7cb8 100644 --- a/arch/x86/kernel/process_64.c +++ 
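The small dma_generic_alloc_coherent() change above (moving "page = NULL" below the again: label) matters because the retry path frees the page and jumps back up: if the variable is not cleared at the top of each pass, a retry in which the contiguous allocator is skipped or fails would still see the stale pointer, skip the fallback allocation and hand back freed memory. The shape of that pitfall in a throwaway user-space sketch (helper names invented):

#include <stdio.h>
#include <stdlib.h>

static void *primary_alloc(int attempt)
{
        return attempt == 0 ? malloc(64) : NULL;        /* succeeds once, then gives up */
}

int main(void)
{
        void *buf;
        int attempt = 0;

again:
        buf = NULL;                     /* the fix: every pass starts clean */
        buf = primary_alloc(attempt);
        if (!buf)
                buf = malloc(64);       /* fallback allocator */
        if (!buf)
                return 1;
        if (attempt++ == 0) {           /* pretend the first buffer fails a post-allocation check */
                free(buf);
                goto again;             /* without the reset, buf would still point at freed memory here */
        }
        printf("usable buffer at %p\n", buf);
        free(buf);
        return 0;
}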
b/arch/x86/kernel/process_64.c @@ -117,10 +117,10 @@ void release_thread(struct task_struct *dead_task) { if (dead_task->mm) { if (dead_task->mm->context.size) { - printk("WARNING: dead process %8s still has LDT? <%p/%d>\n", - dead_task->comm, - dead_task->mm->context.ldt, - dead_task->mm->context.size); + pr_warn("WARNING: dead process %8s still has LDT? <%p/%d>\n", + dead_task->comm, + dead_task->mm->context.ldt, + dead_task->mm->context.size); BUG(); } } @@ -466,7 +466,7 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr) task->thread.gs = addr; if (doit) { load_gs_index(0); - ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr); + ret = wrmsrl_safe(MSR_KERNEL_GS_BASE, addr); } } put_cpu(); @@ -494,7 +494,7 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr) /* set the selector to 0 to not confuse __switch_to */ loadsegment(fs, 0); - ret = checking_wrmsrl(MSR_FS_BASE, addr); + ret = wrmsrl_safe(MSR_FS_BASE, addr); } } put_cpu(); diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 25b48edb847c..52190a938b4a 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -1,3 +1,5 @@ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/module.h> #include <linux/reboot.h> #include <linux/init.h> @@ -20,14 +22,12 @@ #include <asm/virtext.h> #include <asm/cpu.h> #include <asm/nmi.h> +#include <asm/smp.h> -#ifdef CONFIG_X86_32 -# include <linux/ctype.h> -# include <linux/mc146818rtc.h> -# include <asm/realmode.h> -#else -# include <asm/x86_init.h> -#endif +#include <linux/ctype.h> +#include <linux/mc146818rtc.h> +#include <asm/realmode.h> +#include <asm/x86_init.h> /* * Power off function, if any @@ -49,7 +49,7 @@ int reboot_force; */ static int reboot_default = 1; -#if defined(CONFIG_X86_32) && defined(CONFIG_SMP) +#ifdef CONFIG_SMP static int reboot_cpu = -1; #endif @@ -67,8 +67,8 @@ bool port_cf9_safe = false; * reboot=b[ios] | s[mp] | t[riple] | k[bd] | e[fi] [, [w]arm | [c]old] | p[ci] * warm Don't set the cold reboot flag * cold Set the cold reboot flag - * bios Reboot by jumping through the BIOS (only for X86_32) - * smp Reboot by executing reset on BSP or other CPU (only for X86_32) + * bios Reboot by jumping through the BIOS + * smp Reboot by executing reset on BSP or other CPU * triple Force a triple fault (init) * kbd Use the keyboard controller. cold reset (default) * acpi Use the RESET_REG in the FADT @@ -95,7 +95,6 @@ static int __init reboot_setup(char *str) reboot_mode = 0; break; -#ifdef CONFIG_X86_32 #ifdef CONFIG_SMP case 's': if (isdigit(*(str+1))) { @@ -112,7 +111,6 @@ static int __init reboot_setup(char *str) #endif /* CONFIG_SMP */ case 'b': -#endif case 'a': case 'k': case 't': @@ -138,7 +136,6 @@ static int __init reboot_setup(char *str) __setup("reboot=", reboot_setup); -#ifdef CONFIG_X86_32 /* * Reboot options and system auto-detection code provided by * Dell Inc. so their systems "just work". :-) @@ -152,16 +149,14 @@ static int __init set_bios_reboot(const struct dmi_system_id *d) { if (reboot_type != BOOT_BIOS) { reboot_type = BOOT_BIOS; - printk(KERN_INFO "%s series board detected. Selecting BIOS-method for reboots.\n", d->ident); + pr_info("%s series board detected. 
Selecting %s-method for reboots.\n", + "BIOS", d->ident); } return 0; } -void machine_real_restart(unsigned int type) +void __noreturn machine_real_restart(unsigned int type) { - void (*restart_lowmem)(unsigned int) = (void (*)(unsigned int)) - real_mode_header->machine_real_restart_asm; - local_irq_disable(); /* @@ -181,25 +176,28 @@ void machine_real_restart(unsigned int type) /* * Switch back to the initial page table. */ +#ifdef CONFIG_X86_32 load_cr3(initial_page_table); - - /* - * Write 0x1234 to absolute memory location 0x472. The BIOS reads - * this on booting to tell it to "Bypass memory test (also warm - * boot)". This seems like a fairly standard thing that gets set by - * REBOOT.COM programs, and the previous reset routine did this - * too. */ - *((unsigned short *)0x472) = reboot_mode; +#else + write_cr3(real_mode_header->trampoline_pgd); +#endif /* Jump to the identity-mapped low memory code */ - restart_lowmem(type); +#ifdef CONFIG_X86_32 + asm volatile("jmpl *%0" : : + "rm" (real_mode_header->machine_real_restart_asm), + "a" (type)); +#else + asm volatile("ljmpl *%0" : : + "m" (real_mode_header->machine_real_restart_asm), + "D" (type)); +#endif + unreachable(); } #ifdef CONFIG_APM_MODULE EXPORT_SYMBOL(machine_real_restart); #endif -#endif /* CONFIG_X86_32 */ - /* * Some Apple MacBook and MacBookPro's needs reboot=p to be able to reboot */ @@ -207,8 +205,8 @@ static int __init set_pci_reboot(const struct dmi_system_id *d) { if (reboot_type != BOOT_CF9) { reboot_type = BOOT_CF9; - printk(KERN_INFO "%s series board detected. " - "Selecting PCI-method for reboots.\n", d->ident); + pr_info("%s series board detected. Selecting %s-method for reboots.\n", + "PCI", d->ident); } return 0; } @@ -217,17 +215,16 @@ static int __init set_kbd_reboot(const struct dmi_system_id *d) { if (reboot_type != BOOT_KBD) { reboot_type = BOOT_KBD; - printk(KERN_INFO "%s series board detected. Selecting KBD-method for reboot.\n", d->ident); + pr_info("%s series board detected. Selecting %s-method for reboot.\n", + "KBD", d->ident); } return 0; } /* - * This is a single dmi_table handling all reboot quirks. Note that - * REBOOT_BIOS is only available for 32bit + * This is a single dmi_table handling all reboot quirks. */ static struct dmi_system_id __initdata reboot_dmi_table[] = { -#ifdef CONFIG_X86_32 { /* Handle problems with rebooting on Dell E520's */ .callback = set_bios_reboot, .ident = "Dell E520", @@ -377,7 +374,6 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { DMI_MATCH(DMI_BOARD_NAME, "P4S800"), }, }, -#endif /* CONFIG_X86_32 */ { /* Handle reboot issue on Acer Aspire one */ .callback = set_kbd_reboot, @@ -451,6 +447,14 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 990"), }, }, + { /* Handle problems with rebooting on the Precision M6600. 
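All of these quirk entries follow one pattern: a table that pairs DMI identity strings with a callback that flips the default reboot method, walked once against the firmware-provided strings. Roughly, in user-space form (struct layout simplified, strings and names invented):

#include <stdio.h>
#include <string.h>

struct reboot_quirk {
        int (*callback)(const char *ident);
        const char *ident;
        const char *vendor, *product;   /* stand-ins for the DMI_MATCH() pairs */
};

static int set_pci_reboot(const char *ident)
{
        printf("%s series board detected. Selecting PCI-method for reboots.\n", ident);
        return 0;
}

static const struct reboot_quirk quirks[] = {
        { set_pci_reboot, "Dell Precision M6600", "Dell Inc.", "Precision M6600" },
        { NULL }
};

int main(void)
{
        const char *vendor = "Dell Inc.", *product = "Precision M6600";        /* fake DMI data */

        for (const struct reboot_quirk *q = quirks; q->callback; q++)
                if (!strcmp(q->vendor, vendor) && !strcmp(q->product, product))
                        return q->callback(q->ident);
        return 0;
}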
*/ + .callback = set_pci_reboot, + .ident = "Dell OptiPlex 990", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "Precision M6600"), + }, + }, { } }; @@ -576,13 +580,11 @@ static void native_machine_emergency_restart(void) reboot_type = BOOT_KBD; break; -#ifdef CONFIG_X86_32 case BOOT_BIOS: machine_real_restart(MRR_BIOS); reboot_type = BOOT_KBD; break; -#endif case BOOT_ACPI: acpi_reboot(); @@ -624,12 +626,10 @@ void native_machine_shutdown(void) /* The boot cpu is always logical cpu 0 */ int reboot_cpu_id = 0; -#ifdef CONFIG_X86_32 /* See if there has been given a command line override */ if ((reboot_cpu != -1) && (reboot_cpu < nr_cpu_ids) && cpu_online(reboot_cpu)) reboot_cpu_id = reboot_cpu; -#endif /* Make certain the cpu I'm about to reboot on is online */ if (!cpu_online(reboot_cpu_id)) @@ -670,7 +670,7 @@ static void __machine_emergency_restart(int emergency) static void native_machine_restart(char *__unused) { - printk("machine restart\n"); + pr_notice("machine restart\n"); if (!reboot_force) machine_shutdown(); diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 16be6dc14db1..f4b9b80e1b95 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1031,8 +1031,6 @@ void __init setup_arch(char **cmdline_p) x86_init.timers.wallclock_init(); - x86_platform.wallclock_init(); - mcheck_init(); arch_init_ideal_nops(); diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 21af737053aa..b280908a376e 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -6,6 +6,9 @@ * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes * 2000-2002 x86-64 support by Andi Kleen */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/sched.h> #include <linux/mm.h> #include <linux/smp.h> @@ -814,7 +817,7 @@ void signal_fault(struct pt_regs *regs, void __user *frame, char *where) me->comm, me->pid, where, frame, regs->ip, regs->sp, regs->orig_ax); print_vma_addr(" in ", regs->ip); - printk(KERN_CONT "\n"); + pr_cont("\n"); } force_sig(SIGSEGV, me); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 3fab55bea29b..c1a310fb8309 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1,4 +1,4 @@ -/* + /* * x86 SMP booting functions * * (c) 1995 Alan Cox, Building #3 <alan@lxorguk.ukuu.org.uk> @@ -39,6 +39,8 @@ * Glauber Costa : i386 and x86_64 integration */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/init.h> #include <linux/smp.h> #include <linux/module.h> @@ -184,7 +186,7 @@ static void __cpuinit smp_callin(void) * boards) */ - pr_debug("CALLIN, before setup_local_APIC().\n"); + pr_debug("CALLIN, before setup_local_APIC()\n"); if (apic->smp_callin_clear_local_apic) apic->smp_callin_clear_local_apic(); setup_local_APIC(); @@ -255,22 +257,13 @@ notrace static void __cpuinit start_secondary(void *unused) check_tsc_sync_target(); /* - * We need to hold call_lock, so there is no inconsistency - * between the time smp_call_function() determines number of - * IPI recipients, and the time when the determination is made - * for which cpus receive the IPI. Holding this - * lock helps us to not include this cpu in a currently in progress - * smp_call_function(). - * * We need to hold vector_lock so there the set of online cpus * does not change while we are assigning vectors to cpus. Holding * this lock ensures we don't half assign or remove an irq from a cpu. 
*/ - ipi_call_lock(); lock_vector_lock(); set_cpu_online(smp_processor_id(), true); unlock_vector_lock(); - ipi_call_unlock(); per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE; x86_platform.nmi_init(); @@ -349,9 +342,12 @@ static bool __cpuinit match_llc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) static bool __cpuinit match_mc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) { - if (c->phys_proc_id == o->phys_proc_id) - return topology_sane(c, o, "mc"); + if (c->phys_proc_id == o->phys_proc_id) { + if (cpu_has(c, X86_FEATURE_AMD_DCM)) + return true; + return topology_sane(c, o, "mc"); + } return false; } @@ -429,17 +425,16 @@ static void impress_friends(void) /* * Allow the user to impress friends. */ - pr_debug("Before bogomips.\n"); + pr_debug("Before bogomips\n"); for_each_possible_cpu(cpu) if (cpumask_test_cpu(cpu, cpu_callout_mask)) bogosum += cpu_data(cpu).loops_per_jiffy; - printk(KERN_INFO - "Total of %d processors activated (%lu.%02lu BogoMIPS).\n", + pr_info("Total of %d processors activated (%lu.%02lu BogoMIPS)\n", num_online_cpus(), bogosum/(500000/HZ), (bogosum/(5000/HZ))%100); - pr_debug("Before bogocount - setting activated=1.\n"); + pr_debug("Before bogocount - setting activated=1\n"); } void __inquire_remote_apic(int apicid) @@ -449,18 +444,17 @@ void __inquire_remote_apic(int apicid) int timeout; u32 status; - printk(KERN_INFO "Inquiring remote APIC 0x%x...\n", apicid); + pr_info("Inquiring remote APIC 0x%x...\n", apicid); for (i = 0; i < ARRAY_SIZE(regs); i++) { - printk(KERN_INFO "... APIC 0x%x %s: ", apicid, names[i]); + pr_info("... APIC 0x%x %s: ", apicid, names[i]); /* * Wait for idle. */ status = safe_apic_wait_icr_idle(); if (status) - printk(KERN_CONT - "a previous APIC delivery may have failed\n"); + pr_cont("a previous APIC delivery may have failed\n"); apic_icr_write(APIC_DM_REMRD | regs[i], apicid); @@ -473,10 +467,10 @@ void __inquire_remote_apic(int apicid) switch (status) { case APIC_ICR_RR_VALID: status = apic_read(APIC_RRR); - printk(KERN_CONT "%08x\n", status); + pr_cont("%08x\n", status); break; default: - printk(KERN_CONT "failed\n"); + pr_cont("failed\n"); } } } @@ -510,12 +504,12 @@ wakeup_secondary_cpu_via_nmi(int logical_apicid, unsigned long start_eip) apic_write(APIC_ESR, 0); accept_status = (apic_read(APIC_ESR) & 0xEF); } - pr_debug("NMI sent.\n"); + pr_debug("NMI sent\n"); if (send_status) - printk(KERN_ERR "APIC never delivered???\n"); + pr_err("APIC never delivered???\n"); if (accept_status) - printk(KERN_ERR "APIC delivery error (%lx).\n", accept_status); + pr_err("APIC delivery error (%lx)\n", accept_status); return (send_status | accept_status); } @@ -537,7 +531,7 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) apic_read(APIC_ESR); } - pr_debug("Asserting INIT.\n"); + pr_debug("Asserting INIT\n"); /* * Turn INIT on target chip @@ -553,7 +547,7 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) mdelay(10); - pr_debug("Deasserting INIT.\n"); + pr_debug("Deasserting INIT\n"); /* Target chip */ /* Send IPI */ @@ -586,14 +580,14 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) /* * Run STARTUP IPI loop. */ - pr_debug("#startup loops: %d.\n", num_starts); + pr_debug("#startup loops: %d\n", num_starts); for (j = 1; j <= num_starts; j++) { - pr_debug("Sending STARTUP #%d.\n", j); + pr_debug("Sending STARTUP #%d\n", j); if (maxlvt > 3) /* Due to the Pentium erratum 3AP. 
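The arithmetic in impress_friends() above is just fixed-point formatting: one BogoMIPS is loops_per_jiffy * HZ / 500000, and the second expression recovers two decimal places without floating point. A quick check with made-up numbers (HZ picked arbitrarily):

#include <stdio.h>

#define HZ 250

int main(void)
{
        unsigned long bogosum = 4UL * 1197000;  /* four CPUs, loops_per_jiffy = 1197000 each */

        printf("Total of %d processors activated (%lu.%02lu BogoMIPS)\n",
               4, bogosum / (500000 / HZ), (bogosum / (5000 / HZ)) % 100);
        /* -> "Total of 4 processors activated (2394.00 BogoMIPS)" */
        return 0;
}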
*/ apic_write(APIC_ESR, 0); apic_read(APIC_ESR); - pr_debug("After apic_write.\n"); + pr_debug("After apic_write\n"); /* * STARTUP IPI @@ -610,7 +604,7 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) */ udelay(300); - pr_debug("Startup point 1.\n"); + pr_debug("Startup point 1\n"); pr_debug("Waiting for send to finish...\n"); send_status = safe_apic_wait_icr_idle(); @@ -625,12 +619,12 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) if (send_status || accept_status) break; } - pr_debug("After Startup.\n"); + pr_debug("After Startup\n"); if (send_status) - printk(KERN_ERR "APIC never delivered???\n"); + pr_err("APIC never delivered???\n"); if (accept_status) - printk(KERN_ERR "APIC delivery error (%lx).\n", accept_status); + pr_err("APIC delivery error (%lx)\n", accept_status); return (send_status | accept_status); } @@ -644,11 +638,11 @@ static void __cpuinit announce_cpu(int cpu, int apicid) if (system_state == SYSTEM_BOOTING) { if (node != current_node) { if (current_node > (-1)) - pr_cont(" Ok.\n"); + pr_cont(" OK\n"); current_node = node; pr_info("Booting Node %3d, Processors ", node); } - pr_cont(" #%d%s", cpu, cpu == (nr_cpu_ids - 1) ? " Ok.\n" : ""); + pr_cont(" #%d%s", cpu, cpu == (nr_cpu_ids - 1) ? " OK\n" : ""); return; } else pr_info("Booting Node %d Processor %d APIC 0x%x\n", @@ -728,9 +722,9 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu, struct task_struct *idle) /* * allow APs to start initializing. */ - pr_debug("Before Callout %d.\n", cpu); + pr_debug("Before Callout %d\n", cpu); cpumask_set_cpu(cpu, cpu_callout_mask); - pr_debug("After Callout %d.\n", cpu); + pr_debug("After Callout %d\n", cpu); /* * Wait 5s total for a response @@ -758,7 +752,7 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu, struct task_struct *idle) pr_err("CPU%d: Stuck ??\n", cpu); else /* trampoline code not run */ - pr_err("CPU%d: Not responding.\n", cpu); + pr_err("CPU%d: Not responding\n", cpu); if (apic->inquire_remote_apic) apic->inquire_remote_apic(apicid); } @@ -803,7 +797,7 @@ int __cpuinit native_cpu_up(unsigned int cpu, struct task_struct *tidle) if (apicid == BAD_APICID || apicid == boot_cpu_physical_apicid || !physid_isset(apicid, phys_cpu_present_map) || !apic->apic_id_valid(apicid)) { - printk(KERN_ERR "%s: bad cpu %d\n", __func__, cpu); + pr_err("%s: bad cpu %d\n", __func__, cpu); return -EINVAL; } @@ -884,9 +878,8 @@ static int __init smp_sanity_check(unsigned max_cpus) unsigned int cpu; unsigned nr; - printk(KERN_WARNING - "More than 8 CPUs detected - skipping them.\n" - "Use CONFIG_X86_BIGSMP.\n"); + pr_warn("More than 8 CPUs detected - skipping them\n" + "Use CONFIG_X86_BIGSMP\n"); nr = 0; for_each_present_cpu(cpu) { @@ -907,8 +900,7 @@ static int __init smp_sanity_check(unsigned max_cpus) #endif if (!physid_isset(hard_smp_processor_id(), phys_cpu_present_map)) { - printk(KERN_WARNING - "weird, boot CPU (#%d) not listed by the BIOS.\n", + pr_warn("weird, boot CPU (#%d) not listed by the BIOS\n", hard_smp_processor_id()); physid_set(hard_smp_processor_id(), phys_cpu_present_map); @@ -920,11 +912,10 @@ static int __init smp_sanity_check(unsigned max_cpus) */ if (!smp_found_config && !acpi_lapic) { preempt_enable(); - printk(KERN_NOTICE "SMP motherboard not detected.\n"); + pr_notice("SMP motherboard not detected\n"); disable_smp(); if (APIC_init_uniprocessor()) - printk(KERN_NOTICE "Local APIC not detected." - " Using dummy APIC emulation.\n"); + pr_notice("Local APIC not detected. 
Using dummy APIC emulation.\n"); return -1; } @@ -933,9 +924,8 @@ static int __init smp_sanity_check(unsigned max_cpus) * CPU too, but we do it for the sake of robustness anyway. */ if (!apic->check_phys_apicid_present(boot_cpu_physical_apicid)) { - printk(KERN_NOTICE - "weird, boot CPU (#%d) not listed by the BIOS.\n", - boot_cpu_physical_apicid); + pr_notice("weird, boot CPU (#%d) not listed by the BIOS\n", + boot_cpu_physical_apicid); physid_set(hard_smp_processor_id(), phys_cpu_present_map); } preempt_enable(); @@ -948,8 +938,7 @@ static int __init smp_sanity_check(unsigned max_cpus) if (!disable_apic) { pr_err("BIOS bug, local APIC #%d not detected!...\n", boot_cpu_physical_apicid); - pr_err("... forcing use of dummy APIC emulation." - "(tell your hw vendor)\n"); + pr_err("... forcing use of dummy APIC emulation (tell your hw vendor)\n"); } smpboot_clear_io_apic(); disable_ioapic_support(); @@ -962,7 +951,7 @@ static int __init smp_sanity_check(unsigned max_cpus) * If SMP should be disabled, then really disable it! */ if (!max_cpus) { - printk(KERN_INFO "SMP mode deactivated.\n"); + pr_info("SMP mode deactivated\n"); smpboot_clear_io_apic(); connect_bsp_APIC(); @@ -1014,7 +1003,7 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) if (smp_sanity_check(max_cpus) < 0) { - printk(KERN_INFO "SMP disabled\n"); + pr_info("SMP disabled\n"); disable_smp(); goto out; } @@ -1052,7 +1041,7 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) * Set up local APIC timer on boot CPU. */ - printk(KERN_INFO "CPU%d: ", 0); + pr_info("CPU%d: ", 0); print_cpu_info(&cpu_data(0)); x86_init.timers.setup_percpu_clockev(); @@ -1102,7 +1091,7 @@ void __init native_smp_prepare_boot_cpu(void) void __init native_smp_cpus_done(unsigned int max_cpus) { - pr_debug("Boot done.\n"); + pr_debug("Boot done\n"); nmi_selftest(); impress_friends(); @@ -1163,8 +1152,7 @@ __init void prefill_possible_map(void) /* nr_cpu_ids could be reduced via nr_cpus= */ if (possible > nr_cpu_ids) { - printk(KERN_WARNING - "%d Processors exceeds NR_CPUS limit of %d\n", + pr_warn("%d Processors exceeds NR_CPUS limit of %d\n", possible, nr_cpu_ids); possible = nr_cpu_ids; } @@ -1173,13 +1161,12 @@ __init void prefill_possible_map(void) if (!setup_max_cpus) #endif if (possible > i) { - printk(KERN_WARNING - "%d Processors exceeds max_cpus limit of %u\n", + pr_warn("%d Processors exceeds max_cpus limit of %u\n", possible, setup_max_cpus); possible = i; } - printk(KERN_INFO "SMP: Allowing %d CPUs, %d hotplug CPUs\n", + pr_info("Allowing %d CPUs, %d hotplug CPUs\n", possible, max_t(int, possible - num_processors, 0)); for (i = 0; i < possible; i++) diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 05b31d92f69c..b481341c9369 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -9,6 +9,9 @@ /* * Handle hardware traps and faults. 
*/ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/interrupt.h> #include <linux/kallsyms.h> #include <linux/spinlock.h> @@ -143,12 +146,11 @@ trap_signal: #ifdef CONFIG_X86_64 if (show_unhandled_signals && unhandled_signal(tsk, signr) && printk_ratelimit()) { - printk(KERN_INFO - "%s[%d] trap %s ip:%lx sp:%lx error:%lx", - tsk->comm, tsk->pid, str, - regs->ip, regs->sp, error_code); + pr_info("%s[%d] trap %s ip:%lx sp:%lx error:%lx", + tsk->comm, tsk->pid, str, + regs->ip, regs->sp, error_code); print_vma_addr(" in ", regs->ip); - printk("\n"); + pr_cont("\n"); } #endif @@ -269,12 +271,11 @@ do_general_protection(struct pt_regs *regs, long error_code) if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && printk_ratelimit()) { - printk(KERN_INFO - "%s[%d] general protection ip:%lx sp:%lx error:%lx", + pr_info("%s[%d] general protection ip:%lx sp:%lx error:%lx", tsk->comm, task_pid_nr(tsk), regs->ip, regs->sp, error_code); print_vma_addr(" in ", regs->ip); - printk("\n"); + pr_cont("\n"); } force_sig(SIGSEGV, tsk); @@ -570,7 +571,7 @@ do_spurious_interrupt_bug(struct pt_regs *regs, long error_code) conditional_sti(regs); #if 0 /* No need to warn about this any longer. */ - printk(KERN_INFO "Ignoring P6 Local APIC Spurious Interrupt Bug...\n"); + pr_info("Ignoring P6 Local APIC Spurious Interrupt Bug...\n"); #endif } diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index fc0a147e3727..cfa5d4f7ca56 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -1,3 +1,5 @@ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/kernel.h> #include <linux/sched.h> #include <linux/init.h> @@ -84,8 +86,7 @@ EXPORT_SYMBOL_GPL(check_tsc_unstable); #ifdef CONFIG_X86_TSC int __init notsc_setup(char *str) { - printk(KERN_WARNING "notsc: Kernel compiled with CONFIG_X86_TSC, " - "cannot disable TSC completely.\n"); + pr_warn("Kernel compiled with CONFIG_X86_TSC, cannot disable TSC completely\n"); tsc_disabled = 1; return 1; } @@ -373,7 +374,7 @@ static unsigned long quick_pit_calibrate(void) goto success; } } - printk("Fast TSC calibration failed\n"); + pr_err("Fast TSC calibration failed\n"); return 0; success: @@ -392,7 +393,7 @@ success: */ delta *= PIT_TICK_RATE; do_div(delta, i*256*1000); - printk("Fast TSC calibration using PIT\n"); + pr_info("Fast TSC calibration using PIT\n"); return delta; } @@ -487,9 +488,8 @@ unsigned long native_calibrate_tsc(void) * use the reference value, as it is more precise. */ if (delta >= 90 && delta <= 110) { - printk(KERN_INFO - "TSC: PIT calibration matches %s. %d loops\n", - hpet ? "HPET" : "PMTIMER", i + 1); + pr_info("PIT calibration matches %s. %d loops\n", + hpet ? "HPET" : "PMTIMER", i + 1); return tsc_ref_min; } @@ -511,38 +511,36 @@ unsigned long native_calibrate_tsc(void) */ if (tsc_pit_min == ULONG_MAX) { /* PIT gave no useful value */ - printk(KERN_WARNING "TSC: Unable to calibrate against PIT\n"); + pr_warn("Unable to calibrate against PIT\n"); /* We don't have an alternative source, disable TSC */ if (!hpet && !ref1 && !ref2) { - printk("TSC: No reference (HPET/PMTIMER) available\n"); + pr_notice("No reference (HPET/PMTIMER) available\n"); return 0; } /* The alternative source failed as well, disable TSC */ if (tsc_ref_min == ULONG_MAX) { - printk(KERN_WARNING "TSC: HPET/PMTIMER calibration " - "failed.\n"); + pr_warn("HPET/PMTIMER calibration failed\n"); return 0; } /* Use the alternative source */ - printk(KERN_INFO "TSC: using %s reference calibration\n", - hpet ? 
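The conversion at the end of quick_pit_calibrate() above turns a raw TSC delta into kHz: the loop watched the PIT advance i*256 ticks at PIT_TICK_RATE Hz, so the elapsed time is i*256/PIT_TICK_RATE seconds and the frequency drops out of one multiply and one divide. With purely illustrative numbers:

#include <stdio.h>
#include <stdint.h>

#define PIT_TICK_RATE 1193182UL                 /* the PIT input clock, in Hz */

int main(void)
{
        uint64_t delta = 12900000;              /* hypothetical TSC ticks seen during the loop */
        unsigned long i = 25;                   /* iterations of 256 PIT ticks each */

        delta *= PIT_TICK_RATE;                 /* tsc_khz = delta * PIT_TICK_RATE / (i*256*1000) */
        delta /= i * 256 * 1000;

        printf("Fast TSC calibration using PIT: %llu kHz (%llu.%03llu MHz)\n",
               (unsigned long long)delta,
               (unsigned long long)(delta / 1000),
               (unsigned long long)(delta % 1000));     /* roughly 2.4 GHz for these numbers */
        return 0;
}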
"HPET" : "PMTIMER"); + pr_info("using %s reference calibration\n", + hpet ? "HPET" : "PMTIMER"); return tsc_ref_min; } /* We don't have an alternative source, use the PIT calibration value */ if (!hpet && !ref1 && !ref2) { - printk(KERN_INFO "TSC: Using PIT calibration value\n"); + pr_info("Using PIT calibration value\n"); return tsc_pit_min; } /* The alternative source failed, use the PIT calibration value */ if (tsc_ref_min == ULONG_MAX) { - printk(KERN_WARNING "TSC: HPET/PMTIMER calibration failed. " - "Using PIT calibration\n"); + pr_warn("HPET/PMTIMER calibration failed. Using PIT calibration.\n"); return tsc_pit_min; } @@ -551,9 +549,9 @@ unsigned long native_calibrate_tsc(void) * the PIT value as we know that there are PMTIMERs around * running at double speed. At least we let the user know: */ - printk(KERN_WARNING "TSC: PIT calibration deviates from %s: %lu %lu.\n", - hpet ? "HPET" : "PMTIMER", tsc_pit_min, tsc_ref_min); - printk(KERN_INFO "TSC: Using PIT calibration value\n"); + pr_warn("PIT calibration deviates from %s: %lu %lu\n", + hpet ? "HPET" : "PMTIMER", tsc_pit_min, tsc_ref_min); + pr_info("Using PIT calibration value\n"); return tsc_pit_min; } @@ -785,7 +783,7 @@ void mark_tsc_unstable(char *reason) tsc_unstable = 1; sched_clock_stable = 0; disable_sched_clock_irqtime(); - printk(KERN_INFO "Marking TSC unstable due to %s\n", reason); + pr_info("Marking TSC unstable due to %s\n", reason); /* Change only the rating, when not registered */ if (clocksource_tsc.mult) clocksource_mark_unstable(&clocksource_tsc); @@ -912,9 +910,9 @@ static void tsc_refine_calibration_work(struct work_struct *work) goto out; tsc_khz = freq; - printk(KERN_INFO "Refined TSC clocksource calibration: " - "%lu.%03lu MHz.\n", (unsigned long)tsc_khz / 1000, - (unsigned long)tsc_khz % 1000); + pr_info("Refined TSC clocksource calibration: %lu.%03lu MHz\n", + (unsigned long)tsc_khz / 1000, + (unsigned long)tsc_khz % 1000); out: clocksource_register_khz(&clocksource_tsc, tsc_khz); @@ -970,9 +968,9 @@ void __init tsc_init(void) return; } - printk("Detected %lu.%03lu MHz processor.\n", - (unsigned long)cpu_khz / 1000, - (unsigned long)cpu_khz % 1000); + pr_info("Detected %lu.%03lu MHz processor\n", + (unsigned long)cpu_khz / 1000, + (unsigned long)cpu_khz % 1000); /* * Secondary CPUs do not run through tsc_init(), so set up diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index dc4e910a7d96..36fd42091fa7 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -409,9 +409,10 @@ static int validate_insn_bits(struct arch_uprobe *auprobe, struct mm_struct *mm, * arch_uprobe_analyze_insn - instruction analysis including validity and fixups. * @mm: the probed address space. * @arch_uprobe: the probepoint information. + * @addr: virtual address at which to install the probepoint * Return 0 on success or a -ve number on error. 
*/ -int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm) +int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long addr) { int ret; struct insn insn; diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index 255f58ae71e8..54abcc0baf23 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -28,6 +28,8 @@ * */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/capability.h> #include <linux/errno.h> #include <linux/interrupt.h> @@ -137,14 +139,14 @@ struct pt_regs *save_v86_state(struct kernel_vm86_regs *regs) local_irq_enable(); if (!current->thread.vm86_info) { - printk("no vm86_info: BAD\n"); + pr_alert("no vm86_info: BAD\n"); do_exit(SIGSEGV); } set_flags(regs->pt.flags, VEFLAGS, X86_EFLAGS_VIF | current->thread.v86mask); tmp = copy_vm86_regs_to_user(&current->thread.vm86_info->regs, regs); tmp += put_user(current->thread.screen_bitmap, &current->thread.vm86_info->screen_bitmap); if (tmp) { - printk("vm86: could not access userspace vm86_info\n"); + pr_alert("could not access userspace vm86_info\n"); do_exit(SIGSEGV); } diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c index 8eeb55a551b4..992f890283e9 100644 --- a/arch/x86/kernel/vsmp_64.c +++ b/arch/x86/kernel/vsmp_64.c @@ -16,6 +16,7 @@ #include <linux/pci_ids.h> #include <linux/pci_regs.h> #include <linux/smp.h> +#include <linux/irq.h> #include <asm/apic.h> #include <asm/pci-direct.h> @@ -95,6 +96,18 @@ static void __init set_vsmp_pv_ops(void) ctl = readl(address + 4); printk(KERN_INFO "vSMP CTL: capabilities:0x%08x control:0x%08x\n", cap, ctl); + + /* If possible, let the vSMP foundation route the interrupt optimally */ +#ifdef CONFIG_SMP + if (cap & ctl & BIT(8)) { + ctl &= ~BIT(8); +#ifdef CONFIG_PROC_FS + /* Don't let users change irq affinity via procfs */ + no_irq_affinity = 1; +#endif + } +#endif + if (cap & ctl & (1 << 4)) { /* Setup irq ops and turn on vSMP IRQ fastpath handling */ pv_irq_ops.irq_disable = PV_CALLEE_SAVE(vsmp_irq_disable); @@ -102,12 +115,11 @@ static void __init set_vsmp_pv_ops(void) pv_irq_ops.save_fl = PV_CALLEE_SAVE(vsmp_save_fl); pv_irq_ops.restore_fl = PV_CALLEE_SAVE(vsmp_restore_fl); pv_init_ops.patch = vsmp_patch; - ctl &= ~(1 << 4); - writel(ctl, address + 4); - ctl = readl(address + 4); - printk(KERN_INFO "vSMP CTL: control set to:0x%08x\n", ctl); } + writel(ctl, address + 4); + ctl = readl(address + 4); + pr_info("vSMP CTL: control set to:0x%08x\n", ctl); early_iounmap(address, 8); } @@ -187,12 +199,36 @@ static void __init vsmp_cap_cpus(void) #endif } +static int apicid_phys_pkg_id(int initial_apic_id, int index_msb) +{ + return hard_smp_processor_id() >> index_msb; +} + +/* + * In vSMP, all cpus should be capable of handling interrupts, regardless of + * the APIC used. + */ +static void fill_vector_allocation_domain(int cpu, struct cpumask *retmask, + const struct cpumask *mask) +{ + cpumask_setall(retmask); +} + +static void vsmp_apic_post_init(void) +{ + /* need to update phys_pkg_id */ + apic->phys_pkg_id = apicid_phys_pkg_id; + apic->vector_allocation_domain = fill_vector_allocation_domain; +} + void __init vsmp_init(void) { detect_vsmp_box(); if (!is_vsmp_box()) return; + x86_platform.apic_post_init = vsmp_apic_post_init; + vsmp_cap_cpus(); set_vsmp_pv_ops(); diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 7515cf0e1805..8d141b309046 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -18,6 +18,8 @@ * use the vDSO.
*/ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/time.h> #include <linux/init.h> #include <linux/kernel.h> @@ -111,18 +113,13 @@ void update_vsyscall(struct timespec *wall_time, struct timespec *wtm, static void warn_bad_vsyscall(const char *level, struct pt_regs *regs, const char *message) { - static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); - struct task_struct *tsk; - - if (!show_unhandled_signals || !__ratelimit(&rs)) + if (!show_unhandled_signals) return; - tsk = current; - - printk("%s%s[%d] %s ip:%lx cs:%lx sp:%lx ax:%lx si:%lx di:%lx\n", - level, tsk->comm, task_pid_nr(tsk), - message, regs->ip, regs->cs, - regs->sp, regs->ax, regs->si, regs->di); + pr_notice_ratelimited("%s%s[%d] %s ip:%lx cs:%lx sp:%lx ax:%lx si:%lx di:%lx\n", + level, current->comm, task_pid_nr(current), + message, regs->ip, regs->cs, + regs->sp, regs->ax, regs->si, regs->di); } static int addr_to_vsyscall_nr(unsigned long addr) @@ -139,6 +136,19 @@ static int addr_to_vsyscall_nr(unsigned long addr) return nr; } +#ifdef CONFIG_SECCOMP +static int vsyscall_seccomp(struct task_struct *tsk, int syscall_nr) +{ + if (!seccomp_mode(&tsk->seccomp)) + return 0; + task_pt_regs(tsk)->orig_ax = syscall_nr; + task_pt_regs(tsk)->ax = syscall_nr; + return __secure_computing(syscall_nr); +} +#else +#define vsyscall_seccomp(_tsk, _nr) 0 +#endif + static bool write_ok_or_segv(unsigned long ptr, size_t size) { /* @@ -174,6 +184,7 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) int vsyscall_nr; int prev_sig_on_uaccess_error; long ret; + int skip; /* * No point in checking CS -- the only way to get here is a user mode @@ -205,9 +216,6 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) } tsk = current; - if (seccomp_mode(&tsk->seccomp)) - do_exit(SIGKILL); - /* * With a real vsyscall, page faults cause SIGSEGV. We want to * preserve that behavior to make writing exploits harder. @@ -222,8 +230,13 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) * address 0". */ ret = -EFAULT; + skip = 0; switch (vsyscall_nr) { case 0: + skip = vsyscall_seccomp(tsk, __NR_gettimeofday); + if (skip) + break; + if (!write_ok_or_segv(regs->di, sizeof(struct timeval)) || !write_ok_or_segv(regs->si, sizeof(struct timezone))) break; @@ -234,6 +247,10 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) break; case 1: + skip = vsyscall_seccomp(tsk, __NR_time); + if (skip) + break; + if (!write_ok_or_segv(regs->di, sizeof(time_t))) break; @@ -241,6 +258,10 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) break; case 2: + skip = vsyscall_seccomp(tsk, __NR_getcpu); + if (skip) + break; + if (!write_ok_or_segv(regs->di, sizeof(unsigned)) || !write_ok_or_segv(regs->si, sizeof(unsigned))) break; @@ -253,6 +274,12 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) current_thread_info()->sig_on_uaccess_error = prev_sig_on_uaccess_error; + if (skip) { + if ((long)regs->ax <= 0L) /* seccomp errno emulation */ + goto do_ret; + goto done; /* seccomp trace/trap */ + } + if (ret == -EFAULT) { /* Bad news -- userspace fed a bad pointer to a vsyscall. */ warn_bad_vsyscall(KERN_INFO, regs, @@ -271,10 +298,11 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) regs->ax = ret; +do_ret: /* Emulate a ret instruction. 
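For orientation while reading the vsyscall_nr switch above: the three legacy vsyscalls sit 1024 bytes apart in one fixed page, so addr_to_vsyscall_nr() (whose body is not quoted in this hunk) amounts to checking the page and reading bits 11:10 of the faulting address. A user-space sketch of that mapping (the page address is the conventional value, assumed here):

#include <stdio.h>

#define VSYSCALL_START 0xffffffffff600000ULL    /* legacy vsyscall page (assumed constant) */

static int addr_to_nr(unsigned long long addr)
{
        int nr = (addr & 0xC00) >> 10;

        if ((addr & ~0xC00ULL) != VSYSCALL_START || nr >= 3)
                return -1;                      /* anything else takes the SIGSEGV path */
        return nr;
}

int main(void)
{
        printf("%d %d %d %d\n",
               addr_to_nr(VSYSCALL_START),              /* 0 -> __NR_gettimeofday */
               addr_to_nr(VSYSCALL_START + 0x400),      /* 1 -> __NR_time */
               addr_to_nr(VSYSCALL_START + 0x800),      /* 2 -> __NR_getcpu */
               addr_to_nr(VSYSCALL_START + 0x123));     /* misaligned: rejected */
        return 0;
}

With seccomp in the picture, the chosen __NR_* value is written into orig_ax/ax before __secure_computing() runs, so a filter sees the emulated call just like a normal syscall entry.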
*/ regs->ip = caller; regs->sp += 8; - +done: return true; sigsegv: diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c index 9796c2f3d074..6020f6f5927c 100644 --- a/arch/x86/kernel/x8664_ksyms_64.c +++ b/arch/x86/kernel/x8664_ksyms_64.c @@ -28,6 +28,7 @@ EXPORT_SYMBOL(__put_user_8); EXPORT_SYMBOL(copy_user_generic_string); EXPORT_SYMBOL(copy_user_generic_unrolled); +EXPORT_SYMBOL(copy_user_enhanced_fast_string); EXPORT_SYMBOL(__copy_user_nocache); EXPORT_SYMBOL(_copy_from_user); EXPORT_SYMBOL(_copy_to_user); diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 35c5e543f550..9f3167e891ef 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -29,7 +29,6 @@ void __init x86_init_uint_noop(unsigned int unused) { } void __init x86_init_pgd_noop(pgd_t *unused) { } int __init iommu_init_noop(void) { return 0; } void iommu_shutdown_noop(void) { } -void wallclock_init_noop(void) { } /* * The platform setup functions are preset with the default functions @@ -101,7 +100,6 @@ static int default_i8042_detect(void) { return 1; }; struct x86_platform_ops x86_platform = { .calibrate_tsc = native_calibrate_tsc, - .wallclock_init = wallclock_init_noop, .get_wallclock = mach_get_cmos_time, .set_wallclock = mach_set_rtc_mmss, .iommu_shutdown = iommu_shutdown_noop, diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index bd18149b2b0f..3d3e20709119 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -3,6 +3,9 @@ * * Author: Suresh Siddha <suresh.b.siddha@intel.com> */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/bootmem.h> #include <linux/compat.h> #include <asm/i387.h> @@ -162,7 +165,7 @@ int save_i387_xstate(void __user *buf) BUG_ON(sig_xstate_size < xstate_size); if ((unsigned long)buf % 64) - printk("save_i387_xstate: bad fpstate %p\n", buf); + pr_err("%s: bad fpstate %p\n", __func__, buf); if (!used_math()) return 0; @@ -422,7 +425,7 @@ static void __init xstate_enable_boot_cpu(void) pcntxt_mask = eax + ((u64)edx << 32); if ((pcntxt_mask & XSTATE_FPSSE) != XSTATE_FPSSE) { - printk(KERN_ERR "FP/SSE not shown under xsave features 0x%llx\n", + pr_err("FP/SSE not shown under xsave features 0x%llx\n", pcntxt_mask); BUG(); } @@ -445,9 +448,8 @@ static void __init xstate_enable_boot_cpu(void) setup_xstate_init(); - printk(KERN_INFO "xsave/xrstor: enabled xstate_bv 0x%llx, " - "cntxt size 0x%x\n", - pcntxt_mask, xstate_size); + pr_info("enabled xstate_bv 0x%llx, cntxt size 0x%x\n", + pcntxt_mask, xstate_size); } /* diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 7df1c6d839fb..0595f1397b7c 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -201,6 +201,7 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, unsigned f_lm = 0; #endif unsigned f_rdtscp = kvm_x86_ops->rdtscp_supported() ? F(RDTSCP) : 0; + unsigned f_invpcid = kvm_x86_ops->invpcid_supported() ? 
F(INVPCID) : 0; /* cpuid 1.edx */ const u32 kvm_supported_word0_x86_features = @@ -228,7 +229,7 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, 0 /* DS-CPL, VMX, SMX, EST */ | 0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ | F(FMA) | F(CX16) | 0 /* xTPR Update, PDCM */ | - 0 /* Reserved, DCA */ | F(XMM4_1) | + F(PCID) | 0 /* Reserved, DCA */ | F(XMM4_1) | F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) | 0 /* Reserved*/ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) | F(F16C) | F(RDRAND); @@ -248,7 +249,7 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, /* cpuid 7.0.ebx */ const u32 kvm_supported_word9_x86_features = F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) | - F(BMI2) | F(ERMS) | F(RTM); + F(BMI2) | F(ERMS) | f_invpcid | F(RTM); /* all calls to cpuid_count() should be made on the same cpu */ get_cpu(); @@ -409,6 +410,7 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, (1 << KVM_FEATURE_NOP_IO_DELAY) | (1 << KVM_FEATURE_CLOCKSOURCE2) | (1 << KVM_FEATURE_ASYNC_PF) | + (1 << KVM_FEATURE_PV_EOI) | (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT); if (sched_info_on()) @@ -639,33 +641,37 @@ static struct kvm_cpuid_entry2* check_cpuid_limit(struct kvm_vcpu *vcpu, return kvm_find_cpuid_entry(vcpu, maxlevel->eax, index); } -void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) +void kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx) { - u32 function, index; + u32 function = *eax, index = *ecx; struct kvm_cpuid_entry2 *best; - function = kvm_register_read(vcpu, VCPU_REGS_RAX); - index = kvm_register_read(vcpu, VCPU_REGS_RCX); - kvm_register_write(vcpu, VCPU_REGS_RAX, 0); - kvm_register_write(vcpu, VCPU_REGS_RBX, 0); - kvm_register_write(vcpu, VCPU_REGS_RCX, 0); - kvm_register_write(vcpu, VCPU_REGS_RDX, 0); best = kvm_find_cpuid_entry(vcpu, function, index); if (!best) best = check_cpuid_limit(vcpu, function, index); if (best) { - kvm_register_write(vcpu, VCPU_REGS_RAX, best->eax); - kvm_register_write(vcpu, VCPU_REGS_RBX, best->ebx); - kvm_register_write(vcpu, VCPU_REGS_RCX, best->ecx); - kvm_register_write(vcpu, VCPU_REGS_RDX, best->edx); - } + *eax = best->eax; + *ebx = best->ebx; + *ecx = best->ecx; + *edx = best->edx; + } else + *eax = *ebx = *ecx = *edx = 0; +} + +void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) +{ + u32 function, eax, ebx, ecx, edx; + + function = eax = kvm_register_read(vcpu, VCPU_REGS_RAX); + ecx = kvm_register_read(vcpu, VCPU_REGS_RCX); + kvm_cpuid(vcpu, &eax, &ebx, &ecx, &edx); + kvm_register_write(vcpu, VCPU_REGS_RAX, eax); + kvm_register_write(vcpu, VCPU_REGS_RBX, ebx); + kvm_register_write(vcpu, VCPU_REGS_RCX, ecx); + kvm_register_write(vcpu, VCPU_REGS_RDX, edx); kvm_x86_ops->skip_emulated_instruction(vcpu); - trace_kvm_cpuid(function, - kvm_register_read(vcpu, VCPU_REGS_RAX), - kvm_register_read(vcpu, VCPU_REGS_RBX), - kvm_register_read(vcpu, VCPU_REGS_RCX), - kvm_register_read(vcpu, VCPU_REGS_RDX)); + trace_kvm_cpuid(function, eax, ebx, ecx, edx); } EXPORT_SYMBOL_GPL(kvm_emulate_cpuid); diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index 26d1fb437eb5..a10e46016851 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -17,6 +17,7 @@ int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu, int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, struct kvm_cpuid_entry2 __user *entries); +void kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx); static inline bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu) @@ -51,4 
+52,12 @@ static inline bool guest_cpuid_has_osvw(struct kvm_vcpu *vcpu) return best && (best->ecx & bit(X86_FEATURE_OSVW)); } +static inline bool guest_cpuid_has_pcid(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 1, 0); + return best && (best->ecx & bit(X86_FEATURE_PCID)); +} + #endif diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index f95d242ee9f7..97d9a9914ba8 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -433,11 +433,32 @@ static int emulator_check_intercept(struct x86_emulate_ctxt *ctxt, return ctxt->ops->intercept(ctxt, &info, stage); } +static void assign_masked(ulong *dest, ulong src, ulong mask) +{ + *dest = (*dest & ~mask) | (src & mask); +} + static inline unsigned long ad_mask(struct x86_emulate_ctxt *ctxt) { return (1UL << (ctxt->ad_bytes << 3)) - 1; } +static ulong stack_mask(struct x86_emulate_ctxt *ctxt) +{ + u16 sel; + struct desc_struct ss; + + if (ctxt->mode == X86EMUL_MODE_PROT64) + return ~0UL; + ctxt->ops->get_segment(ctxt, &sel, &ss, NULL, VCPU_SREG_SS); + return ~0U >> ((ss.d ^ 1) * 16); /* d=0: 0xffff; d=1: 0xffffffff */ +} + +static int stack_size(struct x86_emulate_ctxt *ctxt) +{ + return (__fls(stack_mask(ctxt)) + 1) >> 3; +} + /* Access/update address held in a register, based on addressing mode. */ static inline unsigned long address_mask(struct x86_emulate_ctxt *ctxt, unsigned long reg) @@ -958,6 +979,12 @@ static void decode_register_operand(struct x86_emulate_ctxt *ctxt, op->orig_val = op->val; } +static void adjust_modrm_seg(struct x86_emulate_ctxt *ctxt, int base_reg) +{ + if (base_reg == VCPU_REGS_RSP || base_reg == VCPU_REGS_RBP) + ctxt->modrm_seg = VCPU_SREG_SS; +} + static int decode_modrm(struct x86_emulate_ctxt *ctxt, struct operand *op) { @@ -1061,15 +1088,20 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, if ((base_reg & 7) == 5 && ctxt->modrm_mod == 0) modrm_ea += insn_fetch(s32, ctxt); - else + else { modrm_ea += ctxt->regs[base_reg]; + adjust_modrm_seg(ctxt, base_reg); + } if (index_reg != 4) modrm_ea += ctxt->regs[index_reg] << scale; } else if ((ctxt->modrm_rm & 7) == 5 && ctxt->modrm_mod == 0) { if (ctxt->mode == X86EMUL_MODE_PROT64) ctxt->rip_relative = 1; - } else - modrm_ea += ctxt->regs[ctxt->modrm_rm]; + } else { + base_reg = ctxt->modrm_rm; + modrm_ea += ctxt->regs[base_reg]; + adjust_modrm_seg(ctxt, base_reg); + } switch (ctxt->modrm_mod) { case 0: if (ctxt->modrm_rm == 5) @@ -1264,7 +1296,8 @@ static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt, /* allowed just for 8 bytes segments */ static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt, - u16 selector, struct desc_struct *desc) + u16 selector, struct desc_struct *desc, + ulong *desc_addr_p) { struct desc_ptr dt; u16 index = selector >> 3; @@ -1275,7 +1308,7 @@ static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt, if (dt.size < index * 8 + 7) return emulate_gp(ctxt, selector & 0xfffc); - addr = dt.address + index * 8; + *desc_addr_p = addr = dt.address + index * 8; return ctxt->ops->read_std(ctxt, addr, desc, sizeof *desc, &ctxt->exception); } @@ -1302,11 +1335,12 @@ static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt, static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, u16 selector, int seg) { - struct desc_struct seg_desc; + struct desc_struct seg_desc, old_desc; u8 dpl, rpl, cpl; unsigned err_vec = GP_VECTOR; u32 err_code = 0; bool null_selector = !(selector & ~0x3); /* 0000-0003 are null */ + ulong desc_addr; int ret; 
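stack_mask() and assign_masked() above encode the rule that a 16-bit stack segment (ss.d clear) confines stack-pointer updates to the low word while the upper bits are preserved; ~0U >> ((d ^ 1) * 16) yields 0xffff or 0xffffffff accordingly. A quick user-space check of that arithmetic (values invented):

#include <stdio.h>

static unsigned long assign_masked(unsigned long dest, unsigned long src,
                                   unsigned long mask)
{
        return (dest & ~mask) | (src & mask);   /* update only the masked bits */
}

int main(void)
{
        int ss_d = 0;                                   /* 0: 16-bit stack, 1: 32-bit stack */
        unsigned long mask = ~0U >> ((ss_d ^ 1) * 16);  /* 0xffff or 0xffffffff */
        unsigned long rsp = 0x12340100;

        rsp = assign_masked(rsp, rsp - 0x200, mask);    /* "push" 0x200 bytes */
        printf("mask %#lx, new rsp %#lx\n", mask, rsp); /* -> mask 0xffff, new rsp 0x1234ff00 */
        return 0;
}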
memset(&seg_desc, 0, sizeof seg_desc); @@ -1324,8 +1358,14 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, goto load; } - /* NULL selector is not valid for TR, CS and SS */ - if ((seg == VCPU_SREG_CS || seg == VCPU_SREG_SS || seg == VCPU_SREG_TR) + rpl = selector & 3; + cpl = ctxt->ops->cpl(ctxt); + + /* NULL selector is not valid for TR, CS and SS (except for long mode) */ + if ((seg == VCPU_SREG_CS + || (seg == VCPU_SREG_SS + && (ctxt->mode != X86EMUL_MODE_PROT64 || rpl != cpl)) + || seg == VCPU_SREG_TR) && null_selector) goto exception; @@ -1336,7 +1376,7 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, if (null_selector) /* for NULL selector skip all following checks */ goto load; - ret = read_segment_descriptor(ctxt, selector, &seg_desc); + ret = read_segment_descriptor(ctxt, selector, &seg_desc, &desc_addr); if (ret != X86EMUL_CONTINUE) return ret; @@ -1352,9 +1392,7 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, goto exception; } - rpl = selector & 3; dpl = seg_desc.dpl; - cpl = ctxt->ops->cpl(ctxt); switch (seg) { case VCPU_SREG_SS: @@ -1384,6 +1422,12 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, case VCPU_SREG_TR: if (seg_desc.s || (seg_desc.type != 1 && seg_desc.type != 9)) goto exception; + old_desc = seg_desc; + seg_desc.type |= 2; /* busy */ + ret = ctxt->ops->cmpxchg_emulated(ctxt, desc_addr, &old_desc, &seg_desc, + sizeof(seg_desc), &ctxt->exception); + if (ret != X86EMUL_CONTINUE) + return ret; break; case VCPU_SREG_LDTR: if (seg_desc.s || seg_desc.type != 2) @@ -1474,17 +1518,22 @@ static int writeback(struct x86_emulate_ctxt *ctxt) return X86EMUL_CONTINUE; } -static int em_push(struct x86_emulate_ctxt *ctxt) +static int push(struct x86_emulate_ctxt *ctxt, void *data, int bytes) { struct segmented_address addr; - register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RSP], -ctxt->op_bytes); + register_address_increment(ctxt, &ctxt->regs[VCPU_REGS_RSP], -bytes); addr.ea = register_address(ctxt, ctxt->regs[VCPU_REGS_RSP]); addr.seg = VCPU_SREG_SS; + return segmented_write(ctxt, addr, data, bytes); +} + +static int em_push(struct x86_emulate_ctxt *ctxt) +{ /* Disable writeback. 
*/ ctxt->dst.type = OP_NONE; - return segmented_write(ctxt, addr, &ctxt->src.val, ctxt->op_bytes); + return push(ctxt, &ctxt->src.val, ctxt->op_bytes); } static int emulate_pop(struct x86_emulate_ctxt *ctxt, @@ -1556,6 +1605,33 @@ static int em_popf(struct x86_emulate_ctxt *ctxt) return emulate_popf(ctxt, &ctxt->dst.val, ctxt->op_bytes); } +static int em_enter(struct x86_emulate_ctxt *ctxt) +{ + int rc; + unsigned frame_size = ctxt->src.val; + unsigned nesting_level = ctxt->src2.val & 31; + + if (nesting_level) + return X86EMUL_UNHANDLEABLE; + + rc = push(ctxt, &ctxt->regs[VCPU_REGS_RBP], stack_size(ctxt)); + if (rc != X86EMUL_CONTINUE) + return rc; + assign_masked(&ctxt->regs[VCPU_REGS_RBP], ctxt->regs[VCPU_REGS_RSP], + stack_mask(ctxt)); + assign_masked(&ctxt->regs[VCPU_REGS_RSP], + ctxt->regs[VCPU_REGS_RSP] - frame_size, + stack_mask(ctxt)); + return X86EMUL_CONTINUE; +} + +static int em_leave(struct x86_emulate_ctxt *ctxt) +{ + assign_masked(&ctxt->regs[VCPU_REGS_RSP], ctxt->regs[VCPU_REGS_RBP], + stack_mask(ctxt)); + return emulate_pop(ctxt, &ctxt->regs[VCPU_REGS_RBP], ctxt->op_bytes); +} + static int em_push_sreg(struct x86_emulate_ctxt *ctxt) { int seg = ctxt->src2.val; @@ -1993,8 +2069,8 @@ static bool vendor_intel(struct x86_emulate_ctxt *ctxt) u32 eax, ebx, ecx, edx; eax = ecx = 0; - return ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx) - && ebx == X86EMUL_CPUID_VENDOR_GenuineIntel_ebx + ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx); + return ebx == X86EMUL_CPUID_VENDOR_GenuineIntel_ebx && ecx == X86EMUL_CPUID_VENDOR_GenuineIntel_ecx && edx == X86EMUL_CPUID_VENDOR_GenuineIntel_edx; } @@ -2013,32 +2089,31 @@ static bool em_syscall_is_enabled(struct x86_emulate_ctxt *ctxt) eax = 0x00000000; ecx = 0x00000000; - if (ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx)) { - /* - * Intel ("GenuineIntel") - * remark: Intel CPUs only support "syscall" in 64bit - * longmode. Also an 64bit guest with a - * 32bit compat-app running will #UD !! While this - * behaviour can be fixed (by emulating) into AMD - * response - CPUs of AMD can't behave like Intel. - */ - if (ebx == X86EMUL_CPUID_VENDOR_GenuineIntel_ebx && - ecx == X86EMUL_CPUID_VENDOR_GenuineIntel_ecx && - edx == X86EMUL_CPUID_VENDOR_GenuineIntel_edx) - return false; + ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx); + /* + * Intel ("GenuineIntel") + * remark: Intel CPUs only support "syscall" in 64bit + * longmode. Also an 64bit guest with a + * 32bit compat-app running will #UD !! While this + * behaviour can be fixed (by emulating) into AMD + * response - CPUs of AMD can't behave like Intel. 
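em_enter() and em_leave() above implement only the nesting-level-0 form of the instructions: ENTER is push rBP; rBP = rSP; rSP -= frame_size, and LEAVE is rSP = rBP; pop rBP, with every update masked to the stack width from stack_mask(). A toy index-based model of that pairing (not the emulator's real data structures):

#include <stdio.h>

static unsigned stack[8];
static unsigned rsp = 8, rbp;           /* indices into stack[], not byte addresses */

static void enter(unsigned frame_slots) /* ENTER frame_size, 0 */
{
        stack[--rsp] = rbp;             /* push the old frame pointer */
        rbp = rsp;                      /* the new frame starts here */
        rsp -= frame_slots;             /* carve out the locals */
}

static void leave(void)                 /* LEAVE */
{
        rsp = rbp;                      /* drop the locals */
        rbp = stack[rsp++];             /* pop the saved frame pointer */
}

int main(void)
{
        enter(3);
        printf("after enter: rsp=%u rbp=%u\n", rsp, rbp);       /* rsp=4 rbp=7 */
        leave();
        printf("after leave: rsp=%u rbp=%u\n", rsp, rbp);       /* rsp=8 rbp=0 */
        return 0;
}

Nonzero nesting levels are simply rejected (X86EMUL_UNHANDLEABLE), so the emulator never has to copy earlier frame pointers.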
+ */ + if (ebx == X86EMUL_CPUID_VENDOR_GenuineIntel_ebx && + ecx == X86EMUL_CPUID_VENDOR_GenuineIntel_ecx && + edx == X86EMUL_CPUID_VENDOR_GenuineIntel_edx) + return false; - /* AMD ("AuthenticAMD") */ - if (ebx == X86EMUL_CPUID_VENDOR_AuthenticAMD_ebx && - ecx == X86EMUL_CPUID_VENDOR_AuthenticAMD_ecx && - edx == X86EMUL_CPUID_VENDOR_AuthenticAMD_edx) - return true; - - /* AMD ("AMDisbetter!") */ - if (ebx == X86EMUL_CPUID_VENDOR_AMDisbetterI_ebx && - ecx == X86EMUL_CPUID_VENDOR_AMDisbetterI_ecx && - edx == X86EMUL_CPUID_VENDOR_AMDisbetterI_edx) - return true; - } + /* AMD ("AuthenticAMD") */ + if (ebx == X86EMUL_CPUID_VENDOR_AuthenticAMD_ebx && + ecx == X86EMUL_CPUID_VENDOR_AuthenticAMD_ecx && + edx == X86EMUL_CPUID_VENDOR_AuthenticAMD_edx) + return true; + + /* AMD ("AMDisbetter!") */ + if (ebx == X86EMUL_CPUID_VENDOR_AMDisbetterI_ebx && + ecx == X86EMUL_CPUID_VENDOR_AMDisbetterI_ecx && + edx == X86EMUL_CPUID_VENDOR_AMDisbetterI_edx) + return true; /* default: (not Intel, not AMD), apply Intel's stricter rules... */ return false; @@ -2547,13 +2622,14 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, ulong old_tss_base = ops->get_cached_segment_base(ctxt, VCPU_SREG_TR); u32 desc_limit; + ulong desc_addr; /* FIXME: old_tss_base == ~0 ? */ - ret = read_segment_descriptor(ctxt, tss_selector, &next_tss_desc); + ret = read_segment_descriptor(ctxt, tss_selector, &next_tss_desc, &desc_addr); if (ret != X86EMUL_CONTINUE) return ret; - ret = read_segment_descriptor(ctxt, old_tss_sel, &curr_tss_desc); + ret = read_segment_descriptor(ctxt, old_tss_sel, &curr_tss_desc, &desc_addr); if (ret != X86EMUL_CONTINUE) return ret; @@ -2948,6 +3024,24 @@ static int em_mov_sreg_rm(struct x86_emulate_ctxt *ctxt) return load_segment_descriptor(ctxt, sel, ctxt->modrm_reg); } +static int em_lldt(struct x86_emulate_ctxt *ctxt) +{ + u16 sel = ctxt->src.val; + + /* Disable writeback. */ + ctxt->dst.type = OP_NONE; + return load_segment_descriptor(ctxt, sel, VCPU_SREG_LDTR); +} + +static int em_ltr(struct x86_emulate_ctxt *ctxt) +{ + u16 sel = ctxt->src.val; + + /* Disable writeback. */ + ctxt->dst.type = OP_NONE; + return load_segment_descriptor(ctxt, sel, VCPU_SREG_TR); +} + static int em_invlpg(struct x86_emulate_ctxt *ctxt) { int rc; @@ -2989,11 +3083,42 @@ static int em_vmcall(struct x86_emulate_ctxt *ctxt) return X86EMUL_CONTINUE; } +static int emulate_store_desc_ptr(struct x86_emulate_ctxt *ctxt, + void (*get)(struct x86_emulate_ctxt *ctxt, + struct desc_ptr *ptr)) +{ + struct desc_ptr desc_ptr; + + if (ctxt->mode == X86EMUL_MODE_PROT64) + ctxt->op_bytes = 8; + get(ctxt, &desc_ptr); + if (ctxt->op_bytes == 2) { + ctxt->op_bytes = 4; + desc_ptr.address &= 0x00ffffff; + } + /* Disable writeback. 
*/ + ctxt->dst.type = OP_NONE; + return segmented_write(ctxt, ctxt->dst.addr.mem, + &desc_ptr, 2 + ctxt->op_bytes); +} + +static int em_sgdt(struct x86_emulate_ctxt *ctxt) +{ + return emulate_store_desc_ptr(ctxt, ctxt->ops->get_gdt); +} + +static int em_sidt(struct x86_emulate_ctxt *ctxt) +{ + return emulate_store_desc_ptr(ctxt, ctxt->ops->get_idt); +} + static int em_lgdt(struct x86_emulate_ctxt *ctxt) { struct desc_ptr desc_ptr; int rc; + if (ctxt->mode == X86EMUL_MODE_PROT64) + ctxt->op_bytes = 8; rc = read_descriptor(ctxt, ctxt->src.addr.mem, &desc_ptr.size, &desc_ptr.address, ctxt->op_bytes); @@ -3021,6 +3146,8 @@ static int em_lidt(struct x86_emulate_ctxt *ctxt) struct desc_ptr desc_ptr; int rc; + if (ctxt->mode == X86EMUL_MODE_PROT64) + ctxt->op_bytes = 8; rc = read_descriptor(ctxt, ctxt->src.addr.mem, &desc_ptr.size, &desc_ptr.address, ctxt->op_bytes); @@ -3143,6 +3270,42 @@ static int em_bsr(struct x86_emulate_ctxt *ctxt) return X86EMUL_CONTINUE; } +static int em_cpuid(struct x86_emulate_ctxt *ctxt) +{ + u32 eax, ebx, ecx, edx; + + eax = ctxt->regs[VCPU_REGS_RAX]; + ecx = ctxt->regs[VCPU_REGS_RCX]; + ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx); + ctxt->regs[VCPU_REGS_RAX] = eax; + ctxt->regs[VCPU_REGS_RBX] = ebx; + ctxt->regs[VCPU_REGS_RCX] = ecx; + ctxt->regs[VCPU_REGS_RDX] = edx; + return X86EMUL_CONTINUE; +} + +static int em_lahf(struct x86_emulate_ctxt *ctxt) +{ + ctxt->regs[VCPU_REGS_RAX] &= ~0xff00UL; + ctxt->regs[VCPU_REGS_RAX] |= (ctxt->eflags & 0xff) << 8; + return X86EMUL_CONTINUE; +} + +static int em_bswap(struct x86_emulate_ctxt *ctxt) +{ + switch (ctxt->op_bytes) { +#ifdef CONFIG_X86_64 + case 8: + asm("bswap %0" : "+r"(ctxt->dst.val)); + break; +#endif + default: + asm("bswap %0" : "+r"(*(u32 *)&ctxt->dst.val)); + break; + } + return X86EMUL_CONTINUE; +} + static bool valid_cr(int nr) { switch (nr) { @@ -3424,14 +3587,14 @@ static struct opcode group5[] = { static struct opcode group6[] = { DI(Prot, sldt), DI(Prot, str), - DI(Prot | Priv, lldt), - DI(Prot | Priv, ltr), + II(Prot | Priv | SrcMem16, em_lldt, lldt), + II(Prot | Priv | SrcMem16, em_ltr, ltr), N, N, N, N, }; static struct group_dual group7 = { { - DI(Mov | DstMem | Priv, sgdt), - DI(Mov | DstMem | Priv, sidt), + II(Mov | DstMem | Priv, em_sgdt, sgdt), + II(Mov | DstMem | Priv, em_sidt, sidt), II(SrcMem | Priv, em_lgdt, lgdt), II(SrcMem | Priv, em_lidt, lidt), II(SrcNone | DstMem | Mov, em_smsw, smsw), N, @@ -3538,7 +3701,7 @@ static struct opcode opcode_table[256] = { D(DstAcc | SrcNone), I(ImplicitOps | SrcAcc, em_cwd), I(SrcImmFAddr | No64, em_call_far), N, II(ImplicitOps | Stack, em_pushf, pushf), - II(ImplicitOps | Stack, em_popf, popf), N, N, + II(ImplicitOps | Stack, em_popf, popf), N, I(ImplicitOps, em_lahf), /* 0xA0 - 0xA7 */ I2bv(DstAcc | SrcMem | Mov | MemAbs, em_mov), I2bv(DstMem | SrcAcc | Mov | MemAbs | PageTable, em_mov), @@ -3561,7 +3724,8 @@ static struct opcode opcode_table[256] = { I(DstReg | SrcMemFAddr | ModRM | No64 | Src2DS, em_lseg), G(ByteOp, group11), G(0, group11), /* 0xC8 - 0xCF */ - N, N, N, I(ImplicitOps | Stack, em_ret_far), + I(Stack | SrcImmU16 | Src2ImmByte, em_enter), I(Stack, em_leave), + N, I(ImplicitOps | Stack, em_ret_far), D(ImplicitOps), DI(SrcImmByte, intn), D(ImplicitOps | No64), II(ImplicitOps, em_iret, iret), /* 0xD0 - 0xD7 */ @@ -3635,7 +3799,7 @@ static struct opcode twobyte_table[256] = { X16(D(ByteOp | DstMem | SrcNone | ModRM| Mov)), /* 0xA0 - 0xA7 */ I(Stack | Src2FS, em_push_sreg), I(Stack | Src2FS, em_pop_sreg), - DI(ImplicitOps, cpuid), I(DstMem 
| SrcReg | ModRM | BitOp, em_bt), + II(ImplicitOps, em_cpuid, cpuid), I(DstMem | SrcReg | ModRM | BitOp, em_bt), D(DstMem | SrcReg | Src2ImmByte | ModRM), D(DstMem | SrcReg | Src2CL | ModRM), N, N, /* 0xA8 - 0xAF */ @@ -3658,11 +3822,12 @@ static struct opcode twobyte_table[256] = { I(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_btc), I(DstReg | SrcMem | ModRM, em_bsf), I(DstReg | SrcMem | ModRM, em_bsr), D(DstReg | SrcMem8 | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov), - /* 0xC0 - 0xCF */ + /* 0xC0 - 0xC7 */ D2bv(DstMem | SrcReg | ModRM | Lock), N, D(DstMem | SrcReg | ModRM | Mov), N, N, N, GD(0, &group9), - N, N, N, N, N, N, N, N, + /* 0xC8 - 0xCF */ + X8(I(DstReg, em_bswap)), /* 0xD0 - 0xDF */ N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, /* 0xE0 - 0xEF */ @@ -4426,12 +4591,12 @@ twobyte_insn: break; case 0xb6 ... 0xb7: /* movzx */ ctxt->dst.bytes = ctxt->op_bytes; - ctxt->dst.val = (ctxt->d & ByteOp) ? (u8) ctxt->src.val + ctxt->dst.val = (ctxt->src.bytes == 1) ? (u8) ctxt->src.val : (u16) ctxt->src.val; break; case 0xbe ... 0xbf: /* movsx */ ctxt->dst.bytes = ctxt->op_bytes; - ctxt->dst.val = (ctxt->d & ByteOp) ? (s8) ctxt->src.val : + ctxt->dst.val = (ctxt->src.bytes == 1) ? (s8) ctxt->src.val : (s16) ctxt->src.val; break; case 0xc0 ... 0xc1: /* xadd */ diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 81cf4fa4a2be..1df8fb9e1d5d 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -188,14 +188,15 @@ void kvm_pic_update_irq(struct kvm_pic *s) pic_unlock(s); } -int kvm_pic_set_irq(void *opaque, int irq, int level) +int kvm_pic_set_irq(struct kvm_pic *s, int irq, int irq_source_id, int level) { - struct kvm_pic *s = opaque; int ret = -1; pic_lock(s); if (irq >= 0 && irq < PIC_NUM_PINS) { - ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, level); + int irq_level = __kvm_irq_line_state(&s->irq_states[irq], + irq_source_id, level); + ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, irq_level); pic_update_irq(s); trace_kvm_pic_set_irq(irq >> 3, irq & 7, s->pics[irq >> 3].elcr, s->pics[irq >> 3].imr, ret == 0); @@ -205,6 +206,16 @@ int kvm_pic_set_irq(void *opaque, int irq, int level) return ret; } +void kvm_pic_clear_all(struct kvm_pic *s, int irq_source_id) +{ + int i; + + pic_lock(s); + for (i = 0; i < PIC_NUM_PINS; i++) + __clear_bit(irq_source_id, &s->irq_states[i]); + pic_unlock(s); +} + /* * acknowledge interrupt 'irq' */ diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 93c15743f1ee..ce878788a39f 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -107,6 +107,16 @@ static inline void apic_clear_vector(int vec, void *bitmap) clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); } +static inline int __apic_test_and_set_vector(int vec, void *bitmap) +{ + return __test_and_set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); +} + +static inline int __apic_test_and_clear_vector(int vec, void *bitmap) +{ + return __test_and_clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); +} + static inline int apic_hw_enabled(struct kvm_lapic *apic) { return (apic)->vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE; @@ -210,6 +220,16 @@ static int find_highest_vector(void *bitmap) return fls(word[word_offset << 2]) - 1 + (word_offset << 5); } +static u8 count_vectors(void *bitmap) +{ + u32 *word = bitmap; + int word_offset; + u8 count = 0; + for (word_offset = 0; word_offset < MAX_APIC_VECTOR >> 5; ++word_offset) + count += hweight32(word[word_offset << 2]); + return count; +} + static inline int apic_test_and_set_irr(int vec, struct kvm_lapic *apic) { 
apic->irr_pending = true; @@ -242,6 +262,27 @@ static inline void apic_clear_irr(int vec, struct kvm_lapic *apic) apic->irr_pending = true; } +static inline void apic_set_isr(int vec, struct kvm_lapic *apic) +{ + if (!__apic_test_and_set_vector(vec, apic->regs + APIC_ISR)) + ++apic->isr_count; + BUG_ON(apic->isr_count > MAX_APIC_VECTOR); + /* + * ISR (in service register) bit is set when injecting an interrupt. + * The highest vector is injected. Thus the latest bit set matches + * the highest bit in ISR. + */ + apic->highest_isr_cache = vec; +} + +static inline void apic_clear_isr(int vec, struct kvm_lapic *apic) +{ + if (__apic_test_and_clear_vector(vec, apic->regs + APIC_ISR)) + --apic->isr_count; + BUG_ON(apic->isr_count < 0); + apic->highest_isr_cache = -1; +} + int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; @@ -270,9 +311,61 @@ int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq) irq->level, irq->trig_mode); } +static int pv_eoi_put_user(struct kvm_vcpu *vcpu, u8 val) +{ + + return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data, &val, + sizeof(val)); +} + +static int pv_eoi_get_user(struct kvm_vcpu *vcpu, u8 *val) +{ + + return kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data, val, + sizeof(*val)); +} + +static inline bool pv_eoi_enabled(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.pv_eoi.msr_val & KVM_MSR_ENABLED; +} + +static bool pv_eoi_get_pending(struct kvm_vcpu *vcpu) +{ + u8 val; + if (pv_eoi_get_user(vcpu, &val) < 0) + apic_debug("Can't read EOI MSR value: 0x%llx\n", + (unsigned long long)vcpu->arch.pv_eoi.msr_val); + return val & 0x1; +} + +static void pv_eoi_set_pending(struct kvm_vcpu *vcpu) +{ + if (pv_eoi_put_user(vcpu, KVM_PV_EOI_ENABLED) < 0) { + apic_debug("Can't set EOI MSR value: 0x%llx\n", + (unsigned long long)vcpu->arch.pv_eoi.msr_val); + return; + } + __set_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention); +} + +static void pv_eoi_clr_pending(struct kvm_vcpu *vcpu) +{ + if (pv_eoi_put_user(vcpu, KVM_PV_EOI_DISABLED) < 0) { + apic_debug("Can't clear EOI MSR value: 0x%llx\n", + (unsigned long long)vcpu->arch.pv_eoi.msr_val); + return; + } + __clear_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention); +} + static inline int apic_find_highest_isr(struct kvm_lapic *apic) { int result; + if (!apic->isr_count) + return -1; + if (likely(apic->highest_isr_cache != -1)) + return apic->highest_isr_cache; result = find_highest_vector(apic->regs + APIC_ISR); ASSERT(result == -1 || result >= 16); @@ -482,17 +575,20 @@ int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2) return vcpu1->arch.apic_arb_prio - vcpu2->arch.apic_arb_prio; } -static void apic_set_eoi(struct kvm_lapic *apic) +static int apic_set_eoi(struct kvm_lapic *apic) { int vector = apic_find_highest_isr(apic); + + trace_kvm_eoi(apic, vector); + /* * Not every write EOI will has corresponding ISR, * one example is when Kernel check timer on setup_IO_APIC */ if (vector == -1) - return; + return vector; - apic_clear_vector(vector, apic->regs + APIC_ISR); + apic_clear_isr(vector, apic); apic_update_ppr(apic); if (!(apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI) && @@ -505,6 +601,7 @@ static void apic_set_eoi(struct kvm_lapic *apic) kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode); } kvm_make_request(KVM_REQ_EVENT, apic->vcpu); + return vector; } static void apic_send_ipi(struct kvm_lapic *apic) @@ -1081,10 +1178,13 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
apic_set_reg(apic, APIC_TMR + 0x10 * i, 0); } apic->irr_pending = false; + apic->isr_count = 0; + apic->highest_isr_cache = -1; update_divide_count(apic); atomic_set(&apic->lapic_timer.pending, 0); if (kvm_vcpu_is_bsp(vcpu)) vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP; + vcpu->arch.pv_eoi.msr_val = 0; apic_update_ppr(apic); vcpu->arch.apic_arb_prio = 0; @@ -1248,7 +1348,7 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu) if (vector == -1) return -1; - apic_set_vector(vector, apic->regs + APIC_ISR); + apic_set_isr(vector, apic); apic_update_ppr(apic); apic_clear_irr(vector, apic); return vector; @@ -1267,6 +1367,8 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu) update_divide_count(apic); start_apic_timer(apic); apic->irr_pending = true; + apic->isr_count = count_vectors(apic->regs + APIC_ISR); + apic->highest_isr_cache = -1; kvm_make_request(KVM_REQ_EVENT, vcpu); } @@ -1283,11 +1385,51 @@ void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu) hrtimer_start_expires(timer, HRTIMER_MODE_ABS); } +/* + * apic_sync_pv_eoi_from_guest - called on vmexit or cancel interrupt + * + * Detect whether guest triggered PV EOI since the + * last entry. If yes, set EOI on guests's behalf. + * Clear PV EOI in guest memory in any case. + */ +static void apic_sync_pv_eoi_from_guest(struct kvm_vcpu *vcpu, + struct kvm_lapic *apic) +{ + bool pending; + int vector; + /* + * PV EOI state is derived from KVM_APIC_PV_EOI_PENDING in host + * and KVM_PV_EOI_ENABLED in guest memory as follows: + * + * KVM_APIC_PV_EOI_PENDING is unset: + * -> host disabled PV EOI. + * KVM_APIC_PV_EOI_PENDING is set, KVM_PV_EOI_ENABLED is set: + * -> host enabled PV EOI, guest did not execute EOI yet. + * KVM_APIC_PV_EOI_PENDING is set, KVM_PV_EOI_ENABLED is unset: + * -> host enabled PV EOI, guest executed EOI. + */ + BUG_ON(!pv_eoi_enabled(vcpu)); + pending = pv_eoi_get_pending(vcpu); + /* + * Clear pending bit in any case: it will be set again on vmentry. + * While this might not be ideal from performance point of view, + * this makes sure pv eoi is only enabled when we know it's safe. + */ + pv_eoi_clr_pending(vcpu); + if (pending) + return; + vector = apic_set_eoi(apic); + trace_kvm_pv_eoi(apic, vector); +} + void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu) { u32 data; void *vapic; + if (test_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention)) + apic_sync_pv_eoi_from_guest(vcpu, vcpu->arch.apic); + if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention)) return; @@ -1298,17 +1440,44 @@ void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu) apic_set_tpr(vcpu->arch.apic, data & 0xff); } +/* + * apic_sync_pv_eoi_to_guest - called before vmentry + * + * Detect whether it's safe to enable PV EOI and + * if yes do so. + */ +static void apic_sync_pv_eoi_to_guest(struct kvm_vcpu *vcpu, + struct kvm_lapic *apic) +{ + if (!pv_eoi_enabled(vcpu) || + /* IRR set or many bits in ISR: could be nested. */ + apic->irr_pending || + /* Cache not set: could be safe but we don't bother. */ + apic->highest_isr_cache == -1 || + /* Need EOI to update ioapic. */ + kvm_ioapic_handles_vector(vcpu->kvm, apic->highest_isr_cache)) { + /* + * PV EOI was disabled by apic_sync_pv_eoi_from_guest + * so we need not do anything here. 
+ */ + return; + } + + pv_eoi_set_pending(apic->vcpu); +} + void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu) { u32 data, tpr; int max_irr, max_isr; - struct kvm_lapic *apic; + struct kvm_lapic *apic = vcpu->arch.apic; void *vapic; + apic_sync_pv_eoi_to_guest(vcpu, apic); + if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention)) return; - apic = vcpu->arch.apic; tpr = apic_get_reg(apic, APIC_TASKPRI) & 0xff; max_irr = apic_find_highest_irr(apic); if (max_irr < 0) @@ -1394,3 +1563,16 @@ int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 reg, u64 *data) return 0; } + +int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data) +{ + u64 addr = data & ~KVM_MSR_ENABLED; + if (!IS_ALIGNED(addr, 4)) + return 1; + + vcpu->arch.pv_eoi.msr_val = data; + if (!pv_eoi_enabled(vcpu)) + return 0; + return kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.pv_eoi.data, + addr); +} diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 6f4ce2575d09..4af5405ae1e2 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -13,6 +13,15 @@ struct kvm_lapic { u32 divide_count; struct kvm_vcpu *vcpu; bool irr_pending; + /* Number of bits set in ISR. */ + s16 isr_count; + /* The highest vector set in ISR; if -1 - invalid, must scan ISR. */ + int highest_isr_cache; + /** + * APIC register page. The layout matches the register layout seen by + * the guest 1:1, because it is accessed by the vmx microcode. + * Note: Only one register, the TPR, is used by the microcode. + */ void *regs; gpa_t vapic_addr; struct page *vapic_page; @@ -60,4 +69,6 @@ static inline bool kvm_hv_vapic_assist_page_enabled(struct kvm_vcpu *vcpu) { return vcpu->arch.hv_vapic & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE; } + +int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data); #endif diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index be3cea4407ff..01ca00423938 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -90,7 +90,7 @@ module_param(dbg, bool, 0644); #define PTE_PREFETCH_NUM 8 -#define PT_FIRST_AVAIL_BITS_SHIFT 9 +#define PT_FIRST_AVAIL_BITS_SHIFT 10 #define PT64_SECOND_AVAIL_BITS_SHIFT 52 #define PT64_LEVEL_BITS 9 @@ -145,7 +145,8 @@ module_param(dbg, bool, 0644); #define CREATE_TRACE_POINTS #include "mmutrace.h" -#define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT) +#define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT) +#define SPTE_MMU_WRITEABLE (1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1)) #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) @@ -188,6 +189,7 @@ static u64 __read_mostly shadow_dirty_mask; static u64 __read_mostly shadow_mmio_mask; static void mmu_spte_set(u64 *sptep, u64 spte); +static void mmu_free_roots(struct kvm_vcpu *vcpu); void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask) { @@ -444,8 +446,22 @@ static bool __check_direct_spte_mmio_pf(u64 spte) } #endif +static bool spte_is_locklessly_modifiable(u64 spte) +{ + return !(~spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE)); +} + static bool spte_has_volatile_bits(u64 spte) { + /* + * Always atomicly update spte if it can be updated + * out of mmu-lock, it can ensure dirty bit is not lost, + * also, it can help us to get a stable is_writable_pte() + * to ensure tlb flush is not missed. + */ + if (spte_is_locklessly_modifiable(spte)) + return true; + if (!shadow_accessed_mask) return false; @@ -478,34 +494,47 @@ static void mmu_spte_set(u64 *sptep, u64 new_spte) /* Rules for using mmu_spte_update: * Update the state bits, it means the mapped pfn is not changged. 
+ * + * Whenever we overwrite a writable spte with a read-only one we + * should flush remote TLBs. Otherwise rmap_write_protect + * will find a read-only spte, even though the writable spte + * might be cached on a CPU's TLB; the return value indicates this + * case. */ -static void mmu_spte_update(u64 *sptep, u64 new_spte) +static bool mmu_spte_update(u64 *sptep, u64 new_spte) { - u64 mask, old_spte = *sptep; + u64 old_spte = *sptep; + bool ret = false; WARN_ON(!is_rmap_spte(new_spte)); - if (!is_shadow_present_pte(old_spte)) - return mmu_spte_set(sptep, new_spte); - - new_spte |= old_spte & shadow_dirty_mask; - - mask = shadow_accessed_mask; - if (is_writable_pte(old_spte)) - mask |= shadow_dirty_mask; + if (!is_shadow_present_pte(old_spte)) { + mmu_spte_set(sptep, new_spte); + return ret; + } - if (!spte_has_volatile_bits(old_spte) || (new_spte & mask) == mask) + if (!spte_has_volatile_bits(old_spte)) __update_clear_spte_fast(sptep, new_spte); else old_spte = __update_clear_spte_slow(sptep, new_spte); + /* + * Updating the spte out of mmu-lock is safe, since + * we always atomically update it; see the comments in + * spte_has_volatile_bits(). + */ + if (is_writable_pte(old_spte) && !is_writable_pte(new_spte)) + ret = true; + if (!shadow_accessed_mask) - return; + return ret; if (spte_is_bit_cleared(old_spte, new_spte, shadow_accessed_mask)) kvm_set_pfn_accessed(spte_to_pfn(old_spte)); if (spte_is_bit_cleared(old_spte, new_spte, shadow_dirty_mask)) kvm_set_pfn_dirty(spte_to_pfn(old_spte)); + + return ret; } /* @@ -652,8 +681,7 @@ static void mmu_free_memory_caches(struct kvm_vcpu *vcpu) mmu_page_header_cache); } -static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc, - size_t size) +static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc) { void *p; @@ -664,8 +692,7 @@ static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc, static struct pte_list_desc *mmu_alloc_pte_list_desc(struct kvm_vcpu *vcpu) { - return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_list_desc_cache, - sizeof(struct pte_list_desc)); + return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_list_desc_cache); } static void mmu_free_pte_list_desc(struct pte_list_desc *pte_list_desc) @@ -1051,35 +1078,82 @@ static void drop_spte(struct kvm *kvm, u64 *sptep) rmap_remove(kvm, sptep); } -static int __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, int level) + +static bool __drop_large_spte(struct kvm *kvm, u64 *sptep) +{ + if (is_large_pte(*sptep)) { + WARN_ON(page_header(__pa(sptep))->role.level == + PT_PAGE_TABLE_LEVEL); + drop_spte(kvm, sptep); + --kvm->stat.lpages; + return true; + } + + return false; +} + +static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep) +{ + if (__drop_large_spte(vcpu->kvm, sptep)) + kvm_flush_remote_tlbs(vcpu->kvm); +} + +/* + * Write-protect the specified @sptep; @pt_protect indicates whether + * spte write-protection is caused by protecting the shadow page table. + * @flush indicates whether the tlb needs to be flushed. + * + * Note: write protection is different for dirty logging and spte + * protection: + * - for dirty logging, the spte can be set to writable at any time if + * its dirty bitmap is properly set. + * - for spte protection, the spte can be writable only after unsync-ing + * the shadow page. + * + * Return true if the spte is dropped.
+ */ +static bool +spte_write_protect(struct kvm *kvm, u64 *sptep, bool *flush, bool pt_protect) +{ + u64 spte = *sptep; + + if (!is_writable_pte(spte) && + !(pt_protect && spte_is_locklessly_modifiable(spte))) + return false; + + rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep); + + if (__drop_large_spte(kvm, sptep)) { + *flush |= true; + return true; + } + + if (pt_protect) + spte &= ~SPTE_MMU_WRITEABLE; + spte = spte & ~PT_WRITABLE_MASK; + + *flush |= mmu_spte_update(sptep, spte); + return false; +} + +static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, + int level, bool pt_protect) { u64 *sptep; struct rmap_iterator iter; - int write_protected = 0; + bool flush = false; for (sptep = rmap_get_first(*rmapp, &iter); sptep;) { BUG_ON(!(*sptep & PT_PRESENT_MASK)); - rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep); - - if (!is_writable_pte(*sptep)) { - sptep = rmap_get_next(&iter); - continue; - } - - if (level == PT_PAGE_TABLE_LEVEL) { - mmu_spte_update(sptep, *sptep & ~PT_WRITABLE_MASK); - sptep = rmap_get_next(&iter); - } else { - BUG_ON(!is_large_pte(*sptep)); - drop_spte(kvm, sptep); - --kvm->stat.lpages; + if (spte_write_protect(kvm, sptep, &flush, pt_protect)) { sptep = rmap_get_first(*rmapp, &iter); + continue; } - write_protected = 1; + sptep = rmap_get_next(&iter); } - return write_protected; + return flush; } /** @@ -1100,26 +1174,26 @@ void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, while (mask) { rmapp = &slot->rmap[gfn_offset + __ffs(mask)]; - __rmap_write_protect(kvm, rmapp, PT_PAGE_TABLE_LEVEL); + __rmap_write_protect(kvm, rmapp, PT_PAGE_TABLE_LEVEL, false); /* clear the first set bit */ mask &= mask - 1; } } -static int rmap_write_protect(struct kvm *kvm, u64 gfn) +static bool rmap_write_protect(struct kvm *kvm, u64 gfn) { struct kvm_memory_slot *slot; unsigned long *rmapp; int i; - int write_protected = 0; + bool write_protected = false; slot = gfn_to_memslot(kvm, gfn); for (i = PT_PAGE_TABLE_LEVEL; i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { rmapp = __gfn_to_rmap(gfn, i, slot); - write_protected |= __rmap_write_protect(kvm, rmapp, i); + write_protected |= __rmap_write_protect(kvm, rmapp, i, true); } return write_protected; @@ -1238,11 +1312,12 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, unsigned long data) { u64 *sptep; - struct rmap_iterator iter; + struct rmap_iterator uninitialized_var(iter); int young = 0; /* - * Emulate the accessed bit for EPT, by checking if this page has + * In case of absence of EPT Access and Dirty Bits supports, + * emulate the accessed bit for EPT, by checking if this page has * an EPT mapping, and clearing it if it does. On the next access, * a new EPT mapping will be established. 
* This has some overhead, but not as much as the cost of swapping @@ -1253,11 +1328,12 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, for (sptep = rmap_get_first(*rmapp, &iter); sptep; sptep = rmap_get_next(&iter)) { - BUG_ON(!(*sptep & PT_PRESENT_MASK)); + BUG_ON(!is_shadow_present_pte(*sptep)); - if (*sptep & PT_ACCESSED_MASK) { + if (*sptep & shadow_accessed_mask) { young = 1; - clear_bit(PT_ACCESSED_SHIFT, (unsigned long *)sptep); + clear_bit((ffs(shadow_accessed_mask) - 1), + (unsigned long *)sptep); } } @@ -1281,9 +1357,9 @@ static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp, for (sptep = rmap_get_first(*rmapp, &iter); sptep; sptep = rmap_get_next(&iter)) { - BUG_ON(!(*sptep & PT_PRESENT_MASK)); + BUG_ON(!is_shadow_present_pte(*sptep)); - if (*sptep & PT_ACCESSED_MASK) { + if (*sptep & shadow_accessed_mask) { young = 1; break; } @@ -1401,12 +1477,10 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, u64 *parent_pte, int direct) { struct kvm_mmu_page *sp; - sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache, - sizeof *sp); - sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE); + sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache); + sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache); if (!direct) - sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, - PAGE_SIZE); + sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache); set_page_private(virt_to_page(sp->spt), (unsigned long)sp); list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages); bitmap_zero(sp->slot_bitmap, KVM_MEM_SLOTS_NUM); @@ -1701,7 +1775,7 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu, kvm_mmu_pages_init(parent, &parents, &pages); while (mmu_unsync_walk(parent, &pages)) { - int protected = 0; + bool protected = false; for_each_sp(pages, sp, parents, i) protected |= rmap_write_protect(vcpu->kvm, sp->gfn); @@ -1866,15 +1940,6 @@ static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp) mmu_spte_set(sptep, spte); } -static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep) -{ - if (is_large_pte(*sptep)) { - drop_spte(vcpu->kvm, sptep); - --vcpu->kvm->stat.lpages; - kvm_flush_remote_tlbs(vcpu->kvm); - } -} - static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned direct_access) { @@ -2243,7 +2308,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn, pfn_t pfn, bool speculative, bool can_unsync, bool host_writable) { - u64 spte, entry = *sptep; + u64 spte; int ret = 0; if (set_mmio_spte(sptep, gfn, pfn, pte_access)) @@ -2257,8 +2322,10 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, spte |= shadow_x_mask; else spte |= shadow_nx_mask; + if (pte_access & ACC_USER_MASK) spte |= shadow_user_mask; + if (level > PT_PAGE_TABLE_LEVEL) spte |= PT_PAGE_SIZE_MASK; if (tdp_enabled) @@ -2283,7 +2350,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, goto done; } - spte |= PT_WRITABLE_MASK; + spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE; if (!vcpu->arch.mmu.direct_map && !(pte_access & ACC_WRITE_MASK)) { @@ -2312,8 +2379,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, __func__, gfn); ret = 1; pte_access &= ~ACC_WRITE_MASK; - if (is_writable_pte(spte)) - spte &= ~PT_WRITABLE_MASK; + spte &= ~(PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE); } } @@ -2321,14 +2387,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, mark_page_dirty(vcpu->kvm, gfn); set_pte: - mmu_spte_update(sptep, spte); - /* - * If we overwrite a writable spte 
with a read-only one we - * should flush remote TLBs. Otherwise rmap_write_protect - * will find a read-only spte, even though the writable spte - * might be cached on a CPU's TLB. - */ - if (is_writable_pte(entry) && !is_writable_pte(*sptep)) + if (mmu_spte_update(sptep, spte)) kvm_flush_remote_tlbs(vcpu->kvm); done: return ret; @@ -2403,6 +2462,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) { + mmu_free_roots(vcpu); } static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, @@ -2625,18 +2685,116 @@ exit: return ret; } +static bool page_fault_can_be_fast(struct kvm_vcpu *vcpu, u32 error_code) +{ + /* + * #PF can be fast only if the shadow page table is present and it + * is caused by write-protect, that means we just need change the + * W bit of the spte which can be done out of mmu-lock. + */ + if (!(error_code & PFERR_PRESENT_MASK) || + !(error_code & PFERR_WRITE_MASK)) + return false; + + return true; +} + +static bool +fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 spte) +{ + struct kvm_mmu_page *sp = page_header(__pa(sptep)); + gfn_t gfn; + + WARN_ON(!sp->role.direct); + + /* + * The gfn of direct spte is stable since it is calculated + * by sp->gfn. + */ + gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt); + + if (cmpxchg64(sptep, spte, spte | PT_WRITABLE_MASK) == spte) + mark_page_dirty(vcpu->kvm, gfn); + + return true; +} + +/* + * Return value: + * - true: let the vcpu to access on the same address again. + * - false: let the real page fault path to fix it. + */ +static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level, + u32 error_code) +{ + struct kvm_shadow_walk_iterator iterator; + bool ret = false; + u64 spte = 0ull; + + if (!page_fault_can_be_fast(vcpu, error_code)) + return false; + + walk_shadow_page_lockless_begin(vcpu); + for_each_shadow_entry_lockless(vcpu, gva, iterator, spte) + if (!is_shadow_present_pte(spte) || iterator.level < level) + break; + + /* + * If the mapping has been changed, let the vcpu fault on the + * same address again. + */ + if (!is_rmap_spte(spte)) { + ret = true; + goto exit; + } + + if (!is_last_spte(spte, level)) + goto exit; + + /* + * Check if it is a spurious fault caused by TLB lazily flushed. + * + * Need not check the access of upper level table entries since + * they are always ACC_ALL. + */ + if (is_writable_pte(spte)) { + ret = true; + goto exit; + } + + /* + * Currently, to simplify the code, only the spte write-protected + * by dirty-log can be fast fixed. + */ + if (!spte_is_locklessly_modifiable(spte)) + goto exit; + + /* + * Currently, fast page fault only works for direct mapping since + * the gfn is not stable for indirect shadow page. + * See Documentation/virtual/kvm/locking.txt to get more detail. 
+ */ + ret = fast_pf_fix_direct_spte(vcpu, iterator.sptep, spte); +exit: + trace_fast_page_fault(vcpu, gva, error_code, iterator.sptep, + spte, ret); + walk_shadow_page_lockless_end(vcpu); + + return ret; +} + static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, gva_t gva, pfn_t *pfn, bool write, bool *writable); -static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn, - bool prefault) +static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, + gfn_t gfn, bool prefault) { int r; int level; int force_pt_level; pfn_t pfn; unsigned long mmu_seq; - bool map_writable; + bool map_writable, write = error_code & PFERR_WRITE_MASK; force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn); if (likely(!force_pt_level)) { @@ -2653,6 +2811,9 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn, } else level = PT_PAGE_TABLE_LEVEL; + if (fast_page_fault(vcpu, v, level, error_code)) + return 0; + mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); @@ -3041,7 +3202,7 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, gfn = gva >> PAGE_SHIFT; return nonpaging_map(vcpu, gva & PAGE_MASK, - error_code & PFERR_WRITE_MASK, gfn, prefault); + error_code, gfn, prefault); } static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn) @@ -3121,6 +3282,9 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, } else level = PT_PAGE_TABLE_LEVEL; + if (fast_page_fault(vcpu, gpa, level, error_code)) + return 0; + mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); @@ -3885,6 +4049,7 @@ int kvm_mmu_setup(struct kvm_vcpu *vcpu) void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) { struct kvm_mmu_page *sp; + bool flush = false; list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) { int i; @@ -3899,16 +4064,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) !is_last_spte(pt[i], sp->role.level)) continue; - if (is_large_pte(pt[i])) { - drop_spte(kvm, &pt[i]); - --kvm->stat.lpages; - continue; - } - - /* avoid RMW */ - if (is_writable_pte(pt[i])) - mmu_spte_update(&pt[i], - pt[i] & ~PT_WRITABLE_MASK); + spte_write_protect(kvm, &pt[i], &flush, false); } } kvm_flush_remote_tlbs(kvm); @@ -3934,6 +4090,9 @@ static void kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm, { struct kvm_mmu_page *page; + if (list_empty(&kvm->arch.active_mmu_pages)) + return; + page = container_of(kvm->arch.active_mmu_pages.prev, struct kvm_mmu_page, link); kvm_mmu_prepare_zap_page(kvm, page, invalid_list); @@ -3942,7 +4101,6 @@ static void kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm, static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc) { struct kvm *kvm; - struct kvm *kvm_freed = NULL; int nr_to_scan = sc->nr_to_scan; if (nr_to_scan == 0) @@ -3954,22 +4112,30 @@ static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc) int idx; LIST_HEAD(invalid_list); + /* + * n_used_mmu_pages is accessed without holding kvm->mmu_lock + * here. We may skip a VM instance erroneously, but we do not + * want to shrink a VM that only started to populate its MMU + * anyway.
+ */ + if (kvm->arch.n_used_mmu_pages > 0) { + if (!nr_to_scan--) + break; + continue; + } + idx = srcu_read_lock(&kvm->srcu); spin_lock(&kvm->mmu_lock); - if (!kvm_freed && nr_to_scan > 0 && - kvm->arch.n_used_mmu_pages > 0) { - kvm_mmu_remove_some_alloc_mmu_pages(kvm, - &invalid_list); - kvm_freed = kvm; - } - nr_to_scan--; + kvm_mmu_remove_some_alloc_mmu_pages(kvm, &invalid_list); kvm_mmu_commit_zap_page(kvm, &invalid_list); + spin_unlock(&kvm->mmu_lock); srcu_read_unlock(&kvm->srcu, idx); + + list_move_tail(&kvm->vm_list, &vm_list); + break; } - if (kvm_freed) - list_move_tail(&kvm_freed->vm_list, &vm_list); raw_spin_unlock(&kvm_lock); diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h index 89fb0e81322a..cd6e98333ba3 100644 --- a/arch/x86/kvm/mmutrace.h +++ b/arch/x86/kvm/mmutrace.h @@ -54,8 +54,8 @@ */ TRACE_EVENT( kvm_mmu_pagetable_walk, - TP_PROTO(u64 addr, int write_fault, int user_fault, int fetch_fault), - TP_ARGS(addr, write_fault, user_fault, fetch_fault), + TP_PROTO(u64 addr, u32 pferr), + TP_ARGS(addr, pferr), TP_STRUCT__entry( __field(__u64, addr) @@ -64,8 +64,7 @@ TRACE_EVENT( TP_fast_assign( __entry->addr = addr; - __entry->pferr = (!!write_fault << 1) | (!!user_fault << 2) - | (!!fetch_fault << 4); + __entry->pferr = pferr; ), TP_printk("addr %llx pferr %x %s", __entry->addr, __entry->pferr, @@ -243,6 +242,44 @@ TRACE_EVENT( TP_printk("addr:%llx gfn %llx access %x", __entry->addr, __entry->gfn, __entry->access) ); + +#define __spte_satisfied(__spte) \ + (__entry->retry && is_writable_pte(__entry->__spte)) + +TRACE_EVENT( + fast_page_fault, + TP_PROTO(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code, + u64 *sptep, u64 old_spte, bool retry), + TP_ARGS(vcpu, gva, error_code, sptep, old_spte, retry), + + TP_STRUCT__entry( + __field(int, vcpu_id) + __field(gva_t, gva) + __field(u32, error_code) + __field(u64 *, sptep) + __field(u64, old_spte) + __field(u64, new_spte) + __field(bool, retry) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu->vcpu_id; + __entry->gva = gva; + __entry->error_code = error_code; + __entry->sptep = sptep; + __entry->old_spte = old_spte; + __entry->new_spte = *sptep; + __entry->retry = retry; + ), + + TP_printk("vcpu %d gva %lx error_code %s sptep %p old %#llx" + " new %llx spurious %d fixed %d", __entry->vcpu_id, + __entry->gva, __print_flags(__entry->error_code, "|", + kvm_mmu_trace_pferr_flags), __entry->sptep, + __entry->old_spte, __entry->new_spte, + __spte_satisfied(old_spte), __spte_satisfied(new_spte) + ) +); #endif /* _TRACE_KVMMMU_H */ #undef TRACE_INCLUDE_PATH diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 34f970937ef1..bb7cf01cae76 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -154,8 +154,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker, const int fetch_fault = access & PFERR_FETCH_MASK; u16 errcode = 0; - trace_kvm_mmu_pagetable_walk(addr, write_fault, user_fault, - fetch_fault); + trace_kvm_mmu_pagetable_walk(addr, access); retry_walk: eperm = false; walker->level = mmu->root_level; diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 2e88438ffd83..9b7ec1150ab0 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -80,10 +80,10 @@ static inline struct kvm_pmc *get_fixed_pmc_idx(struct kvm_pmu *pmu, int idx) static struct kvm_pmc *global_idx_to_pmc(struct kvm_pmu *pmu, int idx) { - if (idx < X86_PMC_IDX_FIXED) + if (idx < INTEL_PMC_IDX_FIXED) return get_gp_pmc(pmu, MSR_P6_EVNTSEL0 + idx, MSR_P6_EVNTSEL0); else - return 
get_fixed_pmc_idx(pmu, idx - X86_PMC_IDX_FIXED); + return get_fixed_pmc_idx(pmu, idx - INTEL_PMC_IDX_FIXED); } void kvm_deliver_pmi(struct kvm_vcpu *vcpu) @@ -291,7 +291,7 @@ static void reprogram_idx(struct kvm_pmu *pmu, int idx) if (pmc_is_gp(pmc)) reprogram_gp_counter(pmc, pmc->eventsel); else { - int fidx = idx - X86_PMC_IDX_FIXED; + int fidx = idx - INTEL_PMC_IDX_FIXED; reprogram_fixed_counter(pmc, fixed_en_pmi(pmu->fixed_ctr_ctrl, fidx), fidx); } @@ -452,7 +452,7 @@ void kvm_pmu_cpuid_update(struct kvm_vcpu *vcpu) return; pmu->nr_arch_gp_counters = min((int)(entry->eax >> 8) & 0xff, - X86_PMC_MAX_GENERIC); + INTEL_PMC_MAX_GENERIC); pmu->counter_bitmask[KVM_PMC_GP] = ((u64)1 << ((entry->eax >> 16) & 0xff)) - 1; bitmap_len = (entry->eax >> 24) & 0xff; @@ -462,13 +462,13 @@ void kvm_pmu_cpuid_update(struct kvm_vcpu *vcpu) pmu->nr_arch_fixed_counters = 0; } else { pmu->nr_arch_fixed_counters = min((int)(entry->edx & 0x1f), - X86_PMC_MAX_FIXED); + INTEL_PMC_MAX_FIXED); pmu->counter_bitmask[KVM_PMC_FIXED] = ((u64)1 << ((entry->edx >> 5) & 0xff)) - 1; } pmu->global_ctrl = ((1 << pmu->nr_arch_gp_counters) - 1) | - (((1ull << pmu->nr_arch_fixed_counters) - 1) << X86_PMC_IDX_FIXED); + (((1ull << pmu->nr_arch_fixed_counters) - 1) << INTEL_PMC_IDX_FIXED); pmu->global_ctrl_mask = ~pmu->global_ctrl; } @@ -478,15 +478,15 @@ void kvm_pmu_init(struct kvm_vcpu *vcpu) struct kvm_pmu *pmu = &vcpu->arch.pmu; memset(pmu, 0, sizeof(*pmu)); - for (i = 0; i < X86_PMC_MAX_GENERIC; i++) { + for (i = 0; i < INTEL_PMC_MAX_GENERIC; i++) { pmu->gp_counters[i].type = KVM_PMC_GP; pmu->gp_counters[i].vcpu = vcpu; pmu->gp_counters[i].idx = i; } - for (i = 0; i < X86_PMC_MAX_FIXED; i++) { + for (i = 0; i < INTEL_PMC_MAX_FIXED; i++) { pmu->fixed_counters[i].type = KVM_PMC_FIXED; pmu->fixed_counters[i].vcpu = vcpu; - pmu->fixed_counters[i].idx = i + X86_PMC_IDX_FIXED; + pmu->fixed_counters[i].idx = i + INTEL_PMC_IDX_FIXED; } init_irq_work(&pmu->irq_work, trigger_pmi); kvm_pmu_cpuid_update(vcpu); @@ -498,13 +498,13 @@ void kvm_pmu_reset(struct kvm_vcpu *vcpu) int i; irq_work_sync(&pmu->irq_work); - for (i = 0; i < X86_PMC_MAX_GENERIC; i++) { + for (i = 0; i < INTEL_PMC_MAX_GENERIC; i++) { struct kvm_pmc *pmc = &pmu->gp_counters[i]; stop_counter(pmc); pmc->counter = pmc->eventsel = 0; } - for (i = 0; i < X86_PMC_MAX_FIXED; i++) + for (i = 0; i < INTEL_PMC_MAX_FIXED; i++) stop_counter(&pmu->fixed_counters[i]); pmu->fixed_ctr_ctrl = pmu->global_ctrl = pmu->global_status = diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index f75af406b268..baead950d6c8 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -3185,8 +3185,8 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) break; case MSR_IA32_DEBUGCTLMSR: if (!boot_cpu_has(X86_FEATURE_LBRV)) { - pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n", - __func__, data); + vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n", + __func__, data); break; } if (data & DEBUGCTL_RESERVED_BITS) @@ -3205,7 +3205,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) case MSR_VM_CR: return svm_set_vm_cr(vcpu, data); case MSR_VM_IGNNE: - pr_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data); + vcpu_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data); break; default: return kvm_set_msr_common(vcpu, ecx, data); @@ -4044,6 +4044,11 @@ static bool svm_rdtscp_supported(void) return false; } +static bool svm_invpcid_supported(void) +{ + return false; +} + static bool svm_has_wbinvd_exit(void) { return true; @@ 
-4312,6 +4317,7 @@ static struct kvm_x86_ops svm_x86_ops = { .cpuid_update = svm_cpuid_update, .rdtscp_supported = svm_rdtscp_supported, + .invpcid_supported = svm_invpcid_supported, .set_supported_cpuid = svm_set_supported_cpuid, diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 911d2641f14c..a71faf727ff3 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -517,6 +517,40 @@ TRACE_EVENT(kvm_apic_accept_irq, __entry->coalesced ? " (coalesced)" : "") ); +TRACE_EVENT(kvm_eoi, + TP_PROTO(struct kvm_lapic *apic, int vector), + TP_ARGS(apic, vector), + + TP_STRUCT__entry( + __field( __u32, apicid ) + __field( int, vector ) + ), + + TP_fast_assign( + __entry->apicid = apic->vcpu->vcpu_id; + __entry->vector = vector; + ), + + TP_printk("apicid %x vector %d", __entry->apicid, __entry->vector) +); + +TRACE_EVENT(kvm_pv_eoi, + TP_PROTO(struct kvm_lapic *apic, int vector), + TP_ARGS(apic, vector), + + TP_STRUCT__entry( + __field( __u32, apicid ) + __field( int, vector ) + ), + + TP_fast_assign( + __entry->apicid = apic->vcpu->vcpu_id; + __entry->vector = vector; + ), + + TP_printk("apicid %x vector %d", __entry->apicid, __entry->vector) +); + /* * Tracepoint for nested VMRUN */ @@ -710,16 +744,6 @@ TRACE_EVENT(kvm_skinit, __entry->rip, __entry->slb) ); -#define __print_insn(insn, ilen) ({ \ - int i; \ - const char *ret = p->buffer + p->len; \ - \ - for (i = 0; i < ilen; ++i) \ - trace_seq_printf(p, " %02x", insn[i]); \ - trace_seq_printf(p, "%c", 0); \ - ret; \ - }) - #define KVM_EMUL_INSN_F_CR0_PE (1 << 0) #define KVM_EMUL_INSN_F_EFL_VM (1 << 1) #define KVM_EMUL_INSN_F_CS_D (1 << 2) @@ -786,7 +810,7 @@ TRACE_EVENT(kvm_emulate_insn, TP_printk("%x:%llx:%s (%s)%s", __entry->csbase, __entry->rip, - __print_insn(__entry->insn, __entry->len), + __print_hex(__entry->insn, __entry->len), __print_symbolic(__entry->flags, kvm_trace_symbol_emul_flags), __entry->failed ? 
" failed" : "" diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 32eb58866292..c39b60707e02 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -71,7 +71,10 @@ static bool __read_mostly enable_unrestricted_guest = 1; module_param_named(unrestricted_guest, enable_unrestricted_guest, bool, S_IRUGO); -static bool __read_mostly emulate_invalid_guest_state = 0; +static bool __read_mostly enable_ept_ad_bits = 1; +module_param_named(eptad, enable_ept_ad_bits, bool, S_IRUGO); + +static bool __read_mostly emulate_invalid_guest_state = true; module_param(emulate_invalid_guest_state, bool, S_IRUGO); static bool __read_mostly vmm_exclusive = 1; @@ -615,6 +618,10 @@ static void kvm_cpu_vmxon(u64 addr); static void kvm_cpu_vmxoff(void); static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3); static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr); +static void vmx_set_segment(struct kvm_vcpu *vcpu, + struct kvm_segment *var, int seg); +static void vmx_get_segment(struct kvm_vcpu *vcpu, + struct kvm_segment *var, int seg); static DEFINE_PER_CPU(struct vmcs *, vmxarea); static DEFINE_PER_CPU(struct vmcs *, current_vmcs); @@ -789,6 +796,11 @@ static inline bool cpu_has_vmx_ept_4levels(void) return vmx_capability.ept & VMX_EPT_PAGE_WALK_4_BIT; } +static inline bool cpu_has_vmx_ept_ad_bits(void) +{ + return vmx_capability.ept & VMX_EPT_AD_BIT; +} + static inline bool cpu_has_vmx_invept_individual_addr(void) { return vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT; @@ -849,6 +861,12 @@ static inline bool cpu_has_vmx_rdtscp(void) SECONDARY_EXEC_RDTSCP; } +static inline bool cpu_has_vmx_invpcid(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_ENABLE_INVPCID; +} + static inline bool cpu_has_virtual_nmis(void) { return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS; @@ -1739,6 +1757,11 @@ static bool vmx_rdtscp_supported(void) return cpu_has_vmx_rdtscp(); } +static bool vmx_invpcid_supported(void) +{ + return cpu_has_vmx_invpcid() && enable_ept; +} + /* * Swap MSR entry in host/guest MSR entry array. 
*/ @@ -2458,7 +2481,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) SECONDARY_EXEC_ENABLE_EPT | SECONDARY_EXEC_UNRESTRICTED_GUEST | SECONDARY_EXEC_PAUSE_LOOP_EXITING | - SECONDARY_EXEC_RDTSCP; + SECONDARY_EXEC_RDTSCP | + SECONDARY_EXEC_ENABLE_INVPCID; if (adjust_vmx_controls(min2, opt2, MSR_IA32_VMX_PROCBASED_CTLS2, &_cpu_based_2nd_exec_control) < 0) @@ -2645,8 +2669,12 @@ static __init int hardware_setup(void) !cpu_has_vmx_ept_4levels()) { enable_ept = 0; enable_unrestricted_guest = 0; + enable_ept_ad_bits = 0; } + if (!cpu_has_vmx_ept_ad_bits()) + enable_ept_ad_bits = 0; + if (!cpu_has_vmx_unrestricted_guest()) enable_unrestricted_guest = 0; @@ -2770,6 +2798,7 @@ static void enter_rmode(struct kvm_vcpu *vcpu) { unsigned long flags; struct vcpu_vmx *vmx = to_vmx(vcpu); + struct kvm_segment var; if (enable_unrestricted_guest) return; @@ -2813,20 +2842,23 @@ static void enter_rmode(struct kvm_vcpu *vcpu) if (emulate_invalid_guest_state) goto continue_rmode; - vmcs_write16(GUEST_SS_SELECTOR, vmcs_readl(GUEST_SS_BASE) >> 4); - vmcs_write32(GUEST_SS_LIMIT, 0xffff); - vmcs_write32(GUEST_SS_AR_BYTES, 0xf3); + vmx_get_segment(vcpu, &var, VCPU_SREG_SS); + vmx_set_segment(vcpu, &var, VCPU_SREG_SS); + + vmx_get_segment(vcpu, &var, VCPU_SREG_CS); + vmx_set_segment(vcpu, &var, VCPU_SREG_CS); + + vmx_get_segment(vcpu, &var, VCPU_SREG_ES); + vmx_set_segment(vcpu, &var, VCPU_SREG_ES); + + vmx_get_segment(vcpu, &var, VCPU_SREG_DS); + vmx_set_segment(vcpu, &var, VCPU_SREG_DS); - vmcs_write32(GUEST_CS_AR_BYTES, 0xf3); - vmcs_write32(GUEST_CS_LIMIT, 0xffff); - if (vmcs_readl(GUEST_CS_BASE) == 0xffff0000) - vmcs_writel(GUEST_CS_BASE, 0xf0000); - vmcs_write16(GUEST_CS_SELECTOR, vmcs_readl(GUEST_CS_BASE) >> 4); + vmx_get_segment(vcpu, &var, VCPU_SREG_GS); + vmx_set_segment(vcpu, &var, VCPU_SREG_GS); - fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.es); - fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.ds); - fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.gs); - fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.fs); + vmx_get_segment(vcpu, &var, VCPU_SREG_FS); + vmx_set_segment(vcpu, &var, VCPU_SREG_FS); continue_rmode: kvm_mmu_reset_context(vcpu); @@ -3027,6 +3059,8 @@ static u64 construct_eptp(unsigned long root_hpa) /* TODO write the value reading from MSR */ eptp = VMX_EPT_DEFAULT_MT | VMX_EPT_DEFAULT_GAW << VMX_EPT_GAW_EPTP_SHIFT; + if (enable_ept_ad_bits) + eptp |= VMX_EPT_AD_ENABLE_BIT; eptp |= (root_hpa & PAGE_MASK); return eptp; @@ -3153,11 +3187,22 @@ static int __vmx_get_cpl(struct kvm_vcpu *vcpu) static int vmx_get_cpl(struct kvm_vcpu *vcpu) { + struct vcpu_vmx *vmx = to_vmx(vcpu); + + /* + * If we enter real mode with cs.sel & 3 != 0, the normal CPL calculations + * fail; use the cache instead. 
+ */ + if (unlikely(vmx->emulation_required && emulate_invalid_guest_state)) { + return vmx->cpl; + } + if (!test_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail)) { __set_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail); - to_vmx(vcpu)->cpl = __vmx_get_cpl(vcpu); + vmx->cpl = __vmx_get_cpl(vcpu); } - return to_vmx(vcpu)->cpl; + + return vmx->cpl; } @@ -3165,7 +3210,7 @@ static u32 vmx_segment_access_rights(struct kvm_segment *var) { u32 ar; - if (var->unusable) + if (var->unusable || !var->present) ar = 1 << 16; else { ar = var->type & 15; @@ -3177,8 +3222,6 @@ static u32 vmx_segment_access_rights(struct kvm_segment *var) ar |= (var->db & 1) << 14; ar |= (var->g & 1) << 15; } - if (ar == 0) /* a 0 value means unusable */ - ar = AR_UNUSABLE_MASK; return ar; } @@ -3229,6 +3272,44 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu, vmcs_write32(sf->ar_bytes, ar); __clear_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail); + + /* + * Fix segments for real mode guests on hosts that don't have + * "unrestricted_mode" or have it disabled. + * This is done to allow migration of guests from hosts with + * unrestricted guest support, like Westmere, to older hosts that + * lack it, like Nehalem. + */ + if (!enable_unrestricted_guest && vmx->rmode.vm86_active) { + switch (seg) { + case VCPU_SREG_CS: + vmcs_write32(GUEST_CS_AR_BYTES, 0xf3); + vmcs_write32(GUEST_CS_LIMIT, 0xffff); + if (vmcs_readl(GUEST_CS_BASE) == 0xffff0000) + vmcs_writel(GUEST_CS_BASE, 0xf0000); + vmcs_write16(GUEST_CS_SELECTOR, + vmcs_readl(GUEST_CS_BASE) >> 4); + break; + case VCPU_SREG_ES: + fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.es); + break; + case VCPU_SREG_DS: + fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.ds); + break; + case VCPU_SREG_GS: + fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.gs); + break; + case VCPU_SREG_FS: + fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.fs); + break; + case VCPU_SREG_SS: + vmcs_write16(GUEST_SS_SELECTOR, + vmcs_readl(GUEST_SS_BASE) >> 4); + vmcs_write32(GUEST_SS_LIMIT, 0xffff); + vmcs_write32(GUEST_SS_AR_BYTES, 0xf3); + break; + } + } } static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) @@ -3731,6 +3812,8 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) if (!enable_ept) { exec_control &= ~SECONDARY_EXEC_ENABLE_EPT; enable_unrestricted_guest = 0; + /* Enabling INVPCID for non-ept guests may cause a performance regression. */ + exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID; } if (!enable_unrestricted_guest) exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; @@ -4489,7 +4572,7 @@ static int handle_cr(struct kvm_vcpu *vcpu) break; } vcpu->run->exit_reason = 0; - pr_unimpl(vcpu, "unhandled control register: op %d cr %d\n", + vcpu_unimpl(vcpu, "unhandled control register: op %d cr %d\n", (int)(exit_qualification >> 4) & 3, cr); return 0; } @@ -4769,6 +4852,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu) { unsigned long exit_qualification; gpa_t gpa; + u32 error_code; int gla_validity; exit_qualification = vmcs_readl(EXIT_QUALIFICATION); @@ -4793,7 +4877,13 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu) gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); trace_kvm_page_fault(gpa, exit_qualification); - return kvm_mmu_page_fault(vcpu, gpa, exit_qualification & 0x3, NULL, 0); + + /* Is it a write fault? */ + error_code = exit_qualification & (1U << 1); + /* ept page table is present?
*/ + error_code |= (exit_qualification >> 3) & 0x1; + + return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0); } static u64 ept_rsvd_mask(u64 spte, int level) @@ -4908,15 +4998,18 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) int ret = 1; u32 cpu_exec_ctrl; bool intr_window_requested; + unsigned count = 130; cpu_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); intr_window_requested = cpu_exec_ctrl & CPU_BASED_VIRTUAL_INTR_PENDING; - while (!guest_state_valid(vcpu)) { - if (intr_window_requested - && (kvm_get_rflags(&vmx->vcpu) & X86_EFLAGS_IF)) + while (!guest_state_valid(vcpu) && count-- != 0) { + if (intr_window_requested && vmx_interrupt_allowed(vcpu)) return handle_interrupt_window(&vmx->vcpu); + if (test_bit(KVM_REQ_EVENT, &vcpu->requests)) + return 1; + err = emulate_instruction(vcpu, 0); if (err == EMULATE_DO_MMIO) { @@ -4924,8 +5017,12 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) goto out; } - if (err != EMULATE_DONE) + if (err != EMULATE_DONE) { + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; + vcpu->run->internal.ndata = 0; return 0; + } if (signal_pending(current)) goto out; @@ -4933,7 +5030,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) schedule(); } - vmx->emulation_required = 0; + vmx->emulation_required = !guest_state_valid(vcpu); out: return ret; } @@ -6467,6 +6564,23 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu) } } } + + exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); + /* Exposing INVPCID only when PCID is exposed */ + best = kvm_find_cpuid_entry(vcpu, 0x7, 0); + if (vmx_invpcid_supported() && + best && (best->ecx & bit(X86_FEATURE_INVPCID)) && + guest_cpuid_has_pcid(vcpu)) { + exec_control |= SECONDARY_EXEC_ENABLE_INVPCID; + vmcs_write32(SECONDARY_VM_EXEC_CONTROL, + exec_control); + } else { + exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID; + vmcs_write32(SECONDARY_VM_EXEC_CONTROL, + exec_control); + if (best) + best->ecx &= ~bit(X86_FEATURE_INVPCID); + } } static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) @@ -7201,6 +7315,7 @@ static struct kvm_x86_ops vmx_x86_ops = { .cpuid_update = vmx_cpuid_update, .rdtscp_supported = vmx_rdtscp_supported, + .invpcid_supported = vmx_invpcid_supported, .set_supported_cpuid = vmx_set_supported_cpuid, @@ -7230,23 +7345,21 @@ static int __init vmx_init(void) if (!vmx_io_bitmap_a) return -ENOMEM; + r = -ENOMEM; + vmx_io_bitmap_b = (unsigned long *)__get_free_page(GFP_KERNEL); - if (!vmx_io_bitmap_b) { - r = -ENOMEM; + if (!vmx_io_bitmap_b) goto out; - } vmx_msr_bitmap_legacy = (unsigned long *)__get_free_page(GFP_KERNEL); - if (!vmx_msr_bitmap_legacy) { - r = -ENOMEM; + if (!vmx_msr_bitmap_legacy) goto out1; - } + vmx_msr_bitmap_longmode = (unsigned long *)__get_free_page(GFP_KERNEL); - if (!vmx_msr_bitmap_longmode) { - r = -ENOMEM; + if (!vmx_msr_bitmap_longmode) goto out2; - } + /* * Allow direct access to the PC debug port (it is often used for I/O @@ -7275,8 +7388,10 @@ static int __init vmx_init(void) vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false); if (enable_ept) { - kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull, - VMX_EPT_EXECUTABLE_MASK); + kvm_mmu_set_mask_ptes(0ull, + (enable_ept_ad_bits) ? VMX_EPT_ACCESS_BIT : 0ull, + (enable_ept_ad_bits) ? 
VMX_EPT_DIRTY_BIT : 0ull, + 0ull, VMX_EPT_EXECUTABLE_MASK); ept_set_mmio_spte_mask(); kvm_enable_tdp(); } else diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index be6d54929fa7..59b59508ff07 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -528,6 +528,9 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) return 1; } + if (!(cr0 & X86_CR0_PG) && kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE)) + return 1; + kvm_x86_ops->set_cr0(vcpu, cr0); if ((cr0 ^ old_cr0) & X86_CR0_PG) { @@ -604,10 +607,20 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) kvm_read_cr3(vcpu))) return 1; + if ((cr4 & X86_CR4_PCIDE) && !(old_cr4 & X86_CR4_PCIDE)) { + if (!guest_cpuid_has_pcid(vcpu)) + return 1; + + /* PCID can not be enabled when cr3[11:0]!=000H or EFER.LMA=0 */ + if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_MASK) || !is_long_mode(vcpu)) + return 1; + } + if (kvm_x86_ops->set_cr4(vcpu, cr4)) return 1; - if ((cr4 ^ old_cr4) & pdptr_bits) + if (((cr4 ^ old_cr4) & pdptr_bits) || + (!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE))) kvm_mmu_reset_context(vcpu); if ((cr4 ^ old_cr4) & X86_CR4_OSXSAVE) @@ -626,8 +639,12 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) } if (is_long_mode(vcpu)) { - if (cr3 & CR3_L_MODE_RESERVED_BITS) - return 1; + if (kvm_read_cr4(vcpu) & X86_CR4_PCIDE) { + if (cr3 & CR3_PCID_ENABLED_RESERVED_BITS) + return 1; + } else + if (cr3 & CR3_L_MODE_RESERVED_BITS) + return 1; } else { if (is_pae(vcpu)) { if (cr3 & CR3_PAE_RESERVED_BITS) @@ -795,6 +812,7 @@ static u32 msrs_to_save[] = { MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW, HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL, HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME, + MSR_KVM_PV_EOI_EN, MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, MSR_STAR, #ifdef CONFIG_X86_64 @@ -1437,8 +1455,8 @@ static int set_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data) break; } default: - pr_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x " - "data 0x%llx\n", msr, data); + vcpu_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x " + "data 0x%llx\n", msr, data); return 1; } return 0; @@ -1470,8 +1488,8 @@ static int set_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 data) case HV_X64_MSR_TPR: return kvm_hv_vapic_msr_write(vcpu, APIC_TASKPRI, data); default: - pr_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x " - "data 0x%llx\n", msr, data); + vcpu_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x " + "data 0x%llx\n", msr, data); return 1; } @@ -1551,15 +1569,15 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) data &= ~(u64)0x100; /* ignore ignne emulation enable */ data &= ~(u64)0x8; /* ignore TLB cache disable */ if (data != 0) { - pr_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n", - data); + vcpu_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n", + data); return 1; } break; case MSR_FAM10H_MMIO_CONF_BASE: if (data != 0) { - pr_unimpl(vcpu, "unimplemented MMIO_CONF_BASE wrmsr: " - "0x%llx\n", data); + vcpu_unimpl(vcpu, "unimplemented MMIO_CONF_BASE wrmsr: " + "0x%llx\n", data); return 1; } break; @@ -1574,8 +1592,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) thus reserved and should throw a #GP */ return 1; } - pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n", - __func__, data); + vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n", + __func__, data); break; case MSR_IA32_UCODE_REV: case MSR_IA32_UCODE_WRITE: @@ -1653,6 +1671,10 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) 
kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu); break; + case MSR_KVM_PV_EOI_EN: + if (kvm_lapic_enable_pv_eoi(vcpu, data)) + return 1; + break; case MSR_IA32_MCG_CTL: case MSR_IA32_MCG_STATUS: @@ -1671,8 +1693,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) case MSR_K7_EVNTSEL2: case MSR_K7_EVNTSEL3: if (data != 0) - pr_unimpl(vcpu, "unimplemented perfctr wrmsr: " - "0x%x data 0x%llx\n", msr, data); + vcpu_unimpl(vcpu, "unimplemented perfctr wrmsr: " + "0x%x data 0x%llx\n", msr, data); break; /* at least RHEL 4 unconditionally writes to the perfctr registers, * so we ignore writes to make it happy. @@ -1681,8 +1703,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) case MSR_K7_PERFCTR1: case MSR_K7_PERFCTR2: case MSR_K7_PERFCTR3: - pr_unimpl(vcpu, "unimplemented perfctr wrmsr: " - "0x%x data 0x%llx\n", msr, data); + vcpu_unimpl(vcpu, "unimplemented perfctr wrmsr: " + "0x%x data 0x%llx\n", msr, data); break; case MSR_P6_PERFCTR0: case MSR_P6_PERFCTR1: @@ -1693,8 +1715,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) return kvm_pmu_set_msr(vcpu, msr, data); if (pr || data != 0) - pr_unimpl(vcpu, "disabled perfctr wrmsr: " - "0x%x data 0x%llx\n", msr, data); + vcpu_unimpl(vcpu, "disabled perfctr wrmsr: " + "0x%x data 0x%llx\n", msr, data); break; case MSR_K7_CLK_CTL: /* @@ -1720,7 +1742,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) /* Drop writes to this legacy MSR -- see rdmsr * counterpart for further detail. */ - pr_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n", msr, data); + vcpu_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n", msr, data); break; case MSR_AMD64_OSVW_ID_LENGTH: if (!guest_cpuid_has_osvw(vcpu)) @@ -1738,12 +1760,12 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) if (kvm_pmu_msr(vcpu, msr)) return kvm_pmu_set_msr(vcpu, msr, data); if (!ignore_msrs) { - pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", - msr, data); + vcpu_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", + msr, data); return 1; } else { - pr_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n", - msr, data); + vcpu_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n", + msr, data); break; } } @@ -1846,7 +1868,7 @@ static int get_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) data = kvm->arch.hv_hypercall; break; default: - pr_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); + vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); return 1; } @@ -1877,7 +1899,7 @@ static int get_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) data = vcpu->arch.hv_vapic; break; default: - pr_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); + vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); return 1; } *pdata = data; @@ -2030,10 +2052,10 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) if (kvm_pmu_msr(vcpu, msr)) return kvm_pmu_get_msr(vcpu, msr, pdata); if (!ignore_msrs) { - pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr); + vcpu_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr); return 1; } else { - pr_unimpl(vcpu, "ignored rdmsr: 0x%x\n", msr); + vcpu_unimpl(vcpu, "ignored rdmsr: 0x%x\n", msr); data = 0; } break; @@ -4116,7 +4138,7 @@ static unsigned long emulator_get_cr(struct x86_emulate_ctxt *ctxt, int cr) value = kvm_get_cr8(vcpu); break; default: - vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr); + kvm_err("%s: unexpected cr %u\n", __func__, cr); return 0; } @@ -4145,7 +4167,7 @@ static int emulator_set_cr(struct x86_emulate_ctxt *ctxt, int cr, ulong val) res 
= kvm_set_cr8(vcpu, val); break; default: - vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr); + kvm_err("%s: unexpected cr %u\n", __func__, cr); res = -1; } @@ -4297,26 +4319,10 @@ static int emulator_intercept(struct x86_emulate_ctxt *ctxt, return kvm_x86_ops->check_intercept(emul_to_vcpu(ctxt), info, stage); } -static bool emulator_get_cpuid(struct x86_emulate_ctxt *ctxt, +static void emulator_get_cpuid(struct x86_emulate_ctxt *ctxt, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx) { - struct kvm_cpuid_entry2 *cpuid = NULL; - - if (eax && ecx) - cpuid = kvm_find_cpuid_entry(emul_to_vcpu(ctxt), - *eax, *ecx); - - if (cpuid) { - *eax = cpuid->eax; - *ecx = cpuid->ecx; - if (ebx) - *ebx = cpuid->ebx; - if (edx) - *edx = cpuid->edx; - return true; - } - - return false; + kvm_cpuid(emul_to_vcpu(ctxt), eax, ebx, ecx, edx); } static struct x86_emulate_ops emulate_ops = { @@ -5296,8 +5302,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) r = kvm_mmu_reload(vcpu); if (unlikely(r)) { - kvm_x86_ops->cancel_injection(vcpu); - goto out; + goto cancel_injection; } preempt_disable(); @@ -5322,9 +5327,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) smp_wmb(); local_irq_enable(); preempt_enable(); - kvm_x86_ops->cancel_injection(vcpu); r = 1; - goto out; + goto cancel_injection; } srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); @@ -5388,9 +5392,16 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (unlikely(vcpu->arch.tsc_always_catchup)) kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); - kvm_lapic_sync_from_vapic(vcpu); + if (vcpu->arch.apic_attention) + kvm_lapic_sync_from_vapic(vcpu); r = kvm_x86_ops->handle_exit(vcpu); + return r; + +cancel_injection: + kvm_x86_ops->cancel_injection(vcpu); + if (unlikely(vcpu->arch.apic_attention)) + kvm_lapic_sync_from_vapic(vcpu); out: return r; } @@ -6304,7 +6315,7 @@ void kvm_arch_free_memslot(struct kvm_memory_slot *free, for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { if (!dont || free->arch.lpage_info[i] != dont->arch.lpage_info[i]) { - vfree(free->arch.lpage_info[i]); + kvm_kvfree(free->arch.lpage_info[i]); free->arch.lpage_info[i] = NULL; } } @@ -6323,7 +6334,7 @@ int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages) slot->base_gfn, level) + 1; slot->arch.lpage_info[i] = - vzalloc(lpages * sizeof(*slot->arch.lpage_info[i])); + kvm_kvzalloc(lpages * sizeof(*slot->arch.lpage_info[i])); if (!slot->arch.lpage_info[i]) goto out_free; @@ -6350,7 +6361,7 @@ int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages) out_free: for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { - vfree(slot->arch.lpage_info[i]); + kvm_kvfree(slot->arch.lpage_info[i]); slot->arch.lpage_info[i] = NULL; } return -ENOMEM; diff --git a/arch/x86/lib/csum-wrappers_64.c b/arch/x86/lib/csum-wrappers_64.c index 459b58a8a15c..25b7ae8d058a 100644 --- a/arch/x86/lib/csum-wrappers_64.c +++ b/arch/x86/lib/csum-wrappers_64.c @@ -115,7 +115,7 @@ EXPORT_SYMBOL(csum_partial_copy_to_user); * @src: source address * @dst: destination address * @len: number of bytes to be copied. - * @isum: initial sum that is added into the result (32bit unfolded) + * @sum: initial sum that is added into the result (32bit unfolded) * * Returns an 32bit unfolded checksum of the buffer. 
*/ diff --git a/arch/x86/lib/msr-reg-export.c b/arch/x86/lib/msr-reg-export.c index a311cc59b65d..8d6ef78b5d01 100644 --- a/arch/x86/lib/msr-reg-export.c +++ b/arch/x86/lib/msr-reg-export.c @@ -1,5 +1,5 @@ #include <linux/module.h> #include <asm/msr.h> -EXPORT_SYMBOL(native_rdmsr_safe_regs); -EXPORT_SYMBOL(native_wrmsr_safe_regs); +EXPORT_SYMBOL(rdmsr_safe_regs); +EXPORT_SYMBOL(wrmsr_safe_regs); diff --git a/arch/x86/lib/msr-reg.S b/arch/x86/lib/msr-reg.S index 69fa10623f21..f6d13eefad10 100644 --- a/arch/x86/lib/msr-reg.S +++ b/arch/x86/lib/msr-reg.S @@ -6,13 +6,13 @@ #ifdef CONFIG_X86_64 /* - * int native_{rdmsr,wrmsr}_safe_regs(u32 gprs[8]); + * int {rdmsr,wrmsr}_safe_regs(u32 gprs[8]); * * reg layout: u32 gprs[eax, ecx, edx, ebx, esp, ebp, esi, edi] * */ .macro op_safe_regs op -ENTRY(native_\op\()_safe_regs) +ENTRY(\op\()_safe_regs) CFI_STARTPROC pushq_cfi %rbx pushq_cfi %rbp @@ -45,13 +45,13 @@ ENTRY(native_\op\()_safe_regs) _ASM_EXTABLE(1b, 3b) CFI_ENDPROC -ENDPROC(native_\op\()_safe_regs) +ENDPROC(\op\()_safe_regs) .endm #else /* X86_32 */ .macro op_safe_regs op -ENTRY(native_\op\()_safe_regs) +ENTRY(\op\()_safe_regs) CFI_STARTPROC pushl_cfi %ebx pushl_cfi %ebp @@ -92,7 +92,7 @@ ENTRY(native_\op\()_safe_regs) _ASM_EXTABLE(1b, 3b) CFI_ENDPROC -ENDPROC(native_\op\()_safe_regs) +ENDPROC(\op\()_safe_regs) .endm #endif diff --git a/arch/x86/lib/usercopy.c b/arch/x86/lib/usercopy.c index 677b1ed184c9..4f74d94c8d97 100644 --- a/arch/x86/lib/usercopy.c +++ b/arch/x86/lib/usercopy.c @@ -22,7 +22,7 @@ copy_from_user_nmi(void *to, const void __user *from, unsigned long n) void *map; int ret; - if (__range_not_ok(from, n, TASK_SIZE) == 0) + if (__range_not_ok(from, n, TASK_SIZE)) return len; do { diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index bc4e9d84157f..e0e6990723e9 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -385,7 +385,7 @@ void free_initmem(void) } #ifdef CONFIG_BLK_DEV_INITRD -void free_initrd_mem(unsigned long start, unsigned long end) +void __init free_initrd_mem(unsigned long start, unsigned long end) { /* * end could be not aligned, and We can not align that, diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index be1ef574ce9a..78fe3f1ac49f 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -180,7 +180,7 @@ err_free_memtype: /** * ioremap_nocache - map bus memory into CPU space - * @offset: bus address of the memory + * @phys_addr: bus address of the memory * @size: size of the resource to map * * ioremap_nocache performs a platform specific sequence of operations to @@ -217,7 +217,7 @@ EXPORT_SYMBOL(ioremap_nocache); /** * ioremap_wc - map memory into CPU space write combined - * @offset: bus address of the memory + * @phys_addr: bus address of the memory * @size: size of the resource to map * * This version of ioremap ensures that the memory is marked write combining. 
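The copy_from_user_nmi() hunk above flips an inverted range check: __range_not_ok() is a predicate that returns nonzero when the user range is invalid, so the early return must fire when it is nonzero, not when it is zero. The fragment below is only a minimal sketch of that guard polarity, assuming a 64-bit build; range_not_ok(), nmi_copy_demo() and LIMIT_DEMO are made-up stand-ins, not the kernel's symbols.

#include <stdio.h>

#define LIMIT_DEMO 0x7ffffffff000UL        /* stand-in for TASK_SIZE */

static int range_not_ok(unsigned long addr, unsigned long size,
                        unsigned long limit)
{
        /* invalid if the range wraps around or runs past the limit */
        return (addr + size < addr) || (addr + size > limit);
}

static unsigned long nmi_copy_demo(unsigned long from, unsigned long n)
{
        if (range_not_ok(from, n, LIMIT_DEMO))  /* bail out when NOT ok */
                return 0;                       /* nothing copied */
        /* ... the page-by-page copy of the real function is elided ... */
        return n;
}

int main(void)
{
        printf("valid range:    %lu bytes\n", nmi_copy_demo(0x1000, 64));
        printf("wrapping range: %lu bytes\n", nmi_copy_demo(~0UL - 8, 64));
        return 0;
}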
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index e1ebde315210..a718e0d23503 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -122,7 +122,7 @@ within(unsigned long addr, unsigned long start, unsigned long end) /** * clflush_cache_range - flush a cache range with clflush - * @addr: virtual start address + * @vaddr: virtual start address * @size: number of bytes to flush * * clflush is an unordered instruction which needs fencing with mfence diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 0597f95b6da6..33643a8bcbbb 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -309,6 +309,10 @@ void bpf_jit_compile(struct sk_filter *fp) else EMIT1_off32(0x0d, K); /* or imm32,%eax */ break; + case BPF_S_ANC_ALU_XOR_X: /* A ^= X; */ + seen |= SEEN_XREG; + EMIT2(0x31, 0xd8); /* xor %ebx,%eax */ + break; case BPF_S_ALU_LSH_X: /* A <<= X; */ seen |= SEEN_XREG; EMIT4(0x89, 0xd9, 0xd3, 0xe0); /* mov %ebx,%ecx; shl %cl,%eax */ diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index 303f08637826..b2b94438ff05 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -312,7 +312,7 @@ static int op_amd_fill_in_addresses(struct op_msrs * const msrs) goto fail; } /* both registers must be reserved */ - if (num_counters == AMD64_NUM_COUNTERS_F15H) { + if (num_counters == AMD64_NUM_COUNTERS_CORE) { msrs->counters[i].addr = MSR_F15H_PERF_CTR + (i << 1); msrs->controls[i].addr = MSR_F15H_PERF_CTL + (i << 1); } else { @@ -514,7 +514,7 @@ static int op_amd_init(struct oprofile_operations *ops) ops->create_files = setup_ibs_files; if (boot_cpu_data.x86 == 0x15) { - num_counters = AMD64_NUM_COUNTERS_F15H; + num_counters = AMD64_NUM_COUNTERS_CORE; } else { num_counters = AMD64_NUM_COUNTERS; } diff --git a/arch/x86/platform/mrst/early_printk_mrst.c b/arch/x86/platform/mrst/early_printk_mrst.c index 3c6e328483c7..028454f0c3a5 100644 --- a/arch/x86/platform/mrst/early_printk_mrst.c +++ b/arch/x86/platform/mrst/early_printk_mrst.c @@ -110,19 +110,16 @@ static struct kmsg_dumper dw_dumper; static int dumper_registered; static void dw_kmsg_dump(struct kmsg_dumper *dumper, - enum kmsg_dump_reason reason, - const char *s1, unsigned long l1, - const char *s2, unsigned long l2) + enum kmsg_dump_reason reason) { - int i; + static char line[1024]; + size_t len; /* When run to this, we'd better re-init the HW */ mrst_early_console_init(); - for (i = 0; i < l1; i++) - early_mrst_console.write(&early_mrst_console, s1 + i, 1); - for (i = 0; i < l2; i++) - early_mrst_console.write(&early_mrst_console, s2 + i, 1); + while (kmsg_dump_get_line(dumper, true, line, sizeof(line), &len)) + early_mrst_console.write(&early_mrst_console, line, len); } /* Set the ratio rate to 115200, 8n1, IRQ disabled */ diff --git a/arch/x86/platform/olpc/olpc-xo15-sci.c b/arch/x86/platform/olpc/olpc-xo15-sci.c index 23e5b9d7977b..599be499fdf7 100644 --- a/arch/x86/platform/olpc/olpc-xo15-sci.c +++ b/arch/x86/platform/olpc/olpc-xo15-sci.c @@ -203,7 +203,7 @@ static int xo15_sci_remove(struct acpi_device *device, int type) return 0; } -static int xo15_sci_resume(struct acpi_device *device) +static int xo15_sci_resume(struct device *dev) { /* Enable all EC events */ olpc_ec_mask_write(EC_SCI_SRC_ALL); @@ -215,6 +215,8 @@ static int xo15_sci_resume(struct acpi_device *device) return 0; } +static SIMPLE_DEV_PM_OPS(xo15_sci_pm, NULL, xo15_sci_resume); + static const struct acpi_device_id xo15_sci_device_ids[] = { 
{"XO15EC", 0}, {"", 0}, @@ -227,8 +229,8 @@ static struct acpi_driver xo15_sci_drv = { .ops = { .add = xo15_sci_add, .remove = xo15_sci_remove, - .resume = xo15_sci_resume, }, + .drv.pm = &xo15_sci_pm, }; static int __init xo15_sci_init(void) diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c index 59880afa851f..71b5d5a07d7b 100644 --- a/arch/x86/platform/uv/tlb_uv.c +++ b/arch/x86/platform/uv/tlb_uv.c @@ -1,7 +1,7 @@ /* * SGI UltraViolet TLB flush routines. * - * (c) 2008-2011 Cliff Wickman <cpw@sgi.com>, SGI. + * (c) 2008-2012 Cliff Wickman <cpw@sgi.com>, SGI. * * This code is released under the GNU General Public License version 2 or * later. @@ -38,8 +38,7 @@ static int timeout_base_ns[] = { static int timeout_us; static int nobau; -static int baudisabled; -static spinlock_t disable_lock; +static int nobau_perm; static cycles_t congested_cycles; /* tunables: */ @@ -47,12 +46,13 @@ static int max_concurr = MAX_BAU_CONCURRENT; static int max_concurr_const = MAX_BAU_CONCURRENT; static int plugged_delay = PLUGGED_DELAY; static int plugsb4reset = PLUGSB4RESET; +static int giveup_limit = GIVEUP_LIMIT; static int timeoutsb4reset = TIMEOUTSB4RESET; static int ipi_reset_limit = IPI_RESET_LIMIT; static int complete_threshold = COMPLETE_THRESHOLD; static int congested_respns_us = CONGESTED_RESPONSE_US; static int congested_reps = CONGESTED_REPS; -static int congested_period = CONGESTED_PERIOD; +static int disabled_period = DISABLED_PERIOD; static struct tunables tunables[] = { {&max_concurr, MAX_BAU_CONCURRENT}, /* must be [0] */ @@ -63,7 +63,8 @@ static struct tunables tunables[] = { {&complete_threshold, COMPLETE_THRESHOLD}, {&congested_respns_us, CONGESTED_RESPONSE_US}, {&congested_reps, CONGESTED_REPS}, - {&congested_period, CONGESTED_PERIOD} + {&disabled_period, DISABLED_PERIOD}, + {&giveup_limit, GIVEUP_LIMIT} }; static struct dentry *tunables_dir; @@ -120,6 +121,40 @@ static DEFINE_PER_CPU(struct ptc_stats, ptcstats); static DEFINE_PER_CPU(struct bau_control, bau_control); static DEFINE_PER_CPU(cpumask_var_t, uv_flush_tlb_mask); +static void +set_bau_on(void) +{ + int cpu; + struct bau_control *bcp; + + if (nobau_perm) { + pr_info("BAU not initialized; cannot be turned on\n"); + return; + } + nobau = 0; + for_each_present_cpu(cpu) { + bcp = &per_cpu(bau_control, cpu); + bcp->nobau = 0; + } + pr_info("BAU turned on\n"); + return; +} + +static void +set_bau_off(void) +{ + int cpu; + struct bau_control *bcp; + + nobau = 1; + for_each_present_cpu(cpu) { + bcp = &per_cpu(bau_control, cpu); + bcp->nobau = 1; + } + pr_info("BAU turned off\n"); + return; +} + /* * Determine the first node on a uvhub. 'Nodes' are used for kernel * memory allocation. @@ -278,7 +313,7 @@ static void bau_process_message(struct msg_desc *mdp, struct bau_control *bcp, * Both sockets dump their completed count total into * the message's count. */ - smaster->socket_acknowledge_count[mdp->msg_slot] = 0; + *sp = 0; asp = (struct atomic_short *)&msg->acknowledge_count; msg_ack_count = atom_asr(socket_ack_count, asp); @@ -491,16 +526,15 @@ static int uv1_wait_completion(struct bau_desc *bau_desc, } /* - * UV2 has an extra bit of status in the ACTIVATION_STATUS_2 register. + * UV2 could have an extra bit of status in the ACTIVATION_STATUS_2 register. + * But not currently used. 
*/ static unsigned long uv2_read_status(unsigned long offset, int rshft, int desc) { unsigned long descriptor_status; - unsigned long descriptor_status2; - descriptor_status = ((read_lmmr(offset) >> rshft) & UV_ACT_STATUS_MASK); - descriptor_status2 = (read_mmr_uv2_status() >> desc) & 0x1UL; - descriptor_status = (descriptor_status << 1) | descriptor_status2; + descriptor_status = + ((read_lmmr(offset) >> rshft) & UV_ACT_STATUS_MASK) << 1; return descriptor_status; } @@ -531,87 +565,11 @@ int normal_busy(struct bau_control *bcp) */ int handle_uv2_busy(struct bau_control *bcp) { - int busy_one = bcp->using_desc; - int normal = bcp->uvhub_cpu; - int selected = -1; - int i; - unsigned long descriptor_status; - unsigned long status; - int mmr_offset; - struct bau_desc *bau_desc_old; - struct bau_desc *bau_desc_new; - struct bau_control *hmaster = bcp->uvhub_master; struct ptc_stats *stat = bcp->statp; - cycles_t ttm; stat->s_uv2_wars++; - spin_lock(&hmaster->uvhub_lock); - /* try for the original first */ - if (busy_one != normal) { - if (!normal_busy(bcp)) - selected = normal; - } - if (selected < 0) { - /* can't use the normal, select an alternate */ - mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_1; - descriptor_status = read_lmmr(mmr_offset); - - /* scan available descriptors 32-63 */ - for (i = 0; i < UV_CPUS_PER_AS; i++) { - if ((hmaster->inuse_map & (1 << i)) == 0) { - status = ((descriptor_status >> - (i * UV_ACT_STATUS_SIZE)) & - UV_ACT_STATUS_MASK) << 1; - if (status != UV2H_DESC_BUSY) { - selected = i + UV_CPUS_PER_AS; - break; - } - } - } - } - - if (busy_one != normal) - /* mark the busy alternate as not in-use */ - hmaster->inuse_map &= ~(1 << (busy_one - UV_CPUS_PER_AS)); - - if (selected >= 0) { - /* switch to the selected descriptor */ - if (selected != normal) { - /* set the selected alternate as in-use */ - hmaster->inuse_map |= - (1 << (selected - UV_CPUS_PER_AS)); - if (selected > stat->s_uv2_wars_hw) - stat->s_uv2_wars_hw = selected; - } - bau_desc_old = bcp->descriptor_base; - bau_desc_old += (ITEMS_PER_DESC * busy_one); - bcp->using_desc = selected; - bau_desc_new = bcp->descriptor_base; - bau_desc_new += (ITEMS_PER_DESC * selected); - *bau_desc_new = *bau_desc_old; - } else { - /* - * All are busy. Wait for the normal one for this cpu to - * free up. - */ - stat->s_uv2_war_waits++; - spin_unlock(&hmaster->uvhub_lock); - ttm = get_cycles(); - do { - cpu_relax(); - } while (normal_busy(bcp)); - spin_lock(&hmaster->uvhub_lock); - /* switch to the original descriptor */ - bcp->using_desc = normal; - bau_desc_old = bcp->descriptor_base; - bau_desc_old += (ITEMS_PER_DESC * bcp->using_desc); - bcp->using_desc = (ITEMS_PER_DESC * normal); - bau_desc_new = bcp->descriptor_base; - bau_desc_new += (ITEMS_PER_DESC * normal); - *bau_desc_new = *bau_desc_old; /* copy the entire descriptor */ - } - spin_unlock(&hmaster->uvhub_lock); - return FLUSH_RETRY_BUSYBUG; + bcp->busy = 1; + return FLUSH_GIVEUP; } static int uv2_wait_completion(struct bau_desc *bau_desc, @@ -620,7 +578,7 @@ static int uv2_wait_completion(struct bau_desc *bau_desc, { unsigned long descriptor_stat; cycles_t ttm; - int desc = bcp->using_desc; + int desc = bcp->uvhub_cpu; long busy_reps = 0; struct ptc_stats *stat = bcp->statp; @@ -628,24 +586,38 @@ static int uv2_wait_completion(struct bau_desc *bau_desc, /* spin on the status MMR, waiting for it to go idle */ while (descriptor_stat != UV2H_DESC_IDLE) { - /* - * Our software ack messages may be blocked because - * there are no swack resources available. 
As long - * as none of them has timed out hardware will NACK - * our message and its state will stay IDLE. - */ - if ((descriptor_stat == UV2H_DESC_SOURCE_TIMEOUT) || - (descriptor_stat == UV2H_DESC_DEST_PUT_ERR)) { + if ((descriptor_stat == UV2H_DESC_SOURCE_TIMEOUT)) { + /* + * A h/w bug on the destination side may + * have prevented the message being marked + * pending, thus it doesn't get replied to + * and gets continually nacked until it times + * out with a SOURCE_TIMEOUT. + */ stat->s_stimeout++; return FLUSH_GIVEUP; - } else if (descriptor_stat == UV2H_DESC_DEST_STRONG_NACK) { - stat->s_strongnacks++; - bcp->conseccompletes = 0; - return FLUSH_GIVEUP; } else if (descriptor_stat == UV2H_DESC_DEST_TIMEOUT) { + ttm = get_cycles(); + + /* + * Our retries may be blocked by all destination + * swack resources being consumed, and a timeout + * pending. In that case hardware returns the + * ERROR that looks like a destination timeout. + * Without using the extended status we have to + * deduce from the short time that this was a + * strong nack. + */ + if (cycles_2_us(ttm - bcp->send_message) < timeout_us) { + bcp->conseccompletes = 0; + stat->s_plugged++; + /* FLUSH_RETRY_PLUGGED causes hang on boot */ + return FLUSH_GIVEUP; + } stat->s_dtimeout++; bcp->conseccompletes = 0; - return FLUSH_RETRY_TIMEOUT; + /* FLUSH_RETRY_TIMEOUT causes hang on boot */ + return FLUSH_GIVEUP; } else { busy_reps++; if (busy_reps > 1000000) { @@ -653,9 +625,8 @@ static int uv2_wait_completion(struct bau_desc *bau_desc, busy_reps = 0; ttm = get_cycles(); if ((ttm - bcp->send_message) > - (bcp->clocks_per_100_usec)) { + bcp->timeout_interval) return handle_uv2_busy(bcp); - } } /* * descriptor_stat is still BUSY @@ -679,7 +650,7 @@ static int wait_completion(struct bau_desc *bau_desc, { int right_shift; unsigned long mmr_offset; - int desc = bcp->using_desc; + int desc = bcp->uvhub_cpu; if (desc < UV_CPUS_PER_AS) { mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_0; @@ -758,33 +729,31 @@ static void destination_timeout(struct bau_desc *bau_desc, } /* - * Completions are taking a very long time due to a congested numalink - * network. + * Stop all cpus on a uvhub from using the BAU for a period of time. + * This is reversed by check_enable. 
*/ -static void disable_for_congestion(struct bau_control *bcp, - struct ptc_stats *stat) +static void disable_for_period(struct bau_control *bcp, struct ptc_stats *stat) { - /* let only one cpu do this disabling */ - spin_lock(&disable_lock); - - if (!baudisabled && bcp->period_requests && - ((bcp->period_time / bcp->period_requests) > congested_cycles)) { - int tcpu; - struct bau_control *tbcp; - /* it becomes this cpu's job to turn on the use of the - BAU again */ - baudisabled = 1; - bcp->set_bau_off = 1; - bcp->set_bau_on_time = get_cycles(); - bcp->set_bau_on_time += sec_2_cycles(bcp->cong_period); + int tcpu; + struct bau_control *tbcp; + struct bau_control *hmaster; + cycles_t tm1; + + hmaster = bcp->uvhub_master; + spin_lock(&hmaster->disable_lock); + if (!bcp->baudisabled) { stat->s_bau_disabled++; + tm1 = get_cycles(); for_each_present_cpu(tcpu) { tbcp = &per_cpu(bau_control, tcpu); - tbcp->baudisabled = 1; + if (tbcp->uvhub_master == hmaster) { + tbcp->baudisabled = 1; + tbcp->set_bau_on_time = + tm1 + bcp->disabled_period; + } } } - - spin_unlock(&disable_lock); + spin_unlock(&hmaster->disable_lock); } static void count_max_concurr(int stat, struct bau_control *bcp, @@ -815,16 +784,30 @@ static void record_send_stats(cycles_t time1, cycles_t time2, bcp->period_requests++; bcp->period_time += elapsed; if ((elapsed > congested_cycles) && - (bcp->period_requests > bcp->cong_reps)) - disable_for_congestion(bcp, stat); + (bcp->period_requests > bcp->cong_reps) && + ((bcp->period_time / bcp->period_requests) > + congested_cycles)) { + stat->s_congested++; + disable_for_period(bcp, stat); + } } } else stat->s_requestor--; if (completion_status == FLUSH_COMPLETE && try > 1) stat->s_retriesok++; - else if (completion_status == FLUSH_GIVEUP) + else if (completion_status == FLUSH_GIVEUP) { stat->s_giveup++; + if (get_cycles() > bcp->period_end) + bcp->period_giveups = 0; + bcp->period_giveups++; + if (bcp->period_giveups == 1) + bcp->period_end = get_cycles() + bcp->disabled_period; + if (bcp->period_giveups > bcp->giveup_limit) { + disable_for_period(bcp, stat); + stat->s_giveuplimit++; + } + } } /* @@ -868,7 +851,8 @@ static void handle_cmplt(int completion_status, struct bau_desc *bau_desc, * Returns 1 if it gives up entirely and the original cpu mask is to be * returned to the kernel. 
*/ -int uv_flush_send_and_wait(struct cpumask *flush_mask, struct bau_control *bcp) +int uv_flush_send_and_wait(struct cpumask *flush_mask, struct bau_control *bcp, + struct bau_desc *bau_desc) { int seq_number = 0; int completion_stat = 0; @@ -881,24 +865,23 @@ int uv_flush_send_and_wait(struct cpumask *flush_mask, struct bau_control *bcp) struct bau_control *hmaster = bcp->uvhub_master; struct uv1_bau_msg_header *uv1_hdr = NULL; struct uv2_bau_msg_header *uv2_hdr = NULL; - struct bau_desc *bau_desc; - if (bcp->uvhub_version == 1) + if (bcp->uvhub_version == 1) { + uv1 = 1; uv1_throttle(hmaster, stat); + } while (hmaster->uvhub_quiesce) cpu_relax(); time1 = get_cycles(); + if (uv1) + uv1_hdr = &bau_desc->header.uv1_hdr; + else + uv2_hdr = &bau_desc->header.uv2_hdr; + do { - bau_desc = bcp->descriptor_base; - bau_desc += (ITEMS_PER_DESC * bcp->using_desc); - if (bcp->uvhub_version == 1) { - uv1 = 1; - uv1_hdr = &bau_desc->header.uv1_hdr; - } else - uv2_hdr = &bau_desc->header.uv2_hdr; - if ((try == 0) || (completion_stat == FLUSH_RETRY_BUSYBUG)) { + if (try == 0) { if (uv1) uv1_hdr->msg_type = MSG_REGULAR; else @@ -916,25 +899,24 @@ int uv_flush_send_and_wait(struct cpumask *flush_mask, struct bau_control *bcp) uv1_hdr->sequence = seq_number; else uv2_hdr->sequence = seq_number; - index = (1UL << AS_PUSH_SHIFT) | bcp->using_desc; + index = (1UL << AS_PUSH_SHIFT) | bcp->uvhub_cpu; bcp->send_message = get_cycles(); write_mmr_activation(index); try++; completion_stat = wait_completion(bau_desc, bcp, try); - /* UV2: wait_completion() may change the bcp->using_desc */ handle_cmplt(completion_stat, bau_desc, bcp, hmaster, stat); if (bcp->ipi_attempts >= bcp->ipi_reset_limit) { bcp->ipi_attempts = 0; + stat->s_overipilimit++; completion_stat = FLUSH_GIVEUP; break; } cpu_relax(); } while ((completion_stat == FLUSH_RETRY_PLUGGED) || - (completion_stat == FLUSH_RETRY_BUSYBUG) || (completion_stat == FLUSH_RETRY_TIMEOUT)); time2 = get_cycles(); @@ -955,28 +937,33 @@ int uv_flush_send_and_wait(struct cpumask *flush_mask, struct bau_control *bcp) } /* - * The BAU is disabled. When the disabled time period has expired, the cpu - * that disabled it must re-enable it. - * Return 0 if it is re-enabled for all cpus. + * The BAU is disabled for this uvhub. When the disabled time period has + * expired re-enable it. + * Return 0 if it is re-enabled for all cpus on this uvhub. 
*/ static int check_enable(struct bau_control *bcp, struct ptc_stats *stat) { int tcpu; struct bau_control *tbcp; + struct bau_control *hmaster; - if (bcp->set_bau_off) { - if (get_cycles() >= bcp->set_bau_on_time) { - stat->s_bau_reenabled++; - baudisabled = 0; - for_each_present_cpu(tcpu) { - tbcp = &per_cpu(bau_control, tcpu); + hmaster = bcp->uvhub_master; + spin_lock(&hmaster->disable_lock); + if (bcp->baudisabled && (get_cycles() >= bcp->set_bau_on_time)) { + stat->s_bau_reenabled++; + for_each_present_cpu(tcpu) { + tbcp = &per_cpu(bau_control, tcpu); + if (tbcp->uvhub_master == hmaster) { tbcp->baudisabled = 0; tbcp->period_requests = 0; tbcp->period_time = 0; + tbcp->period_giveups = 0; } - return 0; } + spin_unlock(&hmaster->disable_lock); + return 0; } + spin_unlock(&hmaster->disable_lock); return -1; } @@ -1078,18 +1065,32 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, struct cpumask *flush_mask; struct ptc_stats *stat; struct bau_control *bcp; - - /* kernel was booted 'nobau' */ - if (nobau) - return cpumask; + unsigned long descriptor_status; + unsigned long status; bcp = &per_cpu(bau_control, cpu); stat = bcp->statp; + stat->s_enters++; + + if (bcp->nobau) + return cpumask; + + if (bcp->busy) { + descriptor_status = + read_lmmr(UVH_LB_BAU_SB_ACTIVATION_STATUS_0); + status = ((descriptor_status >> (bcp->uvhub_cpu * + UV_ACT_STATUS_SIZE)) & UV_ACT_STATUS_MASK) << 1; + if (status == UV2H_DESC_BUSY) + return cpumask; + bcp->busy = 0; + } /* bau was disabled due to slow response */ if (bcp->baudisabled) { - if (check_enable(bcp, stat)) + if (check_enable(bcp, stat)) { + stat->s_ipifordisabled++; return cpumask; + } } /* @@ -1105,7 +1106,7 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, stat->s_ntargself++; bau_desc = bcp->descriptor_base; - bau_desc += (ITEMS_PER_DESC * bcp->using_desc); + bau_desc += (ITEMS_PER_DESC * bcp->uvhub_cpu); bau_uvhubs_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE); if (set_distrib_bits(flush_mask, bcp, bau_desc, &locals, &remotes)) return NULL; @@ -1118,25 +1119,27 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, * uv_flush_send_and_wait returns 0 if all cpu's were messaged, * or 1 if it gave up and the original cpumask should be returned. */ - if (!uv_flush_send_and_wait(flush_mask, bcp)) + if (!uv_flush_send_and_wait(flush_mask, bcp, bau_desc)) return NULL; else return cpumask; } /* - * Search the message queue for any 'other' message with the same software - * acknowledge resource bit vector. + * Search the message queue for any 'other' unprocessed message with the + * same software acknowledge resource bit vector as the 'msg' message. */ struct bau_pq_entry *find_another_by_swack(struct bau_pq_entry *msg, - struct bau_control *bcp, unsigned char swack_vec) + struct bau_control *bcp) { struct bau_pq_entry *msg_next = msg + 1; + unsigned char swack_vec = msg->swack_vec; if (msg_next > bcp->queue_last) msg_next = bcp->queue_first; - while ((msg_next->swack_vec != 0) && (msg_next != msg)) { - if (msg_next->swack_vec == swack_vec) + while (msg_next != msg) { + if ((msg_next->canceled == 0) && (msg_next->replied_to == 0) && + (msg_next->swack_vec == swack_vec)) return msg_next; msg_next++; if (msg_next > bcp->queue_last) @@ -1165,32 +1168,30 @@ void process_uv2_message(struct msg_desc *mdp, struct bau_control *bcp) * This message was assigned a swack resource, but no * reserved acknowlegment is pending. * The bug has prevented this message from setting the MMR. 
- * And no other message has used the same sw_ack resource. - * Do the requested shootdown but do not reply to the msg. - * (the 0 means make no acknowledge) */ - bau_process_message(mdp, bcp, 0); - return; - } - - /* - * Some message has set the MMR 'pending' bit; it might have been - * another message. Look for that message. - */ - other_msg = find_another_by_swack(msg, bcp, msg->swack_vec); - if (other_msg) { - /* There is another. Do not ack the current one. */ - bau_process_message(mdp, bcp, 0); /* - * Let the natural processing of that message acknowledge - * it. Don't get the processing of sw_ack's out of order. + * Some message has set the MMR 'pending' bit; it might have + * been another message. Look for that message. */ - return; + other_msg = find_another_by_swack(msg, bcp); + if (other_msg) { + /* + * There is another. Process this one but do not + * ack it. + */ + bau_process_message(mdp, bcp, 0); + /* + * Let the natural processing of that other message + * acknowledge it. Don't get the processing of sw_ack's + * out of order. + */ + return; + } } /* - * There is no other message using this sw_ack, so it is safe to - * acknowledge it. + * Either the MMR shows this one pending a reply or there is no + * other message using this sw_ack, so it is safe to acknowledge it. */ bau_process_message(mdp, bcp, 1); @@ -1295,7 +1296,8 @@ static void __init enable_timeouts(void) */ mmr_image |= (1L << SOFTACK_MSHIFT); if (is_uv2_hub()) { - mmr_image |= (1L << UV2_EXT_SHFT); + /* hw bug workaround; do not use extended status */ + mmr_image &= ~(1L << UV2_EXT_SHFT); } write_mmr_misc_control(pnode, mmr_image); } @@ -1338,29 +1340,34 @@ static inline unsigned long long usec_2_cycles(unsigned long microsec) static int ptc_seq_show(struct seq_file *file, void *data) { struct ptc_stats *stat; + struct bau_control *bcp; int cpu; cpu = *(loff_t *)data; if (!cpu) { seq_printf(file, - "# cpu sent stime self locals remotes ncpus localhub "); + "# cpu bauoff sent stime self locals remotes ncpus localhub "); seq_printf(file, "remotehub numuvhubs numuvhubs16 numuvhubs8 "); seq_printf(file, - "numuvhubs4 numuvhubs2 numuvhubs1 dto snacks retries rok "); + "numuvhubs4 numuvhubs2 numuvhubs1 dto snacks retries "); + seq_printf(file, + "rok resetp resett giveup sto bz throt disable "); seq_printf(file, - "resetp resett giveup sto bz throt swack recv rtime "); + "enable wars warshw warwaits enters ipidis plugged "); seq_printf(file, - "all one mult none retry canc nocan reset rcan "); + "ipiover glim cong swack recv rtime all one mult "); seq_printf(file, - "disable enable wars warshw warwaits\n"); + "none retry canc nocan reset rcan\n"); } if (cpu < num_possible_cpus() && cpu_online(cpu)) { - stat = &per_cpu(ptcstats, cpu); + bcp = &per_cpu(bau_control, cpu); + stat = bcp->statp; /* source side statistics */ seq_printf(file, - "cpu %d %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ", - cpu, stat->s_requestor, cycles_2_us(stat->s_time), + "cpu %d %d %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ", + cpu, bcp->nobau, stat->s_requestor, + cycles_2_us(stat->s_time), stat->s_ntargself, stat->s_ntarglocals, stat->s_ntargremotes, stat->s_ntargcpu, stat->s_ntarglocaluvhub, stat->s_ntargremoteuvhub, @@ -1374,20 +1381,23 @@ static int ptc_seq_show(struct seq_file *file, void *data) stat->s_resets_plug, stat->s_resets_timeout, stat->s_giveup, stat->s_stimeout, stat->s_busy, stat->s_throttles); + seq_printf(file, "%ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ", + stat->s_bau_disabled, stat->s_bau_reenabled, + stat->s_uv2_wars, 
stat->s_uv2_wars_hw, + stat->s_uv2_war_waits, stat->s_enters, + stat->s_ipifordisabled, stat->s_plugged, + stat->s_overipilimit, stat->s_giveuplimit, + stat->s_congested); /* destination side statistics */ seq_printf(file, - "%lx %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ", + "%lx %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld\n", read_gmmr_sw_ack(uv_cpu_to_pnode(cpu)), stat->d_requestee, cycles_2_us(stat->d_time), stat->d_alltlb, stat->d_onetlb, stat->d_multmsg, stat->d_nomsg, stat->d_retries, stat->d_canceled, stat->d_nocanceled, stat->d_resets, stat->d_rcanceled); - seq_printf(file, "%ld %ld %ld %ld %ld\n", - stat->s_bau_disabled, stat->s_bau_reenabled, - stat->s_uv2_wars, stat->s_uv2_wars_hw, - stat->s_uv2_war_waits); } return 0; } @@ -1401,13 +1411,14 @@ static ssize_t tunables_read(struct file *file, char __user *userbuf, char *buf; int ret; - buf = kasprintf(GFP_KERNEL, "%s %s %s\n%d %d %d %d %d %d %d %d %d\n", - "max_concur plugged_delay plugsb4reset", - "timeoutsb4reset ipi_reset_limit complete_threshold", - "congested_response_us congested_reps congested_period", + buf = kasprintf(GFP_KERNEL, "%s %s %s\n%d %d %d %d %d %d %d %d %d %d\n", + "max_concur plugged_delay plugsb4reset timeoutsb4reset", + "ipi_reset_limit complete_threshold congested_response_us", + "congested_reps disabled_period giveup_limit", max_concurr, plugged_delay, plugsb4reset, timeoutsb4reset, ipi_reset_limit, complete_threshold, - congested_respns_us, congested_reps, congested_period); + congested_respns_us, congested_reps, disabled_period, + giveup_limit); if (!buf) return -ENOMEM; @@ -1438,6 +1449,14 @@ static ssize_t ptc_proc_write(struct file *file, const char __user *user, return -EFAULT; optstr[count - 1] = '\0'; + if (!strcmp(optstr, "on")) { + set_bau_on(); + return count; + } else if (!strcmp(optstr, "off")) { + set_bau_off(); + return count; + } + if (strict_strtol(optstr, 10, &input_arg) < 0) { printk(KERN_DEBUG "%s is invalid\n", optstr); return -EINVAL; @@ -1570,7 +1589,8 @@ static ssize_t tunables_write(struct file *file, const char __user *user, bcp->complete_threshold = complete_threshold; bcp->cong_response_us = congested_respns_us; bcp->cong_reps = congested_reps; - bcp->cong_period = congested_period; + bcp->disabled_period = sec_2_cycles(disabled_period); + bcp->giveup_limit = giveup_limit; } return count; } @@ -1699,6 +1719,10 @@ static void activation_descriptor_init(int node, int pnode, int base_pnode) * fairness chaining multilevel count replied_to */ } else { + /* + * BIOS uses legacy mode, but UV2 hardware always + * uses native mode for selective broadcasts. 
+ */ uv2_hdr = &bd2->header.uv2_hdr; uv2_hdr->swack_flag = 1; uv2_hdr->base_dest_nasid = @@ -1811,8 +1835,8 @@ static int calculate_destination_timeout(void) index = (mmr_image >> BAU_URGENCY_7_SHIFT) & BAU_URGENCY_7_MASK; mmr_image = uv_read_local_mmr(UVH_TRANSACTION_TIMEOUT); mult2 = (mmr_image >> BAU_TRANS_SHIFT) & BAU_TRANS_MASK; - base = timeout_base_ns[index]; - ts_ns = base * mult1 * mult2; + ts_ns = timeout_base_ns[index]; + ts_ns *= (mult1 * mult2); ret = ts_ns / 1000; } else { /* 4 bits 0/1 for 10/80us base, 3 bits of multiplier */ @@ -1836,6 +1860,8 @@ static void __init init_per_cpu_tunables(void) for_each_present_cpu(cpu) { bcp = &per_cpu(bau_control, cpu); bcp->baudisabled = 0; + if (nobau) + bcp->nobau = 1; bcp->statp = &per_cpu(ptcstats, cpu); /* time interval to catch a hardware stay-busy bug */ bcp->timeout_interval = usec_2_cycles(2*timeout_us); @@ -1848,10 +1874,11 @@ static void __init init_per_cpu_tunables(void) bcp->complete_threshold = complete_threshold; bcp->cong_response_us = congested_respns_us; bcp->cong_reps = congested_reps; - bcp->cong_period = congested_period; - bcp->clocks_per_100_usec = usec_2_cycles(100); + bcp->disabled_period = sec_2_cycles(disabled_period); + bcp->giveup_limit = giveup_limit; spin_lock_init(&bcp->queue_lock); spin_lock_init(&bcp->uvhub_lock); + spin_lock_init(&bcp->disable_lock); } } @@ -1972,7 +1999,6 @@ static int scan_sock(struct socket_desc *sdp, struct uvhub_desc *bdp, } bcp->uvhub_master = *hmasterp; bcp->uvhub_cpu = uv_cpu_hub_info(cpu)->blade_processor_id; - bcp->using_desc = bcp->uvhub_cpu; if (bcp->uvhub_cpu >= MAX_CPUS_PER_UVHUB) { printk(KERN_EMERG "%d cpus per uvhub invalid\n", bcp->uvhub_cpu); @@ -2069,16 +2095,12 @@ static int __init uv_bau_init(void) if (!is_uv_system()) return 0; - if (nobau) - return 0; - for_each_possible_cpu(cur_cpu) { mask = &per_cpu(uv_flush_tlb_mask, cur_cpu); zalloc_cpumask_var_node(mask, GFP_KERNEL, cpu_to_node(cur_cpu)); } nuvhubs = uv_num_possible_blades(); - spin_lock_init(&disable_lock); congested_cycles = usec_2_cycles(congested_respns_us); uv_base_pnode = 0x7fffffff; @@ -2091,7 +2113,8 @@ static int __init uv_bau_init(void) enable_timeouts(); if (init_per_cpu(nuvhubs, uv_base_pnode)) { - nobau = 1; + set_bau_off(); + nobau_perm = 1; return 0; } diff --git a/arch/x86/platform/uv/uv_irq.c b/arch/x86/platform/uv/uv_irq.c index f25c2765a5c9..acf7752da952 100644 --- a/arch/x86/platform/uv/uv_irq.c +++ b/arch/x86/platform/uv/uv_irq.c @@ -135,6 +135,7 @@ arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade, unsigned long mmr_value; struct uv_IO_APIC_route_entry *entry; int mmr_pnode, err; + unsigned int dest; BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long)); @@ -143,6 +144,10 @@ arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade, if (err != 0) return err; + err = apic->cpu_mask_to_apicid_and(eligible_cpu, eligible_cpu, &dest); + if (err != 0) + return err; + if (limit == UV_AFFINITY_CPU) irq_set_status_flags(irq, IRQ_NO_BALANCING); else @@ -159,7 +164,7 @@ arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade, entry->polarity = 0; entry->trigger = 0; entry->mask = 0; - entry->dest = apic->cpu_mask_to_apicid(eligible_cpu); + entry->dest = dest; mmr_pnode = uv_blade_to_pnode(mmr_blade); uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); @@ -222,7 +227,7 @@ uv_set_irq_affinity(struct irq_data *data, const struct cpumask *mask, if (cfg->move_in_progress) send_cleanup_vector(cfg); - return 0; + 
return IRQ_SET_MASK_OK_NOCOPY; } /* diff --git a/arch/x86/realmode/rm/Makefile b/arch/x86/realmode/rm/Makefile index 5b84a2d30888..b2d534cab25f 100644 --- a/arch/x86/realmode/rm/Makefile +++ b/arch/x86/realmode/rm/Makefile @@ -22,7 +22,7 @@ wakeup-objs += video-bios.o realmode-y += header.o realmode-y += trampoline_$(BITS).o realmode-y += stack.o -realmode-$(CONFIG_X86_32) += reboot_32.o +realmode-y += reboot.o realmode-$(CONFIG_ACPI_SLEEP) += $(wakeup-objs) targets += $(realmode-y) diff --git a/arch/x86/realmode/rm/header.S b/arch/x86/realmode/rm/header.S index fadf48378ada..a28221d94e69 100644 --- a/arch/x86/realmode/rm/header.S +++ b/arch/x86/realmode/rm/header.S @@ -6,6 +6,7 @@ #include <linux/linkage.h> #include <asm/page_types.h> +#include <asm/segment.h> #include "realmode.h" @@ -28,8 +29,9 @@ GLOBAL(real_mode_header) .long pa_wakeup_header #endif /* APM/BIOS reboot */ -#ifdef CONFIG_X86_32 .long pa_machine_real_restart_asm +#ifdef CONFIG_X86_64 + .long __KERNEL32_CS #endif END(real_mode_header) diff --git a/arch/x86/realmode/rm/reboot_32.S b/arch/x86/realmode/rm/reboot.S index 114044876b3d..f932ea61d1c8 100644 --- a/arch/x86/realmode/rm/reboot_32.S +++ b/arch/x86/realmode/rm/reboot.S @@ -2,6 +2,8 @@ #include <linux/init.h> #include <asm/segment.h> #include <asm/page_types.h> +#include <asm/processor-flags.h> +#include <asm/msr-index.h> #include "realmode.h" /* @@ -12,13 +14,35 @@ * doesn't work with at least one type of 486 motherboard. It is easy * to stop this code working; hence the copious comments. * - * This code is called with the restart type (0 = BIOS, 1 = APM) in %eax. + * This code is called with the restart type (0 = BIOS, 1 = APM) in + * the primary argument register (%eax for 32 bit, %edi for 64 bit). */ .section ".text32", "ax" .code32 - - .balign 16 ENTRY(machine_real_restart_asm) + +#ifdef CONFIG_X86_64 + /* Switch to trampoline GDT as it is guaranteed < 4 GiB */ + movl $__KERNEL_DS, %eax + movl %eax, %ds + lgdtl pa_tr_gdt + + /* Disable paging to drop us out of long mode */ + movl %cr0, %eax + andl $~X86_CR0_PG, %eax + movl %eax, %cr0 + ljmpl $__KERNEL32_CS, $pa_machine_real_restart_paging_off + +GLOBAL(machine_real_restart_paging_off) + xorl %eax, %eax + xorl %edx, %edx + movl $MSR_EFER, %ecx + wrmsr + + movl %edi, %eax + +#endif /* CONFIG_X86_64 */ + /* Set up the IDT for real mode. */ lidtl pa_machine_real_restart_idt diff --git a/arch/x86/um/sys_call_table_32.c b/arch/x86/um/sys_call_table_32.c index 416bd40c0eba..68d1dc91b37b 100644 --- a/arch/x86/um/sys_call_table_32.c +++ b/arch/x86/um/sys_call_table_32.c @@ -39,9 +39,9 @@ #undef __SYSCALL_I386 #define __SYSCALL_I386(nr, sym, compat) [ nr ] = sym, -typedef void (*sys_call_ptr_t)(void); +typedef asmlinkage void (*sys_call_ptr_t)(void); -extern void sys_ni_syscall(void); +extern asmlinkage void sys_ni_syscall(void); const sys_call_ptr_t sys_call_table[] __cacheline_aligned = { /* diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c index 66e6d9359826..0faad646f5fd 100644 --- a/arch/x86/vdso/vdso32-setup.c +++ b/arch/x86/vdso/vdso32-setup.c @@ -205,9 +205,9 @@ void syscall32_cpu_init(void) { /* Load these always in case some future AMD CPU supports SYSENTER from compat mode too. 
*/ - checking_wrmsrl(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS); - checking_wrmsrl(MSR_IA32_SYSENTER_ESP, 0ULL); - checking_wrmsrl(MSR_IA32_SYSENTER_EIP, (u64)ia32_sysenter_target); + wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS); + wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL); + wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)ia32_sysenter_target); wrmsrl(MSR_CSTAR, ia32_cstar_target); } diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index e74df9548a02..bf4bda6d3e9a 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -31,6 +31,7 @@ #include <linux/pci.h> #include <linux/gfp.h> #include <linux/memblock.h> +#include <linux/syscore_ops.h> #include <xen/xen.h> #include <xen/interface/xen.h> @@ -38,6 +39,7 @@ #include <xen/interface/physdev.h> #include <xen/interface/vcpu.h> #include <xen/interface/memory.h> +#include <xen/interface/xen-mca.h> #include <xen/features.h> #include <xen/page.h> #include <xen/hvm.h> @@ -107,7 +109,7 @@ EXPORT_SYMBOL_GPL(xen_have_vector_callback); * Point at some empty memory to start with. We map the real shared_info * page as soon as fixmap is up and running. */ -struct shared_info *HYPERVISOR_shared_info = (void *)&xen_dummy_shared_info; +struct shared_info *HYPERVISOR_shared_info = &xen_dummy_shared_info; /* * Flag to determine whether vcpu info placement is available on all @@ -124,6 +126,19 @@ struct shared_info *HYPERVISOR_shared_info = (void *)&xen_dummy_shared_info; */ static int have_vcpu_info_placement = 1; +struct tls_descs { + struct desc_struct desc[3]; +}; + +/* + * Updating the 3 TLS descriptors in the GDT on every task switch is + * surprisingly expensive so we avoid updating them if they haven't + * changed. Since Xen writes different descriptors than the one + * passed in the update_descriptor hypercall we keep shadow copies to + * compare against. + */ +static DEFINE_PER_CPU(struct tls_descs, shadow_tls_desc); + static void clamp_max_cpus(void) { #ifdef CONFIG_SMP @@ -209,6 +224,9 @@ static void __init xen_banner(void) xen_feature(XENFEAT_mmu_pt_update_preserve_ad) ? 
" (preserve-AD)" : ""); } +#define CPUID_THERM_POWER_LEAF 6 +#define APERFMPERF_PRESENT 0 + static __read_mostly unsigned int cpuid_leaf1_edx_mask = ~0; static __read_mostly unsigned int cpuid_leaf1_ecx_mask = ~0; @@ -242,6 +260,11 @@ static void xen_cpuid(unsigned int *ax, unsigned int *bx, *dx = cpuid_leaf5_edx_val; return; + case CPUID_THERM_POWER_LEAF: + /* Disabling APERFMPERF for kernel usage */ + maskecx = ~(1 << APERFMPERF_PRESENT); + break; + case 0xb: /* Suppress extended topology stuff */ maskebx = 0; @@ -333,9 +356,7 @@ static void __init xen_init_cpuid_mask(void) unsigned int xsave_mask; cpuid_leaf1_edx_mask = - ~((1 << X86_FEATURE_MCE) | /* disable MCE */ - (1 << X86_FEATURE_MCA) | /* disable MCA */ - (1 << X86_FEATURE_MTRR) | /* disable MTRR */ + ~((1 << X86_FEATURE_MTRR) | /* disable MTRR */ (1 << X86_FEATURE_ACC)); /* thermal monitoring */ if (!xen_initial_domain()) @@ -532,12 +553,28 @@ static void __init xen_load_gdt_boot(const struct desc_ptr *dtr) BUG(); } +static inline bool desc_equal(const struct desc_struct *d1, + const struct desc_struct *d2) +{ + return d1->a == d2->a && d1->b == d2->b; +} + static void load_TLS_descriptor(struct thread_struct *t, unsigned int cpu, unsigned int i) { - struct desc_struct *gdt = get_cpu_gdt_table(cpu); - xmaddr_t maddr = arbitrary_virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]); - struct multicall_space mc = __xen_mc_entry(0); + struct desc_struct *shadow = &per_cpu(shadow_tls_desc, cpu).desc[i]; + struct desc_struct *gdt; + xmaddr_t maddr; + struct multicall_space mc; + + if (desc_equal(shadow, &t->tls_array[i])) + return; + + *shadow = t->tls_array[i]; + + gdt = get_cpu_gdt_table(cpu); + maddr = arbitrary_virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]); + mc = __xen_mc_entry(0); MULTI_update_descriptor(mc.mc, maddr.maddr, t->tls_array[i]); } @@ -619,8 +656,8 @@ static int cvt_gate_to_trap(int vector, const gate_desc *val, /* * Look for known traps using IST, and substitute them * appropriately. The debugger ones are the only ones we care - * about. Xen will handle faults like double_fault and - * machine_check, so we should never see them. Warn if + * about. Xen will handle faults like double_fault, + * so we should never see them. Warn if * there's an unexpected IST-using fault handler. */ if (addr == (unsigned long)debug) @@ -635,7 +672,11 @@ static int cvt_gate_to_trap(int vector, const gate_desc *val, return 0; #ifdef CONFIG_X86_MCE } else if (addr == (unsigned long)machine_check) { - return 0; + /* + * when xen hypervisor inject vMCE to guest, + * use native mce handler to handle it + */ + ; #endif } else { /* Some other trap using IST? 
*/ @@ -1116,9 +1157,7 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = { .wbinvd = native_wbinvd, .read_msr = native_read_msr_safe, - .rdmsr_regs = native_rdmsr_safe_regs, .write_msr = xen_write_msr_safe, - .wrmsr_regs = native_wrmsr_safe_regs, .read_tsc = native_read_tsc, .read_pmc = native_read_pmc, @@ -1431,64 +1470,155 @@ asmlinkage void __init xen_start_kernel(void) #endif } -static int init_hvm_pv_info(int *major, int *minor) -{ - uint32_t eax, ebx, ecx, edx, pages, msr, base; - u64 pfn; - - base = xen_cpuid_base(); - cpuid(base + 1, &eax, &ebx, &ecx, &edx); - - *major = eax >> 16; - *minor = eax & 0xffff; - printk(KERN_INFO "Xen version %d.%d.\n", *major, *minor); - - cpuid(base + 2, &pages, &msr, &ecx, &edx); - - pfn = __pa(hypercall_page); - wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32)); - - xen_setup_features(); - - pv_info.name = "Xen HVM"; - - xen_domain_type = XEN_HVM_DOMAIN; +#ifdef CONFIG_XEN_PVHVM +/* + * The pfn containing the shared_info is located somewhere in RAM. This + * will cause trouble if the current kernel is doing a kexec boot into a + * new kernel. The new kernel (and its startup code) can not know where + * the pfn is, so it can not reserve the page. The hypervisor will + * continue to update the pfn, and as a result memory corruption occurs + * in the new kernel. + * + * One way to work around this issue is to allocate a page in the + * xen-platform pci device's BAR memory range. But pci init is done very + * late and the shared_info page is already in use very early to read + * the pvclock. So moving the pfn from RAM to MMIO is racy because some + * code paths on other vcpus could access the pfn during the small + * window when the old pfn is moved to the new pfn. There is even a + * small window where the old pfn is not backed by a mfn, and during that + * time all reads return -1. + * + * Because it is not known upfront where the MMIO region is located it + * can not be used right from the start in xen_hvm_init_shared_info. + * + * To minimise trouble the move of the pfn is done shortly before kexec. + * This does not eliminate the race because all vcpus are still online + * when the syscore_ops will be called. But hopefully there is no work + * pending at this point in time. Also the syscore_op is run last which + * reduces the risk further. + */ - return 0; -} +static struct shared_info *xen_hvm_shared_info; -void __ref xen_hvm_init_shared_info(void) +static void xen_hvm_connect_shared_info(unsigned long pfn) { - int cpu; struct xen_add_to_physmap xatp; - static struct shared_info *shared_info_page = 0; - if (!shared_info_page) - shared_info_page = (struct shared_info *) - extend_brk(PAGE_SIZE, PAGE_SIZE); xatp.domid = DOMID_SELF; xatp.idx = 0; xatp.space = XENMAPSPACE_shared_info; - xatp.gpfn = __pa(shared_info_page) >> PAGE_SHIFT; + xatp.gpfn = pfn; if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp)) BUG(); - HYPERVISOR_shared_info = (struct shared_info *)shared_info_page; +} +static void xen_hvm_set_shared_info(struct shared_info *sip) +{ + int cpu; + + HYPERVISOR_shared_info = sip; /* xen_vcpu is a pointer to the vcpu_info struct in the shared_info * page, we use it in the event channel upcall and in some pvclock * related functions. We don't need the vcpu_info placement * optimizations because we don't use any pv_mmu or pv_irq op on * HVM.
- * When xen_hvm_init_shared_info is run at boot time only vcpu 0 is - * online but xen_hvm_init_shared_info is run at resume time too and + * When xen_hvm_set_shared_info is run at boot time only vcpu 0 is + * online but xen_hvm_set_shared_info is run at resume time too and * in that case multiple vcpus might be online. */ for_each_online_cpu(cpu) { per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu]; } } -#ifdef CONFIG_XEN_PVHVM +/* Reconnect the shared_info pfn to a mfn */ +void xen_hvm_resume_shared_info(void) +{ + xen_hvm_connect_shared_info(__pa(xen_hvm_shared_info) >> PAGE_SHIFT); +} + +#ifdef CONFIG_KEXEC +static struct shared_info *xen_hvm_shared_info_kexec; +static unsigned long xen_hvm_shared_info_pfn_kexec; + +/* Remember a pfn in MMIO space for kexec reboot */ +void __devinit xen_hvm_prepare_kexec(struct shared_info *sip, unsigned long pfn) +{ + xen_hvm_shared_info_kexec = sip; + xen_hvm_shared_info_pfn_kexec = pfn; +} + +static void xen_hvm_syscore_shutdown(void) +{ + struct xen_memory_reservation reservation = { + .domid = DOMID_SELF, + .nr_extents = 1, + }; + unsigned long prev_pfn; + int rc; + + if (!xen_hvm_shared_info_kexec) + return; + + prev_pfn = __pa(xen_hvm_shared_info) >> PAGE_SHIFT; + set_xen_guest_handle(reservation.extent_start, &prev_pfn); + + /* Move pfn to MMIO, disconnects previous pfn from mfn */ + xen_hvm_connect_shared_info(xen_hvm_shared_info_pfn_kexec); + + /* Update pointers, following hypercall is also a memory barrier */ + xen_hvm_set_shared_info(xen_hvm_shared_info_kexec); + + /* Allocate new mfn for previous pfn */ + do { + rc = HYPERVISOR_memory_op(XENMEM_populate_physmap, &reservation); + if (rc == 0) + msleep(123); + } while (rc == 0); + + /* Make sure the previous pfn is really connected to a (new) mfn */ + BUG_ON(rc != 1); +} + +static struct syscore_ops xen_hvm_syscore_ops = { + .shutdown = xen_hvm_syscore_shutdown, +}; +#endif + +/* Use a pfn in RAM, may move to MMIO before kexec. 
*/ +static void __init xen_hvm_init_shared_info(void) +{ + /* Remember pointer for resume */ + xen_hvm_shared_info = extend_brk(PAGE_SIZE, PAGE_SIZE); + xen_hvm_connect_shared_info(__pa(xen_hvm_shared_info) >> PAGE_SHIFT); + xen_hvm_set_shared_info(xen_hvm_shared_info); +} + +static void __init init_hvm_pv_info(void) +{ + int major, minor; + uint32_t eax, ebx, ecx, edx, pages, msr, base; + u64 pfn; + + base = xen_cpuid_base(); + cpuid(base + 1, &eax, &ebx, &ecx, &edx); + + major = eax >> 16; + minor = eax & 0xffff; + printk(KERN_INFO "Xen version %d.%d.\n", major, minor); + + cpuid(base + 2, &pages, &msr, &ecx, &edx); + + pfn = __pa(hypercall_page); + wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32)); + + xen_setup_features(); + + pv_info.name = "Xen HVM"; + + xen_domain_type = XEN_HVM_DOMAIN; +} + static int __cpuinit xen_hvm_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { @@ -1511,14 +1641,12 @@ static struct notifier_block xen_hvm_cpu_notifier __cpuinitdata = { static void __init xen_hvm_guest_init(void) { - int r; - int major, minor; - - r = init_hvm_pv_info(&major, &minor); - if (r < 0) - return; + init_hvm_pv_info(); xen_hvm_init_shared_info(); +#ifdef CONFIG_KEXEC + register_syscore_ops(&xen_hvm_syscore_ops); +#endif if (xen_feature(XENFEAT_hvm_callback_vector)) xen_have_vector_callback = 1; diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 3a73785631ce..27336dfcda8e 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -308,8 +308,20 @@ static bool xen_batched_set_pte(pte_t *ptep, pte_t pteval) static inline void __xen_set_pte(pte_t *ptep, pte_t pteval) { - if (!xen_batched_set_pte(ptep, pteval)) - native_set_pte(ptep, pteval); + if (!xen_batched_set_pte(ptep, pteval)) { + /* + * Could call native_set_pte() here and trap and + * emulate the PTE write but with 32-bit guests this + * needs two traps (one for each of the two 32-bit + * words in the PTE) so do one hypercall directly + * instead. + */ + struct mmu_update u; + + u.ptr = virt_to_machine(ptep).maddr | MMU_NORMAL_PT_UPDATE; + u.val = pte_val_ma(pteval); + HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF); + } } static void xen_set_pte(pte_t *ptep, pte_t pteval) @@ -1416,13 +1428,28 @@ static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte) } #endif /* CONFIG_X86_64 */ -/* Init-time set_pte while constructing initial pagetables, which - doesn't allow RO pagetable pages to be remapped RW */ +/* + * Init-time set_pte while constructing initial pagetables, which + * doesn't allow RO page table pages to be remapped RW. + * + * If there is no MFN for this PFN then this page is initially + * ballooned out so clear the PTE (as in decrease_reservation() in + * drivers/xen/balloon.c). + * + * Many of these PTE updates are done on unpinned and writable pages + * and doing a hypercall for these is unnecessary and expensive. At + * this point it is not possible to tell if a page is pinned or not, + * so always write the PTE directly and rely on Xen trapping and + * emulating any updates as necessary. 
+ */ static void __init xen_set_pte_init(pte_t *ptep, pte_t pte) { - pte = mask_rw_pte(ptep, pte); + if (pte_mfn(pte) != INVALID_P2M_ENTRY) + pte = mask_rw_pte(ptep, pte); + else + pte = __pte_ma(0); - xen_set_pte(ptep, pte); + native_set_pte(ptep, pte); } static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn) diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index ffd08c414e91..64effdc6da94 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -706,6 +706,7 @@ int m2p_add_override(unsigned long mfn, struct page *page, unsigned long uninitialized_var(address); unsigned level; pte_t *ptep = NULL; + int ret = 0; pfn = page_to_pfn(page); if (!PageHighMem(page)) { @@ -741,6 +742,24 @@ int m2p_add_override(unsigned long mfn, struct page *page, list_add(&page->lru, &m2p_overrides[mfn_hash(mfn)]); spin_unlock_irqrestore(&m2p_override_lock, flags); + /* p2m(m2p(mfn)) == mfn: the mfn is already present somewhere in + * this domain. Set the FOREIGN_FRAME_BIT in the p2m for the other + * pfn so that the following mfn_to_pfn(mfn) calls will return the + * pfn from the m2p_override (the backend pfn) instead. + * We need to do this because the pages shared by the frontend + * (xen-blkfront) can be already locked (lock_page, called by + * do_read_cache_page); when the userspace backend tries to use them + * with direct_IO, mfn_to_pfn returns the pfn of the frontend, so + * do_blockdev_direct_IO is going to try to lock the same pages + * again resulting in a deadlock. + * As a side effect get_user_pages_fast might not be safe on the + * frontend pages while they are being shared with the backend, + * because mfn_to_pfn (that ends up being called by GUPF) will + * return the backend pfn rather than the frontend pfn. */ + ret = __get_user(pfn, &machine_to_phys_mapping[mfn]); + if (ret == 0 && get_phys_to_machine(pfn) == mfn) + set_phys_to_machine(pfn, FOREIGN_FRAME(mfn)); + return 0; } EXPORT_SYMBOL_GPL(m2p_add_override); @@ -752,6 +771,7 @@ int m2p_remove_override(struct page *page, bool clear_pte) unsigned long uninitialized_var(address); unsigned level; pte_t *ptep = NULL; + int ret = 0; pfn = page_to_pfn(page); mfn = get_phys_to_machine(pfn); @@ -821,6 +841,22 @@ int m2p_remove_override(struct page *page, bool clear_pte) } else set_phys_to_machine(pfn, page->index); + /* p2m(m2p(mfn)) == FOREIGN_FRAME(mfn): the mfn is already present + * somewhere in this domain, even before being added to the + * m2p_override (see comment above in m2p_add_override). + * If there are no other entries in the m2p_override corresponding + * to this mfn, then remove the FOREIGN_FRAME_BIT from the p2m for + * the original pfn (the one shared by the frontend): the backend + * cannot do any IO on this page anymore because it has been + * unshared. Removing the FOREIGN_FRAME_BIT from the p2m entry of + * the original pfn causes mfn_to_pfn(mfn) to return the frontend + * pfn again. 
*/ + mfn &= ~FOREIGN_FRAME_BIT; + ret = __get_user(pfn, &machine_to_phys_mapping[mfn]); + if (ret == 0 && get_phys_to_machine(pfn) == FOREIGN_FRAME(mfn) && + m2p_find_override(mfn) == NULL) + set_phys_to_machine(pfn, mfn); + return 0; } EXPORT_SYMBOL_GPL(m2p_remove_override); diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 3ebba0753d38..ead85576d54a 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -157,25 +157,24 @@ static unsigned long __init xen_populate_chunk( unsigned long dest_pfn; for (i = 0, entry = list; i < map_size; i++, entry++) { - unsigned long credits = credits_left; unsigned long s_pfn; unsigned long e_pfn; unsigned long pfns; long capacity; - if (credits <= 0) + if (credits_left <= 0) break; if (entry->type != E820_RAM) continue; - e_pfn = PFN_UP(entry->addr + entry->size); + e_pfn = PFN_DOWN(entry->addr + entry->size); /* We only care about E820 after the xen_start_info->nr_pages */ if (e_pfn <= max_pfn) continue; - s_pfn = PFN_DOWN(entry->addr); + s_pfn = PFN_UP(entry->addr); /* If the E820 falls within the nr_pages, we want to start * at the nr_pages PFN. * If that would mean going past the E820 entry, skip it @@ -184,23 +183,19 @@ static unsigned long __init xen_populate_chunk( capacity = e_pfn - max_pfn; dest_pfn = max_pfn; } else { - /* last_pfn MUST be within E820_RAM regions */ - if (*last_pfn && e_pfn >= *last_pfn) - s_pfn = *last_pfn; capacity = e_pfn - s_pfn; dest_pfn = s_pfn; } - /* If we had filled this E820_RAM entry, go to the next one. */ - if (capacity <= 0) - continue; - if (credits > capacity) - credits = capacity; + if (credits_left < capacity) + capacity = credits_left; - pfns = xen_do_chunk(dest_pfn, dest_pfn + credits, false); + pfns = xen_do_chunk(dest_pfn, dest_pfn + capacity, false); done += pfns; - credits_left -= pfns; *last_pfn = (dest_pfn + pfns); + if (pfns < capacity) + break; + credits_left -= pfns; } return done; } @@ -371,7 +366,8 @@ char * __init xen_memory_setup(void) populated = xen_populate_chunk(map, memmap.nr_entries, max_pfn, &last_pfn, xen_released_pages); - extra_pages += (xen_released_pages - populated); + xen_released_pages -= populated; + extra_pages += xen_released_pages; if (last_pfn > max_pfn) { max_pfn = min(MAX_DOMAIN_PAGES, last_pfn); diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index afb250d22a6b..f58dca7a6e52 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -80,9 +80,7 @@ static void __cpuinit cpu_bringup(void) notify_cpu_starting(cpu); - ipi_call_lock(); set_cpu_online(cpu, true); - ipi_call_unlock(); this_cpu_write(cpu_state, CPU_ONLINE); diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c index 45329c8c226e..ae8a00c39de4 100644 --- a/arch/x86/xen/suspend.c +++ b/arch/x86/xen/suspend.c @@ -30,7 +30,7 @@ void xen_arch_hvm_post_suspend(int suspend_cancelled) { #ifdef CONFIG_XEN_PVHVM int cpu; - xen_hvm_init_shared_info(); + xen_hvm_resume_shared_info(); xen_callback_vector(); xen_unplug_emulated_devices(); if (xen_feature(XENFEAT_hvm_safe_pvclock)) { diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 202d4c150154..1e4329e04e0f 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -41,7 +41,7 @@ void xen_enable_syscall(void); void xen_vcpu_restore(void); void xen_callback_vector(void); -void xen_hvm_init_shared_info(void); +void xen_hvm_resume_shared_info(void); void xen_unplug_emulated_devices(void); void __init xen_build_dynamic_phys_to_machine(void); |
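The xen_populate_chunk() hunk above changes how a candidate E820_RAM entry is rounded: the start pfn is now PFN_UP(entry->addr) and the end pfn PFN_DOWN(entry->addr + entry->size), so the region is rounded inward and only pages that lie entirely inside the RAM entry are ever populated. Below is a minimal worked example of the difference, assuming 4 KiB pages and a made-up, non-page-aligned region; PFN_UP/PFN_DOWN are spelled out as in include/linux/pfn.h.

#include <stdio.h>

#define PAGE_SHIFT      12
#define PAGE_SIZE       (1UL << PAGE_SHIFT)
#define PFN_UP(x)       (((x) + PAGE_SIZE - 1) >> PAGE_SHIFT)
#define PFN_DOWN(x)     ((x) >> PAGE_SHIFT)

int main(void)
{
        /* a made-up E820_RAM entry that is page aligned at neither end */
        unsigned long addr = 0x100800;
        unsigned long size = 0x1f000;

        /* old rounding: outward, may cover pages only partly in the entry */
        printf("outward: s_pfn=%#lx e_pfn=%#lx\n",
               PFN_DOWN(addr), PFN_UP(addr + size));

        /* new rounding: inward, every pfn in [s_pfn, e_pfn) is fully RAM */
        printf("inward:  s_pfn=%#lx e_pfn=%#lx\n",
               PFN_UP(addr), PFN_DOWN(addr + size));
        return 0;
}

With this sample region the outward bounds are [0x100, 0x120) while the inward bounds are [0x101, 0x11f), which is why the new code never hands a partially covered page to xen_do_chunk().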