diff options
Diffstat (limited to 'arch/x86')
96 files changed, 3557 insertions, 881 deletions
diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S index 8bbea6aa40d9..efe5acfc79c3 100644 --- a/arch/x86/boot/header.S +++ b/arch/x86/boot/header.S @@ -94,10 +94,10 @@ bs_die: .section ".bsdata", "a" bugger_off_msg: - .ascii "Direct booting from floppy is no longer supported.\r\n" - .ascii "Please use a boot loader program instead.\r\n" + .ascii "Direct floppy boot is not supported. " + .ascii "Use a boot loader program instead.\r\n" .ascii "\n" - .ascii "Remove disk and press any key to reboot . . .\r\n" + .ascii "Remove disk and press any key to reboot ...\r\n" .byte 0 #ifdef CONFIG_EFI_STUB @@ -111,7 +111,7 @@ coff_header: #else .word 0x8664 # x86-64 #endif - .word 2 # nr_sections + .word 3 # nr_sections .long 0 # TimeDateStamp .long 0 # PointerToSymbolTable .long 1 # NumberOfSymbols @@ -158,8 +158,8 @@ extra_header_fields: #else .quad 0 # ImageBase #endif - .long 0x1000 # SectionAlignment - .long 0x200 # FileAlignment + .long 0x20 # SectionAlignment + .long 0x20 # FileAlignment .word 0 # MajorOperatingSystemVersion .word 0 # MinorOperatingSystemVersion .word 0 # MajorImageVersion @@ -200,8 +200,10 @@ extra_header_fields: # Section table section_table: - .ascii ".text" - .byte 0 + # + # The offset & size fields are filled in by build.c. + # + .ascii ".setup" .byte 0 .byte 0 .long 0 @@ -217,9 +219,8 @@ section_table: # # The EFI application loader requires a relocation section - # because EFI applications must be relocatable. But since - # we don't need the loader to fixup any relocs for us, we - # just create an empty (zero-length) .reloc section header. + # because EFI applications must be relocatable. The .reloc + # offset & size fields are filled in by build.c. # .ascii ".reloc" .byte 0 @@ -233,6 +234,25 @@ section_table: .word 0 # NumberOfRelocations .word 0 # NumberOfLineNumbers .long 0x42100040 # Characteristics (section flags) + + # + # The offset & size fields are filled in by build.c. + # + .ascii ".text" + .byte 0 + .byte 0 + .byte 0 + .long 0 + .long 0x0 # startup_{32,64} + .long 0 # Size of initialized data + # on disk + .long 0x0 # startup_{32,64} + .long 0 # PointerToRelocations + .long 0 # PointerToLineNumbers + .word 0 # NumberOfRelocations + .word 0 # NumberOfLineNumbers + .long 0x60500020 # Characteristics (section flags) + #endif /* CONFIG_EFI_STUB */ # Kernel attributes; used by setup. This is part 1 of the diff --git a/arch/x86/boot/tools/build.c b/arch/x86/boot/tools/build.c index 3f61f6e2b46f..4b8e165ee572 100644 --- a/arch/x86/boot/tools/build.c +++ b/arch/x86/boot/tools/build.c @@ -50,6 +50,8 @@ typedef unsigned int u32; u8 buf[SETUP_SECT_MAX*512]; int is_big_kernel; +#define PECOFF_RELOC_RESERVE 0x20 + /*----------------------------------------------------------------------*/ static const u32 crctab32[] = { @@ -133,11 +135,103 @@ static void usage(void) die("Usage: build setup system [> image]"); } -int main(int argc, char ** argv) -{ #ifdef CONFIG_EFI_STUB - unsigned int file_sz, pe_header; + +static void update_pecoff_section_header(char *section_name, u32 offset, u32 size) +{ + unsigned int pe_header; + unsigned short num_sections; + u8 *section; + + pe_header = get_unaligned_le32(&buf[0x3c]); + num_sections = get_unaligned_le16(&buf[pe_header + 6]); + +#ifdef CONFIG_X86_32 + section = &buf[pe_header + 0xa8]; +#else + section = &buf[pe_header + 0xb8]; #endif + + while (num_sections > 0) { + if (strncmp((char*)section, section_name, 8) == 0) { + /* section header size field */ + put_unaligned_le32(size, section + 0x8); + + /* section header vma field */ + put_unaligned_le32(offset, section + 0xc); + + /* section header 'size of initialised data' field */ + put_unaligned_le32(size, section + 0x10); + + /* section header 'file offset' field */ + put_unaligned_le32(offset, section + 0x14); + + break; + } + section += 0x28; + num_sections--; + } +} + +static void update_pecoff_setup_and_reloc(unsigned int size) +{ + u32 setup_offset = 0x200; + u32 reloc_offset = size - PECOFF_RELOC_RESERVE; + u32 setup_size = reloc_offset - setup_offset; + + update_pecoff_section_header(".setup", setup_offset, setup_size); + update_pecoff_section_header(".reloc", reloc_offset, PECOFF_RELOC_RESERVE); + + /* + * Modify .reloc section contents with a single entry. The + * relocation is applied to offset 10 of the relocation section. + */ + put_unaligned_le32(reloc_offset + 10, &buf[reloc_offset]); + put_unaligned_le32(10, &buf[reloc_offset + 4]); +} + +static void update_pecoff_text(unsigned int text_start, unsigned int file_sz) +{ + unsigned int pe_header; + unsigned int text_sz = file_sz - text_start; + + pe_header = get_unaligned_le32(&buf[0x3c]); + + /* Size of image */ + put_unaligned_le32(file_sz, &buf[pe_header + 0x50]); + + /* + * Size of code: Subtract the size of the first sector (512 bytes) + * which includes the header. + */ + put_unaligned_le32(file_sz - 512, &buf[pe_header + 0x1c]); + +#ifdef CONFIG_X86_32 + /* + * Address of entry point. + * + * The EFI stub entry point is +16 bytes from the start of + * the .text section. + */ + put_unaligned_le32(text_start + 16, &buf[pe_header + 0x28]); +#else + /* + * Address of entry point. startup_32 is at the beginning and + * the 64-bit entry point (startup_64) is always 512 bytes + * after. The EFI stub entry point is 16 bytes after that, as + * the first instruction allows legacy loaders to jump over + * the EFI stub initialisation + */ + put_unaligned_le32(text_start + 528, &buf[pe_header + 0x28]); +#endif /* CONFIG_X86_32 */ + + update_pecoff_section_header(".text", text_start, text_sz); +} + +#endif /* CONFIG_EFI_STUB */ + +int main(int argc, char ** argv) +{ unsigned int i, sz, setup_sectors; int c; u32 sys_size; @@ -163,6 +257,12 @@ int main(int argc, char ** argv) die("Boot block hasn't got boot flag (0xAA55)"); fclose(file); +#ifdef CONFIG_EFI_STUB + /* Reserve 0x20 bytes for .reloc section */ + memset(buf+c, 0, PECOFF_RELOC_RESERVE); + c += PECOFF_RELOC_RESERVE; +#endif + /* Pad unused space with zeros */ setup_sectors = (c + 511) / 512; if (setup_sectors < SETUP_SECT_MIN) @@ -170,6 +270,10 @@ int main(int argc, char ** argv) i = setup_sectors*512; memset(buf+c, 0, i-c); +#ifdef CONFIG_EFI_STUB + update_pecoff_setup_and_reloc(i); +#endif + /* Set the default root device */ put_unaligned_le16(DEFAULT_ROOT_DEV, &buf[508]); @@ -194,66 +298,8 @@ int main(int argc, char ** argv) put_unaligned_le32(sys_size, &buf[0x1f4]); #ifdef CONFIG_EFI_STUB - file_sz = sz + i + ((sys_size * 16) - sz); - - pe_header = get_unaligned_le32(&buf[0x3c]); - - /* Size of image */ - put_unaligned_le32(file_sz, &buf[pe_header + 0x50]); - - /* - * Subtract the size of the first section (512 bytes) which - * includes the header and .reloc section. The remaining size - * is that of the .text section. - */ - file_sz -= 512; - - /* Size of code */ - put_unaligned_le32(file_sz, &buf[pe_header + 0x1c]); - -#ifdef CONFIG_X86_32 - /* - * Address of entry point. - * - * The EFI stub entry point is +16 bytes from the start of - * the .text section. - */ - put_unaligned_le32(i + 16, &buf[pe_header + 0x28]); - - /* .text size */ - put_unaligned_le32(file_sz, &buf[pe_header + 0xb0]); - - /* .text vma */ - put_unaligned_le32(0x200, &buf[pe_header + 0xb4]); - - /* .text size of initialised data */ - put_unaligned_le32(file_sz, &buf[pe_header + 0xb8]); - - /* .text file offset */ - put_unaligned_le32(0x200, &buf[pe_header + 0xbc]); -#else - /* - * Address of entry point. startup_32 is at the beginning and - * the 64-bit entry point (startup_64) is always 512 bytes - * after. The EFI stub entry point is 16 bytes after that, as - * the first instruction allows legacy loaders to jump over - * the EFI stub initialisation - */ - put_unaligned_le32(i + 528, &buf[pe_header + 0x28]); - - /* .text size */ - put_unaligned_le32(file_sz, &buf[pe_header + 0xc0]); - - /* .text vma */ - put_unaligned_le32(0x200, &buf[pe_header + 0xc4]); - - /* .text size of initialised data */ - put_unaligned_le32(file_sz, &buf[pe_header + 0xc8]); - - /* .text file offset */ - put_unaligned_le32(0x200, &buf[pe_header + 0xcc]); -#endif /* CONFIG_X86_32 */ -#endif /* CONFIG_EFI_STUB */ + update_pecoff_text(setup_sectors * 512, sz + i + ((sys_size * 16) - sz)); +#endif crc = partial_crc32(buf, i, crc); if (fwrite(buf, 1, i, stdout) != i) diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S index be6d9e365a80..3470624d7835 100644 --- a/arch/x86/crypto/aesni-intel_asm.S +++ b/arch/x86/crypto/aesni-intel_asm.S @@ -2460,10 +2460,12 @@ ENTRY(aesni_cbc_dec) pxor IN3, STATE4 movaps IN4, IV #else - pxor (INP), STATE2 - pxor 0x10(INP), STATE3 pxor IN1, STATE4 movaps IN2, IV + movups (INP), IN1 + pxor IN1, STATE2 + movups 0x10(INP), IN2 + pxor IN2, STATE3 #endif movups STATE1, (OUTP) movups STATE2, 0x10(OUTP) diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index daeca56211e3..673ac9b63d6b 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -38,7 +38,7 @@ int copy_siginfo_to_user32(compat_siginfo_t __user *to, siginfo_t *from) { int err = 0; - bool ia32 = is_ia32_task(); + bool ia32 = test_thread_flag(TIF_IA32); if (!access_ok(VERIFY_WRITE, to, sizeof(compat_siginfo_t))) return -EFAULT; diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 49331bedc158..70780689599a 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -75,23 +75,54 @@ static inline int alternatives_text_reserved(void *start, void *end) } #endif /* CONFIG_SMP */ +#define OLDINSTR(oldinstr) "661:\n\t" oldinstr "\n662:\n" + +#define b_replacement(number) "663"#number +#define e_replacement(number) "664"#number + +#define alt_slen "662b-661b" +#define alt_rlen(number) e_replacement(number)"f-"b_replacement(number)"f" + +#define ALTINSTR_ENTRY(feature, number) \ + " .long 661b - .\n" /* label */ \ + " .long " b_replacement(number)"f - .\n" /* new instruction */ \ + " .word " __stringify(feature) "\n" /* feature bit */ \ + " .byte " alt_slen "\n" /* source len */ \ + " .byte " alt_rlen(number) "\n" /* replacement len */ + +#define DISCARD_ENTRY(number) /* rlen <= slen */ \ + " .byte 0xff + (" alt_rlen(number) ") - (" alt_slen ")\n" + +#define ALTINSTR_REPLACEMENT(newinstr, feature, number) /* replacement */ \ + b_replacement(number)":\n\t" newinstr "\n" e_replacement(number) ":\n\t" + /* alternative assembly primitive: */ #define ALTERNATIVE(oldinstr, newinstr, feature) \ - \ - "661:\n\t" oldinstr "\n662:\n" \ - ".section .altinstructions,\"a\"\n" \ - " .long 661b - .\n" /* label */ \ - " .long 663f - .\n" /* new instruction */ \ - " .word " __stringify(feature) "\n" /* feature bit */ \ - " .byte 662b-661b\n" /* sourcelen */ \ - " .byte 664f-663f\n" /* replacementlen */ \ - ".previous\n" \ - ".section .discard,\"aw\",@progbits\n" \ - " .byte 0xff + (664f-663f) - (662b-661b)\n" /* rlen <= slen */ \ - ".previous\n" \ - ".section .altinstr_replacement, \"ax\"\n" \ - "663:\n\t" newinstr "\n664:\n" /* replacement */ \ - ".previous" + OLDINSTR(oldinstr) \ + ".section .altinstructions,\"a\"\n" \ + ALTINSTR_ENTRY(feature, 1) \ + ".previous\n" \ + ".section .discard,\"aw\",@progbits\n" \ + DISCARD_ENTRY(1) \ + ".previous\n" \ + ".section .altinstr_replacement, \"ax\"\n" \ + ALTINSTR_REPLACEMENT(newinstr, feature, 1) \ + ".previous" + +#define ALTERNATIVE_2(oldinstr, newinstr1, feature1, newinstr2, feature2)\ + OLDINSTR(oldinstr) \ + ".section .altinstructions,\"a\"\n" \ + ALTINSTR_ENTRY(feature1, 1) \ + ALTINSTR_ENTRY(feature2, 2) \ + ".previous\n" \ + ".section .discard,\"aw\",@progbits\n" \ + DISCARD_ENTRY(1) \ + DISCARD_ENTRY(2) \ + ".previous\n" \ + ".section .altinstr_replacement, \"ax\"\n" \ + ALTINSTR_REPLACEMENT(newinstr1, feature1, 1) \ + ALTINSTR_REPLACEMENT(newinstr2, feature2, 2) \ + ".previous" /* * This must be included *after* the definition of ALTERNATIVE due to @@ -140,6 +171,19 @@ static inline int alternatives_text_reserved(void *start, void *end) : output : [old] "i" (oldfunc), [new] "i" (newfunc), ## input) /* + * Like alternative_call, but there are two features and respective functions. + * If CPU has feature2, function2 is used. + * Otherwise, if CPU has feature1, function1 is used. + * Otherwise, old function is used. + */ +#define alternative_call_2(oldfunc, newfunc1, feature1, newfunc2, feature2, \ + output, input...) \ + asm volatile (ALTERNATIVE_2("call %P[old]", "call %P[new1]", feature1,\ + "call %P[new2]", feature2) \ + : output : [old] "i" (oldfunc), [new1] "i" (newfunc1), \ + [new2] "i" (newfunc2), ## input) + +/* * use this macro(s) if you need more than one output parameter * in alternative_io */ diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 340ee49961a6..f91e80f4f180 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -176,7 +176,7 @@ #define X86_FEATURE_XSAVEOPT (7*32+ 4) /* Optimized Xsave */ #define X86_FEATURE_PLN (7*32+ 5) /* Intel Power Limit Notification */ #define X86_FEATURE_PTS (7*32+ 6) /* Intel Package Thermal Status */ -#define X86_FEATURE_DTS (7*32+ 7) /* Digital Thermal Sensor */ +#define X86_FEATURE_DTHERM (7*32+ 7) /* Digital Thermal Sensor */ #define X86_FEATURE_HW_PSTATE (7*32+ 8) /* AMD HW-PState */ /* Virtualization flags: Linux defined, word 8 */ diff --git a/arch/x86/include/asm/floppy.h b/arch/x86/include/asm/floppy.h index dbe82a5c5eac..d3d74698dce9 100644 --- a/arch/x86/include/asm/floppy.h +++ b/arch/x86/include/asm/floppy.h @@ -99,7 +99,7 @@ static irqreturn_t floppy_hardint(int irq, void *dev_id) virtual_dma_residue += virtual_dma_count; virtual_dma_count = 0; #ifdef TRACE_FLPY_INT - printk("count=%x, residue=%x calls=%d bytes=%d dma_wait=%d\n", + printk(KERN_DEBUG "count=%x, residue=%x calls=%d bytes=%d dma_wait=%d\n", virtual_dma_count, virtual_dma_residue, calls, bytes, dma_wait); calls = 0; diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index db7c1f2709a2..2da88c0cda14 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -313,8 +313,8 @@ struct kvm_pmu { u64 counter_bitmask[2]; u64 global_ctrl_mask; u8 version; - struct kvm_pmc gp_counters[X86_PMC_MAX_GENERIC]; - struct kvm_pmc fixed_counters[X86_PMC_MAX_FIXED]; + struct kvm_pmc gp_counters[INTEL_PMC_MAX_GENERIC]; + struct kvm_pmc fixed_counters[INTEL_PMC_MAX_FIXED]; struct irq_work irq_work; u64 reprogram_pmi; }; diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h index 084ef95274cd..813ed103f45e 100644 --- a/arch/x86/include/asm/msr.h +++ b/arch/x86/include/asm/msr.h @@ -115,8 +115,8 @@ notrace static inline int native_write_msr_safe(unsigned int msr, extern unsigned long long native_read_tsc(void); -extern int native_rdmsr_safe_regs(u32 regs[8]); -extern int native_wrmsr_safe_regs(u32 regs[8]); +extern int rdmsr_safe_regs(u32 regs[8]); +extern int wrmsr_safe_regs(u32 regs[8]); static __always_inline unsigned long long __native_read_tsc(void) { @@ -187,43 +187,6 @@ static inline int rdmsrl_safe(unsigned msr, unsigned long long *p) return err; } -static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p) -{ - u32 gprs[8] = { 0 }; - int err; - - gprs[1] = msr; - gprs[7] = 0x9c5a203a; - - err = native_rdmsr_safe_regs(gprs); - - *p = gprs[0] | ((u64)gprs[2] << 32); - - return err; -} - -static inline int wrmsrl_amd_safe(unsigned msr, unsigned long long val) -{ - u32 gprs[8] = { 0 }; - - gprs[0] = (u32)val; - gprs[1] = msr; - gprs[2] = val >> 32; - gprs[7] = 0x9c5a203a; - - return native_wrmsr_safe_regs(gprs); -} - -static inline int rdmsr_safe_regs(u32 regs[8]) -{ - return native_rdmsr_safe_regs(regs); -} - -static inline int wrmsr_safe_regs(u32 regs[8]) -{ - return native_wrmsr_safe_regs(regs); -} - #define rdtscl(low) \ ((low) = (u32)__native_read_tsc()) @@ -237,6 +200,8 @@ do { \ (high) = (u32)(_l >> 32); \ } while (0) +#define rdpmcl(counter, val) ((val) = native_read_pmc(counter)) + #define rdtscp(low, high, aux) \ do { \ unsigned long long _val = native_read_tscp(&(aux)); \ @@ -248,8 +213,7 @@ do { \ #endif /* !CONFIG_PARAVIRT */ - -#define checking_wrmsrl(msr, val) wrmsr_safe((msr), (u32)(val), \ +#define wrmsrl_safe(msr, val) wrmsr_safe((msr), (u32)(val), \ (u32)((val) >> 32)) #define write_tsc(val1, val2) wrmsr(MSR_IA32_TSC, (val1), (val2)) diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h index 0e3793b821ef..c0fa356e90de 100644 --- a/arch/x86/include/asm/nmi.h +++ b/arch/x86/include/asm/nmi.h @@ -44,14 +44,14 @@ struct nmiaction { const char *name; }; -#define register_nmi_handler(t, fn, fg, n) \ +#define register_nmi_handler(t, fn, fg, n, init...) \ ({ \ - static struct nmiaction fn##_na = { \ + static struct nmiaction init fn##_na = { \ .handler = (fn), \ .name = (n), \ .flags = (fg), \ }; \ - __register_nmi_handler((t), &fn##_na); \ + __register_nmi_handler((t), &fn##_na); \ }) int __register_nmi_handler(unsigned int, struct nmiaction *); diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 6cbbabf52707..0b47ddb6f00b 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -128,21 +128,11 @@ static inline u64 paravirt_read_msr(unsigned msr, int *err) return PVOP_CALL2(u64, pv_cpu_ops.read_msr, msr, err); } -static inline int paravirt_rdmsr_regs(u32 *regs) -{ - return PVOP_CALL1(int, pv_cpu_ops.rdmsr_regs, regs); -} - static inline int paravirt_write_msr(unsigned msr, unsigned low, unsigned high) { return PVOP_CALL3(int, pv_cpu_ops.write_msr, msr, low, high); } -static inline int paravirt_wrmsr_regs(u32 *regs) -{ - return PVOP_CALL1(int, pv_cpu_ops.wrmsr_regs, regs); -} - /* These should all do BUG_ON(_err), but our headers are too tangled. */ #define rdmsr(msr, val1, val2) \ do { \ @@ -176,9 +166,6 @@ do { \ _err; \ }) -#define rdmsr_safe_regs(regs) paravirt_rdmsr_regs(regs) -#define wrmsr_safe_regs(regs) paravirt_wrmsr_regs(regs) - static inline int rdmsrl_safe(unsigned msr, unsigned long long *p) { int err; @@ -186,32 +173,6 @@ static inline int rdmsrl_safe(unsigned msr, unsigned long long *p) *p = paravirt_read_msr(msr, &err); return err; } -static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p) -{ - u32 gprs[8] = { 0 }; - int err; - - gprs[1] = msr; - gprs[7] = 0x9c5a203a; - - err = paravirt_rdmsr_regs(gprs); - - *p = gprs[0] | ((u64)gprs[2] << 32); - - return err; -} - -static inline int wrmsrl_amd_safe(unsigned msr, unsigned long long val) -{ - u32 gprs[8] = { 0 }; - - gprs[0] = (u32)val; - gprs[1] = msr; - gprs[2] = val >> 32; - gprs[7] = 0x9c5a203a; - - return paravirt_wrmsr_regs(gprs); -} static inline u64 paravirt_read_tsc(void) { @@ -252,6 +213,8 @@ do { \ high = _l >> 32; \ } while (0) +#define rdpmcl(counter, val) ((val) = paravirt_read_pmc(counter)) + static inline unsigned long long paravirt_rdtscp(unsigned int *aux) { return PVOP_CALL1(u64, pv_cpu_ops.read_tscp, aux); diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 8e8b9a4987ee..8613cbb7ba41 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -153,9 +153,7 @@ struct pv_cpu_ops { /* MSR, PMC and TSR operations. err = 0/-EFAULT. wrmsr returns 0/-EFAULT. */ u64 (*read_msr)(unsigned int msr, int *err); - int (*rdmsr_regs)(u32 *regs); int (*write_msr)(unsigned int msr, unsigned low, unsigned high); - int (*wrmsr_regs)(u32 *regs); u64 (*read_tsc)(void); u64 (*read_pmc)(int counter); diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h index b3a531746026..5ad24a89b19b 100644 --- a/arch/x86/include/asm/pci_x86.h +++ b/arch/x86/include/asm/pci_x86.h @@ -7,9 +7,13 @@ #undef DEBUG #ifdef DEBUG -#define DBG(x...) printk(x) +#define DBG(fmt, ...) printk(fmt, ##__VA_ARGS__) #else -#define DBG(x...) +#define DBG(fmt, ...) \ +do { \ + if (0) \ + printk(fmt, ##__VA_ARGS__); \ +} while (0) #endif #define PCI_PROBE_BIOS 0x0001 diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 588f52ea810e..c78f14a0df00 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -5,11 +5,10 @@ * Performance event hw details: */ -#define X86_PMC_MAX_GENERIC 32 -#define X86_PMC_MAX_FIXED 3 +#define INTEL_PMC_MAX_GENERIC 32 +#define INTEL_PMC_MAX_FIXED 3 +#define INTEL_PMC_IDX_FIXED 32 -#define X86_PMC_IDX_GENERIC 0 -#define X86_PMC_IDX_FIXED 32 #define X86_PMC_IDX_MAX 64 #define MSR_ARCH_PERFMON_PERFCTR0 0xc1 @@ -48,8 +47,7 @@ (X86_RAW_EVENT_MASK | \ AMD64_EVENTSEL_EVENT) #define AMD64_NUM_COUNTERS 4 -#define AMD64_NUM_COUNTERS_F15H 6 -#define AMD64_NUM_COUNTERS_MAX AMD64_NUM_COUNTERS_F15H +#define AMD64_NUM_COUNTERS_CORE 6 #define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL 0x3c #define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK (0x00 << 8) @@ -121,16 +119,16 @@ struct x86_pmu_capability { /* Instr_Retired.Any: */ #define MSR_ARCH_PERFMON_FIXED_CTR0 0x309 -#define X86_PMC_IDX_FIXED_INSTRUCTIONS (X86_PMC_IDX_FIXED + 0) +#define INTEL_PMC_IDX_FIXED_INSTRUCTIONS (INTEL_PMC_IDX_FIXED + 0) /* CPU_CLK_Unhalted.Core: */ #define MSR_ARCH_PERFMON_FIXED_CTR1 0x30a -#define X86_PMC_IDX_FIXED_CPU_CYCLES (X86_PMC_IDX_FIXED + 1) +#define INTEL_PMC_IDX_FIXED_CPU_CYCLES (INTEL_PMC_IDX_FIXED + 1) /* CPU_CLK_Unhalted.Ref: */ #define MSR_ARCH_PERFMON_FIXED_CTR2 0x30b -#define X86_PMC_IDX_FIXED_REF_CYCLES (X86_PMC_IDX_FIXED + 2) -#define X86_PMC_MSK_FIXED_REF_CYCLES (1ULL << X86_PMC_IDX_FIXED_REF_CYCLES) +#define INTEL_PMC_IDX_FIXED_REF_CYCLES (INTEL_PMC_IDX_FIXED + 2) +#define INTEL_PMC_MSK_FIXED_REF_CYCLES (1ULL << INTEL_PMC_IDX_FIXED_REF_CYCLES) /* * We model BTS tracing as another fixed-mode PMC. @@ -139,7 +137,7 @@ struct x86_pmu_capability { * values are used by actual fixed events and higher values are used * to indicate other overflow conditions in the PERF_GLOBAL_STATUS msr. */ -#define X86_PMC_IDX_FIXED_BTS (X86_PMC_IDX_FIXED + 16) +#define INTEL_PMC_IDX_FIXED_BTS (INTEL_PMC_IDX_FIXED + 16) /* * IBS cpuid feature detection @@ -234,6 +232,7 @@ struct perf_guest_switch_msr { extern struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr); extern void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap); +extern void perf_check_microcode(void); #else static inline perf_guest_switch_msr *perf_guest_get_msrs(int *nr) { @@ -247,6 +246,7 @@ static inline void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap) } static inline void perf_events_lapic_init(void) { } +static inline void perf_check_microcode(void) { } #endif #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD) diff --git a/arch/x86/include/asm/pgtable-2level.h b/arch/x86/include/asm/pgtable-2level.h index 98391db840c6..f2b489cf1602 100644 --- a/arch/x86/include/asm/pgtable-2level.h +++ b/arch/x86/include/asm/pgtable-2level.h @@ -2,9 +2,9 @@ #define _ASM_X86_PGTABLE_2LEVEL_H #define pte_ERROR(e) \ - printk("%s:%d: bad pte %08lx.\n", __FILE__, __LINE__, (e).pte_low) + pr_err("%s:%d: bad pte %08lx\n", __FILE__, __LINE__, (e).pte_low) #define pgd_ERROR(e) \ - printk("%s:%d: bad pgd %08lx.\n", __FILE__, __LINE__, pgd_val(e)) + pr_err("%s:%d: bad pgd %08lx\n", __FILE__, __LINE__, pgd_val(e)) /* * Certain architectures need to do special things when PTEs diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h index 43876f16caf1..4cc9f2b7cdc3 100644 --- a/arch/x86/include/asm/pgtable-3level.h +++ b/arch/x86/include/asm/pgtable-3level.h @@ -9,13 +9,13 @@ */ #define pte_ERROR(e) \ - printk("%s:%d: bad pte %p(%08lx%08lx).\n", \ + pr_err("%s:%d: bad pte %p(%08lx%08lx)\n", \ __FILE__, __LINE__, &(e), (e).pte_high, (e).pte_low) #define pmd_ERROR(e) \ - printk("%s:%d: bad pmd %p(%016Lx).\n", \ + pr_err("%s:%d: bad pmd %p(%016Lx)\n", \ __FILE__, __LINE__, &(e), pmd_val(e)) #define pgd_ERROR(e) \ - printk("%s:%d: bad pgd %p(%016Lx).\n", \ + pr_err("%s:%d: bad pgd %p(%016Lx)\n", \ __FILE__, __LINE__, &(e), pgd_val(e)) /* Rules for using set_pte: the pte being assigned *must* be @@ -47,16 +47,26 @@ static inline void native_set_pte(pte_t *ptep, pte_t pte) * they can run pmd_offset_map_lock or pmd_trans_huge or other pmd * operations. * - * Without THP if the mmap_sem is hold for reading, the - * pmd can only transition from null to not null while pmd_read_atomic runs. - * So there's no need of literally reading it atomically. + * Without THP if the mmap_sem is hold for reading, the pmd can only + * transition from null to not null while pmd_read_atomic runs. So + * we can always return atomic pmd values with this function. * * With THP if the mmap_sem is hold for reading, the pmd can become - * THP or null or point to a pte (and in turn become "stable") at any - * time under pmd_read_atomic, so it's mandatory to read it atomically - * with cmpxchg8b. + * trans_huge or none or point to a pte (and in turn become "stable") + * at any time under pmd_read_atomic. We could read it really + * atomically here with a atomic64_read for the THP enabled case (and + * it would be a whole lot simpler), but to avoid using cmpxchg8b we + * only return an atomic pmdval if the low part of the pmdval is later + * found stable (i.e. pointing to a pte). And we're returning a none + * pmdval if the low part of the pmd is none. In some cases the high + * and low part of the pmdval returned may not be consistent if THP is + * enabled (the low part may point to previously mapped hugepage, + * while the high part may point to a more recently mapped hugepage), + * but pmd_none_or_trans_huge_or_clear_bad() only needs the low part + * of the pmd to be read atomically to decide if the pmd is unstable + * or not, with the only exception of when the low part of the pmd is + * zero in which case we return a none pmd. */ -#ifndef CONFIG_TRANSPARENT_HUGEPAGE static inline pmd_t pmd_read_atomic(pmd_t *pmdp) { pmdval_t ret; @@ -74,12 +84,6 @@ static inline pmd_t pmd_read_atomic(pmd_t *pmdp) return (pmd_t) { ret }; } -#else /* CONFIG_TRANSPARENT_HUGEPAGE */ -static inline pmd_t pmd_read_atomic(pmd_t *pmdp) -{ - return (pmd_t) { atomic64_read((atomic64_t *)pmdp) }; -} -#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte) { diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index 975f709e09ae..8251be02301e 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -26,16 +26,16 @@ extern pgd_t init_level4_pgt[]; extern void paging_init(void); #define pte_ERROR(e) \ - printk("%s:%d: bad pte %p(%016lx).\n", \ + pr_err("%s:%d: bad pte %p(%016lx)\n", \ __FILE__, __LINE__, &(e), pte_val(e)) #define pmd_ERROR(e) \ - printk("%s:%d: bad pmd %p(%016lx).\n", \ + pr_err("%s:%d: bad pmd %p(%016lx)\n", \ __FILE__, __LINE__, &(e), pmd_val(e)) #define pud_ERROR(e) \ - printk("%s:%d: bad pud %p(%016lx).\n", \ + pr_err("%s:%d: bad pud %p(%016lx)\n", \ __FILE__, __LINE__, &(e), pud_val(e)) #define pgd_ERROR(e) \ - printk("%s:%d: bad pgd %p(%016lx).\n", \ + pr_err("%s:%d: bad pgd %p(%016lx)\n", \ __FILE__, __LINE__, &(e), pgd_val(e)) struct mm_struct; diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index f48394513c37..2ffa95dc2333 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -169,11 +169,6 @@ void x86_idle_thread_init(unsigned int cpu, struct task_struct *idle); void smp_store_cpu_info(int id); #define cpu_physical_id(cpu) per_cpu(x86_cpu_to_apicid, cpu) -/* We don't mark CPUs online until __cpu_up(), so we need another measure */ -static inline int num_booting_cpus(void) -{ - return cpumask_weight(cpu_callout_mask); -} #else /* !CONFIG_SMP */ #define wbinvd_on_cpu(cpu) wbinvd() static inline int wbinvd_on_all_cpus(void) diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 04cd6882308e..e1f3a17034fc 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -33,9 +33,8 @@ #define segment_eq(a, b) ((a).seg == (b).seg) #define user_addr_max() (current_thread_info()->addr_limit.seg) -#define __addr_ok(addr) \ - ((unsigned long __force)(addr) < \ - (current_thread_info()->addr_limit.seg)) +#define __addr_ok(addr) \ + ((unsigned long __force)(addr) < user_addr_max()) /* * Test whether a block of memory is a valid user space address. @@ -47,14 +46,14 @@ * This needs 33-bit (65-bit for x86_64) arithmetic. We have a carry... */ -#define __range_not_ok(addr, size) \ +#define __range_not_ok(addr, size, limit) \ ({ \ unsigned long flag, roksum; \ __chk_user_ptr(addr); \ asm("add %3,%1 ; sbb %0,%0 ; cmp %1,%4 ; sbb $0,%0" \ : "=&r" (flag), "=r" (roksum) \ : "1" (addr), "g" ((long)(size)), \ - "rm" (current_thread_info()->addr_limit.seg)); \ + "rm" (limit)); \ flag; \ }) @@ -77,7 +76,8 @@ * checks that the pointer is in the user space range - after calling * this function, memory access functions may still return -EFAULT. */ -#define access_ok(type, addr, size) (likely(__range_not_ok(addr, size) == 0)) +#define access_ok(type, addr, size) \ + (likely(__range_not_ok(addr, size, user_addr_max()) == 0)) /* * The exception table consists of pairs of addresses relative to the diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h index 8e796fbbf9c6..d8def8b3dba0 100644 --- a/arch/x86/include/asm/uaccess_64.h +++ b/arch/x86/include/asm/uaccess_64.h @@ -17,6 +17,8 @@ /* Handles exceptions in both to and from, but doesn't do access_ok */ __must_check unsigned long +copy_user_enhanced_fast_string(void *to, const void *from, unsigned len); +__must_check unsigned long copy_user_generic_string(void *to, const void *from, unsigned len); __must_check unsigned long copy_user_generic_unrolled(void *to, const void *from, unsigned len); @@ -26,9 +28,16 @@ copy_user_generic(void *to, const void *from, unsigned len) { unsigned ret; - alternative_call(copy_user_generic_unrolled, + /* + * If CPU has ERMS feature, use copy_user_enhanced_fast_string. + * Otherwise, if CPU has rep_good feature, use copy_user_generic_string. + * Otherwise, use copy_user_generic_unrolled. + */ + alternative_call_2(copy_user_generic_unrolled, copy_user_generic_string, X86_FEATURE_REP_GOOD, + copy_user_enhanced_fast_string, + X86_FEATURE_ERMS, ASM_OUTPUT2("=a" (ret), "=D" (to), "=S" (from), "=d" (len)), "1" (to), "2" (from), "3" (len) diff --git a/arch/x86/include/asm/uprobes.h b/arch/x86/include/asm/uprobes.h index 1e9bed14f7ae..f3971bbcd1de 100644 --- a/arch/x86/include/asm/uprobes.h +++ b/arch/x86/include/asm/uprobes.h @@ -48,7 +48,7 @@ struct arch_uprobe_task { #endif }; -extern int arch_uprobe_analyze_insn(struct arch_uprobe *aup, struct mm_struct *mm); +extern int arch_uprobe_analyze_insn(struct arch_uprobe *aup, struct mm_struct *mm, unsigned long addr); extern int arch_uprobe_pre_xol(struct arch_uprobe *aup, struct pt_regs *regs); extern int arch_uprobe_post_xol(struct arch_uprobe *aup, struct pt_regs *regs); extern bool arch_uprobe_xol_was_trapped(struct task_struct *tsk); diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h index becf47b81735..6149b476d9df 100644 --- a/arch/x86/include/asm/uv/uv_bau.h +++ b/arch/x86/include/asm/uv/uv_bau.h @@ -149,7 +149,6 @@ /* 4 bits of software ack period */ #define UV2_ACK_MASK 0x7UL #define UV2_ACK_UNITS_SHFT 3 -#define UV2_LEG_SHFT UV2H_LB_BAU_MISC_CONTROL_USE_LEGACY_DESCRIPTOR_FORMATS_SHFT #define UV2_EXT_SHFT UV2H_LB_BAU_MISC_CONTROL_ENABLE_EXTENDED_SB_STATUS_SHFT /* diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 8afb69319815..b2297e58c6ed 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -422,12 +422,14 @@ acpi_parse_int_src_ovr(struct acpi_subtable_header * header, return 0; } - if (intsrc->source_irq == 0 && intsrc->global_irq == 2) { + if (intsrc->source_irq == 0) { if (acpi_skip_timer_override) { - printk(PREFIX "BIOS IRQ0 pin2 override ignored.\n"); + printk(PREFIX "BIOS IRQ0 override ignored.\n"); return 0; } - if (acpi_fix_pin2_polarity && (intsrc->inti_flags & ACPI_MADT_POLARITY_MASK)) { + + if ((intsrc->global_irq == 2) && acpi_fix_pin2_polarity + && (intsrc->inti_flags & ACPI_MADT_POLARITY_MASK)) { intsrc->inti_flags &= ~ACPI_MADT_POLARITY_MASK; printk(PREFIX "BIOS IRQ0 pin2 override: forcing polarity to high active.\n"); } @@ -1334,17 +1336,12 @@ static int __init dmi_disable_acpi(const struct dmi_system_id *d) } /* - * Force ignoring BIOS IRQ0 pin2 override + * Force ignoring BIOS IRQ0 override */ static int __init dmi_ignore_irq0_timer_override(const struct dmi_system_id *d) { - /* - * The ati_ixp4x0_rev() early PCI quirk should have set - * the acpi_skip_timer_override flag already: - */ if (!acpi_skip_timer_override) { - WARN(1, KERN_ERR "ati_ixp4x0 quirk not complete.\n"); - pr_notice("%s detected: Ignoring BIOS IRQ0 pin2 override\n", + pr_notice("%s detected: Ignoring BIOS IRQ0 override\n", d->ident); acpi_skip_timer_override = 1; } @@ -1438,7 +1435,7 @@ static struct dmi_system_id __initdata acpi_dmi_table_late[] = { * is enabled. This input is incorrectly designated the * ISA IRQ 0 via an interrupt source override even though * it is wired to the output of the master 8259A and INTIN0 - * is not connected at all. Force ignoring BIOS IRQ0 pin2 + * is not connected at all. Force ignoring BIOS IRQ0 * override in that cases. */ { @@ -1473,6 +1470,14 @@ static struct dmi_system_id __initdata acpi_dmi_table_late[] = { DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq 6715b"), }, }, + { + .callback = dmi_ignore_irq0_timer_override, + .ident = "FUJITSU SIEMENS", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "FUJITSU SIEMENS"), + DMI_MATCH(DMI_PRODUCT_NAME, "AMILO PRO V2030"), + }, + }, {} }; diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 1f84794f0759..931280ff8299 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -1,3 +1,5 @@ +#define pr_fmt(fmt) "SMP alternatives: " fmt + #include <linux/module.h> #include <linux/sched.h> #include <linux/mutex.h> @@ -63,8 +65,11 @@ static int __init setup_noreplace_paravirt(char *str) __setup("noreplace-paravirt", setup_noreplace_paravirt); #endif -#define DPRINTK(fmt, args...) if (debug_alternative) \ - printk(KERN_DEBUG fmt, args) +#define DPRINTK(fmt, ...) \ +do { \ + if (debug_alternative) \ + printk(KERN_DEBUG fmt, ##__VA_ARGS__); \ +} while (0) /* * Each GENERIC_NOPX is of X bytes, and defined as an array of bytes @@ -428,7 +433,7 @@ void alternatives_smp_switch(int smp) * If this still occurs then you should see a hang * or crash shortly after this line: */ - printk("lockdep: fixing up alternatives.\n"); + pr_info("lockdep: fixing up alternatives\n"); #endif if (noreplace_smp || smp_alt_once || skip_smp_alternatives) @@ -444,14 +449,14 @@ void alternatives_smp_switch(int smp) if (smp == smp_mode) { /* nothing */ } else if (smp) { - printk(KERN_INFO "SMP alternatives: switching to SMP code\n"); + pr_info("switching to SMP code\n"); clear_cpu_cap(&boot_cpu_data, X86_FEATURE_UP); clear_cpu_cap(&cpu_data(0), X86_FEATURE_UP); list_for_each_entry(mod, &smp_alt_modules, next) alternatives_smp_lock(mod->locks, mod->locks_end, mod->text, mod->text_end); } else { - printk(KERN_INFO "SMP alternatives: switching to UP code\n"); + pr_info("switching to UP code\n"); set_cpu_cap(&boot_cpu_data, X86_FEATURE_UP); set_cpu_cap(&cpu_data(0), X86_FEATURE_UP); list_for_each_entry(mod, &smp_alt_modules, next) @@ -546,7 +551,7 @@ void __init alternative_instructions(void) #ifdef CONFIG_SMP if (smp_alt_once) { if (1 == num_possible_cpus()) { - printk(KERN_INFO "SMP alternatives: switching to UP code\n"); + pr_info("switching to UP code\n"); set_cpu_cap(&boot_cpu_data, X86_FEATURE_UP); set_cpu_cap(&cpu_data(0), X86_FEATURE_UP); @@ -664,7 +669,7 @@ static int __kprobes stop_machine_text_poke(void *data) struct text_poke_param *p; int i; - if (atomic_dec_and_test(&stop_machine_first)) { + if (atomic_xchg(&stop_machine_first, 0)) { for (i = 0; i < tpp->nparams; i++) { p = &tpp->params[i]; text_poke(p->addr, p->opcode, p->len); diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index be16854591cc..f29f6dd6bc08 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -2,6 +2,9 @@ * Shared support code for AMD K8 northbridges and derivates. * Copyright 2006 Andi Kleen, SUSE Labs. Subject to GPLv2. */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/types.h> #include <linux/slab.h> #include <linux/init.h> @@ -258,7 +261,7 @@ void amd_flush_garts(void) } spin_unlock_irqrestore(&gart_lock, flags); if (!flushed) - printk("nothing to flush?\n"); + pr_notice("nothing to flush?\n"); } EXPORT_SYMBOL_GPL(amd_flush_garts); @@ -269,11 +272,10 @@ static __init int init_amd_nbs(void) err = amd_cache_northbridges(); if (err < 0) - printk(KERN_NOTICE "AMD NB: Cannot enumerate AMD northbridges.\n"); + pr_notice("Cannot enumerate AMD northbridges\n"); if (amd_cache_gart() < 0) - printk(KERN_NOTICE "AMD NB: Cannot initialize GART flush words, " - "GART support disabled.\n"); + pr_notice("Cannot initialize GART flush words, GART support disabled\n"); return err; } diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index 6e76c191a835..d5fd66f0d4cd 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -20,7 +20,6 @@ #include <linux/bitops.h> #include <linux/ioport.h> #include <linux/suspend.h> -#include <linux/kmemleak.h> #include <asm/e820.h> #include <asm/io.h> #include <asm/iommu.h> @@ -95,11 +94,6 @@ static u32 __init allocate_aperture(void) return 0; } memblock_reserve(addr, aper_size); - /* - * Kmemleak should not scan this block as it may not be mapped via the - * kernel direct mapping. - */ - kmemleak_ignore(phys_to_virt(addr)); printk(KERN_INFO "Mapping aperture over %d KB of RAM @ %lx\n", aper_size >> 10, addr); insert_aperture_resource((u32)addr, aper_size); diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 9684f963befe..406eee784684 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -448,8 +448,8 @@ static int __add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pi entry = alloc_irq_pin_list(node); if (!entry) { - printk(KERN_ERR "can not alloc irq_pin_list (%d,%d,%d)\n", - node, apic, pin); + pr_err("can not alloc irq_pin_list (%d,%d,%d)\n", + node, apic, pin); return -ENOMEM; } entry->apic = apic; @@ -661,7 +661,7 @@ static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) ioapic_mask_entry(apic, pin); entry = ioapic_read_entry(apic, pin); if (entry.irr) - printk(KERN_ERR "Unable to reset IRR for apic: %d, pin :%d\n", + pr_err("Unable to reset IRR for apic: %d, pin :%d\n", mpc_ioapic_id(apic), pin); } @@ -895,7 +895,7 @@ static int irq_polarity(int idx) } case 2: /* reserved */ { - printk(KERN_WARNING "broken BIOS!!\n"); + pr_warn("broken BIOS!!\n"); polarity = 1; break; } @@ -906,7 +906,7 @@ static int irq_polarity(int idx) } default: /* invalid */ { - printk(KERN_WARNING "broken BIOS!!\n"); + pr_warn("broken BIOS!!\n"); polarity = 1; break; } @@ -948,7 +948,7 @@ static int irq_trigger(int idx) } default: { - printk(KERN_WARNING "broken BIOS!!\n"); + pr_warn("broken BIOS!!\n"); trigger = 1; break; } @@ -962,7 +962,7 @@ static int irq_trigger(int idx) } case 2: /* reserved */ { - printk(KERN_WARNING "broken BIOS!!\n"); + pr_warn("broken BIOS!!\n"); trigger = 1; break; } @@ -973,7 +973,7 @@ static int irq_trigger(int idx) } default: /* invalid */ { - printk(KERN_WARNING "broken BIOS!!\n"); + pr_warn("broken BIOS!!\n"); trigger = 0; break; } @@ -991,7 +991,7 @@ static int pin_2_irq(int idx, int apic, int pin) * Debugging check, we are in big trouble if this message pops up! */ if (mp_irqs[idx].dstirq != pin) - printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n"); + pr_err("broken BIOS or MPTABLE parser, ayiee!!\n"); if (test_bit(bus, mp_bus_not_pci)) { irq = mp_irqs[idx].srcbusirq; @@ -1204,7 +1204,7 @@ static void __clear_irq_vector(int irq, struct irq_cfg *cfg) BUG_ON(!cfg->vector); vector = cfg->vector; - for_each_cpu_and(cpu, cfg->domain, cpu_online_mask) + for_each_cpu(cpu, cfg->domain) per_cpu(vector_irq, cpu)[vector] = -1; cfg->vector = 0; @@ -1212,7 +1212,7 @@ static void __clear_irq_vector(int irq, struct irq_cfg *cfg) if (likely(!cfg->move_in_progress)) return; - for_each_cpu_and(cpu, cfg->old_domain, cpu_online_mask) { + for_each_cpu(cpu, cfg->old_domain) { for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { if (per_cpu(vector_irq, cpu)[vector] != irq) @@ -1535,7 +1535,6 @@ __apicdebuginit(void) print_IO_APIC(int ioapic_idx) reg_03.raw = io_apic_read(ioapic_idx, 3); raw_spin_unlock_irqrestore(&ioapic_lock, flags); - printk("\n"); printk(KERN_DEBUG "IO APIC #%d......\n", mpc_ioapic_id(ioapic_idx)); printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw); printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID); @@ -1592,7 +1591,7 @@ __apicdebuginit(void) print_IO_APIC(int ioapic_idx) i, ir_entry->index ); - printk("%1d %1d %1d %1d %1d " + pr_cont("%1d %1d %1d %1d %1d " "%1d %1d %X %02X\n", ir_entry->format, ir_entry->mask, @@ -1612,7 +1611,7 @@ __apicdebuginit(void) print_IO_APIC(int ioapic_idx) i, entry.dest ); - printk("%1d %1d %1d %1d %1d " + pr_cont("%1d %1d %1d %1d %1d " "%1d %1d %02X\n", entry.mask, entry.trigger, @@ -1665,8 +1664,8 @@ __apicdebuginit(void) print_IO_APICs(void) continue; printk(KERN_DEBUG "IRQ%d ", irq); for_each_irq_pin(entry, cfg->irq_2_pin) - printk("-> %d:%d", entry->apic, entry->pin); - printk("\n"); + pr_cont("-> %d:%d", entry->apic, entry->pin); + pr_cont("\n"); } printk(KERN_INFO ".................................... done.\n"); @@ -1679,9 +1678,9 @@ __apicdebuginit(void) print_APIC_field(int base) printk(KERN_DEBUG); for (i = 0; i < 8; i++) - printk(KERN_CONT "%08x", apic_read(base + i*0x10)); + pr_cont("%08x", apic_read(base + i*0x10)); - printk(KERN_CONT "\n"); + pr_cont("\n"); } __apicdebuginit(void) print_local_APIC(void *dummy) @@ -1783,7 +1782,7 @@ __apicdebuginit(void) print_local_APIC(void *dummy) printk(KERN_DEBUG "... APIC EILVT%d: %08x\n", i, v); } } - printk("\n"); + pr_cont("\n"); } __apicdebuginit(void) print_local_APICs(int maxcpu) @@ -2079,7 +2078,7 @@ void __init setup_ioapic_ids_from_mpc_nocheck(void) reg_00.raw = io_apic_read(ioapic_idx, 0); raw_spin_unlock_irqrestore(&ioapic_lock, flags); if (reg_00.bits.ID != mpc_ioapic_id(ioapic_idx)) - printk("could not set ID!\n"); + pr_cont("could not set ID!\n"); else apic_printk(APIC_VERBOSE, " ok.\n"); } @@ -3575,7 +3574,8 @@ static int __init io_apic_get_unique_id(int ioapic, int apic_id) /* Sanity check */ if (reg_00.bits.ID != apic_id) { - printk("IOAPIC[%d]: Unable to change apic_id!\n", ioapic); + pr_err("IOAPIC[%d]: Unable to change apic_id!\n", + ioapic); return -1; } } diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c index b6e61857c29f..77c95c0e1bf7 100644 --- a/arch/x86/kernel/apic/summit_32.c +++ b/arch/x86/kernel/apic/summit_32.c @@ -26,6 +26,8 @@ * */ +#define pr_fmt(fmt) "summit: %s: " fmt, __func__ + #include <linux/mm.h> #include <linux/init.h> #include <asm/io.h> @@ -235,8 +237,8 @@ static int summit_apic_id_registered(void) static void summit_setup_apic_routing(void) { - printk("Enabling APIC mode: Summit. Using %d I/O APICs\n", - nr_ioapics); + pr_info("Enabling APIC mode: Summit. Using %d I/O APICs\n", + nr_ioapics); } static int summit_cpu_present_to_apicid(int mps_cpu) @@ -276,7 +278,7 @@ summit_cpu_mask_to_apicid(const struct cpumask *cpumask, unsigned int *dest_id) int new_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu); if (round && APIC_CLUSTER(apicid) != APIC_CLUSTER(new_apicid)) { - printk("%s: Not a valid mask!\n", __func__); + pr_err("Not a valid mask!\n"); return -EINVAL; } apicid |= new_apicid; @@ -346,7 +348,7 @@ static int setup_pci_node_map_for_wpeg(int wpeg_num, int last_bus) } } if (i == rio_table_hdr->num_rio_dev) { - printk(KERN_ERR "%s: Couldn't find owner Cyclone for Winnipeg!\n", __func__); + pr_err("Couldn't find owner Cyclone for Winnipeg!\n"); return last_bus; } @@ -357,7 +359,7 @@ static int setup_pci_node_map_for_wpeg(int wpeg_num, int last_bus) } } if (i == rio_table_hdr->num_scal_dev) { - printk(KERN_ERR "%s: Couldn't find owner Twister for Cyclone!\n", __func__); + pr_err("Couldn't find owner Twister for Cyclone!\n"); return last_bus; } @@ -387,7 +389,7 @@ static int setup_pci_node_map_for_wpeg(int wpeg_num, int last_bus) num_buses = 9; break; default: - printk(KERN_INFO "%s: Unsupported Winnipeg type!\n", __func__); + pr_info("Unsupported Winnipeg type!\n"); return last_bus; } @@ -402,13 +404,15 @@ static int build_detail_arrays(void) int i, scal_detail_size, rio_detail_size; if (rio_table_hdr->num_scal_dev > MAX_NUMNODES) { - printk(KERN_WARNING "%s: MAX_NUMNODES too low! Defined as %d, but system has %d nodes.\n", __func__, MAX_NUMNODES, rio_table_hdr->num_scal_dev); + pr_warn("MAX_NUMNODES too low! Defined as %d, but system has %d nodes\n", + MAX_NUMNODES, rio_table_hdr->num_scal_dev); return 0; } switch (rio_table_hdr->version) { default: - printk(KERN_WARNING "%s: Invalid Rio Grande Table Version: %d\n", __func__, rio_table_hdr->version); + pr_warn("Invalid Rio Grande Table Version: %d\n", + rio_table_hdr->version); return 0; case 2: scal_detail_size = 11; @@ -453,7 +457,7 @@ void setup_summit(void) offset = *((unsigned short *)(ptr + offset)); } if (!rio_table_hdr) { - printk(KERN_ERR "%s: Unable to locate Rio Grande Table in EBDA - bailing!\n", __func__); + pr_err("Unable to locate Rio Grande Table in EBDA - bailing!\n"); return; } diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 07b0c0db466c..d65464e43503 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -201,6 +201,8 @@ * http://www.microsoft.com/whdc/archive/amp_12.mspx] */ +#define pr_fmt(fmt) "apm: " fmt + #include <linux/module.h> #include <linux/poll.h> @@ -485,11 +487,11 @@ static void apm_error(char *str, int err) if (error_table[i].key == err) break; if (i < ERROR_COUNT) - printk(KERN_NOTICE "apm: %s: %s\n", str, error_table[i].msg); + pr_notice("%s: %s\n", str, error_table[i].msg); else if (err < 0) - printk(KERN_NOTICE "apm: %s: linux error code %i\n", str, err); + pr_notice("%s: linux error code %i\n", str, err); else - printk(KERN_NOTICE "apm: %s: unknown error code %#2.2x\n", + pr_notice("%s: unknown error code %#2.2x\n", str, err); } @@ -1184,7 +1186,7 @@ static void queue_event(apm_event_t event, struct apm_user *sender) static int notified; if (notified++ == 0) - printk(KERN_ERR "apm: an event queue overflowed\n"); + pr_err("an event queue overflowed\n"); if (++as->event_tail >= APM_MAX_EVENTS) as->event_tail = 0; } @@ -1447,7 +1449,7 @@ static void apm_mainloop(void) static int check_apm_user(struct apm_user *as, const char *func) { if (as == NULL || as->magic != APM_BIOS_MAGIC) { - printk(KERN_ERR "apm: %s passed bad filp\n", func); + pr_err("%s passed bad filp\n", func); return 1; } return 0; @@ -1586,7 +1588,7 @@ static int do_release(struct inode *inode, struct file *filp) as1 = as1->next) ; if (as1 == NULL) - printk(KERN_ERR "apm: filp not in user list\n"); + pr_err("filp not in user list\n"); else as1->next = as->next; } @@ -1600,11 +1602,9 @@ static int do_open(struct inode *inode, struct file *filp) struct apm_user *as; as = kmalloc(sizeof(*as), GFP_KERNEL); - if (as == NULL) { - printk(KERN_ERR "apm: cannot allocate struct of size %d bytes\n", - sizeof(*as)); + if (as == NULL) return -ENOMEM; - } + as->magic = APM_BIOS_MAGIC; as->event_tail = as->event_head = 0; as->suspends_pending = as->standbys_pending = 0; @@ -2313,16 +2313,16 @@ static int __init apm_init(void) } if (apm_info.disabled) { - printk(KERN_NOTICE "apm: disabled on user request.\n"); + pr_notice("disabled on user request.\n"); return -ENODEV; } if ((num_online_cpus() > 1) && !power_off && !smp) { - printk(KERN_NOTICE "apm: disabled - APM is not SMP safe.\n"); + pr_notice("disabled - APM is not SMP safe.\n"); apm_info.disabled = 1; return -ENODEV; } if (!acpi_disabled) { - printk(KERN_NOTICE "apm: overridden by ACPI.\n"); + pr_notice("overridden by ACPI.\n"); apm_info.disabled = 1; return -ENODEV; } @@ -2356,8 +2356,7 @@ static int __init apm_init(void) kapmd_task = kthread_create(apm, NULL, "kapmd"); if (IS_ERR(kapmd_task)) { - printk(KERN_ERR "apm: disabled - Unable to start kernel " - "thread.\n"); + pr_err("disabled - Unable to start kernel thread\n"); err = PTR_ERR(kapmd_task); kapmd_task = NULL; remove_proc_entry("apm", NULL); diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 6ab6aa2fdfdd..bac4c3804cc7 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -32,7 +32,9 @@ obj-$(CONFIG_PERF_EVENTS) += perf_event.o ifdef CONFIG_PERF_EVENTS obj-$(CONFIG_CPU_SUP_AMD) += perf_event_amd.o -obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_p6.o perf_event_p4.o perf_event_intel_lbr.o perf_event_intel_ds.o perf_event_intel.o +obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_p6.o perf_event_p4.o +obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_lbr.o perf_event_intel_ds.o perf_event_intel.o +obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_uncore.o endif obj-$(CONFIG_X86_MCE) += mcheck/ diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 146bb6218eec..9d92e19039f0 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -19,6 +19,39 @@ #include "cpu.h" +static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p) +{ + struct cpuinfo_x86 *c = &cpu_data(smp_processor_id()); + u32 gprs[8] = { 0 }; + int err; + + WARN_ONCE((c->x86 != 0xf), "%s should only be used on K8!\n", __func__); + + gprs[1] = msr; + gprs[7] = 0x9c5a203a; + + err = rdmsr_safe_regs(gprs); + + *p = gprs[0] | ((u64)gprs[2] << 32); + + return err; +} + +static inline int wrmsrl_amd_safe(unsigned msr, unsigned long long val) +{ + struct cpuinfo_x86 *c = &cpu_data(smp_processor_id()); + u32 gprs[8] = { 0 }; + + WARN_ONCE((c->x86 != 0xf), "%s should only be used on K8!\n", __func__); + + gprs[0] = (u32)val; + gprs[1] = msr; + gprs[2] = val >> 32; + gprs[7] = 0x9c5a203a; + + return wrmsr_safe_regs(gprs); +} + #ifdef CONFIG_X86_32 /* * B step AMD K6 before B 9730xxxx have hardware bugs that can cause @@ -586,9 +619,9 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) !cpu_has(c, X86_FEATURE_TOPOEXT)) { u64 val; - if (!rdmsrl_amd_safe(0xc0011005, &val)) { + if (!rdmsrl_safe(0xc0011005, &val)) { val |= 1ULL << 54; - wrmsrl_amd_safe(0xc0011005, val); + wrmsrl_safe(0xc0011005, val); rdmsrl(0xc0011005, val); if (val & (1ULL << 54)) { set_cpu_cap(c, X86_FEATURE_TOPOEXT); @@ -679,7 +712,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) err = rdmsrl_safe(MSR_AMD64_MCx_MASK(4), &mask); if (err == 0) { mask |= (1 << 10); - checking_wrmsrl(MSR_AMD64_MCx_MASK(4), mask); + wrmsrl_safe(MSR_AMD64_MCx_MASK(4), mask); } } diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 46674fbb62ba..c97bb7b5a9f8 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -55,8 +55,8 @@ static void __init check_fpu(void) if (!boot_cpu_data.hard_math) { #ifndef CONFIG_MATH_EMULATION - printk(KERN_EMERG "No coprocessor found and no math emulation present.\n"); - printk(KERN_EMERG "Giving up.\n"); + pr_emerg("No coprocessor found and no math emulation present\n"); + pr_emerg("Giving up\n"); for (;;) ; #endif return; @@ -86,7 +86,7 @@ static void __init check_fpu(void) boot_cpu_data.fdiv_bug = fdiv_bug; if (boot_cpu_data.fdiv_bug) - printk(KERN_WARNING "Hmm, FPU with FDIV bug.\n"); + pr_warn("Hmm, FPU with FDIV bug\n"); } static void __init check_hlt(void) @@ -94,16 +94,16 @@ static void __init check_hlt(void) if (boot_cpu_data.x86 >= 5 || paravirt_enabled()) return; - printk(KERN_INFO "Checking 'hlt' instruction... "); + pr_info("Checking 'hlt' instruction... "); if (!boot_cpu_data.hlt_works_ok) { - printk("disabled\n"); + pr_cont("disabled\n"); return; } halt(); halt(); halt(); halt(); - printk(KERN_CONT "OK.\n"); + pr_cont("OK\n"); } /* @@ -116,7 +116,7 @@ static void __init check_popad(void) #ifndef CONFIG_X86_POPAD_OK int res, inp = (int) &res; - printk(KERN_INFO "Checking for popad bug... "); + pr_info("Checking for popad bug... "); __asm__ __volatile__( "movl $12345678,%%eax; movl $0,%%edi; pusha; popa; movl (%%edx,%%edi),%%ecx " : "=&a" (res) @@ -127,9 +127,9 @@ static void __init check_popad(void) * CPU hard. Too bad. */ if (res != 12345678) - printk(KERN_CONT "Buggy.\n"); + pr_cont("Buggy\n"); else - printk(KERN_CONT "OK.\n"); + pr_cont("OK\n"); #endif } @@ -161,7 +161,7 @@ void __init check_bugs(void) { identify_boot_cpu(); #ifndef CONFIG_SMP - printk(KERN_INFO "CPU: "); + pr_info("CPU: "); print_cpu_info(&boot_cpu_data); #endif check_config(); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 6b9333b429ba..5bbc082c47ad 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -947,7 +947,7 @@ static void __cpuinit __print_cpu_msr(void) index_max = msr_range_array[i].max; for (index = index_min; index < index_max; index++) { - if (rdmsrl_amd_safe(index, &val)) + if (rdmsrl_safe(index, &val)) continue; printk(KERN_INFO " MSR%08x: %016llx\n", index, val); } diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 0a687fd185e6..5a5a5dc1ff15 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -7,6 +7,9 @@ * Copyright 2008 Intel Corporation * Author: Andi Kleen */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/thread_info.h> #include <linux/capability.h> #include <linux/miscdevice.h> @@ -210,7 +213,7 @@ static void drain_mcelog_buffer(void) cpu_relax(); if (!m->finished && retries >= 4) { - pr_err("MCE: skipping error being logged currently!\n"); + pr_err("skipping error being logged currently!\n"); break; } } @@ -1167,8 +1170,9 @@ int memory_failure(unsigned long pfn, int vector, int flags) { /* mce_severity() should not hand us an ACTION_REQUIRED error */ BUG_ON(flags & MF_ACTION_REQUIRED); - printk(KERN_ERR "Uncorrected memory error in page 0x%lx ignored\n" - "Rebuild kernel with CONFIG_MEMORY_FAILURE=y for smarter handling\n", pfn); + pr_err("Uncorrected memory error in page 0x%lx ignored\n" + "Rebuild kernel with CONFIG_MEMORY_FAILURE=y for smarter handling\n", + pfn); return 0; } @@ -1274,7 +1278,7 @@ static void mce_timer_fn(unsigned long data) */ iv = __this_cpu_read(mce_next_interval); if (mce_notify_irq()) - iv = max(iv, (unsigned long) HZ/100); + iv = max(iv / 2, (unsigned long) HZ/100); else iv = min(iv * 2, round_jiffies_relative(check_interval * HZ)); __this_cpu_write(mce_next_interval, iv); @@ -1358,11 +1362,10 @@ static int __cpuinit __mcheck_cpu_cap_init(void) b = cap & MCG_BANKCNT_MASK; if (!banks) - printk(KERN_INFO "mce: CPU supports %d MCE banks\n", b); + pr_info("CPU supports %d MCE banks\n", b); if (b > MAX_NR_BANKS) { - printk(KERN_WARNING - "MCE: Using only %u machine check banks out of %u\n", + pr_warn("Using only %u machine check banks out of %u\n", MAX_NR_BANKS, b); b = MAX_NR_BANKS; } @@ -1419,7 +1422,7 @@ static void __mcheck_cpu_init_generic(void) static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) { if (c->x86_vendor == X86_VENDOR_UNKNOWN) { - pr_info("MCE: unknown CPU type - not enabling MCE support.\n"); + pr_info("unknown CPU type - not enabling MCE support\n"); return -EOPNOTSUPP; } @@ -1557,7 +1560,7 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c) static void __mcheck_cpu_init_timer(void) { struct timer_list *t = &__get_cpu_var(mce_timer); - unsigned long iv = __this_cpu_read(mce_next_interval); + unsigned long iv = check_interval * HZ; setup_timer(t, mce_timer_fn, smp_processor_id()); @@ -1574,7 +1577,7 @@ static void __mcheck_cpu_init_timer(void) /* Handle unconfigured int18 (should never happen) */ static void unexpected_machine_check(struct pt_regs *regs, long error_code) { - printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n", + pr_err("CPU#%d: Unexpected int18 (Machine Check)\n", smp_processor_id()); } @@ -1893,8 +1896,7 @@ static int __init mcheck_enable(char *str) get_option(&str, &monarch_timeout); } } else { - printk(KERN_INFO "mce argument %s ignored. Please use /sys\n", - str); + pr_info("mce argument %s ignored. Please use /sys\n", str); return 0; } return 1; diff --git a/arch/x86/kernel/cpu/mkcapflags.pl b/arch/x86/kernel/cpu/mkcapflags.pl index dfea390e1608..c7b3fe2d72e0 100644 --- a/arch/x86/kernel/cpu/mkcapflags.pl +++ b/arch/x86/kernel/cpu/mkcapflags.pl @@ -1,4 +1,4 @@ -#!/usr/bin/perl +#!/usr/bin/perl -w # # Generate the x86_cap_flags[] array from include/asm-x86/cpufeature.h # @@ -11,22 +11,35 @@ open(OUT, "> $out\0") or die "$0: cannot create: $out: $!\n"; print OUT "#include <asm/cpufeature.h>\n\n"; print OUT "const char * const x86_cap_flags[NCAPINTS*32] = {\n"; +%features = (); +$err = 0; + while (defined($line = <IN>)) { if ($line =~ /^\s*\#\s*define\s+(X86_FEATURE_(\S+))\s+(.*)$/) { $macro = $1; - $feature = $2; + $feature = "\L$2"; $tail = $3; if ($tail =~ /\/\*\s*\"([^"]*)\".*\*\//) { - $feature = $1; + $feature = "\L$1"; } - if ($feature ne '') { - printf OUT "\t%-32s = \"%s\",\n", - "[$macro]", "\L$feature"; + next if ($feature eq ''); + + if ($features{$feature}++) { + print STDERR "$in: duplicate feature name: $feature\n"; + $err++; } + printf OUT "\t%-32s = \"%s\",\n", "[$macro]", $feature; } } print OUT "};\n"; close(IN); close(OUT); + +if ($err) { + unlink($out); + exit(1); +} + +exit(0); diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c index bdda2e6c673b..35ffda5d0727 100644 --- a/arch/x86/kernel/cpu/mtrr/cleanup.c +++ b/arch/x86/kernel/cpu/mtrr/cleanup.c @@ -258,11 +258,11 @@ range_to_mtrr(unsigned int reg, unsigned long range_startk, /* Compute the maximum size with which we can make a range: */ if (range_startk) - max_align = ffs(range_startk) - 1; + max_align = __ffs(range_startk); else - max_align = 32; + max_align = BITS_PER_LONG - 1; - align = fls(range_sizek) - 1; + align = __fls(range_sizek); if (align > max_align) align = max_align; diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 75772ae6c65f..e9fe907cd249 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -361,11 +361,7 @@ static void __init print_mtrr_state(void) } pr_debug("MTRR variable ranges %sabled:\n", mtrr_state.enabled & 2 ? "en" : "dis"); - if (size_or_mask & 0xffffffffUL) - high_width = ffs(size_or_mask & 0xffffffffUL) - 1; - else - high_width = ffs(size_or_mask>>32) + 32 - 1; - high_width = (high_width - (32 - PAGE_SHIFT) + 3) / 4; + high_width = (__ffs64(size_or_mask) - (32 - PAGE_SHIFT) + 3) / 4; for (i = 0; i < num_var_ranges; ++i) { if (mtrr_state.var_ranges[i].mask_lo & (1 << 11)) diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index e049d6da0183..29557aa06dda 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -35,17 +35,6 @@ #include "perf_event.h" -#if 0 -#undef wrmsrl -#define wrmsrl(msr, val) \ -do { \ - trace_printk("wrmsrl(%lx, %lx)\n", (unsigned long)(msr),\ - (unsigned long)(val)); \ - native_write_msr((msr), (u32)((u64)(val)), \ - (u32)((u64)(val) >> 32)); \ -} while (0) -#endif - struct x86_pmu x86_pmu __read_mostly; DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = { @@ -74,7 +63,7 @@ u64 x86_perf_event_update(struct perf_event *event) int idx = hwc->idx; s64 delta; - if (idx == X86_PMC_IDX_FIXED_BTS) + if (idx == INTEL_PMC_IDX_FIXED_BTS) return 0; /* @@ -86,7 +75,7 @@ u64 x86_perf_event_update(struct perf_event *event) */ again: prev_raw_count = local64_read(&hwc->prev_count); - rdmsrl(hwc->event_base, new_raw_count); + rdpmcl(hwc->event_base_rdpmc, new_raw_count); if (local64_cmpxchg(&hwc->prev_count, prev_raw_count, new_raw_count) != prev_raw_count) @@ -189,7 +178,7 @@ static void release_pmc_hardware(void) {} static bool check_hw_exists(void) { - u64 val, val_new = 0; + u64 val, val_new = ~0; int i, reg, ret = 0; /* @@ -222,8 +211,9 @@ static bool check_hw_exists(void) * that don't trap on the MSR access and always return 0s. */ val = 0xabcdUL; - ret = checking_wrmsrl(x86_pmu_event_addr(0), val); - ret |= rdmsrl_safe(x86_pmu_event_addr(0), &val_new); + reg = x86_pmu_event_addr(0); + ret = wrmsrl_safe(reg, val); + ret |= rdmsrl_safe(reg, &val_new); if (ret || val != val_new) goto msr_fail; @@ -240,6 +230,7 @@ bios_fail: msr_fail: printk(KERN_CONT "Broken PMU hardware detected, using software events only.\n"); + printk(KERN_ERR "Failed to access perfctr msr (MSR %x is %Lx)\n", reg, val_new); return false; } @@ -388,7 +379,7 @@ int x86_pmu_hw_config(struct perf_event *event) int precise = 0; /* Support for constant skid */ - if (x86_pmu.pebs_active) { + if (x86_pmu.pebs_active && !x86_pmu.pebs_broken) { precise++; /* Support for IP fixup */ @@ -637,8 +628,8 @@ static bool __perf_sched_find_counter(struct perf_sched *sched) c = sched->constraints[sched->state.event]; /* Prefer fixed purpose counters */ - if (x86_pmu.num_counters_fixed) { - idx = X86_PMC_IDX_FIXED; + if (c->idxmsk64 & (~0ULL << INTEL_PMC_IDX_FIXED)) { + idx = INTEL_PMC_IDX_FIXED; for_each_set_bit_from(idx, c->idxmsk, X86_PMC_IDX_MAX) { if (!__test_and_set_bit(idx, sched->state.used)) goto done; @@ -646,7 +637,7 @@ static bool __perf_sched_find_counter(struct perf_sched *sched) } /* Grab the first unused counter starting with idx */ idx = sched->state.counter; - for_each_set_bit_from(idx, c->idxmsk, X86_PMC_IDX_FIXED) { + for_each_set_bit_from(idx, c->idxmsk, INTEL_PMC_IDX_FIXED) { if (!__test_and_set_bit(idx, sched->state.used)) goto done; } @@ -704,8 +695,8 @@ static bool perf_sched_next_event(struct perf_sched *sched) /* * Assign a counter for each event. */ -static int perf_assign_events(struct event_constraint **constraints, int n, - int wmin, int wmax, int *assign) +int perf_assign_events(struct event_constraint **constraints, int n, + int wmin, int wmax, int *assign) { struct perf_sched sched; @@ -824,15 +815,17 @@ static inline void x86_assign_hw_event(struct perf_event *event, hwc->last_cpu = smp_processor_id(); hwc->last_tag = ++cpuc->tags[i]; - if (hwc->idx == X86_PMC_IDX_FIXED_BTS) { + if (hwc->idx == INTEL_PMC_IDX_FIXED_BTS) { hwc->config_base = 0; hwc->event_base = 0; - } else if (hwc->idx >= X86_PMC_IDX_FIXED) { + } else if (hwc->idx >= INTEL_PMC_IDX_FIXED) { hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL; - hwc->event_base = MSR_ARCH_PERFMON_FIXED_CTR0 + (hwc->idx - X86_PMC_IDX_FIXED); + hwc->event_base = MSR_ARCH_PERFMON_FIXED_CTR0 + (hwc->idx - INTEL_PMC_IDX_FIXED); + hwc->event_base_rdpmc = (hwc->idx - INTEL_PMC_IDX_FIXED) | 1<<30; } else { hwc->config_base = x86_pmu_config_addr(hwc->idx); hwc->event_base = x86_pmu_event_addr(hwc->idx); + hwc->event_base_rdpmc = hwc->idx; } } @@ -930,7 +923,7 @@ int x86_perf_event_set_period(struct perf_event *event) s64 period = hwc->sample_period; int ret = 0, idx = hwc->idx; - if (idx == X86_PMC_IDX_FIXED_BTS) + if (idx == INTEL_PMC_IDX_FIXED_BTS) return 0; /* @@ -1316,7 +1309,6 @@ static struct attribute_group x86_pmu_format_group = { static int __init init_hw_perf_events(void) { struct x86_pmu_quirk *quirk; - struct event_constraint *c; int err; pr_info("Performance Events: "); @@ -1347,21 +1339,8 @@ static int __init init_hw_perf_events(void) for (quirk = x86_pmu.quirks; quirk; quirk = quirk->next) quirk->func(); - if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) { - WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!", - x86_pmu.num_counters, X86_PMC_MAX_GENERIC); - x86_pmu.num_counters = X86_PMC_MAX_GENERIC; - } - x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1; - - if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) { - WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!", - x86_pmu.num_counters_fixed, X86_PMC_MAX_FIXED); - x86_pmu.num_counters_fixed = X86_PMC_MAX_FIXED; - } - - x86_pmu.intel_ctrl |= - ((1LL << x86_pmu.num_counters_fixed)-1) << X86_PMC_IDX_FIXED; + if (!x86_pmu.intel_ctrl) + x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1; perf_events_lapic_init(); register_nmi_handler(NMI_LOCAL, perf_event_nmi_handler, 0, "PMI"); @@ -1370,22 +1349,6 @@ static int __init init_hw_perf_events(void) __EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_counters) - 1, 0, x86_pmu.num_counters, 0); - if (x86_pmu.event_constraints) { - /* - * event on fixed counter2 (REF_CYCLES) only works on this - * counter, so do not extend mask to generic counters - */ - for_each_event_constraint(c, x86_pmu.event_constraints) { - if (c->cmask != X86_RAW_EVENT_MASK - || c->idxmsk64 == X86_PMC_MSK_FIXED_REF_CYCLES) { - continue; - } - - c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1; - c->weight += x86_pmu.num_counters; - } - } - x86_pmu.attr_rdpmc = 1; /* enable userspace RDPMC usage by default */ x86_pmu_format_group.attrs = x86_pmu.format_attrs; @@ -1496,6 +1459,7 @@ static struct cpu_hw_events *allocate_fake_cpuc(void) if (!cpuc->shared_regs) goto error; } + cpuc->is_fake = 1; return cpuc; error: free_fake_cpuc(cpuc); @@ -1619,8 +1583,8 @@ static int x86_pmu_event_idx(struct perf_event *event) if (!x86_pmu.attr_rdpmc) return 0; - if (x86_pmu.num_counters_fixed && idx >= X86_PMC_IDX_FIXED) { - idx -= X86_PMC_IDX_FIXED; + if (x86_pmu.num_counters_fixed && idx >= INTEL_PMC_IDX_FIXED) { + idx -= INTEL_PMC_IDX_FIXED; idx |= 1 << 30; } @@ -1648,7 +1612,12 @@ static ssize_t set_attr_rdpmc(struct device *cdev, struct device_attribute *attr, const char *buf, size_t count) { - unsigned long val = simple_strtoul(buf, NULL, 0); + unsigned long val; + ssize_t ret; + + ret = kstrtoul(buf, 0, &val); + if (ret) + return ret; if (!!val != !!x86_pmu.attr_rdpmc) { x86_pmu.attr_rdpmc = !!val; @@ -1681,13 +1650,20 @@ static void x86_pmu_flush_branch_stack(void) x86_pmu.flush_branch_stack(); } +void perf_check_microcode(void) +{ + if (x86_pmu.check_microcode) + x86_pmu.check_microcode(); +} +EXPORT_SYMBOL_GPL(perf_check_microcode); + static struct pmu pmu = { .pmu_enable = x86_pmu_enable, .pmu_disable = x86_pmu_disable, - .attr_groups = x86_pmu_attr_groups, + .attr_groups = x86_pmu_attr_groups, - .event_init = x86_pmu_event_init, + .event_init = x86_pmu_event_init, .add = x86_pmu_add, .del = x86_pmu_del, @@ -1695,11 +1671,11 @@ static struct pmu pmu = { .stop = x86_pmu_stop, .read = x86_pmu_read, - .start_txn = x86_pmu_start_txn, - .cancel_txn = x86_pmu_cancel_txn, - .commit_txn = x86_pmu_commit_txn, + .start_txn = x86_pmu_start_txn, + .cancel_txn = x86_pmu_cancel_txn, + .commit_txn = x86_pmu_commit_txn, - .event_idx = x86_pmu_event_idx, + .event_idx = x86_pmu_event_idx, .flush_branch_stack = x86_pmu_flush_branch_stack, }; @@ -1756,6 +1732,12 @@ perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs) dump_trace(NULL, regs, NULL, 0, &backtrace_ops, entry); } +static inline int +valid_user_frame(const void __user *fp, unsigned long size) +{ + return (__range_not_ok(fp, size, TASK_SIZE) == 0); +} + #ifdef CONFIG_COMPAT #include <asm/compat.h> @@ -1780,7 +1762,7 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry) if (bytes != sizeof(frame)) break; - if (fp < compat_ptr(regs->sp)) + if (!valid_user_frame(fp, sizeof(frame))) break; perf_callchain_store(entry, frame.return_address); @@ -1826,7 +1808,7 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs) if (bytes != sizeof(frame)) break; - if ((unsigned long)fp < regs->sp) + if (!valid_user_frame(fp, sizeof(frame))) break; perf_callchain_store(entry, frame.return_address); @@ -1856,7 +1838,7 @@ unsigned long perf_misc_flags(struct pt_regs *regs) else misc |= PERF_RECORD_MISC_GUEST_KERNEL; } else { - if (user_mode(regs)) + if (!kernel_ip(regs->ip)) misc |= PERF_RECORD_MISC_USER; else misc |= PERF_RECORD_MISC_KERNEL; diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index 6638aaf54493..a15df4be151f 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -14,6 +14,18 @@ #include <linux/perf_event.h> +#if 0 +#undef wrmsrl +#define wrmsrl(msr, val) \ +do { \ + unsigned int _msr = (msr); \ + u64 _val = (val); \ + trace_printk("wrmsrl(%x, %Lx)\n", (unsigned int)(_msr), \ + (unsigned long long)(_val)); \ + native_write_msr((_msr), (u32)(_val), (u32)(_val >> 32)); \ +} while (0) +#endif + /* * | NHM/WSM | SNB | * register ------------------------------- @@ -57,7 +69,7 @@ struct amd_nb { }; /* The maximal number of PEBS events: */ -#define MAX_PEBS_EVENTS 4 +#define MAX_PEBS_EVENTS 8 /* * A debug store configuration. @@ -117,6 +129,7 @@ struct cpu_hw_events { struct perf_event *event_list[X86_PMC_IDX_MAX]; /* in enabled order */ unsigned int group_flag; + int is_fake; /* * Intel DebugStore bits @@ -348,6 +361,8 @@ struct x86_pmu { void (*cpu_starting)(int cpu); void (*cpu_dying)(int cpu); void (*cpu_dead)(int cpu); + + void (*check_microcode)(void); void (*flush_branch_stack)(void); /* @@ -359,11 +374,16 @@ struct x86_pmu { /* * Intel DebugStore bits */ - int bts, pebs; - int bts_active, pebs_active; + int bts :1, + bts_active :1, + pebs :1, + pebs_active :1, + pebs_broken :1; int pebs_record_size; void (*drain_pebs)(struct pt_regs *regs); struct event_constraint *pebs_constraints; + void (*pebs_aliases)(struct perf_event *event); + int max_pebs_events; /* * Intel LBR @@ -466,6 +486,8 @@ static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc, void x86_pmu_enable_all(int added); +int perf_assign_events(struct event_constraint **constraints, int n, + int wmin, int wmax, int *assign); int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign); void x86_pmu_stop(struct perf_event *event, int flags); diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c index 11a4eb9131d5..4528ae7b6ec4 100644 --- a/arch/x86/kernel/cpu/perf_event_amd.c +++ b/arch/x86/kernel/cpu/perf_event_amd.c @@ -366,7 +366,7 @@ static void amd_pmu_cpu_starting(int cpu) cpuc->perf_ctr_virt_mask = AMD_PERFMON_EVENTSEL_HOSTONLY; - if (boot_cpu_data.x86_max_cores < 2 || boot_cpu_data.x86 == 0x15) + if (boot_cpu_data.x86_max_cores < 2) return; nb_id = amd_get_nb_id(cpu); @@ -422,35 +422,6 @@ static struct attribute *amd_format_attr[] = { NULL, }; -static __initconst const struct x86_pmu amd_pmu = { - .name = "AMD", - .handle_irq = x86_pmu_handle_irq, - .disable_all = x86_pmu_disable_all, - .enable_all = x86_pmu_enable_all, - .enable = x86_pmu_enable_event, - .disable = x86_pmu_disable_event, - .hw_config = amd_pmu_hw_config, - .schedule_events = x86_schedule_events, - .eventsel = MSR_K7_EVNTSEL0, - .perfctr = MSR_K7_PERFCTR0, - .event_map = amd_pmu_event_map, - .max_events = ARRAY_SIZE(amd_perfmon_event_map), - .num_counters = AMD64_NUM_COUNTERS, - .cntval_bits = 48, - .cntval_mask = (1ULL << 48) - 1, - .apic = 1, - /* use highest bit to detect overflow */ - .max_period = (1ULL << 47) - 1, - .get_event_constraints = amd_get_event_constraints, - .put_event_constraints = amd_put_event_constraints, - - .format_attrs = amd_format_attr, - - .cpu_prepare = amd_pmu_cpu_prepare, - .cpu_starting = amd_pmu_cpu_starting, - .cpu_dead = amd_pmu_cpu_dead, -}; - /* AMD Family 15h */ #define AMD_EVENT_TYPE_MASK 0x000000F0ULL @@ -597,8 +568,8 @@ amd_get_event_constraints_f15h(struct cpu_hw_events *cpuc, struct perf_event *ev } } -static __initconst const struct x86_pmu amd_pmu_f15h = { - .name = "AMD Family 15h", +static __initconst const struct x86_pmu amd_pmu = { + .name = "AMD", .handle_irq = x86_pmu_handle_irq, .disable_all = x86_pmu_disable_all, .enable_all = x86_pmu_enable_all, @@ -606,50 +577,68 @@ static __initconst const struct x86_pmu amd_pmu_f15h = { .disable = x86_pmu_disable_event, .hw_config = amd_pmu_hw_config, .schedule_events = x86_schedule_events, - .eventsel = MSR_F15H_PERF_CTL, - .perfctr = MSR_F15H_PERF_CTR, + .eventsel = MSR_K7_EVNTSEL0, + .perfctr = MSR_K7_PERFCTR0, .event_map = amd_pmu_event_map, .max_events = ARRAY_SIZE(amd_perfmon_event_map), - .num_counters = AMD64_NUM_COUNTERS_F15H, + .num_counters = AMD64_NUM_COUNTERS, .cntval_bits = 48, .cntval_mask = (1ULL << 48) - 1, .apic = 1, /* use highest bit to detect overflow */ .max_period = (1ULL << 47) - 1, - .get_event_constraints = amd_get_event_constraints_f15h, - /* nortbridge counters not yet implemented: */ -#if 0 + .get_event_constraints = amd_get_event_constraints, .put_event_constraints = amd_put_event_constraints, + .format_attrs = amd_format_attr, + .cpu_prepare = amd_pmu_cpu_prepare, - .cpu_dead = amd_pmu_cpu_dead, -#endif .cpu_starting = amd_pmu_cpu_starting, - .format_attrs = amd_format_attr, + .cpu_dead = amd_pmu_cpu_dead, }; +static int setup_event_constraints(void) +{ + if (boot_cpu_data.x86 >= 0x15) + x86_pmu.get_event_constraints = amd_get_event_constraints_f15h; + return 0; +} + +static int setup_perfctr_core(void) +{ + if (!cpu_has_perfctr_core) { + WARN(x86_pmu.get_event_constraints == amd_get_event_constraints_f15h, + KERN_ERR "Odd, counter constraints enabled but no core perfctrs detected!"); + return -ENODEV; + } + + WARN(x86_pmu.get_event_constraints == amd_get_event_constraints, + KERN_ERR "hw perf events core counters need constraints handler!"); + + /* + * If core performance counter extensions exists, we must use + * MSR_F15H_PERF_CTL/MSR_F15H_PERF_CTR msrs. See also + * x86_pmu_addr_offset(). + */ + x86_pmu.eventsel = MSR_F15H_PERF_CTL; + x86_pmu.perfctr = MSR_F15H_PERF_CTR; + x86_pmu.num_counters = AMD64_NUM_COUNTERS_CORE; + + printk(KERN_INFO "perf: AMD core performance counters detected\n"); + + return 0; +} + __init int amd_pmu_init(void) { /* Performance-monitoring supported from K7 and later: */ if (boot_cpu_data.x86 < 6) return -ENODEV; - /* - * If core performance counter extensions exists, it must be - * family 15h, otherwise fail. See x86_pmu_addr_offset(). - */ - switch (boot_cpu_data.x86) { - case 0x15: - if (!cpu_has_perfctr_core) - return -ENODEV; - x86_pmu = amd_pmu_f15h; - break; - default: - if (cpu_has_perfctr_core) - return -ENODEV; - x86_pmu = amd_pmu; - break; - } + x86_pmu = amd_pmu; + + setup_event_constraints(); + setup_perfctr_core(); /* Events are common for all AMDs */ memcpy(hw_cache_event_ids, amd_hw_cache_event_ids, diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 166546ec6aef..7a8b9d0abcaa 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -5,6 +5,8 @@ * among events on a single PMU. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/stddef.h> #include <linux/types.h> #include <linux/init.h> @@ -21,14 +23,14 @@ */ static u64 intel_perfmon_event_map[PERF_COUNT_HW_MAX] __read_mostly = { - [PERF_COUNT_HW_CPU_CYCLES] = 0x003c, - [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, - [PERF_COUNT_HW_CACHE_REFERENCES] = 0x4f2e, - [PERF_COUNT_HW_CACHE_MISSES] = 0x412e, - [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, - [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, - [PERF_COUNT_HW_BUS_CYCLES] = 0x013c, - [PERF_COUNT_HW_REF_CPU_CYCLES] = 0x0300, /* pseudo-encoding */ + [PERF_COUNT_HW_CPU_CYCLES] = 0x003c, + [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, + [PERF_COUNT_HW_CACHE_REFERENCES] = 0x4f2e, + [PERF_COUNT_HW_CACHE_MISSES] = 0x412e, + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, + [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, + [PERF_COUNT_HW_BUS_CYCLES] = 0x013c, + [PERF_COUNT_HW_REF_CPU_CYCLES] = 0x0300, /* pseudo-encoding */ }; static struct event_constraint intel_core_event_constraints[] __read_mostly = @@ -747,7 +749,7 @@ static void intel_pmu_disable_all(void) wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); - if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) + if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask)) intel_pmu_disable_bts(); intel_pmu_pebs_disable_all(); @@ -763,9 +765,9 @@ static void intel_pmu_enable_all(int added) wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_guest_mask); - if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) { + if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask)) { struct perf_event *event = - cpuc->events[X86_PMC_IDX_FIXED_BTS]; + cpuc->events[INTEL_PMC_IDX_FIXED_BTS]; if (WARN_ON_ONCE(!event)) return; @@ -871,7 +873,7 @@ static inline void intel_pmu_ack_status(u64 ack) static void intel_pmu_disable_fixed(struct hw_perf_event *hwc) { - int idx = hwc->idx - X86_PMC_IDX_FIXED; + int idx = hwc->idx - INTEL_PMC_IDX_FIXED; u64 ctrl_val, mask; mask = 0xfULL << (idx * 4); @@ -886,7 +888,7 @@ static void intel_pmu_disable_event(struct perf_event *event) struct hw_perf_event *hwc = &event->hw; struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); - if (unlikely(hwc->idx == X86_PMC_IDX_FIXED_BTS)) { + if (unlikely(hwc->idx == INTEL_PMC_IDX_FIXED_BTS)) { intel_pmu_disable_bts(); intel_pmu_drain_bts_buffer(); return; @@ -915,7 +917,7 @@ static void intel_pmu_disable_event(struct perf_event *event) static void intel_pmu_enable_fixed(struct hw_perf_event *hwc) { - int idx = hwc->idx - X86_PMC_IDX_FIXED; + int idx = hwc->idx - INTEL_PMC_IDX_FIXED; u64 ctrl_val, bits, mask; /* @@ -949,7 +951,7 @@ static void intel_pmu_enable_event(struct perf_event *event) struct hw_perf_event *hwc = &event->hw; struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); - if (unlikely(hwc->idx == X86_PMC_IDX_FIXED_BTS)) { + if (unlikely(hwc->idx == INTEL_PMC_IDX_FIXED_BTS)) { if (!__this_cpu_read(cpu_hw_events.enabled)) return; @@ -1000,14 +1002,14 @@ static void intel_pmu_reset(void) local_irq_save(flags); - printk("clearing PMU state on CPU#%d\n", smp_processor_id()); + pr_info("clearing PMU state on CPU#%d\n", smp_processor_id()); for (idx = 0; idx < x86_pmu.num_counters; idx++) { - checking_wrmsrl(x86_pmu_config_addr(idx), 0ull); - checking_wrmsrl(x86_pmu_event_addr(idx), 0ull); + wrmsrl_safe(x86_pmu_config_addr(idx), 0ull); + wrmsrl_safe(x86_pmu_event_addr(idx), 0ull); } for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) - checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull); + wrmsrl_safe(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull); if (ds) ds->bts_index = ds->bts_buffer_base; @@ -1119,27 +1121,33 @@ intel_bts_constraints(struct perf_event *event) return NULL; } -static bool intel_try_alt_er(struct perf_event *event, int orig_idx) +static int intel_alt_er(int idx) { if (!(x86_pmu.er_flags & ERF_HAS_RSP_1)) - return false; + return idx; - if (event->hw.extra_reg.idx == EXTRA_REG_RSP_0) { - event->hw.config &= ~INTEL_ARCH_EVENT_MASK; - event->hw.config |= 0x01bb; - event->hw.extra_reg.idx = EXTRA_REG_RSP_1; - event->hw.extra_reg.reg = MSR_OFFCORE_RSP_1; - } else if (event->hw.extra_reg.idx == EXTRA_REG_RSP_1) { + if (idx == EXTRA_REG_RSP_0) + return EXTRA_REG_RSP_1; + + if (idx == EXTRA_REG_RSP_1) + return EXTRA_REG_RSP_0; + + return idx; +} + +static void intel_fixup_er(struct perf_event *event, int idx) +{ + event->hw.extra_reg.idx = idx; + + if (idx == EXTRA_REG_RSP_0) { event->hw.config &= ~INTEL_ARCH_EVENT_MASK; event->hw.config |= 0x01b7; - event->hw.extra_reg.idx = EXTRA_REG_RSP_0; event->hw.extra_reg.reg = MSR_OFFCORE_RSP_0; + } else if (idx == EXTRA_REG_RSP_1) { + event->hw.config &= ~INTEL_ARCH_EVENT_MASK; + event->hw.config |= 0x01bb; + event->hw.extra_reg.reg = MSR_OFFCORE_RSP_1; } - - if (event->hw.extra_reg.idx == orig_idx) - return false; - - return true; } /* @@ -1157,14 +1165,18 @@ __intel_shared_reg_get_constraints(struct cpu_hw_events *cpuc, struct event_constraint *c = &emptyconstraint; struct er_account *era; unsigned long flags; - int orig_idx = reg->idx; + int idx = reg->idx; - /* already allocated shared msr */ - if (reg->alloc) + /* + * reg->alloc can be set due to existing state, so for fake cpuc we + * need to ignore this, otherwise we might fail to allocate proper fake + * state for this extra reg constraint. Also see the comment below. + */ + if (reg->alloc && !cpuc->is_fake) return NULL; /* call x86_get_event_constraint() */ again: - era = &cpuc->shared_regs->regs[reg->idx]; + era = &cpuc->shared_regs->regs[idx]; /* * we use spin_lock_irqsave() to avoid lockdep issues when * passing a fake cpuc @@ -1173,6 +1185,29 @@ again: if (!atomic_read(&era->ref) || era->config == reg->config) { + /* + * If its a fake cpuc -- as per validate_{group,event}() we + * shouldn't touch event state and we can avoid doing so + * since both will only call get_event_constraints() once + * on each event, this avoids the need for reg->alloc. + * + * Not doing the ER fixup will only result in era->reg being + * wrong, but since we won't actually try and program hardware + * this isn't a problem either. + */ + if (!cpuc->is_fake) { + if (idx != reg->idx) + intel_fixup_er(event, idx); + + /* + * x86_schedule_events() can call get_event_constraints() + * multiple times on events in the case of incremental + * scheduling(). reg->alloc ensures we only do the ER + * allocation once. + */ + reg->alloc = 1; + } + /* lock in msr value */ era->config = reg->config; era->reg = reg->reg; @@ -1180,17 +1215,17 @@ again: /* one more user */ atomic_inc(&era->ref); - /* no need to reallocate during incremental event scheduling */ - reg->alloc = 1; - /* * need to call x86_get_event_constraint() * to check if associated event has constraints */ c = NULL; - } else if (intel_try_alt_er(event, orig_idx)) { - raw_spin_unlock_irqrestore(&era->lock, flags); - goto again; + } else { + idx = intel_alt_er(idx); + if (idx != reg->idx) { + raw_spin_unlock_irqrestore(&era->lock, flags); + goto again; + } } raw_spin_unlock_irqrestore(&era->lock, flags); @@ -1204,11 +1239,14 @@ __intel_shared_reg_put_constraints(struct cpu_hw_events *cpuc, struct er_account *era; /* - * only put constraint if extra reg was actually - * allocated. Also takes care of event which do - * not use an extra shared reg + * Only put constraint if extra reg was actually allocated. Also takes + * care of event which do not use an extra shared reg. + * + * Also, if this is a fake cpuc we shouldn't touch any event state + * (reg->alloc) and we don't care about leaving inconsistent cpuc state + * either since it'll be thrown out. */ - if (!reg->alloc) + if (!reg->alloc || cpuc->is_fake) return; era = &cpuc->shared_regs->regs[reg->idx]; @@ -1300,15 +1338,9 @@ static void intel_put_event_constraints(struct cpu_hw_events *cpuc, intel_put_shared_regs_event_constraints(cpuc, event); } -static int intel_pmu_hw_config(struct perf_event *event) +static void intel_pebs_aliases_core2(struct perf_event *event) { - int ret = x86_pmu_hw_config(event); - - if (ret) - return ret; - - if (event->attr.precise_ip && - (event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) { + if ((event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) { /* * Use an alternative encoding for CPU_CLK_UNHALTED.THREAD_P * (0x003c) so that we can use it with PEBS. @@ -1329,10 +1361,48 @@ static int intel_pmu_hw_config(struct perf_event *event) */ u64 alt_config = X86_CONFIG(.event=0xc0, .inv=1, .cmask=16); + alt_config |= (event->hw.config & ~X86_RAW_EVENT_MASK); + event->hw.config = alt_config; + } +} + +static void intel_pebs_aliases_snb(struct perf_event *event) +{ + if ((event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) { + /* + * Use an alternative encoding for CPU_CLK_UNHALTED.THREAD_P + * (0x003c) so that we can use it with PEBS. + * + * The regular CPU_CLK_UNHALTED.THREAD_P event (0x003c) isn't + * PEBS capable. However we can use UOPS_RETIRED.ALL + * (0x01c2), which is a PEBS capable event, to get the same + * count. + * + * UOPS_RETIRED.ALL counts the number of cycles that retires + * CNTMASK micro-ops. By setting CNTMASK to a value (16) + * larger than the maximum number of micro-ops that can be + * retired per cycle (4) and then inverting the condition, we + * count all cycles that retire 16 or less micro-ops, which + * is every cycle. + * + * Thereby we gain a PEBS capable cycle counter. + */ + u64 alt_config = X86_CONFIG(.event=0xc2, .umask=0x01, .inv=1, .cmask=16); alt_config |= (event->hw.config & ~X86_RAW_EVENT_MASK); event->hw.config = alt_config; } +} + +static int intel_pmu_hw_config(struct perf_event *event) +{ + int ret = x86_pmu_hw_config(event); + + if (ret) + return ret; + + if (event->attr.precise_ip && x86_pmu.pebs_aliases) + x86_pmu.pebs_aliases(event); if (intel_pmu_needs_lbr_smpl(event)) { ret = intel_pmu_setup_lbr_filter(event); @@ -1607,6 +1677,7 @@ static __initconst const struct x86_pmu intel_pmu = { .max_period = (1ULL << 31) - 1, .get_event_constraints = intel_get_event_constraints, .put_event_constraints = intel_put_event_constraints, + .pebs_aliases = intel_pebs_aliases_core2, .format_attrs = intel_arch3_formats_attr, @@ -1638,16 +1709,61 @@ static __init void intel_clovertown_quirk(void) * But taken together it might just make sense to not enable PEBS on * these chips. */ - printk(KERN_WARNING "PEBS disabled due to CPU errata.\n"); + pr_warn("PEBS disabled due to CPU errata\n"); x86_pmu.pebs = 0; x86_pmu.pebs_constraints = NULL; } +static int intel_snb_pebs_broken(int cpu) +{ + u32 rev = UINT_MAX; /* default to broken for unknown models */ + + switch (cpu_data(cpu).x86_model) { + case 42: /* SNB */ + rev = 0x28; + break; + + case 45: /* SNB-EP */ + switch (cpu_data(cpu).x86_mask) { + case 6: rev = 0x618; break; + case 7: rev = 0x70c; break; + } + } + + return (cpu_data(cpu).microcode < rev); +} + +static void intel_snb_check_microcode(void) +{ + int pebs_broken = 0; + int cpu; + + get_online_cpus(); + for_each_online_cpu(cpu) { + if ((pebs_broken = intel_snb_pebs_broken(cpu))) + break; + } + put_online_cpus(); + + if (pebs_broken == x86_pmu.pebs_broken) + return; + + /* + * Serialized by the microcode lock.. + */ + if (x86_pmu.pebs_broken) { + pr_info("PEBS enabled due to microcode update\n"); + x86_pmu.pebs_broken = 0; + } else { + pr_info("PEBS disabled due to CPU errata, please upgrade microcode\n"); + x86_pmu.pebs_broken = 1; + } +} + static __init void intel_sandybridge_quirk(void) { - printk(KERN_WARNING "PEBS disabled due to CPU errata.\n"); - x86_pmu.pebs = 0; - x86_pmu.pebs_constraints = NULL; + x86_pmu.check_microcode = intel_snb_check_microcode; + intel_snb_check_microcode(); } static const struct { int id; char *name; } intel_arch_events_map[] __initconst = { @@ -1667,8 +1783,8 @@ static __init void intel_arch_events_quirk(void) /* disable event that reported as not presend by cpuid */ for_each_set_bit(bit, x86_pmu.events_mask, ARRAY_SIZE(intel_arch_events_map)) { intel_perfmon_event_map[intel_arch_events_map[bit].id] = 0; - printk(KERN_WARNING "CPUID marked event: \'%s\' unavailable\n", - intel_arch_events_map[bit].name); + pr_warn("CPUID marked event: \'%s\' unavailable\n", + intel_arch_events_map[bit].name); } } @@ -1687,7 +1803,7 @@ static __init void intel_nehalem_quirk(void) intel_perfmon_event_map[PERF_COUNT_HW_BRANCH_MISSES] = 0x7f89; ebx.split.no_branch_misses_retired = 0; x86_pmu.events_maskl = ebx.full; - printk(KERN_INFO "CPU erratum AAJ80 worked around\n"); + pr_info("CPU erratum AAJ80 worked around\n"); } } @@ -1696,6 +1812,7 @@ __init int intel_pmu_init(void) union cpuid10_edx edx; union cpuid10_eax eax; union cpuid10_ebx ebx; + struct event_constraint *c; unsigned int unused; int version; @@ -1731,6 +1848,8 @@ __init int intel_pmu_init(void) x86_pmu.events_maskl = ebx.full; x86_pmu.events_mask_len = eax.split.mask_length; + x86_pmu.max_pebs_events = min_t(unsigned, MAX_PEBS_EVENTS, x86_pmu.num_counters); + /* * Quirk: v2 perfmon does not report fixed-purpose events, so * assume at least 3 events: @@ -1840,8 +1959,9 @@ __init int intel_pmu_init(void) break; case 42: /* SandyBridge */ - x86_add_quirk(intel_sandybridge_quirk); case 45: /* SandyBridge, "Romely-EP" */ + x86_add_quirk(intel_sandybridge_quirk); + case 58: /* IvyBridge */ memcpy(hw_cache_event_ids, snb_hw_cache_event_ids, sizeof(hw_cache_event_ids)); @@ -1849,6 +1969,7 @@ __init int intel_pmu_init(void) x86_pmu.event_constraints = intel_snb_event_constraints; x86_pmu.pebs_constraints = intel_snb_pebs_event_constraints; + x86_pmu.pebs_aliases = intel_pebs_aliases_snb; x86_pmu.extra_regs = intel_snb_extra_regs; /* all extra regs are per-cpu when HT is on */ x86_pmu.er_flags |= ERF_HAS_RSP_1; @@ -1880,5 +2001,37 @@ __init int intel_pmu_init(void) } } + if (x86_pmu.num_counters > INTEL_PMC_MAX_GENERIC) { + WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!", + x86_pmu.num_counters, INTEL_PMC_MAX_GENERIC); + x86_pmu.num_counters = INTEL_PMC_MAX_GENERIC; + } + x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1; + + if (x86_pmu.num_counters_fixed > INTEL_PMC_MAX_FIXED) { + WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!", + x86_pmu.num_counters_fixed, INTEL_PMC_MAX_FIXED); + x86_pmu.num_counters_fixed = INTEL_PMC_MAX_FIXED; + } + + x86_pmu.intel_ctrl |= + ((1LL << x86_pmu.num_counters_fixed)-1) << INTEL_PMC_IDX_FIXED; + + if (x86_pmu.event_constraints) { + /* + * event on fixed counter2 (REF_CYCLES) only works on this + * counter, so do not extend mask to generic counters + */ + for_each_event_constraint(c, x86_pmu.event_constraints) { + if (c->cmask != X86_RAW_EVENT_MASK + || c->idxmsk64 == INTEL_PMC_MSK_FIXED_REF_CYCLES) { + continue; + } + + c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1; + c->weight += x86_pmu.num_counters; + } + } + return 0; } diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index 5a3edc27f6e5..629ae0b7ad90 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -248,7 +248,7 @@ void reserve_ds_buffers(void) */ struct event_constraint bts_constraint = - EVENT_CONSTRAINT(0, 1ULL << X86_PMC_IDX_FIXED_BTS, 0); + EVENT_CONSTRAINT(0, 1ULL << INTEL_PMC_IDX_FIXED_BTS, 0); void intel_pmu_enable_bts(u64 config) { @@ -295,7 +295,7 @@ int intel_pmu_drain_bts_buffer(void) u64 to; u64 flags; }; - struct perf_event *event = cpuc->events[X86_PMC_IDX_FIXED_BTS]; + struct perf_event *event = cpuc->events[INTEL_PMC_IDX_FIXED_BTS]; struct bts_record *at, *top; struct perf_output_handle handle; struct perf_event_header header; @@ -400,14 +400,7 @@ struct event_constraint intel_snb_pebs_event_constraints[] = { INTEL_EVENT_CONSTRAINT(0xc4, 0xf), /* BR_INST_RETIRED.* */ INTEL_EVENT_CONSTRAINT(0xc5, 0xf), /* BR_MISP_RETIRED.* */ INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.* */ - INTEL_UEVENT_CONSTRAINT(0x11d0, 0xf), /* MEM_UOP_RETIRED.STLB_MISS_LOADS */ - INTEL_UEVENT_CONSTRAINT(0x12d0, 0xf), /* MEM_UOP_RETIRED.STLB_MISS_STORES */ - INTEL_UEVENT_CONSTRAINT(0x21d0, 0xf), /* MEM_UOP_RETIRED.LOCK_LOADS */ - INTEL_UEVENT_CONSTRAINT(0x22d0, 0xf), /* MEM_UOP_RETIRED.LOCK_STORES */ - INTEL_UEVENT_CONSTRAINT(0x41d0, 0xf), /* MEM_UOP_RETIRED.SPLIT_LOADS */ - INTEL_UEVENT_CONSTRAINT(0x42d0, 0xf), /* MEM_UOP_RETIRED.SPLIT_STORES */ - INTEL_UEVENT_CONSTRAINT(0x81d0, 0xf), /* MEM_UOP_RETIRED.ANY_LOADS */ - INTEL_UEVENT_CONSTRAINT(0x82d0, 0xf), /* MEM_UOP_RETIRED.ANY_STORES */ + INTEL_EVENT_CONSTRAINT(0xd0, 0xf), /* MEM_UOP_RETIRED.* */ INTEL_EVENT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */ INTEL_EVENT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */ INTEL_UEVENT_CONSTRAINT(0x02d4, 0xf), /* MEM_LOAD_UOPS_MISC_RETIRED.LLC_MISS */ @@ -627,7 +620,7 @@ static void intel_pmu_drain_pebs_core(struct pt_regs *iregs) * Should not happen, we program the threshold at 1 and do not * set a reset value. */ - WARN_ON_ONCE(n > 1); + WARN_ONCE(n > 1, "bad leftover pebs %d\n", n); at += n - 1; __intel_pmu_pebs_event(event, iregs, at); @@ -658,10 +651,10 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs) * Should not happen, we program the threshold at 1 and do not * set a reset value. */ - WARN_ON_ONCE(n > MAX_PEBS_EVENTS); + WARN_ONCE(n > x86_pmu.max_pebs_events, "Unexpected number of pebs records %d\n", n); for ( ; at < top; at++) { - for_each_set_bit(bit, (unsigned long *)&at->status, MAX_PEBS_EVENTS) { + for_each_set_bit(bit, (unsigned long *)&at->status, x86_pmu.max_pebs_events) { event = cpuc->events[bit]; if (!test_bit(bit, cpuc->active_mask)) continue; @@ -677,7 +670,7 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs) break; } - if (!event || bit >= MAX_PEBS_EVENTS) + if (!event || bit >= x86_pmu.max_pebs_events) continue; __intel_pmu_pebs_event(event, iregs, at); diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c new file mode 100644 index 000000000000..19faffc60886 --- /dev/null +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c @@ -0,0 +1,1850 @@ +#include "perf_event_intel_uncore.h" + +static struct intel_uncore_type *empty_uncore[] = { NULL, }; +static struct intel_uncore_type **msr_uncores = empty_uncore; +static struct intel_uncore_type **pci_uncores = empty_uncore; +/* pci bus to socket mapping */ +static int pcibus_to_physid[256] = { [0 ... 255] = -1, }; + +static DEFINE_RAW_SPINLOCK(uncore_box_lock); + +/* mask of cpus that collect uncore events */ +static cpumask_t uncore_cpu_mask; + +/* constraint for the fixed counter */ +static struct event_constraint constraint_fixed = + EVENT_CONSTRAINT(~0ULL, 1 << UNCORE_PMC_IDX_FIXED, ~0ULL); +static struct event_constraint constraint_empty = + EVENT_CONSTRAINT(0, 0, 0); + +DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7"); +DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15"); +DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18"); +DEFINE_UNCORE_FORMAT_ATTR(tid_en, tid_en, "config:19"); +DEFINE_UNCORE_FORMAT_ATTR(inv, inv, "config:23"); +DEFINE_UNCORE_FORMAT_ATTR(cmask5, cmask, "config:24-28"); +DEFINE_UNCORE_FORMAT_ATTR(cmask8, cmask, "config:24-31"); +DEFINE_UNCORE_FORMAT_ATTR(thresh8, thresh, "config:24-31"); +DEFINE_UNCORE_FORMAT_ATTR(thresh5, thresh, "config:24-28"); +DEFINE_UNCORE_FORMAT_ATTR(occ_sel, occ_sel, "config:14-15"); +DEFINE_UNCORE_FORMAT_ATTR(occ_invert, occ_invert, "config:30"); +DEFINE_UNCORE_FORMAT_ATTR(occ_edge, occ_edge, "config:14-51"); +DEFINE_UNCORE_FORMAT_ATTR(filter_tid, filter_tid, "config1:0-4"); +DEFINE_UNCORE_FORMAT_ATTR(filter_nid, filter_nid, "config1:10-17"); +DEFINE_UNCORE_FORMAT_ATTR(filter_state, filter_state, "config1:18-22"); +DEFINE_UNCORE_FORMAT_ATTR(filter_opc, filter_opc, "config1:23-31"); +DEFINE_UNCORE_FORMAT_ATTR(filter_brand0, filter_brand0, "config1:0-7"); +DEFINE_UNCORE_FORMAT_ATTR(filter_brand1, filter_brand1, "config1:8-15"); +DEFINE_UNCORE_FORMAT_ATTR(filter_brand2, filter_brand2, "config1:16-23"); +DEFINE_UNCORE_FORMAT_ATTR(filter_brand3, filter_brand3, "config1:24-31"); + +/* Sandy Bridge-EP uncore support */ +static struct intel_uncore_type snbep_uncore_cbox; +static struct intel_uncore_type snbep_uncore_pcu; + +static void snbep_uncore_pci_disable_box(struct intel_uncore_box *box) +{ + struct pci_dev *pdev = box->pci_dev; + int box_ctl = uncore_pci_box_ctl(box); + u32 config; + + pci_read_config_dword(pdev, box_ctl, &config); + config |= SNBEP_PMON_BOX_CTL_FRZ; + pci_write_config_dword(pdev, box_ctl, config); +} + +static void snbep_uncore_pci_enable_box(struct intel_uncore_box *box) +{ + struct pci_dev *pdev = box->pci_dev; + int box_ctl = uncore_pci_box_ctl(box); + u32 config; + + pci_read_config_dword(pdev, box_ctl, &config); + config &= ~SNBEP_PMON_BOX_CTL_FRZ; + pci_write_config_dword(pdev, box_ctl, config); +} + +static void snbep_uncore_pci_enable_event(struct intel_uncore_box *box, + struct perf_event *event) +{ + struct pci_dev *pdev = box->pci_dev; + struct hw_perf_event *hwc = &event->hw; + + pci_write_config_dword(pdev, hwc->config_base, hwc->config | + SNBEP_PMON_CTL_EN); +} + +static void snbep_uncore_pci_disable_event(struct intel_uncore_box *box, + struct perf_event *event) +{ + struct pci_dev *pdev = box->pci_dev; + struct hw_perf_event *hwc = &event->hw; + + pci_write_config_dword(pdev, hwc->config_base, hwc->config); +} + +static u64 snbep_uncore_pci_read_counter(struct intel_uncore_box *box, + struct perf_event *event) +{ + struct pci_dev *pdev = box->pci_dev; + struct hw_perf_event *hwc = &event->hw; + u64 count; + + pci_read_config_dword(pdev, hwc->event_base, (u32 *)&count); + pci_read_config_dword(pdev, hwc->event_base + 4, (u32 *)&count + 1); + return count; +} + +static void snbep_uncore_pci_init_box(struct intel_uncore_box *box) +{ + struct pci_dev *pdev = box->pci_dev; + pci_write_config_dword(pdev, SNBEP_PCI_PMON_BOX_CTL, + SNBEP_PMON_BOX_CTL_INT); +} + +static void snbep_uncore_msr_disable_box(struct intel_uncore_box *box) +{ + u64 config; + unsigned msr; + + msr = uncore_msr_box_ctl(box); + if (msr) { + rdmsrl(msr, config); + config |= SNBEP_PMON_BOX_CTL_FRZ; + wrmsrl(msr, config); + return; + } +} + +static void snbep_uncore_msr_enable_box(struct intel_uncore_box *box) +{ + u64 config; + unsigned msr; + + msr = uncore_msr_box_ctl(box); + if (msr) { + rdmsrl(msr, config); + config &= ~SNBEP_PMON_BOX_CTL_FRZ; + wrmsrl(msr, config); + return; + } +} + +static void snbep_uncore_msr_enable_event(struct intel_uncore_box *box, + struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + + if (reg1->idx != EXTRA_REG_NONE) + wrmsrl(reg1->reg, reg1->config); + + wrmsrl(hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN); +} + +static void snbep_uncore_msr_disable_event(struct intel_uncore_box *box, + struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + wrmsrl(hwc->config_base, hwc->config); +} + +static u64 snbep_uncore_msr_read_counter(struct intel_uncore_box *box, + struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + u64 count; + + rdmsrl(hwc->event_base, count); + return count; +} + +static void snbep_uncore_msr_init_box(struct intel_uncore_box *box) +{ + unsigned msr = uncore_msr_box_ctl(box); + if (msr) + wrmsrl(msr, SNBEP_PMON_BOX_CTL_INT); +} + +static struct event_constraint * +snbep_uncore_get_constraint(struct intel_uncore_box *box, + struct perf_event *event) +{ + struct intel_uncore_extra_reg *er; + struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; + unsigned long flags; + bool ok = false; + + if (reg1->idx == EXTRA_REG_NONE || (box->phys_id >= 0 && reg1->alloc)) + return NULL; + + er = &box->shared_regs[reg1->idx]; + raw_spin_lock_irqsave(&er->lock, flags); + if (!atomic_read(&er->ref) || er->config1 == reg1->config) { + atomic_inc(&er->ref); + er->config1 = reg1->config; + ok = true; + } + raw_spin_unlock_irqrestore(&er->lock, flags); + + if (ok) { + if (box->phys_id >= 0) + reg1->alloc = 1; + return NULL; + } + return &constraint_empty; +} + +static void snbep_uncore_put_constraint(struct intel_uncore_box *box, + struct perf_event *event) +{ + struct intel_uncore_extra_reg *er; + struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; + + if (box->phys_id < 0 || !reg1->alloc) + return; + + er = &box->shared_regs[reg1->idx]; + atomic_dec(&er->ref); + reg1->alloc = 0; +} + +static int snbep_uncore_hw_config(struct intel_uncore_box *box, + struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + + if (box->pmu->type == &snbep_uncore_cbox) { + reg1->reg = SNBEP_C0_MSR_PMON_BOX_FILTER + + SNBEP_CBO_MSR_OFFSET * box->pmu->pmu_idx; + reg1->config = event->attr.config1 & + SNBEP_CB0_MSR_PMON_BOX_FILTER_MASK; + } else if (box->pmu->type == &snbep_uncore_pcu) { + reg1->reg = SNBEP_PCU_MSR_PMON_BOX_FILTER; + reg1->config = event->attr.config1 & + SNBEP_PCU_MSR_PMON_BOX_FILTER_MASK; + } else { + return 0; + } + reg1->idx = 0; + return 0; +} + +static struct attribute *snbep_uncore_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_thresh8.attr, + NULL, +}; + +static struct attribute *snbep_uncore_ubox_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_thresh5.attr, + NULL, +}; + +static struct attribute *snbep_uncore_cbox_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_tid_en.attr, + &format_attr_inv.attr, + &format_attr_thresh8.attr, + &format_attr_filter_tid.attr, + &format_attr_filter_nid.attr, + &format_attr_filter_state.attr, + &format_attr_filter_opc.attr, + NULL, +}; + +static struct attribute *snbep_uncore_pcu_formats_attr[] = { + &format_attr_event.attr, + &format_attr_occ_sel.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_thresh5.attr, + &format_attr_occ_invert.attr, + &format_attr_occ_edge.attr, + &format_attr_filter_brand0.attr, + &format_attr_filter_brand1.attr, + &format_attr_filter_brand2.attr, + &format_attr_filter_brand3.attr, + NULL, +}; + +static struct uncore_event_desc snbep_uncore_imc_events[] = { + INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0x00"), + INTEL_UNCORE_EVENT_DESC(cas_count_read, "event=0x04,umask=0x03"), + INTEL_UNCORE_EVENT_DESC(cas_count_write, "event=0x04,umask=0x0c"), + { /* end: all zeroes */ }, +}; + +static struct uncore_event_desc snbep_uncore_qpi_events[] = { + INTEL_UNCORE_EVENT_DESC(clockticks, "event=0x14"), + INTEL_UNCORE_EVENT_DESC(txl_flits_active, "event=0x00,umask=0x06"), + INTEL_UNCORE_EVENT_DESC(drs_data, "event=0x02,umask=0x08"), + INTEL_UNCORE_EVENT_DESC(ncb_data, "event=0x03,umask=0x04"), + { /* end: all zeroes */ }, +}; + +static struct attribute_group snbep_uncore_format_group = { + .name = "format", + .attrs = snbep_uncore_formats_attr, +}; + +static struct attribute_group snbep_uncore_ubox_format_group = { + .name = "format", + .attrs = snbep_uncore_ubox_formats_attr, +}; + +static struct attribute_group snbep_uncore_cbox_format_group = { + .name = "format", + .attrs = snbep_uncore_cbox_formats_attr, +}; + +static struct attribute_group snbep_uncore_pcu_format_group = { + .name = "format", + .attrs = snbep_uncore_pcu_formats_attr, +}; + +static struct intel_uncore_ops snbep_uncore_msr_ops = { + .init_box = snbep_uncore_msr_init_box, + .disable_box = snbep_uncore_msr_disable_box, + .enable_box = snbep_uncore_msr_enable_box, + .disable_event = snbep_uncore_msr_disable_event, + .enable_event = snbep_uncore_msr_enable_event, + .read_counter = snbep_uncore_msr_read_counter, + .get_constraint = snbep_uncore_get_constraint, + .put_constraint = snbep_uncore_put_constraint, + .hw_config = snbep_uncore_hw_config, +}; + +static struct intel_uncore_ops snbep_uncore_pci_ops = { + .init_box = snbep_uncore_pci_init_box, + .disable_box = snbep_uncore_pci_disable_box, + .enable_box = snbep_uncore_pci_enable_box, + .disable_event = snbep_uncore_pci_disable_event, + .enable_event = snbep_uncore_pci_enable_event, + .read_counter = snbep_uncore_pci_read_counter, +}; + +static struct event_constraint snbep_uncore_cbox_constraints[] = { + UNCORE_EVENT_CONSTRAINT(0x01, 0x1), + UNCORE_EVENT_CONSTRAINT(0x02, 0x3), + UNCORE_EVENT_CONSTRAINT(0x04, 0x3), + UNCORE_EVENT_CONSTRAINT(0x05, 0x3), + UNCORE_EVENT_CONSTRAINT(0x07, 0x3), + UNCORE_EVENT_CONSTRAINT(0x11, 0x1), + UNCORE_EVENT_CONSTRAINT(0x12, 0x3), + UNCORE_EVENT_CONSTRAINT(0x13, 0x3), + UNCORE_EVENT_CONSTRAINT(0x1b, 0xc), + UNCORE_EVENT_CONSTRAINT(0x1c, 0xc), + UNCORE_EVENT_CONSTRAINT(0x1d, 0xc), + UNCORE_EVENT_CONSTRAINT(0x1e, 0xc), + EVENT_CONSTRAINT_OVERLAP(0x1f, 0xe, 0xff), + UNCORE_EVENT_CONSTRAINT(0x21, 0x3), + UNCORE_EVENT_CONSTRAINT(0x23, 0x3), + UNCORE_EVENT_CONSTRAINT(0x31, 0x3), + UNCORE_EVENT_CONSTRAINT(0x32, 0x3), + UNCORE_EVENT_CONSTRAINT(0x33, 0x3), + UNCORE_EVENT_CONSTRAINT(0x34, 0x3), + UNCORE_EVENT_CONSTRAINT(0x35, 0x3), + UNCORE_EVENT_CONSTRAINT(0x36, 0x1), + UNCORE_EVENT_CONSTRAINT(0x37, 0x3), + UNCORE_EVENT_CONSTRAINT(0x38, 0x3), + UNCORE_EVENT_CONSTRAINT(0x39, 0x3), + UNCORE_EVENT_CONSTRAINT(0x3b, 0x1), + EVENT_CONSTRAINT_END +}; + +static struct event_constraint snbep_uncore_r2pcie_constraints[] = { + UNCORE_EVENT_CONSTRAINT(0x10, 0x3), + UNCORE_EVENT_CONSTRAINT(0x11, 0x3), + UNCORE_EVENT_CONSTRAINT(0x12, 0x1), + UNCORE_EVENT_CONSTRAINT(0x23, 0x3), + UNCORE_EVENT_CONSTRAINT(0x24, 0x3), + UNCORE_EVENT_CONSTRAINT(0x25, 0x3), + UNCORE_EVENT_CONSTRAINT(0x26, 0x3), + UNCORE_EVENT_CONSTRAINT(0x32, 0x3), + UNCORE_EVENT_CONSTRAINT(0x33, 0x3), + UNCORE_EVENT_CONSTRAINT(0x34, 0x3), + EVENT_CONSTRAINT_END +}; + +static struct event_constraint snbep_uncore_r3qpi_constraints[] = { + UNCORE_EVENT_CONSTRAINT(0x10, 0x3), + UNCORE_EVENT_CONSTRAINT(0x11, 0x3), + UNCORE_EVENT_CONSTRAINT(0x12, 0x3), + UNCORE_EVENT_CONSTRAINT(0x13, 0x1), + UNCORE_EVENT_CONSTRAINT(0x20, 0x3), + UNCORE_EVENT_CONSTRAINT(0x21, 0x3), + UNCORE_EVENT_CONSTRAINT(0x22, 0x3), + UNCORE_EVENT_CONSTRAINT(0x23, 0x3), + UNCORE_EVENT_CONSTRAINT(0x24, 0x3), + UNCORE_EVENT_CONSTRAINT(0x25, 0x3), + UNCORE_EVENT_CONSTRAINT(0x26, 0x3), + UNCORE_EVENT_CONSTRAINT(0x30, 0x3), + UNCORE_EVENT_CONSTRAINT(0x31, 0x3), + UNCORE_EVENT_CONSTRAINT(0x32, 0x3), + UNCORE_EVENT_CONSTRAINT(0x33, 0x3), + UNCORE_EVENT_CONSTRAINT(0x34, 0x3), + UNCORE_EVENT_CONSTRAINT(0x36, 0x3), + UNCORE_EVENT_CONSTRAINT(0x37, 0x3), + EVENT_CONSTRAINT_END +}; + +static struct intel_uncore_type snbep_uncore_ubox = { + .name = "ubox", + .num_counters = 2, + .num_boxes = 1, + .perf_ctr_bits = 44, + .fixed_ctr_bits = 48, + .perf_ctr = SNBEP_U_MSR_PMON_CTR0, + .event_ctl = SNBEP_U_MSR_PMON_CTL0, + .event_mask = SNBEP_U_MSR_PMON_RAW_EVENT_MASK, + .fixed_ctr = SNBEP_U_MSR_PMON_UCLK_FIXED_CTR, + .fixed_ctl = SNBEP_U_MSR_PMON_UCLK_FIXED_CTL, + .ops = &snbep_uncore_msr_ops, + .format_group = &snbep_uncore_ubox_format_group, +}; + +static struct intel_uncore_type snbep_uncore_cbox = { + .name = "cbox", + .num_counters = 4, + .num_boxes = 8, + .perf_ctr_bits = 44, + .event_ctl = SNBEP_C0_MSR_PMON_CTL0, + .perf_ctr = SNBEP_C0_MSR_PMON_CTR0, + .event_mask = SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK, + .box_ctl = SNBEP_C0_MSR_PMON_BOX_CTL, + .msr_offset = SNBEP_CBO_MSR_OFFSET, + .num_shared_regs = 1, + .constraints = snbep_uncore_cbox_constraints, + .ops = &snbep_uncore_msr_ops, + .format_group = &snbep_uncore_cbox_format_group, +}; + +static struct intel_uncore_type snbep_uncore_pcu = { + .name = "pcu", + .num_counters = 4, + .num_boxes = 1, + .perf_ctr_bits = 48, + .perf_ctr = SNBEP_PCU_MSR_PMON_CTR0, + .event_ctl = SNBEP_PCU_MSR_PMON_CTL0, + .event_mask = SNBEP_PCU_MSR_PMON_RAW_EVENT_MASK, + .box_ctl = SNBEP_PCU_MSR_PMON_BOX_CTL, + .num_shared_regs = 1, + .ops = &snbep_uncore_msr_ops, + .format_group = &snbep_uncore_pcu_format_group, +}; + +static struct intel_uncore_type *snbep_msr_uncores[] = { + &snbep_uncore_ubox, + &snbep_uncore_cbox, + &snbep_uncore_pcu, + NULL, +}; + +#define SNBEP_UNCORE_PCI_COMMON_INIT() \ + .perf_ctr = SNBEP_PCI_PMON_CTR0, \ + .event_ctl = SNBEP_PCI_PMON_CTL0, \ + .event_mask = SNBEP_PMON_RAW_EVENT_MASK, \ + .box_ctl = SNBEP_PCI_PMON_BOX_CTL, \ + .ops = &snbep_uncore_pci_ops, \ + .format_group = &snbep_uncore_format_group + +static struct intel_uncore_type snbep_uncore_ha = { + .name = "ha", + .num_counters = 4, + .num_boxes = 1, + .perf_ctr_bits = 48, + SNBEP_UNCORE_PCI_COMMON_INIT(), +}; + +static struct intel_uncore_type snbep_uncore_imc = { + .name = "imc", + .num_counters = 4, + .num_boxes = 4, + .perf_ctr_bits = 48, + .fixed_ctr_bits = 48, + .fixed_ctr = SNBEP_MC_CHy_PCI_PMON_FIXED_CTR, + .fixed_ctl = SNBEP_MC_CHy_PCI_PMON_FIXED_CTL, + .event_descs = snbep_uncore_imc_events, + SNBEP_UNCORE_PCI_COMMON_INIT(), +}; + +static struct intel_uncore_type snbep_uncore_qpi = { + .name = "qpi", + .num_counters = 4, + .num_boxes = 2, + .perf_ctr_bits = 48, + .event_descs = snbep_uncore_qpi_events, + SNBEP_UNCORE_PCI_COMMON_INIT(), +}; + + +static struct intel_uncore_type snbep_uncore_r2pcie = { + .name = "r2pcie", + .num_counters = 4, + .num_boxes = 1, + .perf_ctr_bits = 44, + .constraints = snbep_uncore_r2pcie_constraints, + SNBEP_UNCORE_PCI_COMMON_INIT(), +}; + +static struct intel_uncore_type snbep_uncore_r3qpi = { + .name = "r3qpi", + .num_counters = 3, + .num_boxes = 2, + .perf_ctr_bits = 44, + .constraints = snbep_uncore_r3qpi_constraints, + SNBEP_UNCORE_PCI_COMMON_INIT(), +}; + +static struct intel_uncore_type *snbep_pci_uncores[] = { + &snbep_uncore_ha, + &snbep_uncore_imc, + &snbep_uncore_qpi, + &snbep_uncore_r2pcie, + &snbep_uncore_r3qpi, + NULL, +}; + +static DEFINE_PCI_DEVICE_TABLE(snbep_uncore_pci_ids) = { + { /* Home Agent */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_HA), + .driver_data = (unsigned long)&snbep_uncore_ha, + }, + { /* MC Channel 0 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC0), + .driver_data = (unsigned long)&snbep_uncore_imc, + }, + { /* MC Channel 1 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC1), + .driver_data = (unsigned long)&snbep_uncore_imc, + }, + { /* MC Channel 2 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC2), + .driver_data = (unsigned long)&snbep_uncore_imc, + }, + { /* MC Channel 3 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC3), + .driver_data = (unsigned long)&snbep_uncore_imc, + }, + { /* QPI Port 0 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_QPI0), + .driver_data = (unsigned long)&snbep_uncore_qpi, + }, + { /* QPI Port 1 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_QPI1), + .driver_data = (unsigned long)&snbep_uncore_qpi, + }, + { /* P2PCIe */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R2PCIE), + .driver_data = (unsigned long)&snbep_uncore_r2pcie, + }, + { /* R3QPI Link 0 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R3QPI0), + .driver_data = (unsigned long)&snbep_uncore_r3qpi, + }, + { /* R3QPI Link 1 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R3QPI1), + .driver_data = (unsigned long)&snbep_uncore_r3qpi, + }, + { /* end: all zeroes */ } +}; + +static struct pci_driver snbep_uncore_pci_driver = { + .name = "snbep_uncore", + .id_table = snbep_uncore_pci_ids, +}; + +/* + * build pci bus to socket mapping + */ +static void snbep_pci2phy_map_init(void) +{ + struct pci_dev *ubox_dev = NULL; + int i, bus, nodeid; + u32 config; + + while (1) { + /* find the UBOX device */ + ubox_dev = pci_get_device(PCI_VENDOR_ID_INTEL, + PCI_DEVICE_ID_INTEL_JAKETOWN_UBOX, + ubox_dev); + if (!ubox_dev) + break; + bus = ubox_dev->bus->number; + /* get the Node ID of the local register */ + pci_read_config_dword(ubox_dev, 0x40, &config); + nodeid = config; + /* get the Node ID mapping */ + pci_read_config_dword(ubox_dev, 0x54, &config); + /* + * every three bits in the Node ID mapping register maps + * to a particular node. + */ + for (i = 0; i < 8; i++) { + if (nodeid == ((config >> (3 * i)) & 0x7)) { + pcibus_to_physid[bus] = i; + break; + } + } + }; + return; +} +/* end of Sandy Bridge-EP uncore support */ + + +/* Sandy Bridge uncore support */ +static void snb_uncore_msr_enable_event(struct intel_uncore_box *box, + struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + if (hwc->idx < UNCORE_PMC_IDX_FIXED) + wrmsrl(hwc->config_base, hwc->config | SNB_UNC_CTL_EN); + else + wrmsrl(hwc->config_base, SNB_UNC_CTL_EN); +} + +static void snb_uncore_msr_disable_event(struct intel_uncore_box *box, + struct perf_event *event) +{ + wrmsrl(event->hw.config_base, 0); +} + +static u64 snb_uncore_msr_read_counter(struct intel_uncore_box *box, + struct perf_event *event) +{ + u64 count; + rdmsrl(event->hw.event_base, count); + return count; +} + +static void snb_uncore_msr_init_box(struct intel_uncore_box *box) +{ + if (box->pmu->pmu_idx == 0) { + wrmsrl(SNB_UNC_PERF_GLOBAL_CTL, + SNB_UNC_GLOBAL_CTL_EN | SNB_UNC_GLOBAL_CTL_CORE_ALL); + } +} + +static struct attribute *snb_uncore_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_cmask5.attr, + NULL, +}; + +static struct attribute_group snb_uncore_format_group = { + .name = "format", + .attrs = snb_uncore_formats_attr, +}; + +static struct intel_uncore_ops snb_uncore_msr_ops = { + .init_box = snb_uncore_msr_init_box, + .disable_event = snb_uncore_msr_disable_event, + .enable_event = snb_uncore_msr_enable_event, + .read_counter = snb_uncore_msr_read_counter, +}; + +static struct event_constraint snb_uncore_cbox_constraints[] = { + UNCORE_EVENT_CONSTRAINT(0x80, 0x1), + UNCORE_EVENT_CONSTRAINT(0x83, 0x1), + EVENT_CONSTRAINT_END +}; + +static struct intel_uncore_type snb_uncore_cbox = { + .name = "cbox", + .num_counters = 2, + .num_boxes = 4, + .perf_ctr_bits = 44, + .fixed_ctr_bits = 48, + .perf_ctr = SNB_UNC_CBO_0_PER_CTR0, + .event_ctl = SNB_UNC_CBO_0_PERFEVTSEL0, + .fixed_ctr = SNB_UNC_FIXED_CTR, + .fixed_ctl = SNB_UNC_FIXED_CTR_CTRL, + .single_fixed = 1, + .event_mask = SNB_UNC_RAW_EVENT_MASK, + .msr_offset = SNB_UNC_CBO_MSR_OFFSET, + .constraints = snb_uncore_cbox_constraints, + .ops = &snb_uncore_msr_ops, + .format_group = &snb_uncore_format_group, +}; + +static struct intel_uncore_type *snb_msr_uncores[] = { + &snb_uncore_cbox, + NULL, +}; +/* end of Sandy Bridge uncore support */ + +/* Nehalem uncore support */ +static void nhm_uncore_msr_disable_box(struct intel_uncore_box *box) +{ + wrmsrl(NHM_UNC_PERF_GLOBAL_CTL, 0); +} + +static void nhm_uncore_msr_enable_box(struct intel_uncore_box *box) +{ + wrmsrl(NHM_UNC_PERF_GLOBAL_CTL, + NHM_UNC_GLOBAL_CTL_EN_PC_ALL | NHM_UNC_GLOBAL_CTL_EN_FC); +} + +static void nhm_uncore_msr_enable_event(struct intel_uncore_box *box, + struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + if (hwc->idx < UNCORE_PMC_IDX_FIXED) + wrmsrl(hwc->config_base, hwc->config | SNB_UNC_CTL_EN); + else + wrmsrl(hwc->config_base, NHM_UNC_FIXED_CTR_CTL_EN); +} + +static struct attribute *nhm_uncore_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_cmask8.attr, + NULL, +}; + +static struct attribute_group nhm_uncore_format_group = { + .name = "format", + .attrs = nhm_uncore_formats_attr, +}; + +static struct uncore_event_desc nhm_uncore_events[] = { + INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0x00"), + INTEL_UNCORE_EVENT_DESC(qmc_writes_full_any, "event=0x2f,umask=0x0f"), + INTEL_UNCORE_EVENT_DESC(qmc_normal_reads_any, "event=0x2c,umask=0x0f"), + INTEL_UNCORE_EVENT_DESC(qhl_request_ioh_reads, "event=0x20,umask=0x01"), + INTEL_UNCORE_EVENT_DESC(qhl_request_ioh_writes, "event=0x20,umask=0x02"), + INTEL_UNCORE_EVENT_DESC(qhl_request_remote_reads, "event=0x20,umask=0x04"), + INTEL_UNCORE_EVENT_DESC(qhl_request_remote_writes, "event=0x20,umask=0x08"), + INTEL_UNCORE_EVENT_DESC(qhl_request_local_reads, "event=0x20,umask=0x10"), + INTEL_UNCORE_EVENT_DESC(qhl_request_local_writes, "event=0x20,umask=0x20"), + { /* end: all zeroes */ }, +}; + +static struct intel_uncore_ops nhm_uncore_msr_ops = { + .disable_box = nhm_uncore_msr_disable_box, + .enable_box = nhm_uncore_msr_enable_box, + .disable_event = snb_uncore_msr_disable_event, + .enable_event = nhm_uncore_msr_enable_event, + .read_counter = snb_uncore_msr_read_counter, +}; + +static struct intel_uncore_type nhm_uncore = { + .name = "", + .num_counters = 8, + .num_boxes = 1, + .perf_ctr_bits = 48, + .fixed_ctr_bits = 48, + .event_ctl = NHM_UNC_PERFEVTSEL0, + .perf_ctr = NHM_UNC_UNCORE_PMC0, + .fixed_ctr = NHM_UNC_FIXED_CTR, + .fixed_ctl = NHM_UNC_FIXED_CTR_CTRL, + .event_mask = NHM_UNC_RAW_EVENT_MASK, + .event_descs = nhm_uncore_events, + .ops = &nhm_uncore_msr_ops, + .format_group = &nhm_uncore_format_group, +}; + +static struct intel_uncore_type *nhm_msr_uncores[] = { + &nhm_uncore, + NULL, +}; +/* end of Nehalem uncore support */ + +static void uncore_assign_hw_event(struct intel_uncore_box *box, + struct perf_event *event, int idx) +{ + struct hw_perf_event *hwc = &event->hw; + + hwc->idx = idx; + hwc->last_tag = ++box->tags[idx]; + + if (hwc->idx == UNCORE_PMC_IDX_FIXED) { + hwc->event_base = uncore_fixed_ctr(box); + hwc->config_base = uncore_fixed_ctl(box); + return; + } + + hwc->config_base = uncore_event_ctl(box, hwc->idx); + hwc->event_base = uncore_perf_ctr(box, hwc->idx); +} + +static void uncore_perf_event_update(struct intel_uncore_box *box, + struct perf_event *event) +{ + u64 prev_count, new_count, delta; + int shift; + + if (event->hw.idx >= UNCORE_PMC_IDX_FIXED) + shift = 64 - uncore_fixed_ctr_bits(box); + else + shift = 64 - uncore_perf_ctr_bits(box); + + /* the hrtimer might modify the previous event value */ +again: + prev_count = local64_read(&event->hw.prev_count); + new_count = uncore_read_counter(box, event); + if (local64_xchg(&event->hw.prev_count, new_count) != prev_count) + goto again; + + delta = (new_count << shift) - (prev_count << shift); + delta >>= shift; + + local64_add(delta, &event->count); +} + +/* + * The overflow interrupt is unavailable for SandyBridge-EP, is broken + * for SandyBridge. So we use hrtimer to periodically poll the counter + * to avoid overflow. + */ +static enum hrtimer_restart uncore_pmu_hrtimer(struct hrtimer *hrtimer) +{ + struct intel_uncore_box *box; + unsigned long flags; + int bit; + + box = container_of(hrtimer, struct intel_uncore_box, hrtimer); + if (!box->n_active || box->cpu != smp_processor_id()) + return HRTIMER_NORESTART; + /* + * disable local interrupt to prevent uncore_pmu_event_start/stop + * to interrupt the update process + */ + local_irq_save(flags); + + for_each_set_bit(bit, box->active_mask, UNCORE_PMC_IDX_MAX) + uncore_perf_event_update(box, box->events[bit]); + + local_irq_restore(flags); + + hrtimer_forward_now(hrtimer, ns_to_ktime(UNCORE_PMU_HRTIMER_INTERVAL)); + return HRTIMER_RESTART; +} + +static void uncore_pmu_start_hrtimer(struct intel_uncore_box *box) +{ + __hrtimer_start_range_ns(&box->hrtimer, + ns_to_ktime(UNCORE_PMU_HRTIMER_INTERVAL), 0, + HRTIMER_MODE_REL_PINNED, 0); +} + +static void uncore_pmu_cancel_hrtimer(struct intel_uncore_box *box) +{ + hrtimer_cancel(&box->hrtimer); +} + +static void uncore_pmu_init_hrtimer(struct intel_uncore_box *box) +{ + hrtimer_init(&box->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + box->hrtimer.function = uncore_pmu_hrtimer; +} + +struct intel_uncore_box *uncore_alloc_box(struct intel_uncore_type *type, + int cpu) +{ + struct intel_uncore_box *box; + int i, size; + + size = sizeof(*box) + type->num_shared_regs * + sizeof(struct intel_uncore_extra_reg); + + box = kmalloc_node(size, GFP_KERNEL | __GFP_ZERO, cpu_to_node(cpu)); + if (!box) + return NULL; + + for (i = 0; i < type->num_shared_regs; i++) + raw_spin_lock_init(&box->shared_regs[i].lock); + + uncore_pmu_init_hrtimer(box); + atomic_set(&box->refcnt, 1); + box->cpu = -1; + box->phys_id = -1; + + return box; +} + +static struct intel_uncore_box * +uncore_pmu_to_box(struct intel_uncore_pmu *pmu, int cpu) +{ + static struct intel_uncore_box *box; + + box = *per_cpu_ptr(pmu->box, cpu); + if (box) + return box; + + raw_spin_lock(&uncore_box_lock); + list_for_each_entry(box, &pmu->box_list, list) { + if (box->phys_id == topology_physical_package_id(cpu)) { + atomic_inc(&box->refcnt); + *per_cpu_ptr(pmu->box, cpu) = box; + break; + } + } + raw_spin_unlock(&uncore_box_lock); + + return *per_cpu_ptr(pmu->box, cpu); +} + +static struct intel_uncore_pmu *uncore_event_to_pmu(struct perf_event *event) +{ + return container_of(event->pmu, struct intel_uncore_pmu, pmu); +} + +static struct intel_uncore_box *uncore_event_to_box(struct perf_event *event) +{ + /* + * perf core schedules event on the basis of cpu, uncore events are + * collected by one of the cpus inside a physical package. + */ + return uncore_pmu_to_box(uncore_event_to_pmu(event), + smp_processor_id()); +} + +static int uncore_collect_events(struct intel_uncore_box *box, + struct perf_event *leader, bool dogrp) +{ + struct perf_event *event; + int n, max_count; + + max_count = box->pmu->type->num_counters; + if (box->pmu->type->fixed_ctl) + max_count++; + + if (box->n_events >= max_count) + return -EINVAL; + + n = box->n_events; + box->event_list[n] = leader; + n++; + if (!dogrp) + return n; + + list_for_each_entry(event, &leader->sibling_list, group_entry) { + if (event->state <= PERF_EVENT_STATE_OFF) + continue; + + if (n >= max_count) + return -EINVAL; + + box->event_list[n] = event; + n++; + } + return n; +} + +static struct event_constraint * +uncore_get_event_constraint(struct intel_uncore_box *box, + struct perf_event *event) +{ + struct intel_uncore_type *type = box->pmu->type; + struct event_constraint *c; + + if (type->ops->get_constraint) { + c = type->ops->get_constraint(box, event); + if (c) + return c; + } + + if (event->hw.config == ~0ULL) + return &constraint_fixed; + + if (type->constraints) { + for_each_event_constraint(c, type->constraints) { + if ((event->hw.config & c->cmask) == c->code) + return c; + } + } + + return &type->unconstrainted; +} + +static void uncore_put_event_constraint(struct intel_uncore_box *box, + struct perf_event *event) +{ + if (box->pmu->type->ops->put_constraint) + box->pmu->type->ops->put_constraint(box, event); +} + +static int uncore_assign_events(struct intel_uncore_box *box, + int assign[], int n) +{ + unsigned long used_mask[BITS_TO_LONGS(UNCORE_PMC_IDX_MAX)]; + struct event_constraint *c, *constraints[UNCORE_PMC_IDX_MAX]; + int i, wmin, wmax, ret = 0; + struct hw_perf_event *hwc; + + bitmap_zero(used_mask, UNCORE_PMC_IDX_MAX); + + for (i = 0, wmin = UNCORE_PMC_IDX_MAX, wmax = 0; i < n; i++) { + c = uncore_get_event_constraint(box, box->event_list[i]); + constraints[i] = c; + wmin = min(wmin, c->weight); + wmax = max(wmax, c->weight); + } + + /* fastpath, try to reuse previous register */ + for (i = 0; i < n; i++) { + hwc = &box->event_list[i]->hw; + c = constraints[i]; + + /* never assigned */ + if (hwc->idx == -1) + break; + + /* constraint still honored */ + if (!test_bit(hwc->idx, c->idxmsk)) + break; + + /* not already used */ + if (test_bit(hwc->idx, used_mask)) + break; + + __set_bit(hwc->idx, used_mask); + if (assign) + assign[i] = hwc->idx; + } + /* slow path */ + if (i != n) + ret = perf_assign_events(constraints, n, wmin, wmax, assign); + + if (!assign || ret) { + for (i = 0; i < n; i++) + uncore_put_event_constraint(box, box->event_list[i]); + } + return ret ? -EINVAL : 0; +} + +static void uncore_pmu_event_start(struct perf_event *event, int flags) +{ + struct intel_uncore_box *box = uncore_event_to_box(event); + int idx = event->hw.idx; + + if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED))) + return; + + if (WARN_ON_ONCE(idx == -1 || idx >= UNCORE_PMC_IDX_MAX)) + return; + + event->hw.state = 0; + box->events[idx] = event; + box->n_active++; + __set_bit(idx, box->active_mask); + + local64_set(&event->hw.prev_count, uncore_read_counter(box, event)); + uncore_enable_event(box, event); + + if (box->n_active == 1) { + uncore_enable_box(box); + uncore_pmu_start_hrtimer(box); + } +} + +static void uncore_pmu_event_stop(struct perf_event *event, int flags) +{ + struct intel_uncore_box *box = uncore_event_to_box(event); + struct hw_perf_event *hwc = &event->hw; + + if (__test_and_clear_bit(hwc->idx, box->active_mask)) { + uncore_disable_event(box, event); + box->n_active--; + box->events[hwc->idx] = NULL; + WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED); + hwc->state |= PERF_HES_STOPPED; + + if (box->n_active == 0) { + uncore_disable_box(box); + uncore_pmu_cancel_hrtimer(box); + } + } + + if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) { + /* + * Drain the remaining delta count out of a event + * that we are disabling: + */ + uncore_perf_event_update(box, event); + hwc->state |= PERF_HES_UPTODATE; + } +} + +static int uncore_pmu_event_add(struct perf_event *event, int flags) +{ + struct intel_uncore_box *box = uncore_event_to_box(event); + struct hw_perf_event *hwc = &event->hw; + int assign[UNCORE_PMC_IDX_MAX]; + int i, n, ret; + + if (!box) + return -ENODEV; + + ret = n = uncore_collect_events(box, event, false); + if (ret < 0) + return ret; + + hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED; + if (!(flags & PERF_EF_START)) + hwc->state |= PERF_HES_ARCH; + + ret = uncore_assign_events(box, assign, n); + if (ret) + return ret; + + /* save events moving to new counters */ + for (i = 0; i < box->n_events; i++) { + event = box->event_list[i]; + hwc = &event->hw; + + if (hwc->idx == assign[i] && + hwc->last_tag == box->tags[assign[i]]) + continue; + /* + * Ensure we don't accidentally enable a stopped + * counter simply because we rescheduled. + */ + if (hwc->state & PERF_HES_STOPPED) + hwc->state |= PERF_HES_ARCH; + + uncore_pmu_event_stop(event, PERF_EF_UPDATE); + } + + /* reprogram moved events into new counters */ + for (i = 0; i < n; i++) { + event = box->event_list[i]; + hwc = &event->hw; + + if (hwc->idx != assign[i] || + hwc->last_tag != box->tags[assign[i]]) + uncore_assign_hw_event(box, event, assign[i]); + else if (i < box->n_events) + continue; + + if (hwc->state & PERF_HES_ARCH) + continue; + + uncore_pmu_event_start(event, 0); + } + box->n_events = n; + + return 0; +} + +static void uncore_pmu_event_del(struct perf_event *event, int flags) +{ + struct intel_uncore_box *box = uncore_event_to_box(event); + int i; + + uncore_pmu_event_stop(event, PERF_EF_UPDATE); + + for (i = 0; i < box->n_events; i++) { + if (event == box->event_list[i]) { + uncore_put_event_constraint(box, event); + + while (++i < box->n_events) + box->event_list[i - 1] = box->event_list[i]; + + --box->n_events; + break; + } + } + + event->hw.idx = -1; + event->hw.last_tag = ~0ULL; +} + +static void uncore_pmu_event_read(struct perf_event *event) +{ + struct intel_uncore_box *box = uncore_event_to_box(event); + uncore_perf_event_update(box, event); +} + +/* + * validation ensures the group can be loaded onto the + * PMU if it was the only group available. + */ +static int uncore_validate_group(struct intel_uncore_pmu *pmu, + struct perf_event *event) +{ + struct perf_event *leader = event->group_leader; + struct intel_uncore_box *fake_box; + int ret = -EINVAL, n; + + fake_box = uncore_alloc_box(pmu->type, smp_processor_id()); + if (!fake_box) + return -ENOMEM; + + fake_box->pmu = pmu; + /* + * the event is not yet connected with its + * siblings therefore we must first collect + * existing siblings, then add the new event + * before we can simulate the scheduling + */ + n = uncore_collect_events(fake_box, leader, true); + if (n < 0) + goto out; + + fake_box->n_events = n; + n = uncore_collect_events(fake_box, event, false); + if (n < 0) + goto out; + + fake_box->n_events = n; + + ret = uncore_assign_events(fake_box, NULL, n); +out: + kfree(fake_box); + return ret; +} + +int uncore_pmu_event_init(struct perf_event *event) +{ + struct intel_uncore_pmu *pmu; + struct intel_uncore_box *box; + struct hw_perf_event *hwc = &event->hw; + int ret; + + if (event->attr.type != event->pmu->type) + return -ENOENT; + + pmu = uncore_event_to_pmu(event); + /* no device found for this pmu */ + if (pmu->func_id < 0) + return -ENOENT; + + /* + * Uncore PMU does measure at all privilege level all the time. + * So it doesn't make sense to specify any exclude bits. + */ + if (event->attr.exclude_user || event->attr.exclude_kernel || + event->attr.exclude_hv || event->attr.exclude_idle) + return -EINVAL; + + /* Sampling not supported yet */ + if (hwc->sample_period) + return -EINVAL; + + /* + * Place all uncore events for a particular physical package + * onto a single cpu + */ + if (event->cpu < 0) + return -EINVAL; + box = uncore_pmu_to_box(pmu, event->cpu); + if (!box || box->cpu < 0) + return -EINVAL; + event->cpu = box->cpu; + + event->hw.idx = -1; + event->hw.last_tag = ~0ULL; + event->hw.extra_reg.idx = EXTRA_REG_NONE; + + if (event->attr.config == UNCORE_FIXED_EVENT) { + /* no fixed counter */ + if (!pmu->type->fixed_ctl) + return -EINVAL; + /* + * if there is only one fixed counter, only the first pmu + * can access the fixed counter + */ + if (pmu->type->single_fixed && pmu->pmu_idx > 0) + return -EINVAL; + hwc->config = ~0ULL; + } else { + hwc->config = event->attr.config & pmu->type->event_mask; + if (pmu->type->ops->hw_config) { + ret = pmu->type->ops->hw_config(box, event); + if (ret) + return ret; + } + } + + if (event->group_leader != event) + ret = uncore_validate_group(pmu, event); + else + ret = 0; + + return ret; +} + +static int __init uncore_pmu_register(struct intel_uncore_pmu *pmu) +{ + int ret; + + pmu->pmu = (struct pmu) { + .attr_groups = pmu->type->attr_groups, + .task_ctx_nr = perf_invalid_context, + .event_init = uncore_pmu_event_init, + .add = uncore_pmu_event_add, + .del = uncore_pmu_event_del, + .start = uncore_pmu_event_start, + .stop = uncore_pmu_event_stop, + .read = uncore_pmu_event_read, + }; + + if (pmu->type->num_boxes == 1) { + if (strlen(pmu->type->name) > 0) + sprintf(pmu->name, "uncore_%s", pmu->type->name); + else + sprintf(pmu->name, "uncore"); + } else { + sprintf(pmu->name, "uncore_%s_%d", pmu->type->name, + pmu->pmu_idx); + } + + ret = perf_pmu_register(&pmu->pmu, pmu->name, -1); + return ret; +} + +static void __init uncore_type_exit(struct intel_uncore_type *type) +{ + int i; + + for (i = 0; i < type->num_boxes; i++) + free_percpu(type->pmus[i].box); + kfree(type->pmus); + type->pmus = NULL; + kfree(type->attr_groups[1]); + type->attr_groups[1] = NULL; +} + +static void uncore_types_exit(struct intel_uncore_type **types) +{ + int i; + for (i = 0; types[i]; i++) + uncore_type_exit(types[i]); +} + +static int __init uncore_type_init(struct intel_uncore_type *type) +{ + struct intel_uncore_pmu *pmus; + struct attribute_group *events_group; + struct attribute **attrs; + int i, j; + + pmus = kzalloc(sizeof(*pmus) * type->num_boxes, GFP_KERNEL); + if (!pmus) + return -ENOMEM; + + type->unconstrainted = (struct event_constraint) + __EVENT_CONSTRAINT(0, (1ULL << type->num_counters) - 1, + 0, type->num_counters, 0); + + for (i = 0; i < type->num_boxes; i++) { + pmus[i].func_id = -1; + pmus[i].pmu_idx = i; + pmus[i].type = type; + INIT_LIST_HEAD(&pmus[i].box_list); + pmus[i].box = alloc_percpu(struct intel_uncore_box *); + if (!pmus[i].box) + goto fail; + } + + if (type->event_descs) { + i = 0; + while (type->event_descs[i].attr.attr.name) + i++; + + events_group = kzalloc(sizeof(struct attribute *) * (i + 1) + + sizeof(*events_group), GFP_KERNEL); + if (!events_group) + goto fail; + + attrs = (struct attribute **)(events_group + 1); + events_group->name = "events"; + events_group->attrs = attrs; + + for (j = 0; j < i; j++) + attrs[j] = &type->event_descs[j].attr.attr; + + type->attr_groups[1] = events_group; + } + + type->pmus = pmus; + return 0; +fail: + uncore_type_exit(type); + return -ENOMEM; +} + +static int __init uncore_types_init(struct intel_uncore_type **types) +{ + int i, ret; + + for (i = 0; types[i]; i++) { + ret = uncore_type_init(types[i]); + if (ret) + goto fail; + } + return 0; +fail: + while (--i >= 0) + uncore_type_exit(types[i]); + return ret; +} + +static struct pci_driver *uncore_pci_driver; +static bool pcidrv_registered; + +/* + * add a pci uncore device + */ +static int __devinit uncore_pci_add(struct intel_uncore_type *type, + struct pci_dev *pdev) +{ + struct intel_uncore_pmu *pmu; + struct intel_uncore_box *box; + int i, phys_id; + + phys_id = pcibus_to_physid[pdev->bus->number]; + if (phys_id < 0) + return -ENODEV; + + box = uncore_alloc_box(type, 0); + if (!box) + return -ENOMEM; + + /* + * for performance monitoring unit with multiple boxes, + * each box has a different function id. + */ + for (i = 0; i < type->num_boxes; i++) { + pmu = &type->pmus[i]; + if (pmu->func_id == pdev->devfn) + break; + if (pmu->func_id < 0) { + pmu->func_id = pdev->devfn; + break; + } + pmu = NULL; + } + + if (!pmu) { + kfree(box); + return -EINVAL; + } + + box->phys_id = phys_id; + box->pci_dev = pdev; + box->pmu = pmu; + uncore_box_init(box); + pci_set_drvdata(pdev, box); + + raw_spin_lock(&uncore_box_lock); + list_add_tail(&box->list, &pmu->box_list); + raw_spin_unlock(&uncore_box_lock); + + return 0; +} + +static void uncore_pci_remove(struct pci_dev *pdev) +{ + struct intel_uncore_box *box = pci_get_drvdata(pdev); + struct intel_uncore_pmu *pmu = box->pmu; + int cpu, phys_id = pcibus_to_physid[pdev->bus->number]; + + if (WARN_ON_ONCE(phys_id != box->phys_id)) + return; + + raw_spin_lock(&uncore_box_lock); + list_del(&box->list); + raw_spin_unlock(&uncore_box_lock); + + for_each_possible_cpu(cpu) { + if (*per_cpu_ptr(pmu->box, cpu) == box) { + *per_cpu_ptr(pmu->box, cpu) = NULL; + atomic_dec(&box->refcnt); + } + } + + WARN_ON_ONCE(atomic_read(&box->refcnt) != 1); + kfree(box); +} + +static int __devinit uncore_pci_probe(struct pci_dev *pdev, + const struct pci_device_id *id) +{ + struct intel_uncore_type *type; + + type = (struct intel_uncore_type *)id->driver_data; + return uncore_pci_add(type, pdev); +} + +static int __init uncore_pci_init(void) +{ + int ret; + + switch (boot_cpu_data.x86_model) { + case 45: /* Sandy Bridge-EP */ + pci_uncores = snbep_pci_uncores; + uncore_pci_driver = &snbep_uncore_pci_driver; + snbep_pci2phy_map_init(); + break; + default: + return 0; + } + + ret = uncore_types_init(pci_uncores); + if (ret) + return ret; + + uncore_pci_driver->probe = uncore_pci_probe; + uncore_pci_driver->remove = uncore_pci_remove; + + ret = pci_register_driver(uncore_pci_driver); + if (ret == 0) + pcidrv_registered = true; + else + uncore_types_exit(pci_uncores); + + return ret; +} + +static void __init uncore_pci_exit(void) +{ + if (pcidrv_registered) { + pcidrv_registered = false; + pci_unregister_driver(uncore_pci_driver); + uncore_types_exit(pci_uncores); + } +} + +static void __cpuinit uncore_cpu_dying(int cpu) +{ + struct intel_uncore_type *type; + struct intel_uncore_pmu *pmu; + struct intel_uncore_box *box; + int i, j; + + for (i = 0; msr_uncores[i]; i++) { + type = msr_uncores[i]; + for (j = 0; j < type->num_boxes; j++) { + pmu = &type->pmus[j]; + box = *per_cpu_ptr(pmu->box, cpu); + *per_cpu_ptr(pmu->box, cpu) = NULL; + if (box && atomic_dec_and_test(&box->refcnt)) + kfree(box); + } + } +} + +static int __cpuinit uncore_cpu_starting(int cpu) +{ + struct intel_uncore_type *type; + struct intel_uncore_pmu *pmu; + struct intel_uncore_box *box, *exist; + int i, j, k, phys_id; + + phys_id = topology_physical_package_id(cpu); + + for (i = 0; msr_uncores[i]; i++) { + type = msr_uncores[i]; + for (j = 0; j < type->num_boxes; j++) { + pmu = &type->pmus[j]; + box = *per_cpu_ptr(pmu->box, cpu); + /* called by uncore_cpu_init? */ + if (box && box->phys_id >= 0) { + uncore_box_init(box); + continue; + } + + for_each_online_cpu(k) { + exist = *per_cpu_ptr(pmu->box, k); + if (exist && exist->phys_id == phys_id) { + atomic_inc(&exist->refcnt); + *per_cpu_ptr(pmu->box, cpu) = exist; + kfree(box); + box = NULL; + break; + } + } + + if (box) { + box->phys_id = phys_id; + uncore_box_init(box); + } + } + } + return 0; +} + +static int __cpuinit uncore_cpu_prepare(int cpu, int phys_id) +{ + struct intel_uncore_type *type; + struct intel_uncore_pmu *pmu; + struct intel_uncore_box *box; + int i, j; + + for (i = 0; msr_uncores[i]; i++) { + type = msr_uncores[i]; + for (j = 0; j < type->num_boxes; j++) { + pmu = &type->pmus[j]; + if (pmu->func_id < 0) + pmu->func_id = j; + + box = uncore_alloc_box(type, cpu); + if (!box) + return -ENOMEM; + + box->pmu = pmu; + box->phys_id = phys_id; + *per_cpu_ptr(pmu->box, cpu) = box; + } + } + return 0; +} + +static void __cpuinit uncore_change_context(struct intel_uncore_type **uncores, + int old_cpu, int new_cpu) +{ + struct intel_uncore_type *type; + struct intel_uncore_pmu *pmu; + struct intel_uncore_box *box; + int i, j; + + for (i = 0; uncores[i]; i++) { + type = uncores[i]; + for (j = 0; j < type->num_boxes; j++) { + pmu = &type->pmus[j]; + if (old_cpu < 0) + box = uncore_pmu_to_box(pmu, new_cpu); + else + box = uncore_pmu_to_box(pmu, old_cpu); + if (!box) + continue; + + if (old_cpu < 0) { + WARN_ON_ONCE(box->cpu != -1); + box->cpu = new_cpu; + continue; + } + + WARN_ON_ONCE(box->cpu != old_cpu); + if (new_cpu >= 0) { + uncore_pmu_cancel_hrtimer(box); + perf_pmu_migrate_context(&pmu->pmu, + old_cpu, new_cpu); + box->cpu = new_cpu; + } else { + box->cpu = -1; + } + } + } +} + +static void __cpuinit uncore_event_exit_cpu(int cpu) +{ + int i, phys_id, target; + + /* if exiting cpu is used for collecting uncore events */ + if (!cpumask_test_and_clear_cpu(cpu, &uncore_cpu_mask)) + return; + + /* find a new cpu to collect uncore events */ + phys_id = topology_physical_package_id(cpu); + target = -1; + for_each_online_cpu(i) { + if (i == cpu) + continue; + if (phys_id == topology_physical_package_id(i)) { + target = i; + break; + } + } + + /* migrate uncore events to the new cpu */ + if (target >= 0) + cpumask_set_cpu(target, &uncore_cpu_mask); + + uncore_change_context(msr_uncores, cpu, target); + uncore_change_context(pci_uncores, cpu, target); +} + +static void __cpuinit uncore_event_init_cpu(int cpu) +{ + int i, phys_id; + + phys_id = topology_physical_package_id(cpu); + for_each_cpu(i, &uncore_cpu_mask) { + if (phys_id == topology_physical_package_id(i)) + return; + } + + cpumask_set_cpu(cpu, &uncore_cpu_mask); + + uncore_change_context(msr_uncores, -1, cpu); + uncore_change_context(pci_uncores, -1, cpu); +} + +static int __cpuinit uncore_cpu_notifier(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + unsigned int cpu = (long)hcpu; + + /* allocate/free data structure for uncore box */ + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_UP_PREPARE: + uncore_cpu_prepare(cpu, -1); + break; + case CPU_STARTING: + uncore_cpu_starting(cpu); + break; + case CPU_UP_CANCELED: + case CPU_DYING: + uncore_cpu_dying(cpu); + break; + default: + break; + } + + /* select the cpu that collects uncore events */ + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_DOWN_FAILED: + case CPU_STARTING: + uncore_event_init_cpu(cpu); + break; + case CPU_DOWN_PREPARE: + uncore_event_exit_cpu(cpu); + break; + default: + break; + } + + return NOTIFY_OK; +} + +static struct notifier_block uncore_cpu_nb __cpuinitdata = { + .notifier_call = uncore_cpu_notifier, + /* + * to migrate uncore events, our notifier should be executed + * before perf core's notifier. + */ + .priority = CPU_PRI_PERF + 1, +}; + +static void __init uncore_cpu_setup(void *dummy) +{ + uncore_cpu_starting(smp_processor_id()); +} + +static int __init uncore_cpu_init(void) +{ + int ret, cpu, max_cores; + + max_cores = boot_cpu_data.x86_max_cores; + switch (boot_cpu_data.x86_model) { + case 26: /* Nehalem */ + case 30: + case 37: /* Westmere */ + case 44: + msr_uncores = nhm_msr_uncores; + break; + case 42: /* Sandy Bridge */ + if (snb_uncore_cbox.num_boxes > max_cores) + snb_uncore_cbox.num_boxes = max_cores; + msr_uncores = snb_msr_uncores; + break; + case 45: /* Sandy Birdge-EP */ + if (snbep_uncore_cbox.num_boxes > max_cores) + snbep_uncore_cbox.num_boxes = max_cores; + msr_uncores = snbep_msr_uncores; + break; + default: + return 0; + } + + ret = uncore_types_init(msr_uncores); + if (ret) + return ret; + + get_online_cpus(); + + for_each_online_cpu(cpu) { + int i, phys_id = topology_physical_package_id(cpu); + + for_each_cpu(i, &uncore_cpu_mask) { + if (phys_id == topology_physical_package_id(i)) { + phys_id = -1; + break; + } + } + if (phys_id < 0) + continue; + + uncore_cpu_prepare(cpu, phys_id); + uncore_event_init_cpu(cpu); + } + on_each_cpu(uncore_cpu_setup, NULL, 1); + + register_cpu_notifier(&uncore_cpu_nb); + + put_online_cpus(); + + return 0; +} + +static int __init uncore_pmus_register(void) +{ + struct intel_uncore_pmu *pmu; + struct intel_uncore_type *type; + int i, j; + + for (i = 0; msr_uncores[i]; i++) { + type = msr_uncores[i]; + for (j = 0; j < type->num_boxes; j++) { + pmu = &type->pmus[j]; + uncore_pmu_register(pmu); + } + } + + for (i = 0; pci_uncores[i]; i++) { + type = pci_uncores[i]; + for (j = 0; j < type->num_boxes; j++) { + pmu = &type->pmus[j]; + uncore_pmu_register(pmu); + } + } + + return 0; +} + +static int __init intel_uncore_init(void) +{ + int ret; + + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) + return -ENODEV; + + ret = uncore_pci_init(); + if (ret) + goto fail; + ret = uncore_cpu_init(); + if (ret) { + uncore_pci_exit(); + goto fail; + } + + uncore_pmus_register(); + return 0; +fail: + return ret; +} +device_initcall(intel_uncore_init); diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.h b/arch/x86/kernel/cpu/perf_event_intel_uncore.h new file mode 100644 index 000000000000..b13e9ea81def --- /dev/null +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.h @@ -0,0 +1,424 @@ +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/pci.h> +#include <linux/perf_event.h> +#include "perf_event.h" + +#define UNCORE_PMU_NAME_LEN 32 +#define UNCORE_BOX_HASH_SIZE 8 + +#define UNCORE_PMU_HRTIMER_INTERVAL (60 * NSEC_PER_SEC) + +#define UNCORE_FIXED_EVENT 0xff +#define UNCORE_PMC_IDX_MAX_GENERIC 8 +#define UNCORE_PMC_IDX_FIXED UNCORE_PMC_IDX_MAX_GENERIC +#define UNCORE_PMC_IDX_MAX (UNCORE_PMC_IDX_FIXED + 1) + +#define UNCORE_EVENT_CONSTRAINT(c, n) EVENT_CONSTRAINT(c, n, 0xff) + +/* SNB event control */ +#define SNB_UNC_CTL_EV_SEL_MASK 0x000000ff +#define SNB_UNC_CTL_UMASK_MASK 0x0000ff00 +#define SNB_UNC_CTL_EDGE_DET (1 << 18) +#define SNB_UNC_CTL_EN (1 << 22) +#define SNB_UNC_CTL_INVERT (1 << 23) +#define SNB_UNC_CTL_CMASK_MASK 0x1f000000 +#define NHM_UNC_CTL_CMASK_MASK 0xff000000 +#define NHM_UNC_FIXED_CTR_CTL_EN (1 << 0) + +#define SNB_UNC_RAW_EVENT_MASK (SNB_UNC_CTL_EV_SEL_MASK | \ + SNB_UNC_CTL_UMASK_MASK | \ + SNB_UNC_CTL_EDGE_DET | \ + SNB_UNC_CTL_INVERT | \ + SNB_UNC_CTL_CMASK_MASK) + +#define NHM_UNC_RAW_EVENT_MASK (SNB_UNC_CTL_EV_SEL_MASK | \ + SNB_UNC_CTL_UMASK_MASK | \ + SNB_UNC_CTL_EDGE_DET | \ + SNB_UNC_CTL_INVERT | \ + NHM_UNC_CTL_CMASK_MASK) + +/* SNB global control register */ +#define SNB_UNC_PERF_GLOBAL_CTL 0x391 +#define SNB_UNC_FIXED_CTR_CTRL 0x394 +#define SNB_UNC_FIXED_CTR 0x395 + +/* SNB uncore global control */ +#define SNB_UNC_GLOBAL_CTL_CORE_ALL ((1 << 4) - 1) +#define SNB_UNC_GLOBAL_CTL_EN (1 << 29) + +/* SNB Cbo register */ +#define SNB_UNC_CBO_0_PERFEVTSEL0 0x700 +#define SNB_UNC_CBO_0_PER_CTR0 0x706 +#define SNB_UNC_CBO_MSR_OFFSET 0x10 + +/* NHM global control register */ +#define NHM_UNC_PERF_GLOBAL_CTL 0x391 +#define NHM_UNC_FIXED_CTR 0x394 +#define NHM_UNC_FIXED_CTR_CTRL 0x395 + +/* NHM uncore global control */ +#define NHM_UNC_GLOBAL_CTL_EN_PC_ALL ((1ULL << 8) - 1) +#define NHM_UNC_GLOBAL_CTL_EN_FC (1ULL << 32) + +/* NHM uncore register */ +#define NHM_UNC_PERFEVTSEL0 0x3c0 +#define NHM_UNC_UNCORE_PMC0 0x3b0 + +/* SNB-EP Box level control */ +#define SNBEP_PMON_BOX_CTL_RST_CTRL (1 << 0) +#define SNBEP_PMON_BOX_CTL_RST_CTRS (1 << 1) +#define SNBEP_PMON_BOX_CTL_FRZ (1 << 8) +#define SNBEP_PMON_BOX_CTL_FRZ_EN (1 << 16) +#define SNBEP_PMON_BOX_CTL_INT (SNBEP_PMON_BOX_CTL_RST_CTRL | \ + SNBEP_PMON_BOX_CTL_RST_CTRS | \ + SNBEP_PMON_BOX_CTL_FRZ_EN) +/* SNB-EP event control */ +#define SNBEP_PMON_CTL_EV_SEL_MASK 0x000000ff +#define SNBEP_PMON_CTL_UMASK_MASK 0x0000ff00 +#define SNBEP_PMON_CTL_RST (1 << 17) +#define SNBEP_PMON_CTL_EDGE_DET (1 << 18) +#define SNBEP_PMON_CTL_EV_SEL_EXT (1 << 21) /* only for QPI */ +#define SNBEP_PMON_CTL_EN (1 << 22) +#define SNBEP_PMON_CTL_INVERT (1 << 23) +#define SNBEP_PMON_CTL_TRESH_MASK 0xff000000 +#define SNBEP_PMON_RAW_EVENT_MASK (SNBEP_PMON_CTL_EV_SEL_MASK | \ + SNBEP_PMON_CTL_UMASK_MASK | \ + SNBEP_PMON_CTL_EDGE_DET | \ + SNBEP_PMON_CTL_INVERT | \ + SNBEP_PMON_CTL_TRESH_MASK) + +/* SNB-EP Ubox event control */ +#define SNBEP_U_MSR_PMON_CTL_TRESH_MASK 0x1f000000 +#define SNBEP_U_MSR_PMON_RAW_EVENT_MASK \ + (SNBEP_PMON_CTL_EV_SEL_MASK | \ + SNBEP_PMON_CTL_UMASK_MASK | \ + SNBEP_PMON_CTL_EDGE_DET | \ + SNBEP_PMON_CTL_INVERT | \ + SNBEP_U_MSR_PMON_CTL_TRESH_MASK) + +#define SNBEP_CBO_PMON_CTL_TID_EN (1 << 19) +#define SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK (SNBEP_PMON_RAW_EVENT_MASK | \ + SNBEP_CBO_PMON_CTL_TID_EN) + +/* SNB-EP PCU event control */ +#define SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK 0x0000c000 +#define SNBEP_PCU_MSR_PMON_CTL_TRESH_MASK 0x1f000000 +#define SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT (1 << 30) +#define SNBEP_PCU_MSR_PMON_CTL_OCC_EDGE_DET (1 << 31) +#define SNBEP_PCU_MSR_PMON_RAW_EVENT_MASK \ + (SNBEP_PMON_CTL_EV_SEL_MASK | \ + SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK | \ + SNBEP_PMON_CTL_EDGE_DET | \ + SNBEP_PMON_CTL_INVERT | \ + SNBEP_PCU_MSR_PMON_CTL_TRESH_MASK | \ + SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT | \ + SNBEP_PCU_MSR_PMON_CTL_OCC_EDGE_DET) + +/* SNB-EP pci control register */ +#define SNBEP_PCI_PMON_BOX_CTL 0xf4 +#define SNBEP_PCI_PMON_CTL0 0xd8 +/* SNB-EP pci counter register */ +#define SNBEP_PCI_PMON_CTR0 0xa0 + +/* SNB-EP home agent register */ +#define SNBEP_HA_PCI_PMON_BOX_ADDRMATCH0 0x40 +#define SNBEP_HA_PCI_PMON_BOX_ADDRMATCH1 0x44 +#define SNBEP_HA_PCI_PMON_BOX_OPCODEMATCH 0x48 +/* SNB-EP memory controller register */ +#define SNBEP_MC_CHy_PCI_PMON_FIXED_CTL 0xf0 +#define SNBEP_MC_CHy_PCI_PMON_FIXED_CTR 0xd0 +/* SNB-EP QPI register */ +#define SNBEP_Q_Py_PCI_PMON_PKT_MATCH0 0x228 +#define SNBEP_Q_Py_PCI_PMON_PKT_MATCH1 0x22c +#define SNBEP_Q_Py_PCI_PMON_PKT_MASK0 0x238 +#define SNBEP_Q_Py_PCI_PMON_PKT_MASK1 0x23c + +/* SNB-EP Ubox register */ +#define SNBEP_U_MSR_PMON_CTR0 0xc16 +#define SNBEP_U_MSR_PMON_CTL0 0xc10 + +#define SNBEP_U_MSR_PMON_UCLK_FIXED_CTL 0xc08 +#define SNBEP_U_MSR_PMON_UCLK_FIXED_CTR 0xc09 + +/* SNB-EP Cbo register */ +#define SNBEP_C0_MSR_PMON_CTR0 0xd16 +#define SNBEP_C0_MSR_PMON_CTL0 0xd10 +#define SNBEP_C0_MSR_PMON_BOX_CTL 0xd04 +#define SNBEP_C0_MSR_PMON_BOX_FILTER 0xd14 +#define SNBEP_CB0_MSR_PMON_BOX_FILTER_MASK 0xfffffc1f +#define SNBEP_CBO_MSR_OFFSET 0x20 + +/* SNB-EP PCU register */ +#define SNBEP_PCU_MSR_PMON_CTR0 0xc36 +#define SNBEP_PCU_MSR_PMON_CTL0 0xc30 +#define SNBEP_PCU_MSR_PMON_BOX_CTL 0xc24 +#define SNBEP_PCU_MSR_PMON_BOX_FILTER 0xc34 +#define SNBEP_PCU_MSR_PMON_BOX_FILTER_MASK 0xffffffff +#define SNBEP_PCU_MSR_CORE_C3_CTR 0x3fc +#define SNBEP_PCU_MSR_CORE_C6_CTR 0x3fd + +struct intel_uncore_ops; +struct intel_uncore_pmu; +struct intel_uncore_box; +struct uncore_event_desc; + +struct intel_uncore_type { + const char *name; + int num_counters; + int num_boxes; + int perf_ctr_bits; + int fixed_ctr_bits; + unsigned perf_ctr; + unsigned event_ctl; + unsigned event_mask; + unsigned fixed_ctr; + unsigned fixed_ctl; + unsigned box_ctl; + unsigned msr_offset; + unsigned num_shared_regs:8; + unsigned single_fixed:1; + struct event_constraint unconstrainted; + struct event_constraint *constraints; + struct intel_uncore_pmu *pmus; + struct intel_uncore_ops *ops; + struct uncore_event_desc *event_descs; + const struct attribute_group *attr_groups[3]; +}; + +#define format_group attr_groups[0] + +struct intel_uncore_ops { + void (*init_box)(struct intel_uncore_box *); + void (*disable_box)(struct intel_uncore_box *); + void (*enable_box)(struct intel_uncore_box *); + void (*disable_event)(struct intel_uncore_box *, struct perf_event *); + void (*enable_event)(struct intel_uncore_box *, struct perf_event *); + u64 (*read_counter)(struct intel_uncore_box *, struct perf_event *); + int (*hw_config)(struct intel_uncore_box *, struct perf_event *); + struct event_constraint *(*get_constraint)(struct intel_uncore_box *, + struct perf_event *); + void (*put_constraint)(struct intel_uncore_box *, struct perf_event *); +}; + +struct intel_uncore_pmu { + struct pmu pmu; + char name[UNCORE_PMU_NAME_LEN]; + int pmu_idx; + int func_id; + struct intel_uncore_type *type; + struct intel_uncore_box ** __percpu box; + struct list_head box_list; +}; + +struct intel_uncore_extra_reg { + raw_spinlock_t lock; + u64 config1; + atomic_t ref; +}; + +struct intel_uncore_box { + int phys_id; + int n_active; /* number of active events */ + int n_events; + int cpu; /* cpu to collect events */ + unsigned long flags; + atomic_t refcnt; + struct perf_event *events[UNCORE_PMC_IDX_MAX]; + struct perf_event *event_list[UNCORE_PMC_IDX_MAX]; + unsigned long active_mask[BITS_TO_LONGS(UNCORE_PMC_IDX_MAX)]; + u64 tags[UNCORE_PMC_IDX_MAX]; + struct pci_dev *pci_dev; + struct intel_uncore_pmu *pmu; + struct hrtimer hrtimer; + struct list_head list; + struct intel_uncore_extra_reg shared_regs[0]; +}; + +#define UNCORE_BOX_FLAG_INITIATED 0 + +struct uncore_event_desc { + struct kobj_attribute attr; + const char *config; +}; + +#define INTEL_UNCORE_EVENT_DESC(_name, _config) \ +{ \ + .attr = __ATTR(_name, 0444, uncore_event_show, NULL), \ + .config = _config, \ +} + +#define DEFINE_UNCORE_FORMAT_ATTR(_var, _name, _format) \ +static ssize_t __uncore_##_var##_show(struct kobject *kobj, \ + struct kobj_attribute *attr, \ + char *page) \ +{ \ + BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE); \ + return sprintf(page, _format "\n"); \ +} \ +static struct kobj_attribute format_attr_##_var = \ + __ATTR(_name, 0444, __uncore_##_var##_show, NULL) + + +static ssize_t uncore_event_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct uncore_event_desc *event = + container_of(attr, struct uncore_event_desc, attr); + return sprintf(buf, "%s", event->config); +} + +static inline unsigned uncore_pci_box_ctl(struct intel_uncore_box *box) +{ + return box->pmu->type->box_ctl; +} + +static inline unsigned uncore_pci_fixed_ctl(struct intel_uncore_box *box) +{ + return box->pmu->type->fixed_ctl; +} + +static inline unsigned uncore_pci_fixed_ctr(struct intel_uncore_box *box) +{ + return box->pmu->type->fixed_ctr; +} + +static inline +unsigned uncore_pci_event_ctl(struct intel_uncore_box *box, int idx) +{ + return idx * 4 + box->pmu->type->event_ctl; +} + +static inline +unsigned uncore_pci_perf_ctr(struct intel_uncore_box *box, int idx) +{ + return idx * 8 + box->pmu->type->perf_ctr; +} + +static inline +unsigned uncore_msr_box_ctl(struct intel_uncore_box *box) +{ + if (!box->pmu->type->box_ctl) + return 0; + return box->pmu->type->box_ctl + + box->pmu->type->msr_offset * box->pmu->pmu_idx; +} + +static inline +unsigned uncore_msr_fixed_ctl(struct intel_uncore_box *box) +{ + if (!box->pmu->type->fixed_ctl) + return 0; + return box->pmu->type->fixed_ctl + + box->pmu->type->msr_offset * box->pmu->pmu_idx; +} + +static inline +unsigned uncore_msr_fixed_ctr(struct intel_uncore_box *box) +{ + return box->pmu->type->fixed_ctr + + box->pmu->type->msr_offset * box->pmu->pmu_idx; +} + +static inline +unsigned uncore_msr_event_ctl(struct intel_uncore_box *box, int idx) +{ + return idx + box->pmu->type->event_ctl + + box->pmu->type->msr_offset * box->pmu->pmu_idx; +} + +static inline +unsigned uncore_msr_perf_ctr(struct intel_uncore_box *box, int idx) +{ + return idx + box->pmu->type->perf_ctr + + box->pmu->type->msr_offset * box->pmu->pmu_idx; +} + +static inline +unsigned uncore_fixed_ctl(struct intel_uncore_box *box) +{ + if (box->pci_dev) + return uncore_pci_fixed_ctl(box); + else + return uncore_msr_fixed_ctl(box); +} + +static inline +unsigned uncore_fixed_ctr(struct intel_uncore_box *box) +{ + if (box->pci_dev) + return uncore_pci_fixed_ctr(box); + else + return uncore_msr_fixed_ctr(box); +} + +static inline +unsigned uncore_event_ctl(struct intel_uncore_box *box, int idx) +{ + if (box->pci_dev) + return uncore_pci_event_ctl(box, idx); + else + return uncore_msr_event_ctl(box, idx); +} + +static inline +unsigned uncore_perf_ctr(struct intel_uncore_box *box, int idx) +{ + if (box->pci_dev) + return uncore_pci_perf_ctr(box, idx); + else + return uncore_msr_perf_ctr(box, idx); +} + +static inline int uncore_perf_ctr_bits(struct intel_uncore_box *box) +{ + return box->pmu->type->perf_ctr_bits; +} + +static inline int uncore_fixed_ctr_bits(struct intel_uncore_box *box) +{ + return box->pmu->type->fixed_ctr_bits; +} + +static inline int uncore_num_counters(struct intel_uncore_box *box) +{ + return box->pmu->type->num_counters; +} + +static inline void uncore_disable_box(struct intel_uncore_box *box) +{ + if (box->pmu->type->ops->disable_box) + box->pmu->type->ops->disable_box(box); +} + +static inline void uncore_enable_box(struct intel_uncore_box *box) +{ + if (box->pmu->type->ops->enable_box) + box->pmu->type->ops->enable_box(box); +} + +static inline void uncore_disable_event(struct intel_uncore_box *box, + struct perf_event *event) +{ + box->pmu->type->ops->disable_event(box, event); +} + +static inline void uncore_enable_event(struct intel_uncore_box *box, + struct perf_event *event) +{ + box->pmu->type->ops->enable_event(box, event); +} + +static inline u64 uncore_read_counter(struct intel_uncore_box *box, + struct perf_event *event) +{ + return box->pmu->type->ops->read_counter(box, event); +} + +static inline void uncore_box_init(struct intel_uncore_box *box) +{ + if (!test_and_set_bit(UNCORE_BOX_FLAG_INITIATED, &box->flags)) { + if (box->pmu->type->ops->init_box) + box->pmu->type->ops->init_box(box); + } +} diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c index 47124a73dd73..92c7e39a079f 100644 --- a/arch/x86/kernel/cpu/perf_event_p4.c +++ b/arch/x86/kernel/cpu/perf_event_p4.c @@ -895,8 +895,8 @@ static void p4_pmu_disable_pebs(void) * So at moment let leave metrics turned on forever -- it's * ok for now but need to be revisited! * - * (void)checking_wrmsrl(MSR_IA32_PEBS_ENABLE, (u64)0); - * (void)checking_wrmsrl(MSR_P4_PEBS_MATRIX_VERT, (u64)0); + * (void)wrmsrl_safe(MSR_IA32_PEBS_ENABLE, (u64)0); + * (void)wrmsrl_safe(MSR_P4_PEBS_MATRIX_VERT, (u64)0); */ } @@ -909,7 +909,7 @@ static inline void p4_pmu_disable_event(struct perf_event *event) * state we need to clear P4_CCCR_OVF, otherwise interrupt get * asserted again and again */ - (void)checking_wrmsrl(hwc->config_base, + (void)wrmsrl_safe(hwc->config_base, (u64)(p4_config_unpack_cccr(hwc->config)) & ~P4_CCCR_ENABLE & ~P4_CCCR_OVF & ~P4_CCCR_RESERVED); } @@ -943,8 +943,8 @@ static void p4_pmu_enable_pebs(u64 config) bind = &p4_pebs_bind_map[idx]; - (void)checking_wrmsrl(MSR_IA32_PEBS_ENABLE, (u64)bind->metric_pebs); - (void)checking_wrmsrl(MSR_P4_PEBS_MATRIX_VERT, (u64)bind->metric_vert); + (void)wrmsrl_safe(MSR_IA32_PEBS_ENABLE, (u64)bind->metric_pebs); + (void)wrmsrl_safe(MSR_P4_PEBS_MATRIX_VERT, (u64)bind->metric_vert); } static void p4_pmu_enable_event(struct perf_event *event) @@ -978,8 +978,8 @@ static void p4_pmu_enable_event(struct perf_event *event) */ p4_pmu_enable_pebs(hwc->config); - (void)checking_wrmsrl(escr_addr, escr_conf); - (void)checking_wrmsrl(hwc->config_base, + (void)wrmsrl_safe(escr_addr, escr_conf); + (void)wrmsrl_safe(hwc->config_base, (cccr & ~P4_CCCR_RESERVED) | P4_CCCR_ENABLE); } @@ -1325,7 +1325,7 @@ __init int p4_pmu_init(void) unsigned int low, high; /* If we get stripped -- indexing fails */ - BUILD_BUG_ON(ARCH_P4_MAX_CCCR > X86_PMC_MAX_GENERIC); + BUILD_BUG_ON(ARCH_P4_MAX_CCCR > INTEL_PMC_MAX_GENERIC); rdmsr(MSR_IA32_MISC_ENABLE, low, high); if (!(low & (1 << 7))) { diff --git a/arch/x86/kernel/cpu/perf_event_p6.c b/arch/x86/kernel/cpu/perf_event_p6.c index 32bcfc7dd230..e4dd0f7a0453 100644 --- a/arch/x86/kernel/cpu/perf_event_p6.c +++ b/arch/x86/kernel/cpu/perf_event_p6.c @@ -71,7 +71,7 @@ p6_pmu_disable_event(struct perf_event *event) if (cpuc->enabled) val |= ARCH_PERFMON_EVENTSEL_ENABLE; - (void)checking_wrmsrl(hwc->config_base, val); + (void)wrmsrl_safe(hwc->config_base, val); } static void p6_pmu_enable_event(struct perf_event *event) @@ -84,7 +84,7 @@ static void p6_pmu_enable_event(struct perf_event *event) if (cpuc->enabled) val |= ARCH_PERFMON_EVENTSEL_ENABLE; - (void)checking_wrmsrl(hwc->config_base, val); + (void)wrmsrl_safe(hwc->config_base, val); } PMU_FORMAT_ATTR(event, "config:0-7" ); diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index addf9e82a7f2..ee8e9abc859f 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -31,7 +31,7 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c) const struct cpuid_bit *cb; static const struct cpuid_bit __cpuinitconst cpuid_bits[] = { - { X86_FEATURE_DTS, CR_EAX, 0, 0x00000006, 0 }, + { X86_FEATURE_DTHERM, CR_EAX, 0, 0x00000006, 0 }, { X86_FEATURE_IDA, CR_EAX, 1, 0x00000006, 0 }, { X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006, 0 }, { X86_FEATURE_PLN, CR_EAX, 4, 0x00000006, 0 }, diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 571246d81edf..ae42418bc50f 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -27,8 +27,8 @@ static int die_counter; void printk_address(unsigned long address, int reliable) { - printk(" [<%p>] %s%pB\n", (void *) address, - reliable ? "" : "? ", (void *) address); + pr_cont(" [<%p>] %s%pB\n", + (void *)address, reliable ? "" : "? ", (void *)address); } #ifdef CONFIG_FUNCTION_GRAPH_TRACER @@ -271,6 +271,7 @@ int __kprobes __die(const char *str, struct pt_regs *regs, long err) current->thread.trap_nr, SIGSEGV) == NOTIFY_STOP) return 1; + print_modules(); show_regs(regs); #ifdef CONFIG_X86_32 if (user_mode_vm(regs)) { diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index e0b1d783daab..1038a417ea53 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -73,11 +73,11 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, if (kstack_end(stack)) break; if (i && ((i % STACKSLOTS_PER_LINE) == 0)) - printk(KERN_CONT "\n"); - printk(KERN_CONT " %08lx", *stack++); + pr_cont("\n"); + pr_cont(" %08lx", *stack++); touch_nmi_watchdog(); } - printk(KERN_CONT "\n"); + pr_cont("\n"); show_trace_log_lvl(task, regs, sp, bp, log_lvl); } @@ -86,12 +86,11 @@ void show_regs(struct pt_regs *regs) { int i; - print_modules(); __show_regs(regs, !user_mode_vm(regs)); - printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)\n", - TASK_COMM_LEN, current->comm, task_pid_nr(current), - current_thread_info(), current, task_thread_info(current)); + pr_emerg("Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)\n", + TASK_COMM_LEN, current->comm, task_pid_nr(current), + current_thread_info(), current, task_thread_info(current)); /* * When in-kernel, we also print out the stack and code at the * time of the fault.. @@ -102,10 +101,10 @@ void show_regs(struct pt_regs *regs) unsigned char c; u8 *ip; - printk(KERN_EMERG "Stack:\n"); + pr_emerg("Stack:\n"); show_stack_log_lvl(NULL, regs, ®s->sp, 0, KERN_EMERG); - printk(KERN_EMERG "Code: "); + pr_emerg("Code:"); ip = (u8 *)regs->ip - code_prologue; if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) { @@ -116,16 +115,16 @@ void show_regs(struct pt_regs *regs) for (i = 0; i < code_len; i++, ip++) { if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) { - printk(KERN_CONT " Bad EIP value."); + pr_cont(" Bad EIP value."); break; } if (ip == (u8 *)regs->ip) - printk(KERN_CONT "<%02x> ", c); + pr_cont(" <%02x>", c); else - printk(KERN_CONT "%02x ", c); + pr_cont(" %02x", c); } } - printk(KERN_CONT "\n"); + pr_cont("\n"); } int is_valid_bugaddr(unsigned long ip) diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 791b76122aa8..b653675d5288 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -228,20 +228,20 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, if (stack >= irq_stack && stack <= irq_stack_end) { if (stack == irq_stack_end) { stack = (unsigned long *) (irq_stack_end[-1]); - printk(KERN_CONT " <EOI> "); + pr_cont(" <EOI> "); } } else { if (((long) stack & (THREAD_SIZE-1)) == 0) break; } if (i && ((i % STACKSLOTS_PER_LINE) == 0)) - printk(KERN_CONT "\n"); - printk(KERN_CONT " %016lx", *stack++); + pr_cont("\n"); + pr_cont(" %016lx", *stack++); touch_nmi_watchdog(); } preempt_enable(); - printk(KERN_CONT "\n"); + pr_cont("\n"); show_trace_log_lvl(task, regs, sp, bp, log_lvl); } @@ -254,10 +254,9 @@ void show_regs(struct pt_regs *regs) sp = regs->sp; printk("CPU %d ", cpu); - print_modules(); __show_regs(regs, 1); - printk("Process %s (pid: %d, threadinfo %p, task %p)\n", - cur->comm, cur->pid, task_thread_info(cur), cur); + printk(KERN_DEFAULT "Process %s (pid: %d, threadinfo %p, task %p)\n", + cur->comm, cur->pid, task_thread_info(cur), cur); /* * When in-kernel, we also print out the stack and code at the @@ -284,16 +283,16 @@ void show_regs(struct pt_regs *regs) for (i = 0; i < code_len; i++, ip++) { if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) { - printk(KERN_CONT " Bad RIP value."); + pr_cont(" Bad RIP value."); break; } if (ip == (u8 *)regs->ip) - printk(KERN_CONT "<%02x> ", c); + pr_cont("<%02x> ", c); else - printk(KERN_CONT "%02x ", c); + pr_cont("%02x ", c); } } - printk(KERN_CONT "\n"); + pr_cont("\n"); } int is_valid_bugaddr(unsigned long ip) diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 7d65133b51be..111f6bbd8b38 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1758,10 +1758,30 @@ end_repeat_nmi: */ call save_paranoid DEFAULT_FRAME 0 + + /* + * Save off the CR2 register. If we take a page fault in the NMI then + * it could corrupt the CR2 value. If the NMI preempts a page fault + * handler before it was able to read the CR2 register, and then the + * NMI itself takes a page fault, the page fault that was preempted + * will read the information from the NMI page fault and not the + * origin fault. Save it off and restore it if it changes. + * Use the r12 callee-saved register. + */ + movq %cr2, %r12 + /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */ movq %rsp,%rdi movq $-1,%rsi call do_nmi + + /* Did the NMI take a page fault? Restore cr2 if it did */ + movq %cr2, %rcx + cmpq %rcx, %r12 + je 1f + movq %r12, %cr2 +1: + testl %ebx,%ebx /* swapgs needed? */ jnz nmi_restore nmi_swapgs: diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 3dafc6003b7c..1f5f1d5d2a02 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -294,9 +294,9 @@ void fixup_irqs(void) raw_spin_unlock(&desc->lock); if (break_affinity && set_affinity) - printk("Broke affinity for irq %i\n", irq); + pr_notice("Broke affinity for irq %i\n", irq); else if (!set_affinity) - printk("Cannot set affinity for irq %i\n", irq); + pr_notice("Cannot set affinity for irq %i\n", irq); } /* diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index 8bfb6146f753..3f61904365cf 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -444,12 +444,12 @@ void kgdb_roundup_cpus(unsigned long flags) /** * kgdb_arch_handle_exception - Handle architecture specific GDB packets. - * @vector: The error vector of the exception that happened. + * @e_vector: The error vector of the exception that happened. * @signo: The signal number of the exception that happened. * @err_code: The error code of the exception that happened. - * @remcom_in_buffer: The buffer of the packet we have read. - * @remcom_out_buffer: The buffer of %BUFMAX bytes to write a packet into. - * @regs: The &struct pt_regs of the current process. + * @remcomInBuffer: The buffer of the packet we have read. + * @remcomOutBuffer: The buffer of %BUFMAX bytes to write a packet into. + * @linux_regs: The &struct pt_regs of the current process. * * This function MUST handle the 'c' and 's' command packets, * as well packets to set / remove a hardware breakpoint, if used. diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 086eb58c6e80..f1b42b3a186c 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -120,11 +120,6 @@ bool kvm_check_and_clear_guest_paused(void) bool ret = false; struct pvclock_vcpu_time_info *src; - /* - * per_cpu() is safe here because this function is only called from - * timer functions where preemption is already disabled. - */ - WARN_ON(!in_atomic()); src = &__get_cpu_var(hv_clock); if ((src->flags & PVCLOCK_GUEST_STOPPED) != 0) { __this_cpu_and(hv_clock.flags, ~PVCLOCK_GUEST_STOPPED); diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c index fbdfc6917180..4873e62db6a1 100644 --- a/arch/x86/kernel/microcode_core.c +++ b/arch/x86/kernel/microcode_core.c @@ -87,6 +87,7 @@ #include <asm/microcode.h> #include <asm/processor.h> #include <asm/cpu_device_id.h> +#include <asm/perf_event.h> MODULE_DESCRIPTION("Microcode Update Driver"); MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>"); @@ -277,7 +278,6 @@ static int reload_for_cpu(int cpu) struct ucode_cpu_info *uci = ucode_cpu_info + cpu; int err = 0; - mutex_lock(µcode_mutex); if (uci->valid) { enum ucode_state ustate; @@ -288,7 +288,6 @@ static int reload_for_cpu(int cpu) if (ustate == UCODE_ERROR) err = -EINVAL; } - mutex_unlock(µcode_mutex); return err; } @@ -298,19 +297,31 @@ static ssize_t reload_store(struct device *dev, const char *buf, size_t size) { unsigned long val; - int cpu = dev->id; - ssize_t ret = 0; + int cpu; + ssize_t ret = 0, tmp_ret; ret = kstrtoul(buf, 0, &val); if (ret) return ret; - if (val == 1) { - get_online_cpus(); - if (cpu_online(cpu)) - ret = reload_for_cpu(cpu); - put_online_cpus(); + if (val != 1) + return size; + + get_online_cpus(); + mutex_lock(µcode_mutex); + for_each_online_cpu(cpu) { + tmp_ret = reload_for_cpu(cpu); + if (tmp_ret != 0) + pr_warn("Error reloading microcode on CPU %d\n", cpu); + + /* save retval of the first encountered reload error */ + if (!ret) + ret = tmp_ret; } + if (!ret) + perf_check_microcode(); + mutex_unlock(µcode_mutex); + put_online_cpus(); if (!ret) ret = size; @@ -339,7 +350,6 @@ static DEVICE_ATTR(version, 0400, version_show, NULL); static DEVICE_ATTR(processor_flags, 0400, pf_show, NULL); static struct attribute *mc_default_attrs[] = { - &dev_attr_reload.attr, &dev_attr_version.attr, &dev_attr_processor_flags.attr, NULL @@ -504,7 +514,7 @@ static struct notifier_block __refdata mc_cpu_notifier = { #ifdef MODULE /* Autoload on Intel and AMD systems */ -static const struct x86_cpu_id microcode_id[] = { +static const struct x86_cpu_id __initconst microcode_id[] = { #ifdef CONFIG_MICROCODE_INTEL { X86_VENDOR_INTEL, X86_FAMILY_ANY, X86_MODEL_ANY, }, #endif @@ -516,6 +526,16 @@ static const struct x86_cpu_id microcode_id[] = { MODULE_DEVICE_TABLE(x86cpu, microcode_id); #endif +static struct attribute *cpu_root_microcode_attrs[] = { + &dev_attr_reload.attr, + NULL +}; + +static struct attribute_group cpu_root_microcode_group = { + .name = "microcode", + .attrs = cpu_root_microcode_attrs, +}; + static int __init microcode_init(void) { struct cpuinfo_x86 *c = &cpu_data(0); @@ -540,16 +560,25 @@ static int __init microcode_init(void) mutex_lock(µcode_mutex); error = subsys_interface_register(&mc_cpu_interface); - + if (!error) + perf_check_microcode(); mutex_unlock(µcode_mutex); put_online_cpus(); if (error) goto out_pdev; + error = sysfs_create_group(&cpu_subsys.dev_root->kobj, + &cpu_root_microcode_group); + + if (error) { + pr_err("Error creating microcode group!\n"); + goto out_driver; + } + error = microcode_dev_init(); if (error) - goto out_driver; + goto out_ucode_group; register_syscore_ops(&mc_syscore_ops); register_hotcpu_notifier(&mc_cpu_notifier); @@ -559,7 +588,11 @@ static int __init microcode_init(void) return 0; -out_driver: + out_ucode_group: + sysfs_remove_group(&cpu_subsys.dev_root->kobj, + &cpu_root_microcode_group); + + out_driver: get_online_cpus(); mutex_lock(µcode_mutex); @@ -568,7 +601,7 @@ out_driver: mutex_unlock(µcode_mutex); put_online_cpus(); -out_pdev: + out_pdev: platform_device_unregister(microcode_pdev); return error; @@ -584,6 +617,9 @@ static void __exit microcode_exit(void) unregister_hotcpu_notifier(&mc_cpu_notifier); unregister_syscore_ops(&mc_syscore_ops); + sysfs_remove_group(&cpu_subsys.dev_root->kobj, + &cpu_root_microcode_group); + get_online_cpus(); mutex_lock(µcode_mutex); diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index f21fd94ac897..202494d2ec6e 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -15,6 +15,9 @@ along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/moduleloader.h> #include <linux/elf.h> #include <linux/vmalloc.h> @@ -30,9 +33,14 @@ #include <asm/pgtable.h> #if 0 -#define DEBUGP printk +#define DEBUGP(fmt, ...) \ + printk(KERN_DEBUG fmt, ##__VA_ARGS__) #else -#define DEBUGP(fmt...) +#define DEBUGP(fmt, ...) \ +do { \ + if (0) \ + printk(KERN_DEBUG fmt, ##__VA_ARGS__); \ +} while (0) #endif void *module_alloc(unsigned long size) @@ -56,8 +64,8 @@ int apply_relocate(Elf32_Shdr *sechdrs, Elf32_Sym *sym; uint32_t *location; - DEBUGP("Applying relocate section %u to %u\n", relsec, - sechdrs[relsec].sh_info); + DEBUGP("Applying relocate section %u to %u\n", + relsec, sechdrs[relsec].sh_info); for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) { /* This is where to make the change */ location = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr @@ -77,7 +85,7 @@ int apply_relocate(Elf32_Shdr *sechdrs, *location += sym->st_value - (uint32_t)location; break; default: - printk(KERN_ERR "module %s: Unknown relocation: %u\n", + pr_err("%s: Unknown relocation: %u\n", me->name, ELF32_R_TYPE(rel[i].r_info)); return -ENOEXEC; } @@ -97,8 +105,8 @@ int apply_relocate_add(Elf64_Shdr *sechdrs, void *loc; u64 val; - DEBUGP("Applying relocate section %u to %u\n", relsec, - sechdrs[relsec].sh_info); + DEBUGP("Applying relocate section %u to %u\n", + relsec, sechdrs[relsec].sh_info); for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) { /* This is where to make the change */ loc = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr @@ -110,8 +118,8 @@ int apply_relocate_add(Elf64_Shdr *sechdrs, + ELF64_R_SYM(rel[i].r_info); DEBUGP("type %d st_value %Lx r_addend %Lx loc %Lx\n", - (int)ELF64_R_TYPE(rel[i].r_info), - sym->st_value, rel[i].r_addend, (u64)loc); + (int)ELF64_R_TYPE(rel[i].r_info), + sym->st_value, rel[i].r_addend, (u64)loc); val = sym->st_value + rel[i].r_addend; @@ -140,7 +148,7 @@ int apply_relocate_add(Elf64_Shdr *sechdrs, #endif break; default: - printk(KERN_ERR "module %s: Unknown rela relocation: %llu\n", + pr_err("%s: Unknown rela relocation: %llu\n", me->name, ELF64_R_TYPE(rel[i].r_info)); return -ENOEXEC; } @@ -148,9 +156,9 @@ int apply_relocate_add(Elf64_Shdr *sechdrs, return 0; overflow: - printk(KERN_ERR "overflow in relocation type %d val %Lx\n", + pr_err("overflow in relocation type %d val %Lx\n", (int)ELF64_R_TYPE(rel[i].r_info), val); - printk(KERN_ERR "`%s' likely not compiled with -mcmodel=kernel\n", + pr_err("`%s' likely not compiled with -mcmodel=kernel\n", me->name); return -ENOEXEC; } diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index a0b2f84457be..f84f5c57de35 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -365,8 +365,9 @@ static __kprobes void default_do_nmi(struct pt_regs *regs) #ifdef CONFIG_X86_32 /* * For i386, NMIs use the same stack as the kernel, and we can - * add a workaround to the iret problem in C. Simply have 3 states - * the NMI can be in. + * add a workaround to the iret problem in C (preventing nested + * NMIs if an NMI takes a trap). Simply have 3 states the NMI + * can be in: * * 1) not running * 2) executing @@ -383,32 +384,50 @@ static __kprobes void default_do_nmi(struct pt_regs *regs) * If an NMI hits a breakpoint that executes an iret, another * NMI can preempt it. We do not want to allow this new NMI * to run, but we want to execute it when the first one finishes. - * We set the state to "latched", and the first NMI will perform - * an cmpxchg on the state, and if it doesn't successfully - * reset the state to "not running" it will restart the next - * NMI. + * We set the state to "latched", and the exit of the first NMI will + * perform a dec_return, if the result is zero (NOT_RUNNING), then + * it will simply exit the NMI handler. If not, the dec_return + * would have set the state to NMI_EXECUTING (what we want it to + * be when we are running). In this case, we simply jump back + * to rerun the NMI handler again, and restart the 'latched' NMI. + * + * No trap (breakpoint or page fault) should be hit before nmi_restart, + * thus there is no race between the first check of state for NOT_RUNNING + * and setting it to NMI_EXECUTING. The HW will prevent nested NMIs + * at this point. + * + * In case the NMI takes a page fault, we need to save off the CR2 + * because the NMI could have preempted another page fault and corrupt + * the CR2 that is about to be read. As nested NMIs must be restarted + * and they can not take breakpoints or page faults, the update of the + * CR2 must be done before converting the nmi state back to NOT_RUNNING. + * Otherwise, there would be a race of another nested NMI coming in + * after setting state to NOT_RUNNING but before updating the nmi_cr2. */ enum nmi_states { - NMI_NOT_RUNNING, + NMI_NOT_RUNNING = 0, NMI_EXECUTING, NMI_LATCHED, }; static DEFINE_PER_CPU(enum nmi_states, nmi_state); +static DEFINE_PER_CPU(unsigned long, nmi_cr2); #define nmi_nesting_preprocess(regs) \ do { \ - if (__get_cpu_var(nmi_state) != NMI_NOT_RUNNING) { \ - __get_cpu_var(nmi_state) = NMI_LATCHED; \ + if (this_cpu_read(nmi_state) != NMI_NOT_RUNNING) { \ + this_cpu_write(nmi_state, NMI_LATCHED); \ return; \ } \ - nmi_restart: \ - __get_cpu_var(nmi_state) = NMI_EXECUTING; \ - } while (0) + this_cpu_write(nmi_state, NMI_EXECUTING); \ + this_cpu_write(nmi_cr2, read_cr2()); \ + } while (0); \ + nmi_restart: #define nmi_nesting_postprocess() \ do { \ - if (cmpxchg(&__get_cpu_var(nmi_state), \ - NMI_EXECUTING, NMI_NOT_RUNNING) != NMI_EXECUTING) \ + if (unlikely(this_cpu_read(nmi_cr2) != read_cr2())) \ + write_cr2(this_cpu_read(nmi_cr2)); \ + if (this_cpu_dec_return(nmi_state)) \ goto nmi_restart; \ } while (0) #else /* x86_64 */ diff --git a/arch/x86/kernel/nmi_selftest.c b/arch/x86/kernel/nmi_selftest.c index e31bf8d5c4d2..6d9582ec0324 100644 --- a/arch/x86/kernel/nmi_selftest.c +++ b/arch/x86/kernel/nmi_selftest.c @@ -42,7 +42,8 @@ static int __init nmi_unk_cb(unsigned int val, struct pt_regs *regs) static void __init init_nmi_testsuite(void) { /* trap all the unknown NMIs we may generate */ - register_nmi_handler(NMI_UNKNOWN, nmi_unk_cb, 0, "nmi_selftest_unk"); + register_nmi_handler(NMI_UNKNOWN, nmi_unk_cb, 0, "nmi_selftest_unk", + __initdata); } static void __init cleanup_nmi_testsuite(void) @@ -65,7 +66,7 @@ static void __init test_nmi_ipi(struct cpumask *mask) unsigned long timeout; if (register_nmi_handler(NMI_LOCAL, test_nmi_ipi_callback, - NMI_FLAG_FIRST, "nmi_selftest")) { + NMI_FLAG_FIRST, "nmi_selftest", __initdata)) { nmi_fail = FAILURE; return; } diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 9ce885996fd7..17fff18a1031 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -352,9 +352,7 @@ struct pv_cpu_ops pv_cpu_ops = { #endif .wbinvd = native_wbinvd, .read_msr = native_read_msr_safe, - .rdmsr_regs = native_rdmsr_safe_regs, .write_msr = native_write_msr_safe, - .wrmsr_regs = native_wrmsr_safe_regs, .read_tsc = native_read_tsc, .read_pmc = native_read_pmc, .read_tscp = native_read_tscp, diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index b72838bae64a..299d49302e7d 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ b/arch/x86/kernel/pci-calgary_64.c @@ -22,6 +22,8 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ +#define pr_fmt(fmt) "Calgary: " fmt + #include <linux/kernel.h> #include <linux/init.h> #include <linux/types.h> @@ -245,7 +247,7 @@ static unsigned long iommu_range_alloc(struct device *dev, offset = iommu_area_alloc(tbl->it_map, tbl->it_size, 0, npages, 0, boundary_size, 0); if (offset == ~0UL) { - printk(KERN_WARNING "Calgary: IOMMU full.\n"); + pr_warn("IOMMU full\n"); spin_unlock_irqrestore(&tbl->it_lock, flags); if (panic_on_overflow) panic("Calgary: fix the allocator.\n"); @@ -271,8 +273,8 @@ static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl, entry = iommu_range_alloc(dev, tbl, npages); if (unlikely(entry == DMA_ERROR_CODE)) { - printk(KERN_WARNING "Calgary: failed to allocate %u pages in " - "iommu %p\n", npages, tbl); + pr_warn("failed to allocate %u pages in iommu %p\n", + npages, tbl); return DMA_ERROR_CODE; } @@ -561,8 +563,7 @@ static void calgary_tce_cache_blast(struct iommu_table *tbl) i++; } while ((val & 0xff) != 0xff && i < 100); if (i == 100) - printk(KERN_WARNING "Calgary: PCI bus not quiesced, " - "continuing anyway\n"); + pr_warn("PCI bus not quiesced, continuing anyway\n"); /* invalidate TCE cache */ target = calgary_reg(bbar, tar_offset(tbl->it_busno)); @@ -604,8 +605,7 @@ begin: i++; } while ((val64 & 0xff) != 0xff && i < 100); if (i == 100) - printk(KERN_WARNING "CalIOC2: PCI bus not quiesced, " - "continuing anyway\n"); + pr_warn("CalIOC2: PCI bus not quiesced, continuing anyway\n"); /* 3. poll Page Migration DEBUG for SoftStopFault */ target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_DEBUG); @@ -617,8 +617,7 @@ begin: if (++count < 100) goto begin; else { - printk(KERN_WARNING "CalIOC2: too many SoftStopFaults, " - "aborting TCE cache flush sequence!\n"); + pr_warn("CalIOC2: too many SoftStopFaults, aborting TCE cache flush sequence!\n"); return; /* pray for the best */ } } @@ -840,8 +839,8 @@ static void calgary_dump_error_regs(struct iommu_table *tbl) plssr = be32_to_cpu(readl(target)); /* If no error, the agent ID in the CSR is not valid */ - printk(KERN_EMERG "Calgary: DMA error on Calgary PHB 0x%x, " - "0x%08x@CSR 0x%08x@PLSSR\n", tbl->it_busno, csr, plssr); + pr_emerg("DMA error on Calgary PHB 0x%x, 0x%08x@CSR 0x%08x@PLSSR\n", + tbl->it_busno, csr, plssr); } static void calioc2_dump_error_regs(struct iommu_table *tbl) @@ -867,22 +866,21 @@ static void calioc2_dump_error_regs(struct iommu_table *tbl) target = calgary_reg(bbar, phboff | 0x800); mck = be32_to_cpu(readl(target)); - printk(KERN_EMERG "Calgary: DMA error on CalIOC2 PHB 0x%x\n", - tbl->it_busno); + pr_emerg("DMA error on CalIOC2 PHB 0x%x\n", tbl->it_busno); - printk(KERN_EMERG "Calgary: 0x%08x@CSR 0x%08x@PLSSR 0x%08x@CSMR 0x%08x@MCK\n", - csr, plssr, csmr, mck); + pr_emerg("0x%08x@CSR 0x%08x@PLSSR 0x%08x@CSMR 0x%08x@MCK\n", + csr, plssr, csmr, mck); /* dump rest of error regs */ - printk(KERN_EMERG "Calgary: "); + pr_emerg(""); for (i = 0; i < ARRAY_SIZE(errregs); i++) { /* err regs are at 0x810 - 0x870 */ erroff = (0x810 + (i * 0x10)); target = calgary_reg(bbar, phboff | erroff); errregs[i] = be32_to_cpu(readl(target)); - printk("0x%08x@0x%lx ", errregs[i], erroff); + pr_cont("0x%08x@0x%lx ", errregs[i], erroff); } - printk("\n"); + pr_cont("\n"); /* root complex status */ target = calgary_reg(bbar, phboff | PHB_ROOT_COMPLEX_STATUS); diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 62c9457ccd2f..c0f420f76cd3 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -100,7 +100,7 @@ void *dma_generic_alloc_coherent(struct device *dev, size_t size, struct dma_attrs *attrs) { unsigned long dma_mask; - struct page *page = NULL; + struct page *page; unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT; dma_addr_t addr; @@ -108,6 +108,7 @@ void *dma_generic_alloc_coherent(struct device *dev, size_t size, flag |= __GFP_ZERO; again: + page = NULL; if (!(flag & GFP_ATOMIC)) page = dma_alloc_from_contiguous(dev, count, get_order(size)); if (!page) diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 735279e54e59..ef6a8456f719 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -1,3 +1,5 @@ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/errno.h> #include <linux/kernel.h> #include <linux/mm.h> @@ -145,16 +147,14 @@ void show_regs_common(void) /* Board Name is optional */ board = dmi_get_system_info(DMI_BOARD_NAME); - printk(KERN_CONT "\n"); - printk(KERN_DEFAULT "Pid: %d, comm: %.20s %s %s %.*s", - current->pid, current->comm, print_tainted(), - init_utsname()->release, - (int)strcspn(init_utsname()->version, " "), - init_utsname()->version); - printk(KERN_CONT " %s %s", vendor, product); - if (board) - printk(KERN_CONT "/%s", board); - printk(KERN_CONT "\n"); + printk(KERN_DEFAULT "Pid: %d, comm: %.20s %s %s %.*s %s %s%s%s\n", + current->pid, current->comm, print_tainted(), + init_utsname()->release, + (int)strcspn(init_utsname()->version, " "), + init_utsname()->version, + vendor, product, + board ? "/" : "", + board ? board : ""); } void flush_thread(void) @@ -645,7 +645,7 @@ static void amd_e400_idle(void) amd_e400_c1e_detected = true; if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC)) mark_tsc_unstable("TSC halt in AMD C1E"); - printk(KERN_INFO "System has AMD C1E enabled\n"); + pr_info("System has AMD C1E enabled\n"); } } @@ -659,8 +659,7 @@ static void amd_e400_idle(void) */ clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_FORCE, &cpu); - printk(KERN_INFO "Switch to broadcast mode on CPU%d\n", - cpu); + pr_info("Switch to broadcast mode on CPU%d\n", cpu); } clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu); @@ -681,8 +680,7 @@ void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) { #ifdef CONFIG_SMP if (pm_idle == poll_idle && smp_num_siblings > 1) { - printk_once(KERN_WARNING "WARNING: polling idle and HT enabled," - " performance may degrade.\n"); + pr_warn_once("WARNING: polling idle and HT enabled, performance may degrade\n"); } #endif if (pm_idle) @@ -692,11 +690,11 @@ void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) /* * One CPU supports mwait => All CPUs supports mwait */ - printk(KERN_INFO "using mwait in idle threads.\n"); + pr_info("using mwait in idle threads\n"); pm_idle = mwait_idle; } else if (cpu_has_amd_erratum(amd_erratum_400)) { /* E400: APIC timer interrupt does not wake up CPU from C1e */ - printk(KERN_INFO "using AMD E400 aware idle routine\n"); + pr_info("using AMD E400 aware idle routine\n"); pm_idle = amd_e400_idle; } else pm_idle = default_idle; @@ -715,7 +713,7 @@ static int __init idle_setup(char *str) return -EINVAL; if (!strcmp(str, "poll")) { - printk("using polling idle threads.\n"); + pr_info("using polling idle threads\n"); pm_idle = poll_idle; boot_option_idle_override = IDLE_POLL; } else if (!strcmp(str, "mwait")) { diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 61cdf7fdf099..0a980c9d7cb8 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -117,10 +117,10 @@ void release_thread(struct task_struct *dead_task) { if (dead_task->mm) { if (dead_task->mm->context.size) { - printk("WARNING: dead process %8s still has LDT? <%p/%d>\n", - dead_task->comm, - dead_task->mm->context.ldt, - dead_task->mm->context.size); + pr_warn("WARNING: dead process %8s still has LDT? <%p/%d>\n", + dead_task->comm, + dead_task->mm->context.ldt, + dead_task->mm->context.size); BUG(); } } @@ -466,7 +466,7 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr) task->thread.gs = addr; if (doit) { load_gs_index(0); - ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr); + ret = wrmsrl_safe(MSR_KERNEL_GS_BASE, addr); } } put_cpu(); @@ -494,7 +494,7 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr) /* set the selector to 0 to not confuse __switch_to */ loadsegment(fs, 0); - ret = checking_wrmsrl(MSR_FS_BASE, addr); + ret = wrmsrl_safe(MSR_FS_BASE, addr); } } put_cpu(); diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 79c45af81604..d0f81d3d388f 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -1,3 +1,5 @@ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/module.h> #include <linux/reboot.h> #include <linux/init.h> @@ -152,7 +154,8 @@ static int __init set_bios_reboot(const struct dmi_system_id *d) { if (reboot_type != BOOT_BIOS) { reboot_type = BOOT_BIOS; - printk(KERN_INFO "%s series board detected. Selecting BIOS-method for reboots.\n", d->ident); + pr_info("%s series board detected. Selecting %s-method for reboots.\n", + "BIOS", d->ident); } return 0; } @@ -207,8 +210,8 @@ static int __init set_pci_reboot(const struct dmi_system_id *d) { if (reboot_type != BOOT_CF9) { reboot_type = BOOT_CF9; - printk(KERN_INFO "%s series board detected. " - "Selecting PCI-method for reboots.\n", d->ident); + pr_info("%s series board detected. Selecting %s-method for reboots.\n", + "PCI", d->ident); } return 0; } @@ -217,7 +220,8 @@ static int __init set_kbd_reboot(const struct dmi_system_id *d) { if (reboot_type != BOOT_KBD) { reboot_type = BOOT_KBD; - printk(KERN_INFO "%s series board detected. Selecting KBD-method for reboot.\n", d->ident); + pr_info("%s series board detected. Selecting %s-method for reboot.\n", + "KBD", d->ident); } return 0; } @@ -451,6 +455,14 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 990"), }, }, + { /* Handle problems with rebooting on the Precision M6600. */ + .callback = set_pci_reboot, + .ident = "Dell OptiPlex 990", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "Precision M6600"), + }, + }, { } }; @@ -639,9 +651,11 @@ void native_machine_shutdown(void) set_cpus_allowed_ptr(current, cpumask_of(reboot_cpu_id)); /* - * O.K Now that I'm on the appropriate processor, - * stop all of the others. + * O.K Now that I'm on the appropriate processor, stop all of the + * others. Also disable the local irq to not receive the per-cpu + * timer interrupt which may trigger scheduler's load balance. */ + local_irq_disable(); stop_other_cpus(); #endif @@ -668,7 +682,7 @@ static void __machine_emergency_restart(int emergency) static void native_machine_restart(char *__unused) { - printk("machine restart\n"); + pr_notice("machine restart\n"); if (!reboot_force) machine_shutdown(); diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 21af737053aa..b280908a376e 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -6,6 +6,9 @@ * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes * 2000-2002 x86-64 support by Andi Kleen */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/sched.h> #include <linux/mm.h> #include <linux/smp.h> @@ -814,7 +817,7 @@ void signal_fault(struct pt_regs *regs, void __user *frame, char *where) me->comm, me->pid, where, frame, regs->ip, regs->sp, regs->orig_ax); print_vma_addr(" in ", regs->ip); - printk(KERN_CONT "\n"); + pr_cont("\n"); } force_sig(SIGSEGV, me); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index fd019d78b1f4..c1a310fb8309 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1,4 +1,4 @@ -/* + /* * x86 SMP booting functions * * (c) 1995 Alan Cox, Building #3 <alan@lxorguk.ukuu.org.uk> @@ -39,6 +39,8 @@ * Glauber Costa : i386 and x86_64 integration */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/init.h> #include <linux/smp.h> #include <linux/module.h> @@ -184,7 +186,7 @@ static void __cpuinit smp_callin(void) * boards) */ - pr_debug("CALLIN, before setup_local_APIC().\n"); + pr_debug("CALLIN, before setup_local_APIC()\n"); if (apic->smp_callin_clear_local_apic) apic->smp_callin_clear_local_apic(); setup_local_APIC(); @@ -255,22 +257,13 @@ notrace static void __cpuinit start_secondary(void *unused) check_tsc_sync_target(); /* - * We need to hold call_lock, so there is no inconsistency - * between the time smp_call_function() determines number of - * IPI recipients, and the time when the determination is made - * for which cpus receive the IPI. Holding this - * lock helps us to not include this cpu in a currently in progress - * smp_call_function(). - * * We need to hold vector_lock so there the set of online cpus * does not change while we are assigning vectors to cpus. Holding * this lock ensures we don't half assign or remove an irq from a cpu. */ - ipi_call_lock(); lock_vector_lock(); set_cpu_online(smp_processor_id(), true); unlock_vector_lock(); - ipi_call_unlock(); per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE; x86_platform.nmi_init(); @@ -349,9 +342,12 @@ static bool __cpuinit match_llc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) static bool __cpuinit match_mc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) { - if (c->phys_proc_id == o->phys_proc_id) - return topology_sane(c, o, "mc"); + if (c->phys_proc_id == o->phys_proc_id) { + if (cpu_has(c, X86_FEATURE_AMD_DCM)) + return true; + return topology_sane(c, o, "mc"); + } return false; } @@ -382,6 +378,15 @@ void __cpuinit set_cpu_sibling_map(int cpu) if ((i == cpu) || (has_mc && match_llc(c, o))) link_mask(llc_shared, cpu, i); + } + + /* + * This needs a separate iteration over the cpus because we rely on all + * cpu_sibling_mask links to be set-up. + */ + for_each_cpu(i, cpu_sibling_setup_mask) { + o = &cpu_data(i); + if ((i == cpu) || (has_mc && match_mc(c, o))) { link_mask(core, cpu, i); @@ -420,17 +425,16 @@ static void impress_friends(void) /* * Allow the user to impress friends. */ - pr_debug("Before bogomips.\n"); + pr_debug("Before bogomips\n"); for_each_possible_cpu(cpu) if (cpumask_test_cpu(cpu, cpu_callout_mask)) bogosum += cpu_data(cpu).loops_per_jiffy; - printk(KERN_INFO - "Total of %d processors activated (%lu.%02lu BogoMIPS).\n", + pr_info("Total of %d processors activated (%lu.%02lu BogoMIPS)\n", num_online_cpus(), bogosum/(500000/HZ), (bogosum/(5000/HZ))%100); - pr_debug("Before bogocount - setting activated=1.\n"); + pr_debug("Before bogocount - setting activated=1\n"); } void __inquire_remote_apic(int apicid) @@ -440,18 +444,17 @@ void __inquire_remote_apic(int apicid) int timeout; u32 status; - printk(KERN_INFO "Inquiring remote APIC 0x%x...\n", apicid); + pr_info("Inquiring remote APIC 0x%x...\n", apicid); for (i = 0; i < ARRAY_SIZE(regs); i++) { - printk(KERN_INFO "... APIC 0x%x %s: ", apicid, names[i]); + pr_info("... APIC 0x%x %s: ", apicid, names[i]); /* * Wait for idle. */ status = safe_apic_wait_icr_idle(); if (status) - printk(KERN_CONT - "a previous APIC delivery may have failed\n"); + pr_cont("a previous APIC delivery may have failed\n"); apic_icr_write(APIC_DM_REMRD | regs[i], apicid); @@ -464,10 +467,10 @@ void __inquire_remote_apic(int apicid) switch (status) { case APIC_ICR_RR_VALID: status = apic_read(APIC_RRR); - printk(KERN_CONT "%08x\n", status); + pr_cont("%08x\n", status); break; default: - printk(KERN_CONT "failed\n"); + pr_cont("failed\n"); } } } @@ -501,12 +504,12 @@ wakeup_secondary_cpu_via_nmi(int logical_apicid, unsigned long start_eip) apic_write(APIC_ESR, 0); accept_status = (apic_read(APIC_ESR) & 0xEF); } - pr_debug("NMI sent.\n"); + pr_debug("NMI sent\n"); if (send_status) - printk(KERN_ERR "APIC never delivered???\n"); + pr_err("APIC never delivered???\n"); if (accept_status) - printk(KERN_ERR "APIC delivery error (%lx).\n", accept_status); + pr_err("APIC delivery error (%lx)\n", accept_status); return (send_status | accept_status); } @@ -528,7 +531,7 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) apic_read(APIC_ESR); } - pr_debug("Asserting INIT.\n"); + pr_debug("Asserting INIT\n"); /* * Turn INIT on target chip @@ -544,7 +547,7 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) mdelay(10); - pr_debug("Deasserting INIT.\n"); + pr_debug("Deasserting INIT\n"); /* Target chip */ /* Send IPI */ @@ -577,14 +580,14 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) /* * Run STARTUP IPI loop. */ - pr_debug("#startup loops: %d.\n", num_starts); + pr_debug("#startup loops: %d\n", num_starts); for (j = 1; j <= num_starts; j++) { - pr_debug("Sending STARTUP #%d.\n", j); + pr_debug("Sending STARTUP #%d\n", j); if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ apic_write(APIC_ESR, 0); apic_read(APIC_ESR); - pr_debug("After apic_write.\n"); + pr_debug("After apic_write\n"); /* * STARTUP IPI @@ -601,7 +604,7 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) */ udelay(300); - pr_debug("Startup point 1.\n"); + pr_debug("Startup point 1\n"); pr_debug("Waiting for send to finish...\n"); send_status = safe_apic_wait_icr_idle(); @@ -616,12 +619,12 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) if (send_status || accept_status) break; } - pr_debug("After Startup.\n"); + pr_debug("After Startup\n"); if (send_status) - printk(KERN_ERR "APIC never delivered???\n"); + pr_err("APIC never delivered???\n"); if (accept_status) - printk(KERN_ERR "APIC delivery error (%lx).\n", accept_status); + pr_err("APIC delivery error (%lx)\n", accept_status); return (send_status | accept_status); } @@ -635,11 +638,11 @@ static void __cpuinit announce_cpu(int cpu, int apicid) if (system_state == SYSTEM_BOOTING) { if (node != current_node) { if (current_node > (-1)) - pr_cont(" Ok.\n"); + pr_cont(" OK\n"); current_node = node; pr_info("Booting Node %3d, Processors ", node); } - pr_cont(" #%d%s", cpu, cpu == (nr_cpu_ids - 1) ? " Ok.\n" : ""); + pr_cont(" #%d%s", cpu, cpu == (nr_cpu_ids - 1) ? " OK\n" : ""); return; } else pr_info("Booting Node %d Processor %d APIC 0x%x\n", @@ -719,9 +722,9 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu, struct task_struct *idle) /* * allow APs to start initializing. */ - pr_debug("Before Callout %d.\n", cpu); + pr_debug("Before Callout %d\n", cpu); cpumask_set_cpu(cpu, cpu_callout_mask); - pr_debug("After Callout %d.\n", cpu); + pr_debug("After Callout %d\n", cpu); /* * Wait 5s total for a response @@ -749,7 +752,7 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu, struct task_struct *idle) pr_err("CPU%d: Stuck ??\n", cpu); else /* trampoline code not run */ - pr_err("CPU%d: Not responding.\n", cpu); + pr_err("CPU%d: Not responding\n", cpu); if (apic->inquire_remote_apic) apic->inquire_remote_apic(apicid); } @@ -794,7 +797,7 @@ int __cpuinit native_cpu_up(unsigned int cpu, struct task_struct *tidle) if (apicid == BAD_APICID || apicid == boot_cpu_physical_apicid || !physid_isset(apicid, phys_cpu_present_map) || !apic->apic_id_valid(apicid)) { - printk(KERN_ERR "%s: bad cpu %d\n", __func__, cpu); + pr_err("%s: bad cpu %d\n", __func__, cpu); return -EINVAL; } @@ -875,9 +878,8 @@ static int __init smp_sanity_check(unsigned max_cpus) unsigned int cpu; unsigned nr; - printk(KERN_WARNING - "More than 8 CPUs detected - skipping them.\n" - "Use CONFIG_X86_BIGSMP.\n"); + pr_warn("More than 8 CPUs detected - skipping them\n" + "Use CONFIG_X86_BIGSMP\n"); nr = 0; for_each_present_cpu(cpu) { @@ -898,8 +900,7 @@ static int __init smp_sanity_check(unsigned max_cpus) #endif if (!physid_isset(hard_smp_processor_id(), phys_cpu_present_map)) { - printk(KERN_WARNING - "weird, boot CPU (#%d) not listed by the BIOS.\n", + pr_warn("weird, boot CPU (#%d) not listed by the BIOS\n", hard_smp_processor_id()); physid_set(hard_smp_processor_id(), phys_cpu_present_map); @@ -911,11 +912,10 @@ static int __init smp_sanity_check(unsigned max_cpus) */ if (!smp_found_config && !acpi_lapic) { preempt_enable(); - printk(KERN_NOTICE "SMP motherboard not detected.\n"); + pr_notice("SMP motherboard not detected\n"); disable_smp(); if (APIC_init_uniprocessor()) - printk(KERN_NOTICE "Local APIC not detected." - " Using dummy APIC emulation.\n"); + pr_notice("Local APIC not detected. Using dummy APIC emulation.\n"); return -1; } @@ -924,9 +924,8 @@ static int __init smp_sanity_check(unsigned max_cpus) * CPU too, but we do it for the sake of robustness anyway. */ if (!apic->check_phys_apicid_present(boot_cpu_physical_apicid)) { - printk(KERN_NOTICE - "weird, boot CPU (#%d) not listed by the BIOS.\n", - boot_cpu_physical_apicid); + pr_notice("weird, boot CPU (#%d) not listed by the BIOS\n", + boot_cpu_physical_apicid); physid_set(hard_smp_processor_id(), phys_cpu_present_map); } preempt_enable(); @@ -939,8 +938,7 @@ static int __init smp_sanity_check(unsigned max_cpus) if (!disable_apic) { pr_err("BIOS bug, local APIC #%d not detected!...\n", boot_cpu_physical_apicid); - pr_err("... forcing use of dummy APIC emulation." - "(tell your hw vendor)\n"); + pr_err("... forcing use of dummy APIC emulation (tell your hw vendor)\n"); } smpboot_clear_io_apic(); disable_ioapic_support(); @@ -953,7 +951,7 @@ static int __init smp_sanity_check(unsigned max_cpus) * If SMP should be disabled, then really disable it! */ if (!max_cpus) { - printk(KERN_INFO "SMP mode deactivated.\n"); + pr_info("SMP mode deactivated\n"); smpboot_clear_io_apic(); connect_bsp_APIC(); @@ -1005,7 +1003,7 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) if (smp_sanity_check(max_cpus) < 0) { - printk(KERN_INFO "SMP disabled\n"); + pr_info("SMP disabled\n"); disable_smp(); goto out; } @@ -1043,7 +1041,7 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) * Set up local APIC timer on boot CPU. */ - printk(KERN_INFO "CPU%d: ", 0); + pr_info("CPU%d: ", 0); print_cpu_info(&cpu_data(0)); x86_init.timers.setup_percpu_clockev(); @@ -1093,7 +1091,7 @@ void __init native_smp_prepare_boot_cpu(void) void __init native_smp_cpus_done(unsigned int max_cpus) { - pr_debug("Boot done.\n"); + pr_debug("Boot done\n"); nmi_selftest(); impress_friends(); @@ -1154,8 +1152,7 @@ __init void prefill_possible_map(void) /* nr_cpu_ids could be reduced via nr_cpus= */ if (possible > nr_cpu_ids) { - printk(KERN_WARNING - "%d Processors exceeds NR_CPUS limit of %d\n", + pr_warn("%d Processors exceeds NR_CPUS limit of %d\n", possible, nr_cpu_ids); possible = nr_cpu_ids; } @@ -1164,13 +1161,12 @@ __init void prefill_possible_map(void) if (!setup_max_cpus) #endif if (possible > i) { - printk(KERN_WARNING - "%d Processors exceeds max_cpus limit of %u\n", + pr_warn("%d Processors exceeds max_cpus limit of %u\n", possible, setup_max_cpus); possible = i; } - printk(KERN_INFO "SMP: Allowing %d CPUs, %d hotplug CPUs\n", + pr_info("Allowing %d CPUs, %d hotplug CPUs\n", possible, max_t(int, possible - num_processors, 0)); for (i = 0; i < possible; i++) diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 05b31d92f69c..b481341c9369 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -9,6 +9,9 @@ /* * Handle hardware traps and faults. */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/interrupt.h> #include <linux/kallsyms.h> #include <linux/spinlock.h> @@ -143,12 +146,11 @@ trap_signal: #ifdef CONFIG_X86_64 if (show_unhandled_signals && unhandled_signal(tsk, signr) && printk_ratelimit()) { - printk(KERN_INFO - "%s[%d] trap %s ip:%lx sp:%lx error:%lx", - tsk->comm, tsk->pid, str, - regs->ip, regs->sp, error_code); + pr_info("%s[%d] trap %s ip:%lx sp:%lx error:%lx", + tsk->comm, tsk->pid, str, + regs->ip, regs->sp, error_code); print_vma_addr(" in ", regs->ip); - printk("\n"); + pr_cont("\n"); } #endif @@ -269,12 +271,11 @@ do_general_protection(struct pt_regs *regs, long error_code) if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && printk_ratelimit()) { - printk(KERN_INFO - "%s[%d] general protection ip:%lx sp:%lx error:%lx", + pr_info("%s[%d] general protection ip:%lx sp:%lx error:%lx", tsk->comm, task_pid_nr(tsk), regs->ip, regs->sp, error_code); print_vma_addr(" in ", regs->ip); - printk("\n"); + pr_cont("\n"); } force_sig(SIGSEGV, tsk); @@ -570,7 +571,7 @@ do_spurious_interrupt_bug(struct pt_regs *regs, long error_code) conditional_sti(regs); #if 0 /* No need to warn about this any longer. */ - printk(KERN_INFO "Ignoring P6 Local APIC Spurious Interrupt Bug...\n"); + pr_info("Ignoring P6 Local APIC Spurious Interrupt Bug...\n"); #endif } diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index fc0a147e3727..cfa5d4f7ca56 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -1,3 +1,5 @@ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/kernel.h> #include <linux/sched.h> #include <linux/init.h> @@ -84,8 +86,7 @@ EXPORT_SYMBOL_GPL(check_tsc_unstable); #ifdef CONFIG_X86_TSC int __init notsc_setup(char *str) { - printk(KERN_WARNING "notsc: Kernel compiled with CONFIG_X86_TSC, " - "cannot disable TSC completely.\n"); + pr_warn("Kernel compiled with CONFIG_X86_TSC, cannot disable TSC completely\n"); tsc_disabled = 1; return 1; } @@ -373,7 +374,7 @@ static unsigned long quick_pit_calibrate(void) goto success; } } - printk("Fast TSC calibration failed\n"); + pr_err("Fast TSC calibration failed\n"); return 0; success: @@ -392,7 +393,7 @@ success: */ delta *= PIT_TICK_RATE; do_div(delta, i*256*1000); - printk("Fast TSC calibration using PIT\n"); + pr_info("Fast TSC calibration using PIT\n"); return delta; } @@ -487,9 +488,8 @@ unsigned long native_calibrate_tsc(void) * use the reference value, as it is more precise. */ if (delta >= 90 && delta <= 110) { - printk(KERN_INFO - "TSC: PIT calibration matches %s. %d loops\n", - hpet ? "HPET" : "PMTIMER", i + 1); + pr_info("PIT calibration matches %s. %d loops\n", + hpet ? "HPET" : "PMTIMER", i + 1); return tsc_ref_min; } @@ -511,38 +511,36 @@ unsigned long native_calibrate_tsc(void) */ if (tsc_pit_min == ULONG_MAX) { /* PIT gave no useful value */ - printk(KERN_WARNING "TSC: Unable to calibrate against PIT\n"); + pr_warn("Unable to calibrate against PIT\n"); /* We don't have an alternative source, disable TSC */ if (!hpet && !ref1 && !ref2) { - printk("TSC: No reference (HPET/PMTIMER) available\n"); + pr_notice("No reference (HPET/PMTIMER) available\n"); return 0; } /* The alternative source failed as well, disable TSC */ if (tsc_ref_min == ULONG_MAX) { - printk(KERN_WARNING "TSC: HPET/PMTIMER calibration " - "failed.\n"); + pr_warn("HPET/PMTIMER calibration failed\n"); return 0; } /* Use the alternative source */ - printk(KERN_INFO "TSC: using %s reference calibration\n", - hpet ? "HPET" : "PMTIMER"); + pr_info("using %s reference calibration\n", + hpet ? "HPET" : "PMTIMER"); return tsc_ref_min; } /* We don't have an alternative source, use the PIT calibration value */ if (!hpet && !ref1 && !ref2) { - printk(KERN_INFO "TSC: Using PIT calibration value\n"); + pr_info("Using PIT calibration value\n"); return tsc_pit_min; } /* The alternative source failed, use the PIT calibration value */ if (tsc_ref_min == ULONG_MAX) { - printk(KERN_WARNING "TSC: HPET/PMTIMER calibration failed. " - "Using PIT calibration\n"); + pr_warn("HPET/PMTIMER calibration failed. Using PIT calibration.\n"); return tsc_pit_min; } @@ -551,9 +549,9 @@ unsigned long native_calibrate_tsc(void) * the PIT value as we know that there are PMTIMERs around * running at double speed. At least we let the user know: */ - printk(KERN_WARNING "TSC: PIT calibration deviates from %s: %lu %lu.\n", - hpet ? "HPET" : "PMTIMER", tsc_pit_min, tsc_ref_min); - printk(KERN_INFO "TSC: Using PIT calibration value\n"); + pr_warn("PIT calibration deviates from %s: %lu %lu\n", + hpet ? "HPET" : "PMTIMER", tsc_pit_min, tsc_ref_min); + pr_info("Using PIT calibration value\n"); return tsc_pit_min; } @@ -785,7 +783,7 @@ void mark_tsc_unstable(char *reason) tsc_unstable = 1; sched_clock_stable = 0; disable_sched_clock_irqtime(); - printk(KERN_INFO "Marking TSC unstable due to %s\n", reason); + pr_info("Marking TSC unstable due to %s\n", reason); /* Change only the rating, when not registered */ if (clocksource_tsc.mult) clocksource_mark_unstable(&clocksource_tsc); @@ -912,9 +910,9 @@ static void tsc_refine_calibration_work(struct work_struct *work) goto out; tsc_khz = freq; - printk(KERN_INFO "Refined TSC clocksource calibration: " - "%lu.%03lu MHz.\n", (unsigned long)tsc_khz / 1000, - (unsigned long)tsc_khz % 1000); + pr_info("Refined TSC clocksource calibration: %lu.%03lu MHz\n", + (unsigned long)tsc_khz / 1000, + (unsigned long)tsc_khz % 1000); out: clocksource_register_khz(&clocksource_tsc, tsc_khz); @@ -970,9 +968,9 @@ void __init tsc_init(void) return; } - printk("Detected %lu.%03lu MHz processor.\n", - (unsigned long)cpu_khz / 1000, - (unsigned long)cpu_khz % 1000); + pr_info("Detected %lu.%03lu MHz processor\n", + (unsigned long)cpu_khz / 1000, + (unsigned long)cpu_khz % 1000); /* * Secondary CPUs do not run through tsc_init(), so set up diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index dc4e910a7d96..36fd42091fa7 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -409,9 +409,10 @@ static int validate_insn_bits(struct arch_uprobe *auprobe, struct mm_struct *mm, * arch_uprobe_analyze_insn - instruction analysis including validity and fixups. * @mm: the probed address space. * @arch_uprobe: the probepoint information. + * @addr: virtual address at which to install the probepoint * Return 0 on success or a -ve number on error. */ -int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm) +int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long addr) { int ret; struct insn insn; diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index 255f58ae71e8..54abcc0baf23 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -28,6 +28,8 @@ * */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/capability.h> #include <linux/errno.h> #include <linux/interrupt.h> @@ -137,14 +139,14 @@ struct pt_regs *save_v86_state(struct kernel_vm86_regs *regs) local_irq_enable(); if (!current->thread.vm86_info) { - printk("no vm86_info: BAD\n"); + pr_alert("no vm86_info: BAD\n"); do_exit(SIGSEGV); } set_flags(regs->pt.flags, VEFLAGS, X86_EFLAGS_VIF | current->thread.v86mask); tmp = copy_vm86_regs_to_user(¤t->thread.vm86_info->regs, regs); tmp += put_user(current->thread.screen_bitmap, ¤t->thread.vm86_info->screen_bitmap); if (tmp) { - printk("vm86: could not access userspace vm86_info\n"); + pr_alert("could not access userspace vm86_info\n"); do_exit(SIGSEGV); } diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 7515cf0e1805..8d141b309046 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -18,6 +18,8 @@ * use the vDSO. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/time.h> #include <linux/init.h> #include <linux/kernel.h> @@ -111,18 +113,13 @@ void update_vsyscall(struct timespec *wall_time, struct timespec *wtm, static void warn_bad_vsyscall(const char *level, struct pt_regs *regs, const char *message) { - static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); - struct task_struct *tsk; - - if (!show_unhandled_signals || !__ratelimit(&rs)) + if (!show_unhandled_signals) return; - tsk = current; - - printk("%s%s[%d] %s ip:%lx cs:%lx sp:%lx ax:%lx si:%lx di:%lx\n", - level, tsk->comm, task_pid_nr(tsk), - message, regs->ip, regs->cs, - regs->sp, regs->ax, regs->si, regs->di); + pr_notice_ratelimited("%s%s[%d] %s ip:%lx cs:%lx sp:%lx ax:%lx si:%lx di:%lx\n", + level, current->comm, task_pid_nr(current), + message, regs->ip, regs->cs, + regs->sp, regs->ax, regs->si, regs->di); } static int addr_to_vsyscall_nr(unsigned long addr) @@ -139,6 +136,19 @@ static int addr_to_vsyscall_nr(unsigned long addr) return nr; } +#ifdef CONFIG_SECCOMP +static int vsyscall_seccomp(struct task_struct *tsk, int syscall_nr) +{ + if (!seccomp_mode(&tsk->seccomp)) + return 0; + task_pt_regs(tsk)->orig_ax = syscall_nr; + task_pt_regs(tsk)->ax = syscall_nr; + return __secure_computing(syscall_nr); +} +#else +#define vsyscall_seccomp(_tsk, _nr) 0 +#endif + static bool write_ok_or_segv(unsigned long ptr, size_t size) { /* @@ -174,6 +184,7 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) int vsyscall_nr; int prev_sig_on_uaccess_error; long ret; + int skip; /* * No point in checking CS -- the only way to get here is a user mode @@ -205,9 +216,6 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) } tsk = current; - if (seccomp_mode(&tsk->seccomp)) - do_exit(SIGKILL); - /* * With a real vsyscall, page faults cause SIGSEGV. We want to * preserve that behavior to make writing exploits harder. @@ -222,8 +230,13 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) * address 0". */ ret = -EFAULT; + skip = 0; switch (vsyscall_nr) { case 0: + skip = vsyscall_seccomp(tsk, __NR_gettimeofday); + if (skip) + break; + if (!write_ok_or_segv(regs->di, sizeof(struct timeval)) || !write_ok_or_segv(regs->si, sizeof(struct timezone))) break; @@ -234,6 +247,10 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) break; case 1: + skip = vsyscall_seccomp(tsk, __NR_time); + if (skip) + break; + if (!write_ok_or_segv(regs->di, sizeof(time_t))) break; @@ -241,6 +258,10 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) break; case 2: + skip = vsyscall_seccomp(tsk, __NR_getcpu); + if (skip) + break; + if (!write_ok_or_segv(regs->di, sizeof(unsigned)) || !write_ok_or_segv(regs->si, sizeof(unsigned))) break; @@ -253,6 +274,12 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) current_thread_info()->sig_on_uaccess_error = prev_sig_on_uaccess_error; + if (skip) { + if ((long)regs->ax <= 0L) /* seccomp errno emulation */ + goto do_ret; + goto done; /* seccomp trace/trap */ + } + if (ret == -EFAULT) { /* Bad news -- userspace fed a bad pointer to a vsyscall. */ warn_bad_vsyscall(KERN_INFO, regs, @@ -271,10 +298,11 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) regs->ax = ret; +do_ret: /* Emulate a ret instruction. */ regs->ip = caller; regs->sp += 8; - +done: return true; sigsegv: diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c index 9796c2f3d074..6020f6f5927c 100644 --- a/arch/x86/kernel/x8664_ksyms_64.c +++ b/arch/x86/kernel/x8664_ksyms_64.c @@ -28,6 +28,7 @@ EXPORT_SYMBOL(__put_user_8); EXPORT_SYMBOL(copy_user_generic_string); EXPORT_SYMBOL(copy_user_generic_unrolled); +EXPORT_SYMBOL(copy_user_enhanced_fast_string); EXPORT_SYMBOL(__copy_user_nocache); EXPORT_SYMBOL(_copy_from_user); EXPORT_SYMBOL(_copy_to_user); diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index bd18149b2b0f..3d3e20709119 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -3,6 +3,9 @@ * * Author: Suresh Siddha <suresh.b.siddha@intel.com> */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/bootmem.h> #include <linux/compat.h> #include <asm/i387.h> @@ -162,7 +165,7 @@ int save_i387_xstate(void __user *buf) BUG_ON(sig_xstate_size < xstate_size); if ((unsigned long)buf % 64) - printk("save_i387_xstate: bad fpstate %p\n", buf); + pr_err("%s: bad fpstate %p\n", __func__, buf); if (!used_math()) return 0; @@ -422,7 +425,7 @@ static void __init xstate_enable_boot_cpu(void) pcntxt_mask = eax + ((u64)edx << 32); if ((pcntxt_mask & XSTATE_FPSSE) != XSTATE_FPSSE) { - printk(KERN_ERR "FP/SSE not shown under xsave features 0x%llx\n", + pr_err("FP/SSE not shown under xsave features 0x%llx\n", pcntxt_mask); BUG(); } @@ -445,9 +448,8 @@ static void __init xstate_enable_boot_cpu(void) setup_xstate_init(); - printk(KERN_INFO "xsave/xrstor: enabled xstate_bv 0x%llx, " - "cntxt size 0x%x\n", - pcntxt_mask, xstate_size); + pr_info("enabled xstate_bv 0x%llx, cntxt size 0x%x\n", + pcntxt_mask, xstate_size); } /* diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index be3cea4407ff..57e168e27b5b 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3934,6 +3934,9 @@ static void kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm, { struct kvm_mmu_page *page; + if (list_empty(&kvm->arch.active_mmu_pages)) + return; + page = container_of(kvm->arch.active_mmu_pages.prev, struct kvm_mmu_page, link); kvm_mmu_prepare_zap_page(kvm, page, invalid_list); diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 2e88438ffd83..9b7ec1150ab0 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -80,10 +80,10 @@ static inline struct kvm_pmc *get_fixed_pmc_idx(struct kvm_pmu *pmu, int idx) static struct kvm_pmc *global_idx_to_pmc(struct kvm_pmu *pmu, int idx) { - if (idx < X86_PMC_IDX_FIXED) + if (idx < INTEL_PMC_IDX_FIXED) return get_gp_pmc(pmu, MSR_P6_EVNTSEL0 + idx, MSR_P6_EVNTSEL0); else - return get_fixed_pmc_idx(pmu, idx - X86_PMC_IDX_FIXED); + return get_fixed_pmc_idx(pmu, idx - INTEL_PMC_IDX_FIXED); } void kvm_deliver_pmi(struct kvm_vcpu *vcpu) @@ -291,7 +291,7 @@ static void reprogram_idx(struct kvm_pmu *pmu, int idx) if (pmc_is_gp(pmc)) reprogram_gp_counter(pmc, pmc->eventsel); else { - int fidx = idx - X86_PMC_IDX_FIXED; + int fidx = idx - INTEL_PMC_IDX_FIXED; reprogram_fixed_counter(pmc, fixed_en_pmi(pmu->fixed_ctr_ctrl, fidx), fidx); } @@ -452,7 +452,7 @@ void kvm_pmu_cpuid_update(struct kvm_vcpu *vcpu) return; pmu->nr_arch_gp_counters = min((int)(entry->eax >> 8) & 0xff, - X86_PMC_MAX_GENERIC); + INTEL_PMC_MAX_GENERIC); pmu->counter_bitmask[KVM_PMC_GP] = ((u64)1 << ((entry->eax >> 16) & 0xff)) - 1; bitmap_len = (entry->eax >> 24) & 0xff; @@ -462,13 +462,13 @@ void kvm_pmu_cpuid_update(struct kvm_vcpu *vcpu) pmu->nr_arch_fixed_counters = 0; } else { pmu->nr_arch_fixed_counters = min((int)(entry->edx & 0x1f), - X86_PMC_MAX_FIXED); + INTEL_PMC_MAX_FIXED); pmu->counter_bitmask[KVM_PMC_FIXED] = ((u64)1 << ((entry->edx >> 5) & 0xff)) - 1; } pmu->global_ctrl = ((1 << pmu->nr_arch_gp_counters) - 1) | - (((1ull << pmu->nr_arch_fixed_counters) - 1) << X86_PMC_IDX_FIXED); + (((1ull << pmu->nr_arch_fixed_counters) - 1) << INTEL_PMC_IDX_FIXED); pmu->global_ctrl_mask = ~pmu->global_ctrl; } @@ -478,15 +478,15 @@ void kvm_pmu_init(struct kvm_vcpu *vcpu) struct kvm_pmu *pmu = &vcpu->arch.pmu; memset(pmu, 0, sizeof(*pmu)); - for (i = 0; i < X86_PMC_MAX_GENERIC; i++) { + for (i = 0; i < INTEL_PMC_MAX_GENERIC; i++) { pmu->gp_counters[i].type = KVM_PMC_GP; pmu->gp_counters[i].vcpu = vcpu; pmu->gp_counters[i].idx = i; } - for (i = 0; i < X86_PMC_MAX_FIXED; i++) { + for (i = 0; i < INTEL_PMC_MAX_FIXED; i++) { pmu->fixed_counters[i].type = KVM_PMC_FIXED; pmu->fixed_counters[i].vcpu = vcpu; - pmu->fixed_counters[i].idx = i + X86_PMC_IDX_FIXED; + pmu->fixed_counters[i].idx = i + INTEL_PMC_IDX_FIXED; } init_irq_work(&pmu->irq_work, trigger_pmi); kvm_pmu_cpuid_update(vcpu); @@ -498,13 +498,13 @@ void kvm_pmu_reset(struct kvm_vcpu *vcpu) int i; irq_work_sync(&pmu->irq_work); - for (i = 0; i < X86_PMC_MAX_GENERIC; i++) { + for (i = 0; i < INTEL_PMC_MAX_GENERIC; i++) { struct kvm_pmc *pmc = &pmu->gp_counters[i]; stop_counter(pmc); pmc->counter = pmc->eventsel = 0; } - for (i = 0; i < X86_PMC_MAX_FIXED; i++) + for (i = 0; i < INTEL_PMC_MAX_FIXED; i++) stop_counter(&pmu->fixed_counters[i]); pmu->fixed_ctr_ctrl = pmu->global_ctrl = pmu->global_status = diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 911d2641f14c..62d02e3c3ed6 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -710,16 +710,6 @@ TRACE_EVENT(kvm_skinit, __entry->rip, __entry->slb) ); -#define __print_insn(insn, ilen) ({ \ - int i; \ - const char *ret = p->buffer + p->len; \ - \ - for (i = 0; i < ilen; ++i) \ - trace_seq_printf(p, " %02x", insn[i]); \ - trace_seq_printf(p, "%c", 0); \ - ret; \ - }) - #define KVM_EMUL_INSN_F_CR0_PE (1 << 0) #define KVM_EMUL_INSN_F_EFL_VM (1 << 1) #define KVM_EMUL_INSN_F_CS_D (1 << 2) @@ -786,7 +776,7 @@ TRACE_EVENT(kvm_emulate_insn, TP_printk("%x:%llx:%s (%s)%s", __entry->csbase, __entry->rip, - __print_insn(__entry->insn, __entry->len), + __print_hex(__entry->insn, __entry->len), __print_symbolic(__entry->flags, kvm_trace_symbol_emul_flags), __entry->failed ? " failed" : "" diff --git a/arch/x86/lib/csum-wrappers_64.c b/arch/x86/lib/csum-wrappers_64.c index 459b58a8a15c..25b7ae8d058a 100644 --- a/arch/x86/lib/csum-wrappers_64.c +++ b/arch/x86/lib/csum-wrappers_64.c @@ -115,7 +115,7 @@ EXPORT_SYMBOL(csum_partial_copy_to_user); * @src: source address * @dst: destination address * @len: number of bytes to be copied. - * @isum: initial sum that is added into the result (32bit unfolded) + * @sum: initial sum that is added into the result (32bit unfolded) * * Returns an 32bit unfolded checksum of the buffer. */ diff --git a/arch/x86/lib/msr-reg-export.c b/arch/x86/lib/msr-reg-export.c index a311cc59b65d..8d6ef78b5d01 100644 --- a/arch/x86/lib/msr-reg-export.c +++ b/arch/x86/lib/msr-reg-export.c @@ -1,5 +1,5 @@ #include <linux/module.h> #include <asm/msr.h> -EXPORT_SYMBOL(native_rdmsr_safe_regs); -EXPORT_SYMBOL(native_wrmsr_safe_regs); +EXPORT_SYMBOL(rdmsr_safe_regs); +EXPORT_SYMBOL(wrmsr_safe_regs); diff --git a/arch/x86/lib/msr-reg.S b/arch/x86/lib/msr-reg.S index 69fa10623f21..f6d13eefad10 100644 --- a/arch/x86/lib/msr-reg.S +++ b/arch/x86/lib/msr-reg.S @@ -6,13 +6,13 @@ #ifdef CONFIG_X86_64 /* - * int native_{rdmsr,wrmsr}_safe_regs(u32 gprs[8]); + * int {rdmsr,wrmsr}_safe_regs(u32 gprs[8]); * * reg layout: u32 gprs[eax, ecx, edx, ebx, esp, ebp, esi, edi] * */ .macro op_safe_regs op -ENTRY(native_\op\()_safe_regs) +ENTRY(\op\()_safe_regs) CFI_STARTPROC pushq_cfi %rbx pushq_cfi %rbp @@ -45,13 +45,13 @@ ENTRY(native_\op\()_safe_regs) _ASM_EXTABLE(1b, 3b) CFI_ENDPROC -ENDPROC(native_\op\()_safe_regs) +ENDPROC(\op\()_safe_regs) .endm #else /* X86_32 */ .macro op_safe_regs op -ENTRY(native_\op\()_safe_regs) +ENTRY(\op\()_safe_regs) CFI_STARTPROC pushl_cfi %ebx pushl_cfi %ebp @@ -92,7 +92,7 @@ ENTRY(native_\op\()_safe_regs) _ASM_EXTABLE(1b, 3b) CFI_ENDPROC -ENDPROC(native_\op\()_safe_regs) +ENDPROC(\op\()_safe_regs) .endm #endif diff --git a/arch/x86/lib/usercopy.c b/arch/x86/lib/usercopy.c index f61ee67ec00f..4f74d94c8d97 100644 --- a/arch/x86/lib/usercopy.c +++ b/arch/x86/lib/usercopy.c @@ -8,6 +8,7 @@ #include <linux/module.h> #include <asm/word-at-a-time.h> +#include <linux/sched.h> /* * best effort, GUP based copy_from_user() that is NMI-safe @@ -21,6 +22,9 @@ copy_from_user_nmi(void *to, const void __user *from, unsigned long n) void *map; int ret; + if (__range_not_ok(from, n, TASK_SIZE)) + return len; + do { ret = __get_user_pages_fast(addr, 1, 0, &page); if (!ret) diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt index 819137904428..5d7e51f3fd28 100644 --- a/arch/x86/lib/x86-opcode-map.txt +++ b/arch/x86/lib/x86-opcode-map.txt @@ -28,7 +28,7 @@ # - (66): the last prefix is 0x66 # - (F3): the last prefix is 0xF3 # - (F2): the last prefix is 0xF2 -# +# - (!F3) : the last prefix is not 0xF3 (including non-last prefix case) Table: one byte opcode Referrer: @@ -515,12 +515,12 @@ b4: LFS Gv,Mp b5: LGS Gv,Mp b6: MOVZX Gv,Eb b7: MOVZX Gv,Ew -b8: JMPE | POPCNT Gv,Ev (F3) +b8: JMPE (!F3) | POPCNT Gv,Ev (F3) b9: Grp10 (1A) ba: Grp8 Ev,Ib (1A) bb: BTC Ev,Gv -bc: BSF Gv,Ev | TZCNT Gv,Ev (F3) -bd: BSR Gv,Ev | LZCNT Gv,Ev (F3) +bc: BSF Gv,Ev (!F3) | TZCNT Gv,Ev (F3) +bd: BSR Gv,Ev (!F3) | LZCNT Gv,Ev (F3) be: MOVSX Gv,Eb bf: MOVSX Gv,Ew # 0x0f 0xc0-0xcf diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 97141c26a13a..e0e6990723e9 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -62,7 +62,8 @@ static void __init find_early_table_space(struct map_range *mr, unsigned long en extra += PMD_SIZE; #endif /* The first 2/4M doesn't use large pages. */ - extra += mr->end - mr->start; + if (mr->start < PMD_SIZE) + extra += mr->end - mr->start; ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT; } else @@ -384,7 +385,7 @@ void free_initmem(void) } #ifdef CONFIG_BLK_DEV_INITRD -void free_initrd_mem(unsigned long start, unsigned long end) +void __init free_initrd_mem(unsigned long start, unsigned long end) { /* * end could be not aligned, and We can not align that, diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index be1ef574ce9a..78fe3f1ac49f 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -180,7 +180,7 @@ err_free_memtype: /** * ioremap_nocache - map bus memory into CPU space - * @offset: bus address of the memory + * @phys_addr: bus address of the memory * @size: size of the resource to map * * ioremap_nocache performs a platform specific sequence of operations to @@ -217,7 +217,7 @@ EXPORT_SYMBOL(ioremap_nocache); /** * ioremap_wc - map memory into CPU space write combined - * @offset: bus address of the memory + * @phys_addr: bus address of the memory * @size: size of the resource to map * * This version of ioremap ensures that the memory is marked write combining. diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index e1ebde315210..a718e0d23503 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -122,7 +122,7 @@ within(unsigned long addr, unsigned long start, unsigned long end) /** * clflush_cache_range - flush a cache range with clflush - * @addr: virtual start address + * @vaddr: virtual start address * @size: number of bytes to flush * * clflush is an unordered instruction which needs fencing with mfence diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c index 732af3a96183..4599c3e8bcb6 100644 --- a/arch/x86/mm/srat.c +++ b/arch/x86/mm/srat.c @@ -176,6 +176,8 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) return; } + node_set(node, numa_nodes_parsed); + printk(KERN_INFO "SRAT: Node %u PXM %u [mem %#010Lx-%#010Lx]\n", node, pxm, (unsigned long long) start, (unsigned long long) end - 1); diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index 303f08637826..b2b94438ff05 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -312,7 +312,7 @@ static int op_amd_fill_in_addresses(struct op_msrs * const msrs) goto fail; } /* both registers must be reserved */ - if (num_counters == AMD64_NUM_COUNTERS_F15H) { + if (num_counters == AMD64_NUM_COUNTERS_CORE) { msrs->counters[i].addr = MSR_F15H_PERF_CTR + (i << 1); msrs->controls[i].addr = MSR_F15H_PERF_CTL + (i << 1); } else { @@ -514,7 +514,7 @@ static int op_amd_init(struct oprofile_operations *ops) ops->create_files = setup_ibs_files; if (boot_cpu_data.x86 == 0x15) { - num_counters = AMD64_NUM_COUNTERS_F15H; + num_counters = AMD64_NUM_COUNTERS_CORE; } else { num_counters = AMD64_NUM_COUNTERS; } diff --git a/arch/x86/platform/mrst/early_printk_mrst.c b/arch/x86/platform/mrst/early_printk_mrst.c index 3c6e328483c7..028454f0c3a5 100644 --- a/arch/x86/platform/mrst/early_printk_mrst.c +++ b/arch/x86/platform/mrst/early_printk_mrst.c @@ -110,19 +110,16 @@ static struct kmsg_dumper dw_dumper; static int dumper_registered; static void dw_kmsg_dump(struct kmsg_dumper *dumper, - enum kmsg_dump_reason reason, - const char *s1, unsigned long l1, - const char *s2, unsigned long l2) + enum kmsg_dump_reason reason) { - int i; + static char line[1024]; + size_t len; /* When run to this, we'd better re-init the HW */ mrst_early_console_init(); - for (i = 0; i < l1; i++) - early_mrst_console.write(&early_mrst_console, s1 + i, 1); - for (i = 0; i < l2; i++) - early_mrst_console.write(&early_mrst_console, s2 + i, 1); + while (kmsg_dump_get_line(dumper, true, line, sizeof(line), &len)) + early_mrst_console.write(&early_mrst_console, line, len); } /* Set the ratio rate to 115200, 8n1, IRQ disabled */ diff --git a/arch/x86/platform/mrst/mrst.c b/arch/x86/platform/mrst/mrst.c index e31bcd8f2eee..fd41a9262d65 100644 --- a/arch/x86/platform/mrst/mrst.c +++ b/arch/x86/platform/mrst/mrst.c @@ -782,7 +782,7 @@ BLOCKING_NOTIFIER_HEAD(intel_scu_notifier); EXPORT_SYMBOL_GPL(intel_scu_notifier); /* Called by IPC driver */ -void intel_scu_devices_create(void) +void __devinit intel_scu_devices_create(void) { int i; diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c index 3ae0e61abd23..59880afa851f 100644 --- a/arch/x86/platform/uv/tlb_uv.c +++ b/arch/x86/platform/uv/tlb_uv.c @@ -1295,7 +1295,6 @@ static void __init enable_timeouts(void) */ mmr_image |= (1L << SOFTACK_MSHIFT); if (is_uv2_hub()) { - mmr_image &= ~(1L << UV2_LEG_SHFT); mmr_image |= (1L << UV2_EXT_SHFT); } write_mmr_misc_control(pnode, mmr_image); diff --git a/arch/x86/tools/gen-insn-attr-x86.awk b/arch/x86/tools/gen-insn-attr-x86.awk index 5f6a5b6c3a15..ddcf39b1a18d 100644 --- a/arch/x86/tools/gen-insn-attr-x86.awk +++ b/arch/x86/tools/gen-insn-attr-x86.awk @@ -66,9 +66,10 @@ BEGIN { rex_expr = "^REX(\\.[XRWB]+)*" fpu_expr = "^ESC" # TODO - lprefix1_expr = "\\(66\\)" + lprefix1_expr = "\\((66|!F3)\\)" lprefix2_expr = "\\(F3\\)" - lprefix3_expr = "\\(F2\\)" + lprefix3_expr = "\\((F2|!F3)\\)" + lprefix_expr = "\\((66|F2|F3)\\)" max_lprefix = 4 # All opcodes starting with lower-case 'v' or with (v1) superscript @@ -333,13 +334,16 @@ function convert_operands(count,opnd, i,j,imm,mod) if (match(ext, lprefix1_expr)) { lptable1[idx] = add_flags(lptable1[idx],flags) variant = "INAT_VARIANT" - } else if (match(ext, lprefix2_expr)) { + } + if (match(ext, lprefix2_expr)) { lptable2[idx] = add_flags(lptable2[idx],flags) variant = "INAT_VARIANT" - } else if (match(ext, lprefix3_expr)) { + } + if (match(ext, lprefix3_expr)) { lptable3[idx] = add_flags(lptable3[idx],flags) variant = "INAT_VARIANT" - } else { + } + if (!match(ext, lprefix_expr)){ table[idx] = add_flags(table[idx],flags) } } diff --git a/arch/x86/um/sys_call_table_32.c b/arch/x86/um/sys_call_table_32.c index 416bd40c0eba..68d1dc91b37b 100644 --- a/arch/x86/um/sys_call_table_32.c +++ b/arch/x86/um/sys_call_table_32.c @@ -39,9 +39,9 @@ #undef __SYSCALL_I386 #define __SYSCALL_I386(nr, sym, compat) [ nr ] = sym, -typedef void (*sys_call_ptr_t)(void); +typedef asmlinkage void (*sys_call_ptr_t)(void); -extern void sys_ni_syscall(void); +extern asmlinkage void sys_ni_syscall(void); const sys_call_ptr_t sys_call_table[] __cacheline_aligned = { /* diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c index 66e6d9359826..0faad646f5fd 100644 --- a/arch/x86/vdso/vdso32-setup.c +++ b/arch/x86/vdso/vdso32-setup.c @@ -205,9 +205,9 @@ void syscall32_cpu_init(void) { /* Load these always in case some future AMD CPU supports SYSENTER from compat mode too. */ - checking_wrmsrl(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS); - checking_wrmsrl(MSR_IA32_SYSENTER_ESP, 0ULL); - checking_wrmsrl(MSR_IA32_SYSENTER_EIP, (u64)ia32_sysenter_target); + wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS); + wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL); + wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)ia32_sysenter_target); wrmsrl(MSR_CSTAR, ia32_cstar_target); } diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index e74df9548a02..ed7d54985d0c 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -209,6 +209,9 @@ static void __init xen_banner(void) xen_feature(XENFEAT_mmu_pt_update_preserve_ad) ? " (preserve-AD)" : ""); } +#define CPUID_THERM_POWER_LEAF 6 +#define APERFMPERF_PRESENT 0 + static __read_mostly unsigned int cpuid_leaf1_edx_mask = ~0; static __read_mostly unsigned int cpuid_leaf1_ecx_mask = ~0; @@ -242,6 +245,11 @@ static void xen_cpuid(unsigned int *ax, unsigned int *bx, *dx = cpuid_leaf5_edx_val; return; + case CPUID_THERM_POWER_LEAF: + /* Disabling APERFMPERF for kernel usage */ + maskecx = ~(1 << APERFMPERF_PRESENT); + break; + case 0xb: /* Suppress extended topology stuff */ maskebx = 0; @@ -1116,9 +1124,7 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = { .wbinvd = native_wbinvd, .read_msr = native_read_msr_safe, - .rdmsr_regs = native_rdmsr_safe_regs, .write_msr = xen_write_msr_safe, - .wrmsr_regs = native_wrmsr_safe_regs, .read_tsc = native_read_tsc, .read_pmc = native_read_pmc, diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index ffd08c414e91..64effdc6da94 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -706,6 +706,7 @@ int m2p_add_override(unsigned long mfn, struct page *page, unsigned long uninitialized_var(address); unsigned level; pte_t *ptep = NULL; + int ret = 0; pfn = page_to_pfn(page); if (!PageHighMem(page)) { @@ -741,6 +742,24 @@ int m2p_add_override(unsigned long mfn, struct page *page, list_add(&page->lru, &m2p_overrides[mfn_hash(mfn)]); spin_unlock_irqrestore(&m2p_override_lock, flags); + /* p2m(m2p(mfn)) == mfn: the mfn is already present somewhere in + * this domain. Set the FOREIGN_FRAME_BIT in the p2m for the other + * pfn so that the following mfn_to_pfn(mfn) calls will return the + * pfn from the m2p_override (the backend pfn) instead. + * We need to do this because the pages shared by the frontend + * (xen-blkfront) can be already locked (lock_page, called by + * do_read_cache_page); when the userspace backend tries to use them + * with direct_IO, mfn_to_pfn returns the pfn of the frontend, so + * do_blockdev_direct_IO is going to try to lock the same pages + * again resulting in a deadlock. + * As a side effect get_user_pages_fast might not be safe on the + * frontend pages while they are being shared with the backend, + * because mfn_to_pfn (that ends up being called by GUPF) will + * return the backend pfn rather than the frontend pfn. */ + ret = __get_user(pfn, &machine_to_phys_mapping[mfn]); + if (ret == 0 && get_phys_to_machine(pfn) == mfn) + set_phys_to_machine(pfn, FOREIGN_FRAME(mfn)); + return 0; } EXPORT_SYMBOL_GPL(m2p_add_override); @@ -752,6 +771,7 @@ int m2p_remove_override(struct page *page, bool clear_pte) unsigned long uninitialized_var(address); unsigned level; pte_t *ptep = NULL; + int ret = 0; pfn = page_to_pfn(page); mfn = get_phys_to_machine(pfn); @@ -821,6 +841,22 @@ int m2p_remove_override(struct page *page, bool clear_pte) } else set_phys_to_machine(pfn, page->index); + /* p2m(m2p(mfn)) == FOREIGN_FRAME(mfn): the mfn is already present + * somewhere in this domain, even before being added to the + * m2p_override (see comment above in m2p_add_override). + * If there are no other entries in the m2p_override corresponding + * to this mfn, then remove the FOREIGN_FRAME_BIT from the p2m for + * the original pfn (the one shared by the frontend): the backend + * cannot do any IO on this page anymore because it has been + * unshared. Removing the FOREIGN_FRAME_BIT from the p2m entry of + * the original pfn causes mfn_to_pfn(mfn) to return the frontend + * pfn again. */ + mfn &= ~FOREIGN_FRAME_BIT; + ret = __get_user(pfn, &machine_to_phys_mapping[mfn]); + if (ret == 0 && get_phys_to_machine(pfn) == FOREIGN_FRAME(mfn) && + m2p_find_override(mfn) == NULL) + set_phys_to_machine(pfn, mfn); + return 0; } EXPORT_SYMBOL_GPL(m2p_remove_override); diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 3ebba0753d38..a4790bf22c59 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -371,7 +371,8 @@ char * __init xen_memory_setup(void) populated = xen_populate_chunk(map, memmap.nr_entries, max_pfn, &last_pfn, xen_released_pages); - extra_pages += (xen_released_pages - populated); + xen_released_pages -= populated; + extra_pages += xen_released_pages; if (last_pfn > max_pfn) { max_pfn = min(MAX_DOMAIN_PAGES, last_pfn); diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index afb250d22a6b..f58dca7a6e52 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -80,9 +80,7 @@ static void __cpuinit cpu_bringup(void) notify_cpu_starting(cpu); - ipi_call_lock(); set_cpu_online(cpu, true); - ipi_call_unlock(); this_cpu_write(cpu_state, CPU_ONLINE); |