diff options
Diffstat (limited to 'arch/x86/kernel/cpu/mcheck')
-rw-r--r-- | arch/x86/kernel/cpu/mcheck/Makefile | 2 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/mcheck/k7.c | 102 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/mcheck/mce.c | 90 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/mcheck/mce.h | 14 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/mcheck/non-fatal.c | 91 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/mcheck/p4.c | 253 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/mcheck/p5.c | 53 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/mcheck/p6.c | 119 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/mcheck/therm_throt.c | 186 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/mcheck/winchip.c | 36 |
10 files changed, 946 insertions, 0 deletions
diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile new file mode 100644 index 000000000000..f1ebe1c1c17a --- /dev/null +++ b/arch/x86/kernel/cpu/mcheck/Makefile @@ -0,0 +1,2 @@ +obj-y = mce.o k7.o p4.o p5.o p6.o winchip.o therm_throt.o +obj-$(CONFIG_X86_MCE_NONFATAL) += non-fatal.o diff --git a/arch/x86/kernel/cpu/mcheck/k7.c b/arch/x86/kernel/cpu/mcheck/k7.c new file mode 100644 index 000000000000..eef63e3630c2 --- /dev/null +++ b/arch/x86/kernel/cpu/mcheck/k7.c @@ -0,0 +1,102 @@ +/* + * Athlon/Hammer specific Machine Check Exception Reporting + * (C) Copyright 2002 Dave Jones <davej@codemonkey.org.uk> + */ + +#include <linux/init.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/interrupt.h> +#include <linux/smp.h> + +#include <asm/processor.h> +#include <asm/system.h> +#include <asm/msr.h> + +#include "mce.h" + +/* Machine Check Handler For AMD Athlon/Duron */ +static fastcall void k7_machine_check(struct pt_regs * regs, long error_code) +{ + int recover=1; + u32 alow, ahigh, high, low; + u32 mcgstl, mcgsth; + int i; + + rdmsr (MSR_IA32_MCG_STATUS, mcgstl, mcgsth); + if (mcgstl & (1<<0)) /* Recoverable ? */ + recover=0; + + printk (KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n", + smp_processor_id(), mcgsth, mcgstl); + + for (i=1; i<nr_mce_banks; i++) { + rdmsr (MSR_IA32_MC0_STATUS+i*4,low, high); + if (high&(1<<31)) { + if (high & (1<<29)) + recover |= 1; + if (high & (1<<25)) + recover |= 2; + printk (KERN_EMERG "Bank %d: %08x%08x", i, high, low); + high &= ~(1<<31); + if (high & (1<<27)) { + rdmsr (MSR_IA32_MC0_MISC+i*4, alow, ahigh); + printk ("[%08x%08x]", ahigh, alow); + } + if (high & (1<<26)) { + rdmsr (MSR_IA32_MC0_ADDR+i*4, alow, ahigh); + printk (" at %08x%08x", ahigh, alow); + } + printk ("\n"); + /* Clear it */ + wrmsr (MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL); + /* Serialize */ + wmb(); + add_taint(TAINT_MACHINE_CHECK); + } + } + + if (recover&2) + panic ("CPU context corrupt"); + if (recover&1) + panic ("Unable to continue"); + printk (KERN_EMERG "Attempting to continue.\n"); + mcgstl &= ~(1<<2); + wrmsr (MSR_IA32_MCG_STATUS,mcgstl, mcgsth); +} + + +/* AMD K7 machine check is Intel like */ +void amd_mcheck_init(struct cpuinfo_x86 *c) +{ + u32 l, h; + int i; + + if (!cpu_has(c, X86_FEATURE_MCE)) + return; + + machine_check_vector = k7_machine_check; + wmb(); + + printk (KERN_INFO "Intel machine check architecture supported.\n"); + rdmsr (MSR_IA32_MCG_CAP, l, h); + if (l & (1<<8)) /* Control register present ? */ + wrmsr (MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); + nr_mce_banks = l & 0xff; + + /* Clear status for MC index 0 separately, we don't touch CTL, + * as some K7 Athlons cause spurious MCEs when its enabled. */ + if (boot_cpu_data.x86 == 6) { + wrmsr (MSR_IA32_MC0_STATUS, 0x0, 0x0); + i = 1; + } else + i = 0; + for (; i<nr_mce_banks; i++) { + wrmsr (MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff); + wrmsr (MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0); + } + + set_in_cr4 (X86_CR4_MCE); + printk (KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n", + smp_processor_id()); +} diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c new file mode 100644 index 000000000000..34c781eddee4 --- /dev/null +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -0,0 +1,90 @@ +/* + * mce.c - x86 Machine Check Exception Reporting + * (c) 2002 Alan Cox <alan@redhat.com>, Dave Jones <davej@codemonkey.org.uk> + */ + +#include <linux/init.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/smp.h> +#include <linux/thread_info.h> + +#include <asm/processor.h> +#include <asm/system.h> +#include <asm/mce.h> + +#include "mce.h" + +int mce_disabled = 0; +int nr_mce_banks; + +EXPORT_SYMBOL_GPL(nr_mce_banks); /* non-fatal.o */ + +/* Handle unconfigured int18 (should never happen) */ +static fastcall void unexpected_machine_check(struct pt_regs * regs, long error_code) +{ + printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n", smp_processor_id()); +} + +/* Call the installed machine check handler for this CPU setup. */ +void fastcall (*machine_check_vector)(struct pt_regs *, long error_code) = unexpected_machine_check; + +/* This has to be run for each processor */ +void mcheck_init(struct cpuinfo_x86 *c) +{ + if (mce_disabled==1) + return; + + switch (c->x86_vendor) { + case X86_VENDOR_AMD: + amd_mcheck_init(c); + break; + + case X86_VENDOR_INTEL: + if (c->x86==5) + intel_p5_mcheck_init(c); + if (c->x86==6) + intel_p6_mcheck_init(c); + if (c->x86==15) + intel_p4_mcheck_init(c); + break; + + case X86_VENDOR_CENTAUR: + if (c->x86==5) + winchip_mcheck_init(c); + break; + + default: + break; + } +} + +static unsigned long old_cr4 __initdata; + +void __init stop_mce(void) +{ + old_cr4 = read_cr4(); + clear_in_cr4(X86_CR4_MCE); +} + +void __init restart_mce(void) +{ + if (old_cr4 & X86_CR4_MCE) + set_in_cr4(X86_CR4_MCE); +} + +static int __init mcheck_disable(char *str) +{ + mce_disabled = 1; + return 1; +} + +static int __init mcheck_enable(char *str) +{ + mce_disabled = -1; + return 1; +} + +__setup("nomce", mcheck_disable); +__setup("mce", mcheck_enable); diff --git a/arch/x86/kernel/cpu/mcheck/mce.h b/arch/x86/kernel/cpu/mcheck/mce.h new file mode 100644 index 000000000000..81fb6e2d35f3 --- /dev/null +++ b/arch/x86/kernel/cpu/mcheck/mce.h @@ -0,0 +1,14 @@ +#include <linux/init.h> +#include <asm/mce.h> + +void amd_mcheck_init(struct cpuinfo_x86 *c); +void intel_p4_mcheck_init(struct cpuinfo_x86 *c); +void intel_p5_mcheck_init(struct cpuinfo_x86 *c); +void intel_p6_mcheck_init(struct cpuinfo_x86 *c); +void winchip_mcheck_init(struct cpuinfo_x86 *c); + +/* Call the installed machine check handler for this CPU setup. */ +extern fastcall void (*machine_check_vector)(struct pt_regs *, long error_code); + +extern int nr_mce_banks; + diff --git a/arch/x86/kernel/cpu/mcheck/non-fatal.c b/arch/x86/kernel/cpu/mcheck/non-fatal.c new file mode 100644 index 000000000000..bf39409b3838 --- /dev/null +++ b/arch/x86/kernel/cpu/mcheck/non-fatal.c @@ -0,0 +1,91 @@ +/* + * Non Fatal Machine Check Exception Reporting + * + * (C) Copyright 2002 Dave Jones. <davej@codemonkey.org.uk> + * + * This file contains routines to check for non-fatal MCEs every 15s + * + */ + +#include <linux/init.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/jiffies.h> +#include <linux/workqueue.h> +#include <linux/interrupt.h> +#include <linux/smp.h> +#include <linux/module.h> + +#include <asm/processor.h> +#include <asm/system.h> +#include <asm/msr.h> + +#include "mce.h" + +static int firstbank; + +#define MCE_RATE 15*HZ /* timer rate is 15s */ + +static void mce_checkregs (void *info) +{ + u32 low, high; + int i; + + for (i=firstbank; i<nr_mce_banks; i++) { + rdmsr (MSR_IA32_MC0_STATUS+i*4, low, high); + + if (high & (1<<31)) { + printk(KERN_INFO "MCE: The hardware reports a non " + "fatal, correctable incident occurred on " + "CPU %d.\n", + smp_processor_id()); + printk (KERN_INFO "Bank %d: %08x%08x\n", i, high, low); + + /* Scrub the error so we don't pick it up in MCE_RATE seconds time. */ + wrmsr (MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL); + + /* Serialize */ + wmb(); + add_taint(TAINT_MACHINE_CHECK); + } + } +} + +static void mce_work_fn(struct work_struct *work); +static DECLARE_DELAYED_WORK(mce_work, mce_work_fn); + +static void mce_work_fn(struct work_struct *work) +{ + on_each_cpu(mce_checkregs, NULL, 1, 1); + schedule_delayed_work(&mce_work, round_jiffies_relative(MCE_RATE)); +} + +static int __init init_nonfatal_mce_checker(void) +{ + struct cpuinfo_x86 *c = &boot_cpu_data; + + /* Check for MCE support */ + if (!cpu_has(c, X86_FEATURE_MCE)) + return -ENODEV; + + /* Check for PPro style MCA */ + if (!cpu_has(c, X86_FEATURE_MCA)) + return -ENODEV; + + /* Some Athlons misbehave when we frob bank 0 */ + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD && + boot_cpu_data.x86 == 6) + firstbank = 1; + else + firstbank = 0; + + /* + * Check for non-fatal errors every MCE_RATE s + */ + schedule_delayed_work(&mce_work, round_jiffies_relative(MCE_RATE)); + printk(KERN_INFO "Machine check exception polling timer started.\n"); + return 0; +} +module_init(init_nonfatal_mce_checker); + +MODULE_LICENSE("GPL"); diff --git a/arch/x86/kernel/cpu/mcheck/p4.c b/arch/x86/kernel/cpu/mcheck/p4.c new file mode 100644 index 000000000000..1509edfb2313 --- /dev/null +++ b/arch/x86/kernel/cpu/mcheck/p4.c @@ -0,0 +1,253 @@ +/* + * P4 specific Machine Check Exception Reporting + */ + +#include <linux/init.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/interrupt.h> +#include <linux/smp.h> + +#include <asm/processor.h> +#include <asm/system.h> +#include <asm/msr.h> +#include <asm/apic.h> + +#include <asm/therm_throt.h> + +#include "mce.h" + +/* as supported by the P4/Xeon family */ +struct intel_mce_extended_msrs { + u32 eax; + u32 ebx; + u32 ecx; + u32 edx; + u32 esi; + u32 edi; + u32 ebp; + u32 esp; + u32 eflags; + u32 eip; + /* u32 *reserved[]; */ +}; + +static int mce_num_extended_msrs = 0; + + +#ifdef CONFIG_X86_MCE_P4THERMAL +static void unexpected_thermal_interrupt(struct pt_regs *regs) +{ + printk(KERN_ERR "CPU%d: Unexpected LVT TMR interrupt!\n", + smp_processor_id()); + add_taint(TAINT_MACHINE_CHECK); +} + +/* P4/Xeon Thermal transition interrupt handler */ +static void intel_thermal_interrupt(struct pt_regs *regs) +{ + __u64 msr_val; + + ack_APIC_irq(); + + rdmsrl(MSR_IA32_THERM_STATUS, msr_val); + therm_throt_process(msr_val & 0x1); +} + +/* Thermal interrupt handler for this CPU setup */ +static void (*vendor_thermal_interrupt)(struct pt_regs *regs) = unexpected_thermal_interrupt; + +fastcall void smp_thermal_interrupt(struct pt_regs *regs) +{ + irq_enter(); + vendor_thermal_interrupt(regs); + irq_exit(); +} + +/* P4/Xeon Thermal regulation detect and init */ +static void intel_init_thermal(struct cpuinfo_x86 *c) +{ + u32 l, h; + unsigned int cpu = smp_processor_id(); + + /* Thermal monitoring */ + if (!cpu_has(c, X86_FEATURE_ACPI)) + return; /* -ENODEV */ + + /* Clock modulation */ + if (!cpu_has(c, X86_FEATURE_ACC)) + return; /* -ENODEV */ + + /* first check if its enabled already, in which case there might + * be some SMM goo which handles it, so we can't even put a handler + * since it might be delivered via SMI already -zwanem. + */ + rdmsr (MSR_IA32_MISC_ENABLE, l, h); + h = apic_read(APIC_LVTTHMR); + if ((l & (1<<3)) && (h & APIC_DM_SMI)) { + printk(KERN_DEBUG "CPU%d: Thermal monitoring handled by SMI\n", + cpu); + return; /* -EBUSY */ + } + + /* check whether a vector already exists, temporarily masked? */ + if (h & APIC_VECTOR_MASK) { + printk(KERN_DEBUG "CPU%d: Thermal LVT vector (%#x) already " + "installed\n", + cpu, (h & APIC_VECTOR_MASK)); + return; /* -EBUSY */ + } + + /* The temperature transition interrupt handler setup */ + h = THERMAL_APIC_VECTOR; /* our delivery vector */ + h |= (APIC_DM_FIXED | APIC_LVT_MASKED); /* we'll mask till we're ready */ + apic_write_around(APIC_LVTTHMR, h); + + rdmsr (MSR_IA32_THERM_INTERRUPT, l, h); + wrmsr (MSR_IA32_THERM_INTERRUPT, l | 0x03 , h); + + /* ok we're good to go... */ + vendor_thermal_interrupt = intel_thermal_interrupt; + + rdmsr (MSR_IA32_MISC_ENABLE, l, h); + wrmsr (MSR_IA32_MISC_ENABLE, l | (1<<3), h); + + l = apic_read (APIC_LVTTHMR); + apic_write_around (APIC_LVTTHMR, l & ~APIC_LVT_MASKED); + printk (KERN_INFO "CPU%d: Thermal monitoring enabled\n", cpu); + + /* enable thermal throttle processing */ + atomic_set(&therm_throt_en, 1); + return; +} +#endif /* CONFIG_X86_MCE_P4THERMAL */ + + +/* P4/Xeon Extended MCE MSR retrieval, return 0 if unsupported */ +static inline void intel_get_extended_msrs(struct intel_mce_extended_msrs *r) +{ + u32 h; + + rdmsr (MSR_IA32_MCG_EAX, r->eax, h); + rdmsr (MSR_IA32_MCG_EBX, r->ebx, h); + rdmsr (MSR_IA32_MCG_ECX, r->ecx, h); + rdmsr (MSR_IA32_MCG_EDX, r->edx, h); + rdmsr (MSR_IA32_MCG_ESI, r->esi, h); + rdmsr (MSR_IA32_MCG_EDI, r->edi, h); + rdmsr (MSR_IA32_MCG_EBP, r->ebp, h); + rdmsr (MSR_IA32_MCG_ESP, r->esp, h); + rdmsr (MSR_IA32_MCG_EFLAGS, r->eflags, h); + rdmsr (MSR_IA32_MCG_EIP, r->eip, h); +} + +static fastcall void intel_machine_check(struct pt_regs * regs, long error_code) +{ + int recover=1; + u32 alow, ahigh, high, low; + u32 mcgstl, mcgsth; + int i; + + rdmsr (MSR_IA32_MCG_STATUS, mcgstl, mcgsth); + if (mcgstl & (1<<0)) /* Recoverable ? */ + recover=0; + + printk (KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n", + smp_processor_id(), mcgsth, mcgstl); + + if (mce_num_extended_msrs > 0) { + struct intel_mce_extended_msrs dbg; + intel_get_extended_msrs(&dbg); + printk (KERN_DEBUG "CPU %d: EIP: %08x EFLAGS: %08x\n", + smp_processor_id(), dbg.eip, dbg.eflags); + printk (KERN_DEBUG "\teax: %08x ebx: %08x ecx: %08x edx: %08x\n", + dbg.eax, dbg.ebx, dbg.ecx, dbg.edx); + printk (KERN_DEBUG "\tesi: %08x edi: %08x ebp: %08x esp: %08x\n", + dbg.esi, dbg.edi, dbg.ebp, dbg.esp); + } + + for (i=0; i<nr_mce_banks; i++) { + rdmsr (MSR_IA32_MC0_STATUS+i*4,low, high); + if (high & (1<<31)) { + if (high & (1<<29)) + recover |= 1; + if (high & (1<<25)) + recover |= 2; + printk (KERN_EMERG "Bank %d: %08x%08x", i, high, low); + high &= ~(1<<31); + if (high & (1<<27)) { + rdmsr (MSR_IA32_MC0_MISC+i*4, alow, ahigh); + printk ("[%08x%08x]", ahigh, alow); + } + if (high & (1<<26)) { + rdmsr (MSR_IA32_MC0_ADDR+i*4, alow, ahigh); + printk (" at %08x%08x", ahigh, alow); + } + printk ("\n"); + } + } + + if (recover & 2) + panic ("CPU context corrupt"); + if (recover & 1) + panic ("Unable to continue"); + + printk(KERN_EMERG "Attempting to continue.\n"); + /* + * Do not clear the MSR_IA32_MCi_STATUS if the error is not + * recoverable/continuable.This will allow BIOS to look at the MSRs + * for errors if the OS could not log the error. + */ + for (i=0; i<nr_mce_banks; i++) { + u32 msr; + msr = MSR_IA32_MC0_STATUS+i*4; + rdmsr (msr, low, high); + if (high&(1<<31)) { + /* Clear it */ + wrmsr(msr, 0UL, 0UL); + /* Serialize */ + wmb(); + add_taint(TAINT_MACHINE_CHECK); + } + } + mcgstl &= ~(1<<2); + wrmsr (MSR_IA32_MCG_STATUS,mcgstl, mcgsth); +} + + +void intel_p4_mcheck_init(struct cpuinfo_x86 *c) +{ + u32 l, h; + int i; + + machine_check_vector = intel_machine_check; + wmb(); + + printk (KERN_INFO "Intel machine check architecture supported.\n"); + rdmsr (MSR_IA32_MCG_CAP, l, h); + if (l & (1<<8)) /* Control register present ? */ + wrmsr (MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); + nr_mce_banks = l & 0xff; + + for (i=0; i<nr_mce_banks; i++) { + wrmsr (MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff); + wrmsr (MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0); + } + + set_in_cr4 (X86_CR4_MCE); + printk (KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n", + smp_processor_id()); + + /* Check for P4/Xeon extended MCE MSRs */ + rdmsr (MSR_IA32_MCG_CAP, l, h); + if (l & (1<<9)) {/* MCG_EXT_P */ + mce_num_extended_msrs = (l >> 16) & 0xff; + printk (KERN_INFO "CPU%d: Intel P4/Xeon Extended MCE MSRs (%d)" + " available\n", + smp_processor_id(), mce_num_extended_msrs); + +#ifdef CONFIG_X86_MCE_P4THERMAL + /* Check for P4/Xeon Thermal monitor */ + intel_init_thermal(c); +#endif + } +} diff --git a/arch/x86/kernel/cpu/mcheck/p5.c b/arch/x86/kernel/cpu/mcheck/p5.c new file mode 100644 index 000000000000..94bc43d950cf --- /dev/null +++ b/arch/x86/kernel/cpu/mcheck/p5.c @@ -0,0 +1,53 @@ +/* + * P5 specific Machine Check Exception Reporting + * (C) Copyright 2002 Alan Cox <alan@redhat.com> + */ + +#include <linux/init.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/interrupt.h> +#include <linux/smp.h> + +#include <asm/processor.h> +#include <asm/system.h> +#include <asm/msr.h> + +#include "mce.h" + +/* Machine check handler for Pentium class Intel */ +static fastcall void pentium_machine_check(struct pt_regs * regs, long error_code) +{ + u32 loaddr, hi, lotype; + rdmsr(MSR_IA32_P5_MC_ADDR, loaddr, hi); + rdmsr(MSR_IA32_P5_MC_TYPE, lotype, hi); + printk(KERN_EMERG "CPU#%d: Machine Check Exception: 0x%8X (type 0x%8X).\n", smp_processor_id(), loaddr, lotype); + if(lotype&(1<<5)) + printk(KERN_EMERG "CPU#%d: Possible thermal failure (CPU on fire ?).\n", smp_processor_id()); + add_taint(TAINT_MACHINE_CHECK); +} + +/* Set up machine check reporting for processors with Intel style MCE */ +void intel_p5_mcheck_init(struct cpuinfo_x86 *c) +{ + u32 l, h; + + /*Check for MCE support */ + if( !cpu_has(c, X86_FEATURE_MCE) ) + return; + + /* Default P5 to off as its often misconnected */ + if(mce_disabled != -1) + return; + machine_check_vector = pentium_machine_check; + wmb(); + + /* Read registers before enabling */ + rdmsr(MSR_IA32_P5_MC_ADDR, l, h); + rdmsr(MSR_IA32_P5_MC_TYPE, l, h); + printk(KERN_INFO "Intel old style machine check architecture supported.\n"); + + /* Enable MCE */ + set_in_cr4(X86_CR4_MCE); + printk(KERN_INFO "Intel old style machine check reporting enabled on CPU#%d.\n", smp_processor_id()); +} diff --git a/arch/x86/kernel/cpu/mcheck/p6.c b/arch/x86/kernel/cpu/mcheck/p6.c new file mode 100644 index 000000000000..deeae42ce199 --- /dev/null +++ b/arch/x86/kernel/cpu/mcheck/p6.c @@ -0,0 +1,119 @@ +/* + * P6 specific Machine Check Exception Reporting + * (C) Copyright 2002 Alan Cox <alan@redhat.com> + */ + +#include <linux/init.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/interrupt.h> +#include <linux/smp.h> + +#include <asm/processor.h> +#include <asm/system.h> +#include <asm/msr.h> + +#include "mce.h" + +/* Machine Check Handler For PII/PIII */ +static fastcall void intel_machine_check(struct pt_regs * regs, long error_code) +{ + int recover=1; + u32 alow, ahigh, high, low; + u32 mcgstl, mcgsth; + int i; + + rdmsr (MSR_IA32_MCG_STATUS, mcgstl, mcgsth); + if (mcgstl & (1<<0)) /* Recoverable ? */ + recover=0; + + printk (KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n", + smp_processor_id(), mcgsth, mcgstl); + + for (i=0; i<nr_mce_banks; i++) { + rdmsr (MSR_IA32_MC0_STATUS+i*4,low, high); + if (high & (1<<31)) { + if (high & (1<<29)) + recover |= 1; + if (high & (1<<25)) + recover |= 2; + printk (KERN_EMERG "Bank %d: %08x%08x", i, high, low); + high &= ~(1<<31); + if (high & (1<<27)) { + rdmsr (MSR_IA32_MC0_MISC+i*4, alow, ahigh); + printk ("[%08x%08x]", ahigh, alow); + } + if (high & (1<<26)) { + rdmsr (MSR_IA32_MC0_ADDR+i*4, alow, ahigh); + printk (" at %08x%08x", ahigh, alow); + } + printk ("\n"); + } + } + + if (recover & 2) + panic ("CPU context corrupt"); + if (recover & 1) + panic ("Unable to continue"); + + printk (KERN_EMERG "Attempting to continue.\n"); + /* + * Do not clear the MSR_IA32_MCi_STATUS if the error is not + * recoverable/continuable.This will allow BIOS to look at the MSRs + * for errors if the OS could not log the error. + */ + for (i=0; i<nr_mce_banks; i++) { + unsigned int msr; + msr = MSR_IA32_MC0_STATUS+i*4; + rdmsr (msr,low, high); + if (high & (1<<31)) { + /* Clear it */ + wrmsr (msr, 0UL, 0UL); + /* Serialize */ + wmb(); + add_taint(TAINT_MACHINE_CHECK); + } + } + mcgstl &= ~(1<<2); + wrmsr (MSR_IA32_MCG_STATUS,mcgstl, mcgsth); +} + +/* Set up machine check reporting for processors with Intel style MCE */ +void intel_p6_mcheck_init(struct cpuinfo_x86 *c) +{ + u32 l, h; + int i; + + /* Check for MCE support */ + if (!cpu_has(c, X86_FEATURE_MCE)) + return; + + /* Check for PPro style MCA */ + if (!cpu_has(c, X86_FEATURE_MCA)) + return; + + /* Ok machine check is available */ + machine_check_vector = intel_machine_check; + wmb(); + + printk (KERN_INFO "Intel machine check architecture supported.\n"); + rdmsr (MSR_IA32_MCG_CAP, l, h); + if (l & (1<<8)) /* Control register present ? */ + wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); + nr_mce_banks = l & 0xff; + + /* + * Following the example in IA-32 SDM Vol 3: + * - MC0_CTL should not be written + * - Status registers on all banks should be cleared on reset + */ + for (i=1; i<nr_mce_banks; i++) + wrmsr (MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff); + + for (i=0; i<nr_mce_banks; i++) + wrmsr (MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0); + + set_in_cr4 (X86_CR4_MCE); + printk (KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n", + smp_processor_id()); +} diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c new file mode 100644 index 000000000000..1203dc5ab87a --- /dev/null +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -0,0 +1,186 @@ +/* + * linux/arch/i386/kernel/cpu/mcheck/therm_throt.c + * + * Thermal throttle event support code (such as syslog messaging and rate + * limiting) that was factored out from x86_64 (mce_intel.c) and i386 (p4.c). + * This allows consistent reporting of CPU thermal throttle events. + * + * Maintains a counter in /sys that keeps track of the number of thermal + * events, such that the user knows how bad the thermal problem might be + * (since the logging to syslog and mcelog is rate limited). + * + * Author: Dmitriy Zavin (dmitriyz@google.com) + * + * Credits: Adapted from Zwane Mwaikambo's original code in mce_intel.c. + * Inspired by Ross Biro's and Al Borchers' counter code. + */ + +#include <linux/percpu.h> +#include <linux/sysdev.h> +#include <linux/cpu.h> +#include <asm/cpu.h> +#include <linux/notifier.h> +#include <linux/jiffies.h> +#include <asm/therm_throt.h> + +/* How long to wait between reporting thermal events */ +#define CHECK_INTERVAL (300 * HZ) + +static DEFINE_PER_CPU(__u64, next_check) = INITIAL_JIFFIES; +static DEFINE_PER_CPU(unsigned long, thermal_throttle_count); +atomic_t therm_throt_en = ATOMIC_INIT(0); + +#ifdef CONFIG_SYSFS +#define define_therm_throt_sysdev_one_ro(_name) \ + static SYSDEV_ATTR(_name, 0444, therm_throt_sysdev_show_##_name, NULL) + +#define define_therm_throt_sysdev_show_func(name) \ +static ssize_t therm_throt_sysdev_show_##name(struct sys_device *dev, \ + char *buf) \ +{ \ + unsigned int cpu = dev->id; \ + ssize_t ret; \ + \ + preempt_disable(); /* CPU hotplug */ \ + if (cpu_online(cpu)) \ + ret = sprintf(buf, "%lu\n", \ + per_cpu(thermal_throttle_##name, cpu)); \ + else \ + ret = 0; \ + preempt_enable(); \ + \ + return ret; \ +} + +define_therm_throt_sysdev_show_func(count); +define_therm_throt_sysdev_one_ro(count); + +static struct attribute *thermal_throttle_attrs[] = { + &attr_count.attr, + NULL +}; + +static struct attribute_group thermal_throttle_attr_group = { + .attrs = thermal_throttle_attrs, + .name = "thermal_throttle" +}; +#endif /* CONFIG_SYSFS */ + +/*** + * therm_throt_process - Process thermal throttling event from interrupt + * @curr: Whether the condition is current or not (boolean), since the + * thermal interrupt normally gets called both when the thermal + * event begins and once the event has ended. + * + * This function is called by the thermal interrupt after the + * IRQ has been acknowledged. + * + * It will take care of rate limiting and printing messages to the syslog. + * + * Returns: 0 : Event should NOT be further logged, i.e. still in + * "timeout" from previous log message. + * 1 : Event should be logged further, and a message has been + * printed to the syslog. + */ +int therm_throt_process(int curr) +{ + unsigned int cpu = smp_processor_id(); + __u64 tmp_jiffs = get_jiffies_64(); + + if (curr) + __get_cpu_var(thermal_throttle_count)++; + + if (time_before64(tmp_jiffs, __get_cpu_var(next_check))) + return 0; + + __get_cpu_var(next_check) = tmp_jiffs + CHECK_INTERVAL; + + /* if we just entered the thermal event */ + if (curr) { + printk(KERN_CRIT "CPU%d: Temperature above threshold, " + "cpu clock throttled (total events = %lu)\n", cpu, + __get_cpu_var(thermal_throttle_count)); + + add_taint(TAINT_MACHINE_CHECK); + } else { + printk(KERN_CRIT "CPU%d: Temperature/speed normal\n", cpu); + } + + return 1; +} + +#ifdef CONFIG_SYSFS +/* Add/Remove thermal_throttle interface for CPU device */ +static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev) +{ + return sysfs_create_group(&sys_dev->kobj, &thermal_throttle_attr_group); +} + +static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev) +{ + return sysfs_remove_group(&sys_dev->kobj, &thermal_throttle_attr_group); +} + +/* Mutex protecting device creation against CPU hotplug */ +static DEFINE_MUTEX(therm_cpu_lock); + +/* Get notified when a cpu comes on/off. Be hotplug friendly. */ +static __cpuinit int thermal_throttle_cpu_callback(struct notifier_block *nfb, + unsigned long action, + void *hcpu) +{ + unsigned int cpu = (unsigned long)hcpu; + struct sys_device *sys_dev; + int err; + + sys_dev = get_cpu_sysdev(cpu); + switch (action) { + case CPU_ONLINE: + case CPU_ONLINE_FROZEN: + mutex_lock(&therm_cpu_lock); + err = thermal_throttle_add_dev(sys_dev); + mutex_unlock(&therm_cpu_lock); + WARN_ON(err); + break; + case CPU_DEAD: + case CPU_DEAD_FROZEN: + mutex_lock(&therm_cpu_lock); + thermal_throttle_remove_dev(sys_dev); + mutex_unlock(&therm_cpu_lock); + break; + } + return NOTIFY_OK; +} + +static struct notifier_block thermal_throttle_cpu_notifier = +{ + .notifier_call = thermal_throttle_cpu_callback, +}; + +static __init int thermal_throttle_init_device(void) +{ + unsigned int cpu = 0; + int err; + + if (!atomic_read(&therm_throt_en)) + return 0; + + register_hotcpu_notifier(&thermal_throttle_cpu_notifier); + +#ifdef CONFIG_HOTPLUG_CPU + mutex_lock(&therm_cpu_lock); +#endif + /* connect live CPUs to sysfs */ + for_each_online_cpu(cpu) { + err = thermal_throttle_add_dev(get_cpu_sysdev(cpu)); + WARN_ON(err); + } +#ifdef CONFIG_HOTPLUG_CPU + mutex_unlock(&therm_cpu_lock); +#endif + + return 0; +} + +device_initcall(thermal_throttle_init_device); +#endif /* CONFIG_SYSFS */ diff --git a/arch/x86/kernel/cpu/mcheck/winchip.c b/arch/x86/kernel/cpu/mcheck/winchip.c new file mode 100644 index 000000000000..9e424b6c293d --- /dev/null +++ b/arch/x86/kernel/cpu/mcheck/winchip.c @@ -0,0 +1,36 @@ +/* + * IDT Winchip specific Machine Check Exception Reporting + * (C) Copyright 2002 Alan Cox <alan@redhat.com> + */ + +#include <linux/init.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/interrupt.h> + +#include <asm/processor.h> +#include <asm/system.h> +#include <asm/msr.h> + +#include "mce.h" + +/* Machine check handler for WinChip C6 */ +static fastcall void winchip_machine_check(struct pt_regs * regs, long error_code) +{ + printk(KERN_EMERG "CPU0: Machine Check Exception.\n"); + add_taint(TAINT_MACHINE_CHECK); +} + +/* Set up machine check reporting on the Winchip C6 series */ +void winchip_mcheck_init(struct cpuinfo_x86 *c) +{ + u32 lo, hi; + machine_check_vector = winchip_machine_check; + wmb(); + rdmsr(MSR_IDT_FCR1, lo, hi); + lo|= (1<<2); /* Enable EIERRINT (int 18 MCE) */ + lo&= ~(1<<4); /* Enable MCE */ + wrmsr(MSR_IDT_FCR1, lo, hi); + set_in_cr4(X86_CR4_MCE); + printk(KERN_INFO "Winchip machine check reporting enabled on CPU#0.\n"); +} |