diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2019-03-08 20:11:39 +0300 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2019-03-08 20:11:39 +0300 |
commit | e13284da944df29ab08e8a9d2a50fc0ad1d858ab (patch) | |
tree | 8e6e2580d27cf4fe5f712e0857dc495aa52fd27e | |
parent | 1b37b8c48d2c2d8553f116ec2a75d21056f1fb35 (diff) | |
parent | 41f035a86b5b72a4f947c38e94239d20d595352a (diff) | |
download | linux-e13284da944df29ab08e8a9d2a50fc0ad1d858ab.tar.xz |
Merge branch 'ras-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull RAS updates from Borislav Petkov:
"This time around we have in store:
- Disable MC4_MISC thresholding banks on all AMD family 0x15 models
(Shirish S)
- AMD MCE error descriptions update and error decode improvements
(Yazen Ghannam)
- The usual smaller conversions and fixes"
* 'ras-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
x86/mce: Improve error message when kernel cannot recover, p2
EDAC/mce_amd: Decode MCA_STATUS in bit definition order
EDAC/mce_amd: Decode MCA_STATUS[Scrub] bit
EDAC, mce_amd: Print ExtErrorCode and description on a single line
EDAC, mce_amd: Match error descriptions to latest documentation
x86/MCE/AMD, EDAC/mce_amd: Add new error descriptions for some SMCA bank types
x86/MCE/AMD, EDAC/mce_amd: Add new McaTypes for CS, PSP, and SMU units
x86/MCE/AMD, EDAC/mce_amd: Add new MP5, NBIO, and PCIE SMCA bank types
RAS: Add a MAINTAINERS entry
RAS: Use consistent types for UUIDs
x86/MCE/AMD: Carve out the MC4_MISC thresholding quirk
x86/MCE/AMD: Turn off MC4_MISC thresholding on all family 0x15 models
x86/MCE: Switch to use the new generic UUID API
-rw-r--r-- | MAINTAINERS | 10 | ||||
-rw-r--r-- | arch/x86/include/asm/mce.h | 7 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/mce/amd.c | 62 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/mce/apei.c | 10 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/mce/core.c | 30 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/mce/severity.c | 5 | ||||
-rw-r--r-- | drivers/edac/mce_amd.c | 291 | ||||
-rw-r--r-- | drivers/ras/ras.c | 2 | ||||
-rw-r--r-- | include/ras/ras_event.h | 8 |
9 files changed, 282 insertions, 143 deletions
diff --git a/MAINTAINERS b/MAINTAINERS index f758445317c5..58de951ee4ba 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -12960,6 +12960,16 @@ M: Alexandre Bounine <alex.bou9@gmail.com> S: Maintained F: drivers/rapidio/ +RAS INFRASTRUCTURE +M: Tony Luck <tony.luck@intel.com> +M: Borislav Petkov <bp@alien8.de> +L: linux-edac@vger.kernel.org +S: Maintained +F: drivers/ras/ +F: include/linux/ras.h +F: include/ras/ras_event.h +F: Documentation/admin-guide/ras.rst + RAYLINK/WEBGEAR 802.11 WIRELESS LAN DRIVER L: linux-wireless@vger.kernel.org S: Orphan diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index c1a812bd5a27..22d05e3835f0 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -48,6 +48,7 @@ #define MCI_STATUS_SYNDV BIT_ULL(53) /* synd reg. valid */ #define MCI_STATUS_DEFERRED BIT_ULL(44) /* uncorrected error, deferred exception */ #define MCI_STATUS_POISON BIT_ULL(43) /* access poisonous data */ +#define MCI_STATUS_SCRUB BIT_ULL(40) /* Error detected during scrub operation */ /* * McaX field if set indicates a given bank supports MCA extensions: @@ -307,11 +308,17 @@ enum smca_bank_types { SMCA_FP, /* Floating Point */ SMCA_L3_CACHE, /* L3 Cache */ SMCA_CS, /* Coherent Slave */ + SMCA_CS_V2, /* Coherent Slave */ SMCA_PIE, /* Power, Interrupts, etc. */ SMCA_UMC, /* Unified Memory Controller */ SMCA_PB, /* Parameter Block */ SMCA_PSP, /* Platform Security Processor */ + SMCA_PSP_V2, /* Platform Security Processor */ SMCA_SMU, /* System Management Unit */ + SMCA_SMU_V2, /* System Management Unit */ + SMCA_MP5, /* Microprocessor 5 Unit */ + SMCA_NBIO, /* Northbridge IO Unit */ + SMCA_PCIE, /* PCI Express Unit */ N_SMCA_BANK_TYPES }; diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index 89298c83de53..e64de5149e50 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -88,11 +88,17 @@ static struct smca_bank_name smca_names[] = { [SMCA_FP] = { "floating_point", "Floating Point Unit" }, [SMCA_L3_CACHE] = { "l3_cache", "L3 Cache" }, [SMCA_CS] = { "coherent_slave", "Coherent Slave" }, + [SMCA_CS_V2] = { "coherent_slave", "Coherent Slave" }, [SMCA_PIE] = { "pie", "Power, Interrupts, etc." }, [SMCA_UMC] = { "umc", "Unified Memory Controller" }, [SMCA_PB] = { "param_block", "Parameter Block" }, [SMCA_PSP] = { "psp", "Platform Security Processor" }, + [SMCA_PSP_V2] = { "psp", "Platform Security Processor" }, [SMCA_SMU] = { "smu", "System Management Unit" }, + [SMCA_SMU_V2] = { "smu", "System Management Unit" }, + [SMCA_MP5] = { "mp5", "Microprocessor 5 Unit" }, + [SMCA_NBIO] = { "nbio", "Northbridge IO Unit" }, + [SMCA_PCIE] = { "pcie", "PCI Express Unit" }, }; static u32 smca_bank_addrs[MAX_NR_BANKS][NR_BLOCKS] __ro_after_init = @@ -138,30 +144,42 @@ static struct smca_hwid smca_hwid_mcatypes[] = { { SMCA_RESERVED, HWID_MCATYPE(0x00, 0x0), 0x0 }, /* ZN Core (HWID=0xB0) MCA types */ - { SMCA_LS, HWID_MCATYPE(0xB0, 0x0), 0x1FFFEF }, + { SMCA_LS, HWID_MCATYPE(0xB0, 0x0), 0x1FFFFF }, { SMCA_IF, HWID_MCATYPE(0xB0, 0x1), 0x3FFF }, { SMCA_L2_CACHE, HWID_MCATYPE(0xB0, 0x2), 0xF }, { SMCA_DE, HWID_MCATYPE(0xB0, 0x3), 0x1FF }, /* HWID 0xB0 MCATYPE 0x4 is Reserved */ - { SMCA_EX, HWID_MCATYPE(0xB0, 0x5), 0x7FF }, + { SMCA_EX, HWID_MCATYPE(0xB0, 0x5), 0xFFF }, { SMCA_FP, HWID_MCATYPE(0xB0, 0x6), 0x7F }, { SMCA_L3_CACHE, HWID_MCATYPE(0xB0, 0x7), 0xFF }, /* Data Fabric MCA types */ { SMCA_CS, HWID_MCATYPE(0x2E, 0x0), 0x1FF }, - { SMCA_PIE, HWID_MCATYPE(0x2E, 0x1), 0xF }, + { SMCA_PIE, HWID_MCATYPE(0x2E, 0x1), 0x1F }, + { SMCA_CS_V2, HWID_MCATYPE(0x2E, 0x2), 0x3FFF }, /* Unified Memory Controller MCA type */ - { SMCA_UMC, HWID_MCATYPE(0x96, 0x0), 0x3F }, + { SMCA_UMC, HWID_MCATYPE(0x96, 0x0), 0xFF }, /* Parameter Block MCA type */ { SMCA_PB, HWID_MCATYPE(0x05, 0x0), 0x1 }, /* Platform Security Processor MCA type */ { SMCA_PSP, HWID_MCATYPE(0xFF, 0x0), 0x1 }, + { SMCA_PSP_V2, HWID_MCATYPE(0xFF, 0x1), 0x3FFFF }, /* System Management Unit MCA type */ { SMCA_SMU, HWID_MCATYPE(0x01, 0x0), 0x1 }, + { SMCA_SMU_V2, HWID_MCATYPE(0x01, 0x1), 0x7FF }, + + /* Microprocessor 5 Unit MCA type */ + { SMCA_MP5, HWID_MCATYPE(0x01, 0x2), 0x3FF }, + + /* Northbridge IO Unit MCA type */ + { SMCA_NBIO, HWID_MCATYPE(0x18, 0x0), 0x1F }, + + /* PCI Express Unit MCA type */ + { SMCA_PCIE, HWID_MCATYPE(0x46, 0x0), 0x1F }, }; struct smca_bank smca_banks[MAX_NR_BANKS]; @@ -545,6 +563,40 @@ out: return offset; } +/* + * Turn off MC4_MISC thresholding banks on all family 0x15 models since + * they're not supported there. + */ +void disable_err_thresholding(struct cpuinfo_x86 *c) +{ + int i; + u64 hwcr; + bool need_toggle; + u32 msrs[] = { + 0x00000413, /* MC4_MISC0 */ + 0xc0000408, /* MC4_MISC1 */ + }; + + if (c->x86 != 0x15) + return; + + rdmsrl(MSR_K7_HWCR, hwcr); + + /* McStatusWrEn has to be set */ + need_toggle = !(hwcr & BIT(18)); + + if (need_toggle) + wrmsrl(MSR_K7_HWCR, hwcr | BIT(18)); + + /* Clear CntP bit safely */ + for (i = 0; i < ARRAY_SIZE(msrs); i++) + msr_clear_bit(msrs[i], 62); + + /* restore old settings */ + if (need_toggle) + wrmsrl(MSR_K7_HWCR, hwcr); +} + /* cpu init entry point, called from mce.c with preempt off */ void mce_amd_feature_init(struct cpuinfo_x86 *c) { @@ -552,6 +604,8 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) unsigned int bank, block, cpu = smp_processor_id(); int offset = -1; + disable_err_thresholding(c); + for (bank = 0; bank < mca_cfg.banks; ++bank) { if (mce_flags.smca) smca_configure(bank, cpu); diff --git a/arch/x86/kernel/cpu/mce/apei.c b/arch/x86/kernel/cpu/mce/apei.c index 1d9b3ce662a0..c038e5c00a59 100644 --- a/arch/x86/kernel/cpu/mce/apei.c +++ b/arch/x86/kernel/cpu/mce/apei.c @@ -64,11 +64,11 @@ void apei_mce_report_mem_error(int severity, struct cper_sec_mem_err *mem_err) EXPORT_SYMBOL_GPL(apei_mce_report_mem_error); #define CPER_CREATOR_MCE \ - UUID_LE(0x75a574e3, 0x5052, 0x4b29, 0x8a, 0x8e, 0xbe, 0x2c, \ - 0x64, 0x90, 0xb8, 0x9d) + GUID_INIT(0x75a574e3, 0x5052, 0x4b29, 0x8a, 0x8e, 0xbe, 0x2c, \ + 0x64, 0x90, 0xb8, 0x9d) #define CPER_SECTION_TYPE_MCE \ - UUID_LE(0xfe08ffbe, 0x95e4, 0x4be7, 0xbc, 0x73, 0x40, 0x96, \ - 0x04, 0x4a, 0x38, 0xfc) + GUID_INIT(0xfe08ffbe, 0x95e4, 0x4be7, 0xbc, 0x73, 0x40, 0x96, \ + 0x04, 0x4a, 0x38, 0xfc) /* * CPER specification (in UEFI specification 2.3 appendix N) requires @@ -135,7 +135,7 @@ retry: goto out; /* try to skip other type records in storage */ else if (rc != sizeof(rcd) || - uuid_le_cmp(rcd.hdr.creator_id, CPER_CREATOR_MCE)) + !guid_equal(&rcd.hdr.creator_id, &CPER_CREATOR_MCE)) goto retry; memcpy(m, &rcd.mce, sizeof(*m)); rc = sizeof(*m); diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 6ce290c506d9..b7fb541a4873 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -1612,36 +1612,6 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) if (c->x86 == 0x15 && c->x86_model <= 0xf) mce_flags.overflow_recov = 1; - /* - * Turn off MC4_MISC thresholding banks on those models since - * they're not supported there. - */ - if (c->x86 == 0x15 && - (c->x86_model >= 0x10 && c->x86_model <= 0x1f)) { - int i; - u64 hwcr; - bool need_toggle; - u32 msrs[] = { - 0x00000413, /* MC4_MISC0 */ - 0xc0000408, /* MC4_MISC1 */ - }; - - rdmsrl(MSR_K7_HWCR, hwcr); - - /* McStatusWrEn has to be set */ - need_toggle = !(hwcr & BIT(18)); - - if (need_toggle) - wrmsrl(MSR_K7_HWCR, hwcr | BIT(18)); - - /* Clear CntP bit safely */ - for (i = 0; i < ARRAY_SIZE(msrs); i++) - msr_clear_bit(msrs[i], 62); - - /* restore old settings */ - if (need_toggle) - wrmsrl(MSR_K7_HWCR, hwcr); - } } if (c->x86_vendor == X86_VENDOR_INTEL) { diff --git a/arch/x86/kernel/cpu/mce/severity.c b/arch/x86/kernel/cpu/mce/severity.c index dc3e26e905a3..65201e180fe0 100644 --- a/arch/x86/kernel/cpu/mce/severity.c +++ b/arch/x86/kernel/cpu/mce/severity.c @@ -165,6 +165,11 @@ static struct severity { SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA), KERNEL ), + MCESEV( + PANIC, "Instruction fetch error in kernel", + SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_INSTR), + KERNEL + ), #endif MCESEV( PANIC, "Action required: unknown MCACOD", diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c index c605089d899f..0a1814dad6cf 100644 --- a/drivers/edac/mce_amd.c +++ b/drivers/edac/mce_amd.c @@ -151,138 +151,223 @@ static const char * const mc6_mce_desc[] = { /* Scalable MCA error strings */ static const char * const smca_ls_mce_desc[] = { - "Load queue parity", - "Store queue parity", - "Miss address buffer payload parity", - "L1 TLB parity", - "Reserved", - "DC tag error type 6", - "DC tag error type 1", + "Load queue parity error", + "Store queue parity error", + "Miss address buffer payload parity error", + "Level 1 TLB parity error", + "DC Tag error type 5", + "DC Tag error type 6", + "DC Tag error type 1", "Internal error type 1", "Internal error type 2", - "Sys Read data error thread 0", - "Sys read data error thread 1", - "DC tag error type 2", - "DC data error type 1 (poison consumption)", - "DC data error type 2", - "DC data error type 3", - "DC tag error type 4", - "L2 TLB parity", + "System Read Data Error Thread 0", + "System Read Data Error Thread 1", + "DC Tag error type 2", + "DC Data error type 1 and poison consumption", + "DC Data error type 2", + "DC Data error type 3", + "DC Tag error type 4", + "Level 2 TLB parity error", "PDC parity error", - "DC tag error type 3", - "DC tag error type 5", - "L2 fill data error", + "DC Tag error type 3", + "DC Tag error type 5", + "L2 Fill Data error", }; static const char * const smca_if_mce_desc[] = { - "microtag probe port parity error", - "IC microtag or full tag multi-hit error", - "IC full tag parity", - "IC data array parity", - "Decoupling queue phys addr parity error", - "L0 ITLB parity error", - "L1 ITLB parity error", - "L2 ITLB parity error", - "BPQ snoop parity on Thread 0", - "BPQ snoop parity on Thread 1", - "L1 BTB multi-match error", - "L2 BTB multi-match error", - "L2 Cache Response Poison error", - "System Read Data error", + "Op Cache Microtag Probe Port Parity Error", + "IC Microtag or Full Tag Multi-hit Error", + "IC Full Tag Parity Error", + "IC Data Array Parity Error", + "Decoupling Queue PhysAddr Parity Error", + "L0 ITLB Parity Error", + "L1 ITLB Parity Error", + "L2 ITLB Parity Error", + "BPQ Thread 0 Snoop Parity Error", + "BPQ Thread 1 Snoop Parity Error", + "L1 BTB Multi-Match Error", + "L2 BTB Multi-Match Error", + "L2 Cache Response Poison Error", + "System Read Data Error", }; static const char * const smca_l2_mce_desc[] = { - "L2M tag multi-way-hit error", - "L2M tag ECC error", - "L2M data ECC error", - "HW assert", + "L2M Tag Multiple-Way-Hit error", + "L2M Tag or State Array ECC Error", + "L2M Data Array ECC Error", + "Hardware Assert Error", }; static const char * const smca_de_mce_desc[] = { - "uop cache tag parity error", - "uop cache data parity error", - "Insn buffer parity error", - "uop queue parity error", - "Insn dispatch queue parity error", - "Fetch address FIFO parity", - "Patch RAM data parity", - "Patch RAM sequencer parity", - "uop buffer parity" + "Micro-op cache tag parity error", + "Micro-op cache data parity error", + "Instruction buffer parity error", + "Micro-op queue parity error", + "Instruction dispatch queue parity error", + "Fetch address FIFO parity error", + "Patch RAM data parity error", + "Patch RAM sequencer parity error", + "Micro-op buffer parity error" }; static const char * const smca_ex_mce_desc[] = { - "Watchdog timeout error", - "Phy register file parity", - "Flag register file parity", - "Immediate displacement register file parity", - "Address generator payload parity", - "EX payload parity", - "Checkpoint queue parity", - "Retire dispatch queue parity", + "Watchdog Timeout error", + "Physical register file parity error", + "Flag register file parity error", + "Immediate displacement register file parity error", + "Address generator payload parity error", + "EX payload parity error", + "Checkpoint queue parity error", + "Retire dispatch queue parity error", "Retire status queue parity error", "Scheduling queue parity error", "Branch buffer queue parity error", + "Hardware Assertion error", }; static const char * const smca_fp_mce_desc[] = { - "Physical register file parity", - "Freelist parity error", - "Schedule queue parity", + "Physical register file (PRF) parity error", + "Freelist (FL) parity error", + "Schedule queue parity error", "NSQ parity error", - "Retire queue parity", - "Status register file parity", + "Retire queue (RQ) parity error", + "Status register file (SRF) parity error", "Hardware assertion", }; static const char * const smca_l3_mce_desc[] = { - "Shadow tag macro ECC error", - "Shadow tag macro multi-way-hit error", - "L3M tag ECC error", - "L3M tag multi-way-hit error", - "L3M data ECC error", - "XI parity, L3 fill done channel error", - "L3 victim queue parity", - "L3 HW assert", + "Shadow Tag Macro ECC Error", + "Shadow Tag Macro Multi-way-hit Error", + "L3M Tag ECC Error", + "L3M Tag Multi-way-hit Error", + "L3M Data ECC Error", + "SDP Parity Error or SystemReadDataError from XI", + "L3 Victim Queue Parity Error", + "L3 Hardware Assertion", }; static const char * const smca_cs_mce_desc[] = { - "Illegal request from transport layer", - "Address violation", - "Security violation", - "Illegal response from transport layer", - "Unexpected response", - "Parity error on incoming request or probe response data", - "Parity error on incoming read response data", - "Atomic request parity", - "ECC error on probe filter access", + "Illegal Request", + "Address Violation", + "Security Violation", + "Illegal Response", + "Unexpected Response", + "Request or Probe Parity Error", + "Read Response Parity Error", + "Atomic Request Parity Error", + "Probe Filter ECC Error", +}; + +static const char * const smca_cs2_mce_desc[] = { + "Illegal Request", + "Address Violation", + "Security Violation", + "Illegal Response", + "Unexpected Response", + "Request or Probe Parity Error", + "Read Response Parity Error", + "Atomic Request Parity Error", + "SDP read response had no match in the CS queue", + "Probe Filter Protocol Error", + "Probe Filter ECC Error", + "SDP read response had an unexpected RETRY error", + "Counter overflow error", + "Counter underflow error", }; static const char * const smca_pie_mce_desc[] = { - "HW assert", - "Internal PIE register security violation", - "Error on GMI link", - "Poison data written to internal PIE register", + "Hardware Assert", + "Register security violation", + "Link Error", + "Poison data consumption", + "A deferred error was detected in the DF" }; static const char * const smca_umc_mce_desc[] = { "DRAM ECC error", - "Data poison error on DRAM", + "Data poison error", "SDP parity error", "Advanced peripheral bus error", - "Command/address parity error", + "Address/Command parity error", "Write data CRC error", + "DCQ SRAM ECC error", + "AES SRAM ECC error", }; static const char * const smca_pb_mce_desc[] = { - "Parameter Block RAM ECC error", + "An ECC error in the Parameter Block RAM array", }; static const char * const smca_psp_mce_desc[] = { - "PSP RAM ECC or parity error", + "An ECC or parity error in a PSP RAM instance", +}; + +static const char * const smca_psp2_mce_desc[] = { + "High SRAM ECC or parity error", + "Low SRAM ECC or parity error", + "Instruction Cache Bank 0 ECC or parity error", + "Instruction Cache Bank 1 ECC or parity error", + "Instruction Tag Ram 0 parity error", + "Instruction Tag Ram 1 parity error", + "Data Cache Bank 0 ECC or parity error", + "Data Cache Bank 1 ECC or parity error", + "Data Cache Bank 2 ECC or parity error", + "Data Cache Bank 3 ECC or parity error", + "Data Tag Bank 0 parity error", + "Data Tag Bank 1 parity error", + "Data Tag Bank 2 parity error", + "Data Tag Bank 3 parity error", + "Dirty Data Ram parity error", + "TLB Bank 0 parity error", + "TLB Bank 1 parity error", + "System Hub Read Buffer ECC or parity error", }; static const char * const smca_smu_mce_desc[] = { - "SMU RAM ECC or parity error", + "An ECC or parity error in an SMU RAM instance", +}; + +static const char * const smca_smu2_mce_desc[] = { + "High SRAM ECC or parity error", + "Low SRAM ECC or parity error", + "Data Cache Bank A ECC or parity error", + "Data Cache Bank B ECC or parity error", + "Data Tag Cache Bank A ECC or parity error", + "Data Tag Cache Bank B ECC or parity error", + "Instruction Cache Bank A ECC or parity error", + "Instruction Cache Bank B ECC or parity error", + "Instruction Tag Cache Bank A ECC or parity error", + "Instruction Tag Cache Bank B ECC or parity error", + "System Hub Read Buffer ECC or parity error", +}; + +static const char * const smca_mp5_mce_desc[] = { + "High SRAM ECC or parity error", + "Low SRAM ECC or parity error", + "Data Cache Bank A ECC or parity error", + "Data Cache Bank B ECC or parity error", + "Data Tag Cache Bank A ECC or parity error", + "Data Tag Cache Bank B ECC or parity error", + "Instruction Cache Bank A ECC or parity error", + "Instruction Cache Bank B ECC or parity error", + "Instruction Tag Cache Bank A ECC or parity error", + "Instruction Tag Cache Bank B ECC or parity error", +}; + +static const char * const smca_nbio_mce_desc[] = { + "ECC or Parity error", + "PCIE error", + "SDP ErrEvent error", + "SDP Egress Poison Error", + "IOHC Internal Poison Error", +}; + +static const char * const smca_pcie_mce_desc[] = { + "CCIX PER Message logging", + "CCIX Read Response with Status: Non-Data Error", + "CCIX Write Response with Status: Non-Data Error", + "CCIX Read Response with Status: Data Error", + "CCIX Non-okay write response with data error", }; struct smca_mce_desc { @@ -299,11 +384,17 @@ static struct smca_mce_desc smca_mce_descs[] = { [SMCA_FP] = { smca_fp_mce_desc, ARRAY_SIZE(smca_fp_mce_desc) }, [SMCA_L3_CACHE] = { smca_l3_mce_desc, ARRAY_SIZE(smca_l3_mce_desc) }, [SMCA_CS] = { smca_cs_mce_desc, ARRAY_SIZE(smca_cs_mce_desc) }, + [SMCA_CS_V2] = { smca_cs2_mce_desc, ARRAY_SIZE(smca_cs2_mce_desc) }, [SMCA_PIE] = { smca_pie_mce_desc, ARRAY_SIZE(smca_pie_mce_desc) }, [SMCA_UMC] = { smca_umc_mce_desc, ARRAY_SIZE(smca_umc_mce_desc) }, [SMCA_PB] = { smca_pb_mce_desc, ARRAY_SIZE(smca_pb_mce_desc) }, [SMCA_PSP] = { smca_psp_mce_desc, ARRAY_SIZE(smca_psp_mce_desc) }, + [SMCA_PSP_V2] = { smca_psp2_mce_desc, ARRAY_SIZE(smca_psp2_mce_desc) }, [SMCA_SMU] = { smca_smu_mce_desc, ARRAY_SIZE(smca_smu_mce_desc) }, + [SMCA_SMU_V2] = { smca_smu2_mce_desc, ARRAY_SIZE(smca_smu2_mce_desc) }, + [SMCA_MP5] = { smca_mp5_mce_desc, ARRAY_SIZE(smca_mp5_mce_desc) }, + [SMCA_NBIO] = { smca_nbio_mce_desc, ARRAY_SIZE(smca_nbio_mce_desc) }, + [SMCA_PCIE] = { smca_pcie_mce_desc, ARRAY_SIZE(smca_pcie_mce_desc) }, }; static bool f12h_mc0_mce(u16 ec, u8 xec) @@ -874,13 +965,12 @@ static void decode_smca_error(struct mce *m) ip_name = smca_get_long_name(bank_type); - pr_emerg(HW_ERR "%s Extended Error Code: %d\n", ip_name, xec); + pr_emerg(HW_ERR "%s Ext. Error Code: %d", ip_name, xec); /* Only print the decode of valid error codes */ if (xec < smca_mce_descs[bank_type].num_descs && (hwid->xec_bitmap & BIT_ULL(xec))) { - pr_emerg(HW_ERR "%s Error: ", ip_name); - pr_cont("%s.\n", smca_mce_descs[bank_type].descs[xec]); + pr_cont(", %s.\n", smca_mce_descs[bank_type].descs[xec]); } if (bank_type == SMCA_UMC && xec == 0 && decode_dram_ecc) @@ -961,26 +1051,18 @@ amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data) ((m->status & MCI_STATUS_UC) ? "UE" : (m->status & MCI_STATUS_DEFERRED) ? "-" : "CE"), ((m->status & MCI_STATUS_MISCV) ? "MiscV" : "-"), - ((m->status & MCI_STATUS_PCC) ? "PCC" : "-"), - ((m->status & MCI_STATUS_ADDRV) ? "AddrV" : "-")); - - if (fam >= 0x15) { - pr_cont("|%s", (m->status & MCI_STATUS_DEFERRED ? "Deferred" : "-")); - - /* F15h, bank4, bit 43 is part of McaStatSubCache. */ - if (fam != 0x15 || m->bank != 4) - pr_cont("|%s", (m->status & MCI_STATUS_POISON ? "Poison" : "-")); - } + ((m->status & MCI_STATUS_ADDRV) ? "AddrV" : "-"), + ((m->status & MCI_STATUS_PCC) ? "PCC" : "-")); if (boot_cpu_has(X86_FEATURE_SMCA)) { u32 low, high; u32 addr = MSR_AMD64_SMCA_MCx_CONFIG(m->bank); - pr_cont("|%s", ((m->status & MCI_STATUS_SYNDV) ? "SyndV" : "-")); - if (!rdmsr_safe(addr, &low, &high) && (low & MCI_CONFIG_MCAX)) pr_cont("|%s", ((m->status & MCI_STATUS_TCC) ? "TCC" : "-")); + + pr_cont("|%s", ((m->status & MCI_STATUS_SYNDV) ? "SyndV" : "-")); } /* do the two bits[14:13] together */ @@ -988,6 +1070,17 @@ amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data) if (ecc) pr_cont("|%sECC", ((ecc == 2) ? "C" : "U")); + if (fam >= 0x15) { + pr_cont("|%s", (m->status & MCI_STATUS_DEFERRED ? "Deferred" : "-")); + + /* F15h, bank4, bit 43 is part of McaStatSubCache. */ + if (fam != 0x15 || m->bank != 4) + pr_cont("|%s", (m->status & MCI_STATUS_POISON ? "Poison" : "-")); + } + + if (fam >= 0x17) + pr_cont("|%s", (m->status & MCI_STATUS_SCRUB ? "Scrub" : "-")); + pr_cont("]: 0x%016llx\n", m->status); if (m->status & MCI_STATUS_ADDRV) diff --git a/drivers/ras/ras.c b/drivers/ras/ras.c index 3f38907320dc..95540ea8dd9d 100644 --- a/drivers/ras/ras.c +++ b/drivers/ras/ras.c @@ -14,7 +14,7 @@ #define TRACE_INCLUDE_PATH ../../include/ras #include <ras/ras_event.h> -void log_non_standard_event(const uuid_le *sec_type, const uuid_le *fru_id, +void log_non_standard_event(const guid_t *sec_type, const guid_t *fru_id, const char *fru_text, const u8 sev, const u8 *err, const u32 len) { diff --git a/include/ras/ras_event.h b/include/ras/ras_event.h index a0794632fd01..36c5c5e38c1d 100644 --- a/include/ras/ras_event.h +++ b/include/ras/ras_event.h @@ -27,7 +27,7 @@ TRACE_EVENT(extlog_mem_event, TP_PROTO(struct cper_sec_mem_err *mem, u32 err_seq, - const uuid_le *fru_id, + const guid_t *fru_id, const char *fru_text, u8 sev), @@ -39,7 +39,7 @@ TRACE_EVENT(extlog_mem_event, __field(u8, sev) __field(u64, pa) __field(u8, pa_mask_lsb) - __field_struct(uuid_le, fru_id) + __field_struct(guid_t, fru_id) __string(fru_text, fru_text) __field_struct(struct cper_mem_err_compact, data) ), @@ -218,8 +218,8 @@ TRACE_EVENT(arm_event, */ TRACE_EVENT(non_standard_event, - TP_PROTO(const uuid_le *sec_type, - const uuid_le *fru_id, + TP_PROTO(const guid_t *sec_type, + const guid_t *fru_id, const char *fru_text, const u8 sev, const u8 *err, |