diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2022-10-04 19:33:12 +0300 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2022-10-04 19:33:12 +0300 |
commit | 51eaa866a50f3e5f006b0c4876ddfa0e5c72c5f0 (patch) | |
tree | 9a544f7df35866d79a4f519ec0c083309cb6efeb | |
parent | 7db99f01d1879f30af95f14dfd5cbcf009d15166 (diff) | |
parent | f9781bb18ed828e7b83b7bac4a4ad7cd497ee7d7 (diff) | |
download | linux-51eaa866a50f3e5f006b0c4876ddfa0e5c72c5f0.tar.xz |
Merge tag 'ras_core_for_v6.1_rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86 RAS updates from Borislav Petkov:
- Fix the APEI MCE callback handler to consult the hardware about the
granularity of the memory error instead of hard-coding it
- Offline memory pages on Intel machines after 2 errors reported per
page
* tag 'ras_core_for_v6.1_rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
x86/mce: Retrieve poison range from hardware
RAS/CEC: Reduce offline page threshold for Intel systems
-rw-r--r-- | arch/x86/kernel/cpu/mce/apei.c | 13 | ||||
-rw-r--r-- | drivers/ras/cec.c | 8 |
2 files changed, 20 insertions, 1 deletions
diff --git a/arch/x86/kernel/cpu/mce/apei.c b/arch/x86/kernel/cpu/mce/apei.c index 717192915f28..8ed341714686 100644 --- a/arch/x86/kernel/cpu/mce/apei.c +++ b/arch/x86/kernel/cpu/mce/apei.c @@ -29,15 +29,26 @@ void apei_mce_report_mem_error(int severity, struct cper_sec_mem_err *mem_err) { struct mce m; + int lsb; if (!(mem_err->validation_bits & CPER_MEM_VALID_PA)) return; + /* + * Even if the ->validation_bits are set for address mask, + * to be extra safe, check and reject an error radius '0', + * and fall back to the default page size. + */ + if (mem_err->validation_bits & CPER_MEM_VALID_PA_MASK) + lsb = find_first_bit((void *)&mem_err->physical_addr_mask, PAGE_SHIFT); + else + lsb = PAGE_SHIFT; + mce_setup(&m); m.bank = -1; /* Fake a memory read error with unknown channel */ m.status = MCI_STATUS_VAL | MCI_STATUS_EN | MCI_STATUS_ADDRV | MCI_STATUS_MISCV | 0x9f; - m.misc = (MCI_MISC_ADDR_PHYS << 6) | PAGE_SHIFT; + m.misc = (MCI_MISC_ADDR_PHYS << 6) | lsb; if (severity >= GHES_SEV_RECOVERABLE) m.status |= MCI_STATUS_UC; diff --git a/drivers/ras/cec.c b/drivers/ras/cec.c index 42f2fc0bc8a9..321af498ee11 100644 --- a/drivers/ras/cec.c +++ b/drivers/ras/cec.c @@ -556,6 +556,14 @@ static int __init cec_init(void) if (ce_arr.disabled) return -ENODEV; + /* + * Intel systems may avoid uncorrectable errors + * if pages with corrected errors are aggressively + * taken offline. + */ + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) + action_threshold = 2; + ce_arr.array = (void *)get_zeroed_page(GFP_KERNEL); if (!ce_arr.array) { pr_err("Error allocating CE array page!\n"); |