diff options
Diffstat (limited to 'drivers/acpi/apei/ghes.c')
-rw-r--r-- | drivers/acpi/apei/ghes.c | 253 |
1 files changed, 205 insertions, 48 deletions
diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c index d0855c09f32f..d661d452b238 100644 --- a/drivers/acpi/apei/ghes.c +++ b/drivers/acpi/apei/ghes.c @@ -45,10 +45,14 @@ #include <linux/aer.h> #include <linux/nmi.h> #include <linux/sched/clock.h> +#include <linux/uuid.h> +#include <linux/ras.h> +#include <acpi/actbl1.h> #include <acpi/ghes.h> #include <acpi/apei.h> #include <asm/tlbflush.h> +#include <ras/ras_event.h> #include "apei-internal.h" @@ -80,6 +84,11 @@ ((struct acpi_hest_generic_status *) \ ((struct ghes_estatus_node *)(estatus_node) + 1)) +static inline bool is_hest_type_generic_v2(struct ghes *ghes) +{ + return ghes->generic->header.type == ACPI_HEST_TYPE_GENERIC_ERROR_V2; +} + /* * This driver isn't really modular, however for the time being, * continuing to use module_param is the easiest way to remain @@ -89,14 +98,14 @@ bool ghes_disable; module_param_named(disable, ghes_disable, bool, 0); /* - * All error sources notified with SCI shares one notifier function, - * so they need to be linked and checked one by one. This is applied - * to NMI too. + * All error sources notified with HED (Hardware Error Device) share a + * single notifier callback, so they need to be linked and checked one + * by one. This holds true for NMI too. * * RCU is used for these lists, so ghes_list_mutex is only used for * list changing, not for traversing. */ -static LIST_HEAD(ghes_sci); +static LIST_HEAD(ghes_hed); static DEFINE_MUTEX(ghes_list_mutex); /* @@ -110,11 +119,7 @@ static DEFINE_MUTEX(ghes_list_mutex); * Two virtual pages are used, one for IRQ/PROCESS context, the other for * NMI context (optionally). */ -#ifdef CONFIG_HAVE_ACPI_APEI_NMI #define GHES_IOREMAP_PAGES 2 -#else -#define GHES_IOREMAP_PAGES 1 -#endif #define GHES_IOREMAP_IRQ_PAGE(base) (base) #define GHES_IOREMAP_NMI_PAGE(base) ((base) + PAGE_SIZE) @@ -133,6 +138,8 @@ static unsigned long ghes_estatus_pool_size_request; static struct ghes_estatus_cache *ghes_estatus_caches[GHES_ESTATUS_CACHES_SIZE]; static atomic_t ghes_estatus_cache_alloced; +static int ghes_panic_timeout __read_mostly = 30; + static int ghes_ioremap_init(void) { ghes_ioremap_area = __get_vm_area(PAGE_SIZE * GHES_IOREMAP_PAGES, @@ -153,10 +160,14 @@ static void ghes_ioremap_exit(void) static void __iomem *ghes_ioremap_pfn_nmi(u64 pfn) { unsigned long vaddr; + phys_addr_t paddr; + pgprot_t prot; vaddr = (unsigned long)GHES_IOREMAP_NMI_PAGE(ghes_ioremap_area->addr); - ioremap_page_range(vaddr, vaddr + PAGE_SIZE, - pfn << PAGE_SHIFT, PAGE_KERNEL); + + paddr = pfn << PAGE_SHIFT; + prot = arch_apei_get_mem_attribute(paddr); + ioremap_page_range(vaddr, vaddr + PAGE_SIZE, paddr, prot); return (void __iomem *)vaddr; } @@ -240,6 +251,16 @@ static int ghes_estatus_pool_expand(unsigned long len) return 0; } +static int map_gen_v2(struct ghes *ghes) +{ + return apei_map_generic_address(&ghes->generic_v2->read_ack_register); +} + +static void unmap_gen_v2(struct ghes *ghes) +{ + apei_unmap_generic_address(&ghes->generic_v2->read_ack_register); +} + static struct ghes *ghes_new(struct acpi_hest_generic *generic) { struct ghes *ghes; @@ -249,10 +270,17 @@ static struct ghes *ghes_new(struct acpi_hest_generic *generic) ghes = kzalloc(sizeof(*ghes), GFP_KERNEL); if (!ghes) return ERR_PTR(-ENOMEM); + ghes->generic = generic; + if (is_hest_type_generic_v2(ghes)) { + rc = map_gen_v2(ghes); + if (rc) + goto err_free; + } + rc = apei_map_generic_address(&generic->error_status_address); if (rc) - goto err_free; + goto err_unmap_read_ack_addr; error_block_length = generic->error_block_length; if (error_block_length > GHES_ESTATUS_MAX_SIZE) { pr_warning(FW_WARN GHES_PFX @@ -264,13 +292,16 @@ static struct ghes *ghes_new(struct acpi_hest_generic *generic) ghes->estatus = kmalloc(error_block_length, GFP_KERNEL); if (!ghes->estatus) { rc = -ENOMEM; - goto err_unmap; + goto err_unmap_status_addr; } return ghes; -err_unmap: +err_unmap_status_addr: apei_unmap_generic_address(&generic->error_status_address); +err_unmap_read_ack_addr: + if (is_hest_type_generic_v2(ghes)) + unmap_gen_v2(ghes); err_free: kfree(ghes); return ERR_PTR(rc); @@ -280,6 +311,8 @@ static void ghes_fini(struct ghes *ghes) { kfree(ghes->estatus); apei_unmap_generic_address(&ghes->generic->error_status_address); + if (is_hest_type_generic_v2(ghes)) + unmap_gen_v2(ghes); } static inline int ghes_severity(int severity) @@ -400,8 +433,7 @@ static void ghes_handle_memory_failure(struct acpi_hest_generic_data *gdata, int unsigned long pfn; int flags = -1; int sec_sev = ghes_severity(gdata->error_severity); - struct cper_sec_mem_err *mem_err; - mem_err = (struct cper_sec_mem_err *)(gdata + 1); + struct cper_sec_mem_err *mem_err = acpi_hest_get_payload(gdata); if (!(mem_err->validation_bits & CPER_MEM_VALID_PA)) return; @@ -431,24 +463,32 @@ static void ghes_do_proc(struct ghes *ghes, { int sev, sec_sev; struct acpi_hest_generic_data *gdata; + guid_t *sec_type; + guid_t *fru_id = &NULL_UUID_LE; + char *fru_text = ""; sev = ghes_severity(estatus->error_severity); apei_estatus_for_each_section(estatus, gdata) { + sec_type = (guid_t *)gdata->section_type; sec_sev = ghes_severity(gdata->error_severity); - if (!uuid_le_cmp(*(uuid_le *)gdata->section_type, - CPER_SEC_PLATFORM_MEM)) { - struct cper_sec_mem_err *mem_err; - mem_err = (struct cper_sec_mem_err *)(gdata+1); + if (gdata->validation_bits & CPER_SEC_VALID_FRU_ID) + fru_id = (guid_t *)gdata->fru_id; + + if (gdata->validation_bits & CPER_SEC_VALID_FRU_TEXT) + fru_text = gdata->fru_text; + + if (guid_equal(sec_type, &CPER_SEC_PLATFORM_MEM)) { + struct cper_sec_mem_err *mem_err = acpi_hest_get_payload(gdata); + ghes_edac_report_mem_error(ghes, sev, mem_err); arch_apei_report_mem_error(sev, mem_err); ghes_handle_memory_failure(gdata, sev); } #ifdef CONFIG_ACPI_APEI_PCIEAER - else if (!uuid_le_cmp(*(uuid_le *)gdata->section_type, - CPER_SEC_PCIE)) { - struct cper_sec_pcie *pcie_err; - pcie_err = (struct cper_sec_pcie *)(gdata+1); + else if (guid_equal(sec_type, &CPER_SEC_PCIE)) { + struct cper_sec_pcie *pcie_err = acpi_hest_get_payload(gdata); + if (sev == GHES_SEV_RECOVERABLE && sec_sev == GHES_SEV_RECOVERABLE && pcie_err->validation_bits & CPER_PCIE_VALID_DEVICE_ID && @@ -477,6 +517,17 @@ static void ghes_do_proc(struct ghes *ghes, } #endif + else if (guid_equal(sec_type, &CPER_SEC_PROC_ARM)) { + struct cper_sec_proc_arm *err = acpi_hest_get_payload(gdata); + + log_arm_hw_error(err); + } else { + void *err = acpi_hest_get_payload(gdata); + + log_non_standard_event(sec_type, fru_id, fru_text, + sec_sev, err, + gdata->error_data_length); + } } } @@ -649,6 +700,31 @@ static void ghes_estatus_cache_add( rcu_read_unlock(); } +static int ghes_ack_error(struct acpi_hest_generic_v2 *gv2) +{ + int rc; + u64 val = 0; + + rc = apei_read(&val, &gv2->read_ack_register); + if (rc) + return rc; + + val &= gv2->read_ack_preserve << gv2->read_ack_register.bit_offset; + val |= gv2->read_ack_write << gv2->read_ack_register.bit_offset; + + return apei_write(val, &gv2->read_ack_register); +} + +static void __ghes_panic(struct ghes *ghes) +{ + __ghes_print_estatus(KERN_EMERG, ghes->generic, ghes->estatus); + + /* reboot to log the error! */ + if (!panic_timeout) + panic_timeout = ghes_panic_timeout; + panic("Fatal hardware error!"); +} + static int ghes_proc(struct ghes *ghes) { int rc; @@ -656,11 +732,26 @@ static int ghes_proc(struct ghes *ghes) rc = ghes_read_estatus(ghes, 0); if (rc) goto out; + + if (ghes_severity(ghes->estatus->error_severity) >= GHES_SEV_PANIC) { + __ghes_panic(ghes); + } + if (!ghes_estatus_cached(ghes->estatus)) { if (ghes_print_estatus(NULL, ghes->generic, ghes->estatus)) ghes_estatus_cache_add(ghes->generic, ghes->estatus); } ghes_do_proc(ghes, ghes->estatus); + + /* + * GHESv2 type HEST entries introduce support for error acknowledgment, + * so only acknowledge the error if this support is present. + */ + if (is_hest_type_generic_v2(ghes)) { + rc = ghes_ack_error(ghes->generic_v2); + if (rc) + return rc; + } out: ghes_clear_estatus(ghes); return rc; @@ -702,14 +793,14 @@ static irqreturn_t ghes_irq_func(int irq, void *data) return IRQ_HANDLED; } -static int ghes_notify_sci(struct notifier_block *this, - unsigned long event, void *data) +static int ghes_notify_hed(struct notifier_block *this, unsigned long event, + void *data) { struct ghes *ghes; int ret = NOTIFY_DONE; rcu_read_lock(); - list_for_each_entry_rcu(ghes, &ghes_sci, list) { + list_for_each_entry_rcu(ghes, &ghes_hed, list) { if (!ghes_proc(ghes)) ret = NOTIFY_OK; } @@ -718,10 +809,59 @@ static int ghes_notify_sci(struct notifier_block *this, return ret; } -static struct notifier_block ghes_notifier_sci = { - .notifier_call = ghes_notify_sci, +static struct notifier_block ghes_notifier_hed = { + .notifier_call = ghes_notify_hed, }; +#ifdef CONFIG_ACPI_APEI_SEA +static LIST_HEAD(ghes_sea); + +/* + * Return 0 only if one of the SEA error sources successfully reported an error + * record sent from the firmware. + */ +int ghes_notify_sea(void) +{ + struct ghes *ghes; + int ret = -ENOENT; + + rcu_read_lock(); + list_for_each_entry_rcu(ghes, &ghes_sea, list) { + if (!ghes_proc(ghes)) + ret = 0; + } + rcu_read_unlock(); + return ret; +} + +static void ghes_sea_add(struct ghes *ghes) +{ + mutex_lock(&ghes_list_mutex); + list_add_rcu(&ghes->list, &ghes_sea); + mutex_unlock(&ghes_list_mutex); +} + +static void ghes_sea_remove(struct ghes *ghes) +{ + mutex_lock(&ghes_list_mutex); + list_del_rcu(&ghes->list); + mutex_unlock(&ghes_list_mutex); + synchronize_rcu(); +} +#else /* CONFIG_ACPI_APEI_SEA */ +static inline void ghes_sea_add(struct ghes *ghes) +{ + pr_err(GHES_PFX "ID: %d, trying to add SEA notification which is not supported\n", + ghes->generic->header.source_id); +} + +static inline void ghes_sea_remove(struct ghes *ghes) +{ + pr_err(GHES_PFX "ID: %d, trying to remove SEA notification which is not supported\n", + ghes->generic->header.source_id); +} +#endif /* CONFIG_ACPI_APEI_SEA */ + #ifdef CONFIG_HAVE_ACPI_APEI_NMI /* * printk is not safe in NMI context. So in NMI handler, we allocate @@ -742,8 +882,6 @@ static atomic_t ghes_in_nmi = ATOMIC_INIT(0); static LIST_HEAD(ghes_nmi); -static int ghes_panic_timeout __read_mostly = 30; - static void ghes_proc_in_irq(struct irq_work *irq_work) { struct llist_node *llnode, *next; @@ -829,18 +967,6 @@ static void __process_error(struct ghes *ghes) #endif } -static void __ghes_panic(struct ghes *ghes) -{ - oops_begin(); - ghes_print_queued_estatus(); - __ghes_print_estatus(KERN_EMERG, ghes->generic, ghes->estatus); - - /* reboot to log the error! */ - if (panic_timeout == 0) - panic_timeout = ghes_panic_timeout; - panic("Fatal hardware error!"); -} - static int ghes_notify_nmi(unsigned int cmd, struct pt_regs *regs) { struct ghes *ghes; @@ -858,8 +984,11 @@ static int ghes_notify_nmi(unsigned int cmd, struct pt_regs *regs) } sev = ghes_severity(ghes->estatus->error_severity); - if (sev >= GHES_SEV_PANIC) + if (sev >= GHES_SEV_PANIC) { + oops_begin(); + ghes_print_queued_estatus(); __ghes_panic(ghes); + } if (!(ghes->flags & GHES_TO_CLEAR)) continue; @@ -966,6 +1095,17 @@ static int ghes_probe(struct platform_device *ghes_dev) case ACPI_HEST_NOTIFY_POLLED: case ACPI_HEST_NOTIFY_EXTERNAL: case ACPI_HEST_NOTIFY_SCI: + case ACPI_HEST_NOTIFY_GSIV: + case ACPI_HEST_NOTIFY_GPIO: + break; + + case ACPI_HEST_NOTIFY_SEA: + if (!IS_ENABLED(CONFIG_ACPI_APEI_SEA)) { + pr_warn(GHES_PFX "Generic hardware error source: %d notified via SEA is not supported\n", + generic->header.source_id); + rc = -ENOTSUPP; + goto err; + } break; case ACPI_HEST_NOTIFY_NMI: if (!IS_ENABLED(CONFIG_HAVE_ACPI_APEI_NMI)) { @@ -1024,13 +1164,20 @@ static int ghes_probe(struct platform_device *ghes_dev) goto err_edac_unreg; } break; + case ACPI_HEST_NOTIFY_SCI: + case ACPI_HEST_NOTIFY_GSIV: + case ACPI_HEST_NOTIFY_GPIO: mutex_lock(&ghes_list_mutex); - if (list_empty(&ghes_sci)) - register_acpi_hed_notifier(&ghes_notifier_sci); - list_add_rcu(&ghes->list, &ghes_sci); + if (list_empty(&ghes_hed)) + register_acpi_hed_notifier(&ghes_notifier_hed); + list_add_rcu(&ghes->list, &ghes_hed); mutex_unlock(&ghes_list_mutex); break; + + case ACPI_HEST_NOTIFY_SEA: + ghes_sea_add(ghes); + break; case ACPI_HEST_NOTIFY_NMI: ghes_nmi_add(ghes); break; @@ -1039,6 +1186,9 @@ static int ghes_probe(struct platform_device *ghes_dev) } platform_set_drvdata(ghes_dev, ghes); + /* Handle any pending errors right away */ + ghes_proc(ghes); + return 0; err_edac_unreg: ghes_edac_unregister(ghes); @@ -1066,14 +1216,21 @@ static int ghes_remove(struct platform_device *ghes_dev) case ACPI_HEST_NOTIFY_EXTERNAL: free_irq(ghes->irq, ghes); break; + case ACPI_HEST_NOTIFY_SCI: + case ACPI_HEST_NOTIFY_GSIV: + case ACPI_HEST_NOTIFY_GPIO: mutex_lock(&ghes_list_mutex); list_del_rcu(&ghes->list); - if (list_empty(&ghes_sci)) - unregister_acpi_hed_notifier(&ghes_notifier_sci); + if (list_empty(&ghes_hed)) + unregister_acpi_hed_notifier(&ghes_notifier_hed); mutex_unlock(&ghes_list_mutex); synchronize_rcu(); break; + + case ACPI_HEST_NOTIFY_SEA: + ghes_sea_remove(ghes); + break; case ACPI_HEST_NOTIFY_NMI: ghes_nmi_remove(ghes); break; |