From 1ca1d8d54f925ad0eb6d9806ecd4309738f25301 Mon Sep 17 00:00:00 2001 From: Lance Ortiz Date: Thu, 3 Jan 2013 15:34:01 -0700 Subject: aerdrv: Trace Event for PCI Express Advanced Error Reporting This header file will define a new trace event that will be triggered when an AER event occurs. The following data will be provided to the trace event. char * dev_name - The name of the slot where the device resides ([domain:]bus:device.function). u32 status - Either the correctable or uncorrectable register indicating what error or errors have been seen. u8 severity - error severity 0:NONFATAL 1:FATAL 2:CORRECTED The trace event will also provide a trace string that may look like: "0000:05:00.0 PCIe Bus Error:severity=Uncorrected (Non-Fatal), Poisoned TLP" Signed-off-by: Lance Ortiz Acked-by: Mauro Carvalho Chehab Acked-by: Boris Petkov Signed-off-by: Tony Luck --- include/trace/events/ras.h | 77 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) create mode 100644 include/trace/events/ras.h (limited to 'include') diff --git a/include/trace/events/ras.h b/include/trace/events/ras.h new file mode 100644 index 000000000000..88b878383797 --- /dev/null +++ b/include/trace/events/ras.h @@ -0,0 +1,77 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM ras + +#if !defined(_TRACE_AER_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_AER_H + +#include +#include + + +/* + * PCIe AER Trace event + * + * These events are generated when hardware detects a corrected or + * uncorrected event on a PCIe device. The event report has + * the following structure: + * + * char * dev_name - The name of the slot where the device resides + * ([domain:]bus:device.function). 
+ * u32 status - Either the correctable or uncorrectable register + * indicating what error or errors have been seen + * u8 severity - error severity 0:NONFATAL 1:FATAL 2:CORRECTED + */ + +#define aer_correctable_errors \ + {BIT(0), "Receiver Error"}, \ + {BIT(6), "Bad TLP"}, \ + {BIT(7), "Bad DLLP"}, \ + {BIT(8), "RELAY_NUM Rollover"}, \ + {BIT(12), "Replay Timer Timeout"}, \ + {BIT(13), "Advisory Non-Fatal"} + +#define aer_uncorrectable_errors \ + {BIT(4), "Data Link Protocol"}, \ + {BIT(12), "Poisoned TLP"}, \ + {BIT(13), "Flow Control Protocol"}, \ + {BIT(14), "Completion Timeout"}, \ + {BIT(15), "Completer Abort"}, \ + {BIT(16), "Unexpected Completion"}, \ + {BIT(17), "Receiver Overflow"}, \ + {BIT(18), "Malformed TLP"}, \ + {BIT(19), "ECRC"}, \ + {BIT(20), "Unsupported Request"} + +TRACE_EVENT(aer_event, + TP_PROTO(const char *dev_name, + const u32 status, + const u8 severity), + + TP_ARGS(dev_name, status, severity), + + TP_STRUCT__entry( + __string( dev_name, dev_name ) + __field( u32, status ) + __field( u8, severity ) + ), + + TP_fast_assign( + __assign_str(dev_name, dev_name); + __entry->status = status; + __entry->severity = severity; + ), + + TP_printk("%s PCIe Bus Error: severity=%s, %s\n", + __get_str(dev_name), + __entry->severity == HW_EVENT_ERR_CORRECTED ? "Corrected" : + __entry->severity == HW_EVENT_ERR_FATAL ? + "Fatal" : "Uncorrected", + __entry->severity == HW_EVENT_ERR_CORRECTED ? + __print_flags(__entry->status, "|", aer_correctable_errors) : + __print_flags(__entry->status, "|", aer_uncorrectable_errors)) +); + +#endif /* _TRACE_AER_H */ + +/* This part must be outside protection */ +#include -- cgit v1.2.3 From 1d5210008bd3a26daf4b06aed9d6c330dd4c83e2 Mon Sep 17 00:00:00 2001 From: Lance Ortiz Date: Thu, 3 Jan 2013 15:34:08 -0700 Subject: aerdrv: Enhanced AER logging This patch will provide a more reliable and easy way for user-space applications to have access to AER logs rather than reading them from the message buffer. 
It also provides a way to notify user-space when an AER event occurs. The aer driver is updated to generate a trace event of function 'aer_event' when a PCIe error is reported over the AER interface. The trace event was added to both the interrupt based aer path and the firmware first path. Signed-off-by: Lance Ortiz Acked-by: Mauro Carvalho Chehab Acked-by: Boris Petkov Signed-off-by: Tony Luck --- drivers/acpi/apei/cper.c | 19 ++++++++++++++++--- drivers/pci/pcie/aer/aerdrv_errprint.c | 9 ++++++++- include/linux/aer.h | 4 ++-- 3 files changed, 26 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/drivers/acpi/apei/cper.c b/drivers/acpi/apei/cper.c index e6defd86b424..1e5d8a40101e 100644 --- a/drivers/acpi/apei/cper.c +++ b/drivers/acpi/apei/cper.c @@ -29,6 +29,7 @@ #include #include #include +#include #include /* @@ -249,6 +250,10 @@ static const char *cper_pcie_port_type_strs[] = { static void cper_print_pcie(const char *pfx, const struct cper_sec_pcie *pcie, const struct acpi_hest_generic_data *gdata) { +#ifdef CONFIG_ACPI_APEI_PCIEAER + struct pci_dev *dev; +#endif + if (pcie->validation_bits & CPER_PCIE_VALID_PORT_TYPE) printk("%s""port_type: %d, %s\n", pfx, pcie->port_type, pcie->port_type < ARRAY_SIZE(cper_pcie_port_type_strs) ? 
@@ -281,10 +286,18 @@ static void cper_print_pcie(const char *pfx, const struct cper_sec_pcie *pcie, "%s""bridge: secondary_status: 0x%04x, control: 0x%04x\n", pfx, pcie->bridge.secondary_status, pcie->bridge.control); #ifdef CONFIG_ACPI_APEI_PCIEAER - if (pcie->validation_bits & CPER_PCIE_VALID_AER_INFO) { - struct aer_capability_regs *aer_regs = (void *)pcie->aer_info; - cper_print_aer(pfx, gdata->error_severity, aer_regs); + dev = pci_get_domain_bus_and_slot(pcie->device_id.segment, + pcie->device_id.bus, pcie->device_id.function); + if (!dev) { + pr_err("PCI AER Cannot get PCI device %04x:%02x:%02x.%d\n", + pcie->device_id.segment, pcie->device_id.bus, + pcie->device_id.slot, pcie->device_id.function); + return; } + if (pcie->validation_bits & CPER_PCIE_VALID_AER_INFO) + cper_print_aer(pfx, dev, gdata->error_severity, + (struct aer_capability_regs *) pcie->aer_info); + pci_dev_put(dev); #endif } diff --git a/drivers/pci/pcie/aer/aerdrv_errprint.c b/drivers/pci/pcie/aer/aerdrv_errprint.c index 3ea51736f18d..d3e5fc5a2de9 100644 --- a/drivers/pci/pcie/aer/aerdrv_errprint.c +++ b/drivers/pci/pcie/aer/aerdrv_errprint.c @@ -23,6 +23,9 @@ #include "aerdrv.h" +#define CREATE_TRACE_POINTS +#include + #define AER_AGENT_RECEIVER 0 #define AER_AGENT_REQUESTER 1 #define AER_AGENT_COMPLETER 2 @@ -194,6 +197,8 @@ void aer_print_error(struct pci_dev *dev, struct aer_err_info *info) if (info->id && info->error_dev_num > 1 && info->id == id) printk("%s"" Error of this Agent(%04x) is reported first\n", prefix, id); + trace_aer_event(dev_name(&dev->dev), (info->status & ~info->mask), + info->severity); } void aer_print_port_info(struct pci_dev *dev, struct aer_err_info *info) @@ -217,7 +222,7 @@ int cper_severity_to_aer(int cper_severity) } EXPORT_SYMBOL_GPL(cper_severity_to_aer); -void cper_print_aer(const char *prefix, int cper_severity, +void cper_print_aer(const char *prefix, struct pci_dev *dev, int cper_severity, struct aer_capability_regs *aer) { int aer_severity, layer, 
agent, status_strs_size, tlp_header_valid = 0; @@ -259,5 +264,7 @@ void cper_print_aer(const char *prefix, int cper_severity, *(tlp + 8), *(tlp + 15), *(tlp + 14), *(tlp + 13), *(tlp + 12)); } + trace_aer_event(dev_name(&dev->dev), (status & ~mask), + aer_severity); } #endif diff --git a/include/linux/aer.h b/include/linux/aer.h index 544abdb2238c..ec10e1b24c1c 100644 --- a/include/linux/aer.h +++ b/include/linux/aer.h @@ -49,8 +49,8 @@ static inline int pci_cleanup_aer_uncorrect_error_status(struct pci_dev *dev) } #endif -extern void cper_print_aer(const char *prefix, int cper_severity, - struct aer_capability_regs *aer); +extern void cper_print_aer(const char *prefix, struct pci_dev *dev, + int cper_severity, struct aer_capability_regs *aer); extern int cper_severity_to_aer(int cper_severity); extern void aer_recover_queue(int domain, unsigned int bus, unsigned int devfn, int severity); -- cgit v1.2.3