diff options
27 files changed, 2175 insertions, 377 deletions
diff --git a/Documentation/devicetree/bindings/arm/altera/socfpga-sdram-edac.txt b/Documentation/devicetree/bindings/arm/altera/socfpga-sdram-edac.txt index d0ce01da5c59..f5ad0ff69fae 100644 --- a/Documentation/devicetree/bindings/arm/altera/socfpga-sdram-edac.txt +++ b/Documentation/devicetree/bindings/arm/altera/socfpga-sdram-edac.txt @@ -2,7 +2,7 @@ Altera SOCFPGA SDRAM Error Detection & Correction [EDAC] The EDAC accesses a range of registers in the SDRAM controller. Required properties: -- compatible : should contain "altr,sdram-edac"; +- compatible : should contain "altr,sdram-edac" or "altr,sdram-edac-a10" - altr,sdr-syscon : phandle of the sdr module - interrupts : Should contain the SDRAM ECC IRQ in the appropriate format for the IRQ controller. diff --git a/Documentation/devicetree/bindings/edac/apm-xgene-edac.txt b/Documentation/devicetree/bindings/edac/apm-xgene-edac.txt new file mode 100644 index 000000000000..78edb80002c8 --- /dev/null +++ b/Documentation/devicetree/bindings/edac/apm-xgene-edac.txt @@ -0,0 +1,79 @@ +* APM X-Gene SoC EDAC node + +EDAC node is defined to describe on-chip error detection and correction. +The follow error types are supported: + + memory controller - Memory controller + PMD (L1/L2) - Processor module unit (PMD) L1/L2 cache + +The following section describes the EDAC DT node binding. + +Required properties: +- compatible : Shall be "apm,xgene-edac". +- regmap-csw : Regmap of the CPU switch fabric (CSW) resource. +- regmap-mcba : Regmap of the MCB-A (memory bridge) resource. +- regmap-mcbb : Regmap of the MCB-B (memory bridge) resource. +- regmap-efuse : Regmap of the PMD efuse resource. +- reg : First resource shall be the CPU bus (PCP) resource. +- interrupts : Interrupt-specifier for MCU, PMD, L3, or SoC error + IRQ(s). + +Required properties for memory controller subnode: +- compatible : Shall be "apm,xgene-edac-mc". +- reg : First resource shall be the memory controller unit + (MCU) resource. +- memory-controller : Instance number of the memory controller. + +Required properties for PMD subnode: +- compatible : Shall be "apm,xgene-edac-pmd" or + "apm,xgene-edac-pmd-v2". +- reg : First resource shall be the PMD resource. +- pmd-controller : Instance number of the PMD controller. + +Example: + csw: csw@7e200000 { + compatible = "apm,xgene-csw", "syscon"; + reg = <0x0 0x7e200000 0x0 0x1000>; + }; + + mcba: mcba@7e700000 { + compatible = "apm,xgene-mcb", "syscon"; + reg = <0x0 0x7e700000 0x0 0x1000>; + }; + + mcbb: mcbb@7e720000 { + compatible = "apm,xgene-mcb", "syscon"; + reg = <0x0 0x7e720000 0x0 0x1000>; + }; + + efuse: efuse@1054a000 { + compatible = "apm,xgene-efuse", "syscon"; + reg = <0x0 0x1054a000 0x0 0x20>; + }; + + edac@78800000 { + compatible = "apm,xgene-edac"; + #address-cells = <2>; + #size-cells = <2>; + ranges; + regmap-csw = <&csw>; + regmap-mcba = <&mcba>; + regmap-mcbb = <&mcbb>; + regmap-efuse = <&efuse>; + reg = <0x0 0x78800000 0x0 0x100>; + interrupts = <0x0 0x20 0x4>, + <0x0 0x21 0x4>, + <0x0 0x27 0x4>; + + edacmc@7e800000 { + compatible = "apm,xgene-edac-mc"; + reg = <0x0 0x7e800000 0x0 0x1000>; + memory-controller = <0>; + }; + + edacpmd@7c000000 { + compatible = "apm,xgene-edac-pmd"; + reg = <0x0 0x7c000000 0x0 0x200000>; + pmd-controller = <0>; + }; + }; diff --git a/Documentation/edac.txt b/Documentation/edac.txt index 73fff13e848f..0cf27a3544a5 100644 --- a/Documentation/edac.txt +++ b/Documentation/edac.txt @@ -1,53 +1,34 @@ - - EDAC - Error Detection And Correction - -Written by Doug Thompson <dougthompson@xmission.com> -7 Dec 2005 -17 Jul 2007 Updated - -(c) Mauro Carvalho Chehab -05 Aug 2009 Nehalem interface - -EDAC is maintained and written by: - - Doug Thompson, Dave Jiang, Dave Peterson et al, - original author: Thayne Harbaugh, - -Contact: - website: bluesmoke.sourceforge.net - mailing list: bluesmoke-devel@lists.sourceforge.net +===================================== "bluesmoke" was the name for this device driver when it was "out-of-tree" and maintained at sourceforge.net. When it was pushed into 2.6.16 for the first time, it was renamed to 'EDAC'. -The bluesmoke project at sourceforge.net is now utilized as a 'staging area' -for EDAC development, before it is sent upstream to kernel.org - -At the bluesmoke/EDAC project site is a series of quilt patches against -recent kernels, stored in a SVN repository. For easier downloading, there -is also a tarball snapshot available. +PURPOSE +------- -============================================================================ -EDAC PURPOSE - -The 'edac' kernel module goal is to detect and report errors that occur -within the computer system running under linux. +The 'edac' kernel module's goal is to detect and report hardware errors +that occur within the computer system running under linux. MEMORY +------ -In the initial release, memory Correctable Errors (CE) and Uncorrectable -Errors (UE) are the primary errors being harvested. These types of errors -are harvested by the 'edac_mc' class of device. +Memory Correctable Errors (CE) and Uncorrectable Errors (UE) are the +primary errors being harvested. These types of errors are harvested by +the 'edac_mc' device. Detecting CE events, then harvesting those events and reporting them, -CAN be a predictor of future UE events. With CE events, the system can -continue to operate, but with less safety. Preventive maintenance and -proactive part replacement of memory DIMMs exhibiting CEs can reduce -the likelihood of the dreaded UE events and system 'panics'. +*can* but must not necessarily be a predictor of future UE events. With +CE events only, the system can and will continue to operate as no data +has been damaged yet. + +However, preventive maintenance and proactive part replacement of memory +DIMMs exhibiting CEs can reduce the likelihood of the dreaded UE events +and system panics. -NON-MEMORY +OTHER HARDWARE ELEMENTS +----------------------- A new feature for EDAC, the edac_device class of device, was added in the 2.6.23 version of the kernel. @@ -56,70 +37,57 @@ This new device type allows for non-memory type of ECC hardware detectors to have their states harvested and presented to userspace via the sysfs interface. -Some architectures have ECC detectors for L1, L2 and L3 caches, along with DMA -engines, fabric switches, main data path switches, interconnections, -and various other hardware data paths. If the hardware reports it, then -a edac_device device probably can be constructed to harvest and present -that to userspace. +Some architectures have ECC detectors for L1, L2 and L3 caches, +along with DMA engines, fabric switches, main data path switches, +interconnections, and various other hardware data paths. If the hardware +reports it, then a edac_device device probably can be constructed to +harvest and present that to userspace. PCI BUS SCANNING +---------------- -In addition, PCI Bus Parity and SERR Errors are scanned for on PCI devices -in order to determine if errors are occurring on data transfers. +In addition, PCI devices are scanned for PCI Bus Parity and SERR Errors +in order to determine if errors are occurring during data transfers. The presence of PCI Parity errors must be examined with a grain of salt. -There are several add-in adapters that do NOT follow the PCI specification +There are several add-in adapters that do *not* follow the PCI specification with regards to Parity generation and reporting. The specification says the vendor should tie the parity status bits to 0 if they do not intend to generate parity. Some vendors do not do this, and thus the parity bit can "float" giving false positives. -In the kernel there is a PCI device attribute located in sysfs that is -checked by the EDAC PCI scanning code. If that attribute is set, -PCI parity/error scanning is skipped for that device. The attribute -is: +There is a PCI device attribute located in sysfs that is checked by +the EDAC PCI scanning code. If that attribute is set, PCI parity/error +scanning is skipped for that device. The attribute is: broken_parity_status -as is located in /sys/devices/pci<XXX>/0000:XX:YY.Z directories for +and is located in /sys/devices/pci<XXX>/0000:XX:YY.Z directories for PCI devices. -FUTURE HARDWARE SCANNING -EDAC will have future error detectors that will be integrated with -EDAC or added to it, in the following list: - - MCE Machine Check Exception - MCA Machine Check Architecture - NMI NMI notification of ECC errors - MSRs Machine Specific Register error cases - and other mechanisms. - -These errors are usually bus errors, ECC errors, thermal throttling -and the like. - - -============================================================================ -EDAC VERSIONING +VERSIONING +---------- EDAC is composed of a "core" module (edac_core.ko) and several Memory -Controller (MC) driver modules. On a given system, the CORE -is loaded and one MC driver will be loaded. Both the CORE and -the MC driver (or edac_device driver) have individual versions that reflect -current release level of their respective modules. +Controller (MC) driver modules. On a given system, the CORE is loaded +and one MC driver will be loaded. Both the CORE and the MC driver (or +edac_device driver) have individual versions that reflect current +release level of their respective modules. -Thus, to "report" on what version a system is running, one must report both -the CORE's and the MC driver's versions. +Thus, to "report" on what version a system is running, one must report +both the CORE's and the MC driver's versions. LOADING +------- -If 'edac' was statically linked with the kernel then no loading is -necessary. If 'edac' was built as modules then simply modprobe the -'edac' pieces that you need. You should be able to modprobe -hardware-specific modules and have the dependencies load the necessary core -modules. +If 'edac' was statically linked with the kernel then no loading +is necessary. If 'edac' was built as modules then simply modprobe +the 'edac' pieces that you need. You should be able to modprobe +hardware-specific modules and have the dependencies load the necessary +core modules. Example: @@ -129,35 +97,33 @@ loads both the amd76x_edac.ko memory controller module and the edac_mc.ko core module. -============================================================================ -EDAC sysfs INTERFACE - -EDAC presents a 'sysfs' interface for control, reporting and attribute -reporting purposes. +SYSFS INTERFACE +--------------- -EDAC lives in the /sys/devices/system/edac directory. +EDAC presents a 'sysfs' interface for control and reporting purposes. It +lives in the /sys/devices/system/edac directory. -Within this directory there currently reside 2 'edac' components: +Within this directory there currently reside 2 components: mc memory controller(s) system pci PCI control and status system -============================================================================ + Memory Controller (mc) Model +---------------------------- -First a background on the memory controller's model abstracted in EDAC. -Each 'mc' device controls a set of DIMM memory modules. These modules are -laid out in a Chip-Select Row (csrowX) and Channel table (chX). There can -be multiple csrows and multiple channels. +Each 'mc' device controls a set of DIMM memory modules. These modules +are laid out in a Chip-Select Row (csrowX) and Channel table (chX). +There can be multiple csrows and multiple channels. -Memory controllers allow for several csrows, with 8 csrows being a typical value. -Yet, the actual number of csrows depends on the electrical "loading" -of a given motherboard, memory controller and DIMM characteristics. +Memory controllers allow for several csrows, with 8 csrows being a +typical value. Yet, the actual number of csrows depends on the layout of +a given motherboard, memory controller and DIMM characteristics. -Dual channels allows for 128 bit data transfers to the CPU from memory. -Some newer chipsets allow for more than 2 channels, like Fully Buffered DIMMs -(FB-DIMMs). The following example will assume 2 channels: +Dual channels allows for 128 bit data transfers to/from the CPU from/to +memory. Some newer chipsets allow for more than 2 channels, like Fully +Buffered DIMMs (FB-DIMMs). The following example will assume 2 channels: Channel 0 Channel 1 @@ -179,12 +145,12 @@ for memory DIMMs: DIMM_A1 DIMM_B1 -Labels for these slots are usually silk screened on the motherboard. Slots -labeled 'A' are channel 0 in this example. Slots labeled 'B' -are channel 1. Notice that there are two csrows possible on a -physical DIMM. These csrows are allocated their csrow assignment -based on the slot into which the memory DIMM is placed. Thus, when 1 DIMM -is placed in each Channel, the csrows cross both DIMMs. +Labels for these slots are usually silk-screened on the motherboard. +Slots labeled 'A' are channel 0 in this example. Slots labeled 'B' are +channel 1. Notice that there are two csrows possible on a physical DIMM. +These csrows are allocated their csrow assignment based on the slot into +which the memory DIMM is placed. Thus, when 1 DIMM is placed in each +Channel, the csrows cross both DIMMs. Memory DIMMs come single or dual "ranked". A rank is a populated csrow. Thus, 2 single ranked DIMMs, placed in slots DIMM_A0 and DIMM_B0 above @@ -193,8 +159,8 @@ when 2 dual ranked DIMMs are similarly placed, then both csrow0 and csrow1 will be populated. The pattern repeats itself for csrow2 and csrow3. -The representation of the above is reflected in the directory tree -in EDAC's sysfs interface. Starting in directory +The representation of the above is reflected in the directory +tree in EDAC's sysfs interface. Starting in directory /sys/devices/system/edac/mc each memory controller will be represented by its own 'mcX' directory, where 'X' is the index of the MC. @@ -217,34 +183,35 @@ Under each 'mcX' directory each 'csrowX' is again represented by a |->csrow3 .... -Notice that there is no csrow1, which indicates that csrow0 is -composed of a single ranked DIMMs. This should also apply in both -Channels, in order to have dual-channel mode be operational. Since -both csrow2 and csrow3 are populated, this indicates a dual ranked -set of DIMMs for channels 0 and 1. +Notice that there is no csrow1, which indicates that csrow0 is composed +of a single ranked DIMMs. This should also apply in both Channels, in +order to have dual-channel mode be operational. Since both csrow2 and +csrow3 are populated, this indicates a dual ranked set of DIMMs for +channels 0 and 1. -Within each of the 'mcX' and 'csrowX' directories are several -EDAC control and attribute files. +Within each of the 'mcX' and 'csrowX' directories are several EDAC +control and attribute files. -============================================================================ -'mcX' DIRECTORIES +'mcX' directories +----------------- In 'mcX' directories are EDAC control and attribute files for this 'X' instance of the memory controllers. For a description of the sysfs API, please see: - Documentation/ABI/testing/sysfs/devices-edac + Documentation/ABI/testing/sysfs-devices-edac + -============================================================================ -'csrowX' DIRECTORIES +'csrowX' directories +-------------------- -When CONFIG_EDAC_LEGACY_SYSFS is enabled, the sysfs will contain the -csrowX directories. As this API doesn't work properly for Rambus, FB-DIMMs -and modern Intel Memory Controllers, this is being deprecated in favor -of dimmX directories. +When CONFIG_EDAC_LEGACY_SYSFS is enabled, sysfs will contain the csrowX +directories. As this API doesn't work properly for Rambus, FB-DIMMs and +modern Intel Memory Controllers, this is being deprecated in favor of +dimmX directories. In the 'csrowX' directories are EDAC control and attribute files for this 'X' instance of csrow: @@ -265,18 +232,18 @@ Total Correctable Errors count attribute file: 'ce_count' This attribute file displays the total count of correctable - errors that have occurred on this csrow. This - count is very important to examine. CEs provide early - indications that a DIMM is beginning to fail. This count - field should be monitored for non-zero values and report - such information to the system administrator. + errors that have occurred on this csrow. This count is very + important to examine. CEs provide early indications that a + DIMM is beginning to fail. This count field should be + monitored for non-zero values and report such information + to the system administrator. Total memory managed by this csrow attribute file: 'size_mb' - This attribute file displays, in count of megabytes, of memory + This attribute file displays, in count of megabytes, the memory that this csrow contains. @@ -377,11 +344,13 @@ Channel 1 DIMM Label control file: motherboard specific and determination of this information must occur in userland at this time. -============================================================================ + + SYSTEM LOGGING +-------------- -If logging for UEs and CEs are enabled then system logs will have -error notices indicating errors that have been detected: +If logging for UEs and CEs is enabled, then system logs will contain +information indicating that errors have been detected: EDAC MC0: CE page 0x283, offset 0xce0, grain 8, syndrome 0x6ec3, row 0, channel 1 "DIMM_B1": amd76x_edac @@ -404,24 +373,23 @@ The structure of the message is: and then an optional, driver-specific message that may have additional information. -Both UEs and CEs with no info will lack all but memory controller, -error type, a notice of "no info" and then an optional, -driver-specific error message. +Both UEs and CEs with no info will lack all but memory controller, error +type, a notice of "no info" and then an optional, driver-specific error +message. -============================================================================ PCI Bus Parity Detection +------------------------ - -On Header Type 00 devices the primary status is looked at -for any parity error regardless of whether Parity is enabled on the -device. (The spec indicates parity is generated in some cases). -On Header Type 01 bridges, the secondary status register is also -looked at to see if parity occurred on the bus on the other side of -the bridge. +On Header Type 00 devices, the primary status is looked at for any +parity error regardless of whether parity is enabled on the device or +not. (The spec indicates parity is generated in some cases). On Header +Type 01 bridges, the secondary status register is also looked at to see +if parity occurred on the bus on the other side of the bridge. SYSFS CONFIGURATION +------------------- Under /sys/devices/system/edac/pci are control and attribute files as follows: @@ -450,8 +418,9 @@ Parity Count: have been detected. -============================================================================ + MODULE PARAMETERS +----------------- Panic on UE control file: @@ -516,7 +485,7 @@ Panic on PCI PARITY Error: 'panic_on_pci_parity' - This control files enables or disables panicking when a parity + This control file enables or disables panicking when a parity error has been detected. @@ -530,10 +499,8 @@ Panic on PCI PARITY Error: -======================================================================= - - -EDAC_DEVICE type of device +EDAC device type +---------------- In the header file, edac_core.h, there is a series of edac_device structures and APIs for the EDAC_DEVICE. @@ -573,6 +540,7 @@ The test_device_edac device adds at least one of its own custom control: The symlink points to the 'struct dev' that is registered for this edac_device. INSTANCES +--------- One or more instance directories are present. For the 'test_device_edac' case: @@ -586,6 +554,7 @@ counter in deeper subdirectories. ue_count total of UE events of subdirectories BLOCKS +------ At the lowest directory level is the 'block' directory. There can be 0, 1 or more blocks specified in each instance. @@ -617,14 +586,15 @@ The 'test_device_edac' device adds 4 attributes and 1 control: reset all the above counters. -Use of the 'test_device_edac' driver should any others to create their own +Use of the 'test_device_edac' driver should enable any others to create their own unique drivers for their hardware systems. The 'test_device_edac' sample driver is located at the bluesmoke.sourceforge.net project site for EDAC. -======================================================================= + NEHALEM USAGE OF EDAC APIs +-------------------------- This chapter documents some EXPERIMENTAL mappings for EDAC API to handle Nehalem EDAC driver. They will likely be changed on future versions @@ -633,7 +603,7 @@ of the driver. Due to the way Nehalem exports Memory Controller data, some adjustments were done at i7core_edac driver. This chapter will cover those differences -1) On Nehalem, there are one Memory Controller per Quick Patch Interconnect +1) On Nehalem, there is one Memory Controller per Quick Patch Interconnect (QPI). At the driver, the term "socket" means one QPI. This is associated with a physical CPU socket. @@ -642,7 +612,7 @@ were done at i7core_edac driver. This chapter will cover those differences Each channel can have up to 3 DIMMs. The minimum known unity is DIMMs. There are no information about csrows. - As EDAC API maps the minimum unity is csrows, the driver sequencially + As EDAC API maps the minimum unity is csrows, the driver sequentially maps channel/dimm into different csrows. For example, supposing the following layout: @@ -664,7 +634,7 @@ exports one Each QPI is exported as a different memory controller. -2) Nehalem MC has the hability to generate errors. The driver implements this +2) Nehalem MC has the ability to generate errors. The driver implements this functionality via some error injection nodes: For injecting a memory error, there are some sysfs nodes, under @@ -771,5 +741,22 @@ exports one The standard error counters are generated when an mcelog error is received by the driver. Since, with udimm, this is counted by software, it is - possible that some errors could be lost. With rdimm's, they displays the + possible that some errors could be lost. With rdimm's, they display the contents of the registers + +CREDITS: +======== + +Written by Doug Thompson <dougthompson@xmission.com> +7 Dec 2005 +17 Jul 2007 Updated + +(c) Mauro Carvalho Chehab +05 Aug 2009 Nehalem interface + +EDAC authors/maintainers: + + Doug Thompson, Dave Jiang, Dave Peterson et al, + Mauro Carvalho Chehab + Borislav Petkov + original author: Thayne Harbaugh diff --git a/MAINTAINERS b/MAINTAINERS index 7ee651eb0b79..d7c53502aa86 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3777,7 +3777,7 @@ S: Maintained F: drivers/edac/ie31200_edac.c EDAC-MPC85XX -M: Johannes Thumshirn <johannes.thumshirn@men.de> +M: Johannes Thumshirn <morbidrsa@gmail.com> L: linux-edac@vger.kernel.org W: bluesmoke.sourceforge.net S: Maintained @@ -3804,6 +3804,13 @@ W: bluesmoke.sourceforge.net S: Maintained F: drivers/edac/sb_edac.c +EDAC-XGENE +APPLIED MICRO (APM) X-GENE SOC EDAC +M: Loc Ho <lho@apm.com> +S: Supported +F: drivers/edac/xgene_edac.c +F: Documentation/devicetree/bindings/edac/apm-xgene-edac.txt + EDIROL UA-101/UA-1000 DRIVER M: Clemens Ladisch <clemens@ladisch.de> L: alsa-devel@alsa-project.org (moderated for non-subscribers) @@ -6488,14 +6495,14 @@ F: include/linux/mtd/ F: include/uapi/mtd/ MEN A21 WATCHDOG DRIVER -M: Johannes Thumshirn <johannes.thumshirn@men.de> +M: Johannes Thumshirn <morbidrsa@gmail.com> L: linux-watchdog@vger.kernel.org -S: Supported +S: Maintained F: drivers/watchdog/mena21_wdt.c MEN CHAMELEON BUS (mcb) -M: Johannes Thumshirn <johannes.thumshirn@men.de> -S: Supported +M: Johannes Thumshirn <morbidrsa@gmail.com> +S: Maintained F: drivers/mcb/ F: include/linux/mcb.h diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 45df48ba0b12..325d6f3a596a 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -15,6 +15,8 @@ config ARM select CLONE_BACKWARDS select CPU_PM if (SUSPEND || CPU_IDLE) select DCACHE_WORD_ACCESS if HAVE_EFFICIENT_UNALIGNED_ACCESS + select EDAC_SUPPORT + select EDAC_ATOMIC_SCRUB select GENERIC_ALLOCATOR select GENERIC_ATOMIC64 if (CPU_V7M || CPU_V6 || !CPU_32v6K || !AEABI) select GENERIC_CLOCKEVENTS_BROADCAST if SMP diff --git a/arch/arm/boot/dts/socfpga_arria10.dtsi b/arch/arm/boot/dts/socfpga_arria10.dtsi index 8a05c47fd57f..4be75960a603 100644 --- a/arch/arm/boot/dts/socfpga_arria10.dtsi +++ b/arch/arm/boot/dts/socfpga_arria10.dtsi @@ -253,6 +253,17 @@ status = "disabled"; }; + sdr: sdr@ffc25000 { + compatible = "syscon"; + reg = <0xffcfb100 0x80>; + }; + + sdramedac { + compatible = "altr,sdram-edac-a10"; + altr,sdr-syscon = <&sdr>; + interrupts = <0 2 4>, <0 0 4>; + }; + L2: l2-cache@fffff000 { compatible = "arm,pl310-cache"; reg = <0xfffff000 0x1000>; diff --git a/arch/arm/include/asm/edac.h b/arch/arm/include/asm/edac.h index 0df7a2c1fc3d..5189fa819b60 100644 --- a/arch/arm/include/asm/edac.h +++ b/arch/arm/include/asm/edac.h @@ -18,11 +18,12 @@ #define ASM_EDAC_H /* * ECC atomic, DMA, SMP and interrupt safe scrub function. - * Implements the per arch atomic_scrub() that EDAC use for software + * Implements the per arch edac_atomic_scrub() that EDAC use for software * ECC scrubbing. It reads memory and then writes back the original * value, allowing the hardware to detect and correct memory errors. */ -static inline void atomic_scrub(void *va, u32 size) + +static inline void edac_atomic_scrub(void *va, u32 size) { #if __LINUX_ARM_ARCH__ >= 6 unsigned int *virt_addr = va; diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 802400f2a69e..290ed648aa11 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -23,6 +23,7 @@ config ARM64 select BUILDTIME_EXTABLE_SORT select CLONE_BACKWARDS select COMMON_CLK + select EDAC_SUPPORT select CPU_PM if (SUSPEND || CPU_IDLE) select DCACHE_WORD_ACCESS select GENERIC_ALLOCATOR diff --git a/arch/arm64/boot/dts/apm/apm-storm.dtsi b/arch/arm64/boot/dts/apm/apm-storm.dtsi index d8f3a1c65ecd..0bb287ca0a98 100644 --- a/arch/arm64/boot/dts/apm/apm-storm.dtsi +++ b/arch/arm64/boot/dts/apm/apm-storm.dtsi @@ -396,6 +396,89 @@ 0x0 0x1f 0x4>; }; + csw: csw@7e200000 { + compatible = "apm,xgene-csw", "syscon"; + reg = <0x0 0x7e200000 0x0 0x1000>; + }; + + mcba: mcba@7e700000 { + compatible = "apm,xgene-mcb", "syscon"; + reg = <0x0 0x7e700000 0x0 0x1000>; + }; + + mcbb: mcbb@7e720000 { + compatible = "apm,xgene-mcb", "syscon"; + reg = <0x0 0x7e720000 0x0 0x1000>; + }; + + efuse: efuse@1054a000 { + compatible = "apm,xgene-efuse", "syscon"; + reg = <0x0 0x1054a000 0x0 0x20>; + }; + + edac@78800000 { + compatible = "apm,xgene-edac"; + #address-cells = <2>; + #size-cells = <2>; + ranges; + regmap-csw = <&csw>; + regmap-mcba = <&mcba>; + regmap-mcbb = <&mcbb>; + regmap-efuse = <&efuse>; + reg = <0x0 0x78800000 0x0 0x100>; + interrupts = <0x0 0x20 0x4>, + <0x0 0x21 0x4>, + <0x0 0x27 0x4>; + + edacmc@7e800000 { + compatible = "apm,xgene-edac-mc"; + reg = <0x0 0x7e800000 0x0 0x1000>; + memory-controller = <0>; + }; + + edacmc@7e840000 { + compatible = "apm,xgene-edac-mc"; + reg = <0x0 0x7e840000 0x0 0x1000>; + memory-controller = <1>; + }; + + edacmc@7e880000 { + compatible = "apm,xgene-edac-mc"; + reg = <0x0 0x7e880000 0x0 0x1000>; + memory-controller = <2>; + }; + + edacmc@7e8c0000 { + compatible = "apm,xgene-edac-mc"; + reg = <0x0 0x7e8c0000 0x0 0x1000>; + memory-controller = <3>; + }; + + edacpmd@7c000000 { + compatible = "apm,xgene-edac-pmd"; + reg = <0x0 0x7c000000 0x0 0x200000>; + pmd-controller = <0>; + }; + + edacpmd@7c200000 { + compatible = "apm,xgene-edac-pmd"; + reg = <0x0 0x7c200000 0x0 0x200000>; + pmd-controller = <1>; + }; + + edacpmd@7c400000 { + compatible = "apm,xgene-edac-pmd"; + reg = <0x0 0x7c400000 0x0 0x200000>; + pmd-controller = <2>; + }; + + edacpmd@7c600000 { + compatible = "apm,xgene-edac-pmd"; + reg = <0x0 0x7c600000 0x0 0x200000>; + pmd-controller = <3>; + }; + }; + pcie0: pcie@1f2b0000 { status = "disabled"; device_type = "pci"; diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index f5016656494f..b65edf514b40 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -819,6 +819,7 @@ config CAVIUM_OCTEON_SOC select SYS_SUPPORTS_64BIT_KERNEL select SYS_SUPPORTS_BIG_ENDIAN select EDAC_SUPPORT + select EDAC_ATOMIC_SCRUB select SYS_SUPPORTS_LITTLE_ENDIAN select SYS_SUPPORTS_HOTPLUG_CPU if CPU_BIG_ENDIAN select SYS_HAS_EARLY_PRINTK diff --git a/arch/mips/include/asm/edac.h b/arch/mips/include/asm/edac.h index 94105d3f58f4..980b16527374 100644 --- a/arch/mips/include/asm/edac.h +++ b/arch/mips/include/asm/edac.h @@ -5,7 +5,7 @@ /* ECC atomic, DMA, SMP and interrupt safe scrub function */ -static inline void atomic_scrub(void *va, u32 size) +static inline void edac_atomic_scrub(void *va, u32 size) { unsigned long *virt_addr = va; unsigned long temp; @@ -21,7 +21,7 @@ static inline void atomic_scrub(void *va, u32 size) __asm__ __volatile__ ( " .set mips2 \n" - "1: ll %0, %1 # atomic_scrub \n" + "1: ll %0, %1 # edac_atomic_scrub \n" " addu %0, $0 \n" " sc %0, %1 \n" " beqz %0, 1b \n" diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 190cc48abc0c..5ef27113b898 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -153,6 +153,8 @@ config PPC select NO_BOOTMEM select HAVE_GENERIC_RCU_GUP select HAVE_PERF_EVENTS_NMI if PPC64 + select EDAC_SUPPORT + select EDAC_ATOMIC_SCRUB config GENERIC_CSUM def_bool CPU_LITTLE_ENDIAN diff --git a/arch/powerpc/include/asm/edac.h b/arch/powerpc/include/asm/edac.h index 6ead88bbfbb8..5571e23d253e 100644 --- a/arch/powerpc/include/asm/edac.h +++ b/arch/powerpc/include/asm/edac.h @@ -12,11 +12,11 @@ #define ASM_EDAC_H /* * ECC atomic, DMA, SMP and interrupt safe scrub function. - * Implements the per arch atomic_scrub() that EDAC use for software + * Implements the per arch edac_atomic_scrub() that EDAC use for software * ECC scrubbing. It reads memory and then writes back the original * value, allowing the hardware to detect and correct memory errors. */ -static __inline__ void atomic_scrub(void *va, u32 size) +static __inline__ void edac_atomic_scrub(void *va, u32 size) { unsigned int *virt_addr = va; unsigned int temp; diff --git a/arch/tile/Kconfig b/arch/tile/Kconfig index a07e31b50d3f..59cf0b911898 100644 --- a/arch/tile/Kconfig +++ b/arch/tile/Kconfig @@ -28,6 +28,7 @@ config TILE select HAVE_DEBUG_STACKOVERFLOW select ARCH_WANT_FRAME_POINTERS select HAVE_CONTEXT_TRACKING + select EDAC_SUPPORT # FIXME: investigate whether we need/want these options. # select HAVE_IOREMAP_PROT diff --git a/arch/tile/include/asm/edac.h b/arch/tile/include/asm/edac.h deleted file mode 100644 index 87fc83eeaffd..000000000000 --- a/arch/tile/include/asm/edac.h +++ /dev/null @@ -1,29 +0,0 @@ -/* - * Copyright 2011 Tilera Corporation. All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation, version 2. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or - * NON INFRINGEMENT. See the GNU General Public License for - * more details. - */ - -#ifndef _ASM_TILE_EDAC_H -#define _ASM_TILE_EDAC_H - -/* ECC atomic, DMA, SMP and interrupt safe scrub function */ - -static inline void atomic_scrub(void *va, u32 size) -{ - /* - * These is nothing to be done here because CE is - * corrected by the mshim. - */ - return; -} - -#endif /* _ASM_TILE_EDAC_H */ diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 7e39f9b22705..8e0b76ad8350 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -50,6 +50,8 @@ config X86 select CLONE_BACKWARDS if X86_32 select COMPAT_OLD_SIGACTION if IA32_EMULATION select DCACHE_WORD_ACCESS + select EDAC_ATOMIC_SCRUB + select EDAC_SUPPORT select GENERIC_CLOCKEVENTS select GENERIC_CLOCKEVENTS_BROADCAST if X86_64 || (X86_32 && X86_LOCAL_APIC) select GENERIC_CLOCKEVENTS_MIN_ADJUST diff --git a/arch/x86/include/asm/edac.h b/arch/x86/include/asm/edac.h index e9b57ecc70c5..cf8fdf83b231 100644 --- a/arch/x86/include/asm/edac.h +++ b/arch/x86/include/asm/edac.h @@ -3,7 +3,7 @@ /* ECC atomic, DMA, SMP and interrupt safe scrub function */ -static inline void atomic_scrub(void *va, u32 size) +static inline void edac_atomic_scrub(void *va, u32 size) { u32 i, *virt_addr = va; diff --git a/drivers/edac/Kconfig b/drivers/edac/Kconfig index cb59619df23f..8677ead2a8e1 100644 --- a/drivers/edac/Kconfig +++ b/drivers/edac/Kconfig @@ -2,15 +2,16 @@ # EDAC Kconfig # Copyright (c) 2008 Doug Thompson www.softwarebitmaker.com # Licensed and distributed under the GPL -# + +config EDAC_ATOMIC_SCRUB + bool config EDAC_SUPPORT bool menuconfig EDAC bool "EDAC (Error Detection And Correction) reporting" - depends on HAS_IOMEM - depends on X86 || PPC || TILE || ARM || EDAC_SUPPORT + depends on HAS_IOMEM && EDAC_SUPPORT help EDAC is designed to report errors in the core system. These are low-level errors that are reported in the CPU or @@ -262,10 +263,10 @@ config EDAC_SBRIDGE config EDAC_MPC85XX tristate "Freescale MPC83xx / MPC85xx" - depends on EDAC_MM_EDAC && FSL_SOC && (PPC_83xx || PPC_85xx) + depends on EDAC_MM_EDAC && FSL_SOC help Support for error detection and correction on the Freescale - MPC8349, MPC8560, MPC8540, MPC8548 + MPC8349, MPC8560, MPC8540, MPC8548, T4240 config EDAC_MV64X60 tristate "Marvell MV64x60" @@ -377,8 +378,8 @@ config EDAC_OCTEON_PCI Cavium Octeon family of SOCs. config EDAC_ALTERA_MC - tristate "Altera SDRAM Memory Controller EDAC" - depends on EDAC_MM_EDAC && ARCH_SOCFPGA + bool "Altera SDRAM Memory Controller EDAC" + depends on EDAC_MM_EDAC=y && ARCH_SOCFPGA help Support for error detection and correction on the Altera SDRAM memory controller. Note that the @@ -392,4 +393,11 @@ config EDAC_SYNOPSYS Support for error detection and correction on the Synopsys DDR memory controller. +config EDAC_XGENE + tristate "APM X-Gene SoC" + depends on EDAC_MM_EDAC && (ARM64 || COMPILE_TEST) + help + Support for error detection and correction on the + APM X-Gene family of SOCs. + endif # EDAC diff --git a/drivers/edac/Makefile b/drivers/edac/Makefile index b255f362b1db..28ef2a519f65 100644 --- a/drivers/edac/Makefile +++ b/drivers/edac/Makefile @@ -68,3 +68,4 @@ obj-$(CONFIG_EDAC_OCTEON_PCI) += octeon_edac-pci.o obj-$(CONFIG_EDAC_ALTERA_MC) += altera_edac.o obj-$(CONFIG_EDAC_SYNOPSYS) += synopsys_edac.o +obj-$(CONFIG_EDAC_XGENE) += xgene_edac.o diff --git a/drivers/edac/altera_edac.c b/drivers/edac/altera_edac.c index 3c4929fda9d5..23ef0917483c 100644 --- a/drivers/edac/altera_edac.c +++ b/drivers/edac/altera_edac.c @@ -1,5 +1,5 @@ /* - * Copyright Altera Corporation (C) 2014. All rights reserved. + * Copyright Altera Corporation (C) 2014-2015. All rights reserved. * Copyright 2011-2012 Calxeda, Inc. * * This program is free software; you can redistribute it and/or modify it @@ -28,113 +28,92 @@ #include <linux/types.h> #include <linux/uaccess.h> +#include "altera_edac.h" #include "edac_core.h" #include "edac_module.h" #define EDAC_MOD_STR "altera_edac" #define EDAC_VERSION "1" -/* SDRAM Controller CtrlCfg Register */ -#define CTLCFG_OFST 0x00 - -/* SDRAM Controller CtrlCfg Register Bit Masks */ -#define CTLCFG_ECC_EN 0x400 -#define CTLCFG_ECC_CORR_EN 0x800 -#define CTLCFG_GEN_SB_ERR 0x2000 -#define CTLCFG_GEN_DB_ERR 0x4000 - -#define CTLCFG_ECC_AUTO_EN (CTLCFG_ECC_EN | \ - CTLCFG_ECC_CORR_EN) - -/* SDRAM Controller Address Width Register */ -#define DRAMADDRW_OFST 0x2C - -/* SDRAM Controller Address Widths Field Register */ -#define DRAMADDRW_COLBIT_MASK 0x001F -#define DRAMADDRW_COLBIT_SHIFT 0 -#define DRAMADDRW_ROWBIT_MASK 0x03E0 -#define DRAMADDRW_ROWBIT_SHIFT 5 -#define DRAMADDRW_BANKBIT_MASK 0x1C00 -#define DRAMADDRW_BANKBIT_SHIFT 10 -#define DRAMADDRW_CSBIT_MASK 0xE000 -#define DRAMADDRW_CSBIT_SHIFT 13 - -/* SDRAM Controller Interface Data Width Register */ -#define DRAMIFWIDTH_OFST 0x30 - -/* SDRAM Controller Interface Data Width Defines */ -#define DRAMIFWIDTH_16B_ECC 24 -#define DRAMIFWIDTH_32B_ECC 40 - -/* SDRAM Controller DRAM Status Register */ -#define DRAMSTS_OFST 0x38 - -/* SDRAM Controller DRAM Status Register Bit Masks */ -#define DRAMSTS_SBEERR 0x04 -#define DRAMSTS_DBEERR 0x08 -#define DRAMSTS_CORR_DROP 0x10 - -/* SDRAM Controller DRAM IRQ Register */ -#define DRAMINTR_OFST 0x3C - -/* SDRAM Controller DRAM IRQ Register Bit Masks */ -#define DRAMINTR_INTREN 0x01 -#define DRAMINTR_SBEMASK 0x02 -#define DRAMINTR_DBEMASK 0x04 -#define DRAMINTR_CORRDROPMASK 0x08 -#define DRAMINTR_INTRCLR 0x10 - -/* SDRAM Controller Single Bit Error Count Register */ -#define SBECOUNT_OFST 0x40 - -/* SDRAM Controller Single Bit Error Count Register Bit Masks */ -#define SBECOUNT_MASK 0x0F - -/* SDRAM Controller Double Bit Error Count Register */ -#define DBECOUNT_OFST 0x44 - -/* SDRAM Controller Double Bit Error Count Register Bit Masks */ -#define DBECOUNT_MASK 0x0F - -/* SDRAM Controller ECC Error Address Register */ -#define ERRADDR_OFST 0x48 - -/* SDRAM Controller ECC Error Address Register Bit Masks */ -#define ERRADDR_MASK 0xFFFFFFFF +static const struct altr_sdram_prv_data c5_data = { + .ecc_ctrl_offset = CV_CTLCFG_OFST, + .ecc_ctl_en_mask = CV_CTLCFG_ECC_AUTO_EN, + .ecc_stat_offset = CV_DRAMSTS_OFST, + .ecc_stat_ce_mask = CV_DRAMSTS_SBEERR, + .ecc_stat_ue_mask = CV_DRAMSTS_DBEERR, + .ecc_saddr_offset = CV_ERRADDR_OFST, + .ecc_daddr_offset = CV_ERRADDR_OFST, + .ecc_cecnt_offset = CV_SBECOUNT_OFST, + .ecc_uecnt_offset = CV_DBECOUNT_OFST, + .ecc_irq_en_offset = CV_DRAMINTR_OFST, + .ecc_irq_en_mask = CV_DRAMINTR_INTREN, + .ecc_irq_clr_offset = CV_DRAMINTR_OFST, + .ecc_irq_clr_mask = (CV_DRAMINTR_INTRCLR | CV_DRAMINTR_INTREN), + .ecc_cnt_rst_offset = CV_DRAMINTR_OFST, + .ecc_cnt_rst_mask = CV_DRAMINTR_INTRCLR, +#ifdef CONFIG_EDAC_DEBUG + .ce_ue_trgr_offset = CV_CTLCFG_OFST, + .ce_set_mask = CV_CTLCFG_GEN_SB_ERR, + .ue_set_mask = CV_CTLCFG_GEN_DB_ERR, +#endif +}; -/* Altera SDRAM Memory Controller data */ -struct altr_sdram_mc_data { - struct regmap *mc_vbase; +static const struct altr_sdram_prv_data a10_data = { + .ecc_ctrl_offset = A10_ECCCTRL1_OFST, + .ecc_ctl_en_mask = A10_ECCCTRL1_ECC_EN, + .ecc_stat_offset = A10_INTSTAT_OFST, + .ecc_stat_ce_mask = A10_INTSTAT_SBEERR, + .ecc_stat_ue_mask = A10_INTSTAT_DBEERR, + .ecc_saddr_offset = A10_SERRADDR_OFST, + .ecc_daddr_offset = A10_DERRADDR_OFST, + .ecc_irq_en_offset = A10_ERRINTEN_OFST, + .ecc_irq_en_mask = A10_ECC_IRQ_EN_MASK, + .ecc_irq_clr_offset = A10_INTSTAT_OFST, + .ecc_irq_clr_mask = (A10_INTSTAT_SBEERR | A10_INTSTAT_DBEERR), + .ecc_cnt_rst_offset = A10_ECCCTRL1_OFST, + .ecc_cnt_rst_mask = A10_ECC_CNT_RESET_MASK, +#ifdef CONFIG_EDAC_DEBUG + .ce_ue_trgr_offset = A10_DIAGINTTEST_OFST, + .ce_set_mask = A10_DIAGINT_TSERRA_MASK, + .ue_set_mask = A10_DIAGINT_TDERRA_MASK, +#endif }; static irqreturn_t altr_sdram_mc_err_handler(int irq, void *dev_id) { struct mem_ctl_info *mci = dev_id; struct altr_sdram_mc_data *drvdata = mci->pvt_info; - u32 status, err_count, err_addr; + const struct altr_sdram_prv_data *priv = drvdata->data; + u32 status, err_count = 1, err_addr; - /* Error Address is shared by both SBE & DBE */ - regmap_read(drvdata->mc_vbase, ERRADDR_OFST, &err_addr); + regmap_read(drvdata->mc_vbase, priv->ecc_stat_offset, &status); - regmap_read(drvdata->mc_vbase, DRAMSTS_OFST, &status); - - if (status & DRAMSTS_DBEERR) { - regmap_read(drvdata->mc_vbase, DBECOUNT_OFST, &err_count); + if (status & priv->ecc_stat_ue_mask) { + regmap_read(drvdata->mc_vbase, priv->ecc_daddr_offset, + &err_addr); + if (priv->ecc_uecnt_offset) + regmap_read(drvdata->mc_vbase, priv->ecc_uecnt_offset, + &err_count); panic("\nEDAC: [%d Uncorrectable errors @ 0x%08X]\n", err_count, err_addr); } - if (status & DRAMSTS_SBEERR) { - regmap_read(drvdata->mc_vbase, SBECOUNT_OFST, &err_count); + if (status & priv->ecc_stat_ce_mask) { + regmap_read(drvdata->mc_vbase, priv->ecc_saddr_offset, + &err_addr); + if (priv->ecc_uecnt_offset) + regmap_read(drvdata->mc_vbase, priv->ecc_cecnt_offset, + &err_count); edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci, err_count, err_addr >> PAGE_SHIFT, err_addr & ~PAGE_MASK, 0, 0, 0, -1, mci->ctl_name, ""); - } - - regmap_write(drvdata->mc_vbase, DRAMINTR_OFST, - (DRAMINTR_INTRCLR | DRAMINTR_INTREN)); + /* Clear IRQ to resume */ + regmap_write(drvdata->mc_vbase, priv->ecc_irq_clr_offset, + priv->ecc_irq_clr_mask); - return IRQ_HANDLED; + return IRQ_HANDLED; + } + return IRQ_NONE; } #ifdef CONFIG_EDAC_DEBUG @@ -144,6 +123,7 @@ static ssize_t altr_sdr_mc_err_inject_write(struct file *file, { struct mem_ctl_info *mci = file->private_data; struct altr_sdram_mc_data *drvdata = mci->pvt_info; + const struct altr_sdram_prv_data *priv = drvdata->data; u32 *ptemp; dma_addr_t dma_handle; u32 reg, read_reg; @@ -156,8 +136,9 @@ static ssize_t altr_sdr_mc_err_inject_write(struct file *file, return -ENOMEM; } - regmap_read(drvdata->mc_vbase, CTLCFG_OFST, &read_reg); - read_reg &= ~(CTLCFG_GEN_SB_ERR | CTLCFG_GEN_DB_ERR); + regmap_read(drvdata->mc_vbase, priv->ce_ue_trgr_offset, + &read_reg); + read_reg &= ~(priv->ce_set_mask | priv->ue_set_mask); /* Error are injected by writing a word while the SBE or DBE * bit in the CTLCFG register is set. Reading the word will @@ -166,20 +147,20 @@ static ssize_t altr_sdr_mc_err_inject_write(struct file *file, if (count == 3) { edac_printk(KERN_ALERT, EDAC_MC, "Inject Double bit error\n"); - regmap_write(drvdata->mc_vbase, CTLCFG_OFST, - (read_reg | CTLCFG_GEN_DB_ERR)); + regmap_write(drvdata->mc_vbase, priv->ce_ue_trgr_offset, + (read_reg | priv->ue_set_mask)); } else { edac_printk(KERN_ALERT, EDAC_MC, "Inject Single bit error\n"); - regmap_write(drvdata->mc_vbase, CTLCFG_OFST, - (read_reg | CTLCFG_GEN_SB_ERR)); + regmap_write(drvdata->mc_vbase, priv->ce_ue_trgr_offset, + (read_reg | priv->ce_set_mask)); } ptemp[0] = 0x5A5A5A5A; ptemp[1] = 0xA5A5A5A5; /* Clear the error injection bits */ - regmap_write(drvdata->mc_vbase, CTLCFG_OFST, read_reg); + regmap_write(drvdata->mc_vbase, priv->ce_ue_trgr_offset, read_reg); /* Ensure it has been written out */ wmb(); @@ -219,50 +200,106 @@ static void altr_sdr_mc_create_debugfs_nodes(struct mem_ctl_info *mci) {} #endif -/* Get total memory size in bytes */ -static u32 altr_sdram_get_total_mem_size(struct regmap *mc_vbase) +/* Get total memory size from Open Firmware DTB */ +static unsigned long get_total_mem(void) { - u32 size, read_reg, row, bank, col, cs, width; - - if (regmap_read(mc_vbase, DRAMADDRW_OFST, &read_reg) < 0) - return 0; - - if (regmap_read(mc_vbase, DRAMIFWIDTH_OFST, &width) < 0) - return 0; - - col = (read_reg & DRAMADDRW_COLBIT_MASK) >> - DRAMADDRW_COLBIT_SHIFT; - row = (read_reg & DRAMADDRW_ROWBIT_MASK) >> - DRAMADDRW_ROWBIT_SHIFT; - bank = (read_reg & DRAMADDRW_BANKBIT_MASK) >> - DRAMADDRW_BANKBIT_SHIFT; - cs = (read_reg & DRAMADDRW_CSBIT_MASK) >> - DRAMADDRW_CSBIT_SHIFT; - - /* Correct for ECC as its not addressible */ - if (width == DRAMIFWIDTH_32B_ECC) - width = 32; - if (width == DRAMIFWIDTH_16B_ECC) - width = 16; - - /* calculate the SDRAM size base on this info */ - size = 1 << (row + bank + col); - size = size * cs * (width / 8); - return size; + struct device_node *np = NULL; + const unsigned int *reg, *reg_end; + int len, sw, aw; + unsigned long start, size, total_mem = 0; + + for_each_node_by_type(np, "memory") { + aw = of_n_addr_cells(np); + sw = of_n_size_cells(np); + reg = (const unsigned int *)of_get_property(np, "reg", &len); + reg_end = reg + (len / sizeof(u32)); + + total_mem = 0; + do { + start = of_read_number(reg, aw); + reg += aw; + size = of_read_number(reg, sw); + reg += sw; + total_mem += size; + } while (reg < reg_end); + } + edac_dbg(0, "total_mem 0x%lx\n", total_mem); + return total_mem; +} + +static const struct of_device_id altr_sdram_ctrl_of_match[] = { + { .compatible = "altr,sdram-edac", .data = (void *)&c5_data}, + { .compatible = "altr,sdram-edac-a10", .data = (void *)&a10_data}, + {}, +}; +MODULE_DEVICE_TABLE(of, altr_sdram_ctrl_of_match); + +static int a10_init(struct regmap *mc_vbase) +{ + if (regmap_update_bits(mc_vbase, A10_INTMODE_OFST, + A10_INTMODE_SB_INT, A10_INTMODE_SB_INT)) { + edac_printk(KERN_ERR, EDAC_MC, + "Error setting SB IRQ mode\n"); + return -ENODEV; + } + + if (regmap_write(mc_vbase, A10_SERRCNTREG_OFST, 1)) { + edac_printk(KERN_ERR, EDAC_MC, + "Error setting trigger count\n"); + return -ENODEV; + } + + return 0; +} + +static int a10_unmask_irq(struct platform_device *pdev, u32 mask) +{ + void __iomem *sm_base; + int ret = 0; + + if (!request_mem_region(A10_SYMAN_INTMASK_CLR, sizeof(u32), + dev_name(&pdev->dev))) { + edac_printk(KERN_ERR, EDAC_MC, + "Unable to request mem region\n"); + return -EBUSY; + } + + sm_base = ioremap(A10_SYMAN_INTMASK_CLR, sizeof(u32)); + if (!sm_base) { + edac_printk(KERN_ERR, EDAC_MC, + "Unable to ioremap device\n"); + + ret = -ENOMEM; + goto release; + } + + iowrite32(mask, sm_base); + + iounmap(sm_base); + +release: + release_mem_region(A10_SYMAN_INTMASK_CLR, sizeof(u32)); + + return ret; } static int altr_sdram_probe(struct platform_device *pdev) { + const struct of_device_id *id; struct edac_mc_layer layers[2]; struct mem_ctl_info *mci; struct altr_sdram_mc_data *drvdata; + const struct altr_sdram_prv_data *priv; struct regmap *mc_vbase; struct dimm_info *dimm; - u32 read_reg, mem_size; - int irq; - int res = 0; + u32 read_reg; + int irq, irq2, res = 0; + unsigned long mem_size, irqflags = 0; + + id = of_match_device(altr_sdram_ctrl_of_match, &pdev->dev); + if (!id) + return -ENODEV; - /* Validate the SDRAM controller has ECC enabled */ /* Grab the register range from the sdr controller in device tree */ mc_vbase = syscon_regmap_lookup_by_phandle(pdev->dev.of_node, "altr,sdr-syscon"); @@ -272,25 +309,46 @@ static int altr_sdram_probe(struct platform_device *pdev) return -ENODEV; } - if (regmap_read(mc_vbase, CTLCFG_OFST, &read_reg) || - ((read_reg & CTLCFG_ECC_AUTO_EN) != CTLCFG_ECC_AUTO_EN)) { + /* Check specific dependencies for the module */ + priv = of_match_node(altr_sdram_ctrl_of_match, + pdev->dev.of_node)->data; + + /* Validate the SDRAM controller has ECC enabled */ + if (regmap_read(mc_vbase, priv->ecc_ctrl_offset, &read_reg) || + ((read_reg & priv->ecc_ctl_en_mask) != priv->ecc_ctl_en_mask)) { edac_printk(KERN_ERR, EDAC_MC, "No ECC/ECC disabled [0x%08X]\n", read_reg); return -ENODEV; } /* Grab memory size from device tree. */ - mem_size = altr_sdram_get_total_mem_size(mc_vbase); + mem_size = get_total_mem(); if (!mem_size) { + edac_printk(KERN_ERR, EDAC_MC, "Unable to calculate memory size\n"); + return -ENODEV; + } + + /* Ensure the SDRAM Interrupt is disabled */ + if (regmap_update_bits(mc_vbase, priv->ecc_irq_en_offset, + priv->ecc_irq_en_mask, 0)) { + edac_printk(KERN_ERR, EDAC_MC, + "Error disabling SDRAM ECC IRQ\n"); + return -ENODEV; + } + + /* Toggle to clear the SDRAM Error count */ + if (regmap_update_bits(mc_vbase, priv->ecc_cnt_rst_offset, + priv->ecc_cnt_rst_mask, + priv->ecc_cnt_rst_mask)) { edac_printk(KERN_ERR, EDAC_MC, - "Unable to calculate memory size\n"); + "Error clearing SDRAM ECC count\n"); return -ENODEV; } - /* Ensure the SDRAM Interrupt is disabled and cleared */ - if (regmap_write(mc_vbase, DRAMINTR_OFST, DRAMINTR_INTRCLR)) { + if (regmap_update_bits(mc_vbase, priv->ecc_cnt_rst_offset, + priv->ecc_cnt_rst_mask, 0)) { edac_printk(KERN_ERR, EDAC_MC, - "Error clearing SDRAM ECC IRQ\n"); + "Error clearing SDRAM ECC count\n"); return -ENODEV; } @@ -301,6 +359,9 @@ static int altr_sdram_probe(struct platform_device *pdev) return -ENODEV; } + /* Arria10 has a 2nd IRQ */ + irq2 = platform_get_irq(pdev, 1); + layers[0].type = EDAC_MC_LAYER_CHIP_SELECT; layers[0].size = 1; layers[0].is_virt_csrow = true; @@ -315,9 +376,12 @@ static int altr_sdram_probe(struct platform_device *pdev) mci->pdev = &pdev->dev; drvdata = mci->pvt_info; drvdata->mc_vbase = mc_vbase; + drvdata->data = priv; platform_set_drvdata(pdev, mci); if (!devres_open_group(&pdev->dev, NULL, GFP_KERNEL)) { + edac_printk(KERN_ERR, EDAC_MC, + "Unable to get managed device resource\n"); res = -ENOMEM; goto free; } @@ -342,8 +406,32 @@ static int altr_sdram_probe(struct platform_device *pdev) if (res < 0) goto err; + /* Only the Arria10 has separate IRQs */ + if (irq2 > 0) { + /* Arria10 specific initialization */ + res = a10_init(mc_vbase); + if (res < 0) + goto err2; + + res = devm_request_irq(&pdev->dev, irq2, + altr_sdram_mc_err_handler, + IRQF_SHARED, dev_name(&pdev->dev), mci); + if (res < 0) { + edac_mc_printk(mci, KERN_ERR, + "Unable to request irq %d\n", irq2); + res = -ENODEV; + goto err2; + } + + res = a10_unmask_irq(pdev, A10_DDR0_IRQ_MASK); + if (res < 0) + goto err2; + + irqflags = IRQF_SHARED; + } + res = devm_request_irq(&pdev->dev, irq, altr_sdram_mc_err_handler, - 0, dev_name(&pdev->dev), mci); + irqflags, dev_name(&pdev->dev), mci); if (res < 0) { edac_mc_printk(mci, KERN_ERR, "Unable to request irq %d\n", irq); @@ -351,8 +439,9 @@ static int altr_sdram_probe(struct platform_device *pdev) goto err2; } - if (regmap_write(drvdata->mc_vbase, DRAMINTR_OFST, - (DRAMINTR_INTRCLR | DRAMINTR_INTREN))) { + /* Infrastructure ready - enable the IRQ */ + if (regmap_update_bits(drvdata->mc_vbase, priv->ecc_irq_en_offset, + priv->ecc_irq_en_mask, priv->ecc_irq_en_mask)) { edac_mc_printk(mci, KERN_ERR, "Error enabling SDRAM ECC IRQ\n"); res = -ENODEV; @@ -388,17 +477,31 @@ static int altr_sdram_remove(struct platform_device *pdev) return 0; } -static const struct of_device_id altr_sdram_ctrl_of_match[] = { - { .compatible = "altr,sdram-edac", }, - {}, +/* + * If you want to suspend, need to disable EDAC by removing it + * from the device tree or defconfig. + */ +#ifdef CONFIG_PM +static int altr_sdram_prepare(struct device *dev) +{ + pr_err("Suspend not allowed when EDAC is enabled.\n"); + + return -EPERM; +} + +static const struct dev_pm_ops altr_sdram_pm_ops = { + .prepare = altr_sdram_prepare, }; -MODULE_DEVICE_TABLE(of, altr_sdram_ctrl_of_match); +#endif static struct platform_driver altr_sdram_edac_driver = { .probe = altr_sdram_probe, .remove = altr_sdram_remove, .driver = { .name = "altr_sdram_edac", +#ifdef CONFIG_PM + .pm = &altr_sdram_pm_ops, +#endif .of_match_table = altr_sdram_ctrl_of_match, }, }; diff --git a/drivers/edac/altera_edac.h b/drivers/edac/altera_edac.h new file mode 100644 index 000000000000..7b64dc7c4eb7 --- /dev/null +++ b/drivers/edac/altera_edac.h @@ -0,0 +1,201 @@ +/* + * + * Copyright (C) 2015 Altera Corporation + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef _ALTERA_EDAC_H +#define _ALTERA_EDAC_H + +#include <linux/edac.h> +#include <linux/types.h> + +/* SDRAM Controller CtrlCfg Register */ +#define CV_CTLCFG_OFST 0x00 + +/* SDRAM Controller CtrlCfg Register Bit Masks */ +#define CV_CTLCFG_ECC_EN 0x400 +#define CV_CTLCFG_ECC_CORR_EN 0x800 +#define CV_CTLCFG_GEN_SB_ERR 0x2000 +#define CV_CTLCFG_GEN_DB_ERR 0x4000 + +#define CV_CTLCFG_ECC_AUTO_EN (CV_CTLCFG_ECC_EN | \ + CV_CTLCFG_ECC_CORR_EN) + +/* SDRAM Controller Address Width Register */ +#define CV_DRAMADDRW_OFST 0x2C + +/* SDRAM Controller Address Widths Field Register */ +#define DRAMADDRW_COLBIT_MASK 0x001F +#define DRAMADDRW_COLBIT_SHIFT 0 +#define DRAMADDRW_ROWBIT_MASK 0x03E0 +#define DRAMADDRW_ROWBIT_SHIFT 5 +#define CV_DRAMADDRW_BANKBIT_MASK 0x1C00 +#define CV_DRAMADDRW_BANKBIT_SHIFT 10 +#define CV_DRAMADDRW_CSBIT_MASK 0xE000 +#define CV_DRAMADDRW_CSBIT_SHIFT 13 + +/* SDRAM Controller Interface Data Width Register */ +#define CV_DRAMIFWIDTH_OFST 0x30 + +/* SDRAM Controller Interface Data Width Defines */ +#define CV_DRAMIFWIDTH_16B_ECC 24 +#define CV_DRAMIFWIDTH_32B_ECC 40 + +/* SDRAM Controller DRAM Status Register */ +#define CV_DRAMSTS_OFST 0x38 + +/* SDRAM Controller DRAM Status Register Bit Masks */ +#define CV_DRAMSTS_SBEERR 0x04 +#define CV_DRAMSTS_DBEERR 0x08 +#define CV_DRAMSTS_CORR_DROP 0x10 + +/* SDRAM Controller DRAM IRQ Register */ +#define CV_DRAMINTR_OFST 0x3C + +/* SDRAM Controller DRAM IRQ Register Bit Masks */ +#define CV_DRAMINTR_INTREN 0x01 +#define CV_DRAMINTR_SBEMASK 0x02 +#define CV_DRAMINTR_DBEMASK 0x04 +#define CV_DRAMINTR_CORRDROPMASK 0x08 +#define CV_DRAMINTR_INTRCLR 0x10 + +/* SDRAM Controller Single Bit Error Count Register */ +#define CV_SBECOUNT_OFST 0x40 + +/* SDRAM Controller Double Bit Error Count Register */ +#define CV_DBECOUNT_OFST 0x44 + +/* SDRAM Controller ECC Error Address Register */ +#define CV_ERRADDR_OFST 0x48 + +/*-----------------------------------------*/ + +/* SDRAM Controller EccCtrl Register */ +#define A10_ECCCTRL1_OFST 0x00 + +/* SDRAM Controller EccCtrl Register Bit Masks */ +#define A10_ECCCTRL1_ECC_EN 0x001 +#define A10_ECCCTRL1_CNT_RST 0x010 +#define A10_ECCCTRL1_AWB_CNT_RST 0x100 +#define A10_ECC_CNT_RESET_MASK (A10_ECCCTRL1_CNT_RST | \ + A10_ECCCTRL1_AWB_CNT_RST) + +/* SDRAM Controller Address Width Register */ +#define CV_DRAMADDRW 0xFFC2502C +#define A10_DRAMADDRW 0xFFCFA0A8 + +/* SDRAM Controller Address Widths Field Register */ +#define DRAMADDRW_COLBIT_MASK 0x001F +#define DRAMADDRW_COLBIT_SHIFT 0 +#define DRAMADDRW_ROWBIT_MASK 0x03E0 +#define DRAMADDRW_ROWBIT_SHIFT 5 +#define CV_DRAMADDRW_BANKBIT_MASK 0x1C00 +#define CV_DRAMADDRW_BANKBIT_SHIFT 10 +#define CV_DRAMADDRW_CSBIT_MASK 0xE000 +#define CV_DRAMADDRW_CSBIT_SHIFT 13 + +#define A10_DRAMADDRW_BANKBIT_MASK 0x3C00 +#define A10_DRAMADDRW_BANKBIT_SHIFT 10 +#define A10_DRAMADDRW_GRPBIT_MASK 0xC000 +#define A10_DRAMADDRW_GRPBIT_SHIFT 14 +#define A10_DRAMADDRW_CSBIT_MASK 0x70000 +#define A10_DRAMADDRW_CSBIT_SHIFT 16 + +/* SDRAM Controller Interface Data Width Register */ +#define CV_DRAMIFWIDTH 0xFFC25030 +#define A10_DRAMIFWIDTH 0xFFCFB008 + +/* SDRAM Controller Interface Data Width Defines */ +#define CV_DRAMIFWIDTH_16B_ECC 24 +#define CV_DRAMIFWIDTH_32B_ECC 40 + +#define A10_DRAMIFWIDTH_16B 0x0 +#define A10_DRAMIFWIDTH_32B 0x1 +#define A10_DRAMIFWIDTH_64B 0x2 + +/* SDRAM Controller DRAM IRQ Register */ +#define A10_ERRINTEN_OFST 0x10 + +/* SDRAM Controller DRAM IRQ Register Bit Masks */ +#define A10_ERRINTEN_SERRINTEN 0x01 +#define A10_ERRINTEN_DERRINTEN 0x02 +#define A10_ECC_IRQ_EN_MASK (A10_ERRINTEN_SERRINTEN | \ + A10_ERRINTEN_DERRINTEN) + +/* SDRAM Interrupt Mode Register */ +#define A10_INTMODE_OFST 0x1C +#define A10_INTMODE_SB_INT 1 + +/* SDRAM Controller Error Status Register */ +#define A10_INTSTAT_OFST 0x20 + +/* SDRAM Controller Error Status Register Bit Masks */ +#define A10_INTSTAT_SBEERR 0x01 +#define A10_INTSTAT_DBEERR 0x02 + +/* SDRAM Controller ECC Error Address Register */ +#define A10_DERRADDR_OFST 0x2C +#define A10_SERRADDR_OFST 0x30 + +/* SDRAM Controller ECC Diagnostic Register */ +#define A10_DIAGINTTEST_OFST 0x24 + +#define A10_DIAGINT_TSERRA_MASK 0x0001 +#define A10_DIAGINT_TDERRA_MASK 0x0100 + +#define A10_SBERR_IRQ 34 +#define A10_DBERR_IRQ 32 + +/* SDRAM Single Bit Error Count Compare Set Register */ +#define A10_SERRCNTREG_OFST 0x3C + +#define A10_SYMAN_INTMASK_CLR 0xFFD06098 +#define A10_INTMASK_CLR_OFST 0x10 +#define A10_DDR0_IRQ_MASK BIT(17) + +struct altr_sdram_prv_data { + int ecc_ctrl_offset; + int ecc_ctl_en_mask; + int ecc_cecnt_offset; + int ecc_uecnt_offset; + int ecc_stat_offset; + int ecc_stat_ce_mask; + int ecc_stat_ue_mask; + int ecc_saddr_offset; + int ecc_daddr_offset; + int ecc_irq_en_offset; + int ecc_irq_en_mask; + int ecc_irq_clr_offset; + int ecc_irq_clr_mask; + int ecc_cnt_rst_offset; + int ecc_cnt_rst_mask; +#ifdef CONFIG_EDAC_DEBUG + struct edac_dev_sysfs_attribute *eccmgr_sysfs_attr; + int ecc_enable_mask; + int ce_set_mask; + int ue_set_mask; + int ce_ue_trgr_offset; +#endif +}; + +/* Altera SDRAM Memory Controller data */ +struct altr_sdram_mc_data { + struct regmap *mc_vbase; + int sb_irq; + int db_irq; + const struct altr_sdram_prv_data *data; +}; + +#endif /* #ifndef _ALTERA_EDAC_H */ diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c index af3be1914dbb..943ed8cf71b9 100644 --- a/drivers/edac/edac_mc.c +++ b/drivers/edac/edac_mc.c @@ -30,11 +30,16 @@ #include <linux/bitops.h> #include <asm/uaccess.h> #include <asm/page.h> -#include <asm/edac.h> #include "edac_core.h" #include "edac_module.h" #include <ras/ras_event.h> +#ifdef CONFIG_EDAC_ATOMIC_SCRUB +#include <asm/edac.h> +#else +#define edac_atomic_scrub(va, size) do { } while (0) +#endif + /* lock to memory controller's control array */ static DEFINE_MUTEX(mem_ctls_mutex); static LIST_HEAD(mc_devices); @@ -874,7 +879,7 @@ static void edac_mc_scrub_block(unsigned long page, unsigned long offset, virt_addr = kmap_atomic(pg); /* Perform architecture specific atomic scrub operation */ - atomic_scrub(virt_addr + offset, size); + edac_atomic_scrub(virt_addr + offset, size); /* Unmap and complete */ kunmap_atomic(virt_addr); diff --git a/drivers/edac/edac_stub.c b/drivers/edac/edac_stub.c index 9d9e18aefaaa..ff07aae5b7fb 100644 --- a/drivers/edac/edac_stub.c +++ b/drivers/edac/edac_stub.c @@ -16,7 +16,6 @@ #include <linux/edac.h> #include <linux/atomic.h> #include <linux/device.h> -#include <asm/edac.h> int edac_op_state = EDAC_OPSTATE_INVAL; EXPORT_SYMBOL_GPL(edac_op_state); diff --git a/drivers/edac/mce_amd_inj.c b/drivers/edac/mce_amd_inj.c index f7681b553fd5..4c73e4d03d46 100644 --- a/drivers/edac/mce_amd_inj.c +++ b/drivers/edac/mce_amd_inj.c @@ -15,6 +15,8 @@ #include <linux/device.h> #include <linux/module.h> #include <linux/cpu.h> +#include <linux/string.h> +#include <linux/uaccess.h> #include <asm/mce.h> #include "mce_amd.h" @@ -25,6 +27,25 @@ static struct mce i_mce; static struct dentry *dfs_inj; +static u8 n_banks; + +#define MAX_FLAG_OPT_SIZE 3 + +enum injection_type { + SW_INJ = 0, /* SW injection, simply decode the error */ + HW_INJ, /* Trigger a #MC */ + N_INJ_TYPES, +}; + +static const char * const flags_options[] = { + [SW_INJ] = "sw", + [HW_INJ] = "hw", + NULL +}; + +/* Set default injection to SW_INJ */ +static enum injection_type inj_type = SW_INJ; + #define MCE_INJECT_SET(reg) \ static int inj_##reg##_set(void *data, u64 val) \ { \ @@ -79,24 +100,66 @@ static int toggle_hw_mce_inject(unsigned int cpu, bool enable) return err; } -static int flags_get(void *data, u64 *val) +static int __set_inj(const char *buf) { - struct mce *m = (struct mce *)data; + int i; - *val = m->inject_flags; + for (i = 0; i < N_INJ_TYPES; i++) { + if (!strncmp(flags_options[i], buf, strlen(flags_options[i]))) { + inj_type = i; + return 0; + } + } + return -EINVAL; +} - return 0; +static ssize_t flags_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[MAX_FLAG_OPT_SIZE]; + int n; + + n = sprintf(buf, "%s\n", flags_options[inj_type]); + + return simple_read_from_buffer(ubuf, cnt, ppos, buf, n); } -static int flags_set(void *data, u64 val) +static ssize_t flags_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) { - struct mce *m = (struct mce *)data; + char buf[MAX_FLAG_OPT_SIZE], *__buf; + int err; + size_t ret; - m->inject_flags = (u8)val; - return 0; + if (cnt > MAX_FLAG_OPT_SIZE) + cnt = MAX_FLAG_OPT_SIZE; + + ret = cnt; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt - 1] = 0; + + /* strip whitespace */ + __buf = strstrip(buf); + + err = __set_inj(__buf); + if (err) { + pr_err("%s: Invalid flags value: %s\n", __func__, __buf); + return err; + } + + *ppos += ret; + + return ret; } -DEFINE_SIMPLE_ATTRIBUTE(flags_fops, flags_get, flags_set, "%llu\n"); +static const struct file_operations flags_fops = { + .read = flags_read, + .write = flags_write, + .llseek = generic_file_llseek, +}; /* * On which CPU to inject? @@ -128,21 +191,24 @@ static void do_inject(void) unsigned int cpu = i_mce.extcpu; u8 b = i_mce.bank; - if (!(i_mce.inject_flags & MCJ_EXCEPTION)) { + if (i_mce.misc) + i_mce.status |= MCI_STATUS_MISCV; + + if (inj_type == SW_INJ) { amd_decode_mce(NULL, 0, &i_mce); return; } - get_online_cpus(); - if (!cpu_online(cpu)) - goto err; - /* prep MCE global settings for the injection */ mcg_status = MCG_STATUS_MCIP | MCG_STATUS_EIPV; if (!(i_mce.status & MCI_STATUS_PCC)) mcg_status |= MCG_STATUS_RIPV; + get_online_cpus(); + if (!cpu_online(cpu)) + goto err; + toggle_hw_mce_inject(cpu, true); wrmsr_on_cpu(cpu, MSR_IA32_MCG_STATUS, @@ -174,11 +240,9 @@ static int inj_bank_set(void *data, u64 val) { struct mce *m = (struct mce *)data; - if (val > 5) { - if (boot_cpu_data.x86 != 0x15 || val > 6) { - pr_err("Non-existent MCE bank: %llu\n", val); - return -EINVAL; - } + if (val >= n_banks) { + pr_err("Non-existent MCE bank: %llu\n", val); + return -EINVAL; } m->bank = val; @@ -187,32 +251,81 @@ static int inj_bank_set(void *data, u64 val) return 0; } -static int inj_bank_get(void *data, u64 *val) -{ - struct mce *m = (struct mce *)data; +MCE_INJECT_GET(bank); - *val = m->bank; - return 0; +DEFINE_SIMPLE_ATTRIBUTE(bank_fops, inj_bank_get, inj_bank_set, "%llu\n"); + +static const char readme_msg[] = +"Description of the files and their usages:\n" +"\n" +"Note1: i refers to the bank number below.\n" +"Note2: See respective BKDGs for the exact bit definitions of the files below\n" +"as they mirror the hardware registers.\n" +"\n" +"status:\t Set MCi_STATUS: the bits in that MSR control the error type and\n" +"\t attributes of the error which caused the MCE.\n" +"\n" +"misc:\t Set MCi_MISC: provide auxiliary info about the error. It is mostly\n" +"\t used for error thresholding purposes and its validity is indicated by\n" +"\t MCi_STATUS[MiscV].\n" +"\n" +"addr:\t Error address value to be written to MCi_ADDR. Log address information\n" +"\t associated with the error.\n" +"\n" +"cpu:\t The CPU to inject the error on.\n" +"\n" +"bank:\t Specify the bank you want to inject the error into: the number of\n" +"\t banks in a processor varies and is family/model-specific, therefore, the\n" +"\t supplied value is sanity-checked. Setting the bank value also triggers the\n" +"\t injection.\n" +"\n" +"flags:\t Injection type to be performed. Writing to this file will trigger a\n" +"\t real machine check, an APIC interrupt or invoke the error decoder routines\n" +"\t for AMD processors.\n" +"\n" +"\t Allowed error injection types:\n" +"\t - \"sw\": Software error injection. Decode error to a human-readable \n" +"\t format only. Safe to use.\n" +"\t - \"hw\": Hardware error injection. Causes the #MC exception handler to \n" +"\t handle the error. Be warned: might cause system panic if MCi_STATUS[PCC] \n" +"\t is set. Therefore, consider setting (debugfs_mountpoint)/mce/fake_panic \n" +"\t before injecting.\n" +"\n"; + +static ssize_t +inj_readme_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + return simple_read_from_buffer(ubuf, cnt, ppos, + readme_msg, strlen(readme_msg)); } -DEFINE_SIMPLE_ATTRIBUTE(bank_fops, inj_bank_get, inj_bank_set, "%llu\n"); +static const struct file_operations readme_fops = { + .read = inj_readme_read, +}; static struct dfs_node { char *name; struct dentry *d; const struct file_operations *fops; + umode_t perm; } dfs_fls[] = { - { .name = "status", .fops = &status_fops }, - { .name = "misc", .fops = &misc_fops }, - { .name = "addr", .fops = &addr_fops }, - { .name = "bank", .fops = &bank_fops }, - { .name = "flags", .fops = &flags_fops }, - { .name = "cpu", .fops = &extcpu_fops }, + { .name = "status", .fops = &status_fops, .perm = S_IRUSR | S_IWUSR }, + { .name = "misc", .fops = &misc_fops, .perm = S_IRUSR | S_IWUSR }, + { .name = "addr", .fops = &addr_fops, .perm = S_IRUSR | S_IWUSR }, + { .name = "bank", .fops = &bank_fops, .perm = S_IRUSR | S_IWUSR }, + { .name = "flags", .fops = &flags_fops, .perm = S_IRUSR | S_IWUSR }, + { .name = "cpu", .fops = &extcpu_fops, .perm = S_IRUSR | S_IWUSR }, + { .name = "README", .fops = &readme_fops, .perm = S_IRUSR | S_IRGRP | S_IROTH }, }; static int __init init_mce_inject(void) { int i; + u64 cap; + + rdmsrl(MSR_IA32_MCG_CAP, cap); + n_banks = cap & MCG_BANKCNT_MASK; dfs_inj = debugfs_create_dir("mce-inject", NULL); if (!dfs_inj) @@ -220,7 +333,7 @@ static int __init init_mce_inject(void) for (i = 0; i < ARRAY_SIZE(dfs_fls); i++) { dfs_fls[i].d = debugfs_create_file(dfs_fls[i].name, - S_IRUSR | S_IWUSR, + dfs_fls[i].perm, dfs_inj, &i_mce, dfs_fls[i].fops); diff --git a/drivers/edac/mpc85xx_edac.c b/drivers/edac/mpc85xx_edac.c index 68bf234bdfe6..23ef8e9f2c9a 100644 --- a/drivers/edac/mpc85xx_edac.c +++ b/drivers/edac/mpc85xx_edac.c @@ -811,6 +811,8 @@ static void sbe_ecc_decode(u32 cap_high, u32 cap_low, u32 cap_ecc, } } +#define make64(high, low) (((u64)(high) << 32) | (low)) + static void mpc85xx_mc_check(struct mem_ctl_info *mci) { struct mpc85xx_mc_pdata *pdata = mci->pvt_info; @@ -818,7 +820,7 @@ static void mpc85xx_mc_check(struct mem_ctl_info *mci) u32 bus_width; u32 err_detect; u32 syndrome; - u32 err_addr; + u64 err_addr; u32 pfn; int row_index; u32 cap_high; @@ -849,7 +851,9 @@ static void mpc85xx_mc_check(struct mem_ctl_info *mci) else syndrome &= 0xffff; - err_addr = in_be32(pdata->mc_vbase + MPC85XX_MC_CAPTURE_ADDRESS); + err_addr = make64( + in_be32(pdata->mc_vbase + MPC85XX_MC_CAPTURE_EXT_ADDRESS), + in_be32(pdata->mc_vbase + MPC85XX_MC_CAPTURE_ADDRESS)); pfn = err_addr >> PAGE_SHIFT; for (row_index = 0; row_index < mci->nr_csrows; row_index++) { @@ -886,7 +890,7 @@ static void mpc85xx_mc_check(struct mem_ctl_info *mci) mpc85xx_mc_printk(mci, KERN_ERR, "Captured Data / ECC:\t%#8.8x_%08x / %#2.2x\n", cap_high, cap_low, syndrome); - mpc85xx_mc_printk(mci, KERN_ERR, "Err addr: %#8.8x\n", err_addr); + mpc85xx_mc_printk(mci, KERN_ERR, "Err addr: %#8.8llx\n", err_addr); mpc85xx_mc_printk(mci, KERN_ERR, "PFN: %#8.8x\n", pfn); /* we are out of range */ diff --git a/drivers/edac/mpc85xx_edac.h b/drivers/edac/mpc85xx_edac.h index 4498baf9ce05..9352e88d53e5 100644 --- a/drivers/edac/mpc85xx_edac.h +++ b/drivers/edac/mpc85xx_edac.h @@ -43,6 +43,7 @@ #define MPC85XX_MC_ERR_INT_EN 0x0e48 #define MPC85XX_MC_CAPTURE_ATRIBUTES 0x0e4c #define MPC85XX_MC_CAPTURE_ADDRESS 0x0e50 +#define MPC85XX_MC_CAPTURE_EXT_ADDRESS 0x0e54 #define MPC85XX_MC_ERR_SBE 0x0e58 #define DSC_MEM_EN 0x80000000 diff --git a/drivers/edac/xgene_edac.c b/drivers/edac/xgene_edac.c new file mode 100644 index 000000000000..14636e4b6a08 --- /dev/null +++ b/drivers/edac/xgene_edac.c @@ -0,0 +1,1215 @@ +/* + * APM X-Gene SoC EDAC (error detection and correction) + * + * Copyright (c) 2015, Applied Micro Circuits Corporation + * Author: Feng Kan <fkan@apm.com> + * Loc Ho <lho@apm.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#include <linux/ctype.h> +#include <linux/edac.h> +#include <linux/interrupt.h> +#include <linux/mfd/syscon.h> +#include <linux/module.h> +#include <linux/of.h> +#include <linux/of_address.h> +#include <linux/regmap.h> + +#include "edac_core.h" + +#define EDAC_MOD_STR "xgene_edac" + +/* Global error configuration status registers (CSR) */ +#define PCPHPERRINTSTS 0x0000 +#define PCPHPERRINTMSK 0x0004 +#define MCU_CTL_ERR_MASK BIT(12) +#define IOB_PA_ERR_MASK BIT(11) +#define IOB_BA_ERR_MASK BIT(10) +#define IOB_XGIC_ERR_MASK BIT(9) +#define IOB_RB_ERR_MASK BIT(8) +#define L3C_UNCORR_ERR_MASK BIT(5) +#define MCU_UNCORR_ERR_MASK BIT(4) +#define PMD3_MERR_MASK BIT(3) +#define PMD2_MERR_MASK BIT(2) +#define PMD1_MERR_MASK BIT(1) +#define PMD0_MERR_MASK BIT(0) +#define PCPLPERRINTSTS 0x0008 +#define PCPLPERRINTMSK 0x000C +#define CSW_SWITCH_TRACE_ERR_MASK BIT(2) +#define L3C_CORR_ERR_MASK BIT(1) +#define MCU_CORR_ERR_MASK BIT(0) +#define MEMERRINTSTS 0x0010 +#define MEMERRINTMSK 0x0014 + +struct xgene_edac { + struct device *dev; + struct regmap *csw_map; + struct regmap *mcba_map; + struct regmap *mcbb_map; + struct regmap *efuse_map; + void __iomem *pcp_csr; + spinlock_t lock; + struct dentry *dfs; + + struct list_head mcus; + struct list_head pmds; + + struct mutex mc_lock; + int mc_active_mask; + int mc_registered_mask; +}; + +static void xgene_edac_pcp_rd(struct xgene_edac *edac, u32 reg, u32 *val) +{ + *val = readl(edac->pcp_csr + reg); +} + +static void xgene_edac_pcp_clrbits(struct xgene_edac *edac, u32 reg, + u32 bits_mask) +{ + u32 val; + + spin_lock(&edac->lock); + val = readl(edac->pcp_csr + reg); + val &= ~bits_mask; + writel(val, edac->pcp_csr + reg); + spin_unlock(&edac->lock); +} + +static void xgene_edac_pcp_setbits(struct xgene_edac *edac, u32 reg, + u32 bits_mask) +{ + u32 val; + + spin_lock(&edac->lock); + val = readl(edac->pcp_csr + reg); + val |= bits_mask; + writel(val, edac->pcp_csr + reg); + spin_unlock(&edac->lock); +} + +/* Memory controller error CSR */ +#define MCU_MAX_RANK 8 +#define MCU_RANK_STRIDE 0x40 + +#define MCUGECR 0x0110 +#define MCU_GECR_DEMANDUCINTREN_MASK BIT(0) +#define MCU_GECR_BACKUCINTREN_MASK BIT(1) +#define MCU_GECR_CINTREN_MASK BIT(2) +#define MUC_GECR_MCUADDRERREN_MASK BIT(9) +#define MCUGESR 0x0114 +#define MCU_GESR_ADDRNOMATCH_ERR_MASK BIT(7) +#define MCU_GESR_ADDRMULTIMATCH_ERR_MASK BIT(6) +#define MCU_GESR_PHYP_ERR_MASK BIT(3) +#define MCUESRR0 0x0314 +#define MCU_ESRR_MULTUCERR_MASK BIT(3) +#define MCU_ESRR_BACKUCERR_MASK BIT(2) +#define MCU_ESRR_DEMANDUCERR_MASK BIT(1) +#define MCU_ESRR_CERR_MASK BIT(0) +#define MCUESRRA0 0x0318 +#define MCUEBLRR0 0x031c +#define MCU_EBLRR_ERRBANK_RD(src) (((src) & 0x00000007) >> 0) +#define MCUERCRR0 0x0320 +#define MCU_ERCRR_ERRROW_RD(src) (((src) & 0xFFFF0000) >> 16) +#define MCU_ERCRR_ERRCOL_RD(src) ((src) & 0x00000FFF) +#define MCUSBECNT0 0x0324 +#define MCU_SBECNT_COUNT(src) ((src) & 0xFFFF) + +#define CSW_CSWCR 0x0000 +#define CSW_CSWCR_DUALMCB_MASK BIT(0) + +#define MCBADDRMR 0x0000 +#define MCBADDRMR_MCU_INTLV_MODE_MASK BIT(3) +#define MCBADDRMR_DUALMCU_MODE_MASK BIT(2) +#define MCBADDRMR_MCB_INTLV_MODE_MASK BIT(1) +#define MCBADDRMR_ADDRESS_MODE_MASK BIT(0) + +struct xgene_edac_mc_ctx { + struct list_head next; + char *name; + struct mem_ctl_info *mci; + struct xgene_edac *edac; + void __iomem *mcu_csr; + u32 mcu_id; +}; + +static ssize_t xgene_edac_mc_err_inject_write(struct file *file, + const char __user *data, + size_t count, loff_t *ppos) +{ + struct mem_ctl_info *mci = file->private_data; + struct xgene_edac_mc_ctx *ctx = mci->pvt_info; + int i; + + for (i = 0; i < MCU_MAX_RANK; i++) { + writel(MCU_ESRR_MULTUCERR_MASK | MCU_ESRR_BACKUCERR_MASK | + MCU_ESRR_DEMANDUCERR_MASK | MCU_ESRR_CERR_MASK, + ctx->mcu_csr + MCUESRRA0 + i * MCU_RANK_STRIDE); + } + return count; +} + +static const struct file_operations xgene_edac_mc_debug_inject_fops = { + .open = simple_open, + .write = xgene_edac_mc_err_inject_write, + .llseek = generic_file_llseek, +}; + +static void xgene_edac_mc_create_debugfs_node(struct mem_ctl_info *mci) +{ + if (!IS_ENABLED(CONFIG_EDAC_DEBUG)) + return; +#ifdef CONFIG_EDAC_DEBUG + if (!mci->debugfs) + return; + debugfs_create_file("inject_ctrl", S_IWUSR, mci->debugfs, mci, + &xgene_edac_mc_debug_inject_fops); +#endif +} + +static void xgene_edac_mc_check(struct mem_ctl_info *mci) +{ + struct xgene_edac_mc_ctx *ctx = mci->pvt_info; + unsigned int pcp_hp_stat; + unsigned int pcp_lp_stat; + u32 reg; + u32 rank; + u32 bank; + u32 count; + u32 col_row; + + xgene_edac_pcp_rd(ctx->edac, PCPHPERRINTSTS, &pcp_hp_stat); + xgene_edac_pcp_rd(ctx->edac, PCPLPERRINTSTS, &pcp_lp_stat); + if (!((MCU_UNCORR_ERR_MASK & pcp_hp_stat) || + (MCU_CTL_ERR_MASK & pcp_hp_stat) || + (MCU_CORR_ERR_MASK & pcp_lp_stat))) + return; + + for (rank = 0; rank < MCU_MAX_RANK; rank++) { + reg = readl(ctx->mcu_csr + MCUESRR0 + rank * MCU_RANK_STRIDE); + + /* Detect uncorrectable memory error */ + if (reg & (MCU_ESRR_DEMANDUCERR_MASK | + MCU_ESRR_BACKUCERR_MASK)) { + /* Detected uncorrectable memory error */ + edac_mc_chipset_printk(mci, KERN_ERR, "X-Gene", + "MCU uncorrectable error at rank %d\n", rank); + + edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci, + 1, 0, 0, 0, 0, 0, -1, mci->ctl_name, ""); + } + + /* Detect correctable memory error */ + if (reg & MCU_ESRR_CERR_MASK) { + bank = readl(ctx->mcu_csr + MCUEBLRR0 + + rank * MCU_RANK_STRIDE); + col_row = readl(ctx->mcu_csr + MCUERCRR0 + + rank * MCU_RANK_STRIDE); + count = readl(ctx->mcu_csr + MCUSBECNT0 + + rank * MCU_RANK_STRIDE); + edac_mc_chipset_printk(mci, KERN_WARNING, "X-Gene", + "MCU correctable error at rank %d bank %d column %d row %d count %d\n", + rank, MCU_EBLRR_ERRBANK_RD(bank), + MCU_ERCRR_ERRCOL_RD(col_row), + MCU_ERCRR_ERRROW_RD(col_row), + MCU_SBECNT_COUNT(count)); + + edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci, + 1, 0, 0, 0, 0, 0, -1, mci->ctl_name, ""); + } + + /* Clear all error registers */ + writel(0x0, ctx->mcu_csr + MCUEBLRR0 + rank * MCU_RANK_STRIDE); + writel(0x0, ctx->mcu_csr + MCUERCRR0 + rank * MCU_RANK_STRIDE); + writel(0x0, ctx->mcu_csr + MCUSBECNT0 + + rank * MCU_RANK_STRIDE); + writel(reg, ctx->mcu_csr + MCUESRR0 + rank * MCU_RANK_STRIDE); + } + + /* Detect memory controller error */ + reg = readl(ctx->mcu_csr + MCUGESR); + if (reg) { + if (reg & MCU_GESR_ADDRNOMATCH_ERR_MASK) + edac_mc_chipset_printk(mci, KERN_WARNING, "X-Gene", + "MCU address miss-match error\n"); + if (reg & MCU_GESR_ADDRMULTIMATCH_ERR_MASK) + edac_mc_chipset_printk(mci, KERN_WARNING, "X-Gene", + "MCU address multi-match error\n"); + + writel(reg, ctx->mcu_csr + MCUGESR); + } +} + +static void xgene_edac_mc_irq_ctl(struct mem_ctl_info *mci, bool enable) +{ + struct xgene_edac_mc_ctx *ctx = mci->pvt_info; + unsigned int val; + + if (edac_op_state != EDAC_OPSTATE_INT) + return; + + mutex_lock(&ctx->edac->mc_lock); + + /* + * As there is only single bit for enable error and interrupt mask, + * we must only enable top level interrupt after all MCUs are + * registered. Otherwise, if there is an error and the corresponding + * MCU has not registered, the interrupt will never get cleared. To + * determine all MCU have registered, we will keep track of active + * MCUs and registered MCUs. + */ + if (enable) { + /* Set registered MCU bit */ + ctx->edac->mc_registered_mask |= 1 << ctx->mcu_id; + + /* Enable interrupt after all active MCU registered */ + if (ctx->edac->mc_registered_mask == + ctx->edac->mc_active_mask) { + /* Enable memory controller top level interrupt */ + xgene_edac_pcp_clrbits(ctx->edac, PCPHPERRINTMSK, + MCU_UNCORR_ERR_MASK | + MCU_CTL_ERR_MASK); + xgene_edac_pcp_clrbits(ctx->edac, PCPLPERRINTMSK, + MCU_CORR_ERR_MASK); + } + + /* Enable MCU interrupt and error reporting */ + val = readl(ctx->mcu_csr + MCUGECR); + val |= MCU_GECR_DEMANDUCINTREN_MASK | + MCU_GECR_BACKUCINTREN_MASK | + MCU_GECR_CINTREN_MASK | + MUC_GECR_MCUADDRERREN_MASK; + writel(val, ctx->mcu_csr + MCUGECR); + } else { + /* Disable MCU interrupt */ + val = readl(ctx->mcu_csr + MCUGECR); + val &= ~(MCU_GECR_DEMANDUCINTREN_MASK | + MCU_GECR_BACKUCINTREN_MASK | + MCU_GECR_CINTREN_MASK | + MUC_GECR_MCUADDRERREN_MASK); + writel(val, ctx->mcu_csr + MCUGECR); + + /* Disable memory controller top level interrupt */ + xgene_edac_pcp_setbits(ctx->edac, PCPHPERRINTMSK, + MCU_UNCORR_ERR_MASK | MCU_CTL_ERR_MASK); + xgene_edac_pcp_setbits(ctx->edac, PCPLPERRINTMSK, + MCU_CORR_ERR_MASK); + + /* Clear registered MCU bit */ + ctx->edac->mc_registered_mask &= ~(1 << ctx->mcu_id); + } + + mutex_unlock(&ctx->edac->mc_lock); +} + +static int xgene_edac_mc_is_active(struct xgene_edac_mc_ctx *ctx, int mc_idx) +{ + unsigned int reg; + u32 mcu_mask; + + if (regmap_read(ctx->edac->csw_map, CSW_CSWCR, ®)) + return 0; + + if (reg & CSW_CSWCR_DUALMCB_MASK) { + /* + * Dual MCB active - Determine if all 4 active or just MCU0 + * and MCU2 active + */ + if (regmap_read(ctx->edac->mcbb_map, MCBADDRMR, ®)) + return 0; + mcu_mask = (reg & MCBADDRMR_DUALMCU_MODE_MASK) ? 0xF : 0x5; + } else { + /* + * Single MCB active - Determine if MCU0/MCU1 or just MCU0 + * active + */ + if (regmap_read(ctx->edac->mcba_map, MCBADDRMR, ®)) + return 0; + mcu_mask = (reg & MCBADDRMR_DUALMCU_MODE_MASK) ? 0x3 : 0x1; + } + + /* Save active MC mask if hasn't set already */ + if (!ctx->edac->mc_active_mask) + ctx->edac->mc_active_mask = mcu_mask; + + return (mcu_mask & (1 << mc_idx)) ? 1 : 0; +} + +static int xgene_edac_mc_add(struct xgene_edac *edac, struct device_node *np) +{ + struct mem_ctl_info *mci; + struct edac_mc_layer layers[2]; + struct xgene_edac_mc_ctx tmp_ctx; + struct xgene_edac_mc_ctx *ctx; + struct resource res; + int rc; + + memset(&tmp_ctx, 0, sizeof(tmp_ctx)); + tmp_ctx.edac = edac; + + if (!devres_open_group(edac->dev, xgene_edac_mc_add, GFP_KERNEL)) + return -ENOMEM; + + rc = of_address_to_resource(np, 0, &res); + if (rc < 0) { + dev_err(edac->dev, "no MCU resource address\n"); + goto err_group; + } + tmp_ctx.mcu_csr = devm_ioremap_resource(edac->dev, &res); + if (IS_ERR(tmp_ctx.mcu_csr)) { + dev_err(edac->dev, "unable to map MCU resource\n"); + rc = PTR_ERR(tmp_ctx.mcu_csr); + goto err_group; + } + + /* Ignore non-active MCU */ + if (of_property_read_u32(np, "memory-controller", &tmp_ctx.mcu_id)) { + dev_err(edac->dev, "no memory-controller property\n"); + rc = -ENODEV; + goto err_group; + } + if (!xgene_edac_mc_is_active(&tmp_ctx, tmp_ctx.mcu_id)) { + rc = -ENODEV; + goto err_group; + } + + layers[0].type = EDAC_MC_LAYER_CHIP_SELECT; + layers[0].size = 4; + layers[0].is_virt_csrow = true; + layers[1].type = EDAC_MC_LAYER_CHANNEL; + layers[1].size = 2; + layers[1].is_virt_csrow = false; + mci = edac_mc_alloc(tmp_ctx.mcu_id, ARRAY_SIZE(layers), layers, + sizeof(*ctx)); + if (!mci) { + rc = -ENOMEM; + goto err_group; + } + + ctx = mci->pvt_info; + *ctx = tmp_ctx; /* Copy over resource value */ + ctx->name = "xgene_edac_mc_err"; + ctx->mci = mci; + mci->pdev = &mci->dev; + mci->ctl_name = ctx->name; + mci->dev_name = ctx->name; + + mci->mtype_cap = MEM_FLAG_RDDR | MEM_FLAG_RDDR2 | MEM_FLAG_RDDR3 | + MEM_FLAG_DDR | MEM_FLAG_DDR2 | MEM_FLAG_DDR3; + mci->edac_ctl_cap = EDAC_FLAG_SECDED; + mci->edac_cap = EDAC_FLAG_SECDED; + mci->mod_name = EDAC_MOD_STR; + mci->mod_ver = "0.1"; + mci->ctl_page_to_phys = NULL; + mci->scrub_cap = SCRUB_FLAG_HW_SRC; + mci->scrub_mode = SCRUB_HW_SRC; + + if (edac_op_state == EDAC_OPSTATE_POLL) + mci->edac_check = xgene_edac_mc_check; + + if (edac_mc_add_mc(mci)) { + dev_err(edac->dev, "edac_mc_add_mc failed\n"); + rc = -EINVAL; + goto err_free; + } + + xgene_edac_mc_create_debugfs_node(mci); + + list_add(&ctx->next, &edac->mcus); + + xgene_edac_mc_irq_ctl(mci, true); + + devres_remove_group(edac->dev, xgene_edac_mc_add); + + dev_info(edac->dev, "X-Gene EDAC MC registered\n"); + return 0; + +err_free: + edac_mc_free(mci); +err_group: + devres_release_group(edac->dev, xgene_edac_mc_add); + return rc; +} + +static int xgene_edac_mc_remove(struct xgene_edac_mc_ctx *mcu) +{ + xgene_edac_mc_irq_ctl(mcu->mci, false); + edac_mc_del_mc(&mcu->mci->dev); + edac_mc_free(mcu->mci); + return 0; +} + +/* CPU L1/L2 error CSR */ +#define MAX_CPU_PER_PMD 2 +#define CPU_CSR_STRIDE 0x00100000 +#define CPU_L2C_PAGE 0x000D0000 +#define CPU_MEMERR_L2C_PAGE 0x000E0000 +#define CPU_MEMERR_CPU_PAGE 0x000F0000 + +#define MEMERR_CPU_ICFECR_PAGE_OFFSET 0x0000 +#define MEMERR_CPU_ICFESR_PAGE_OFFSET 0x0004 +#define MEMERR_CPU_ICFESR_ERRWAY_RD(src) (((src) & 0xFF000000) >> 24) +#define MEMERR_CPU_ICFESR_ERRINDEX_RD(src) (((src) & 0x003F0000) >> 16) +#define MEMERR_CPU_ICFESR_ERRINFO_RD(src) (((src) & 0x0000FF00) >> 8) +#define MEMERR_CPU_ICFESR_ERRTYPE_RD(src) (((src) & 0x00000070) >> 4) +#define MEMERR_CPU_ICFESR_MULTCERR_MASK BIT(2) +#define MEMERR_CPU_ICFESR_CERR_MASK BIT(0) +#define MEMERR_CPU_LSUESR_PAGE_OFFSET 0x000c +#define MEMERR_CPU_LSUESR_ERRWAY_RD(src) (((src) & 0xFF000000) >> 24) +#define MEMERR_CPU_LSUESR_ERRINDEX_RD(src) (((src) & 0x003F0000) >> 16) +#define MEMERR_CPU_LSUESR_ERRINFO_RD(src) (((src) & 0x0000FF00) >> 8) +#define MEMERR_CPU_LSUESR_ERRTYPE_RD(src) (((src) & 0x00000070) >> 4) +#define MEMERR_CPU_LSUESR_MULTCERR_MASK BIT(2) +#define MEMERR_CPU_LSUESR_CERR_MASK BIT(0) +#define MEMERR_CPU_LSUECR_PAGE_OFFSET 0x0008 +#define MEMERR_CPU_MMUECR_PAGE_OFFSET 0x0010 +#define MEMERR_CPU_MMUESR_PAGE_OFFSET 0x0014 +#define MEMERR_CPU_MMUESR_ERRWAY_RD(src) (((src) & 0xFF000000) >> 24) +#define MEMERR_CPU_MMUESR_ERRINDEX_RD(src) (((src) & 0x007F0000) >> 16) +#define MEMERR_CPU_MMUESR_ERRINFO_RD(src) (((src) & 0x0000FF00) >> 8) +#define MEMERR_CPU_MMUESR_ERRREQSTR_LSU_MASK BIT(7) +#define MEMERR_CPU_MMUESR_ERRTYPE_RD(src) (((src) & 0x00000070) >> 4) +#define MEMERR_CPU_MMUESR_MULTCERR_MASK BIT(2) +#define MEMERR_CPU_MMUESR_CERR_MASK BIT(0) +#define MEMERR_CPU_ICFESRA_PAGE_OFFSET 0x0804 +#define MEMERR_CPU_LSUESRA_PAGE_OFFSET 0x080c +#define MEMERR_CPU_MMUESRA_PAGE_OFFSET 0x0814 + +#define MEMERR_L2C_L2ECR_PAGE_OFFSET 0x0000 +#define MEMERR_L2C_L2ESR_PAGE_OFFSET 0x0004 +#define MEMERR_L2C_L2ESR_ERRSYN_RD(src) (((src) & 0xFF000000) >> 24) +#define MEMERR_L2C_L2ESR_ERRWAY_RD(src) (((src) & 0x00FC0000) >> 18) +#define MEMERR_L2C_L2ESR_ERRCPU_RD(src) (((src) & 0x00020000) >> 17) +#define MEMERR_L2C_L2ESR_ERRGROUP_RD(src) (((src) & 0x0000E000) >> 13) +#define MEMERR_L2C_L2ESR_ERRACTION_RD(src) (((src) & 0x00001C00) >> 10) +#define MEMERR_L2C_L2ESR_ERRTYPE_RD(src) (((src) & 0x00000300) >> 8) +#define MEMERR_L2C_L2ESR_MULTUCERR_MASK BIT(3) +#define MEMERR_L2C_L2ESR_MULTICERR_MASK BIT(2) +#define MEMERR_L2C_L2ESR_UCERR_MASK BIT(1) +#define MEMERR_L2C_L2ESR_ERR_MASK BIT(0) +#define MEMERR_L2C_L2EALR_PAGE_OFFSET 0x0008 +#define CPUX_L2C_L2RTOCR_PAGE_OFFSET 0x0010 +#define MEMERR_L2C_L2EAHR_PAGE_OFFSET 0x000c +#define CPUX_L2C_L2RTOSR_PAGE_OFFSET 0x0014 +#define MEMERR_L2C_L2RTOSR_MULTERR_MASK BIT(1) +#define MEMERR_L2C_L2RTOSR_ERR_MASK BIT(0) +#define CPUX_L2C_L2RTOALR_PAGE_OFFSET 0x0018 +#define CPUX_L2C_L2RTOAHR_PAGE_OFFSET 0x001c +#define MEMERR_L2C_L2ESRA_PAGE_OFFSET 0x0804 + +/* + * Processor Module Domain (PMD) context - Context for a pair of processsors. + * Each PMD consists of 2 CPUs and a shared L2 cache. Each CPU consists of + * its own L1 cache. + */ +struct xgene_edac_pmd_ctx { + struct list_head next; + struct device ddev; + char *name; + struct xgene_edac *edac; + struct edac_device_ctl_info *edac_dev; + void __iomem *pmd_csr; + u32 pmd; + int version; +}; + +static void xgene_edac_pmd_l1_check(struct edac_device_ctl_info *edac_dev, + int cpu_idx) +{ + struct xgene_edac_pmd_ctx *ctx = edac_dev->pvt_info; + void __iomem *pg_f; + u32 val; + + pg_f = ctx->pmd_csr + cpu_idx * CPU_CSR_STRIDE + CPU_MEMERR_CPU_PAGE; + + val = readl(pg_f + MEMERR_CPU_ICFESR_PAGE_OFFSET); + if (val) { + dev_err(edac_dev->dev, + "CPU%d L1 memory error ICF 0x%08X Way 0x%02X Index 0x%02X Info 0x%02X\n", + ctx->pmd * MAX_CPU_PER_PMD + cpu_idx, val, + MEMERR_CPU_ICFESR_ERRWAY_RD(val), + MEMERR_CPU_ICFESR_ERRINDEX_RD(val), + MEMERR_CPU_ICFESR_ERRINFO_RD(val)); + if (val & MEMERR_CPU_ICFESR_CERR_MASK) + dev_err(edac_dev->dev, + "One or more correctable error\n"); + if (val & MEMERR_CPU_ICFESR_MULTCERR_MASK) + dev_err(edac_dev->dev, "Multiple correctable error\n"); + switch (MEMERR_CPU_ICFESR_ERRTYPE_RD(val)) { + case 1: + dev_err(edac_dev->dev, "L1 TLB multiple hit\n"); + break; + case 2: + dev_err(edac_dev->dev, "Way select multiple hit\n"); + break; + case 3: + dev_err(edac_dev->dev, "Physical tag parity error\n"); + break; + case 4: + case 5: + dev_err(edac_dev->dev, "L1 data parity error\n"); + break; + case 6: + dev_err(edac_dev->dev, "L1 pre-decode parity error\n"); + break; + } + + /* Clear any HW errors */ + writel(val, pg_f + MEMERR_CPU_ICFESR_PAGE_OFFSET); + + if (val & (MEMERR_CPU_ICFESR_CERR_MASK | + MEMERR_CPU_ICFESR_MULTCERR_MASK)) + edac_device_handle_ce(edac_dev, 0, 0, + edac_dev->ctl_name); + } + + val = readl(pg_f + MEMERR_CPU_LSUESR_PAGE_OFFSET); + if (val) { + dev_err(edac_dev->dev, + "CPU%d memory error LSU 0x%08X Way 0x%02X Index 0x%02X Info 0x%02X\n", + ctx->pmd * MAX_CPU_PER_PMD + cpu_idx, val, + MEMERR_CPU_LSUESR_ERRWAY_RD(val), + MEMERR_CPU_LSUESR_ERRINDEX_RD(val), + MEMERR_CPU_LSUESR_ERRINFO_RD(val)); + if (val & MEMERR_CPU_LSUESR_CERR_MASK) + dev_err(edac_dev->dev, + "One or more correctable error\n"); + if (val & MEMERR_CPU_LSUESR_MULTCERR_MASK) + dev_err(edac_dev->dev, "Multiple correctable error\n"); + switch (MEMERR_CPU_LSUESR_ERRTYPE_RD(val)) { + case 0: + dev_err(edac_dev->dev, "Load tag error\n"); + break; + case 1: + dev_err(edac_dev->dev, "Load data error\n"); + break; + case 2: + dev_err(edac_dev->dev, "WSL multihit error\n"); + break; + case 3: + dev_err(edac_dev->dev, "Store tag error\n"); + break; + case 4: + dev_err(edac_dev->dev, + "DTB multihit from load pipeline error\n"); + break; + case 5: + dev_err(edac_dev->dev, + "DTB multihit from store pipeline error\n"); + break; + } + + /* Clear any HW errors */ + writel(val, pg_f + MEMERR_CPU_LSUESR_PAGE_OFFSET); + + if (val & (MEMERR_CPU_LSUESR_CERR_MASK | + MEMERR_CPU_LSUESR_MULTCERR_MASK)) + edac_device_handle_ce(edac_dev, 0, 0, + edac_dev->ctl_name); + } + + val = readl(pg_f + MEMERR_CPU_MMUESR_PAGE_OFFSET); + if (val) { + dev_err(edac_dev->dev, + "CPU%d memory error MMU 0x%08X Way 0x%02X Index 0x%02X Info 0x%02X %s\n", + ctx->pmd * MAX_CPU_PER_PMD + cpu_idx, val, + MEMERR_CPU_MMUESR_ERRWAY_RD(val), + MEMERR_CPU_MMUESR_ERRINDEX_RD(val), + MEMERR_CPU_MMUESR_ERRINFO_RD(val), + val & MEMERR_CPU_MMUESR_ERRREQSTR_LSU_MASK ? "LSU" : + "ICF"); + if (val & MEMERR_CPU_MMUESR_CERR_MASK) + dev_err(edac_dev->dev, + "One or more correctable error\n"); + if (val & MEMERR_CPU_MMUESR_MULTCERR_MASK) + dev_err(edac_dev->dev, "Multiple correctable error\n"); + switch (MEMERR_CPU_MMUESR_ERRTYPE_RD(val)) { + case 0: + dev_err(edac_dev->dev, "Stage 1 UTB hit error\n"); + break; + case 1: + dev_err(edac_dev->dev, "Stage 1 UTB miss error\n"); + break; + case 2: + dev_err(edac_dev->dev, "Stage 1 UTB allocate error\n"); + break; + case 3: + dev_err(edac_dev->dev, + "TMO operation single bank error\n"); + break; + case 4: + dev_err(edac_dev->dev, "Stage 2 UTB error\n"); + break; + case 5: + dev_err(edac_dev->dev, "Stage 2 UTB miss error\n"); + break; + case 6: + dev_err(edac_dev->dev, "Stage 2 UTB allocate error\n"); + break; + case 7: + dev_err(edac_dev->dev, + "TMO operation multiple bank error\n"); + break; + } + + /* Clear any HW errors */ + writel(val, pg_f + MEMERR_CPU_MMUESR_PAGE_OFFSET); + + edac_device_handle_ce(edac_dev, 0, 0, edac_dev->ctl_name); + } +} + +static void xgene_edac_pmd_l2_check(struct edac_device_ctl_info *edac_dev) +{ + struct xgene_edac_pmd_ctx *ctx = edac_dev->pvt_info; + void __iomem *pg_d; + void __iomem *pg_e; + u32 val_hi; + u32 val_lo; + u32 val; + + /* Check L2 */ + pg_e = ctx->pmd_csr + CPU_MEMERR_L2C_PAGE; + val = readl(pg_e + MEMERR_L2C_L2ESR_PAGE_OFFSET); + if (val) { + val_lo = readl(pg_e + MEMERR_L2C_L2EALR_PAGE_OFFSET); + val_hi = readl(pg_e + MEMERR_L2C_L2EAHR_PAGE_OFFSET); + dev_err(edac_dev->dev, + "PMD%d memory error L2C L2ESR 0x%08X @ 0x%08X.%08X\n", + ctx->pmd, val, val_hi, val_lo); + dev_err(edac_dev->dev, + "ErrSyndrome 0x%02X ErrWay 0x%02X ErrCpu %d ErrGroup 0x%02X ErrAction 0x%02X\n", + MEMERR_L2C_L2ESR_ERRSYN_RD(val), + MEMERR_L2C_L2ESR_ERRWAY_RD(val), + MEMERR_L2C_L2ESR_ERRCPU_RD(val), + MEMERR_L2C_L2ESR_ERRGROUP_RD(val), + MEMERR_L2C_L2ESR_ERRACTION_RD(val)); + + if (val & MEMERR_L2C_L2ESR_ERR_MASK) + dev_err(edac_dev->dev, + "One or more correctable error\n"); + if (val & MEMERR_L2C_L2ESR_MULTICERR_MASK) + dev_err(edac_dev->dev, "Multiple correctable error\n"); + if (val & MEMERR_L2C_L2ESR_UCERR_MASK) + dev_err(edac_dev->dev, + "One or more uncorrectable error\n"); + if (val & MEMERR_L2C_L2ESR_MULTUCERR_MASK) + dev_err(edac_dev->dev, + "Multiple uncorrectable error\n"); + + switch (MEMERR_L2C_L2ESR_ERRTYPE_RD(val)) { + case 0: + dev_err(edac_dev->dev, "Outbound SDB parity error\n"); + break; + case 1: + dev_err(edac_dev->dev, "Inbound SDB parity error\n"); + break; + case 2: + dev_err(edac_dev->dev, "Tag ECC error\n"); + break; + case 3: + dev_err(edac_dev->dev, "Data ECC error\n"); + break; + } + + /* Clear any HW errors */ + writel(val, pg_e + MEMERR_L2C_L2ESR_PAGE_OFFSET); + + if (val & (MEMERR_L2C_L2ESR_ERR_MASK | + MEMERR_L2C_L2ESR_MULTICERR_MASK)) + edac_device_handle_ce(edac_dev, 0, 0, + edac_dev->ctl_name); + if (val & (MEMERR_L2C_L2ESR_UCERR_MASK | + MEMERR_L2C_L2ESR_MULTUCERR_MASK)) + edac_device_handle_ue(edac_dev, 0, 0, + edac_dev->ctl_name); + } + + /* Check if any memory request timed out on L2 cache */ + pg_d = ctx->pmd_csr + CPU_L2C_PAGE; + val = readl(pg_d + CPUX_L2C_L2RTOSR_PAGE_OFFSET); + if (val) { + val_lo = readl(pg_d + CPUX_L2C_L2RTOALR_PAGE_OFFSET); + val_hi = readl(pg_d + CPUX_L2C_L2RTOAHR_PAGE_OFFSET); + dev_err(edac_dev->dev, + "PMD%d L2C error L2C RTOSR 0x%08X @ 0x%08X.%08X\n", + ctx->pmd, val, val_hi, val_lo); + writel(val, pg_d + CPUX_L2C_L2RTOSR_PAGE_OFFSET); + } +} + +static void xgene_edac_pmd_check(struct edac_device_ctl_info *edac_dev) +{ + struct xgene_edac_pmd_ctx *ctx = edac_dev->pvt_info; + unsigned int pcp_hp_stat; + int i; + + xgene_edac_pcp_rd(ctx->edac, PCPHPERRINTSTS, &pcp_hp_stat); + if (!((PMD0_MERR_MASK << ctx->pmd) & pcp_hp_stat)) + return; + + /* Check CPU L1 error */ + for (i = 0; i < MAX_CPU_PER_PMD; i++) + xgene_edac_pmd_l1_check(edac_dev, i); + + /* Check CPU L2 error */ + xgene_edac_pmd_l2_check(edac_dev); +} + +static void xgene_edac_pmd_cpu_hw_cfg(struct edac_device_ctl_info *edac_dev, + int cpu) +{ + struct xgene_edac_pmd_ctx *ctx = edac_dev->pvt_info; + void __iomem *pg_f = ctx->pmd_csr + cpu * CPU_CSR_STRIDE + + CPU_MEMERR_CPU_PAGE; + + /* + * Enable CPU memory error: + * MEMERR_CPU_ICFESRA, MEMERR_CPU_LSUESRA, and MEMERR_CPU_MMUESRA + */ + writel(0x00000301, pg_f + MEMERR_CPU_ICFECR_PAGE_OFFSET); + writel(0x00000301, pg_f + MEMERR_CPU_LSUECR_PAGE_OFFSET); + writel(0x00000101, pg_f + MEMERR_CPU_MMUECR_PAGE_OFFSET); +} + +static void xgene_edac_pmd_hw_cfg(struct edac_device_ctl_info *edac_dev) +{ + struct xgene_edac_pmd_ctx *ctx = edac_dev->pvt_info; + void __iomem *pg_d = ctx->pmd_csr + CPU_L2C_PAGE; + void __iomem *pg_e = ctx->pmd_csr + CPU_MEMERR_L2C_PAGE; + + /* Enable PMD memory error - MEMERR_L2C_L2ECR and L2C_L2RTOCR */ + writel(0x00000703, pg_e + MEMERR_L2C_L2ECR_PAGE_OFFSET); + /* Configure L2C HW request time out feature if supported */ + if (ctx->version > 1) + writel(0x00000119, pg_d + CPUX_L2C_L2RTOCR_PAGE_OFFSET); +} + +static void xgene_edac_pmd_hw_ctl(struct edac_device_ctl_info *edac_dev, + bool enable) +{ + struct xgene_edac_pmd_ctx *ctx = edac_dev->pvt_info; + int i; + + /* Enable PMD error interrupt */ + if (edac_dev->op_state == OP_RUNNING_INTERRUPT) { + if (enable) + xgene_edac_pcp_clrbits(ctx->edac, PCPHPERRINTMSK, + PMD0_MERR_MASK << ctx->pmd); + else + xgene_edac_pcp_setbits(ctx->edac, PCPHPERRINTMSK, + PMD0_MERR_MASK << ctx->pmd); + } + + if (enable) { + xgene_edac_pmd_hw_cfg(edac_dev); + + /* Two CPUs per a PMD */ + for (i = 0; i < MAX_CPU_PER_PMD; i++) + xgene_edac_pmd_cpu_hw_cfg(edac_dev, i); + } +} + +static ssize_t xgene_edac_pmd_l1_inject_ctrl_write(struct file *file, + const char __user *data, + size_t count, loff_t *ppos) +{ + struct edac_device_ctl_info *edac_dev = file->private_data; + struct xgene_edac_pmd_ctx *ctx = edac_dev->pvt_info; + void __iomem *cpux_pg_f; + int i; + + for (i = 0; i < MAX_CPU_PER_PMD; i++) { + cpux_pg_f = ctx->pmd_csr + i * CPU_CSR_STRIDE + + CPU_MEMERR_CPU_PAGE; + + writel(MEMERR_CPU_ICFESR_MULTCERR_MASK | + MEMERR_CPU_ICFESR_CERR_MASK, + cpux_pg_f + MEMERR_CPU_ICFESRA_PAGE_OFFSET); + writel(MEMERR_CPU_LSUESR_MULTCERR_MASK | + MEMERR_CPU_LSUESR_CERR_MASK, + cpux_pg_f + MEMERR_CPU_LSUESRA_PAGE_OFFSET); + writel(MEMERR_CPU_MMUESR_MULTCERR_MASK | + MEMERR_CPU_MMUESR_CERR_MASK, + cpux_pg_f + MEMERR_CPU_MMUESRA_PAGE_OFFSET); + } + return count; +} + +static ssize_t xgene_edac_pmd_l2_inject_ctrl_write(struct file *file, + const char __user *data, + size_t count, loff_t *ppos) +{ + struct edac_device_ctl_info *edac_dev = file->private_data; + struct xgene_edac_pmd_ctx *ctx = edac_dev->pvt_info; + void __iomem *pg_e = ctx->pmd_csr + CPU_MEMERR_L2C_PAGE; + + writel(MEMERR_L2C_L2ESR_MULTUCERR_MASK | + MEMERR_L2C_L2ESR_MULTICERR_MASK | + MEMERR_L2C_L2ESR_UCERR_MASK | + MEMERR_L2C_L2ESR_ERR_MASK, + pg_e + MEMERR_L2C_L2ESRA_PAGE_OFFSET); + return count; +} + +static const struct file_operations xgene_edac_pmd_debug_inject_fops[] = { + { + .open = simple_open, + .write = xgene_edac_pmd_l1_inject_ctrl_write, + .llseek = generic_file_llseek, }, + { + .open = simple_open, + .write = xgene_edac_pmd_l2_inject_ctrl_write, + .llseek = generic_file_llseek, }, + { } +}; + +static void xgene_edac_pmd_create_debugfs_nodes( + struct edac_device_ctl_info *edac_dev) +{ + struct xgene_edac_pmd_ctx *ctx = edac_dev->pvt_info; + struct dentry *edac_debugfs; + char name[30]; + + if (!IS_ENABLED(CONFIG_EDAC_DEBUG)) + return; + + /* + * Todo: Switch to common EDAC debug file system for edac device + * when available. + */ + if (!ctx->edac->dfs) { + ctx->edac->dfs = debugfs_create_dir(edac_dev->dev->kobj.name, + NULL); + if (!ctx->edac->dfs) + return; + } + sprintf(name, "PMD%d", ctx->pmd); + edac_debugfs = debugfs_create_dir(name, ctx->edac->dfs); + if (!edac_debugfs) + return; + + debugfs_create_file("l1_inject_ctrl", S_IWUSR, edac_debugfs, edac_dev, + &xgene_edac_pmd_debug_inject_fops[0]); + debugfs_create_file("l2_inject_ctrl", S_IWUSR, edac_debugfs, edac_dev, + &xgene_edac_pmd_debug_inject_fops[1]); +} + +static int xgene_edac_pmd_available(u32 efuse, int pmd) +{ + return (efuse & (1 << pmd)) ? 0 : 1; +} + +static int xgene_edac_pmd_add(struct xgene_edac *edac, struct device_node *np, + int version) +{ + struct edac_device_ctl_info *edac_dev; + struct xgene_edac_pmd_ctx *ctx; + struct resource res; + char edac_name[10]; + u32 pmd; + int rc; + u32 val; + + if (!devres_open_group(edac->dev, xgene_edac_pmd_add, GFP_KERNEL)) + return -ENOMEM; + + /* Determine if this PMD is disabled */ + if (of_property_read_u32(np, "pmd-controller", &pmd)) { + dev_err(edac->dev, "no pmd-controller property\n"); + rc = -ENODEV; + goto err_group; + } + rc = regmap_read(edac->efuse_map, 0, &val); + if (rc) + goto err_group; + if (!xgene_edac_pmd_available(val, pmd)) { + rc = -ENODEV; + goto err_group; + } + + sprintf(edac_name, "l2c%d", pmd); + edac_dev = edac_device_alloc_ctl_info(sizeof(*ctx), + edac_name, 1, "l2c", 1, 2, NULL, + 0, edac_device_alloc_index()); + if (!edac_dev) { + rc = -ENOMEM; + goto err_group; + } + + ctx = edac_dev->pvt_info; + ctx->name = "xgene_pmd_err"; + ctx->pmd = pmd; + ctx->edac = edac; + ctx->edac_dev = edac_dev; + ctx->ddev = *edac->dev; + ctx->version = version; + edac_dev->dev = &ctx->ddev; + edac_dev->ctl_name = ctx->name; + edac_dev->dev_name = ctx->name; + edac_dev->mod_name = EDAC_MOD_STR; + + rc = of_address_to_resource(np, 0, &res); + if (rc < 0) { + dev_err(edac->dev, "no PMD resource address\n"); + goto err_free; + } + ctx->pmd_csr = devm_ioremap_resource(edac->dev, &res); + if (IS_ERR(ctx->pmd_csr)) { + dev_err(edac->dev, + "devm_ioremap_resource failed for PMD resource address\n"); + rc = PTR_ERR(ctx->pmd_csr); + goto err_free; + } + + if (edac_op_state == EDAC_OPSTATE_POLL) + edac_dev->edac_check = xgene_edac_pmd_check; + + xgene_edac_pmd_create_debugfs_nodes(edac_dev); + + rc = edac_device_add_device(edac_dev); + if (rc > 0) { + dev_err(edac->dev, "edac_device_add_device failed\n"); + rc = -ENOMEM; + goto err_free; + } + + if (edac_op_state == EDAC_OPSTATE_INT) + edac_dev->op_state = OP_RUNNING_INTERRUPT; + + list_add(&ctx->next, &edac->pmds); + + xgene_edac_pmd_hw_ctl(edac_dev, 1); + + devres_remove_group(edac->dev, xgene_edac_pmd_add); + + dev_info(edac->dev, "X-Gene EDAC PMD%d registered\n", ctx->pmd); + return 0; + +err_free: + edac_device_free_ctl_info(edac_dev); +err_group: + devres_release_group(edac->dev, xgene_edac_pmd_add); + return rc; +} + +static int xgene_edac_pmd_remove(struct xgene_edac_pmd_ctx *pmd) +{ + struct edac_device_ctl_info *edac_dev = pmd->edac_dev; + + xgene_edac_pmd_hw_ctl(edac_dev, 0); + edac_device_del_device(edac_dev->dev); + edac_device_free_ctl_info(edac_dev); + return 0; +} + +static irqreturn_t xgene_edac_isr(int irq, void *dev_id) +{ + struct xgene_edac *ctx = dev_id; + struct xgene_edac_pmd_ctx *pmd; + unsigned int pcp_hp_stat; + unsigned int pcp_lp_stat; + + xgene_edac_pcp_rd(ctx, PCPHPERRINTSTS, &pcp_hp_stat); + xgene_edac_pcp_rd(ctx, PCPLPERRINTSTS, &pcp_lp_stat); + if ((MCU_UNCORR_ERR_MASK & pcp_hp_stat) || + (MCU_CTL_ERR_MASK & pcp_hp_stat) || + (MCU_CORR_ERR_MASK & pcp_lp_stat)) { + struct xgene_edac_mc_ctx *mcu; + + list_for_each_entry(mcu, &ctx->mcus, next) { + xgene_edac_mc_check(mcu->mci); + } + } + + list_for_each_entry(pmd, &ctx->pmds, next) { + if ((PMD0_MERR_MASK << pmd->pmd) & pcp_hp_stat) + xgene_edac_pmd_check(pmd->edac_dev); + } + + return IRQ_HANDLED; +} + +static int xgene_edac_probe(struct platform_device *pdev) +{ + struct xgene_edac *edac; + struct device_node *child; + struct resource *res; + int rc; + + edac = devm_kzalloc(&pdev->dev, sizeof(*edac), GFP_KERNEL); + if (!edac) + return -ENOMEM; + + edac->dev = &pdev->dev; + platform_set_drvdata(pdev, edac); + INIT_LIST_HEAD(&edac->mcus); + INIT_LIST_HEAD(&edac->pmds); + spin_lock_init(&edac->lock); + mutex_init(&edac->mc_lock); + + edac->csw_map = syscon_regmap_lookup_by_phandle(pdev->dev.of_node, + "regmap-csw"); + if (IS_ERR(edac->csw_map)) { + dev_err(edac->dev, "unable to get syscon regmap csw\n"); + rc = PTR_ERR(edac->csw_map); + goto out_err; + } + + edac->mcba_map = syscon_regmap_lookup_by_phandle(pdev->dev.of_node, + "regmap-mcba"); + if (IS_ERR(edac->mcba_map)) { + dev_err(edac->dev, "unable to get syscon regmap mcba\n"); + rc = PTR_ERR(edac->mcba_map); + goto out_err; + } + + edac->mcbb_map = syscon_regmap_lookup_by_phandle(pdev->dev.of_node, + "regmap-mcbb"); + if (IS_ERR(edac->mcbb_map)) { + dev_err(edac->dev, "unable to get syscon regmap mcbb\n"); + rc = PTR_ERR(edac->mcbb_map); + goto out_err; + } + edac->efuse_map = syscon_regmap_lookup_by_phandle(pdev->dev.of_node, + "regmap-efuse"); + if (IS_ERR(edac->efuse_map)) { + dev_err(edac->dev, "unable to get syscon regmap efuse\n"); + rc = PTR_ERR(edac->efuse_map); + goto out_err; + } + + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + edac->pcp_csr = devm_ioremap_resource(&pdev->dev, res); + if (IS_ERR(edac->pcp_csr)) { + dev_err(&pdev->dev, "no PCP resource address\n"); + rc = PTR_ERR(edac->pcp_csr); + goto out_err; + } + + if (edac_op_state == EDAC_OPSTATE_INT) { + int irq; + int i; + + for (i = 0; i < 3; i++) { + irq = platform_get_irq(pdev, i); + if (irq < 0) { + dev_err(&pdev->dev, "No IRQ resource\n"); + rc = -EINVAL; + goto out_err; + } + rc = devm_request_irq(&pdev->dev, irq, + xgene_edac_isr, IRQF_SHARED, + dev_name(&pdev->dev), edac); + if (rc) { + dev_err(&pdev->dev, + "Could not request IRQ %d\n", irq); + goto out_err; + } + } + } + + for_each_child_of_node(pdev->dev.of_node, child) { + if (!of_device_is_available(child)) + continue; + if (of_device_is_compatible(child, "apm,xgene-edac-mc")) + xgene_edac_mc_add(edac, child); + if (of_device_is_compatible(child, "apm,xgene-edac-pmd")) + xgene_edac_pmd_add(edac, child, 1); + if (of_device_is_compatible(child, "apm,xgene-edac-pmd-v2")) + xgene_edac_pmd_add(edac, child, 2); + } + + return 0; + +out_err: + return rc; +} + +static int xgene_edac_remove(struct platform_device *pdev) +{ + struct xgene_edac *edac = dev_get_drvdata(&pdev->dev); + struct xgene_edac_mc_ctx *mcu; + struct xgene_edac_mc_ctx *temp_mcu; + struct xgene_edac_pmd_ctx *pmd; + struct xgene_edac_pmd_ctx *temp_pmd; + + list_for_each_entry_safe(mcu, temp_mcu, &edac->mcus, next) { + xgene_edac_mc_remove(mcu); + } + + list_for_each_entry_safe(pmd, temp_pmd, &edac->pmds, next) { + xgene_edac_pmd_remove(pmd); + } + return 0; +} + +static const struct of_device_id xgene_edac_of_match[] = { + { .compatible = "apm,xgene-edac" }, + {}, +}; +MODULE_DEVICE_TABLE(of, xgene_edac_of_match); + +static struct platform_driver xgene_edac_driver = { + .probe = xgene_edac_probe, + .remove = xgene_edac_remove, + .driver = { + .name = "xgene-edac", + .owner = THIS_MODULE, + .of_match_table = xgene_edac_of_match, + }, +}; + +static int __init xgene_edac_init(void) +{ + int rc; + + /* Make sure error reporting method is sane */ + switch (edac_op_state) { + case EDAC_OPSTATE_POLL: + case EDAC_OPSTATE_INT: + break; + default: + edac_op_state = EDAC_OPSTATE_INT; + break; + } + + rc = platform_driver_register(&xgene_edac_driver); + if (rc) { + edac_printk(KERN_ERR, EDAC_MOD_STR, + "EDAC fails to register\n"); + goto reg_failed; + } + + return 0; + +reg_failed: + return rc; +} +module_init(xgene_edac_init); + +static void __exit xgene_edac_exit(void) +{ + platform_driver_unregister(&xgene_edac_driver); +} +module_exit(xgene_edac_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Feng Kan <fkan@apm.com>"); +MODULE_DESCRIPTION("APM X-Gene EDAC driver"); +module_param(edac_op_state, int, 0444); +MODULE_PARM_DESC(edac_op_state, + "EDAC error reporting state: 0=Poll, 2=Interrupt"); |