diff options
-rw-r--r-- | Documentation/ABI/testing/sysfs-edac-ecs | 74 | ||||
-rw-r--r-- | Documentation/ABI/testing/sysfs-edac-memory-repair | 206 | ||||
-rw-r--r-- | Documentation/ABI/testing/sysfs-edac-scrub | 69 | ||||
-rw-r--r-- | Documentation/edac/features.rst | 103 | ||||
-rw-r--r-- | Documentation/edac/index.rst | 12 | ||||
-rw-r--r-- | Documentation/edac/memory_repair.rst | 121 | ||||
-rw-r--r-- | Documentation/edac/scrub.rst | 266 | ||||
-rw-r--r-- | MAINTAINERS | 1 | ||||
-rw-r--r-- | drivers/edac/Kconfig | 28 | ||||
-rw-r--r-- | drivers/edac/Makefile | 3 | ||||
-rw-r--r-- | drivers/edac/amd64_edac.c | 52 | ||||
-rw-r--r-- | drivers/edac/debugfs.c | 5 | ||||
-rwxr-xr-x | drivers/edac/ecs.c | 205 | ||||
-rw-r--r-- | drivers/edac/edac_device.c | 185 | ||||
-rw-r--r-- | drivers/edac/i5400_edac.c | 3 | ||||
-rw-r--r-- | drivers/edac/i7300_edac.c | 7 | ||||
-rw-r--r-- | drivers/edac/igen6_edac.c | 20 | ||||
-rwxr-xr-x | drivers/edac/mem_repair.c | 359 | ||||
-rw-r--r-- | drivers/edac/pnd2_edac.c | 4 | ||||
-rwxr-xr-x | drivers/edac/scrub.c | 209 | ||||
-rw-r--r-- | drivers/edac/xgene_edac.c | 17 | ||||
-rw-r--r-- | include/linux/edac.h | 215 |
22 files changed, 2111 insertions, 53 deletions
diff --git a/Documentation/ABI/testing/sysfs-edac-ecs b/Documentation/ABI/testing/sysfs-edac-ecs new file mode 100644 index 000000000000..87c885c4eb1a --- /dev/null +++ b/Documentation/ABI/testing/sysfs-edac-ecs @@ -0,0 +1,74 @@ +What: /sys/bus/edac/devices/<dev-name>/ecs_fruX +Date: March 2025 +KernelVersion: 6.15 +Contact: linux-edac@vger.kernel.org +Description: + The sysfs EDAC bus devices /<dev-name>/ecs_fruX subdirectory + pertains to the memory media ECS (Error Check Scrub) control + feature, where <dev-name> directory corresponds to a device + registered with the EDAC device driver for the ECS feature. + /ecs_fruX belongs to the media FRUs (Field Replaceable Unit) + under the memory device. + + The sysfs ECS attr nodes are only present if the parent + driver has implemented the corresponding attr callback + function and provided the necessary operations to the EDAC + device driver during registration. + +What: /sys/bus/edac/devices/<dev-name>/ecs_fruX/log_entry_type +Date: March 2025 +KernelVersion: 6.15 +Contact: linux-edac@vger.kernel.org +Description: + (RW) The log entry type of how the DDR5 ECS log is reported. + + - 0 - per DRAM. + + - 1 - per memory media FRU. + + - All other values are reserved. + +What: /sys/bus/edac/devices/<dev-name>/ecs_fruX/mode +Date: March 2025 +KernelVersion: 6.15 +Contact: linux-edac@vger.kernel.org +Description: + (RW) The mode of how the DDR5 ECS counts the errors. + Error count is tracked based on two different modes + selected by DDR5 ECS Control Feature - Codeword mode and + Row Count mode. If the ECS is under Codeword mode, then + the error count increments each time a codeword with check + bit errors is detected. If the ECS is under Row Count mode, + then the error counter increments each time a row with + check bit errors is detected. + + - 0 - ECS counts rows in the memory media that have ECC errors. + + - 1 - ECS counts codewords with errors, specifically, it counts + the number of ECC-detected errors in the memory media. + + - All other values are reserved. + +What: /sys/bus/edac/devices/<dev-name>/ecs_fruX/reset +Date: March 2025 +KernelVersion: 6.15 +Contact: linux-edac@vger.kernel.org +Description: + (WO) ECS reset ECC counter. + + - 1 - reset ECC counter to the default value. + + - All other values are reserved. + +What: /sys/bus/edac/devices/<dev-name>/ecs_fruX/threshold +Date: March 2025 +KernelVersion: 6.15 +Contact: linux-edac@vger.kernel.org +Description: + (RW) DDR5 ECS threshold count per gigabits of memory cells. + The ECS error count is subject to the ECS Threshold count + per Gbit, which masks error counts less than the Threshold. + + Supported values are 256, 1024 and 4096. + + All other values are reserved. diff --git a/Documentation/ABI/testing/sysfs-edac-memory-repair b/Documentation/ABI/testing/sysfs-edac-memory-repair new file mode 100644 index 000000000000..0434a3b23ff3 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-edac-memory-repair @@ -0,0 +1,206 @@ +What: /sys/bus/edac/devices/<dev-name>/mem_repairX +Date: March 2025 +KernelVersion: 6.15 +Contact: linux-edac@vger.kernel.org +Description: + The sysfs EDAC bus devices /<dev-name>/mem_repairX subdirectory + pertains to the memory media repair features control, such as + PPR (Post Package Repair), memory sparing etc, where <dev-name> + directory corresponds to a device registered with the EDAC + device driver for the memory repair features. + + Post Package Repair is a maintenance operation requests the memory + device to perform a repair operation on its media. It is a memory + self-healing feature that fixes a failing memory location by + replacing it with a spare row in a DRAM device. For example, a + CXL memory device with DRAM components that support PPR features may + implement PPR maintenance operations. DRAM components may support + two types of PPR functions: hard PPR, for a permanent row repair, and + soft PPR, for a temporary row repair. Soft PPR may be much faster + than hard PPR, but the repair is lost with a power cycle. + + The sysfs attributes nodes for a repair feature are only + present if the parent driver has implemented the corresponding + attr callback function and provided the necessary operations + to the EDAC device driver during registration. + + In some states of system configuration (e.g. before address + decoders have been configured), memory devices (e.g. CXL) + may not have an active mapping in the main host address + physical address map. As such, the memory to repair must be + identified by a device specific physical addressing scheme + using a device physical address(DPA). The DPA and other control + attributes to use will be presented in related error records. + +What: /sys/bus/edac/devices/<dev-name>/mem_repairX/repair_type +Date: March 2025 +KernelVersion: 6.15 +Contact: linux-edac@vger.kernel.org +Description: + (RO) Memory repair type. For eg. post package repair, + memory sparing etc. Valid values are: + + - ppr - Post package repair. + + - cacheline-sparing + + - row-sparing + + - bank-sparing + + - rank-sparing + + - All other values are reserved. + +What: /sys/bus/edac/devices/<dev-name>/mem_repairX/persist_mode +Date: March 2025 +KernelVersion: 6.15 +Contact: linux-edac@vger.kernel.org +Description: + (RW) Get/Set the current persist repair mode set for a + repair function. Persist repair modes supported in the + device, based on a memory repair function, either is temporary, + which is lost with a power cycle or permanent. Valid values are: + + - 0 - Soft memory repair (temporary repair). + + - 1 - Hard memory repair (permanent repair). + + - All other values are reserved. + +What: /sys/bus/edac/devices/<dev-name>/mem_repairX/repair_safe_when_in_use +Date: March 2025 +KernelVersion: 6.15 +Contact: linux-edac@vger.kernel.org +Description: + (RO) True if memory media is accessible and data is retained + during the memory repair operation. + The data may not be retained and memory requests may not be + correctly processed during a repair operation. In such case + repair operation can not be executed at runtime. The memory + must be taken offline. + +What: /sys/bus/edac/devices/<dev-name>/mem_repairX/hpa +Date: March 2025 +KernelVersion: 6.15 +Contact: linux-edac@vger.kernel.org +Description: + (RW) Host Physical Address (HPA) of the memory to repair. + The HPA to use will be provided in related error records. + +What: /sys/bus/edac/devices/<dev-name>/mem_repairX/dpa +Date: March 2025 +KernelVersion: 6.15 +Contact: linux-edac@vger.kernel.org +Description: + (RW) Device Physical Address (DPA) of the memory to repair. + The specific DPA to use will be provided in related error + records. + + In some states of system configuration (e.g. before address + decoders have been configured), memory devices (e.g. CXL) + may not have an active mapping in the main host address + physical address map. As such, the memory to repair must be + identified by a device specific physical addressing scheme + using a DPA. The device physical address(DPA) to use will be + presented in related error records. + +What: /sys/bus/edac/devices/<dev-name>/mem_repairX/nibble_mask +Date: March 2025 +KernelVersion: 6.15 +Contact: linux-edac@vger.kernel.org +Description: + (RW) Read/Write Nibble mask of the memory to repair. + Nibble mask identifies one or more nibbles in error on the + memory bus that produced the error event. Nibble Mask bit 0 + shall be set if nibble 0 on the memory bus produced the + event, etc. For example, CXL PPR and sparing, a nibble mask + bit set to 1 indicates the request to perform repair + operation in the specific device. All nibble mask bits set + to 1 indicates the request to perform the operation in all + devices. Eg. for CXL memory repair, the specific value of + nibble mask to use will be provided in related error records. + For more details, See nibble mask field in CXL spec ver 3.1, + section 8.2.9.7.1.2 Table 8-103 soft PPR and section + 8.2.9.7.1.3 Table 8-104 hard PPR, section 8.2.9.7.1.4 + Table 8-105 memory sparing. + +What: /sys/bus/edac/devices/<dev-name>/mem_repairX/min_hpa +What: /sys/bus/edac/devices/<dev-name>/mem_repairX/max_hpa +What: /sys/bus/edac/devices/<dev-name>/mem_repairX/min_dpa +What: /sys/bus/edac/devices/<dev-name>/mem_repairX/max_dpa +Date: March 2025 +KernelVersion: 6.15 +Contact: linux-edac@vger.kernel.org +Description: + (RW) The supported range of memory address that is to be + repaired. The memory device may give the supported range of + attributes to use and it will depend on the memory device + and the portion of memory to repair. + The userspace may receive the specific value of attributes + to use for a repair operation from the memory device via + related error records and trace events, for eg. CXL DRAM + and CXL general media error records in CXL memory devices. + +What: /sys/bus/edac/devices/<dev-name>/mem_repairX/bank_group +What: /sys/bus/edac/devices/<dev-name>/mem_repairX/bank +What: /sys/bus/edac/devices/<dev-name>/mem_repairX/rank +What: /sys/bus/edac/devices/<dev-name>/mem_repairX/row +What: /sys/bus/edac/devices/<dev-name>/mem_repairX/column +What: /sys/bus/edac/devices/<dev-name>/mem_repairX/channel +What: /sys/bus/edac/devices/<dev-name>/mem_repairX/sub_channel +Date: March 2025 +KernelVersion: 6.15 +Contact: linux-edac@vger.kernel.org +Description: + (RW) The control attributes for the memory to be repaired. + The specific value of attributes to use depends on the + portion of memory to repair and will be reported to the host + in related error records and be available to userspace + in trace events, such as CXL DRAM and CXL general media + error records of CXL memory devices. + + When readng back these attributes, it returns the current + value of memory requested to be repaired. + + bank_group - The bank group of the memory to repair. + + bank - The bank number of the memory to repair. + + rank - The rank of the memory to repair. Rank is defined as a + set of memory devices on a channel that together execute a + transaction. + + row - The row number of the memory to repair. + + column - The column number of the memory to repair. + + channel - The channel of the memory to repair. Channel is + defined as an interface that can be independently accessed + for a transaction. + + sub_channel - The subchannel of the memory to repair. + + The requirement to set these attributes varies based on the + repair function. The attributes in sysfs are not present + unless required for a repair function. + + For example, CXL spec ver 3.1, Section 8.2.9.7.1.2 Table 8-103 + soft PPR and Section 8.2.9.7.1.3 Table 8-104 hard PPR operations, + these attributes are not required to set. CXL spec ver 3.1, + Section 8.2.9.7.1.4 Table 8-105 memory sparing, these attributes + are required to set based on memory sparing granularity. + +What: /sys/bus/edac/devices/<dev-name>/mem_repairX/repair +Date: March 2025 +KernelVersion: 6.15 +Contact: linux-edac@vger.kernel.org +Description: + (WO) Issue the memory repair operation for the specified + memory repair attributes. The operation may fail if resources + are insufficient based on the requirements of the memory + device and repair function. + + - 1 - Issue the repair operation. + + - All other values are reserved. diff --git a/Documentation/ABI/testing/sysfs-edac-scrub b/Documentation/ABI/testing/sysfs-edac-scrub new file mode 100644 index 000000000000..c43be90deab4 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-edac-scrub @@ -0,0 +1,69 @@ +What: /sys/bus/edac/devices/<dev-name>/scrubX +Date: March 2025 +KernelVersion: 6.15 +Contact: linux-edac@vger.kernel.org +Description: + The sysfs EDAC bus devices /<dev-name>/scrubX subdirectory + belongs to an instance of memory scrub control feature, + where <dev-name> directory corresponds to a device/memory + region registered with the EDAC device driver for the + scrub control feature. + + The sysfs scrub attr nodes are only present if the parent + driver has implemented the corresponding attr callback + function and provided the necessary operations to the EDAC + device driver during registration. + +What: /sys/bus/edac/devices/<dev-name>/scrubX/addr +Date: March 2025 +KernelVersion: 6.15 +Contact: linux-edac@vger.kernel.org +Description: + (RW) The base address of the memory region to be scrubbed + for on-demand scrubbing. Setting address starts scrubbing. + The size must be set before that. + + The readback addr value is non-zero if the requested + on-demand scrubbing is in progress, zero otherwise. + +What: /sys/bus/edac/devices/<dev-name>/scrubX/size +Date: March 2025 +KernelVersion: 6.15 +Contact: linux-edac@vger.kernel.org +Description: + (RW) The size of the memory region to be scrubbed + (on-demand scrubbing). + +What: /sys/bus/edac/devices/<dev-name>/scrubX/enable_background +Date: March 2025 +KernelVersion: 6.15 +Contact: linux-edac@vger.kernel.org +Description: + (RW) Start/Stop background (patrol) scrubbing if supported. + +What: /sys/bus/edac/devices/<dev-name>/scrubX/min_cycle_duration +Date: March 2025 +KernelVersion: 6.15 +Contact: linux-edac@vger.kernel.org +Description: + (RO) Supported minimum scrub cycle duration in seconds + by the memory scrubber. + +What: /sys/bus/edac/devices/<dev-name>/scrubX/max_cycle_duration +Date: March 2025 +KernelVersion: 6.15 +Contact: linux-edac@vger.kernel.org +Description: + (RO) Supported maximum scrub cycle duration in seconds + by the memory scrubber. + +What: /sys/bus/edac/devices/<dev-name>/scrubX/current_cycle_duration +Date: March 2025 +KernelVersion: 6.15 +Contact: linux-edac@vger.kernel.org +Description: + (RW) The current scrub cycle duration in seconds and must be + within the supported range by the memory scrubber. + + Scrub has an overhead when running and that may want to be + reduced by taking longer to do it. diff --git a/Documentation/edac/features.rst b/Documentation/edac/features.rst new file mode 100644 index 000000000000..3f283de297c7 --- /dev/null +++ b/Documentation/edac/features.rst @@ -0,0 +1,103 @@ +.. SPDX-License-Identifier: GPL-2.0 OR GFDL-1.2-no-invariants-or-later + +================= +EDAC/RAS features +================= + +Copyright (c) 2024-2025 HiSilicon Limited. + +:Author: Shiju Jose <shiju.jose@huawei.com> +:License: The GNU Free Documentation License, Version 1.2 without + Invariant Sections, Front-Cover Texts nor Back-Cover Texts. + (dual licensed under the GPL v2) + +- Written for: 6.15 + +Introduction +------------ + +EDAC/RAS components plugging and high-level design: + +1. Scrub control + +2. Error Check Scrub (ECS) control + +3. ACPI RAS2 features + +4. Post Package Repair (PPR) control + +5. Memory Sparing Repair control + +High level design is illustrated in the following diagram:: + + +-----------------------------------------------+ + | Userspace - Rasdaemon | + | +-------------+ | + | | RAS CXL mem | +---------------+ | + | |error handler|---->| | | + | +-------------+ | RAS dynamic | | + | +-------------+ | scrub, memory | | + | | RAS memory |---->| repair control| | + | |error handler| +----|----------+ | + | +-------------+ | | + +--------------------------|--------------------+ + | + | + +-------------------------------|------------------------------+ + | Kernel EDAC extension for | controlling RAS Features | + |+------------------------------|----------------------------+ | + || EDAC Core Sysfs EDAC| Bus | | + || +--------------------------|---------------------------+| | + || |/sys/bus/edac/devices/<dev>/scrubX/ | | EDAC device || | + || |/sys/bus/edac/devices/<dev>/ecsX/ |<->| EDAC MC || | + || |/sys/bus/edac/devices/<dev>/repairX | | EDAC sysfs || | + || +---------------------------|--------------------------+| | + || EDAC|Bus | | + || | | | + || +----------+ Get feature | Get feature | | + || | | desc +---------|------+ desc +----------+ | | + || |EDAC scrub|<-----| EDAC device | | | | | + || +----------+ | driver- RAS |----->| EDAC mem | | | + || +----------+ | feature control| | repair | | | + || | |<-----| | +----------+ | | + || |EDAC ECS | +---------|------+ | | + || +----------+ Register RAS|features | | + || ______________________|_____________ | | + |+---------|---------------|------------------|--------------+ | + | +-------|----+ +-------|-------+ +----|----------+ | + | | | | CXL mem driver| | Client driver | | + | | ACPI RAS2 | | scrub, ECS, | | memory repair | | + | | driver | | sparing, PPR | | features | | + | +-----|------+ +-------|-------+ +------|--------+ | + | | | | | + +--------|-----------------|--------------------|--------------+ + | | | + +--------|-----------------|--------------------|--------------+ + | +---|-----------------|--------------------|-------+ | + | | | | + | | Platform HW and Firmware | | + | +--------------------------------------------------+ | + +--------------------------------------------------------------+ + + +1. EDAC Features components - Create feature-specific descriptors. For + example: scrub, ECS, memory repair in the above diagram. + +2. EDAC device driver for controlling RAS Features - Get feature's attribute + descriptors from EDAC RAS feature component and registers device's RAS + features with EDAC bus and expose the features control attributes via + sysfs. For example, /sys/bus/edac/devices/<dev-name>/<feature>X/ + +3. RAS dynamic feature controller - Userspace sample modules in rasdaemon for + dynamic scrub/repair control to issue scrubbing/repair when excess number + of corrected memory errors are reported in a short span of time. + +RAS features +------------ +1. Memory Scrub + +Memory scrub features are documented in `Documentation/edac/scrub.rst`. + +2. Memory Repair + +Memory repair features are documented in `Documentation/edac/memory_repair.rst`. diff --git a/Documentation/edac/index.rst b/Documentation/edac/index.rst new file mode 100644 index 000000000000..420c6601dbae --- /dev/null +++ b/Documentation/edac/index.rst @@ -0,0 +1,12 @@ +.. SPDX-License-Identifier: GPL-2.0 OR GFDL-1.2-no-invariants-or-later + +============== +EDAC Subsystem +============== + +.. toctree:: + :maxdepth: 1 + + features + memory_repair + scrub diff --git a/Documentation/edac/memory_repair.rst b/Documentation/edac/memory_repair.rst new file mode 100644 index 000000000000..52162a422864 --- /dev/null +++ b/Documentation/edac/memory_repair.rst @@ -0,0 +1,121 @@ +.. SPDX-License-Identifier: GPL-2.0 OR GFDL-1.2-no-invariants-or-later + +========================== +EDAC Memory Repair Control +========================== + +Copyright (c) 2024-2025 HiSilicon Limited. + +:Author: Shiju Jose <shiju.jose@huawei.com> +:License: The GNU Free Documentation License, Version 1.2 without + Invariant Sections, Front-Cover Texts nor Back-Cover Texts. + (dual licensed under the GPL v2) +:Original Reviewers: + +- Written for: 6.15 + +Introduction +------------ + +Some memory devices support repair operations to address issues in their +memory media. Post Package Repair (PPR) and memory sparing are examples of +such features. + +Post Package Repair (PPR) +~~~~~~~~~~~~~~~~~~~~~~~~~ + +Post Package Repair is a maintenance operation which requests the memory +device to perform repair operation on its media. It is a memory self-healing +feature that fixes a failing memory location by replacing it with a spare row +in a DRAM device. + +For example, a CXL memory device with DRAM components that support PPR +features implements maintenance operations. DRAM components support those +types of PPR functions: + + - hard PPR, for a permanent row repair, and + - soft PPR, for a temporary row repair. + +Soft PPR is much faster than hard PPR, but the repair is lost after a power +cycle. + +The data may not be retained and memory requests may not be correctly +processed during a repair operation. In such case, the repair operation should +not be executed at runtime. + +For example, for CXL memory devices, see CXL spec rev 3.1 [1]_ sections +8.2.9.7.1.1 PPR Maintenance Operations, 8.2.9.7.1.2 sPPR Maintenance Operation +and 8.2.9.7.1.3 hPPR Maintenance Operation for more details. + +Memory Sparing +~~~~~~~~~~~~~~ + +Memory sparing is a repair function that replaces a portion of memory with +a portion of functional memory at a particular granularity. Memory +sparing has cacheline/row/bank/rank sparing granularities. For example, in +rank memory-sparing mode, one memory rank serves as a spare for other ranks on +the same channel in case they fail. + +The spare rank is held in reserve and not used as active memory until +a failure is indicated, with reserved capacity subtracted from the total +available memory in the system. + +After an error threshold is surpassed in a system protected by memory sparing, +the content of a failing rank of DIMMs is copied to the spare rank. The +failing rank is then taken offline and the spare rank placed online for use as +active memory in place of the failed rank. + +For example, CXL memory devices can support various subclasses for sparing +operation vary in terms of the scope of the sparing being performed. + +Cacheline sparing subclass refers to a sparing action that can replace a full +cacheline. Row sparing is provided as an alternative to PPR sparing functions +and its scope is that of a single DDR row. Bank sparing allows an entire bank +to be replaced. Rank sparing is defined as an operation in which an entire DDR +rank is replaced. + +See CXL spec 3.1 [1]_ section 8.2.9.7.1.4 Memory Sparing Maintenance +Operations for more details. + +.. [1] https://computeexpresslink.org/cxl-specification/ + +Use cases of generic memory repair features control +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +1. The soft PPR, hard PPR and memory-sparing features share similar control + attributes. Therefore, there is a need for a standardized, generic sysfs + repair control that is exposed to userspace and used by administrators, + scripts and tools. + +2. When a CXL device detects an error in a memory component, it informs the + host of the need for a repair maintenance operation by using an event + record where the "maintenance needed" flag is set. The event record + specifies the device physical address (DPA) and attributes of the memory + that requires repair. The kernel reports the corresponding CXL general + media or DRAM trace event to userspace, and userspace tools (e.g. + rasdaemon) initiate a repair maintenance operation in response to the + device request using the sysfs repair control. + +3. Userspace tools, such as rasdaemon, request a repair operation on a memory + region when maintenance need flag set or an uncorrected memory error or + excess of corrected memory errors above a threshold value is reported or an + exceed corrected errors threshold flag set for that memory. + +4. Multiple PPR/sparing instances may be present per memory device. + +5. Drivers should enforce that live repair is safe. In systems where memory + mapping functions can change between boots, one approach to this is to log + memory errors seen on this boot against which to check live memory repair + requests. + +The File System +--------------- + +The control attributes of a registered memory repair instance could be +accessed in the /sys/bus/edac/devices/<dev-name>/mem_repairX/ + +sysfs +----- + +Sysfs files are documented in +`Documentation/ABI/testing/sysfs-edac-memory-repair`. diff --git a/Documentation/edac/scrub.rst b/Documentation/edac/scrub.rst new file mode 100644 index 000000000000..daab929cdba1 --- /dev/null +++ b/Documentation/edac/scrub.rst @@ -0,0 +1,266 @@ +.. SPDX-License-Identifier: GPL-2.0 OR GFDL-1.2-no-invariants-or-later + +============= +Scrub Control +============= + +Copyright (c) 2024-2025 HiSilicon Limited. + +:Author: Shiju Jose <shiju.jose@huawei.com> +:License: The GNU Free Documentation License, Version 1.2 without + Invariant Sections, Front-Cover Texts nor Back-Cover Texts. + (dual licensed under the GPL v2) + +- Written for: 6.15 + +Introduction +------------ + +Increasing DRAM size and cost have made memory subsystem reliability an +important concern. These modules are used where potentially corrupted data +could cause expensive or fatal issues. Memory errors are among the top +hardware failures that cause server and workload crashes. + +Memory scrubbing is a feature where an ECC (Error-Correcting Code) engine +reads data from each memory media location, corrects if necessary and writes +the corrected data back to the same memory media location. + +DIMMs can be scrubbed at a configurable rate to detect uncorrected memory +errors and attempt recovery from detected errors, providing the following +benefits: + +1. Proactively scrubbing DIMMs reduces the chance of a correctable error + becoming uncorrectable. + +2. When detected, uncorrected errors caught in unallocated memory pages are + isolated and prevented from being allocated to an application or the OS. + +3. This reduces the likelihood of software or hardware products encountering + memory errors. + +4. The additional data on failures in memory may be used to build up + statistics that are later used to decide whether to use memory repair + technologies such as Post Package Repair or Sparing. + +There are 2 types of memory scrubbing: + +1. Background (patrol) scrubbing while the DRAM is otherwise idle. + +2. On-demand scrubbing for a specific address range or region of memory. + +Several types of interfaces to hardware memory scrubbers have been +identified, such as CXL memory device patrol scrub, CXL DDR5 ECS, ACPI +RAS2 memory scrubbing, and ACPI NVDIMM ARS (Address Range Scrub). + +The control mechanisms vary across different memory scrubbers. To enable +standardized userspace tooling, there is a need to present these controls +through a standardized ABI. + +A generic memory EDAC scrub control allows users to manage underlying +scrubbers in the system through a standardized sysfs control interface. It +abstracts the management of various scrubbing functionalities into a unified +set of functions. + +Use cases of common scrub control feature +----------------------------------------- + +1. Several types of interfaces for hardware memory scrubbers have been + identified, including the CXL memory device patrol scrub, CXL DDR5 ECS, + ACPI RAS2 memory scrubbing features, ACPI NVDIMM ARS (Address Range Scrub), + and software-based memory scrubbers. + + Of the identified interfaces to hardware memory scrubbers some support + control over patrol (background) scrubbing (e.g., ACPI RAS2, CXL) and/or + on-demand scrubbing (e.g., ACPI RAS2, ACPI ARS). However, the scrub control + interfaces vary between memory scrubbers, highlighting the need for + a standardized, generic sysfs scrub control interface that is accessible to + userspace for administration and use by scripts/tools. + +2. User-space scrub controls allow users to disable scrubbing if necessary, + for example, to disable background patrol scrubbing or adjust the scrub + rate for performance-aware operations where background activities need to + be minimized or disabled. + +3. User-space tools enable on-demand scrubbing for specific address ranges, + provided that the scrubber supports this functionality. + +4. User-space tools can also control memory DIMM scrubbing at a configurable + scrub rate via sysfs scrub controls. This approach offers several benefits: + + 4.1. Detects uncorrectable memory errors early, before user access to affected + memory, helping facilitate recovery. + + 4.2. Reduces the likelihood of correctable errors developing into uncorrectable + errors. + +5. Policy control for hotplugged memory is necessary because there may not + be a system-wide BIOS or similar control to manage scrub settings for a CXL + device added after boot. Determining these settings is a policy decision, + balancing reliability against performance, so userspace should control it. + Therefore, a unified interface is recommended for handling this function in + a way that aligns with other similar interfaces, rather than creating a + separate one. + +Scrubbing features +------------------ + +CXL Memory Scrubbing features +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +CXL spec r3.1 [1]_ section 8.2.9.9.11.1 describes the memory device patrol +scrub control feature. The device patrol scrub proactively locates and makes +corrections to errors in regular cycle. The patrol scrub control allows the +userspace request to change CXL patrol scrubber's configurations. + +The patrol scrub control allows the requester to specify the number of +hours in which the patrol scrub cycles must be completed, provided that +the requested scrub rate must be within the supported range of the +scrub rate that the device is capable of. In the CXL driver, the +number of seconds per scrub cycles, which user requests via sysfs, is +rescaled to hours per scrub cycles. + +In addition, they allow the host to disable the feature in case it interferes +with performance-aware operations which require the background operations to +be turned off. + +Error Check Scrub (ECS) +~~~~~~~~~~~~~~~~~~~~~~~ + +CXL spec r3.1 [1]_ section 8.2.9.9.11.2 describes Error Check Scrub (ECS) +- a feature defined in the JEDEC DDR5 SDRAM Specification (JESD79-5) and +allowing DRAM to internally read, correct single-bit errors, and write back +corrected data bits to the DRAM array while providing transparency to error +counts. + +The DDR5 device contains number of memory media Field Replaceable Units (FRU) +per device. The DDR5 ECS feature and thus the ECS control driver supports +configuring the ECS parameters per FRU. + +ACPI RAS2 Hardware-based Memory Scrubbing +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +ACPI spec 6.5 [2]_ section 5.2.21 ACPI RAS2 describes an ACPI RAS2 table +which provides interfaces for platform RAS features and supports independent +RAS controls and capabilities for a given RAS feature for multiple instances +of the same component in a given system. + +Memory RAS features apply to RAS capabilities, controls and operations that +are specific to memory. RAS2 PCC sub-spaces for memory-specific RAS features +have a Feature Type of 0x00 (Memory). + +The platform can use the hardware-based memory scrubbing feature to expose +controls and capabilities associated with hardware-based memory scrub +engines. The RAS2 memory scrubbing feature supports as per spec, + +1. Independent memory scrubbing controls for each NUMA domain, identified + using its proximity domain. + +2. Provision for background (patrol) scrubbing of the entire memory system, + as well as on-demand scrubbing for a specific region of memory. + +ACPI Address Range Scrubbing (ARS) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +ACPI spec 6.5 [2]_ section 9.19.7.2 describes Address Range Scrubbing (ARS). +ARS allows the platform to communicate memory errors to system software. +This capability allows system software to prevent accesses to addresses with +uncorrectable errors in memory. ARS functions manage all NVDIMMs present in +the system. Only one scrub can be in progress system wide at any given time. + +The following functions are supported as per the specification: + +1. Query ARS Capabilities for a given address range, indicates platform + supports the ACPI NVDIMM Root Device Unconsumed Error Notification. + +2. Start ARS triggers an Address Range Scrub for the given memory range. + Address scrubbing can be done for volatile or persistent memory, or both. + +3. Query ARS Status command allows software to get the status of ARS, + including the progress of ARS and ARS error record. + +4. Clear Uncorrectable Error. + +5. Translate SPA + +6. ARS Error Inject etc. + +The kernel supports an existing control for ARS and ARS is currently not +supported in EDAC. + +.. [1] https://computeexpresslink.org/cxl-specification/ + +.. [2] https://uefi.org/specs/ACPI/6.5/ + +Comparison of various scrubbing features +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + +--------------+-----------+-----------+-----------+-----------+ + | | ACPI | CXL patrol| CXL ECS | ARS | + | Name | RAS2 | scrub | | | + +--------------+-----------+-----------+-----------+-----------+ + | | | | | | + | On-demand | Supported | No | No | Supported | + | Scrubbing | | | | | + | | | | | | + +--------------+-----------+-----------+-----------+-----------+ + | | | | | | + | Background | Supported | Supported | Supported | No | + | scrubbing | | | | | + | | | | | | + +--------------+-----------+-----------+-----------+-----------+ + | | | | | | + | Mode of | Scrub ctrl| per device| per memory| Unknown | + | scrubbing | per NUMA | | media | | + | | domain. | | | | + +--------------+-----------+-----------+-----------+-----------+ + | | | | | | + | Query scrub | Supported | Supported | Supported | Supported | + | capabilities | | | | | + | | | | | | + +--------------+-----------+-----------+-----------+-----------+ + | | | | | | + | Setting | Supported | No | No | Supported | + | address range| | | | | + | | | | | | + +--------------+-----------+-----------+-----------+-----------+ + | | | | | | + | Setting | Supported | Supported | No | No | + | scrub rate | | | | | + | | | | | | + +--------------+-----------+-----------+-----------+-----------+ + | | | | | | + | Unit for | Not | in hours | No | No | + | scrub rate | Defined | | | | + | | | | | | + +--------------+-----------+-----------+-----------+-----------+ + | | Supported | | | | + | Scrub | on-demand | No | No | Supported | + | status/ | scrubbing | | | | + | Completion | only | | | | + +--------------+-----------+-----------+-----------+-----------+ + | UC error | |CXL general|CXL general| ACPI UCE | + | reporting | Exception |media/DRAM |media/DRAM | notify and| + | | |event/media|event/media| query | + | | |scan? |scan? | ARS status| + +--------------+-----------+-----------+-----------+-----------+ + | | | | | | + | Support for | Supported | Supported | Supported | No | + | EDAC control | | | | | + | | | | | | + +--------------+-----------+-----------+-----------+-----------+ + +The File System +--------------- + +The control attributes of a registered scrubber instance could be +accessed in: + +/sys/bus/edac/devices/<dev-name>/scrubX/ + +sysfs +----- + +Sysfs files are documented in +`Documentation/ABI/testing/sysfs-edac-scrub` + +`Documentation/ABI/testing/sysfs-edac-ecs` diff --git a/MAINTAINERS b/MAINTAINERS index efee40ea589f..570411b580ad 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8214,6 +8214,7 @@ F: drivers/edac/aspeed_edac.c EDAC-BLUEFIELD M: Shravan Kumar Ramani <shravankr@nvidia.com> +M: David Thompson <davthompson@nvidia.com> S: Supported F: drivers/edac/bluefield_edac.c diff --git a/drivers/edac/Kconfig b/drivers/edac/Kconfig index b908a118fb32..19ad3c3b675d 100644 --- a/drivers/edac/Kconfig +++ b/drivers/edac/Kconfig @@ -75,6 +75,34 @@ config EDAC_GHES In doubt, say 'Y'. +config EDAC_SCRUB + bool "EDAC scrub feature" + help + The EDAC scrub feature is optional and is designed to control the + memory scrubbers in the system. The common sysfs scrub interface + abstracts the control of various arbitrary scrubbing functionalities + into a unified set of functions. + Say 'y/n' to enable/disable EDAC scrub feature. + +config EDAC_ECS + bool "EDAC ECS (Error Check Scrub) feature" + help + The EDAC ECS feature is optional and is designed to control on-die + error check scrub (e.g., DDR5 ECS) in the system. The common sysfs + ECS interface abstracts the control of various ECS functionalities + into a unified set of functions. + Say 'y/n' to enable/disable EDAC ECS feature. + +config EDAC_MEM_REPAIR + bool "EDAC memory repair feature" + help + The EDAC memory repair feature is optional and is designed to control + the memory devices with repair features, such as Post Package Repair + (PPR), memory sparing etc. The common sysfs memory repair interface + abstracts the control of various memory repair functionalities into + a unified set of functions. + Say 'y/n' to enable/disable EDAC memory repair feature. + config EDAC_AMD64 tristate "AMD64 (Opteron, Athlon64)" depends on AMD_NB && EDAC_DECODE_MCE diff --git a/drivers/edac/Makefile b/drivers/edac/Makefile index 89789ba8275f..a8f2d8f6c894 100644 --- a/drivers/edac/Makefile +++ b/drivers/edac/Makefile @@ -12,6 +12,9 @@ edac_core-y := edac_mc.o edac_device.o edac_mc_sysfs.o edac_core-y += edac_module.o edac_device_sysfs.o wq.o edac_core-$(CONFIG_EDAC_DEBUG) += debugfs.o +edac_core-$(CONFIG_EDAC_SCRUB) += scrub.o +edac_core-$(CONFIG_EDAC_ECS) += ecs.o +edac_core-$(CONFIG_EDAC_MEM_REPAIR) += mem_repair.o ifdef CONFIG_PCI edac_core-y += edac_pci.o edac_pci_sysfs.o diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index 8414ceb43e4a..90f0eb7cc5b9 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only #include <linux/ras.h> +#include <linux/string_choices.h> #include "amd64_edac.h" #include <asm/amd_nb.h> #include <asm/amd_node.h> @@ -1171,22 +1172,21 @@ static void debug_dump_dramcfg_low(struct amd64_pvt *pvt, u32 dclr, int chan) edac_dbg(1, " LRDIMM %dx rank multiply\n", (dcsm & 0x3)); } - edac_dbg(1, "All DIMMs support ECC:%s\n", - (dclr & BIT(19)) ? "yes" : "no"); + edac_dbg(1, "All DIMMs support ECC: %s\n", str_yes_no(dclr & BIT(19))); edac_dbg(1, " PAR/ERR parity: %s\n", - (dclr & BIT(8)) ? "enabled" : "disabled"); + str_enabled_disabled(dclr & BIT(8))); if (pvt->fam == 0x10) edac_dbg(1, " DCT 128bit mode width: %s\n", (dclr & BIT(11)) ? "128b" : "64b"); edac_dbg(1, " x4 logical DIMMs present: L0: %s L1: %s L2: %s L3: %s\n", - (dclr & BIT(12)) ? "yes" : "no", - (dclr & BIT(13)) ? "yes" : "no", - (dclr & BIT(14)) ? "yes" : "no", - (dclr & BIT(15)) ? "yes" : "no"); + str_yes_no(dclr & BIT(12)), + str_yes_no(dclr & BIT(13)), + str_yes_no(dclr & BIT(14)), + str_yes_no(dclr & BIT(15))); } #define CS_EVEN_PRIMARY BIT(0) @@ -1353,14 +1353,14 @@ static void umc_dump_misc_regs(struct amd64_pvt *pvt) edac_dbg(1, "UMC%d UMC cap high: 0x%x\n", i, umc->umc_cap_hi); edac_dbg(1, "UMC%d ECC capable: %s, ChipKill ECC capable: %s\n", - i, (umc->umc_cap_hi & BIT(30)) ? "yes" : "no", - (umc->umc_cap_hi & BIT(31)) ? "yes" : "no"); + i, str_yes_no(umc->umc_cap_hi & BIT(30)), + str_yes_no(umc->umc_cap_hi & BIT(31))); edac_dbg(1, "UMC%d All DIMMs support ECC: %s\n", - i, (umc->umc_cfg & BIT(12)) ? "yes" : "no"); + i, str_yes_no(umc->umc_cfg & BIT(12))); edac_dbg(1, "UMC%d x4 DIMMs present: %s\n", - i, (umc->dimm_cfg & BIT(6)) ? "yes" : "no"); + i, str_yes_no(umc->dimm_cfg & BIT(6))); edac_dbg(1, "UMC%d x16 DIMMs present: %s\n", - i, (umc->dimm_cfg & BIT(7)) ? "yes" : "no"); + i, str_yes_no(umc->dimm_cfg & BIT(7))); umc_debug_display_dimm_sizes(pvt, i); } @@ -1371,11 +1371,11 @@ static void dct_dump_misc_regs(struct amd64_pvt *pvt) edac_dbg(1, "F3xE8 (NB Cap): 0x%08x\n", pvt->nbcap); edac_dbg(1, " NB two channel DRAM capable: %s\n", - (pvt->nbcap & NBCAP_DCT_DUAL) ? "yes" : "no"); + str_yes_no(pvt->nbcap & NBCAP_DCT_DUAL)); edac_dbg(1, " ECC capable: %s, ChipKill ECC capable: %s\n", - (pvt->nbcap & NBCAP_SECDED) ? "yes" : "no", - (pvt->nbcap & NBCAP_CHIPKILL) ? "yes" : "no"); + str_yes_no(pvt->nbcap & NBCAP_SECDED), + str_yes_no(pvt->nbcap & NBCAP_CHIPKILL)); debug_dump_dramcfg_low(pvt, pvt->dclr0, 0); @@ -1398,7 +1398,7 @@ static void dct_dump_misc_regs(struct amd64_pvt *pvt) if (!dct_ganging_enabled(pvt)) debug_dump_dramcfg_low(pvt, pvt->dclr1, 1); - edac_dbg(1, " DramHoleValid: %s\n", dhar_valid(pvt) ? "yes" : "no"); + edac_dbg(1, " DramHoleValid: %s\n", str_yes_no(dhar_valid(pvt))); amd64_info("using x%u syndromes.\n", pvt->ecc_sym_sz); } @@ -2027,15 +2027,15 @@ static void read_dram_ctl_register(struct amd64_pvt *pvt) if (!dct_ganging_enabled(pvt)) edac_dbg(0, " Address range split per DCT: %s\n", - (dct_high_range_enabled(pvt) ? "yes" : "no")); + str_yes_no(dct_high_range_enabled(pvt))); edac_dbg(0, " data interleave for ECC: %s, DRAM cleared since last warm reset: %s\n", - (dct_data_intlv_enabled(pvt) ? "enabled" : "disabled"), - (dct_memory_cleared(pvt) ? "yes" : "no")); + str_enabled_disabled(dct_data_intlv_enabled(pvt)), + str_yes_no(dct_memory_cleared(pvt))); edac_dbg(0, " channel interleave: %s, " "interleave bits selector: 0x%x\n", - (dct_interleave_enabled(pvt) ? "enabled" : "disabled"), + str_enabled_disabled(dct_interleave_enabled(pvt)), dct_sel_interleave_addr(pvt)); } @@ -3208,8 +3208,7 @@ static bool nb_mce_bank_enabled_on_node(u16 nid) nbe = reg->l & MSR_MCGCTL_NBE; edac_dbg(0, "core: %u, MCG_CTL: 0x%llx, NB MSR is %s\n", - cpu, reg->q, - (nbe ? "enabled" : "disabled")); + cpu, reg->q, str_enabled_disabled(nbe)); if (!nbe) goto out; @@ -3353,12 +3352,9 @@ static bool dct_ecc_enabled(struct amd64_pvt *pvt) edac_dbg(0, "NB MCE bank disabled, set MSR 0x%08x[4] on node %d to enable.\n", MSR_IA32_MCG_CTL, nid); - edac_dbg(3, "Node %d: DRAM ECC %s.\n", nid, (ecc_en ? "enabled" : "disabled")); + edac_dbg(3, "Node %d: DRAM ECC %s.\n", nid, str_enabled_disabled(ecc_en)); - if (!ecc_en || !nb_mce_en) - return false; - else - return true; + return ecc_en && nb_mce_en; } static bool umc_ecc_enabled(struct amd64_pvt *pvt) @@ -3378,7 +3374,7 @@ static bool umc_ecc_enabled(struct amd64_pvt *pvt) } } - edac_dbg(3, "Node %d: DRAM ECC %s.\n", pvt->mc_node_id, (ecc_en ? "enabled" : "disabled")); + edac_dbg(3, "Node %d: DRAM ECC %s.\n", pvt->mc_node_id, str_enabled_disabled(ecc_en)); return ecc_en; } diff --git a/drivers/edac/debugfs.c b/drivers/edac/debugfs.c index 4804332d9946..8195fc9c9354 100644 --- a/drivers/edac/debugfs.c +++ b/drivers/edac/debugfs.c @@ -1,4 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-only + +#include <linux/string_choices.h> + #include "edac_module.h" static struct dentry *edac_debugfs; @@ -22,7 +25,7 @@ static ssize_t edac_fake_inject_write(struct file *file, "Generating %d %s fake error%s to %d.%d.%d to test core handling. NOTE: this won't test the driver-specific decoding logic.\n", errcount, (type == HW_EVENT_ERR_UNCORRECTED) ? "UE" : "CE", - errcount > 1 ? "s" : "", + str_plural(errcount), mci->fake_inject_layer[0], mci->fake_inject_layer[1], mci->fake_inject_layer[2] diff --git a/drivers/edac/ecs.c b/drivers/edac/ecs.c new file mode 100755 index 000000000000..1d51838a60c1 --- /dev/null +++ b/drivers/edac/ecs.c @@ -0,0 +1,205 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * The generic ECS driver is designed to support control of on-die error + * check scrub (e.g., DDR5 ECS). The common sysfs ECS interface abstracts + * the control of various ECS functionalities into a unified set of functions. + * + * Copyright (c) 2024-2025 HiSilicon Limited. + */ + +#include <linux/edac.h> + +#define EDAC_ECS_FRU_NAME "ecs_fru" + +enum edac_ecs_attributes { + ECS_LOG_ENTRY_TYPE, + ECS_MODE, + ECS_RESET, + ECS_THRESHOLD, + ECS_MAX_ATTRS +}; + +struct edac_ecs_dev_attr { + struct device_attribute dev_attr; + int fru_id; +}; + +struct edac_ecs_fru_context { + char name[EDAC_FEAT_NAME_LEN]; + struct edac_ecs_dev_attr dev_attr[ECS_MAX_ATTRS]; + struct attribute *ecs_attrs[ECS_MAX_ATTRS + 1]; + struct attribute_group group; +}; + +struct edac_ecs_context { + u16 num_media_frus; + struct edac_ecs_fru_context *fru_ctxs; +}; + +#define TO_ECS_DEV_ATTR(_dev_attr) \ + container_of(_dev_attr, struct edac_ecs_dev_attr, dev_attr) + +#define EDAC_ECS_ATTR_SHOW(attrib, cb, type, format) \ +static ssize_t attrib##_show(struct device *ras_feat_dev, \ + struct device_attribute *attr, char *buf) \ +{ \ + struct edac_ecs_dev_attr *dev_attr = TO_ECS_DEV_ATTR(attr); \ + struct edac_dev_feat_ctx *ctx = dev_get_drvdata(ras_feat_dev); \ + const struct edac_ecs_ops *ops = ctx->ecs.ecs_ops; \ + type data; \ + int ret; \ + \ + ret = ops->cb(ras_feat_dev->parent, ctx->ecs.private, \ + dev_attr->fru_id, &data); \ + if (ret) \ + return ret; \ + \ + return sysfs_emit(buf, format, data); \ +} + +EDAC_ECS_ATTR_SHOW(log_entry_type, get_log_entry_type, u32, "%u\n") +EDAC_ECS_ATTR_SHOW(mode, get_mode, u32, "%u\n") +EDAC_ECS_ATTR_SHOW(threshold, get_threshold, u32, "%u\n") + +#define EDAC_ECS_ATTR_STORE(attrib, cb, type, conv_func) \ +static ssize_t attrib##_store(struct device *ras_feat_dev, \ + struct device_attribute *attr, \ + const char *buf, size_t len) \ +{ \ + struct edac_ecs_dev_attr *dev_attr = TO_ECS_DEV_ATTR(attr); \ + struct edac_dev_feat_ctx *ctx = dev_get_drvdata(ras_feat_dev); \ + const struct edac_ecs_ops *ops = ctx->ecs.ecs_ops; \ + type data; \ + int ret; \ + \ + ret = conv_func(buf, 0, &data); \ + if (ret < 0) \ + return ret; \ + \ + ret = ops->cb(ras_feat_dev->parent, ctx->ecs.private, \ + dev_attr->fru_id, data); \ + if (ret) \ + return ret; \ + \ + return len; \ +} + +EDAC_ECS_ATTR_STORE(log_entry_type, set_log_entry_type, unsigned long, kstrtoul) +EDAC_ECS_ATTR_STORE(mode, set_mode, unsigned long, kstrtoul) +EDAC_ECS_ATTR_STORE(reset, reset, unsigned long, kstrtoul) +EDAC_ECS_ATTR_STORE(threshold, set_threshold, unsigned long, kstrtoul) + +static umode_t ecs_attr_visible(struct kobject *kobj, struct attribute *a, int attr_id) +{ + struct device *ras_feat_dev = kobj_to_dev(kobj); + struct edac_dev_feat_ctx *ctx = dev_get_drvdata(ras_feat_dev); + const struct edac_ecs_ops *ops = ctx->ecs.ecs_ops; + + switch (attr_id) { + case ECS_LOG_ENTRY_TYPE: + if (ops->get_log_entry_type) { + if (ops->set_log_entry_type) + return a->mode; + else + return 0444; + } + break; + case ECS_MODE: + if (ops->get_mode) { + if (ops->set_mode) + return a->mode; + else + return 0444; + } + break; + case ECS_RESET: + if (ops->reset) + return a->mode; + break; + case ECS_THRESHOLD: + if (ops->get_threshold) { + if (ops->set_threshold) + return a->mode; + else + return 0444; + } + break; + default: + break; + } + + return 0; +} + +#define EDAC_ECS_ATTR_RO(_name, _fru_id) \ + ((struct edac_ecs_dev_attr) { .dev_attr = __ATTR_RO(_name), \ + .fru_id = _fru_id }) + +#define EDAC_ECS_ATTR_WO(_name, _fru_id) \ + ((struct edac_ecs_dev_attr) { .dev_attr = __ATTR_WO(_name), \ + .fru_id = _fru_id }) + +#define EDAC_ECS_ATTR_RW(_name, _fru_id) \ + ((struct edac_ecs_dev_attr) { .dev_attr = __ATTR_RW(_name), \ + .fru_id = _fru_id }) + +static int ecs_create_desc(struct device *ecs_dev, const struct attribute_group **attr_groups, + u16 num_media_frus) +{ + struct edac_ecs_context *ecs_ctx; + u32 fru; + + ecs_ctx = devm_kzalloc(ecs_dev, sizeof(*ecs_ctx), GFP_KERNEL); + if (!ecs_ctx) + return -ENOMEM; + + ecs_ctx->num_media_frus = num_media_frus; + ecs_ctx->fru_ctxs = devm_kcalloc(ecs_dev, num_media_frus, + sizeof(*ecs_ctx->fru_ctxs), + GFP_KERNEL); + if (!ecs_ctx->fru_ctxs) + return -ENOMEM; + + for (fru = 0; fru < num_media_frus; fru++) { + struct edac_ecs_fru_context *fru_ctx = &ecs_ctx->fru_ctxs[fru]; + struct attribute_group *group = &fru_ctx->group; + int i; + + fru_ctx->dev_attr[ECS_LOG_ENTRY_TYPE] = EDAC_ECS_ATTR_RW(log_entry_type, fru); + fru_ctx->dev_attr[ECS_MODE] = EDAC_ECS_ATTR_RW(mode, fru); + fru_ctx->dev_attr[ECS_RESET] = EDAC_ECS_ATTR_WO(reset, fru); + fru_ctx->dev_attr[ECS_THRESHOLD] = EDAC_ECS_ATTR_RW(threshold, fru); + + for (i = 0; i < ECS_MAX_ATTRS; i++) + fru_ctx->ecs_attrs[i] = &fru_ctx->dev_attr[i].dev_attr.attr; + + sprintf(fru_ctx->name, "%s%d", EDAC_ECS_FRU_NAME, fru); + group->name = fru_ctx->name; + group->attrs = fru_ctx->ecs_attrs; + group->is_visible = ecs_attr_visible; + + attr_groups[fru] = group; + } + + return 0; +} + +/** + * edac_ecs_get_desc - get EDAC ECS descriptors + * @ecs_dev: client device, supports ECS feature + * @attr_groups: pointer to attribute group container + * @num_media_frus: number of media FRUs in the device + * + * Return: + * * %0 - Success. + * * %-EINVAL - Invalid parameters passed. + * * %-ENOMEM - Dynamic memory allocation failed. + */ +int edac_ecs_get_desc(struct device *ecs_dev, + const struct attribute_group **attr_groups, u16 num_media_frus) +{ + if (!ecs_dev || !attr_groups || !num_media_frus) + return -EINVAL; + + return ecs_create_desc(ecs_dev, attr_groups, num_media_frus); +} diff --git a/drivers/edac/edac_device.c b/drivers/edac/edac_device.c index 621dc2a5d034..0734909b08a4 100644 --- a/drivers/edac/edac_device.c +++ b/drivers/edac/edac_device.c @@ -570,3 +570,188 @@ void edac_device_handle_ue_count(struct edac_device_ctl_info *edac_dev, block ? block->name : "N/A", count, msg); } EXPORT_SYMBOL_GPL(edac_device_handle_ue_count); + +static void edac_dev_release(struct device *dev) +{ + struct edac_dev_feat_ctx *ctx = container_of(dev, struct edac_dev_feat_ctx, dev); + + kfree(ctx->mem_repair); + kfree(ctx->scrub); + kfree(ctx->dev.groups); + kfree(ctx); +} + +static const struct device_type edac_dev_type = { + .name = "edac_dev", + .release = edac_dev_release, +}; + +static void edac_dev_unreg(void *data) +{ + device_unregister(data); +} + +/** + * edac_dev_register - register device for RAS features with EDAC + * @parent: parent device. + * @name: name for the folder in the /sys/bus/edac/devices/, + * which is derived from the parent device. + * For e.g. /sys/bus/edac/devices/cxl_mem0/ + * @private: parent driver's data to store in the context if any. + * @num_features: number of RAS features to register. + * @ras_features: list of RAS features to register. + * + * Return: + * * %0 - Success. + * * %-EINVAL - Invalid parameters passed. + * * %-ENOMEM - Dynamic memory allocation failed. + * + */ +int edac_dev_register(struct device *parent, char *name, + void *private, int num_features, + const struct edac_dev_feature *ras_features) +{ + const struct attribute_group **ras_attr_groups; + struct edac_dev_data *dev_data; + struct edac_dev_feat_ctx *ctx; + int mem_repair_cnt = 0; + int attr_gcnt = 0; + int ret = -ENOMEM; + int scrub_cnt = 0; + int feat; + + if (!parent || !name || !num_features || !ras_features) + return -EINVAL; + + /* Double parse to make space for attributes */ + for (feat = 0; feat < num_features; feat++) { + switch (ras_features[feat].ft_type) { + case RAS_FEAT_SCRUB: + attr_gcnt++; + scrub_cnt++; + break; + case RAS_FEAT_ECS: + attr_gcnt += ras_features[feat].ecs_info.num_media_frus; + break; + case RAS_FEAT_MEM_REPAIR: + attr_gcnt++; + mem_repair_cnt++; + break; + default: + return -EINVAL; + } + } + + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + ras_attr_groups = kcalloc(attr_gcnt + 1, sizeof(*ras_attr_groups), GFP_KERNEL); + if (!ras_attr_groups) + goto ctx_free; + + if (scrub_cnt) { + ctx->scrub = kcalloc(scrub_cnt, sizeof(*ctx->scrub), GFP_KERNEL); + if (!ctx->scrub) + goto groups_free; + } + + if (mem_repair_cnt) { + ctx->mem_repair = kcalloc(mem_repair_cnt, sizeof(*ctx->mem_repair), GFP_KERNEL); + if (!ctx->mem_repair) + goto data_mem_free; + } + + attr_gcnt = 0; + scrub_cnt = 0; + mem_repair_cnt = 0; + for (feat = 0; feat < num_features; feat++, ras_features++) { + switch (ras_features->ft_type) { + case RAS_FEAT_SCRUB: + if (!ras_features->scrub_ops || scrub_cnt != ras_features->instance) { + ret = -EINVAL; + goto data_mem_free; + } + + dev_data = &ctx->scrub[scrub_cnt]; + dev_data->instance = scrub_cnt; + dev_data->scrub_ops = ras_features->scrub_ops; + dev_data->private = ras_features->ctx; + ret = edac_scrub_get_desc(parent, &ras_attr_groups[attr_gcnt], + ras_features->instance); + if (ret) + goto data_mem_free; + + scrub_cnt++; + attr_gcnt++; + break; + case RAS_FEAT_ECS: + if (!ras_features->ecs_ops) { + ret = -EINVAL; + goto data_mem_free; + } + + dev_data = &ctx->ecs; + dev_data->ecs_ops = ras_features->ecs_ops; + dev_data->private = ras_features->ctx; + ret = edac_ecs_get_desc(parent, &ras_attr_groups[attr_gcnt], + ras_features->ecs_info.num_media_frus); + if (ret) + goto data_mem_free; + + attr_gcnt += ras_features->ecs_info.num_media_frus; + break; + case RAS_FEAT_MEM_REPAIR: + if (!ras_features->mem_repair_ops || + mem_repair_cnt != ras_features->instance) { + ret = -EINVAL; + goto data_mem_free; + } + + dev_data = &ctx->mem_repair[mem_repair_cnt]; + dev_data->instance = mem_repair_cnt; + dev_data->mem_repair_ops = ras_features->mem_repair_ops; + dev_data->private = ras_features->ctx; + ret = edac_mem_repair_get_desc(parent, &ras_attr_groups[attr_gcnt], + ras_features->instance); + if (ret) + goto data_mem_free; + + mem_repair_cnt++; + attr_gcnt++; + break; + default: + ret = -EINVAL; + goto data_mem_free; + } + } + + ctx->dev.parent = parent; + ctx->dev.bus = edac_get_sysfs_subsys(); + ctx->dev.type = &edac_dev_type; + ctx->dev.groups = ras_attr_groups; + ctx->private = private; + dev_set_drvdata(&ctx->dev, ctx); + + ret = dev_set_name(&ctx->dev, "%s", name); + if (ret) + goto data_mem_free; + + ret = device_register(&ctx->dev); + if (ret) { + put_device(&ctx->dev); + return ret; + } + + return devm_add_action_or_reset(parent, edac_dev_unreg, &ctx->dev); + +data_mem_free: + kfree(ctx->mem_repair); + kfree(ctx->scrub); +groups_free: + kfree(ras_attr_groups); +ctx_free: + kfree(ctx); + return ret; +} +EXPORT_SYMBOL_GPL(edac_dev_register); diff --git a/drivers/edac/i5400_edac.c b/drivers/edac/i5400_edac.c index 49b4499269fb..b5cf25905b05 100644 --- a/drivers/edac/i5400_edac.c +++ b/drivers/edac/i5400_edac.c @@ -31,6 +31,7 @@ #include <linux/slab.h> #include <linux/edac.h> #include <linux/mmzone.h> +#include <linux/string_choices.h> #include "edac_module.h" @@ -899,7 +900,7 @@ static void decode_mtr(int slot_row, u16 mtr) edac_dbg(2, "\t\tWIDTH: x%d\n", MTR_DRAM_WIDTH(mtr)); edac_dbg(2, "\t\tELECTRICAL THROTTLING is %s\n", - MTR_DIMMS_ETHROTTLE(mtr) ? "enabled" : "disabled"); + str_enabled_disabled(MTR_DIMMS_ETHROTTLE(mtr))); edac_dbg(2, "\t\tNUMBANK: %d bank(s)\n", MTR_DRAM_BANKS(mtr)); edac_dbg(2, "\t\tNUMRANK: %s\n", diff --git a/drivers/edac/i7300_edac.c b/drivers/edac/i7300_edac.c index 61adaa872ba7..69068f8d0cad 100644 --- a/drivers/edac/i7300_edac.c +++ b/drivers/edac/i7300_edac.c @@ -23,6 +23,7 @@ #include <linux/slab.h> #include <linux/edac.h> #include <linux/mmzone.h> +#include <linux/string_choices.h> #include "edac_module.h" @@ -620,7 +621,7 @@ static int decode_mtr(struct i7300_pvt *pvt, edac_dbg(2, "\t\tWIDTH: x%d\n", MTR_DRAM_WIDTH(mtr)); edac_dbg(2, "\t\tELECTRICAL THROTTLING is %s\n", - MTR_DIMMS_ETHROTTLE(mtr) ? "enabled" : "disabled"); + str_enabled_disabled(MTR_DIMMS_ETHROTTLE(mtr))); edac_dbg(2, "\t\tNUMBANK: %d bank(s)\n", MTR_DRAM_BANKS(mtr)); edac_dbg(2, "\t\tNUMRANK: %s\n", @@ -871,9 +872,9 @@ static int i7300_get_mc_regs(struct mem_ctl_info *mci) IS_MIRRORED(pvt->mc_settings) ? "" : "non-"); edac_dbg(0, "Error detection is %s\n", - IS_ECC_ENABLED(pvt->mc_settings) ? "enabled" : "disabled"); + str_enabled_disabled(IS_ECC_ENABLED(pvt->mc_settings))); edac_dbg(0, "Retry is %s\n", - IS_RETRY_ENABLED(pvt->mc_settings) ? "enabled" : "disabled"); + str_enabled_disabled(IS_RETRY_ENABLED(pvt->mc_settings))); /* Get Memory Interleave Range registers */ pci_read_config_word(pvt->pci_dev_16_1_fsb_addr_map, MIR0, diff --git a/drivers/edac/igen6_edac.c b/drivers/edac/igen6_edac.c index 595908af9e5c..5807517ee32d 100644 --- a/drivers/edac/igen6_edac.c +++ b/drivers/edac/igen6_edac.c @@ -125,7 +125,7 @@ #define MEM_SLICE_HASH_MASK(v) (GET_BITFIELD(v, 6, 19) << 6) #define MEM_SLICE_HASH_LSB_MASK_BIT(v) GET_BITFIELD(v, 24, 26) -static struct res_config { +static const struct res_config { bool machine_check; int num_imc; u32 imc_base; @@ -472,7 +472,7 @@ static u64 rpl_p_err_addr(u64 ecclog) return ECC_ERROR_LOG_ADDR45(ecclog); } -static struct res_config ehl_cfg = { +static const struct res_config ehl_cfg = { .num_imc = 1, .imc_base = 0x5000, .ibecc_base = 0xdc00, @@ -482,7 +482,7 @@ static struct res_config ehl_cfg = { .err_addr_to_imc_addr = ehl_err_addr_to_imc_addr, }; -static struct res_config icl_cfg = { +static const struct res_config icl_cfg = { .num_imc = 1, .imc_base = 0x5000, .ibecc_base = 0xd800, @@ -492,7 +492,7 @@ static struct res_config icl_cfg = { .err_addr_to_imc_addr = ehl_err_addr_to_imc_addr, }; -static struct res_config tgl_cfg = { +static const struct res_config tgl_cfg = { .machine_check = true, .num_imc = 2, .imc_base = 0x5000, @@ -506,7 +506,7 @@ static struct res_config tgl_cfg = { .err_addr_to_imc_addr = tgl_err_addr_to_imc_addr, }; -static struct res_config adl_cfg = { +static const struct res_config adl_cfg = { .machine_check = true, .num_imc = 2, .imc_base = 0xd800, @@ -517,7 +517,7 @@ static struct res_config adl_cfg = { .err_addr_to_imc_addr = adl_err_addr_to_imc_addr, }; -static struct res_config adl_n_cfg = { +static const struct res_config adl_n_cfg = { .machine_check = true, .num_imc = 1, .imc_base = 0xd800, @@ -528,7 +528,7 @@ static struct res_config adl_n_cfg = { .err_addr_to_imc_addr = adl_err_addr_to_imc_addr, }; -static struct res_config rpl_p_cfg = { +static const struct res_config rpl_p_cfg = { .machine_check = true, .num_imc = 2, .imc_base = 0xd800, @@ -540,7 +540,7 @@ static struct res_config rpl_p_cfg = { .err_addr_to_imc_addr = adl_err_addr_to_imc_addr, }; -static struct res_config mtl_ps_cfg = { +static const struct res_config mtl_ps_cfg = { .machine_check = true, .num_imc = 2, .imc_base = 0xd800, @@ -551,7 +551,7 @@ static struct res_config mtl_ps_cfg = { .err_addr_to_imc_addr = adl_err_addr_to_imc_addr, }; -static struct res_config mtl_p_cfg = { +static const struct res_config mtl_p_cfg = { .machine_check = true, .num_imc = 2, .imc_base = 0xd800, @@ -1383,7 +1383,7 @@ static void unregister_err_handler(void) unregister_nmi_handler(NMI_SERR, IGEN6_NMI_NAME); } -static void opstate_set(struct res_config *cfg, const struct pci_device_id *ent) +static void opstate_set(const struct res_config *cfg, const struct pci_device_id *ent) { /* * Quirk: Certain SoCs' error reporting interrupts don't work. diff --git a/drivers/edac/mem_repair.c b/drivers/edac/mem_repair.c new file mode 100755 index 000000000000..3b1a845457b0 --- /dev/null +++ b/drivers/edac/mem_repair.c @@ -0,0 +1,359 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * The generic EDAC memory repair driver is designed to control the memory + * devices with memory repair features, such as Post Package Repair (PPR), + * memory sparing etc. The common sysfs memory repair interface abstracts + * the control of various arbitrary memory repair functionalities into a + * unified set of functions. + * + * Copyright (c) 2024-2025 HiSilicon Limited. + */ + +#include <linux/edac.h> + +enum edac_mem_repair_attributes { + MR_TYPE, + MR_PERSIST_MODE, + MR_SAFE_IN_USE, + MR_HPA, + MR_MIN_HPA, + MR_MAX_HPA, + MR_DPA, + MR_MIN_DPA, + MR_MAX_DPA, + MR_NIBBLE_MASK, + MR_BANK_GROUP, + MR_BANK, + MR_RANK, + MR_ROW, + MR_COLUMN, + MR_CHANNEL, + MR_SUB_CHANNEL, + MEM_DO_REPAIR, + MR_MAX_ATTRS +}; + +struct edac_mem_repair_dev_attr { + struct device_attribute dev_attr; + u8 instance; +}; + +struct edac_mem_repair_context { + char name[EDAC_FEAT_NAME_LEN]; + struct edac_mem_repair_dev_attr mem_repair_dev_attr[MR_MAX_ATTRS]; + struct attribute *mem_repair_attrs[MR_MAX_ATTRS + 1]; + struct attribute_group group; +}; + +#define TO_MR_DEV_ATTR(_dev_attr) \ + container_of(_dev_attr, struct edac_mem_repair_dev_attr, dev_attr) + +#define MR_ATTR_SHOW(attrib, cb, type, format) \ +static ssize_t attrib##_show(struct device *ras_feat_dev, \ + struct device_attribute *attr, char *buf) \ +{ \ + u8 inst = TO_MR_DEV_ATTR(attr)->instance; \ + struct edac_dev_feat_ctx *ctx = dev_get_drvdata(ras_feat_dev); \ + const struct edac_mem_repair_ops *ops = \ + ctx->mem_repair[inst].mem_repair_ops; \ + type data; \ + int ret; \ + \ + ret = ops->cb(ras_feat_dev->parent, ctx->mem_repair[inst].private, \ + &data); \ + if (ret) \ + return ret; \ + \ + return sysfs_emit(buf, format, data); \ +} + +MR_ATTR_SHOW(repair_type, get_repair_type, const char *, "%s\n") +MR_ATTR_SHOW(persist_mode, get_persist_mode, bool, "%u\n") +MR_ATTR_SHOW(repair_safe_when_in_use, get_repair_safe_when_in_use, bool, "%u\n") +MR_ATTR_SHOW(hpa, get_hpa, u64, "0x%llx\n") +MR_ATTR_SHOW(min_hpa, get_min_hpa, u64, "0x%llx\n") +MR_ATTR_SHOW(max_hpa, get_max_hpa, u64, "0x%llx\n") +MR_ATTR_SHOW(dpa, get_dpa, u64, "0x%llx\n") +MR_ATTR_SHOW(min_dpa, get_min_dpa, u64, "0x%llx\n") +MR_ATTR_SHOW(max_dpa, get_max_dpa, u64, "0x%llx\n") +MR_ATTR_SHOW(nibble_mask, get_nibble_mask, u32, "0x%x\n") +MR_ATTR_SHOW(bank_group, get_bank_group, u32, "%u\n") +MR_ATTR_SHOW(bank, get_bank, u32, "%u\n") +MR_ATTR_SHOW(rank, get_rank, u32, "%u\n") +MR_ATTR_SHOW(row, get_row, u32, "0x%x\n") +MR_ATTR_SHOW(column, get_column, u32, "%u\n") +MR_ATTR_SHOW(channel, get_channel, u32, "%u\n") +MR_ATTR_SHOW(sub_channel, get_sub_channel, u32, "%u\n") + +#define MR_ATTR_STORE(attrib, cb, type, conv_func) \ +static ssize_t attrib##_store(struct device *ras_feat_dev, \ + struct device_attribute *attr, \ + const char *buf, size_t len) \ +{ \ + u8 inst = TO_MR_DEV_ATTR(attr)->instance; \ + struct edac_dev_feat_ctx *ctx = dev_get_drvdata(ras_feat_dev); \ + const struct edac_mem_repair_ops *ops = \ + ctx->mem_repair[inst].mem_repair_ops; \ + type data; \ + int ret; \ + \ + ret = conv_func(buf, 0, &data); \ + if (ret < 0) \ + return ret; \ + \ + ret = ops->cb(ras_feat_dev->parent, ctx->mem_repair[inst].private, \ + data); \ + if (ret) \ + return ret; \ + \ + return len; \ +} + +MR_ATTR_STORE(persist_mode, set_persist_mode, unsigned long, kstrtoul) +MR_ATTR_STORE(hpa, set_hpa, u64, kstrtou64) +MR_ATTR_STORE(dpa, set_dpa, u64, kstrtou64) +MR_ATTR_STORE(nibble_mask, set_nibble_mask, unsigned long, kstrtoul) +MR_ATTR_STORE(bank_group, set_bank_group, unsigned long, kstrtoul) +MR_ATTR_STORE(bank, set_bank, unsigned long, kstrtoul) +MR_ATTR_STORE(rank, set_rank, unsigned long, kstrtoul) +MR_ATTR_STORE(row, set_row, unsigned long, kstrtoul) +MR_ATTR_STORE(column, set_column, unsigned long, kstrtoul) +MR_ATTR_STORE(channel, set_channel, unsigned long, kstrtoul) +MR_ATTR_STORE(sub_channel, set_sub_channel, unsigned long, kstrtoul) + +#define MR_DO_OP(attrib, cb) \ +static ssize_t attrib##_store(struct device *ras_feat_dev, \ + struct device_attribute *attr, \ + const char *buf, size_t len) \ +{ \ + u8 inst = TO_MR_DEV_ATTR(attr)->instance; \ + struct edac_dev_feat_ctx *ctx = dev_get_drvdata(ras_feat_dev); \ + const struct edac_mem_repair_ops *ops = ctx->mem_repair[inst].mem_repair_ops; \ + unsigned long data; \ + int ret; \ + \ + ret = kstrtoul(buf, 0, &data); \ + if (ret < 0) \ + return ret; \ + \ + ret = ops->cb(ras_feat_dev->parent, ctx->mem_repair[inst].private, data); \ + if (ret) \ + return ret; \ + \ + return len; \ +} + +MR_DO_OP(repair, do_repair) + +static umode_t mem_repair_attr_visible(struct kobject *kobj, struct attribute *a, int attr_id) +{ + struct device *ras_feat_dev = kobj_to_dev(kobj); + struct device_attribute *dev_attr = container_of(a, struct device_attribute, attr); + struct edac_dev_feat_ctx *ctx = dev_get_drvdata(ras_feat_dev); + u8 inst = TO_MR_DEV_ATTR(dev_attr)->instance; + const struct edac_mem_repair_ops *ops = ctx->mem_repair[inst].mem_repair_ops; + + switch (attr_id) { + case MR_TYPE: + if (ops->get_repair_type) + return a->mode; + break; + case MR_PERSIST_MODE: + if (ops->get_persist_mode) { + if (ops->set_persist_mode) + return a->mode; + else + return 0444; + } + break; + case MR_SAFE_IN_USE: + if (ops->get_repair_safe_when_in_use) + return a->mode; + break; + case MR_HPA: + if (ops->get_hpa) { + if (ops->set_hpa) + return a->mode; + else + return 0444; + } + break; + case MR_MIN_HPA: + if (ops->get_min_hpa) + return a->mode; + break; + case MR_MAX_HPA: + if (ops->get_max_hpa) + return a->mode; + break; + case MR_DPA: + if (ops->get_dpa) { + if (ops->set_dpa) + return a->mode; + else + return 0444; + } + break; + case MR_MIN_DPA: + if (ops->get_min_dpa) + return a->mode; + break; + case MR_MAX_DPA: + if (ops->get_max_dpa) + return a->mode; + break; + case MR_NIBBLE_MASK: + if (ops->get_nibble_mask) { + if (ops->set_nibble_mask) + return a->mode; + else + return 0444; + } + break; + case MR_BANK_GROUP: + if (ops->get_bank_group) { + if (ops->set_bank_group) + return a->mode; + else + return 0444; + } + break; + case MR_BANK: + if (ops->get_bank) { + if (ops->set_bank) + return a->mode; + else + return 0444; + } + break; + case MR_RANK: + if (ops->get_rank) { + if (ops->set_rank) + return a->mode; + else + return 0444; + } + break; + case MR_ROW: + if (ops->get_row) { + if (ops->set_row) + return a->mode; + else + return 0444; + } + break; + case MR_COLUMN: + if (ops->get_column) { + if (ops->set_column) + return a->mode; + else + return 0444; + } + break; + case MR_CHANNEL: + if (ops->get_channel) { + if (ops->set_channel) + return a->mode; + else + return 0444; + } + break; + case MR_SUB_CHANNEL: + if (ops->get_sub_channel) { + if (ops->set_sub_channel) + return a->mode; + else + return 0444; + } + break; + case MEM_DO_REPAIR: + if (ops->do_repair) + return a->mode; + break; + default: + break; + } + + return 0; +} + +#define MR_ATTR_RO(_name, _instance) \ + ((struct edac_mem_repair_dev_attr) { .dev_attr = __ATTR_RO(_name), \ + .instance = _instance }) + +#define MR_ATTR_WO(_name, _instance) \ + ((struct edac_mem_repair_dev_attr) { .dev_attr = __ATTR_WO(_name), \ + .instance = _instance }) + +#define MR_ATTR_RW(_name, _instance) \ + ((struct edac_mem_repair_dev_attr) { .dev_attr = __ATTR_RW(_name), \ + .instance = _instance }) + +static int mem_repair_create_desc(struct device *dev, + const struct attribute_group **attr_groups, + u8 instance) +{ + struct edac_mem_repair_context *ctx; + struct attribute_group *group; + int i; + struct edac_mem_repair_dev_attr dev_attr[] = { + [MR_TYPE] = MR_ATTR_RO(repair_type, instance), + [MR_PERSIST_MODE] = MR_ATTR_RW(persist_mode, instance), + [MR_SAFE_IN_USE] = MR_ATTR_RO(repair_safe_when_in_use, instance), + [MR_HPA] = MR_ATTR_RW(hpa, instance), + [MR_MIN_HPA] = MR_ATTR_RO(min_hpa, instance), + [MR_MAX_HPA] = MR_ATTR_RO(max_hpa, instance), + [MR_DPA] = MR_ATTR_RW(dpa, instance), + [MR_MIN_DPA] = MR_ATTR_RO(min_dpa, instance), + [MR_MAX_DPA] = MR_ATTR_RO(max_dpa, instance), + [MR_NIBBLE_MASK] = MR_ATTR_RW(nibble_mask, instance), + [MR_BANK_GROUP] = MR_ATTR_RW(bank_group, instance), + [MR_BANK] = MR_ATTR_RW(bank, instance), + [MR_RANK] = MR_ATTR_RW(rank, instance), + [MR_ROW] = MR_ATTR_RW(row, instance), + [MR_COLUMN] = MR_ATTR_RW(column, instance), + [MR_CHANNEL] = MR_ATTR_RW(channel, instance), + [MR_SUB_CHANNEL] = MR_ATTR_RW(sub_channel, instance), + [MEM_DO_REPAIR] = MR_ATTR_WO(repair, instance) + }; + + ctx = devm_kzalloc(dev, sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + for (i = 0; i < MR_MAX_ATTRS; i++) { + memcpy(&ctx->mem_repair_dev_attr[i], + &dev_attr[i], sizeof(dev_attr[i])); + ctx->mem_repair_attrs[i] = + &ctx->mem_repair_dev_attr[i].dev_attr.attr; + } + + sprintf(ctx->name, "%s%d", "mem_repair", instance); + group = &ctx->group; + group->name = ctx->name; + group->attrs = ctx->mem_repair_attrs; + group->is_visible = mem_repair_attr_visible; + attr_groups[0] = group; + + return 0; +} + +/** + * edac_mem_repair_get_desc - get EDAC memory repair descriptors + * @dev: client device with memory repair feature + * @attr_groups: pointer to attribute group container + * @instance: device's memory repair instance number. + * + * Return: + * * %0 - Success. + * * %-EINVAL - Invalid parameters passed. + * * %-ENOMEM - Dynamic memory allocation failed. + */ +int edac_mem_repair_get_desc(struct device *dev, + const struct attribute_group **attr_groups, u8 instance) +{ + if (!dev || !attr_groups) + return -EINVAL; + + return mem_repair_create_desc(dev, attr_groups, instance); +} diff --git a/drivers/edac/pnd2_edac.c b/drivers/edac/pnd2_edac.c index f93f2f2b1cf2..af14c8a3279f 100644 --- a/drivers/edac/pnd2_edac.c +++ b/drivers/edac/pnd2_edac.c @@ -372,7 +372,7 @@ static int gen_asym_mask(struct b_cr_slice_channel_hash *p, struct b_cr_asym_mem_region1_mchbar *as1, struct b_cr_asym_2way_mem_region_mchbar *as2way) { - const int intlv[] = { 0x5, 0xA, 0x3, 0xC }; + static const int intlv[] = { 0x5, 0xA, 0x3, 0xC }; int mask = 0; if (as2way->asym_2way_interleave_enable) @@ -489,7 +489,7 @@ static int dnv_get_registers(void) */ static int get_registers(void) { - const int intlv[] = { 10, 11, 12, 12 }; + static const int intlv[] = { 10, 11, 12, 12 }; if (RD_REG(&tolud, b_cr_tolud_pci) || RD_REG(&touud_lo, b_cr_touud_lo_pci) || diff --git a/drivers/edac/scrub.c b/drivers/edac/scrub.c new file mode 100755 index 000000000000..e421d3ebd959 --- /dev/null +++ b/drivers/edac/scrub.c @@ -0,0 +1,209 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * The generic EDAC scrub driver controls the memory scrubbers in the + * system. The common sysfs scrub interface abstracts the control of + * various arbitrary scrubbing functionalities into a unified set of + * functions. + * + * Copyright (c) 2024-2025 HiSilicon Limited. + */ + +#include <linux/edac.h> + +enum edac_scrub_attributes { + SCRUB_ADDRESS, + SCRUB_SIZE, + SCRUB_ENABLE_BACKGROUND, + SCRUB_MIN_CYCLE_DURATION, + SCRUB_MAX_CYCLE_DURATION, + SCRUB_CUR_CYCLE_DURATION, + SCRUB_MAX_ATTRS +}; + +struct edac_scrub_dev_attr { + struct device_attribute dev_attr; + u8 instance; +}; + +struct edac_scrub_context { + char name[EDAC_FEAT_NAME_LEN]; + struct edac_scrub_dev_attr scrub_dev_attr[SCRUB_MAX_ATTRS]; + struct attribute *scrub_attrs[SCRUB_MAX_ATTRS + 1]; + struct attribute_group group; +}; + +#define TO_SCRUB_DEV_ATTR(_dev_attr) \ + container_of(_dev_attr, struct edac_scrub_dev_attr, dev_attr) + +#define EDAC_SCRUB_ATTR_SHOW(attrib, cb, type, format) \ +static ssize_t attrib##_show(struct device *ras_feat_dev, \ + struct device_attribute *attr, char *buf) \ +{ \ + u8 inst = TO_SCRUB_DEV_ATTR(attr)->instance; \ + struct edac_dev_feat_ctx *ctx = dev_get_drvdata(ras_feat_dev); \ + const struct edac_scrub_ops *ops = ctx->scrub[inst].scrub_ops; \ + type data; \ + int ret; \ + \ + ret = ops->cb(ras_feat_dev->parent, ctx->scrub[inst].private, &data); \ + if (ret) \ + return ret; \ + \ + return sysfs_emit(buf, format, data); \ +} + +EDAC_SCRUB_ATTR_SHOW(addr, read_addr, u64, "0x%llx\n") +EDAC_SCRUB_ATTR_SHOW(size, read_size, u64, "0x%llx\n") +EDAC_SCRUB_ATTR_SHOW(enable_background, get_enabled_bg, bool, "%u\n") +EDAC_SCRUB_ATTR_SHOW(min_cycle_duration, get_min_cycle, u32, "%u\n") +EDAC_SCRUB_ATTR_SHOW(max_cycle_duration, get_max_cycle, u32, "%u\n") +EDAC_SCRUB_ATTR_SHOW(current_cycle_duration, get_cycle_duration, u32, "%u\n") + +#define EDAC_SCRUB_ATTR_STORE(attrib, cb, type, conv_func) \ +static ssize_t attrib##_store(struct device *ras_feat_dev, \ + struct device_attribute *attr, \ + const char *buf, size_t len) \ +{ \ + u8 inst = TO_SCRUB_DEV_ATTR(attr)->instance; \ + struct edac_dev_feat_ctx *ctx = dev_get_drvdata(ras_feat_dev); \ + const struct edac_scrub_ops *ops = ctx->scrub[inst].scrub_ops; \ + type data; \ + int ret; \ + \ + ret = conv_func(buf, 0, &data); \ + if (ret < 0) \ + return ret; \ + \ + ret = ops->cb(ras_feat_dev->parent, ctx->scrub[inst].private, data); \ + if (ret) \ + return ret; \ + \ + return len; \ +} + +EDAC_SCRUB_ATTR_STORE(addr, write_addr, u64, kstrtou64) +EDAC_SCRUB_ATTR_STORE(size, write_size, u64, kstrtou64) +EDAC_SCRUB_ATTR_STORE(enable_background, set_enabled_bg, unsigned long, kstrtoul) +EDAC_SCRUB_ATTR_STORE(current_cycle_duration, set_cycle_duration, unsigned long, kstrtoul) + +static umode_t scrub_attr_visible(struct kobject *kobj, struct attribute *a, int attr_id) +{ + struct device *ras_feat_dev = kobj_to_dev(kobj); + struct device_attribute *dev_attr = container_of(a, struct device_attribute, attr); + u8 inst = TO_SCRUB_DEV_ATTR(dev_attr)->instance; + struct edac_dev_feat_ctx *ctx = dev_get_drvdata(ras_feat_dev); + const struct edac_scrub_ops *ops = ctx->scrub[inst].scrub_ops; + + switch (attr_id) { + case SCRUB_ADDRESS: + if (ops->read_addr) { + if (ops->write_addr) + return a->mode; + else + return 0444; + } + break; + case SCRUB_SIZE: + if (ops->read_size) { + if (ops->write_size) + return a->mode; + else + return 0444; + } + break; + case SCRUB_ENABLE_BACKGROUND: + if (ops->get_enabled_bg) { + if (ops->set_enabled_bg) + return a->mode; + else + return 0444; + } + break; + case SCRUB_MIN_CYCLE_DURATION: + if (ops->get_min_cycle) + return a->mode; + break; + case SCRUB_MAX_CYCLE_DURATION: + if (ops->get_max_cycle) + return a->mode; + break; + case SCRUB_CUR_CYCLE_DURATION: + if (ops->get_cycle_duration) { + if (ops->set_cycle_duration) + return a->mode; + else + return 0444; + } + break; + default: + break; + } + + return 0; +} + +#define EDAC_SCRUB_ATTR_RO(_name, _instance) \ + ((struct edac_scrub_dev_attr) { .dev_attr = __ATTR_RO(_name), \ + .instance = _instance }) + +#define EDAC_SCRUB_ATTR_WO(_name, _instance) \ + ((struct edac_scrub_dev_attr) { .dev_attr = __ATTR_WO(_name), \ + .instance = _instance }) + +#define EDAC_SCRUB_ATTR_RW(_name, _instance) \ + ((struct edac_scrub_dev_attr) { .dev_attr = __ATTR_RW(_name), \ + .instance = _instance }) + +static int scrub_create_desc(struct device *scrub_dev, + const struct attribute_group **attr_groups, u8 instance) +{ + struct edac_scrub_context *scrub_ctx; + struct attribute_group *group; + int i; + struct edac_scrub_dev_attr dev_attr[] = { + [SCRUB_ADDRESS] = EDAC_SCRUB_ATTR_RW(addr, instance), + [SCRUB_SIZE] = EDAC_SCRUB_ATTR_RW(size, instance), + [SCRUB_ENABLE_BACKGROUND] = EDAC_SCRUB_ATTR_RW(enable_background, instance), + [SCRUB_MIN_CYCLE_DURATION] = EDAC_SCRUB_ATTR_RO(min_cycle_duration, instance), + [SCRUB_MAX_CYCLE_DURATION] = EDAC_SCRUB_ATTR_RO(max_cycle_duration, instance), + [SCRUB_CUR_CYCLE_DURATION] = EDAC_SCRUB_ATTR_RW(current_cycle_duration, instance) + }; + + scrub_ctx = devm_kzalloc(scrub_dev, sizeof(*scrub_ctx), GFP_KERNEL); + if (!scrub_ctx) + return -ENOMEM; + + group = &scrub_ctx->group; + for (i = 0; i < SCRUB_MAX_ATTRS; i++) { + memcpy(&scrub_ctx->scrub_dev_attr[i], &dev_attr[i], sizeof(dev_attr[i])); + scrub_ctx->scrub_attrs[i] = &scrub_ctx->scrub_dev_attr[i].dev_attr.attr; + } + sprintf(scrub_ctx->name, "%s%d", "scrub", instance); + group->name = scrub_ctx->name; + group->attrs = scrub_ctx->scrub_attrs; + group->is_visible = scrub_attr_visible; + + attr_groups[0] = group; + + return 0; +} + +/** + * edac_scrub_get_desc - get EDAC scrub descriptors + * @scrub_dev: client device, with scrub support + * @attr_groups: pointer to attribute group container + * @instance: device's scrub instance number. + * + * Return: + * * %0 - Success. + * * %-EINVAL - Invalid parameters passed. + * * %-ENOMEM - Dynamic memory allocation failed. + */ +int edac_scrub_get_desc(struct device *scrub_dev, + const struct attribute_group **attr_groups, u8 instance) +{ + if (!scrub_dev || !attr_groups) + return -EINVAL; + + return scrub_create_desc(scrub_dev, attr_groups, instance); +} diff --git a/drivers/edac/xgene_edac.c b/drivers/edac/xgene_edac.c index 699c7d29d80c..9955396c9a52 100644 --- a/drivers/edac/xgene_edac.c +++ b/drivers/edac/xgene_edac.c @@ -15,6 +15,7 @@ #include <linux/of.h> #include <linux/of_address.h> #include <linux/regmap.h> +#include <linux/string_choices.h> #include "edac_module.h" @@ -1407,7 +1408,7 @@ static void xgene_edac_iob_gic_report(struct edac_device_ctl_info *edac_dev) dev_err(edac_dev->dev, "Multiple XGIC write size error\n"); info = readl(ctx->dev_csr + XGICTRANSERRREQINFO); dev_err(edac_dev->dev, "XGIC %s access @ 0x%08X (0x%08X)\n", - info & REQTYPE_MASK ? "read" : "write", ERRADDR_RD(info), + str_read_write(info & REQTYPE_MASK), ERRADDR_RD(info), info); writel(reg, ctx->dev_csr + XGICTRANSERRINTSTS); @@ -1489,19 +1490,19 @@ static void xgene_edac_rb_report(struct edac_device_ctl_info *edac_dev) if (reg & AGENT_OFFLINE_ERR_MASK) dev_err(edac_dev->dev, "IOB bus %s access to offline agent error\n", - write ? "write" : "read"); + str_write_read(write)); if (reg & UNIMPL_RBPAGE_ERR_MASK) dev_err(edac_dev->dev, "IOB bus %s access to unimplemented page error\n", - write ? "write" : "read"); + str_write_read(write)); if (reg & WORD_ALIGNED_ERR_MASK) dev_err(edac_dev->dev, "IOB bus %s word aligned access error\n", - write ? "write" : "read"); + str_write_read(write)); if (reg & PAGE_ACCESS_ERR_MASK) dev_err(edac_dev->dev, "IOB bus %s to page out of range access error\n", - write ? "write" : "read"); + str_write_read(write)); if (regmap_write(ctx->edac->rb_map, RBEIR, 0)) return; if (regmap_write(ctx->edac->rb_map, RBCSR, 0)) @@ -1560,7 +1561,7 @@ rb_skip: err_addr_lo = readl(ctx->dev_csr + IOBBATRANSERRREQINFOL); err_addr_hi = readl(ctx->dev_csr + IOBBATRANSERRREQINFOH); dev_err(edac_dev->dev, "IOB BA %s access at 0x%02X.%08X (0x%08X)\n", - REQTYPE_F2_RD(err_addr_hi) ? "read" : "write", + str_read_write(REQTYPE_F2_RD(err_addr_hi)), ERRADDRH_F2_RD(err_addr_hi), err_addr_lo, err_addr_hi); if (reg & WRERR_RESP_MASK) dev_err(edac_dev->dev, "IOB BA requestor ID 0x%08X\n", @@ -1611,7 +1612,7 @@ chk_iob_axi0: dev_err(edac_dev->dev, "%sAXI slave 0 illegal %s access @ 0x%02X.%08X (0x%08X)\n", reg & IOBAXIS0_M_ILLEGAL_ACCESS_MASK ? "Multiple " : "", - REQTYPE_RD(err_addr_hi) ? "read" : "write", + str_read_write(REQTYPE_RD(err_addr_hi)), ERRADDRH_RD(err_addr_hi), err_addr_lo, err_addr_hi); writel(reg, ctx->dev_csr + IOBAXIS0TRANSERRINTSTS); @@ -1625,7 +1626,7 @@ chk_iob_axi1: dev_err(edac_dev->dev, "%sAXI slave 1 illegal %s access @ 0x%02X.%08X (0x%08X)\n", reg & IOBAXIS0_M_ILLEGAL_ACCESS_MASK ? "Multiple " : "", - REQTYPE_RD(err_addr_hi) ? "read" : "write", + str_read_write(REQTYPE_RD(err_addr_hi)), ERRADDRH_RD(err_addr_hi), err_addr_lo, err_addr_hi); writel(reg, ctx->dev_csr + IOBAXIS1TRANSERRINTSTS); } diff --git a/include/linux/edac.h b/include/linux/edac.h index b4ee8961e623..451f9c152c99 100644 --- a/include/linux/edac.h +++ b/include/linux/edac.h @@ -661,4 +661,219 @@ static inline struct dimm_info *edac_get_dimm(struct mem_ctl_info *mci, return mci->dimms[index]; } + +#define EDAC_FEAT_NAME_LEN 128 + +/* RAS feature type */ +enum edac_dev_feat { + RAS_FEAT_SCRUB, + RAS_FEAT_ECS, + RAS_FEAT_MEM_REPAIR, + RAS_FEAT_MAX +}; + +/** + * struct edac_scrub_ops - scrub device operations (all elements optional) + * @read_addr: read base address of scrubbing range. + * @read_size: read offset of scrubbing range. + * @write_addr: set base address of the scrubbing range. + * @write_size: set offset of the scrubbing range. + * @get_enabled_bg: check if currently performing background scrub. + * @set_enabled_bg: start or stop a bg-scrub. + * @get_min_cycle: get minimum supported scrub cycle duration in seconds. + * @get_max_cycle: get maximum supported scrub cycle duration in seconds. + * @get_cycle_duration: get current scrub cycle duration in seconds. + * @set_cycle_duration: set current scrub cycle duration in seconds. + */ +struct edac_scrub_ops { + int (*read_addr)(struct device *dev, void *drv_data, u64 *base); + int (*read_size)(struct device *dev, void *drv_data, u64 *size); + int (*write_addr)(struct device *dev, void *drv_data, u64 base); + int (*write_size)(struct device *dev, void *drv_data, u64 size); + int (*get_enabled_bg)(struct device *dev, void *drv_data, bool *enable); + int (*set_enabled_bg)(struct device *dev, void *drv_data, bool enable); + int (*get_min_cycle)(struct device *dev, void *drv_data, u32 *min); + int (*get_max_cycle)(struct device *dev, void *drv_data, u32 *max); + int (*get_cycle_duration)(struct device *dev, void *drv_data, u32 *cycle); + int (*set_cycle_duration)(struct device *dev, void *drv_data, u32 cycle); +}; + +#if IS_ENABLED(CONFIG_EDAC_SCRUB) +int edac_scrub_get_desc(struct device *scrub_dev, + const struct attribute_group **attr_groups, + u8 instance); +#else +static inline int edac_scrub_get_desc(struct device *scrub_dev, + const struct attribute_group **attr_groups, + u8 instance) +{ return -EOPNOTSUPP; } +#endif /* CONFIG_EDAC_SCRUB */ + +/** + * struct edac_ecs_ops - ECS device operations (all elements optional) + * @get_log_entry_type: read the log entry type value. + * @set_log_entry_type: set the log entry type value. + * @get_mode: read the mode value. + * @set_mode: set the mode value. + * @reset: reset the ECS counter. + * @get_threshold: read the threshold count per gigabits of memory cells. + * @set_threshold: set the threshold count per gigabits of memory cells. + */ +struct edac_ecs_ops { + int (*get_log_entry_type)(struct device *dev, void *drv_data, int fru_id, u32 *val); + int (*set_log_entry_type)(struct device *dev, void *drv_data, int fru_id, u32 val); + int (*get_mode)(struct device *dev, void *drv_data, int fru_id, u32 *val); + int (*set_mode)(struct device *dev, void *drv_data, int fru_id, u32 val); + int (*reset)(struct device *dev, void *drv_data, int fru_id, u32 val); + int (*get_threshold)(struct device *dev, void *drv_data, int fru_id, u32 *threshold); + int (*set_threshold)(struct device *dev, void *drv_data, int fru_id, u32 threshold); +}; + +struct edac_ecs_ex_info { + u16 num_media_frus; +}; + +#if IS_ENABLED(CONFIG_EDAC_ECS) +int edac_ecs_get_desc(struct device *ecs_dev, + const struct attribute_group **attr_groups, + u16 num_media_frus); +#else +static inline int edac_ecs_get_desc(struct device *ecs_dev, + const struct attribute_group **attr_groups, + u16 num_media_frus) +{ return -EOPNOTSUPP; } +#endif /* CONFIG_EDAC_ECS */ + +enum edac_mem_repair_type { + EDAC_REPAIR_MAX +}; + +enum edac_mem_repair_cmd { + EDAC_DO_MEM_REPAIR = 1, +}; + +/** + * struct edac_mem_repair_ops - memory repair operations + * (all elements are optional except do_repair, set_hpa/set_dpa) + * @get_repair_type: get the memory repair type, listed in + * enum edac_mem_repair_function. + * @get_persist_mode: get the current persist mode. + * false - Soft repair type (temporary repair). + * true - Hard memory repair type (permanent repair). + * @set_persist_mode: set the persist mode of the memory repair instance. + * @get_repair_safe_when_in_use: get whether memory media is accessible and + * data is retained during repair operation. + * @get_hpa: get current host physical address (HPA) of memory to repair. + * @set_hpa: set host physical address (HPA) of memory to repair. + * @get_min_hpa: get the minimum supported host physical address (HPA). + * @get_max_hpa: get the maximum supported host physical address (HPA). + * @get_dpa: get current device physical address (DPA) of memory to repair. + * @set_dpa: set device physical address (DPA) of memory to repair. + * In some states of system configuration (e.g. before address decoders + * have been configured), memory devices (e.g. CXL) may not have an active + * mapping in the host physical address map. As such, the memory + * to repair must be identified by a device specific physical addressing + * scheme using a device physical address(DPA). The DPA and other control + * attributes to use for the repair operations will be presented in related + * error records. + * @get_min_dpa: get the minimum supported device physical address (DPA). + * @get_max_dpa: get the maximum supported device physical address (DPA). + * @get_nibble_mask: get current nibble mask of memory to repair. + * @set_nibble_mask: set nibble mask of memory to repair. + * @get_bank_group: get current bank group of memory to repair. + * @set_bank_group: set bank group of memory to repair. + * @get_bank: get current bank of memory to repair. + * @set_bank: set bank of memory to repair. + * @get_rank: get current rank of memory to repair. + * @set_rank: set rank of memory to repair. + * @get_row: get current row of memory to repair. + * @set_row: set row of memory to repair. + * @get_column: get current column of memory to repair. + * @set_column: set column of memory to repair. + * @get_channel: get current channel of memory to repair. + * @set_channel: set channel of memory to repair. + * @get_sub_channel: get current subchannel of memory to repair. + * @set_sub_channel: set subchannel of memory to repair. + * @do_repair: Issue memory repair operation for the HPA/DPA and + * other control attributes set for the memory to repair. + * + * All elements are optional except do_repair and at least one of set_hpa/set_dpa. + */ +struct edac_mem_repair_ops { + int (*get_repair_type)(struct device *dev, void *drv_data, const char **type); + int (*get_persist_mode)(struct device *dev, void *drv_data, bool *persist); + int (*set_persist_mode)(struct device *dev, void *drv_data, bool persist); + int (*get_repair_safe_when_in_use)(struct device *dev, void *drv_data, bool *safe); + int (*get_hpa)(struct device *dev, void *drv_data, u64 *hpa); + int (*set_hpa)(struct device *dev, void *drv_data, u64 hpa); + int (*get_min_hpa)(struct device *dev, void *drv_data, u64 *hpa); + int (*get_max_hpa)(struct device *dev, void *drv_data, u64 *hpa); + int (*get_dpa)(struct device *dev, void *drv_data, u64 *dpa); + int (*set_dpa)(struct device *dev, void *drv_data, u64 dpa); + int (*get_min_dpa)(struct device *dev, void *drv_data, u64 *dpa); + int (*get_max_dpa)(struct device *dev, void *drv_data, u64 *dpa); + int (*get_nibble_mask)(struct device *dev, void *drv_data, u32 *val); + int (*set_nibble_mask)(struct device *dev, void *drv_data, u32 val); + int (*get_bank_group)(struct device *dev, void *drv_data, u32 *val); + int (*set_bank_group)(struct device *dev, void *drv_data, u32 val); + int (*get_bank)(struct device *dev, void *drv_data, u32 *val); + int (*set_bank)(struct device *dev, void *drv_data, u32 val); + int (*get_rank)(struct device *dev, void *drv_data, u32 *val); + int (*set_rank)(struct device *dev, void *drv_data, u32 val); + int (*get_row)(struct device *dev, void *drv_data, u32 *val); + int (*set_row)(struct device *dev, void *drv_data, u32 val); + int (*get_column)(struct device *dev, void *drv_data, u32 *val); + int (*set_column)(struct device *dev, void *drv_data, u32 val); + int (*get_channel)(struct device *dev, void *drv_data, u32 *val); + int (*set_channel)(struct device *dev, void *drv_data, u32 val); + int (*get_sub_channel)(struct device *dev, void *drv_data, u32 *val); + int (*set_sub_channel)(struct device *dev, void *drv_data, u32 val); + int (*do_repair)(struct device *dev, void *drv_data, u32 val); +}; + +#if IS_ENABLED(CONFIG_EDAC_MEM_REPAIR) +int edac_mem_repair_get_desc(struct device *dev, + const struct attribute_group **attr_groups, + u8 instance); +#else +static inline int edac_mem_repair_get_desc(struct device *dev, + const struct attribute_group **attr_groups, + u8 instance) +{ return -EOPNOTSUPP; } +#endif /* CONFIG_EDAC_MEM_REPAIR */ + +/* EDAC device feature information structure */ +struct edac_dev_data { + union { + const struct edac_scrub_ops *scrub_ops; + const struct edac_ecs_ops *ecs_ops; + const struct edac_mem_repair_ops *mem_repair_ops; + }; + u8 instance; + void *private; +}; + +struct edac_dev_feat_ctx { + struct device dev; + void *private; + struct edac_dev_data *scrub; + struct edac_dev_data ecs; + struct edac_dev_data *mem_repair; +}; + +struct edac_dev_feature { + enum edac_dev_feat ft_type; + u8 instance; + union { + const struct edac_scrub_ops *scrub_ops; + const struct edac_ecs_ops *ecs_ops; + const struct edac_mem_repair_ops *mem_repair_ops; + }; + void *ctx; + struct edac_ecs_ex_info ecs_info; +}; + +int edac_dev_register(struct device *parent, char *dev_name, + void *parent_pvt_data, int num_features, + const struct edac_dev_feature *ras_features); #endif /* _LINUX_EDAC_H_ */ |