From 3f3174996be6b4312c38f54d5969f5d5b75fec9e Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Mon, 22 Jan 2024 22:13:59 -0600 Subject: RAS: Introduce AMD Address Translation Library AMD Zen-based systems report memory errors through Machine Check banks representing Unified Memory Controllers (UMCs). The address value reported for DRAM ECC errors is a "normalized address" that is relative to the UMC. This normalized address must be converted to a system physical address to be usable by the OS. Support for this address translation was introduced to the MCA subsystem with Zen1 systems. The code was later moved to the AMD64 EDAC module, since this was the only user of the code at the time. However, there are uses for this translation outside of EDAC. The system physical address can be used in MCA for preemptive page offlining as done in some MCA notifier functions. Also, this translation is needed as the basis of similar functionality needed for some CXL configurations on AMD systems. Introduce a common address translation library that can be used for multiple subsystems including MCA, EDAC, and CXL. Include support for UMC normalized to system physical address translation for current CPU systems. The Data Fabric Indirect register access offsets and one of the register fields were changed. Default to the current offsets and register field definition. And fallback to the older values if running on a "legacy" system. Provide built-in code to facilitate the loading and unloading of the library module without affecting other modules or built-in code. Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20240123041401.79812-2-yazen.ghannam@amd.com --- include/linux/ras.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include') diff --git a/include/linux/ras.h b/include/linux/ras.h index 1f4048bf2674..09c632832bf1 100644 --- a/include/linux/ras.h +++ b/include/linux/ras.h @@ -25,6 +25,7 @@ void log_non_standard_event(const guid_t *sec_type, const guid_t *fru_id, const char *fru_text, const u8 sev, const u8 *err, const u32 len); void log_arm_hw_error(struct cper_sec_proc_arm *err); + #else static inline void log_non_standard_event(const guid_t *sec_type, @@ -35,4 +36,19 @@ static inline void log_arm_hw_error(struct cper_sec_proc_arm *err) { return; } #endif +struct atl_err { + u64 addr; + u64 ipid; + u32 cpu; +}; + +#if IS_ENABLED(CONFIG_AMD_ATL) +void amd_atl_register_decoder(unsigned long (*f)(struct atl_err *)); +void amd_atl_unregister_decoder(void); +unsigned long amd_convert_umc_mca_addr_to_sys_addr(struct atl_err *err); +#else +static inline unsigned long +amd_convert_umc_mca_addr_to_sys_addr(struct atl_err *err) { return -EINVAL; } +#endif /* CONFIG_AMD_ATL */ + #endif /* __RAS_H__ */ -- cgit v1.2.3 From 3b566b30b41401888ee0e8eb904a1e7a6693794b Mon Sep 17 00:00:00 2001 From: Yazen Ghannam Date: Tue, 13 Feb 2024 21:35:15 -0600 Subject: RAS/AMD/ATL: Add MI300 row retirement support DRAM row retirement depends on model-specific information that is best done within the AMD Address Translation Library. Export a generic wrapper function for other modules to use. Add any model-specific helpers here. Signed-off-by: Yazen Ghannam Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/20240214033516.1344948-2-yazen.ghannam@amd.com --- drivers/ras/amd/atl/Kconfig | 1 + drivers/ras/amd/atl/umc.c | 51 +++++++++++++++++++++++++++++++++++++++++++++ include/linux/ras.h | 2 ++ 3 files changed, 54 insertions(+) (limited to 'include') diff --git a/drivers/ras/amd/atl/Kconfig b/drivers/ras/amd/atl/Kconfig index a43513a700f1..df49c23e7f62 100644 --- a/drivers/ras/amd/atl/Kconfig +++ b/drivers/ras/amd/atl/Kconfig @@ -10,6 +10,7 @@ config AMD_ATL tristate "AMD Address Translation Library" depends on AMD_NB && X86_64 && RAS + depends on MEMORY_FAILURE default N help This library includes support for implementation-specific diff --git a/drivers/ras/amd/atl/umc.c b/drivers/ras/amd/atl/umc.c index 7e310d1dfcfc..08c6dbd44c62 100644 --- a/drivers/ras/amd/atl/umc.c +++ b/drivers/ras/amd/atl/umc.c @@ -239,6 +239,57 @@ static unsigned long convert_dram_to_norm_addr_mi300(unsigned long addr) return addr; } +/* + * When a DRAM ECC error occurs on MI300 systems, it is recommended to retire + * all memory within that DRAM row. This applies to the memory with a DRAM + * bank. + * + * To find the memory addresses, loop through permutations of the DRAM column + * bits and find the System Physical address of each. The column bits are used + * to calculate the intermediate Normalized address, so all permutations should + * be checked. + * + * See amd_atl::convert_dram_to_norm_addr_mi300() for MI300 address formats. + */ +#define MI300_NUM_COL BIT(HWEIGHT(MI300_UMC_MCA_COL)) +static void retire_row_mi300(struct atl_err *a_err) +{ + unsigned long addr; + struct page *p; + u8 col; + + for (col = 0; col < MI300_NUM_COL; col++) { + a_err->addr &= ~MI300_UMC_MCA_COL; + a_err->addr |= FIELD_PREP(MI300_UMC_MCA_COL, col); + + addr = amd_convert_umc_mca_addr_to_sys_addr(a_err); + if (IS_ERR_VALUE(addr)) + continue; + + addr = PHYS_PFN(addr); + + /* + * Skip invalid or already poisoned pages to avoid unnecessary + * error messages from memory_failure(). + */ + p = pfn_to_online_page(addr); + if (!p) + continue; + + if (PageHWPoison(p)) + continue; + + memory_failure(addr, 0); + } +} + +void amd_retire_dram_row(struct atl_err *a_err) +{ + if (df_cfg.rev == DF4p5 && df_cfg.flags.heterogeneous) + return retire_row_mi300(a_err); +} +EXPORT_SYMBOL_GPL(amd_retire_dram_row); + static unsigned long get_addr(unsigned long addr) { if (df_cfg.rev == DF4p5 && df_cfg.flags.heterogeneous) diff --git a/include/linux/ras.h b/include/linux/ras.h index 09c632832bf1..a64182bc72ad 100644 --- a/include/linux/ras.h +++ b/include/linux/ras.h @@ -45,8 +45,10 @@ struct atl_err { #if IS_ENABLED(CONFIG_AMD_ATL) void amd_atl_register_decoder(unsigned long (*f)(struct atl_err *)); void amd_atl_unregister_decoder(void); +void amd_retire_dram_row(struct atl_err *err); unsigned long amd_convert_umc_mca_addr_to_sys_addr(struct atl_err *err); #else +static inline void amd_retire_dram_row(struct atl_err *err) { } static inline unsigned long amd_convert_umc_mca_addr_to_sys_addr(struct atl_err *err) { return -EINVAL; } #endif /* CONFIG_AMD_ATL */ -- cgit v1.2.3