Diffstat (limited to 'drivers/edac')
-rw-r--r-- | drivers/edac/Kconfig | 16
-rw-r--r-- | drivers/edac/Makefile | 2
-rw-r--r-- | drivers/edac/a72_edac.c | 225
-rw-r--r-- | drivers/edac/altera_edac.c | 5
-rw-r--r-- | drivers/edac/amd64_edac.c | 20
-rw-r--r-- | drivers/edac/amd64_edac.h | 2
-rw-r--r--[-rwxr-xr-x] | drivers/edac/ecs.c | 0
-rw-r--r-- | drivers/edac/edac_mc_sysfs.c | 24
-rw-r--r-- | drivers/edac/i10nm_base.c | 57
-rw-r--r-- | drivers/edac/ie31200_edac.c | 42
-rw-r--r-- | drivers/edac/igen6_edac.c | 17
-rw-r--r--[-rwxr-xr-x] | drivers/edac/mem_repair.c | 56
-rw-r--r--[-rwxr-xr-x] | drivers/edac/scrub.c | 0
-rw-r--r-- | drivers/edac/skx_base.c | 33
-rw-r--r-- | drivers/edac/skx_common.c | 58
-rw-r--r-- | drivers/edac/skx_common.h | 28
-rw-r--r-- | drivers/edac/synopsys_edac.c | 97
-rw-r--r-- | drivers/edac/versalnet_edac.c | 960
18 files changed, 1482 insertions, 160 deletions
diff --git a/drivers/edac/Kconfig b/drivers/edac/Kconfig index 19ad3c3b675d..39352b9b7a7e 100644 --- a/drivers/edac/Kconfig +++ b/drivers/edac/Kconfig @@ -576,4 +576,20 @@ config EDAC_LOONGSON errors (CE) only. Loongson-3A5000/3C5000/3D5000/3A6000/3C6000 are compatible. +config EDAC_CORTEX_A72 + tristate "ARM Cortex A72" + depends on ARM64 + help + Support for L1/L2 cache error detection for ARM Cortex A72 processor. + The detected and reported errors are from reading CPU/L2 memory error + syndrome registers. + +config EDAC_VERSALNET + tristate "AMD VersalNET DDR Controller" + depends on CDX_CONTROLLER && ARCH_ZYNQMP + help + Support for single bit error correction, double bit error detection + and other system errors from various IP subsystems like RPU, NOCs, + HNICX, PL on the AMD Versal NET DDR memory controller. + endif # EDAC diff --git a/drivers/edac/Makefile b/drivers/edac/Makefile index a8f2d8f6c894..1c14796410a3 100644 --- a/drivers/edac/Makefile +++ b/drivers/edac/Makefile @@ -88,3 +88,5 @@ obj-$(CONFIG_EDAC_NPCM) += npcm_edac.o obj-$(CONFIG_EDAC_ZYNQMP) += zynqmp_edac.o obj-$(CONFIG_EDAC_VERSAL) += versal_edac.o obj-$(CONFIG_EDAC_LOONGSON) += loongson_edac.o +obj-$(CONFIG_EDAC_VERSALNET) += versalnet_edac.o +obj-$(CONFIG_EDAC_CORTEX_A72) += a72_edac.o diff --git a/drivers/edac/a72_edac.c b/drivers/edac/a72_edac.c new file mode 100644 index 000000000000..9262d75c3855 --- /dev/null +++ b/drivers/edac/a72_edac.c @@ -0,0 +1,225 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Cortex A72 EDAC L1 and L2 cache error detection + * + * Copyright (c) 2020 Pengutronix, Sascha Hauer <s.hauer@pengutronix.de> + * Copyright (c) 2025 Microsoft Corporation, <vijayb@linux.microsoft.com> + * + * Based on Code from: + * Copyright (c) 2018, NXP Semiconductor + * Author: York Sun <york.sun@nxp.com> + */ + +#include <linux/module.h> +#include <linux/of.h> +#include <linux/bitfield.h> +#include <asm/smp_plat.h> + +#include "edac_module.h" + +#define DRVNAME "a72-edac" + +#define SYS_CPUMERRSR_EL1 sys_reg(3, 1, 15, 2, 2) +#define SYS_L2MERRSR_EL1 sys_reg(3, 1, 15, 2, 3) + +#define CPUMERRSR_EL1_RAMID GENMASK(30, 24) +#define L2MERRSR_EL1_CPUID_WAY GENMASK(21, 18) + +#define CPUMERRSR_EL1_VALID BIT(31) +#define CPUMERRSR_EL1_FATAL BIT(63) +#define L2MERRSR_EL1_VALID BIT(31) +#define L2MERRSR_EL1_FATAL BIT(63) + +#define L1_I_TAG_RAM 0x00 +#define L1_I_DATA_RAM 0x01 +#define L1_D_TAG_RAM 0x08 +#define L1_D_DATA_RAM 0x09 +#define TLB_RAM 0x18 + +#define MESSAGE_SIZE 64 + +struct mem_err_synd_reg { + u64 cpu_mesr; + u64 l2_mesr; +}; + +static struct cpumask compat_mask; + +static void report_errors(struct edac_device_ctl_info *edac_ctl, int cpu, + struct mem_err_synd_reg *mesr) +{ + u64 cpu_mesr = mesr->cpu_mesr; + u64 l2_mesr = mesr->l2_mesr; + char msg[MESSAGE_SIZE]; + + if (cpu_mesr & CPUMERRSR_EL1_VALID) { + const char *str; + bool fatal = cpu_mesr & CPUMERRSR_EL1_FATAL; + + switch (FIELD_GET(CPUMERRSR_EL1_RAMID, cpu_mesr)) { + case L1_I_TAG_RAM: + str = "L1-I Tag RAM"; + break; + case L1_I_DATA_RAM: + str = "L1-I Data RAM"; + break; + case L1_D_TAG_RAM: + str = "L1-D Tag RAM"; + break; + case L1_D_DATA_RAM: + str = "L1-D Data RAM"; + break; + case TLB_RAM: + str = "TLB RAM"; + break; + default: + str = "Unspecified"; + break; + } + + snprintf(msg, MESSAGE_SIZE, "%s %s error(s) on CPU %d", + str, fatal ? 
"fatal" : "correctable", cpu); + + if (fatal) + edac_device_handle_ue(edac_ctl, cpu, 0, msg); + else + edac_device_handle_ce(edac_ctl, cpu, 0, msg); + } + + if (l2_mesr & L2MERRSR_EL1_VALID) { + bool fatal = l2_mesr & L2MERRSR_EL1_FATAL; + + snprintf(msg, MESSAGE_SIZE, "L2 %s error(s) on CPU %d CPUID/WAY 0x%lx", + fatal ? "fatal" : "correctable", cpu, + FIELD_GET(L2MERRSR_EL1_CPUID_WAY, l2_mesr)); + if (fatal) + edac_device_handle_ue(edac_ctl, cpu, 1, msg); + else + edac_device_handle_ce(edac_ctl, cpu, 1, msg); + } +} + +static void read_errors(void *data) +{ + struct mem_err_synd_reg *mesr = data; + + mesr->cpu_mesr = read_sysreg_s(SYS_CPUMERRSR_EL1); + if (mesr->cpu_mesr & CPUMERRSR_EL1_VALID) { + write_sysreg_s(0, SYS_CPUMERRSR_EL1); + isb(); + } + mesr->l2_mesr = read_sysreg_s(SYS_L2MERRSR_EL1); + if (mesr->l2_mesr & L2MERRSR_EL1_VALID) { + write_sysreg_s(0, SYS_L2MERRSR_EL1); + isb(); + } +} + +static void a72_edac_check(struct edac_device_ctl_info *edac_ctl) +{ + struct mem_err_synd_reg mesr; + int cpu; + + cpus_read_lock(); + for_each_cpu_and(cpu, cpu_online_mask, &compat_mask) { + smp_call_function_single(cpu, read_errors, &mesr, true); + report_errors(edac_ctl, cpu, &mesr); + } + cpus_read_unlock(); +} + +static int a72_edac_probe(struct platform_device *pdev) +{ + struct edac_device_ctl_info *edac_ctl; + struct device *dev = &pdev->dev; + int rc; + + edac_ctl = edac_device_alloc_ctl_info(0, "cpu", + num_possible_cpus(), "L", 2, 1, + edac_device_alloc_index()); + if (!edac_ctl) + return -ENOMEM; + + edac_ctl->edac_check = a72_edac_check; + edac_ctl->dev = dev; + edac_ctl->mod_name = dev_name(dev); + edac_ctl->dev_name = dev_name(dev); + edac_ctl->ctl_name = DRVNAME; + dev_set_drvdata(dev, edac_ctl); + + rc = edac_device_add_device(edac_ctl); + if (rc) + goto out_dev; + + return 0; + +out_dev: + edac_device_free_ctl_info(edac_ctl); + + return rc; +} + +static void a72_edac_remove(struct platform_device *pdev) +{ + struct edac_device_ctl_info *edac_ctl = dev_get_drvdata(&pdev->dev); + + edac_device_del_device(edac_ctl->dev); + edac_device_free_ctl_info(edac_ctl); +} + +static const struct of_device_id cortex_arm64_edac_of_match[] = { + { .compatible = "arm,cortex-a72" }, + {} +}; +MODULE_DEVICE_TABLE(of, cortex_arm64_edac_of_match); + +static struct platform_driver a72_edac_driver = { + .probe = a72_edac_probe, + .remove = a72_edac_remove, + .driver = { + .name = DRVNAME, + }, +}; + +static struct platform_device *a72_pdev; + +static int __init a72_edac_driver_init(void) +{ + int cpu; + + for_each_possible_cpu(cpu) { + struct device_node *np __free(device_node) = of_cpu_device_node_get(cpu); + if (np) { + if (of_match_node(cortex_arm64_edac_of_match, np) && + of_property_read_bool(np, "edac-enabled")) { + cpumask_set_cpu(cpu, &compat_mask); + } + } else { + pr_warn("failed to find device node for CPU %d\n", cpu); + } + } + + if (cpumask_empty(&compat_mask)) + return 0; + + a72_pdev = platform_device_register_simple(DRVNAME, -1, NULL, 0); + if (IS_ERR(a72_pdev)) { + pr_err("failed to register A72 EDAC device\n"); + return PTR_ERR(a72_pdev); + } + + return platform_driver_register(&a72_edac_driver); +} + +static void __exit a72_edac_driver_exit(void) +{ + platform_device_unregister(a72_pdev); + platform_driver_unregister(&a72_edac_driver); +} + +module_init(a72_edac_driver_init); +module_exit(a72_edac_driver_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Sascha Hauer <s.hauer@pengutronix.de>"); +MODULE_DESCRIPTION("Cortex A72 L1 and L2 cache EDAC driver"); diff --git 
a/drivers/edac/altera_edac.c b/drivers/edac/altera_edac.c index cae52c654a15..103b2c2eba2a 100644 --- a/drivers/edac/altera_edac.c +++ b/drivers/edac/altera_edac.c @@ -128,7 +128,6 @@ static ssize_t altr_sdr_mc_err_inject_write(struct file *file, ptemp = dma_alloc_coherent(mci->pdev, 16, &dma_handle, GFP_KERNEL); if (!ptemp) { - dma_free_coherent(mci->pdev, 16, ptemp, dma_handle); edac_printk(KERN_ERR, EDAC_MC, "Inject: Buffer Allocation error\n"); return -ENOMEM; @@ -2131,8 +2130,8 @@ static int altr_edac_a10_probe(struct platform_device *pdev) edac->irq_chip.name = pdev->dev.of_node->name; edac->irq_chip.irq_mask = a10_eccmgr_irq_mask; edac->irq_chip.irq_unmask = a10_eccmgr_irq_unmask; - edac->domain = irq_domain_create_linear(of_fwnode_handle(pdev->dev.of_node), - 64, &a10_eccmgr_ic_ops, edac); + edac->domain = irq_domain_create_linear(dev_fwnode(&pdev->dev), 64, &a10_eccmgr_ic_ops, + edac); if (!edac->domain) { dev_err(&pdev->dev, "Error adding IRQ domain\n"); return -ENOMEM; diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index 07f1e9dc1ca7..2f6ab783bf20 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -3923,6 +3923,26 @@ static int per_family_init(struct amd64_pvt *pvt) pvt->ctl_name = "F1Ah_M40h"; pvt->flags.zn_regs_v2 = 1; break; + case 0x50 ... 0x57: + pvt->ctl_name = "F1Ah_M50h"; + pvt->max_mcs = 16; + pvt->flags.zn_regs_v2 = 1; + break; + case 0x90 ... 0x9f: + pvt->ctl_name = "F1Ah_M90h"; + pvt->max_mcs = 8; + pvt->flags.zn_regs_v2 = 1; + break; + case 0xa0 ... 0xaf: + pvt->ctl_name = "F1Ah_MA0h"; + pvt->max_mcs = 8; + pvt->flags.zn_regs_v2 = 1; + break; + case 0xc0 ... 0xc7: + pvt->ctl_name = "F1Ah_MC0h"; + pvt->max_mcs = 16; + pvt->flags.zn_regs_v2 = 1; + break; } break; diff --git a/drivers/edac/amd64_edac.h b/drivers/edac/amd64_edac.h index 17228d07de4c..d70b8a8d0b09 100644 --- a/drivers/edac/amd64_edac.h +++ b/drivers/edac/amd64_edac.h @@ -96,7 +96,7 @@ /* Hardware limit on ChipSelect rows per MC and processors per system */ #define NUM_CHIPSELECTS 8 #define DRAM_RANGES 8 -#define NUM_CONTROLLERS 12 +#define NUM_CONTROLLERS 16 #define ON true #define OFF false diff --git a/drivers/edac/ecs.c b/drivers/edac/ecs.c index 51c451c7f0f0..51c451c7f0f0 100755..100644 --- a/drivers/edac/ecs.c +++ b/drivers/edac/ecs.c diff --git a/drivers/edac/edac_mc_sysfs.c b/drivers/edac/edac_mc_sysfs.c index 0f338adf7d93..8689631f1905 100644 --- a/drivers/edac/edac_mc_sysfs.c +++ b/drivers/edac/edac_mc_sysfs.c @@ -305,6 +305,14 @@ DEVICE_CHANNEL(ch10_dimm_label, S_IRUGO | S_IWUSR, channel_dimm_label_show, channel_dimm_label_store, 10); DEVICE_CHANNEL(ch11_dimm_label, S_IRUGO | S_IWUSR, channel_dimm_label_show, channel_dimm_label_store, 11); +DEVICE_CHANNEL(ch12_dimm_label, S_IRUGO | S_IWUSR, + channel_dimm_label_show, channel_dimm_label_store, 12); +DEVICE_CHANNEL(ch13_dimm_label, S_IRUGO | S_IWUSR, + channel_dimm_label_show, channel_dimm_label_store, 13); +DEVICE_CHANNEL(ch14_dimm_label, S_IRUGO | S_IWUSR, + channel_dimm_label_show, channel_dimm_label_store, 14); +DEVICE_CHANNEL(ch15_dimm_label, S_IRUGO | S_IWUSR, + channel_dimm_label_show, channel_dimm_label_store, 15); /* Total possible dynamic DIMM Label attribute file table */ static struct attribute *dynamic_csrow_dimm_attr[] = { @@ -320,6 +328,10 @@ static struct attribute *dynamic_csrow_dimm_attr[] = { &dev_attr_legacy_ch9_dimm_label.attr.attr, &dev_attr_legacy_ch10_dimm_label.attr.attr, &dev_attr_legacy_ch11_dimm_label.attr.attr, + &dev_attr_legacy_ch12_dimm_label.attr.attr, + 
&dev_attr_legacy_ch13_dimm_label.attr.attr, + &dev_attr_legacy_ch14_dimm_label.attr.attr, + &dev_attr_legacy_ch15_dimm_label.attr.attr, NULL }; @@ -348,6 +360,14 @@ DEVICE_CHANNEL(ch10_ce_count, S_IRUGO, channel_ce_count_show, NULL, 10); DEVICE_CHANNEL(ch11_ce_count, S_IRUGO, channel_ce_count_show, NULL, 11); +DEVICE_CHANNEL(ch12_ce_count, S_IRUGO, + channel_ce_count_show, NULL, 12); +DEVICE_CHANNEL(ch13_ce_count, S_IRUGO, + channel_ce_count_show, NULL, 13); +DEVICE_CHANNEL(ch14_ce_count, S_IRUGO, + channel_ce_count_show, NULL, 14); +DEVICE_CHANNEL(ch15_ce_count, S_IRUGO, + channel_ce_count_show, NULL, 15); /* Total possible dynamic ce_count attribute file table */ static struct attribute *dynamic_csrow_ce_count_attr[] = { @@ -363,6 +383,10 @@ static struct attribute *dynamic_csrow_ce_count_attr[] = { &dev_attr_legacy_ch9_ce_count.attr.attr, &dev_attr_legacy_ch10_ce_count.attr.attr, &dev_attr_legacy_ch11_ce_count.attr.attr, + &dev_attr_legacy_ch12_ce_count.attr.attr, + &dev_attr_legacy_ch13_ce_count.attr.attr, + &dev_attr_legacy_ch14_ce_count.attr.attr, + &dev_attr_legacy_ch15_ce_count.attr.attr, NULL }; diff --git a/drivers/edac/i10nm_base.c b/drivers/edac/i10nm_base.c index a3fca2567752..2010a47149f4 100644 --- a/drivers/edac/i10nm_base.c +++ b/drivers/edac/i10nm_base.c @@ -62,6 +62,7 @@ ((GET_BITFIELD(reg, 0, 10) << 12) + 0x140000) #define I10NM_GNR_IMC_MMIO_OFFSET 0x24c000 +#define I10NM_GNR_D_IMC_MMIO_OFFSET 0x206000 #define I10NM_GNR_IMC_MMIO_SIZE 0x4000 #define I10NM_HBM_IMC_MMIO_SIZE 0x9000 #define I10NM_DDR_IMC_CH_CNT(reg) GET_BITFIELD(reg, 21, 24) @@ -343,7 +344,7 @@ static void show_retry_rd_err_log(struct decoded_addr *res, char *msg, status_mask = rrl->over_mask | rrl->uc_mask | rrl->v_mask; - n = snprintf(msg, len, " retry_rd_err_log["); + n = scnprintf(msg, len, " retry_rd_err_log["); for (i = 0; i < rrl->set_num; i++) { scrub = (rrl->modes[i] == FRE_SCRUB || rrl->modes[i] == LRE_SCRUB); if (scrub_err != scrub) @@ -355,9 +356,9 @@ static void show_retry_rd_err_log(struct decoded_addr *res, char *msg, log = read_imc_reg(imc, ch, offset, width); if (width == 4) - n += snprintf(msg + n, len - n, "%.8llx ", log); + n += scnprintf(msg + n, len - n, "%.8llx ", log); else - n += snprintf(msg + n, len - n, "%.16llx ", log); + n += scnprintf(msg + n, len - n, "%.16llx ", log); /* Clear RRL status if RRL in Linux control mode. */ if (retry_rd_err_log == 2 && !j && (log & status_mask)) @@ -367,10 +368,10 @@ static void show_retry_rd_err_log(struct decoded_addr *res, char *msg, /* Move back one space. */ n--; - n += snprintf(msg + n, len - n, "]"); + n += scnprintf(msg + n, len - n, "]"); if (len - n > 0) { - n += snprintf(msg + n, len - n, " correrrcnt["); + n += scnprintf(msg + n, len - n, " correrrcnt["); for (i = 0; i < rrl->cecnt_num && len - n > 0; i++) { offset = rrl->cecnt_offsets[i]; width = rrl->cecnt_widths[i]; @@ -378,20 +379,20 @@ static void show_retry_rd_err_log(struct decoded_addr *res, char *msg, /* CPUs {ICX,SPR} encode two counters per 4-byte CORRERRCNT register. */ if (res_cfg->type <= SPR) { - n += snprintf(msg + n, len - n, "%.4llx %.4llx ", + n += scnprintf(msg + n, len - n, "%.4llx %.4llx ", corr & 0xffff, corr >> 16); } else { /* CPUs {GNR} encode one counter per CORRERRCNT register. */ if (width == 4) - n += snprintf(msg + n, len - n, "%.8llx ", corr); + n += scnprintf(msg + n, len - n, "%.8llx ", corr); else - n += snprintf(msg + n, len - n, "%.16llx ", corr); + n += scnprintf(msg + n, len - n, "%.16llx ", corr); } } /* Move back one space. 
*/ n--; - n += snprintf(msg + n, len - n, "]"); + n += scnprintf(msg + n, len - n, "]"); } } @@ -467,17 +468,18 @@ static int i10nm_get_imc_num(struct res_config *cfg) return -ENODEV; } - if (imc_num > I10NM_NUM_DDR_IMC) { - i10nm_printk(KERN_ERR, "Need to make I10NM_NUM_DDR_IMC >= %d\n", imc_num); - return -EINVAL; - } - if (cfg->ddr_imc_num != imc_num) { /* - * Store the number of present DDR memory controllers. + * Update the configuration data to reflect the number of + * present DDR memory controllers. */ cfg->ddr_imc_num = imc_num; edac_dbg(2, "Set DDR MC number: %d", imc_num); + + /* Release and reallocate skx_dev list with the updated number. */ + skx_remove(); + if (skx_get_all_bus_mappings(cfg, &i10nm_edac_list) <= 0) + return -ENODEV; } return 0; @@ -687,6 +689,14 @@ static struct pci_dev *get_gnr_mdev(struct skx_dev *d, int logical_idx, int *phy return NULL; } +static u32 get_gnr_imc_mmio_offset(void) +{ + if (boot_cpu_data.x86_vfm == INTEL_GRANITERAPIDS_D) + return I10NM_GNR_D_IMC_MMIO_OFFSET; + + return I10NM_GNR_IMC_MMIO_OFFSET; +} + /** * get_ddr_munit() - Get the resource of the i-th DDR memory controller. * @@ -715,7 +725,7 @@ static struct pci_dev *get_ddr_munit(struct skx_dev *d, int i, u32 *offset, unsi return NULL; *offset = I10NM_GET_IMC_MMIO_OFFSET(reg) + - I10NM_GNR_IMC_MMIO_OFFSET + + get_gnr_imc_mmio_offset() + physical_idx * I10NM_GNR_IMC_MMIO_SIZE; *size = I10NM_GNR_IMC_MMIO_SIZE; @@ -1030,6 +1040,7 @@ static const struct x86_cpu_id i10nm_cpuids[] = { X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, &spr_cfg), X86_MATCH_VFM(INTEL_EMERALDRAPIDS_X, &spr_cfg), X86_MATCH_VFM(INTEL_GRANITERAPIDS_X, &gnr_cfg), + X86_MATCH_VFM(INTEL_GRANITERAPIDS_D, &gnr_cfg), X86_MATCH_VFM(INTEL_ATOM_CRESTMONT_X, &gnr_cfg), X86_MATCH_VFM(INTEL_ATOM_CRESTMONT, &gnr_cfg), X86_MATCH_VFM(INTEL_ATOM_DARKMONT_X, &gnr_cfg), @@ -1047,6 +1058,15 @@ static bool i10nm_check_ecc(struct skx_imc *imc, int chan) return !!GET_BITFIELD(mcmtr, 2, 2); } +static bool i10nm_channel_disabled(struct skx_imc *imc, int chan) +{ + u32 mcmtr = I10NM_GET_MCMTR(imc, chan); + + edac_dbg(1, "mc%d ch%d mcmtr reg %x\n", imc->mc, chan, mcmtr); + + return (mcmtr == ~0 || GET_BITFIELD(mcmtr, 18, 18)); +} + static int i10nm_get_dimm_config(struct mem_ctl_info *mci, struct res_config *cfg) { @@ -1060,6 +1080,11 @@ static int i10nm_get_dimm_config(struct mem_ctl_info *mci, if (!imc->mbase) continue; + if (i10nm_channel_disabled(imc, i)) { + edac_dbg(1, "mc%d ch%d is disabled.\n", imc->mc, i); + continue; + } + ndimms = 0; if (res_cfg->type != GNR) diff --git a/drivers/edac/ie31200_edac.c b/drivers/edac/ie31200_edac.c index a53612be4b2f..5a080ab65476 100644 --- a/drivers/edac/ie31200_edac.c +++ b/drivers/edac/ie31200_edac.c @@ -87,13 +87,32 @@ #define PCI_DEVICE_ID_INTEL_IE31200_HB_CFL_10 0x3eca /* Raptor Lake-S */ -#define PCI_DEVICE_ID_INTEL_IE31200_RPL_S_1 0xa703 -#define PCI_DEVICE_ID_INTEL_IE31200_RPL_S_2 0x4640 -#define PCI_DEVICE_ID_INTEL_IE31200_RPL_S_3 0x4630 -#define PCI_DEVICE_ID_INTEL_IE31200_RPL_S_4 0xa700 +#define PCI_DEVICE_ID_INTEL_IE31200_RPL_S_1 0xa703 /* 8P+8E, e.g. i7-13700 */ +#define PCI_DEVICE_ID_INTEL_IE31200_RPL_S_2 0x4640 /* 6P+8E, e.g. i5-13500, i5-13600, i5-14500 */ +#define PCI_DEVICE_ID_INTEL_IE31200_RPL_S_3 0x4630 /* 4P+0E, e.g. i3-13100E */ +#define PCI_DEVICE_ID_INTEL_IE31200_RPL_S_4 0xa700 /* 8P+16E, e.g. i9-13900, i9-14900 */ +#define PCI_DEVICE_ID_INTEL_IE31200_RPL_S_5 0xa740 /* 8P+12E, e.g. i7-14700 */ +#define PCI_DEVICE_ID_INTEL_IE31200_RPL_S_6 0xa704 /* 6P+8E, e.g. 
i5-14600 */ + +/* Raptor Lake-HX */ +#define PCI_DEVICE_ID_INTEL_IE31200_RPL_HX_1 0xa702 /* 8P+16E, e.g. i9-13950HX */ /* Alder Lake-S */ #define PCI_DEVICE_ID_INTEL_IE31200_ADL_S_1 0x4660 +#define PCI_DEVICE_ID_INTEL_IE31200_ADL_S_2 0x4668 /* 8P+4E, e.g. i7-12700K */ +#define PCI_DEVICE_ID_INTEL_IE31200_ADL_S_3 0x4648 /* 6P+4E, e.g. i5-12600K */ + +/* Bartlett Lake-S */ +#define PCI_DEVICE_ID_INTEL_IE31200_BTL_S_1 0x4639 +#define PCI_DEVICE_ID_INTEL_IE31200_BTL_S_2 0x463c +#define PCI_DEVICE_ID_INTEL_IE31200_BTL_S_3 0x4642 +#define PCI_DEVICE_ID_INTEL_IE31200_BTL_S_4 0x4643 +#define PCI_DEVICE_ID_INTEL_IE31200_BTL_S_5 0xa731 +#define PCI_DEVICE_ID_INTEL_IE31200_BTL_S_6 0xa732 +#define PCI_DEVICE_ID_INTEL_IE31200_BTL_S_7 0xa733 +#define PCI_DEVICE_ID_INTEL_IE31200_BTL_S_8 0xa741 +#define PCI_DEVICE_ID_INTEL_IE31200_BTL_S_9 0xa744 +#define PCI_DEVICE_ID_INTEL_IE31200_BTL_S_10 0xa745 #define IE31200_RANKS_PER_CHANNEL 8 #define IE31200_DIMMS_PER_CHANNEL 2 @@ -740,7 +759,22 @@ static const struct pci_device_id ie31200_pci_tbl[] = { { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IE31200_RPL_S_2), (kernel_ulong_t)&rpl_s_cfg}, { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IE31200_RPL_S_3), (kernel_ulong_t)&rpl_s_cfg}, { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IE31200_RPL_S_4), (kernel_ulong_t)&rpl_s_cfg}, + { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IE31200_RPL_S_5), (kernel_ulong_t)&rpl_s_cfg}, + { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IE31200_RPL_S_6), (kernel_ulong_t)&rpl_s_cfg}, + { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IE31200_RPL_HX_1), (kernel_ulong_t)&rpl_s_cfg}, { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IE31200_ADL_S_1), (kernel_ulong_t)&rpl_s_cfg}, + { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IE31200_ADL_S_2), (kernel_ulong_t)&rpl_s_cfg}, + { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IE31200_ADL_S_3), (kernel_ulong_t)&rpl_s_cfg}, + { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IE31200_BTL_S_1), (kernel_ulong_t)&rpl_s_cfg}, + { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IE31200_BTL_S_2), (kernel_ulong_t)&rpl_s_cfg}, + { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IE31200_BTL_S_3), (kernel_ulong_t)&rpl_s_cfg}, + { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IE31200_BTL_S_4), (kernel_ulong_t)&rpl_s_cfg}, + { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IE31200_BTL_S_5), (kernel_ulong_t)&rpl_s_cfg}, + { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IE31200_BTL_S_6), (kernel_ulong_t)&rpl_s_cfg}, + { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IE31200_BTL_S_7), (kernel_ulong_t)&rpl_s_cfg}, + { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IE31200_BTL_S_8), (kernel_ulong_t)&rpl_s_cfg}, + { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IE31200_BTL_S_9), (kernel_ulong_t)&rpl_s_cfg}, + { PCI_VDEVICE(INTEL, PCI_DEVICE_ID_INTEL_IE31200_BTL_S_10), (kernel_ulong_t)&rpl_s_cfg}, { 0, } /* 0 terminated list. 
*/ }; MODULE_DEVICE_TABLE(pci, ie31200_pci_tbl); diff --git a/drivers/edac/igen6_edac.c b/drivers/edac/igen6_edac.c index 1cb5c67e78ae..2fc59f9eed69 100644 --- a/drivers/edac/igen6_edac.c +++ b/drivers/edac/igen6_edac.c @@ -275,6 +275,9 @@ static struct work_struct ecclog_work; #define DID_PTL_H_SKU2 0xb001 #define DID_PTL_H_SKU3 0xb002 +/* Compute die IDs for Wildcat Lake with IBECC */ +#define DID_WCL_SKU1 0xfd00 + static int get_mchbar(struct pci_dev *pdev, u64 *mchbar) { union { @@ -569,6 +572,17 @@ static struct res_config mtl_p_cfg = { .err_addr_to_imc_addr = adl_err_addr_to_imc_addr, }; +static struct res_config wcl_cfg = { + .machine_check = true, + .num_imc = 1, + .imc_base = 0xd800, + .ibecc_base = 0xd400, + .ibecc_error_log_offset = 0x170, + .ibecc_available = mtl_p_ibecc_available, + .err_addr_to_sys_addr = adl_err_addr_to_sys_addr, + .err_addr_to_imc_addr = adl_err_addr_to_imc_addr, +}; + static struct pci_device_id igen6_pci_tbl[] = { { PCI_VDEVICE(INTEL, DID_EHL_SKU5), (kernel_ulong_t)&ehl_cfg }, { PCI_VDEVICE(INTEL, DID_EHL_SKU6), (kernel_ulong_t)&ehl_cfg }, @@ -622,6 +636,7 @@ static struct pci_device_id igen6_pci_tbl[] = { { PCI_VDEVICE(INTEL, DID_PTL_H_SKU1), (kernel_ulong_t)&mtl_p_cfg }, { PCI_VDEVICE(INTEL, DID_PTL_H_SKU2), (kernel_ulong_t)&mtl_p_cfg }, { PCI_VDEVICE(INTEL, DID_PTL_H_SKU3), (kernel_ulong_t)&mtl_p_cfg }, + { PCI_VDEVICE(INTEL, DID_WCL_SKU1), (kernel_ulong_t)&wcl_cfg }, { }, }; MODULE_DEVICE_TABLE(pci, igen6_pci_tbl); @@ -1351,7 +1366,7 @@ static int igen6_register_mcis(struct pci_dev *pdev, u64 mchbar) } if (lmc < res_cfg->num_imc) { - igen6_printk(KERN_WARNING, "Expected %d mcs, but only %d detected.", + igen6_printk(KERN_DEBUG, "Expected %d mcs, but only %d detected.", res_cfg->num_imc, lmc); res_cfg->num_imc = lmc; } diff --git a/drivers/edac/mem_repair.c b/drivers/edac/mem_repair.c index 70a033a76233..108d69209146 100755..100644 --- a/drivers/edac/mem_repair.c +++ b/drivers/edac/mem_repair.c @@ -286,17 +286,26 @@ static umode_t mem_repair_attr_visible(struct kobject *kobj, struct attribute *a return 0; } -#define MR_ATTR_RO(_name, _instance) \ - ((struct edac_mem_repair_dev_attr) { .dev_attr = __ATTR_RO(_name), \ - .instance = _instance }) - -#define MR_ATTR_WO(_name, _instance) \ - ((struct edac_mem_repair_dev_attr) { .dev_attr = __ATTR_WO(_name), \ - .instance = _instance }) - -#define MR_ATTR_RW(_name, _instance) \ - ((struct edac_mem_repair_dev_attr) { .dev_attr = __ATTR_RW(_name), \ - .instance = _instance }) +static const struct device_attribute mem_repair_dev_attr[] = { + [MR_TYPE] = __ATTR_RO(repair_type), + [MR_PERSIST_MODE] = __ATTR_RW(persist_mode), + [MR_SAFE_IN_USE] = __ATTR_RO(repair_safe_when_in_use), + [MR_HPA] = __ATTR_RW(hpa), + [MR_MIN_HPA] = __ATTR_RO(min_hpa), + [MR_MAX_HPA] = __ATTR_RO(max_hpa), + [MR_DPA] = __ATTR_RW(dpa), + [MR_MIN_DPA] = __ATTR_RO(min_dpa), + [MR_MAX_DPA] = __ATTR_RO(max_dpa), + [MR_NIBBLE_MASK] = __ATTR_RW(nibble_mask), + [MR_BANK_GROUP] = __ATTR_RW(bank_group), + [MR_BANK] = __ATTR_RW(bank), + [MR_RANK] = __ATTR_RW(rank), + [MR_ROW] = __ATTR_RW(row), + [MR_COLUMN] = __ATTR_RW(column), + [MR_CHANNEL] = __ATTR_RW(channel), + [MR_SUB_CHANNEL] = __ATTR_RW(sub_channel), + [MEM_DO_REPAIR] = __ATTR_WO(repair) +}; static int mem_repair_create_desc(struct device *dev, const struct attribute_group **attr_groups, @@ -305,34 +314,13 @@ static int mem_repair_create_desc(struct device *dev, struct edac_mem_repair_context *ctx; struct attribute_group *group; int i; - struct edac_mem_repair_dev_attr dev_attr[] = { - 
[MR_TYPE] = MR_ATTR_RO(repair_type, instance), - [MR_PERSIST_MODE] = MR_ATTR_RW(persist_mode, instance), - [MR_SAFE_IN_USE] = MR_ATTR_RO(repair_safe_when_in_use, instance), - [MR_HPA] = MR_ATTR_RW(hpa, instance), - [MR_MIN_HPA] = MR_ATTR_RO(min_hpa, instance), - [MR_MAX_HPA] = MR_ATTR_RO(max_hpa, instance), - [MR_DPA] = MR_ATTR_RW(dpa, instance), - [MR_MIN_DPA] = MR_ATTR_RO(min_dpa, instance), - [MR_MAX_DPA] = MR_ATTR_RO(max_dpa, instance), - [MR_NIBBLE_MASK] = MR_ATTR_RW(nibble_mask, instance), - [MR_BANK_GROUP] = MR_ATTR_RW(bank_group, instance), - [MR_BANK] = MR_ATTR_RW(bank, instance), - [MR_RANK] = MR_ATTR_RW(rank, instance), - [MR_ROW] = MR_ATTR_RW(row, instance), - [MR_COLUMN] = MR_ATTR_RW(column, instance), - [MR_CHANNEL] = MR_ATTR_RW(channel, instance), - [MR_SUB_CHANNEL] = MR_ATTR_RW(sub_channel, instance), - [MEM_DO_REPAIR] = MR_ATTR_WO(repair, instance) - }; - ctx = devm_kzalloc(dev, sizeof(*ctx), GFP_KERNEL); if (!ctx) return -ENOMEM; for (i = 0; i < MR_MAX_ATTRS; i++) { - memcpy(&ctx->mem_repair_dev_attr[i], - &dev_attr[i], sizeof(dev_attr[i])); + ctx->mem_repair_dev_attr[i].dev_attr = mem_repair_dev_attr[i]; + ctx->mem_repair_dev_attr[i].instance = instance; sysfs_attr_init(&ctx->mem_repair_dev_attr[i].dev_attr.attr); ctx->mem_repair_attrs[i] = &ctx->mem_repair_dev_attr[i].dev_attr.attr; diff --git a/drivers/edac/scrub.c b/drivers/edac/scrub.c index f9d02af2fc3a..f9d02af2fc3a 100755..100644 --- a/drivers/edac/scrub.c +++ b/drivers/edac/scrub.c diff --git a/drivers/edac/skx_base.c b/drivers/edac/skx_base.c index 29897b21fb8e..078ddf95cc6e 100644 --- a/drivers/edac/skx_base.c +++ b/drivers/edac/skx_base.c @@ -33,6 +33,15 @@ static unsigned int nvdimm_count; #define MASK26 0x3FFFFFF /* Mask for 2^26 */ #define MASK29 0x1FFFFFFF /* Mask for 2^29 */ +static struct res_config skx_cfg = { + .type = SKX, + .decs_did = 0x2016, + .busno_cfg_offset = 0xcc, + .ddr_imc_num = 2, + .ddr_chan_num = 3, + .ddr_dimm_num = 2, +}; + static struct skx_dev *get_skx_dev(struct pci_bus *bus, u8 idx) { struct skx_dev *d; @@ -52,7 +61,7 @@ enum munittype { struct munit { u16 did; - u16 devfn[SKX_NUM_IMC]; + u16 devfn[2]; u8 busidx; u8 per_socket; enum munittype mtype; @@ -89,11 +98,11 @@ static int get_all_munits(const struct munit *m) if (!pdev) break; ndev++; - if (m->per_socket == SKX_NUM_IMC) { - for (i = 0; i < SKX_NUM_IMC; i++) + if (m->per_socket == skx_cfg.ddr_imc_num) { + for (i = 0; i < skx_cfg.ddr_imc_num; i++) if (m->devfn[i] == pdev->devfn) break; - if (i == SKX_NUM_IMC) + if (i == skx_cfg.ddr_imc_num) goto fail; } d = get_skx_dev(pdev->bus, m->busidx); @@ -157,12 +166,6 @@ fail: return -ENODEV; } -static struct res_config skx_cfg = { - .type = SKX, - .decs_did = 0x2016, - .busno_cfg_offset = 0xcc, -}; - static const struct x86_cpu_id skx_cpuids[] = { X86_MATCH_VFM(INTEL_SKYLAKE_X, &skx_cfg), { } @@ -186,11 +189,11 @@ static int skx_get_dimm_config(struct mem_ctl_info *mci, struct res_config *cfg) /* Only the mcmtr on the first channel is effective */ pci_read_config_dword(imc->chan[0].cdev, 0x87c, &mcmtr); - for (i = 0; i < SKX_NUM_CHANNELS; i++) { + for (i = 0; i < cfg->ddr_chan_num; i++) { ndimms = 0; pci_read_config_dword(imc->chan[i].cdev, 0x8C, &amap); pci_read_config_dword(imc->chan[i].cdev, 0x400, &mcddrtcfg); - for (j = 0; j < SKX_NUM_DIMMS; j++) { + for (j = 0; j < cfg->ddr_dimm_num; j++) { dimm = edac_get_dimm(mci, i, j, 0); pci_read_config_dword(imc->chan[i].cdev, 0x80 + 4 * j, &mtr); @@ -620,6 +623,7 @@ static int __init skx_init(void) return -ENODEV; cfg = (struct res_config 
*)id->driver_data; + skx_set_res_cfg(cfg); rc = skx_get_hi_lo(0x2034, off, &skx_tolm, &skx_tohm); if (rc) @@ -652,10 +656,13 @@ static int __init skx_init(void) goto fail; edac_dbg(2, "src_id = %d\n", src_id); - for (i = 0; i < SKX_NUM_IMC; i++) { + for (i = 0; i < cfg->ddr_imc_num; i++) { d->imc[i].mc = mc++; d->imc[i].lmc = i; d->imc[i].src_id = src_id; + d->imc[i].num_channels = cfg->ddr_chan_num; + d->imc[i].num_dimms = cfg->ddr_dimm_num; + rc = skx_register_mci(&d->imc[i], d->imc[i].chan[0].cdev, "Skylake Socket", EDAC_MOD_STR, skx_get_dimm_config, cfg); diff --git a/drivers/edac/skx_common.c b/drivers/edac/skx_common.c index c9ade45c1a99..724842f512ac 100644 --- a/drivers/edac/skx_common.c +++ b/drivers/edac/skx_common.c @@ -14,9 +14,11 @@ * Copyright (c) 2018, Intel Corporation. */ +#include <linux/topology.h> #include <linux/acpi.h> #include <linux/dmi.h> #include <linux/adxl.h> +#include <linux/overflow.h> #include <acpi/nfit.h> #include <asm/mce.h> #include <asm/uv/uv.h> @@ -130,8 +132,8 @@ static void skx_init_mc_mapping(struct skx_dev *d) * the logical indices of the memory controllers enumerated by the * EDAC driver. */ - for (int i = 0; i < NUM_IMC; i++) - d->mc_mapping[i] = i; + for (int i = 0; i < d->num_imc; i++) + d->imc[i].mc_mapping = i; } void skx_set_mc_mapping(struct skx_dev *d, u8 pmc, u8 lmc) @@ -139,22 +141,28 @@ void skx_set_mc_mapping(struct skx_dev *d, u8 pmc, u8 lmc) edac_dbg(0, "Set the mapping of mc phy idx to logical idx: %02d -> %02d\n", pmc, lmc); - d->mc_mapping[pmc] = lmc; + d->imc[lmc].mc_mapping = pmc; } EXPORT_SYMBOL_GPL(skx_set_mc_mapping); -static u8 skx_get_mc_mapping(struct skx_dev *d, u8 pmc) +static int skx_get_mc_mapping(struct skx_dev *d, u8 pmc) { - edac_dbg(0, "Get the mapping of mc phy idx to logical idx: %02d -> %02d\n", - pmc, d->mc_mapping[pmc]); + for (int lmc = 0; lmc < d->num_imc; lmc++) { + if (d->imc[lmc].mc_mapping == pmc) { + edac_dbg(0, "Get the mapping of mc phy idx to logical idx: %02d -> %02d\n", + pmc, lmc); - return d->mc_mapping[pmc]; + return lmc; + } + } + + return -1; } static bool skx_adxl_decode(struct decoded_addr *res, enum error_source err_src) { + int i, lmc, len = 0; struct skx_dev *d; - int i, len = 0; if (res->addr >= skx_tohm || (res->addr >= skx_tolm && res->addr < BIT_ULL(32))) { @@ -200,7 +208,7 @@ static bool skx_adxl_decode(struct decoded_addr *res, enum error_source err_src) res->cs = (int)adxl_values[component_indices[INDEX_CS]]; } - if (res->imc > NUM_IMC - 1 || res->imc < 0) { + if (res->imc < 0) { skx_printk(KERN_ERR, "Bad imc %d\n", res->imc); return false; } @@ -218,7 +226,13 @@ static bool skx_adxl_decode(struct decoded_addr *res, enum error_source err_src) return false; } - res->imc = skx_get_mc_mapping(d, res->imc); + lmc = skx_get_mc_mapping(d, res->imc); + if (lmc < 0) { + skx_printk(KERN_ERR, "No lmc for imc %d\n", res->imc); + return false; + } + + res->imc = lmc; for (i = 0; i < adxl_component_count; i++) { if (adxl_values[i] == ~0x0ull) @@ -265,7 +279,7 @@ static int skx_get_pkg_id(struct skx_dev *d, u8 *id) struct cpuinfo_x86 *c = &cpu_data(cpu); if (c->initialized && cpu_to_node(cpu) == node) { - *id = c->topo.pkg_id; + *id = topology_physical_package_id(cpu); return 0; } } @@ -320,10 +334,10 @@ static int get_width(u32 mtr) */ int skx_get_all_bus_mappings(struct res_config *cfg, struct list_head **list) { + int ndev = 0, imc_num = cfg->ddr_imc_num + cfg->hbm_imc_num; struct pci_dev *pdev, *prev; struct skx_dev *d; u32 reg; - int ndev = 0; prev = NULL; for (;;) { @@ -331,7 +345,7 @@ int 
skx_get_all_bus_mappings(struct res_config *cfg, struct list_head **list) if (!pdev) break; ndev++; - d = kzalloc(sizeof(*d), GFP_KERNEL); + d = kzalloc(struct_size(d, imc, imc_num), GFP_KERNEL); if (!d) { pci_dev_put(pdev); return -ENOMEM; @@ -354,8 +368,10 @@ int skx_get_all_bus_mappings(struct res_config *cfg, struct list_head **list) d->seg = GET_BITFIELD(reg, 16, 23); } - edac_dbg(2, "busses: 0x%x, 0x%x, 0x%x, 0x%x\n", - d->bus[0], d->bus[1], d->bus[2], d->bus[3]); + d->num_imc = imc_num; + + edac_dbg(2, "busses: 0x%x, 0x%x, 0x%x, 0x%x, imcs %d\n", + d->bus[0], d->bus[1], d->bus[2], d->bus[3], imc_num); list_add_tail(&d->list, &dev_edac_list); prev = pdev; @@ -541,10 +557,10 @@ int skx_register_mci(struct skx_imc *imc, struct pci_dev *pdev, /* Allocate a new MC control structure */ layers[0].type = EDAC_MC_LAYER_CHANNEL; - layers[0].size = NUM_CHANNELS; + layers[0].size = imc->num_channels; layers[0].is_virt_csrow = false; layers[1].type = EDAC_MC_LAYER_SLOT; - layers[1].size = NUM_DIMMS; + layers[1].size = imc->num_dimms; layers[1].is_virt_csrow = true; mci = edac_mc_alloc(imc->mc, ARRAY_SIZE(layers), layers, sizeof(struct skx_pvt)); @@ -670,12 +686,12 @@ static void skx_mce_output_error(struct mem_ctl_info *mci, } if (res->decoded_by_adxl) { - len = snprintf(skx_msg, MSG_SIZE, "%s%s err_code:0x%04x:0x%04x %s", + len = scnprintf(skx_msg, MSG_SIZE, "%s%s err_code:0x%04x:0x%04x %s", overflow ? " OVERFLOW" : "", (uncorrected_error && recoverable) ? " recoverable" : "", mscod, errcode, adxl_msg); } else { - len = snprintf(skx_msg, MSG_SIZE, + len = scnprintf(skx_msg, MSG_SIZE, "%s%s err_code:0x%04x:0x%04x ProcessorSocketId:0x%x MemoryControllerId:0x%x PhysicalRankId:0x%x Row:0x%x Column:0x%x Bank:0x%x BankGroup:0x%x", overflow ? " OVERFLOW" : "", (uncorrected_error && recoverable) ? 
" recoverable" : "", @@ -784,7 +800,7 @@ void skx_remove(void) list_for_each_entry_safe(d, tmp, &dev_edac_list, list) { list_del(&d->list); - for (i = 0; i < NUM_IMC; i++) { + for (i = 0; i < d->num_imc; i++) { if (d->imc[i].mci) skx_unregister_mci(&d->imc[i]); @@ -794,7 +810,7 @@ void skx_remove(void) if (d->imc[i].mbase) iounmap(d->imc[i].mbase); - for (j = 0; j < NUM_CHANNELS; j++) { + for (j = 0; j < d->imc[i].num_channels; j++) { if (d->imc[i].chan[j].cdev) pci_dev_put(d->imc[i].chan[j].cdev); } diff --git a/drivers/edac/skx_common.h b/drivers/edac/skx_common.h index ec4966f7ea40..73ba89786cdf 100644 --- a/drivers/edac/skx_common.h +++ b/drivers/edac/skx_common.h @@ -29,23 +29,18 @@ #define GET_BITFIELD(v, lo, hi) \ (((v) & GENMASK_ULL((hi), (lo))) >> (lo)) -#define SKX_NUM_IMC 2 /* Memory controllers per socket */ #define SKX_NUM_CHANNELS 3 /* Channels per memory controller */ #define SKX_NUM_DIMMS 2 /* Max DIMMS per channel */ -#define I10NM_NUM_DDR_IMC 12 #define I10NM_NUM_DDR_CHANNELS 2 #define I10NM_NUM_DDR_DIMMS 2 -#define I10NM_NUM_HBM_IMC 16 #define I10NM_NUM_HBM_CHANNELS 2 #define I10NM_NUM_HBM_DIMMS 1 -#define I10NM_NUM_IMC (I10NM_NUM_DDR_IMC + I10NM_NUM_HBM_IMC) #define I10NM_NUM_CHANNELS MAX(I10NM_NUM_DDR_CHANNELS, I10NM_NUM_HBM_CHANNELS) #define I10NM_NUM_DIMMS MAX(I10NM_NUM_DDR_DIMMS, I10NM_NUM_HBM_DIMMS) -#define NUM_IMC MAX(SKX_NUM_IMC, I10NM_NUM_IMC) #define NUM_CHANNELS MAX(SKX_NUM_CHANNELS, I10NM_NUM_CHANNELS) #define NUM_DIMMS MAX(SKX_NUM_DIMMS, I10NM_NUM_DIMMS) @@ -134,16 +129,7 @@ struct skx_dev { struct pci_dev *uracu; /* for i10nm CPU */ struct pci_dev *pcu_cr3; /* for HBM memory detection */ u32 mcroute; - /* - * Some server BIOS may hide certain memory controllers, and the - * EDAC driver skips those hidden memory controllers. However, the - * ADXL still decodes memory error address using physical memory - * controller indices. The mapping table is used to convert the - * physical indices (reported by ADXL) to the logical indices - * (used the EDAC driver) of present memory controllers during the - * error handling process. - */ - u8 mc_mapping[NUM_IMC]; + int num_imc; struct skx_imc { struct mem_ctl_info *mci; struct pci_dev *mdev; /* for i10nm CPU */ @@ -155,6 +141,16 @@ struct skx_dev { u8 mc; /* system wide mc# */ u8 lmc; /* socket relative mc# */ u8 src_id; + /* + * Some server BIOS may hide certain memory controllers, and the + * EDAC driver skips those hidden memory controllers. However, the + * ADXL still decodes memory error address using physical memory + * controller indices. The mapping table is used to convert the + * physical indices (reported by ADXL) to the logical indices + * (used the EDAC driver) of present memory controllers during the + * error handling process. + */ + u8 mc_mapping; struct skx_channel { struct pci_dev *cdev; struct pci_dev *edev; @@ -171,7 +167,7 @@ struct skx_dev { u8 colbits; } dimms[NUM_DIMMS]; } chan[NUM_CHANNELS]; - } imc[NUM_IMC]; + } imc[]; }; struct skx_pvt { diff --git a/drivers/edac/synopsys_edac.c b/drivers/edac/synopsys_edac.c index 5ed32a3299c4..51143b3257de 100644 --- a/drivers/edac/synopsys_edac.c +++ b/drivers/edac/synopsys_edac.c @@ -332,20 +332,26 @@ struct synps_edac_priv { #endif }; +enum synps_platform_type { + ZYNQ, + ZYNQMP, + SYNPS, +}; + /** * struct synps_platform_data - synps platform data structure. + * @platform: Identifies the target hardware platform * @get_error_info: Get EDAC error info. * @get_mtype: Get mtype. * @get_dtype: Get dtype. - * @get_ecc_state: Get ECC state. 
* @get_mem_info: Get EDAC memory info * @quirks: To differentiate IPs. */ struct synps_platform_data { + enum synps_platform_type platform; int (*get_error_info)(struct synps_edac_priv *priv); enum mem_type (*get_mtype)(const void __iomem *base); enum dev_type (*get_dtype)(const void __iomem *base); - bool (*get_ecc_state)(void __iomem *base); #ifdef CONFIG_EDAC_DEBUG u64 (*get_mem_info)(struct synps_edac_priv *priv); #endif @@ -720,51 +726,38 @@ static enum dev_type zynqmp_get_dtype(const void __iomem *base) return dt; } -/** - * zynq_get_ecc_state - Return the controller ECC enable/disable status. - * @base: DDR memory controller base address. - * - * Get the ECC enable/disable status of the controller. - * - * Return: true if enabled, otherwise false. - */ -static bool zynq_get_ecc_state(void __iomem *base) +static bool get_ecc_state(struct synps_edac_priv *priv) { + u32 ecctype, clearval; enum dev_type dt; - u32 ecctype; - - dt = zynq_get_dtype(base); - if (dt == DEV_UNKNOWN) - return false; - ecctype = readl(base + SCRUB_OFST) & SCRUB_MODE_MASK; - if ((ecctype == SCRUB_MODE_SECDED) && (dt == DEV_X2)) - return true; - - return false; -} - -/** - * zynqmp_get_ecc_state - Return the controller ECC enable/disable status. - * @base: DDR memory controller base address. - * - * Get the ECC enable/disable status for the controller. - * - * Return: a ECC status boolean i.e true/false - enabled/disabled. - */ -static bool zynqmp_get_ecc_state(void __iomem *base) -{ - enum dev_type dt; - u32 ecctype; - - dt = zynqmp_get_dtype(base); - if (dt == DEV_UNKNOWN) - return false; - - ecctype = readl(base + ECC_CFG0_OFST) & SCRUB_MODE_MASK; - if ((ecctype == SCRUB_MODE_SECDED) && - ((dt == DEV_X2) || (dt == DEV_X4) || (dt == DEV_X8))) - return true; + if (priv->p_data->platform == ZYNQ) { + dt = zynq_get_dtype(priv->baseaddr); + if (dt == DEV_UNKNOWN) + return false; + + ecctype = readl(priv->baseaddr + SCRUB_OFST) & SCRUB_MODE_MASK; + if (ecctype == SCRUB_MODE_SECDED && dt == DEV_X2) { + clearval = ECC_CTRL_CLR_CE_ERR | ECC_CTRL_CLR_UE_ERR; + writel(clearval, priv->baseaddr + ECC_CTRL_OFST); + writel(0x0, priv->baseaddr + ECC_CTRL_OFST); + return true; + } + } else { + dt = zynqmp_get_dtype(priv->baseaddr); + if (dt == DEV_UNKNOWN) + return false; + + ecctype = readl(priv->baseaddr + ECC_CFG0_OFST) & SCRUB_MODE_MASK; + if (ecctype == SCRUB_MODE_SECDED && + (dt == DEV_X2 || dt == DEV_X4 || dt == DEV_X8)) { + clearval = readl(priv->baseaddr + ECC_CLR_OFST) | + ECC_CTRL_CLR_CE_ERR | ECC_CTRL_CLR_CE_ERRCNT | + ECC_CTRL_CLR_UE_ERR | ECC_CTRL_CLR_UE_ERRCNT; + writel(clearval, priv->baseaddr + ECC_CLR_OFST); + return true; + } + } return false; } @@ -934,18 +927,18 @@ static int setup_irq(struct mem_ctl_info *mci, } static const struct synps_platform_data zynq_edac_def = { + .platform = ZYNQ, .get_error_info = zynq_get_error_info, .get_mtype = zynq_get_mtype, .get_dtype = zynq_get_dtype, - .get_ecc_state = zynq_get_ecc_state, .quirks = 0, }; static const struct synps_platform_data zynqmp_edac_def = { + .platform = ZYNQMP, .get_error_info = zynqmp_get_error_info, .get_mtype = zynqmp_get_mtype, .get_dtype = zynqmp_get_dtype, - .get_ecc_state = zynqmp_get_ecc_state, #ifdef CONFIG_EDAC_DEBUG .get_mem_info = zynqmp_get_mem_info, #endif @@ -957,10 +950,10 @@ static const struct synps_platform_data zynqmp_edac_def = { }; static const struct synps_platform_data synopsys_edac_def = { + .platform = SYNPS, .get_error_info = zynqmp_get_error_info, .get_mtype = zynqmp_get_mtype, .get_dtype = zynqmp_get_dtype, - 
.get_ecc_state = zynqmp_get_ecc_state, .quirks = (DDR_ECC_INTR_SUPPORT | DDR_ECC_INTR_SELF_CLEAR #ifdef CONFIG_EDAC_DEBUG | DDR_ECC_DATA_POISON_SUPPORT @@ -1390,10 +1383,6 @@ static int mc_probe(struct platform_device *pdev) if (!p_data) return -ENODEV; - if (!p_data->get_ecc_state(baseaddr)) { - edac_printk(KERN_INFO, EDAC_MC, "ECC not enabled\n"); - return -ENXIO; - } layers[0].type = EDAC_MC_LAYER_CHIP_SELECT; layers[0].size = SYNPS_EDAC_NR_CSROWS; @@ -1413,6 +1402,12 @@ static int mc_probe(struct platform_device *pdev) priv = mci->pvt_info; priv->baseaddr = baseaddr; priv->p_data = p_data; + if (!get_ecc_state(priv)) { + edac_printk(KERN_INFO, EDAC_MC, "ECC not enabled\n"); + rc = -ENODEV; + goto free_edac_mc; + } + spin_lock_init(&priv->reglock); mc_init(mci, pdev); diff --git a/drivers/edac/versalnet_edac.c b/drivers/edac/versalnet_edac.c new file mode 100644 index 000000000000..7c5db8bf0595 --- /dev/null +++ b/drivers/edac/versalnet_edac.c @@ -0,0 +1,960 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * AMD Versal NET memory controller driver + * Copyright (C) 2025 Advanced Micro Devices, Inc. + */ + +#include <linux/cdx/edac_cdx_pcol.h> +#include <linux/edac.h> +#include <linux/module.h> +#include <linux/of_device.h> +#include <linux/ras.h> +#include <linux/remoteproc.h> +#include <linux/rpmsg.h> +#include <linux/sizes.h> +#include <ras/ras_event.h> + +#include "edac_module.h" + +/* Granularity of reported error in bytes */ +#define MC5_ERR_GRAIN 1 +#define MC_GET_DDR_CONFIG_IN_LEN 4 + +#define MC5_IRQ_CE_MASK GENMASK(18, 15) +#define MC5_IRQ_UE_MASK GENMASK(14, 11) + +#define MC5_RANK_1_MASK GENMASK(11, 6) +#define MASK_24 GENMASK(29, 24) +#define MASK_0 GENMASK(5, 0) + +#define MC5_LRANK_1_MASK GENMASK(11, 6) +#define MC5_LRANK_2_MASK GENMASK(17, 12) +#define MC5_BANK1_MASK GENMASK(11, 6) +#define MC5_GRP_0_MASK GENMASK(17, 12) +#define MC5_GRP_1_MASK GENMASK(23, 18) + +#define MC5_REGHI_ROW 7 +#define MC5_EACHBIT 1 +#define MC5_ERR_TYPE_CE 0 +#define MC5_ERR_TYPE_UE 1 +#define MC5_HIGH_MEM_EN BIT(20) +#define MC5_MEM_MASK GENMASK(19, 0) +#define MC5_X16_BASE 256 +#define MC5_X16_ECC 32 +#define MC5_X16_SIZE (MC5_X16_BASE + MC5_X16_ECC) +#define MC5_X32_SIZE 576 +#define MC5_HIMEM_BASE (256 * SZ_1M) +#define MC5_ILC_HIMEM_EN BIT(28) +#define MC5_ILC_MEM GENMASK(27, 0) +#define MC5_INTERLEAVE_SEL GENMASK(3, 0) +#define MC5_BUS_WIDTH_MASK GENMASK(19, 18) +#define MC5_NUM_CHANS_MASK BIT(17) +#define MC5_RANK_MASK GENMASK(15, 14) + +#define ERROR_LEVEL 2 +#define ERROR_ID 3 +#define TOTAL_ERR_LENGTH 5 +#define MSG_ERR_OFFSET 8 +#define MSG_ERR_LENGTH 9 +#define ERROR_DATA 10 +#define MCDI_RESPONSE 0xFF + +#define REG_MAX 152 +#define ADEC_MAX 152 +#define NUM_CONTROLLERS 8 +#define REGS_PER_CONTROLLER 19 +#define ADEC_NUM 19 +#define BUFFER_SZ 80 + +#define XDDR5_BUS_WIDTH_64 0 +#define XDDR5_BUS_WIDTH_32 1 +#define XDDR5_BUS_WIDTH_16 2 + +/** + * struct ecc_error_info - ECC error log information. + * @burstpos: Burst position. + * @lrank: Logical Rank number. + * @rank: Rank number. + * @group: Group number. + * @bank: Bank number. + * @col: Column number. + * @row: Row number. + * @rowhi: Row number higher bits. + * @i: Combined ECC error vector containing encoded values of burst position, + * rank, bank, column, and row information. 
+ */ +union ecc_error_info { + struct { + u32 burstpos:3; + u32 lrank:4; + u32 rank:2; + u32 group:3; + u32 bank:2; + u32 col:11; + u32 row:7; + u32 rowhi; + }; + u64 i; +} __packed; + +/* Row and column bit positions in the address decoder (ADEC) registers. */ +union row_col_mapping { + struct { + u32 row0:6; + u32 row1:6; + u32 row2:6; + u32 row3:6; + u32 row4:6; + u32 reserved:2; + }; + struct { + u32 col1:6; + u32 col2:6; + u32 col3:6; + u32 col4:6; + u32 col5:6; + u32 reservedcol:2; + }; + u32 i; +} __packed; + +/** + * struct ecc_status - ECC status information to report. + * @ceinfo: Correctable errors. + * @ueinfo: Uncorrected errors. + * @channel: Channel number. + * @error_type: Error type. + */ +struct ecc_status { + union ecc_error_info ceinfo[2]; + union ecc_error_info ueinfo[2]; + u8 channel; + u8 error_type; +}; + +/** + * struct mc_priv - DDR memory controller private instance data. + * @message: Buffer for framing the event specific info. + * @stat: ECC status information. + * @error_id: The error id. + * @error_level: The error level. + * @dwidth: Width of data bus excluding ECC bits. + * @part_len: The support of the message received. + * @regs: The registers sent on the rpmsg. + * @adec: Address decode registers. + * @mci: Memory controller interface. + * @ept: rpmsg endpoint. + * @mcdi: The mcdi handle. + */ +struct mc_priv { + char message[256]; + struct ecc_status stat; + u32 error_id; + u32 error_level; + u32 dwidth; + u32 part_len; + u32 regs[REG_MAX]; + u32 adec[ADEC_MAX]; + struct mem_ctl_info *mci[NUM_CONTROLLERS]; + struct rpmsg_endpoint *ept; + struct cdx_mcdi *mcdi; +}; + +/* + * Address decoder (ADEC) registers to match the order in which the register + * information is received from the firmware. + */ +enum adec_info { + CONF = 0, + ADEC0, + ADEC1, + ADEC2, + ADEC3, + ADEC4, + ADEC5, + ADEC6, + ADEC7, + ADEC8, + ADEC9, + ADEC10, + ADEC11, + ADEC12, + ADEC13, + ADEC14, + ADEC15, + ADEC16, + ADECILC, +}; + +enum reg_info { + ISR = 0, + IMR, + ECCR0_ERR_STATUS, + ECCR0_ADDR_LO, + ECCR0_ADDR_HI, + ECCR0_DATA_LO, + ECCR0_DATA_HI, + ECCR0_PAR, + ECCR1_ERR_STATUS, + ECCR1_ADDR_LO, + ECCR1_ADDR_HI, + ECCR1_DATA_LO, + ECCR1_DATA_HI, + ECCR1_PAR, + XMPU_ERR, + XMPU_ERR_ADDR_L0, + XMPU_ERR_ADDR_HI, + XMPU_ERR_AXI_ID, + ADEC_CHK_ERR_LOG, +}; + +static bool get_ddr_info(u32 *error_data, struct mc_priv *priv) +{ + u32 reglo, reghi, parity, eccr0_val, eccr1_val, isr; + struct ecc_status *p; + + isr = error_data[ISR]; + + if (!(isr & (MC5_IRQ_UE_MASK | MC5_IRQ_CE_MASK))) + return false; + + eccr0_val = error_data[ECCR0_ERR_STATUS]; + eccr1_val = error_data[ECCR1_ERR_STATUS]; + + if (!eccr0_val && !eccr1_val) + return false; + + p = &priv->stat; + + if (!eccr0_val) + p->channel = 1; + else + p->channel = 0; + + reglo = error_data[ECCR0_ADDR_LO]; + reghi = error_data[ECCR0_ADDR_HI]; + if (isr & MC5_IRQ_CE_MASK) + p->ceinfo[0].i = reglo | (u64)reghi << 32; + else if (isr & MC5_IRQ_UE_MASK) + p->ueinfo[0].i = reglo | (u64)reghi << 32; + + parity = error_data[ECCR0_PAR]; + edac_dbg(2, "ERR DATA: 0x%08X%08X PARITY: 0x%08X\n", + reghi, reglo, parity); + + reglo = error_data[ECCR1_ADDR_LO]; + reghi = error_data[ECCR1_ADDR_HI]; + if (isr & MC5_IRQ_CE_MASK) + p->ceinfo[1].i = reglo | (u64)reghi << 32; + else if (isr & MC5_IRQ_UE_MASK) + p->ueinfo[1].i = reglo | (u64)reghi << 32; + + parity = error_data[ECCR1_PAR]; + edac_dbg(2, "ERR DATA: 0x%08X%08X PARITY: 0x%08X\n", + reghi, reglo, parity); + + return true; +} + +/** + * convert_to_physical - Convert @error_data to a physical 
address. + * @priv: DDR memory controller private instance data. + * @pinf: ECC error info structure. + * @controller: Controller number of the MC5 + * @error_data: the DDRMC5 ADEC address decoder register data + * + * Return: physical address of the DDR memory. + */ +static unsigned long convert_to_physical(struct mc_priv *priv, + union ecc_error_info pinf, + int controller, int *error_data) +{ + u32 row, blk, rsh_req_addr, interleave, ilc_base_ctrl_add, ilc_himem_en, reg, offset; + u64 high_mem_base, high_mem_offset, low_mem_offset, ilcmem_base; + unsigned long err_addr = 0, addr; + union row_col_mapping cols; + union row_col_mapping rows; + u32 col_bit_0; + + row = pinf.rowhi << MC5_REGHI_ROW | pinf.row; + offset = controller * ADEC_NUM; + + reg = error_data[ADEC6]; + rows.i = reg; + err_addr |= (row & BIT(0)) << rows.row0; + row >>= MC5_EACHBIT; + err_addr |= (row & BIT(0)) << rows.row1; + row >>= MC5_EACHBIT; + err_addr |= (row & BIT(0)) << rows.row2; + row >>= MC5_EACHBIT; + err_addr |= (row & BIT(0)) << rows.row3; + row >>= MC5_EACHBIT; + err_addr |= (row & BIT(0)) << rows.row4; + row >>= MC5_EACHBIT; + + reg = error_data[ADEC7]; + rows.i = reg; + err_addr |= (row & BIT(0)) << rows.row0; + row >>= MC5_EACHBIT; + err_addr |= (row & BIT(0)) << rows.row1; + row >>= MC5_EACHBIT; + err_addr |= (row & BIT(0)) << rows.row2; + row >>= MC5_EACHBIT; + err_addr |= (row & BIT(0)) << rows.row3; + row >>= MC5_EACHBIT; + err_addr |= (row & BIT(0)) << rows.row4; + row >>= MC5_EACHBIT; + + reg = error_data[ADEC8]; + rows.i = reg; + err_addr |= (row & BIT(0)) << rows.row0; + row >>= MC5_EACHBIT; + err_addr |= (row & BIT(0)) << rows.row1; + row >>= MC5_EACHBIT; + err_addr |= (row & BIT(0)) << rows.row2; + row >>= MC5_EACHBIT; + err_addr |= (row & BIT(0)) << rows.row3; + row >>= MC5_EACHBIT; + err_addr |= (row & BIT(0)) << rows.row4; + + reg = error_data[ADEC9]; + rows.i = reg; + + err_addr |= (row & BIT(0)) << rows.row0; + row >>= MC5_EACHBIT; + err_addr |= (row & BIT(0)) << rows.row1; + row >>= MC5_EACHBIT; + err_addr |= (row & BIT(0)) << rows.row2; + row >>= MC5_EACHBIT; + + col_bit_0 = FIELD_GET(MASK_24, error_data[ADEC9]); + pinf.col >>= 1; + err_addr |= (pinf.col & 1) << col_bit_0; + + cols.i = error_data[ADEC10]; + err_addr |= (pinf.col & 1) << cols.col1; + pinf.col >>= 1; + err_addr |= (pinf.col & 1) << cols.col2; + pinf.col >>= 1; + err_addr |= (pinf.col & 1) << cols.col3; + pinf.col >>= 1; + err_addr |= (pinf.col & 1) << cols.col4; + pinf.col >>= 1; + err_addr |= (pinf.col & 1) << cols.col5; + pinf.col >>= 1; + + cols.i = error_data[ADEC11]; + err_addr |= (pinf.col & 1) << cols.col1; + pinf.col >>= 1; + err_addr |= (pinf.col & 1) << cols.col2; + pinf.col >>= 1; + err_addr |= (pinf.col & 1) << cols.col3; + pinf.col >>= 1; + err_addr |= (pinf.col & 1) << cols.col4; + pinf.col >>= 1; + err_addr |= (pinf.col & 1) << cols.col5; + pinf.col >>= 1; + + reg = error_data[ADEC12]; + err_addr |= (pinf.bank & BIT(0)) << (reg & MASK_0); + pinf.bank >>= MC5_EACHBIT; + err_addr |= (pinf.bank & BIT(0)) << FIELD_GET(MC5_BANK1_MASK, reg); + pinf.bank >>= MC5_EACHBIT; + + err_addr |= (pinf.bank & BIT(0)) << FIELD_GET(MC5_GRP_0_MASK, reg); + pinf.group >>= MC5_EACHBIT; + err_addr |= (pinf.bank & BIT(0)) << FIELD_GET(MC5_GRP_1_MASK, reg); + pinf.group >>= MC5_EACHBIT; + err_addr |= (pinf.bank & BIT(0)) << FIELD_GET(MASK_24, reg); + pinf.group >>= MC5_EACHBIT; + + reg = error_data[ADEC4]; + err_addr |= (pinf.rank & BIT(0)) << (reg & MASK_0); + pinf.rank >>= MC5_EACHBIT; + err_addr |= (pinf.rank & BIT(0)) << 
FIELD_GET(MC5_RANK_1_MASK, reg); + pinf.rank >>= MC5_EACHBIT; + + reg = error_data[ADEC5]; + err_addr |= (pinf.lrank & BIT(0)) << (reg & MASK_0); + pinf.lrank >>= MC5_EACHBIT; + err_addr |= (pinf.lrank & BIT(0)) << FIELD_GET(MC5_LRANK_1_MASK, reg); + pinf.lrank >>= MC5_EACHBIT; + err_addr |= (pinf.lrank & BIT(0)) << FIELD_GET(MC5_LRANK_2_MASK, reg); + pinf.lrank >>= MC5_EACHBIT; + err_addr |= (pinf.lrank & BIT(0)) << FIELD_GET(MASK_24, reg); + pinf.lrank >>= MC5_EACHBIT; + + high_mem_base = (priv->adec[ADEC2 + offset] & MC5_MEM_MASK) * MC5_HIMEM_BASE; + interleave = priv->adec[ADEC13 + offset] & MC5_INTERLEAVE_SEL; + + high_mem_offset = priv->adec[ADEC3 + offset] & MC5_MEM_MASK; + low_mem_offset = priv->adec[ADEC1 + offset] & MC5_MEM_MASK; + reg = priv->adec[ADEC14 + offset]; + ilc_himem_en = !!(reg & MC5_ILC_HIMEM_EN); + ilcmem_base = (reg & MC5_ILC_MEM) * SZ_1M; + if (ilc_himem_en) + ilc_base_ctrl_add = ilcmem_base - high_mem_offset; + else + ilc_base_ctrl_add = ilcmem_base - low_mem_offset; + + if (priv->dwidth == DEV_X16) { + blk = err_addr / MC5_X16_SIZE; + rsh_req_addr = (blk << 8) + ilc_base_ctrl_add; + err_addr = rsh_req_addr * interleave * 2; + } else { + blk = err_addr / MC5_X32_SIZE; + rsh_req_addr = (blk << 9) + ilc_base_ctrl_add; + err_addr = rsh_req_addr * interleave * 2; + } + + if ((priv->adec[ADEC2 + offset] & MC5_HIGH_MEM_EN) && err_addr >= high_mem_base) + addr = err_addr - high_mem_offset; + else + addr = err_addr - low_mem_offset; + + return addr; +} + +/** + * handle_error - Handle errors. + * @priv: DDR memory controller private instance data. + * @stat: ECC status structure. + * @ctl_num: Controller number of the MC5 + * @error_data: the MC5 ADEC address decoder register data + * + * Handles ECC correctable and uncorrectable errors. 
+ */ +static void handle_error(struct mc_priv *priv, struct ecc_status *stat, + int ctl_num, int *error_data) +{ + union ecc_error_info pinf; + struct mem_ctl_info *mci; + unsigned long pa; + phys_addr_t pfn; + int err; + + if (WARN_ON_ONCE(ctl_num > NUM_CONTROLLERS)) + return; + + mci = priv->mci[ctl_num]; + + if (stat->error_type == MC5_ERR_TYPE_CE) { + pinf = stat->ceinfo[stat->channel]; + snprintf(priv->message, sizeof(priv->message), + "Error type:%s Controller %d Addr at %lx\n", + "CE", ctl_num, convert_to_physical(priv, pinf, ctl_num, error_data)); + + edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci, + 1, 0, 0, 0, 0, 0, -1, + priv->message, ""); + } + + if (stat->error_type == MC5_ERR_TYPE_UE) { + pinf = stat->ueinfo[stat->channel]; + snprintf(priv->message, sizeof(priv->message), + "Error type:%s controller %d Addr at %lx\n", + "UE", ctl_num, convert_to_physical(priv, pinf, ctl_num, error_data)); + + edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci, + 1, 0, 0, 0, 0, 0, -1, + priv->message, ""); + pa = convert_to_physical(priv, pinf, ctl_num, error_data); + pfn = PHYS_PFN(pa); + + if (IS_ENABLED(CONFIG_MEMORY_FAILURE)) { + err = memory_failure(pfn, MF_ACTION_REQUIRED); + if (err) + edac_dbg(2, "memory_failure() error: %d", err); + else + edac_dbg(2, "Poison page at PA 0x%lx\n", pa); + } + } +} + +static void mc_init(struct mem_ctl_info *mci, struct device *dev) +{ + struct mc_priv *priv = mci->pvt_info; + struct csrow_info *csi; + struct dimm_info *dimm; + u32 row; + int ch; + + /* Initialize controller capabilities and configuration */ + mci->mtype_cap = MEM_FLAG_DDR5; + mci->edac_ctl_cap = EDAC_FLAG_NONE | EDAC_FLAG_SECDED; + mci->scrub_cap = SCRUB_HW_SRC; + mci->scrub_mode = SCRUB_NONE; + + mci->edac_cap = EDAC_FLAG_SECDED; + mci->ctl_name = "VersalNET DDR5"; + mci->dev_name = dev_name(dev); + mci->mod_name = "versalnet_edac"; + + edac_op_state = EDAC_OPSTATE_INT; + + for (row = 0; row < mci->nr_csrows; row++) { + csi = mci->csrows[row]; + for (ch = 0; ch < csi->nr_channels; ch++) { + dimm = csi->channels[ch]->dimm; + dimm->edac_mode = EDAC_SECDED; + dimm->mtype = MEM_DDR5; + dimm->grain = MC5_ERR_GRAIN; + dimm->dtype = priv->dwidth; + } + } +} + +#define to_mci(k) container_of(k, struct mem_ctl_info, dev) + +static unsigned int mcdi_rpc_timeout(struct cdx_mcdi *cdx, unsigned int cmd) +{ + return MCDI_RPC_TIMEOUT; +} + +static void mcdi_request(struct cdx_mcdi *cdx, + const struct cdx_dword *hdr, size_t hdr_len, + const struct cdx_dword *sdu, size_t sdu_len) +{ + void *send_buf; + int ret; + + send_buf = kzalloc(hdr_len + sdu_len, GFP_KERNEL); + if (!send_buf) + return; + + memcpy(send_buf, hdr, hdr_len); + memcpy(send_buf + hdr_len, sdu, sdu_len); + + ret = rpmsg_send(cdx->ept, send_buf, hdr_len + sdu_len); + if (ret) + dev_err(&cdx->rpdev->dev, "Failed to send rpmsg data: %d\n", ret); + + kfree(send_buf); +} + +static const struct cdx_mcdi_ops mcdi_ops = { + .mcdi_rpc_timeout = mcdi_rpc_timeout, + .mcdi_request = mcdi_request, +}; + +static void get_ddr_config(u32 index, u32 *buffer, struct cdx_mcdi *amd_mcdi) +{ + size_t outlen; + int ret; + + MCDI_DECLARE_BUF(inbuf, MC_GET_DDR_CONFIG_IN_LEN); + MCDI_DECLARE_BUF(outbuf, BUFFER_SZ); + + MCDI_SET_DWORD(inbuf, EDAC_GET_DDR_CONFIG_IN_CONTROLLER_INDEX, index); + + ret = cdx_mcdi_rpc(amd_mcdi, MC_CMD_EDAC_GET_DDR_CONFIG, inbuf, sizeof(inbuf), + outbuf, sizeof(outbuf), &outlen); + if (!ret) + memcpy(buffer, MCDI_PTR(outbuf, GET_DDR_CONFIG), + (ADEC_NUM * 4)); +} + +static int setup_mcdi(struct mc_priv *mc_priv) +{ + struct 
cdx_mcdi *amd_mcdi; + int ret, i; + + amd_mcdi = kzalloc(sizeof(*amd_mcdi), GFP_KERNEL); + if (!amd_mcdi) + return -ENOMEM; + + amd_mcdi->mcdi_ops = &mcdi_ops; + ret = cdx_mcdi_init(amd_mcdi); + if (ret) { + kfree(amd_mcdi); + return ret; + } + + amd_mcdi->ept = mc_priv->ept; + mc_priv->mcdi = amd_mcdi; + + for (i = 0; i < NUM_CONTROLLERS; i++) + get_ddr_config(i, &mc_priv->adec[ADEC_NUM * i], amd_mcdi); + + return 0; +} + +static const guid_t amd_versalnet_guid = GUID_INIT(0x82678888, 0xa556, 0x44f2, + 0xb8, 0xb4, 0x45, 0x56, 0x2e, + 0x8c, 0x5b, 0xec); + +static int rpmsg_cb(struct rpmsg_device *rpdev, void *data, + int len, void *priv, u32 src) +{ + struct mc_priv *mc_priv = dev_get_drvdata(&rpdev->dev); + const guid_t *sec_type = &guid_null; + u32 length, offset, error_id; + u32 *result = (u32 *)data; + struct ecc_status *p; + int i, j, k, sec_sev; + const char *err_str; + u32 *adec_data; + + if (*(u8 *)data == MCDI_RESPONSE) { + cdx_mcdi_process_cmd(mc_priv->mcdi, (struct cdx_dword *)data, len); + return 0; + } + + sec_sev = result[ERROR_LEVEL]; + error_id = result[ERROR_ID]; + length = result[MSG_ERR_LENGTH]; + offset = result[MSG_ERR_OFFSET]; + + if (result[TOTAL_ERR_LENGTH] > length) { + if (!mc_priv->part_len) + mc_priv->part_len = length; + else + mc_priv->part_len += length; + /* + * The data can come in 2 stretches. Construct the regs from 2 + * messages the offset indicates the offset from which the data is to + * be taken + */ + for (i = 0 ; i < length; i++) { + k = offset + i; + j = ERROR_DATA + i; + mc_priv->regs[k] = result[j]; + } + if (mc_priv->part_len < result[TOTAL_ERR_LENGTH]) + return 0; + mc_priv->part_len = 0; + } + + mc_priv->error_id = error_id; + mc_priv->error_level = result[ERROR_LEVEL]; + + switch (error_id) { + case 5: err_str = "General Software Non-Correctable error"; break; + case 6: err_str = "CFU error"; break; + case 7: err_str = "CFRAME error"; break; + case 10: err_str = "DDRMC Microblaze Correctable ECC error"; break; + case 11: err_str = "DDRMC Microblaze Non-Correctable ECC error"; break; + case 15: err_str = "MMCM error"; break; + case 16: err_str = "HNICX Correctable error"; break; + case 17: err_str = "HNICX Non-Correctable error"; break; + + case 18: + p = &mc_priv->stat; + memset(p, 0, sizeof(struct ecc_status)); + p->error_type = MC5_ERR_TYPE_CE; + for (i = 0 ; i < NUM_CONTROLLERS; i++) { + if (get_ddr_info(&mc_priv->regs[i * REGS_PER_CONTROLLER], mc_priv)) { + adec_data = mc_priv->adec + ADEC_NUM * i; + handle_error(mc_priv, &mc_priv->stat, i, adec_data); + } + } + return 0; + case 19: + p = &mc_priv->stat; + memset(p, 0, sizeof(struct ecc_status)); + p->error_type = MC5_ERR_TYPE_UE; + for (i = 0 ; i < NUM_CONTROLLERS; i++) { + if (get_ddr_info(&mc_priv->regs[i * REGS_PER_CONTROLLER], mc_priv)) { + adec_data = mc_priv->adec + ADEC_NUM * i; + handle_error(mc_priv, &mc_priv->stat, i, adec_data); + } + } + return 0; + + case 21: err_str = "GT Non-Correctable error"; break; + case 22: err_str = "PL Sysmon Correctable error"; break; + case 23: err_str = "PL Sysmon Non-Correctable error"; break; + case 111: err_str = "LPX unexpected dfx activation error"; break; + case 114: err_str = "INT_LPD Non-Correctable error"; break; + case 116: err_str = "INT_OCM Non-Correctable error"; break; + case 117: err_str = "INT_FPD Correctable error"; break; + case 118: err_str = "INT_FPD Non-Correctable error"; break; + case 120: err_str = "INT_IOU Non-Correctable error"; break; + case 123: err_str = "err_int_irq from APU GIC Distributor"; break; + case 124: 
+static int rpmsg_cb(struct rpmsg_device *rpdev, void *data,
+		    int len, void *priv, u32 src)
+{
+	struct mc_priv *mc_priv = dev_get_drvdata(&rpdev->dev);
+	const guid_t *sec_type = &guid_null;
+	u32 length, offset, error_id;
+	u32 *result = (u32 *)data;
+	struct ecc_status *p;
+	int i, j, k, sec_sev;
+	const char *err_str;
+	u32 *adec_data;
+
+	if (*(u8 *)data == MCDI_RESPONSE) {
+		cdx_mcdi_process_cmd(mc_priv->mcdi, (struct cdx_dword *)data, len);
+		return 0;
+	}
+
+	sec_sev = result[ERROR_LEVEL];
+	error_id = result[ERROR_ID];
+	length = result[MSG_ERR_LENGTH];
+	offset = result[MSG_ERR_OFFSET];
+
+	if (result[TOTAL_ERR_LENGTH] > length) {
+		if (!mc_priv->part_len)
+			mc_priv->part_len = length;
+		else
+			mc_priv->part_len += length;
+		/*
+		 * The register dump can arrive split across two messages.
+		 * Reassemble it; 'offset' gives the position of this chunk
+		 * within the full dump.
+		 */
+		for (i = 0; i < length; i++) {
+			k = offset + i;
+			j = ERROR_DATA + i;
+			mc_priv->regs[k] = result[j];
+		}
+		if (mc_priv->part_len < result[TOTAL_ERR_LENGTH])
+			return 0;
+		mc_priv->part_len = 0;
+	}
+
+	mc_priv->error_id = error_id;
+	mc_priv->error_level = sec_sev;
+
+	switch (error_id) {
+	case 5: err_str = "General Software Non-Correctable error"; break;
+	case 6: err_str = "CFU error"; break;
+	case 7: err_str = "CFRAME error"; break;
+	case 10: err_str = "DDRMC Microblaze Correctable ECC error"; break;
+	case 11: err_str = "DDRMC Microblaze Non-Correctable ECC error"; break;
+	case 15: err_str = "MMCM error"; break;
+	case 16: err_str = "HNICX Correctable error"; break;
+	case 17: err_str = "HNICX Non-Correctable error"; break;
+
+	/* DDR ECC events: decode the register dump and report via EDAC. */
+	case 18:
+	case 19:
+		p = &mc_priv->stat;
+		memset(p, 0, sizeof(struct ecc_status));
+		p->error_type = (error_id == 18) ? MC5_ERR_TYPE_CE : MC5_ERR_TYPE_UE;
+		for (i = 0; i < NUM_CONTROLLERS; i++) {
+			if (get_ddr_info(&mc_priv->regs[i * REGS_PER_CONTROLLER], mc_priv)) {
+				adec_data = mc_priv->adec + ADEC_NUM * i;
+				handle_error(mc_priv, &mc_priv->stat, i, adec_data);
+			}
+		}
+		return 0;
+
+	case 21: err_str = "GT Non-Correctable error"; break;
+	case 22: err_str = "PL Sysmon Correctable error"; break;
+	case 23: err_str = "PL Sysmon Non-Correctable error"; break;
+	case 43 ... 47: err_str = "PMC Sysmon error"; break;
+	case 59: err_str = "PMC Correctable error"; break;
+	case 60: err_str = "PMC Non-Correctable error"; break;
+	case 108: err_str = "PSM Correctable error"; break;
+	case 111: err_str = "LPX unexpected dfx activation error"; break;
+	case 114: err_str = "INT_LPD Non-Correctable error"; break;
+	case 116: err_str = "INT_OCM Non-Correctable error"; break;
+	case 117: err_str = "INT_FPD Correctable error"; break;
+	case 118: err_str = "INT_FPD Non-Correctable error"; break;
+	case 120: err_str = "INT_IOU Non-Correctable error"; break;
+	case 123: err_str = "err_int_irq from APU GIC Distributor"; break;
+	case 124: err_str = "fault_int_irq from APU GIC Distributor"; break;
+	case 132 ... 139: err_str = "FPX SPLITTER error"; break;
+	case 140: err_str = "APU Cluster 0 error"; break;
+	case 141: err_str = "APU Cluster 1 error"; break;
+	case 142: err_str = "APU Cluster 2 error"; break;
+	case 143: err_str = "APU Cluster 3 error"; break;
+	case 145: err_str = "WWDT1 LPX error"; break;
+	case 147: err_str = "IPI error"; break;
+	case 148: err_str = "OCM0 Correctable error"; break;
+	case 149: err_str = "OCM1 Correctable error"; break;
+	case 150: err_str = "OCM0 Non-Correctable error"; break;
+	case 151: err_str = "OCM1 Non-Correctable error"; break;
+	case 152 ... 153: err_str = "AFIFS error"; break;
+	case 154 ... 155: err_str = "LPX glitch error"; break;
+	case 163 ... 184: err_str = "RPU error"; break;
+	case 185 ... 186: err_str = "FPX AFIFS error"; break;
+	case 189: err_str = "PSX_CMN_3 PD block consolidated error"; break;
+	case 191: err_str = "FPD_INT_WRAP PD block consolidated error"; break;
+	case 195 ... 199: err_str = "AFIFM error"; break;
+	case 232: err_str = "CRAM Non-Correctable error"; break;
+	default: err_str = "Unknown error"; break;
+	}
+
+	snprintf(mc_priv->message,
+		 sizeof(mc_priv->message),
+		 "[VERSAL_EDAC_ERR_ID: %d] Error type: %s", error_id, err_str);
+
+	/* Convert the payload length from 32-bit words to bytes. */
+	length = result[TOTAL_ERR_LENGTH] * 4;
+	log_non_standard_event(sec_type, &amd_versalnet_guid, mc_priv->message,
+			       sec_sev, (void *)&result[ERROR_DATA], length);
+
+	return 0;
+}
+
+static struct rpmsg_device_id amd_rpmsg_id_table[] = {
+	{ .name = "error_ipc" },
+	{ },
+};
+MODULE_DEVICE_TABLE(rpmsg, amd_rpmsg_id_table);
+
+static int rpmsg_probe(struct rpmsg_device *rpdev)
+{
+	struct rpmsg_channel_info chinfo;
+	struct mc_priv *pg;
+
+	/* mc_probe() stashed the private data in the id table. */
+	pg = (struct mc_priv *)amd_rpmsg_id_table[0].driver_data;
+	chinfo.src = RPMSG_ADDR_ANY;
+	chinfo.dst = rpdev->dst;
+	strscpy(chinfo.name, amd_rpmsg_id_table[0].name, sizeof(chinfo.name));
+
+	pg->ept = rpmsg_create_ept(rpdev, rpmsg_cb, NULL, chinfo);
+	if (!pg->ept)
+		return dev_err_probe(&rpdev->dev, -ENXIO, "Failed to create ept for channel %s\n",
+				     chinfo.name);
+
+	dev_set_drvdata(&rpdev->dev, pg);
+
+	return 0;
+}
+
+static void rpmsg_remove(struct rpmsg_device *rpdev)
+{
+	struct mc_priv *mc_priv = dev_get_drvdata(&rpdev->dev);
+
+	rpmsg_destroy_ept(mc_priv->ept);
+	dev_set_drvdata(&rpdev->dev, NULL);
+}
+
+static struct rpmsg_driver amd_rpmsg_driver = {
+	.drv.name = KBUILD_MODNAME,
+	.probe = rpmsg_probe,
+	.remove = rpmsg_remove,
+	.callback = rpmsg_cb,
+	.id_table = amd_rpmsg_id_table,
+};
+
+static void versal_edac_release(struct device *dev)
+{
+	kfree(dev);
+}
+
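+/*
+ * init_versalnet - Allocate and register one EDAC MC per DDR controller.
+ *
+ * Parse each controller's ADEC configuration for channel count, rank and
+ * bus width, skip controllers that report an unknown width, and register
+ * the remaining ones with the EDAC core.
+ *
+ * Return: 0 on success, a negative error code otherwise.
+ */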
+static int init_versalnet(struct mc_priv *priv, struct platform_device *pdev)
+{
+	u32 num_chans, rank, dwidth, config;
+	struct edac_mc_layer layers[2];
+	struct mem_ctl_info *mci;
+	struct device *dev;
+	enum dev_type dt;
+	char *name;
+	int rc, i;
+
+	for (i = 0; i < NUM_CONTROLLERS; i++) {
+		config = priv->adec[CONF + i * ADEC_NUM];
+		num_chans = FIELD_GET(MC5_NUM_CHANS_MASK, config);
+		rank = 1 << FIELD_GET(MC5_RANK_MASK, config);
+		dwidth = FIELD_GET(MC5_BUS_WIDTH_MASK, config);
+
+		switch (dwidth) {
+		case XDDR5_BUS_WIDTH_16:
+			dt = DEV_X16;
+			break;
+		case XDDR5_BUS_WIDTH_32:
+			dt = DEV_X32;
+			break;
+		case XDDR5_BUS_WIDTH_64:
+			dt = DEV_X64;
+			break;
+		default:
+			dt = DEV_UNKNOWN;
+		}
+
+		/* Skip controllers that report an unknown bus width. */
+		if (dt == DEV_UNKNOWN)
+			continue;
+
+		layers[0].type = EDAC_MC_LAYER_CHIP_SELECT;
+		layers[0].size = rank;
+		layers[0].is_virt_csrow = true;
+		layers[1].type = EDAC_MC_LAYER_CHANNEL;
+		layers[1].size = num_chans;
+		layers[1].is_virt_csrow = false;
+
+		rc = -ENOMEM;
+		mci = edac_mc_alloc(i, ARRAY_SIZE(layers), layers,
+				    sizeof(struct mc_priv));
+		if (!mci) {
+			edac_printk(KERN_ERR, EDAC_MC, "Failed memory allocation for MC%d\n", i);
+			goto err_alloc;
+		}
+
+		priv->mci[i] = mci;
+		priv->dwidth = dt;
+
+		dev = kzalloc(sizeof(*dev), GFP_KERNEL);
+		if (!dev)
+			goto err_alloc;
+
+		dev->release = versal_edac_release;
+		name = kasprintf(GFP_KERNEL, "versal-net-ddrmc5-edac-%d", i);
+		if (!name) {
+			kfree(dev);
+			goto err_alloc;
+		}
+		dev->init_name = name;
+		rc = device_register(dev);
+		/* The device core copied init_name; the buffer can go now. */
+		kfree(name);
+		if (rc) {
+			put_device(dev);
+			goto err_alloc;
+		}
+
+		mci->pdev = dev;
+
+		mc_init(mci, dev);
+		rc = edac_mc_add_mc(mci);
+		if (rc) {
+			edac_printk(KERN_ERR, EDAC_MC, "Failed to register MC%d with EDAC core\n", i);
+			goto err_alloc;
+		}
+	}
+
+	platform_set_drvdata(pdev, priv);
+
+	return 0;
+
+err_alloc:
+	/* Unwind the current, partially set up controller as well. */
+	for (; i >= 0; i--) {
+		mci = priv->mci[i];
+		if (!mci)
+			continue;
+
+		if (mci->pdev) {
+			edac_mc_del_mc(mci->pdev);
+			device_unregister(mci->pdev);
+		}
+
+		edac_mc_free(mci);
+	}
+
+	return rc;
+}
+
+static void remove_versalnet(struct mc_priv *priv)
+{
+	struct mem_ctl_info *mci;
+	int i;
+
+	for (i = 0; i < NUM_CONTROLLERS; i++) {
+		/* Controllers with an unknown bus width were never registered. */
+		if (!priv->mci[i])
+			continue;
+
+		mci = edac_mc_del_mc(priv->mci[i]->pdev);
+		device_unregister(priv->mci[i]->pdev);
+		if (!mci)
+			continue;
+
+		edac_mc_free(mci);
+	}
+}
+
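+/*
+ * mc_probe - Boot the RPU firmware and register the memory controllers.
+ *
+ * Attach to the remote processor referenced by the "amd,rproc" phandle,
+ * register the rpmsg driver that carries firmware error notifications,
+ * set up the MCDI transport and register one EDAC MC per DDR controller.
+ *
+ * Return: 0 on success, a negative error code otherwise.
+ */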
+static int mc_probe(struct platform_device *pdev)
+{
+	struct device_node *r5_core_node;
+	struct mc_priv *priv;
+	struct rproc *rp;
+	int rc;
+
+	r5_core_node = of_parse_phandle(pdev->dev.of_node, "amd,rproc", 0);
+	if (!r5_core_node) {
+		dev_err(&pdev->dev, "amd,rproc: invalid phandle\n");
+		return -EINVAL;
+	}
+
+	rp = rproc_get_by_phandle(r5_core_node->phandle);
+	of_node_put(r5_core_node);
+	if (!rp)
+		return -EPROBE_DEFER;
+
+	rc = rproc_boot(rp);
+	if (rc) {
+		dev_err(&pdev->dev, "Failed to attach to remote processor\n");
+		goto err_rproc_boot;
+	}
+
+	priv = devm_kzalloc(&pdev->dev, sizeof(*priv), GFP_KERNEL);
+	if (!priv) {
+		rc = -ENOMEM;
+		goto err_alloc;
+	}
+
+	/* Stash the private data where rpmsg_probe() can find it. */
+	amd_rpmsg_id_table[0].driver_data = (kernel_ulong_t)priv;
+
+	rc = register_rpmsg_driver(&amd_rpmsg_driver);
+	if (rc) {
+		edac_printk(KERN_ERR, EDAC_MC, "Failed to register RPMsg driver: %d\n", rc);
+		goto err_alloc;
+	}
+
+	rc = setup_mcdi(priv);
+	if (rc)
+		goto err_unreg;
+
+	priv->mcdi->r5_rproc = rp;
+
+	rc = init_versalnet(priv, pdev);
+	if (rc)
+		goto err_init;
+
+	return 0;
+
+err_init:
+	cdx_mcdi_finish(priv->mcdi);
+
+err_unreg:
+	unregister_rpmsg_driver(&amd_rpmsg_driver);
+
+err_alloc:
+	rproc_shutdown(rp);
+
+err_rproc_boot:
+	rproc_put(rp);
+
+	return rc;
+}
+
+static void mc_remove(struct platform_device *pdev)
+{
+	struct mc_priv *priv = platform_get_drvdata(pdev);
+
+	unregister_rpmsg_driver(&amd_rpmsg_driver);
+	remove_versalnet(priv);
+	rproc_shutdown(priv->mcdi->r5_rproc);
+	cdx_mcdi_finish(priv->mcdi);
+}
+
+static const struct of_device_id amd_edac_match[] = {
+	{ .compatible = "xlnx,versal-net-ddrmc5", },
+	{}
+};
+MODULE_DEVICE_TABLE(of, amd_edac_match);
+
+static struct platform_driver amd_ddr_edac_mc_driver = {
+	.driver = {
+		.name = "versal-net-edac",
+		.of_match_table = amd_edac_match,
+	},
+	.probe = mc_probe,
+	.remove = mc_remove,
+};
+
+module_platform_driver(amd_ddr_edac_mc_driver);
+
+MODULE_AUTHOR("AMD Inc");
+MODULE_DESCRIPTION("Versal NET EDAC driver");
+MODULE_LICENSE("GPL");