EDAC, sb_edac: Fix reporting for patrol scrubber errors

sb_edac sometimes reports the wrong DIMM for a memory error found by the patrol scrubber. That is because the hardware provides only a 4KB page-aligned address for the error case. This means that the EDAC driver will point at the DIMM matching offset 0x0 in the 4KB page, but because of interleaving across channels and ranks, the actual DIMM involved may be different if the error is on some other cache line within the page. Therefore, reconstruct the socket/iMC/channel information from the "mce" structure passed to the EDAC driver. The DIMM cannot be determined, so pass "dimm=-1" to the EDAC core. It will report that all the DIMMs on that channel may be affected. Signed-off-by: Qiuxu Zhuo <qiuxu.zhuo@intel.com> Cc: Aristeu Rozanski <aris@redhat.com> Cc: Mauro Carvalho Chehab <mchehab@kernel.org> Cc: Qiuxu Zhuo <qiuxu.zhuo@intel.com> Cc: linux-edac <linux-edac@vger.kernel.org> Link: http://lkml.kernel.org/r/20180907230828.13901-3-tony.luck@intel.com [ Improve comments on the functions to convert bank number to memory controller number. Minor cleanup to commit message. ] Signed-off-by: Tony Luck <tony.luck@intel.com> [ Massage commit message more. ] Signed-off-by: Borislav Petkov <bp@suse.de>
author: Qiuxu Zhuo <qiuxu.zhuo@intel.com> 2018-09-11 00:11:45 +0300
committer: Borislav Petkov <bp@suse.de> 2018-09-11 12:09:54 +0300
commit: 8489b17ce29d9a35a36c08bbea93cdce4c98a6ad (patch)
tree: 752d188b55996a1182fc587ab1399a71539924c7 /drivers/edac
parent: dcc960b225ceb2bd66c45e0845d03e577f7010f9 (diff)
download: linux-8489b17ce29d9a35a36c08bbea93cdce4c98a6ad.tar.xz
1 files changed, 110 insertions, 6 deletions
diff --git a/drivers/edac/sb_edac.c b/drivers/edac/sb_edac.c
index ee2e2e53935f..d4a10793f807 100644
--- a/drivers/edac/sb_edac.c
+++ b/drivers/edac/sb_edac.c
@@ -326,6 +326,7 @@ struct sbridge_info {
 	const struct interleave_pkg *interleave_pkg;
 	u8		max_sad;
 	u8		(*get_node_id)(struct sbridge_pvt *pvt);
+	u8		(*get_ha)(u8 bank);
 	enum mem_type	(*get_memory_type)(struct sbridge_pvt *pvt);
 	enum dev_type	(*get_width)(struct sbridge_pvt *pvt, u32 mtr);
 	struct pci_dev	*pci_vtd;
@@ -1002,6 +1003,39 @@ static u8 knl_get_node_id(struct sbridge_pvt *pvt)
 	return GET_BITFIELD(reg, 0, 2);
 }
 
+/*
+ * Use the reporting bank number to determine which memory
+ * controller (also known as "ha" for "home agent"). Sandy
+ * Bridge only has one memory controller per socket, so the
+ * answer is always zero.
+ */
+static u8 sbridge_get_ha(u8 bank)
+{
+	return 0;
+}
+
+/*
+ * On Ivy Bridge, Haswell and Broadwell the error may be in a
+ * home agent bank (7, 8), or one of the per-channel memory
+ * controller banks (9 .. 16).
+ */
+static u8 ibridge_get_ha(u8 bank)
+{
+	switch (bank) {
+	case 7 ... 8:
+		return bank - 7;
+	case 9 ... 16:
+		return (bank - 9) / 4;
+	default:
+		return -EINVAL;
+	}
+}
+
+/* Not used, but included for safety/symmetry */
+static u8 knl_get_ha(u8 bank)
+{
+	return -EINVAL;
+}
 
 static u64 haswell_get_tolm(struct sbridge_pvt *pvt)
 {
@@ -2207,6 +2241,60 @@ static int get_memory_error_data(struct mem_ctl_info *mci,
 	return 0;
 }
 
+static int get_memory_error_data_from_mce(struct mem_ctl_info *mci,
+					  const struct mce *m, u8 *socket,
+					  u8 *ha, long *channel_mask,
+					  char *msg)
+{
+	u32 reg, channel = GET_BITFIELD(m->status, 0, 3);
+	struct mem_ctl_info *new_mci;
+	struct sbridge_pvt *pvt;
+	struct pci_dev *pci_ha;
+	bool tad0;
+
+	if (channel >= NUM_CHANNELS) {
+		sprintf(msg, "Invalid channel 0x%x", channel);
+		return -EINVAL;
+	}
+
+	pvt = mci->pvt_info;
+	if (!pvt->info.get_ha) {
+		sprintf(msg, "No get_ha()");
+		return -EINVAL;
+	}
+	*ha = pvt->info.get_ha(m->bank);
+	if (*ha != 0 && *ha != 1) {
+		sprintf(msg, "Impossible bank %d", m->bank);
+		return -EINVAL;
+	}
+
+	*socket = m->socketid;
+	new_mci = get_mci_for_node_id(*socket, *ha);
+	if (!new_mci) {
+		strcpy(msg, "mci socket got corrupted!");
+		return -EINVAL;
+	}
+
+	pvt = new_mci->pvt_info;
+	pci_ha = pvt->pci_ha;
+	pci_read_config_dword(pci_ha, tad_dram_rule[0], &reg);
+	tad0 = m->addr <= TAD_LIMIT(reg);
+
+	*channel_mask = 1 << channel;
+	if (pvt->mirror_mode == FULL_MIRRORING ||
+	    (pvt->mirror_mode == ADDR_RANGE_MIRRORING && tad0)) {
+		*channel_mask |= 1 << ((channel + 2) % 4);
+		pvt->is_cur_addr_mirrored = true;
+	} else {
+		pvt->is_cur_addr_mirrored = false;
+	}
+
+	if (pvt->is_lockstep)
+		*channel_mask |= 1 << ((channel + 1) % 4);
+
+	return 0;
+}
+
 /****************************************************************************
 	Device initialization routines: put/get, init/exit
  ****************************************************************************/
@@ -2877,10 +2965,16 @@ static void sbridge_mce_output_error(struct mem_ctl_info *mci,
 	u32 errcode = GET_BITFIELD(m->status, 0, 15);
 	u32 channel = GET_BITFIELD(m->status, 0, 3);
 	u32 optypenum = GET_BITFIELD(m->status, 4, 6);
+	/*
+	 * Bits 5-0 of MCi_MISC give the least significant bit that is valid.
+	 * A value 6 is for cache line aligned address, a value 12 is for page
+	 * aligned address reported by patrol scrubber.
+	 */
+	u32 lsb = GET_BITFIELD(m->misc, 0, 5);
 	long channel_mask, first_channel;
-	u8  rank, socket, ha;
+	u8  rank = 0xff, socket, ha;
 	int rc, dimm;
-	char *area_type = NULL;
+	char *area_type = "DRAM";
 
 	if (pvt->info.type != SANDY_BRIDGE)
 		recoverable = true;
@@ -2964,9 +3058,13 @@ static void sbridge_mce_output_error(struct mem_ctl_info *mci,
 				optype, msg);
 		}
 		return;
-	} else {
+	} else if (lsb < 12) {
 		rc = get_memory_error_data(mci, m->addr, &socket, &ha,
-				&channel_mask, &rank, &area_type, msg);
+					   &channel_mask, &rank,
+					   &area_type, msg);
+	} else {
+		rc = get_memory_error_data_from_mce(mci, m, &socket, &ha,
+						    &channel_mask, msg);
 	}
 
 	if (rc < 0)
@@ -2981,14 +3079,15 @@ static void sbridge_mce_output_error(struct mem_ctl_info *mci,
 
 	first_channel = find_first_bit(&channel_mask, NUM_CHANNELS);
 
-	if (rank < 4)
+	if (rank == 0xff)
+		dimm = -1;
+	else if (rank < 4)
 		dimm = 0;
 	else if (rank < 8)
 		dimm = 1;
 	else
 		dimm = 2;
 
-
 	/*
 	 * FIXME: On some memory configurations (mirror, lockstep), the
 	 * Memory Controller can't point the error to a single DIMM. The
@@ -3175,6 +3274,7 @@ static int sbridge_register_mci(struct sbridge_dev *sbridge_dev, enum type type)
 		pvt->info.dram_rule = ibridge_dram_rule;
 		pvt->info.get_memory_type = get_memory_type;
 		pvt->info.get_node_id = get_node_id;
+		pvt->info.get_ha = ibridge_get_ha;
 		pvt->info.rir_limit = rir_limit;
 		pvt->info.sad_limit = sad_limit;
 		pvt->info.interleave_mode = interleave_mode;
@@ -3199,6 +3299,7 @@ static int sbridge_register_mci(struct sbridge_dev *sbridge_dev, enum type type)
 		pvt->info.dram_rule = sbridge_dram_rule;
 		pvt->info.get_memory_type = get_memory_type;
 		pvt->info.get_node_id = get_node_id;
+		pvt->info.get_ha = sbridge_get_ha;
 		pvt->info.rir_limit = rir_limit;
 		pvt->info.sad_limit = sad_limit;
 		pvt->info.interleave_mode = interleave_mode;
@@ -3223,6 +3324,7 @@ static int sbridge_register_mci(struct sbridge_dev *sbridge_dev, enum type type)
 		pvt->info.dram_rule = ibridge_dram_rule;
 		pvt->info.get_memory_type = haswell_get_memory_type;
 		pvt->info.get_node_id = haswell_get_node_id;
+		pvt->info.get_ha = ibridge_get_ha;
 		pvt->info.rir_limit = haswell_rir_limit;
 		pvt->info.sad_limit = sad_limit;
 		pvt->info.interleave_mode = interleave_mode;
@@ -3247,6 +3349,7 @@ static int sbridge_register_mci(struct sbridge_dev *sbridge_dev, enum type type)
 		pvt->info.dram_rule = ibridge_dram_rule;
 		pvt->info.get_memory_type = haswell_get_memory_type;
 		pvt->info.get_node_id = haswell_get_node_id;
+		pvt->info.get_ha = ibridge_get_ha;
 		pvt->info.rir_limit = haswell_rir_limit;
 		pvt->info.sad_limit = sad_limit;
 		pvt->info.interleave_mode = interleave_mode;
@@ -3271,6 +3374,7 @@ static int sbridge_register_mci(struct sbridge_dev *sbridge_dev, enum type type)
 		pvt->info.dram_rule = knl_dram_rule;
 		pvt->info.get_memory_type = knl_get_memory_type;
 		pvt->info.get_node_id = knl_get_node_id;
+		pvt->info.get_ha = knl_get_ha;
 		pvt->info.rir_limit = NULL;
 		pvt->info.sad_limit = knl_sad_limit;
 		pvt->info.interleave_mode = knl_interleave_mode;
author	Qiuxu Zhuo <qiuxu.zhuo@intel.com>	2018-09-11 00:11:45 +0300
committer	Borislav Petkov <bp@suse.de>	2018-09-11 12:09:54 +0300
commit	8489b17ce29d9a35a36c08bbea93cdce4c98a6ad (patch)
tree	752d188b55996a1182fc587ab1399a71539924c7 /drivers/edac
parent	dcc960b225ceb2bd66c45e0845d03e577f7010f9 (diff)
download	linux-8489b17ce29d9a35a36c08bbea93cdce4c98a6ad.tar.xz