aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorStephen Rothwell <sfr@canb.auug.org.au>2018-09-18 12:28:57 +1000
committerStephen Rothwell <sfr@canb.auug.org.au>2018-09-18 12:28:57 +1000
commit9d3df6f544639822714a04640a852550a1313161 (patch)
tree0381ed0fca44a445625e3edf8446a2c1ac2532f2
parent5e82ebe486e9d9764d376aeacd3a3d7647b4a996 (diff)
parentc968ed08594dadb788a93dc2bc128ed4ef35c93e (diff)
Merge remote-tracking branch 'edac-amd/for-next'
-rw-r--r--MAINTAINERS3
-rw-r--r--drivers/edac/amd64_edac.c14
-rw-r--r--drivers/edac/amd64_edac.h3
-rw-r--r--drivers/edac/sb_edac.c201
4 files changed, 171 insertions, 50 deletions
diff --git a/MAINTAINERS b/MAINTAINERS
index 8cdee9750561..35af312e885c 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -5258,7 +5258,8 @@ S: Maintained
F: drivers/edac/r82600_edac.c
EDAC-SBRIDGE
-M: Mauro Carvalho Chehab <mchehab@kernel.org>
+M: Tony Luck <tony.luck@intel.com>
+R: Qiuxu Zhuo <qiuxu.zhuo@intel.com>
L: linux-edac@vger.kernel.org
S: Maintained
F: drivers/edac/sb_edac.c
diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
index 18aeabb1d5ee..e2addb2bca29 100644
--- a/drivers/edac/amd64_edac.c
+++ b/drivers/edac/amd64_edac.c
@@ -2200,6 +2200,15 @@ static struct amd64_family_type family_types[] = {
.dbam_to_cs = f17_base_addr_to_cs_size,
}
},
+ [F17_M10H_CPUS] = {
+ .ctl_name = "F17h_M10h",
+ .f0_id = PCI_DEVICE_ID_AMD_17H_M10H_DF_F0,
+ .f6_id = PCI_DEVICE_ID_AMD_17H_M10H_DF_F6,
+ .ops = {
+ .early_channel_count = f17_early_channel_count,
+ .dbam_to_cs = f17_base_addr_to_cs_size,
+ }
+ },
};
/*
@@ -3188,6 +3197,11 @@ static struct amd64_family_type *per_family_init(struct amd64_pvt *pvt)
break;
case 0x17:
+ if (pvt->model >= 0x10 && pvt->model <= 0x2f) {
+ fam_type = &family_types[F17_M10H_CPUS];
+ pvt->ops = &family_types[F17_M10H_CPUS].ops;
+ break;
+ }
fam_type = &family_types[F17_CPUS];
pvt->ops = &family_types[F17_CPUS].ops;
break;
diff --git a/drivers/edac/amd64_edac.h b/drivers/edac/amd64_edac.h
index 1d4b74e9a037..4242f8e39c18 100644
--- a/drivers/edac/amd64_edac.h
+++ b/drivers/edac/amd64_edac.h
@@ -115,6 +115,8 @@
#define PCI_DEVICE_ID_AMD_16H_M30H_NB_F2 0x1582
#define PCI_DEVICE_ID_AMD_17H_DF_F0 0x1460
#define PCI_DEVICE_ID_AMD_17H_DF_F6 0x1466
+#define PCI_DEVICE_ID_AMD_17H_M10H_DF_F0 0x15e8
+#define PCI_DEVICE_ID_AMD_17H_M10H_DF_F6 0x15ee
/*
* Function 1 - Address Map
@@ -281,6 +283,7 @@ enum amd_families {
F16_CPUS,
F16_M30H_CPUS,
F17_CPUS,
+ F17_M10H_CPUS,
NUM_FAMILIES,
};
diff --git a/drivers/edac/sb_edac.c b/drivers/edac/sb_edac.c
index 07726fb00321..5aeb9a1ee898 100644
--- a/drivers/edac/sb_edac.c
+++ b/drivers/edac/sb_edac.c
@@ -326,6 +326,7 @@ struct sbridge_info {
const struct interleave_pkg *interleave_pkg;
u8 max_sad;
u8 (*get_node_id)(struct sbridge_pvt *pvt);
+ u8 (*get_ha)(u8 bank);
enum mem_type (*get_memory_type)(struct sbridge_pvt *pvt);
enum dev_type (*get_width)(struct sbridge_pvt *pvt, u32 mtr);
struct pci_dev *pci_vtd;
@@ -1002,6 +1003,39 @@ static u8 knl_get_node_id(struct sbridge_pvt *pvt)
return GET_BITFIELD(reg, 0, 2);
}
+/*
+ * Use the reporting bank number to determine which memory
+ * controller (also known as "ha" for "home agent"). Sandy
+ * Bridge only has one memory controller per socket, so the
+ * answer is always zero.
+ */
+static u8 sbridge_get_ha(u8 bank)
+{
+ return 0;
+}
+
+/*
+ * On Ivy Bridge, Haswell and Broadwell the error may be in a
+ * home agent bank (7, 8), or one of the per-channel memory
+ * controller banks (9 .. 16).
+ */
+static u8 ibridge_get_ha(u8 bank)
+{
+ switch (bank) {
+ case 7 ... 8:
+ return bank - 7;
+ case 9 ... 16:
+ return (bank - 9) / 4;
+ default:
+ return 0xff;
+ }
+}
+
+/* Not used, but included for safety/symmetry */
+static u8 knl_get_ha(u8 bank)
+{
+ return 0xff;
+}
static u64 haswell_get_tolm(struct sbridge_pvt *pvt)
{
@@ -2207,6 +2241,60 @@ static int get_memory_error_data(struct mem_ctl_info *mci,
return 0;
}
+static int get_memory_error_data_from_mce(struct mem_ctl_info *mci,
+ const struct mce *m, u8 *socket,
+ u8 *ha, long *channel_mask,
+ char *msg)
+{
+ u32 reg, channel = GET_BITFIELD(m->status, 0, 3);
+ struct mem_ctl_info *new_mci;
+ struct sbridge_pvt *pvt;
+ struct pci_dev *pci_ha;
+ bool tad0;
+
+ if (channel >= NUM_CHANNELS) {
+ sprintf(msg, "Invalid channel 0x%x", channel);
+ return -EINVAL;
+ }
+
+ pvt = mci->pvt_info;
+ if (!pvt->info.get_ha) {
+ sprintf(msg, "No get_ha()");
+ return -EINVAL;
+ }
+ *ha = pvt->info.get_ha(m->bank);
+ if (*ha != 0 && *ha != 1) {
+ sprintf(msg, "Impossible bank %d", m->bank);
+ return -EINVAL;
+ }
+
+ *socket = m->socketid;
+ new_mci = get_mci_for_node_id(*socket, *ha);
+ if (!new_mci) {
+ strcpy(msg, "mci socket got corrupted!");
+ return -EINVAL;
+ }
+
+ pvt = new_mci->pvt_info;
+ pci_ha = pvt->pci_ha;
+ pci_read_config_dword(pci_ha, tad_dram_rule[0], &reg);
+ tad0 = m->addr <= TAD_LIMIT(reg);
+
+ *channel_mask = 1 << channel;
+ if (pvt->mirror_mode == FULL_MIRRORING ||
+ (pvt->mirror_mode == ADDR_RANGE_MIRRORING && tad0)) {
+ *channel_mask |= 1 << ((channel + 2) % 4);
+ pvt->is_cur_addr_mirrored = true;
+ } else {
+ pvt->is_cur_addr_mirrored = false;
+ }
+
+ if (pvt->is_lockstep)
+ *channel_mask |= 1 << ((channel + 1) % 4);
+
+ return 0;
+}
+
/****************************************************************************
Device initialization routines: put/get, init/exit
****************************************************************************/
@@ -2877,10 +2965,16 @@ static void sbridge_mce_output_error(struct mem_ctl_info *mci,
u32 errcode = GET_BITFIELD(m->status, 0, 15);
u32 channel = GET_BITFIELD(m->status, 0, 3);
u32 optypenum = GET_BITFIELD(m->status, 4, 6);
+ /*
+ * Bits 5-0 of MCi_MISC give the least significant bit that is valid.
+ * A value 6 is for cache line aligned address, a value 12 is for page
+ * aligned address reported by patrol scrubber.
+ */
+ u32 lsb = GET_BITFIELD(m->misc, 0, 5);
long channel_mask, first_channel;
- u8 rank, socket, ha;
+ u8 rank = 0xff, socket, ha;
int rc, dimm;
- char *area_type = NULL;
+ char *area_type = "DRAM";
if (pvt->info.type != SANDY_BRIDGE)
recoverable = true;
@@ -2911,35 +3005,27 @@ static void sbridge_mce_output_error(struct mem_ctl_info *mci,
* cccc = channel
* If the mask doesn't match, report an error to the parsing logic
*/
- if (! ((errcode & 0xef80) == 0x80)) {
- optype = "Can't parse: it is not a mem";
- } else {
- switch (optypenum) {
- case 0:
- optype = "generic undef request error";
- break;
- case 1:
- optype = "memory read error";
- break;
- case 2:
- optype = "memory write error";
- break;
- case 3:
- optype = "addr/cmd error";
- break;
- case 4:
- optype = "memory scrubbing error";
- break;
- default:
- optype = "reserved";
- break;
- }
+ switch (optypenum) {
+ case 0:
+ optype = "generic undef request error";
+ break;
+ case 1:
+ optype = "memory read error";
+ break;
+ case 2:
+ optype = "memory write error";
+ break;
+ case 3:
+ optype = "addr/cmd error";
+ break;
+ case 4:
+ optype = "memory scrubbing error";
+ break;
+ default:
+ optype = "reserved";
+ break;
}
- /* Only decode errors with an valid address (ADDRV) */
- if (!GET_BITFIELD(m->status, 58, 58))
- return;
-
if (pvt->info.type == KNIGHTS_LANDING) {
if (channel == 14) {
edac_dbg(0, "%s%s err_code:%04x:%04x EDRAM bank %d\n",
@@ -2972,9 +3058,13 @@ static void sbridge_mce_output_error(struct mem_ctl_info *mci,
optype, msg);
}
return;
- } else {
+ } else if (lsb < 12) {
rc = get_memory_error_data(mci, m->addr, &socket, &ha,
- &channel_mask, &rank, &area_type, msg);
+ &channel_mask, &rank,
+ &area_type, msg);
+ } else {
+ rc = get_memory_error_data_from_mce(mci, m, &socket, &ha,
+ &channel_mask, msg);
}
if (rc < 0)
@@ -2989,14 +3079,15 @@ static void sbridge_mce_output_error(struct mem_ctl_info *mci,
first_channel = find_first_bit(&channel_mask, NUM_CHANNELS);
- if (rank < 4)
+ if (rank == 0xff)
+ dimm = -1;
+ else if (rank < 4)
dimm = 0;
else if (rank < 8)
dimm = 1;
else
dimm = 2;
-
/*
* FIXME: On some memory configurations (mirror, lockstep), the
* Memory Controller can't point the error to a single DIMM. The
@@ -3045,17 +3136,11 @@ static int sbridge_mce_check_error(struct notifier_block *nb, unsigned long val,
{
struct mce *mce = (struct mce *)data;
struct mem_ctl_info *mci;
- struct sbridge_pvt *pvt;
char *type;
if (edac_get_report_status() == EDAC_REPORTING_DISABLED)
return NOTIFY_DONE;
- mci = get_mci_for_node_id(mce->socketid, IMC0);
- if (!mci)
- return NOTIFY_DONE;
- pvt = mci->pvt_info;
-
/*
* Just let mcelog handle it if the error is
* outside the memory controller. A memory error
@@ -3065,6 +3150,22 @@ static int sbridge_mce_check_error(struct notifier_block *nb, unsigned long val,
if ((mce->status & 0xefff) >> 7 != 1)
return NOTIFY_DONE;
+ /* Check ADDRV bit in STATUS */
+ if (!GET_BITFIELD(mce->status, 58, 58))
+ return NOTIFY_DONE;
+
+ /* Check MISCV bit in STATUS */
+ if (!GET_BITFIELD(mce->status, 59, 59))
+ return NOTIFY_DONE;
+
+ /* Check address type in MISC (physical address only) */
+ if (GET_BITFIELD(mce->misc, 6, 8) != 2)
+ return NOTIFY_DONE;
+
+ mci = get_mci_for_node_id(mce->socketid, IMC0);
+ if (!mci)
+ return NOTIFY_DONE;
+
if (mce->mcgstatus & MCG_STATUS_MCIP)
type = "Exception";
else
@@ -3173,6 +3274,7 @@ static int sbridge_register_mci(struct sbridge_dev *sbridge_dev, enum type type)
pvt->info.dram_rule = ibridge_dram_rule;
pvt->info.get_memory_type = get_memory_type;
pvt->info.get_node_id = get_node_id;
+ pvt->info.get_ha = ibridge_get_ha;
pvt->info.rir_limit = rir_limit;
pvt->info.sad_limit = sad_limit;
pvt->info.interleave_mode = interleave_mode;
@@ -3197,6 +3299,7 @@ static int sbridge_register_mci(struct sbridge_dev *sbridge_dev, enum type type)
pvt->info.dram_rule = sbridge_dram_rule;
pvt->info.get_memory_type = get_memory_type;
pvt->info.get_node_id = get_node_id;
+ pvt->info.get_ha = sbridge_get_ha;
pvt->info.rir_limit = rir_limit;
pvt->info.sad_limit = sad_limit;
pvt->info.interleave_mode = interleave_mode;
@@ -3221,6 +3324,7 @@ static int sbridge_register_mci(struct sbridge_dev *sbridge_dev, enum type type)
pvt->info.dram_rule = ibridge_dram_rule;
pvt->info.get_memory_type = haswell_get_memory_type;
pvt->info.get_node_id = haswell_get_node_id;
+ pvt->info.get_ha = ibridge_get_ha;
pvt->info.rir_limit = haswell_rir_limit;
pvt->info.sad_limit = sad_limit;
pvt->info.interleave_mode = interleave_mode;
@@ -3245,6 +3349,7 @@ static int sbridge_register_mci(struct sbridge_dev *sbridge_dev, enum type type)
pvt->info.dram_rule = ibridge_dram_rule;
pvt->info.get_memory_type = haswell_get_memory_type;
pvt->info.get_node_id = haswell_get_node_id;
+ pvt->info.get_ha = ibridge_get_ha;
pvt->info.rir_limit = haswell_rir_limit;
pvt->info.sad_limit = sad_limit;
pvt->info.interleave_mode = interleave_mode;
@@ -3269,6 +3374,7 @@ static int sbridge_register_mci(struct sbridge_dev *sbridge_dev, enum type type)
pvt->info.dram_rule = knl_dram_rule;
pvt->info.get_memory_type = knl_get_memory_type;
pvt->info.get_node_id = knl_get_node_id;
+ pvt->info.get_ha = knl_get_ha;
pvt->info.rir_limit = NULL;
pvt->info.sad_limit = knl_sad_limit;
pvt->info.interleave_mode = knl_interleave_mode;
@@ -3320,17 +3426,14 @@ fail0:
return rc;
}
-#define ICPU(model, table) \
- { X86_VENDOR_INTEL, 6, model, 0, (unsigned long)&table }
-
static const struct x86_cpu_id sbridge_cpuids[] = {
- ICPU(INTEL_FAM6_SANDYBRIDGE_X, pci_dev_descr_sbridge_table),
- ICPU(INTEL_FAM6_IVYBRIDGE_X, pci_dev_descr_ibridge_table),
- ICPU(INTEL_FAM6_HASWELL_X, pci_dev_descr_haswell_table),
- ICPU(INTEL_FAM6_BROADWELL_X, pci_dev_descr_broadwell_table),
- ICPU(INTEL_FAM6_BROADWELL_XEON_D, pci_dev_descr_broadwell_table),
- ICPU(INTEL_FAM6_XEON_PHI_KNL, pci_dev_descr_knl_table),
- ICPU(INTEL_FAM6_XEON_PHI_KNM, pci_dev_descr_knl_table),
+ INTEL_CPU_FAM6(SANDYBRIDGE_X, pci_dev_descr_sbridge_table),
+ INTEL_CPU_FAM6(IVYBRIDGE_X, pci_dev_descr_ibridge_table),
+ INTEL_CPU_FAM6(HASWELL_X, pci_dev_descr_haswell_table),
+ INTEL_CPU_FAM6(BROADWELL_X, pci_dev_descr_broadwell_table),
+ INTEL_CPU_FAM6(BROADWELL_XEON_D, pci_dev_descr_broadwell_table),
+ INTEL_CPU_FAM6(XEON_PHI_KNL, pci_dev_descr_knl_table),
+ INTEL_CPU_FAM6(XEON_PHI_KNM, pci_dev_descr_knl_table),
{ }
};
MODULE_DEVICE_TABLE(x86cpu, sbridge_cpuids);