From 8960de4a5ca7980ed1e19e7ca5a774d3b7a55c38 Mon Sep 17 00:00:00 2001 From: Michael Jin Date: Thu, 16 Aug 2018 15:28:40 -0400 Subject: EDAC, amd64: Add Family 17h, models 10h-2fh support Add new device IDs for family 17h, models 10h-2fh. This is required by amd64_edac_mod in order to properly detect PCI device functions 0 and 6. Signed-off-by: Michael Jin Reviewed-by: Yazen Ghannam Cc: Link: http://lkml.kernel.org/r/20180816192840.31166-1-mikhail.jin@gmail.com Signed-off-by: Borislav Petkov --- drivers/edac/amd64_edac.c | 14 ++++++++++++++ drivers/edac/amd64_edac.h | 3 +++ 2 files changed, 17 insertions(+) diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index 18aeabb1d5ee..e2addb2bca29 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -2200,6 +2200,15 @@ static struct amd64_family_type family_types[] = { .dbam_to_cs = f17_base_addr_to_cs_size, } }, + [F17_M10H_CPUS] = { + .ctl_name = "F17h_M10h", + .f0_id = PCI_DEVICE_ID_AMD_17H_M10H_DF_F0, + .f6_id = PCI_DEVICE_ID_AMD_17H_M10H_DF_F6, + .ops = { + .early_channel_count = f17_early_channel_count, + .dbam_to_cs = f17_base_addr_to_cs_size, + } + }, }; /* @@ -3188,6 +3197,11 @@ static struct amd64_family_type *per_family_init(struct amd64_pvt *pvt) break; case 0x17: + if (pvt->model >= 0x10 && pvt->model <= 0x2f) { + fam_type = &family_types[F17_M10H_CPUS]; + pvt->ops = &family_types[F17_M10H_CPUS].ops; + break; + } fam_type = &family_types[F17_CPUS]; pvt->ops = &family_types[F17_CPUS].ops; break; diff --git a/drivers/edac/amd64_edac.h b/drivers/edac/amd64_edac.h index 1d4b74e9a037..4242f8e39c18 100644 --- a/drivers/edac/amd64_edac.h +++ b/drivers/edac/amd64_edac.h @@ -115,6 +115,8 @@ #define PCI_DEVICE_ID_AMD_16H_M30H_NB_F2 0x1582 #define PCI_DEVICE_ID_AMD_17H_DF_F0 0x1460 #define PCI_DEVICE_ID_AMD_17H_DF_F6 0x1466 +#define PCI_DEVICE_ID_AMD_17H_M10H_DF_F0 0x15e8 +#define PCI_DEVICE_ID_AMD_17H_M10H_DF_F6 0x15ee /* * Function 1 - Address Map @@ -281,6 +283,7 @@ enum amd_families { F16_CPUS, F16_M30H_CPUS, F17_CPUS, + F17_M10H_CPUS, NUM_FAMILIES, }; -- cgit v1.2.3 From bd1852317ffea39ed970f942259be82713bbf8d4 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 31 Aug 2018 11:23:41 +0300 Subject: EDAC: Get rid of custom ICPU() macro Replace custom grown macro with generic INTEL_CPU_FAM6() one. No functional change intended. Signed-off-by: Andy Shevchenko Cc: linux-edac Link: http://lkml.kernel.org/r/20180831082341.72363-1-andriy.shevchenko@linux.intel.com Signed-off-by: Borislav Petkov --- drivers/edac/sb_edac.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/drivers/edac/sb_edac.c b/drivers/edac/sb_edac.c index 07726fb00321..6f3b763af7cd 100644 --- a/drivers/edac/sb_edac.c +++ b/drivers/edac/sb_edac.c @@ -3320,17 +3320,14 @@ fail0: return rc; } -#define ICPU(model, table) \ - { X86_VENDOR_INTEL, 6, model, 0, (unsigned long)&table } - static const struct x86_cpu_id sbridge_cpuids[] = { - ICPU(INTEL_FAM6_SANDYBRIDGE_X, pci_dev_descr_sbridge_table), - ICPU(INTEL_FAM6_IVYBRIDGE_X, pci_dev_descr_ibridge_table), - ICPU(INTEL_FAM6_HASWELL_X, pci_dev_descr_haswell_table), - ICPU(INTEL_FAM6_BROADWELL_X, pci_dev_descr_broadwell_table), - ICPU(INTEL_FAM6_BROADWELL_XEON_D, pci_dev_descr_broadwell_table), - ICPU(INTEL_FAM6_XEON_PHI_KNL, pci_dev_descr_knl_table), - ICPU(INTEL_FAM6_XEON_PHI_KNM, pci_dev_descr_knl_table), + INTEL_CPU_FAM6(SANDYBRIDGE_X, pci_dev_descr_sbridge_table), + INTEL_CPU_FAM6(IVYBRIDGE_X, pci_dev_descr_ibridge_table), + INTEL_CPU_FAM6(HASWELL_X, pci_dev_descr_haswell_table), + INTEL_CPU_FAM6(BROADWELL_X, pci_dev_descr_broadwell_table), + INTEL_CPU_FAM6(BROADWELL_XEON_D, pci_dev_descr_broadwell_table), + INTEL_CPU_FAM6(XEON_PHI_KNL, pci_dev_descr_knl_table), + INTEL_CPU_FAM6(XEON_PHI_KNM, pci_dev_descr_knl_table), { } }; MODULE_DEVICE_TABLE(x86cpu, sbridge_cpuids); -- cgit v1.2.3 From 528d132c86a1f4be68e431edb247dae58f278160 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Tue, 4 Sep 2018 14:07:57 -0700 Subject: MAINTAINERS: Update maintainer for drivers/edac/sb_edac.c Change maintainer of sb_edac. Suggested-by: Borislav Petkov Signed-off-by: Tony Luck Cc: Mauro Carvalho Chehab Cc: Mauro Carvalho Chehab Cc: Qiuxu Zhuo Cc: linux-edac Link: http://lkml.kernel.org/r/20180904210759.3814-2-tony.luck@intel.com Signed-off-by: Borislav Petkov --- MAINTAINERS | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index a5b256b25905..7b3b73615101 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5329,7 +5329,8 @@ S: Maintained F: drivers/edac/r82600_edac.c EDAC-SBRIDGE -M: Mauro Carvalho Chehab +M: Tony Luck +R: Qiuxu Zhuo L: linux-edac@vger.kernel.org S: Maintained F: drivers/edac/sb_edac.c -- cgit v1.2.3 From dcc960b225ceb2bd66c45e0845d03e577f7010f9 Mon Sep 17 00:00:00 2001 From: Qiuxu Zhuo Date: Fri, 7 Sep 2018 16:08:27 -0700 Subject: EDAC, sb_edac: Return early on ADDRV bit and address type test Users of the mce_register_decode_chain() are called for every logged error. EDAC drivers should check: 1) Is this a memory error? [bit 7 in status register] 2) Is there a valid address? [bit 58 in status register] 3) Is the address a system address? [bitfield 8:6 in misc register] The sb_edac driver performed test "1" twice. Waited far too long to perform check "2". Didn't do check "3" at all. Fix it by moving the test for valid address from sbridge_mce_output_error() into sbridge_mce_check_error() and add a test for the type immediately after. Delete the redundant check for the type of the error from sbridge_mce_output_error(). Signed-off-by: Qiuxu Zhuo Cc: Aristeu Rozanski Cc: Mauro Carvalho Chehab Cc: Qiuxu Zhuo Cc: linux-edac Link: http://lkml.kernel.org/r/20180907230828.13901-2-tony.luck@intel.com [ Re-word commit message. ] Signed-off-by: Tony Luck Signed-off-by: Borislav Petkov --- drivers/edac/sb_edac.c | 68 ++++++++++++++++++++++++++------------------------ 1 file changed, 35 insertions(+), 33 deletions(-) diff --git a/drivers/edac/sb_edac.c b/drivers/edac/sb_edac.c index 6f3b763af7cd..ee2e2e53935f 100644 --- a/drivers/edac/sb_edac.c +++ b/drivers/edac/sb_edac.c @@ -2911,35 +2911,27 @@ static void sbridge_mce_output_error(struct mem_ctl_info *mci, * cccc = channel * If the mask doesn't match, report an error to the parsing logic */ - if (! ((errcode & 0xef80) == 0x80)) { - optype = "Can't parse: it is not a mem"; - } else { - switch (optypenum) { - case 0: - optype = "generic undef request error"; - break; - case 1: - optype = "memory read error"; - break; - case 2: - optype = "memory write error"; - break; - case 3: - optype = "addr/cmd error"; - break; - case 4: - optype = "memory scrubbing error"; - break; - default: - optype = "reserved"; - break; - } + switch (optypenum) { + case 0: + optype = "generic undef request error"; + break; + case 1: + optype = "memory read error"; + break; + case 2: + optype = "memory write error"; + break; + case 3: + optype = "addr/cmd error"; + break; + case 4: + optype = "memory scrubbing error"; + break; + default: + optype = "reserved"; + break; } - /* Only decode errors with an valid address (ADDRV) */ - if (!GET_BITFIELD(m->status, 58, 58)) - return; - if (pvt->info.type == KNIGHTS_LANDING) { if (channel == 14) { edac_dbg(0, "%s%s err_code:%04x:%04x EDRAM bank %d\n", @@ -3045,17 +3037,11 @@ static int sbridge_mce_check_error(struct notifier_block *nb, unsigned long val, { struct mce *mce = (struct mce *)data; struct mem_ctl_info *mci; - struct sbridge_pvt *pvt; char *type; if (edac_get_report_status() == EDAC_REPORTING_DISABLED) return NOTIFY_DONE; - mci = get_mci_for_node_id(mce->socketid, IMC0); - if (!mci) - return NOTIFY_DONE; - pvt = mci->pvt_info; - /* * Just let mcelog handle it if the error is * outside the memory controller. A memory error @@ -3065,6 +3051,22 @@ static int sbridge_mce_check_error(struct notifier_block *nb, unsigned long val, if ((mce->status & 0xefff) >> 7 != 1) return NOTIFY_DONE; + /* Check ADDRV bit in STATUS */ + if (!GET_BITFIELD(mce->status, 58, 58)) + return NOTIFY_DONE; + + /* Check MISCV bit in STATUS */ + if (!GET_BITFIELD(mce->status, 59, 59)) + return NOTIFY_DONE; + + /* Check address type in MISC (physical address only) */ + if (GET_BITFIELD(mce->misc, 6, 8) != 2) + return NOTIFY_DONE; + + mci = get_mci_for_node_id(mce->socketid, IMC0); + if (!mci) + return NOTIFY_DONE; + if (mce->mcgstatus & MCG_STATUS_MCIP) type = "Exception"; else -- cgit v1.2.3 From 8489b17ce29d9a35a36c08bbea93cdce4c98a6ad Mon Sep 17 00:00:00 2001 From: Qiuxu Zhuo Date: Mon, 10 Sep 2018 14:11:45 -0700 Subject: EDAC, sb_edac: Fix reporting for patrol scrubber errors sb_edac sometimes reports the wrong DIMM for a memory error found by the patrol scrubber. That is because the hardware provides only a 4KB page-aligned address for the error case. This means that the EDAC driver will point at the DIMM matching offset 0x0 in the 4KB page, but because of interleaving across channels and ranks, the actual DIMM involved may be different if the error is on some other cache line within the page. Therefore, reconstruct the socket/iMC/channel information from the "mce" structure passed to the EDAC driver. The DIMM cannot be determined, so pass "dimm=-1" to the EDAC core. It will report that all the DIMMs on that channel may be affected. Signed-off-by: Qiuxu Zhuo Cc: Aristeu Rozanski Cc: Mauro Carvalho Chehab Cc: Qiuxu Zhuo Cc: linux-edac Link: http://lkml.kernel.org/r/20180907230828.13901-3-tony.luck@intel.com [ Improve comments on the functions to convert bank number to memory controller number. Minor cleanup to commit message. ] Signed-off-by: Tony Luck [ Massage commit message more. ] Signed-off-by: Borislav Petkov --- drivers/edac/sb_edac.c | 116 ++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 110 insertions(+), 6 deletions(-) diff --git a/drivers/edac/sb_edac.c b/drivers/edac/sb_edac.c index ee2e2e53935f..d4a10793f807 100644 --- a/drivers/edac/sb_edac.c +++ b/drivers/edac/sb_edac.c @@ -326,6 +326,7 @@ struct sbridge_info { const struct interleave_pkg *interleave_pkg; u8 max_sad; u8 (*get_node_id)(struct sbridge_pvt *pvt); + u8 (*get_ha)(u8 bank); enum mem_type (*get_memory_type)(struct sbridge_pvt *pvt); enum dev_type (*get_width)(struct sbridge_pvt *pvt, u32 mtr); struct pci_dev *pci_vtd; @@ -1002,6 +1003,39 @@ static u8 knl_get_node_id(struct sbridge_pvt *pvt) return GET_BITFIELD(reg, 0, 2); } +/* + * Use the reporting bank number to determine which memory + * controller (also known as "ha" for "home agent"). Sandy + * Bridge only has one memory controller per socket, so the + * answer is always zero. + */ +static u8 sbridge_get_ha(u8 bank) +{ + return 0; +} + +/* + * On Ivy Bridge, Haswell and Broadwell the error may be in a + * home agent bank (7, 8), or one of the per-channel memory + * controller banks (9 .. 16). + */ +static u8 ibridge_get_ha(u8 bank) +{ + switch (bank) { + case 7 ... 8: + return bank - 7; + case 9 ... 16: + return (bank - 9) / 4; + default: + return -EINVAL; + } +} + +/* Not used, but included for safety/symmetry */ +static u8 knl_get_ha(u8 bank) +{ + return -EINVAL; +} static u64 haswell_get_tolm(struct sbridge_pvt *pvt) { @@ -2207,6 +2241,60 @@ static int get_memory_error_data(struct mem_ctl_info *mci, return 0; } +static int get_memory_error_data_from_mce(struct mem_ctl_info *mci, + const struct mce *m, u8 *socket, + u8 *ha, long *channel_mask, + char *msg) +{ + u32 reg, channel = GET_BITFIELD(m->status, 0, 3); + struct mem_ctl_info *new_mci; + struct sbridge_pvt *pvt; + struct pci_dev *pci_ha; + bool tad0; + + if (channel >= NUM_CHANNELS) { + sprintf(msg, "Invalid channel 0x%x", channel); + return -EINVAL; + } + + pvt = mci->pvt_info; + if (!pvt->info.get_ha) { + sprintf(msg, "No get_ha()"); + return -EINVAL; + } + *ha = pvt->info.get_ha(m->bank); + if (*ha != 0 && *ha != 1) { + sprintf(msg, "Impossible bank %d", m->bank); + return -EINVAL; + } + + *socket = m->socketid; + new_mci = get_mci_for_node_id(*socket, *ha); + if (!new_mci) { + strcpy(msg, "mci socket got corrupted!"); + return -EINVAL; + } + + pvt = new_mci->pvt_info; + pci_ha = pvt->pci_ha; + pci_read_config_dword(pci_ha, tad_dram_rule[0], ®); + tad0 = m->addr <= TAD_LIMIT(reg); + + *channel_mask = 1 << channel; + if (pvt->mirror_mode == FULL_MIRRORING || + (pvt->mirror_mode == ADDR_RANGE_MIRRORING && tad0)) { + *channel_mask |= 1 << ((channel + 2) % 4); + pvt->is_cur_addr_mirrored = true; + } else { + pvt->is_cur_addr_mirrored = false; + } + + if (pvt->is_lockstep) + *channel_mask |= 1 << ((channel + 1) % 4); + + return 0; +} + /**************************************************************************** Device initialization routines: put/get, init/exit ****************************************************************************/ @@ -2877,10 +2965,16 @@ static void sbridge_mce_output_error(struct mem_ctl_info *mci, u32 errcode = GET_BITFIELD(m->status, 0, 15); u32 channel = GET_BITFIELD(m->status, 0, 3); u32 optypenum = GET_BITFIELD(m->status, 4, 6); + /* + * Bits 5-0 of MCi_MISC give the least significant bit that is valid. + * A value 6 is for cache line aligned address, a value 12 is for page + * aligned address reported by patrol scrubber. + */ + u32 lsb = GET_BITFIELD(m->misc, 0, 5); long channel_mask, first_channel; - u8 rank, socket, ha; + u8 rank = 0xff, socket, ha; int rc, dimm; - char *area_type = NULL; + char *area_type = "DRAM"; if (pvt->info.type != SANDY_BRIDGE) recoverable = true; @@ -2964,9 +3058,13 @@ static void sbridge_mce_output_error(struct mem_ctl_info *mci, optype, msg); } return; - } else { + } else if (lsb < 12) { rc = get_memory_error_data(mci, m->addr, &socket, &ha, - &channel_mask, &rank, &area_type, msg); + &channel_mask, &rank, + &area_type, msg); + } else { + rc = get_memory_error_data_from_mce(mci, m, &socket, &ha, + &channel_mask, msg); } if (rc < 0) @@ -2981,14 +3079,15 @@ static void sbridge_mce_output_error(struct mem_ctl_info *mci, first_channel = find_first_bit(&channel_mask, NUM_CHANNELS); - if (rank < 4) + if (rank == 0xff) + dimm = -1; + else if (rank < 4) dimm = 0; else if (rank < 8) dimm = 1; else dimm = 2; - /* * FIXME: On some memory configurations (mirror, lockstep), the * Memory Controller can't point the error to a single DIMM. The @@ -3175,6 +3274,7 @@ static int sbridge_register_mci(struct sbridge_dev *sbridge_dev, enum type type) pvt->info.dram_rule = ibridge_dram_rule; pvt->info.get_memory_type = get_memory_type; pvt->info.get_node_id = get_node_id; + pvt->info.get_ha = ibridge_get_ha; pvt->info.rir_limit = rir_limit; pvt->info.sad_limit = sad_limit; pvt->info.interleave_mode = interleave_mode; @@ -3199,6 +3299,7 @@ static int sbridge_register_mci(struct sbridge_dev *sbridge_dev, enum type type) pvt->info.dram_rule = sbridge_dram_rule; pvt->info.get_memory_type = get_memory_type; pvt->info.get_node_id = get_node_id; + pvt->info.get_ha = sbridge_get_ha; pvt->info.rir_limit = rir_limit; pvt->info.sad_limit = sad_limit; pvt->info.interleave_mode = interleave_mode; @@ -3223,6 +3324,7 @@ static int sbridge_register_mci(struct sbridge_dev *sbridge_dev, enum type type) pvt->info.dram_rule = ibridge_dram_rule; pvt->info.get_memory_type = haswell_get_memory_type; pvt->info.get_node_id = haswell_get_node_id; + pvt->info.get_ha = ibridge_get_ha; pvt->info.rir_limit = haswell_rir_limit; pvt->info.sad_limit = sad_limit; pvt->info.interleave_mode = interleave_mode; @@ -3247,6 +3349,7 @@ static int sbridge_register_mci(struct sbridge_dev *sbridge_dev, enum type type) pvt->info.dram_rule = ibridge_dram_rule; pvt->info.get_memory_type = haswell_get_memory_type; pvt->info.get_node_id = haswell_get_node_id; + pvt->info.get_ha = ibridge_get_ha; pvt->info.rir_limit = haswell_rir_limit; pvt->info.sad_limit = sad_limit; pvt->info.interleave_mode = interleave_mode; @@ -3271,6 +3374,7 @@ static int sbridge_register_mci(struct sbridge_dev *sbridge_dev, enum type type) pvt->info.dram_rule = knl_dram_rule; pvt->info.get_memory_type = knl_get_memory_type; pvt->info.get_node_id = knl_get_node_id; + pvt->info.get_ha = knl_get_ha; pvt->info.rir_limit = NULL; pvt->info.sad_limit = knl_sad_limit; pvt->info.interleave_mode = knl_interleave_mode; -- cgit v1.2.3 From c968ed08594dadb788a93dc2bc128ed4ef35c93e Mon Sep 17 00:00:00 2001 From: "Luck, Tony" Date: Fri, 14 Sep 2018 13:19:05 -0700 Subject: EDAC, sb_edac: Fix signedness bugs in *_get_ha() functions A static checker gave the following warnings: drivers/edac/sb_edac.c:1030 ibridge_get_ha() warn: signedness bug returning '(-22)' drivers/edac/sb_edac.c:1037 knl_get_ha() warn: signedness bug returning '(-22)' Both because the functions are declared to return a "u8", but try to return -EINVAL for the error case. Fix by returning 0xff (since the caller doesn't look at, or pass on, the return value). Reported-by: Dan Carpenter Signed-off-by: Tony Luck Cc: Qiuxu Zhuo Cc: linux-edac Link: http://lkml.kernel.org/r/20180914201905.GA30946@agluck-desk Signed-off-by: Borislav Petkov --- drivers/edac/sb_edac.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/edac/sb_edac.c b/drivers/edac/sb_edac.c index d4a10793f807..5aeb9a1ee898 100644 --- a/drivers/edac/sb_edac.c +++ b/drivers/edac/sb_edac.c @@ -1027,14 +1027,14 @@ static u8 ibridge_get_ha(u8 bank) case 9 ... 16: return (bank - 9) / 4; default: - return -EINVAL; + return 0xff; } } /* Not used, but included for safety/symmetry */ static u8 knl_get_ha(u8 bank) { - return -EINVAL; + return 0xff; } static u64 haswell_get_tolm(struct sbridge_pvt *pvt) -- cgit v1.2.3