Skip to content

Commit 8489b17

Browse files
qzhuo2 authored and suryasaimadhu committed
EDAC, sb_edac: Fix reporting for patrol scrubber errors
sb_edac sometimes reports the wrong DIMM for a memory error found by the patrol scrubber. That is because the hardware provides only a 4KB page-aligned address for the error case. This means that the EDAC driver will point at the DIMM matching offset 0x0 in the 4KB page, but because of interleaving across channels and ranks, the actual DIMM involved may be different if the error is on some other cache line within the page.

Therefore, reconstruct the socket/iMC/channel information from the "mce" structure passed to the EDAC driver. The DIMM cannot be determined, so pass "dimm=-1" to the EDAC core. It will report that all the DIMMs on that channel may be affected.

Signed-off-by: Qiuxu Zhuo <qiuxu.zhuo@intel.com>
Cc: Aristeu Rozanski <aris@redhat.com>
Cc: Mauro Carvalho Chehab <mchehab@kernel.org>
Cc: Qiuxu Zhuo <qiuxu.zhuo@intel.com>
Cc: linux-edac <linux-edac@vger.kernel.org>
Link: http://lkml.kernel.org/r/20180907230828.13901-3-tony.luck@intel.com
[ Improve comments on the functions to convert bank number to memory controller number. Minor cleanup to commit message. ]
Signed-off-by: Tony Luck <tony.luck@intel.com>
[ Massage commit message more. ]
Signed-off-by: Borislav Petkov <bp@suse.de>
1 parent dcc960b commit 8489b17

File tree

1 file changed

+110
-6
lines changed

1 file changed

+110
-6
lines changed

drivers/edac/sb_edac.c

Lines changed: 110 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -326,6 +326,7 @@ struct sbridge_info {
326326
const struct interleave_pkg *interleave_pkg;
327327
u8 max_sad;
328328
u8 (*get_node_id)(struct sbridge_pvt *pvt);
329+
u8 (*get_ha)(u8 bank);
329330
enum mem_type (*get_memory_type)(struct sbridge_pvt *pvt);
330331
enum dev_type (*get_width)(struct sbridge_pvt *pvt, u32 mtr);
331332
struct pci_dev *pci_vtd;
@@ -1002,6 +1003,39 @@ static u8 knl_get_node_id(struct sbridge_pvt *pvt)
10021003
return GET_BITFIELD(reg, 0, 2);
10031004
}
10041005

1006+
/*
1007+
* Use the reporting bank number to determine which memory
1008+
* controller (also known as "ha" for "home agent"). Sandy
1009+
* Bridge only has one memory controller per socket, so the
1010+
* answer is always zero.
1011+
*/
1012+
static u8 sbridge_get_ha(u8 bank)
1013+
{
1014+
return 0;
1015+
}
1016+
1017+
/*
1018+
* On Ivy Bridge, Haswell and Broadwell the error may be in a
1019+
* home agent bank (7, 8), or one of the per-channel memory
1020+
* controller banks (9 .. 16).
1021+
*/
1022+
static u8 ibridge_get_ha(u8 bank)
1023+
{
1024+
switch (bank) {
1025+
case 7 ... 8:
1026+
return bank - 7;
1027+
case 9 ... 16:
1028+
return (bank - 9) / 4;
1029+
default:
1030+
return -EINVAL;
1031+
}
1032+
}
1033+
1034+
/* Not used, but included for safety/symmetry */
1035+
static u8 knl_get_ha(u8 bank)
1036+
{
1037+
return -EINVAL;
1038+
}
10051039

10061040
static u64 haswell_get_tolm(struct sbridge_pvt *pvt)
10071041
{
@@ -2207,6 +2241,60 @@ static int get_memory_error_data(struct mem_ctl_info *mci,
22072241
return 0;
22082242
}
22092243

2244+
/*
 * Work out socket, home agent and channel mask for a memory error from
 * the fields of the machine check record itself, rather than by decoding
 * the reported address.
 *
 * @mci:          memory controller the error was delivered to
 * @m:            machine check record; status bits 3:0 supply the channel,
 *                bank selects the home agent, socketid the socket
 * @socket:       out, set to m->socketid once the home agent is validated
 * @ha:           out, result of the per-model get_ha() hook
 * @channel_mask: out, bit mask of the channel(s) that may be affected
 * @msg:          out, receives a description when an error is returned
 *
 * Returns 0 on success. Returns -EINVAL with @msg filled in when the
 * channel is out of range, no get_ha() hook is installed, the hook yields
 * something other than 0 or 1, or no mci exists for the socket/ha pair.
 *
 * Side effect: updates is_cur_addr_mirrored in the private data of the
 * mci that owns the socket/ha pair.
 */
static int get_memory_error_data_from_mce(struct mem_ctl_info *mci,
					  const struct mce *m, u8 *socket,
					  u8 *ha, long *channel_mask,
					  char *msg)
{
	u32 chan = GET_BITFIELD(m->status, 0, 3);
	struct mem_ctl_info *owner_mci;
	struct sbridge_pvt *pvt;
	u32 tad_rule;
	bool in_tad0, mirrored;
	long mask;

	if (channel_is_invalid_guard:
	    chan >= NUM_CHANNELS) {
		sprintf(msg, "Invalid channel 0x%x", chan);
		return -EINVAL;
	}

	pvt = mci->pvt_info;
	if (!pvt->info.get_ha) {
		sprintf(msg, "No get_ha()");
		return -EINVAL;
	}

	/* Only home agents 0 and 1 are accepted. */
	*ha = pvt->info.get_ha(m->bank);
	if (*ha != 0 && *ha != 1) {
		sprintf(msg, "Impossible bank %d", m->bank);
		return -EINVAL;
	}

	/* From here on use the mci that owns this socket/ha pair. */
	*socket = m->socketid;
	owner_mci = get_mci_for_node_id(*socket, *ha);
	if (!owner_mci) {
		strcpy(msg, "mci socket got corrupted!");
		return -EINVAL;
	}
	pvt = owner_mci->pvt_info;

	/* Is the address at or below the limit of the first TAD rule? */
	pci_read_config_dword(pvt->pci_ha, tad_dram_rule[0], &tad_rule);
	in_tad0 = m->addr <= TAD_LIMIT(tad_rule);

	mirrored = pvt->mirror_mode == FULL_MIRRORING ||
		   (pvt->mirror_mode == ADDR_RANGE_MIRRORING && in_tad0);

	mask = 1 << chan;

	/* A mirrored address also implicates the channel two positions on. */
	if (mirrored)
		mask |= 1 << ((chan + 2) % 4);
	pvt->is_cur_addr_mirrored = mirrored;

	/* In lockstep the next channel is implicated as well. */
	if (pvt->is_lockstep)
		mask |= 1 << ((chan + 1) % 4);

	*channel_mask = mask;

	return 0;
}
2297+
22102298
/****************************************************************************
22112299
Device initialization routines: put/get, init/exit
22122300
****************************************************************************/
@@ -2877,10 +2965,16 @@ static void sbridge_mce_output_error(struct mem_ctl_info *mci,
28772965
u32 errcode = GET_BITFIELD(m->status, 0, 15);
28782966
u32 channel = GET_BITFIELD(m->status, 0, 3);
28792967
u32 optypenum = GET_BITFIELD(m->status, 4, 6);
2968+
/*
2969+
* Bits 5-0 of MCi_MISC give the least significant bit that is valid.
2970+
* A value 6 is for cache line aligned address, a value 12 is for page
2971+
* aligned address reported by patrol scrubber.
2972+
*/
2973+
u32 lsb = GET_BITFIELD(m->misc, 0, 5);
28802974
long channel_mask, first_channel;
2881-
u8 rank, socket, ha;
2975+
u8 rank = 0xff, socket, ha;
28822976
int rc, dimm;
2883-
char *area_type = NULL;
2977+
char *area_type = "DRAM";
28842978

28852979
if (pvt->info.type != SANDY_BRIDGE)
28862980
recoverable = true;
@@ -2964,9 +3058,13 @@ static void sbridge_mce_output_error(struct mem_ctl_info *mci,
29643058
optype, msg);
29653059
}
29663060
return;
2967-
} else {
3061+
} else if (lsb < 12) {
29683062
rc = get_memory_error_data(mci, m->addr, &socket, &ha,
2969-
&channel_mask, &rank, &area_type, msg);
3063+
&channel_mask, &rank,
3064+
&area_type, msg);
3065+
} else {
3066+
rc = get_memory_error_data_from_mce(mci, m, &socket, &ha,
3067+
&channel_mask, msg);
29703068
}
29713069

29723070
if (rc < 0)
@@ -2981,14 +3079,15 @@ static void sbridge_mce_output_error(struct mem_ctl_info *mci,
29813079

29823080
first_channel = find_first_bit(&channel_mask, NUM_CHANNELS);
29833081

2984-
if (rank < 4)
3082+
if (rank == 0xff)
3083+
dimm = -1;
3084+
else if (rank < 4)
29853085
dimm = 0;
29863086
else if (rank < 8)
29873087
dimm = 1;
29883088
else
29893089
dimm = 2;
29903090

2991-
29923091
/*
29933092
* FIXME: On some memory configurations (mirror, lockstep), the
29943093
* Memory Controller can't point the error to a single DIMM. The
@@ -3175,6 +3274,7 @@ static int sbridge_register_mci(struct sbridge_dev *sbridge_dev, enum type type)
31753274
pvt->info.dram_rule = ibridge_dram_rule;
31763275
pvt->info.get_memory_type = get_memory_type;
31773276
pvt->info.get_node_id = get_node_id;
3277+
pvt->info.get_ha = ibridge_get_ha;
31783278
pvt->info.rir_limit = rir_limit;
31793279
pvt->info.sad_limit = sad_limit;
31803280
pvt->info.interleave_mode = interleave_mode;
@@ -3199,6 +3299,7 @@ static int sbridge_register_mci(struct sbridge_dev *sbridge_dev, enum type type)
31993299
pvt->info.dram_rule = sbridge_dram_rule;
32003300
pvt->info.get_memory_type = get_memory_type;
32013301
pvt->info.get_node_id = get_node_id;
3302+
pvt->info.get_ha = sbridge_get_ha;
32023303
pvt->info.rir_limit = rir_limit;
32033304
pvt->info.sad_limit = sad_limit;
32043305
pvt->info.interleave_mode = interleave_mode;
@@ -3223,6 +3324,7 @@ static int sbridge_register_mci(struct sbridge_dev *sbridge_dev, enum type type)
32233324
pvt->info.dram_rule = ibridge_dram_rule;
32243325
pvt->info.get_memory_type = haswell_get_memory_type;
32253326
pvt->info.get_node_id = haswell_get_node_id;
3327+
pvt->info.get_ha = ibridge_get_ha;
32263328
pvt->info.rir_limit = haswell_rir_limit;
32273329
pvt->info.sad_limit = sad_limit;
32283330
pvt->info.interleave_mode = interleave_mode;
@@ -3247,6 +3349,7 @@ static int sbridge_register_mci(struct sbridge_dev *sbridge_dev, enum type type)
32473349
pvt->info.dram_rule = ibridge_dram_rule;
32483350
pvt->info.get_memory_type = haswell_get_memory_type;
32493351
pvt->info.get_node_id = haswell_get_node_id;
3352+
pvt->info.get_ha = ibridge_get_ha;
32503353
pvt->info.rir_limit = haswell_rir_limit;
32513354
pvt->info.sad_limit = sad_limit;
32523355
pvt->info.interleave_mode = interleave_mode;
@@ -3271,6 +3374,7 @@ static int sbridge_register_mci(struct sbridge_dev *sbridge_dev, enum type type)
32713374
pvt->info.dram_rule = knl_dram_rule;
32723375
pvt->info.get_memory_type = knl_get_memory_type;
32733376
pvt->info.get_node_id = knl_get_node_id;
3377+
pvt->info.get_ha = knl_get_ha;
32743378
pvt->info.rir_limit = NULL;
32753379
pvt->info.sad_limit = knl_sad_limit;
32763380
pvt->info.interleave_mode = knl_interleave_mode;

0 commit comments

Comments
 (0)