Skip to content

Commit f5c8a10

Browse files
author
Ingo Molnar
committed
Merge tag 'amd_severity' of git://git.kernel.org/pub/scm/linux/kernel/git/ras/ras into x86/ras
Pull RAS update from Borislav Petkov: "This has been long in the making - an AMD-specific MCE-severity grading function. And it is actually readable at a quick glance. Further error recovery actions will be based on its output. Patches tested on every relevant AMD family out there." Signed-off-by: Ingo Molnar <mingo@kernel.org>
2 parents c9ce871 + 43eaa2a commit f5c8a10

File tree

4 files changed

+85
-2
lines changed

4 files changed

+85
-2
lines changed

arch/x86/include/asm/mce.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,12 @@ struct mca_config {
116116
u32 rip_msr;
117117
};
118118

119+
/*
 * Vendor-specific machine check feature flags, packed into one 64-bit word.
 * A single global instance, mce_flags, is declared right after this struct.
 */
struct mce_vendor_flags {
120+
__u64 overflow_recov : 1, /* bit 0 of cpuid_ebx(0x80000007); also forced on by a family 0x15 quirk */
121+
__reserved_0 : 63; /* remaining bits, unused for now */
122+
};
123+
extern struct mce_vendor_flags mce_flags;
124+
119125
extern struct mca_config mca_cfg;
120126
extern void mce_register_decode_chain(struct notifier_block *nb);
121127
extern void mce_unregister_decode_chain(struct notifier_block *nb);
@@ -128,9 +134,11 @@ extern int mce_p5_enabled;
128134
#ifdef CONFIG_X86_MCE
/* Real entry points, built only when machine check support is configured in. */
129135
int mcheck_init(void);
130136
void mcheck_cpu_init(struct cpuinfo_x86 *c);
137+
void mcheck_vendor_init_severity(void);
131138
#else
/*
 * Empty inline stubs so callers need no #ifdef of their own when
 * CONFIG_X86_MCE is off; mcheck_init() then simply reports success (0).
 */
132139
static inline int mcheck_init(void) { return 0; }
133140
static inline void mcheck_cpu_init(struct cpuinfo_x86 *c) {}
141+
static inline void mcheck_vendor_init_severity(void) {}
134142
#endif
135143

136144
#ifdef CONFIG_X86_ANCIENT_MCE

arch/x86/kernel/cpu/mcheck/mce-internal.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ struct mce_bank {
2424
char attrname[ATTR_LEN]; /* attribute name */
2525
};
2626

27-
int mce_severity(struct mce *a, int tolerant, char **msg, bool is_excp);
27+
/*
 * Severity grading hook. This is a function pointer rather than a plain
 * function so that a vendor-specific grader can be installed at boot.
 */
extern int (*mce_severity)(struct mce *a, int tolerant, char **msg, bool is_excp);
2828
struct dentry *mce_get_debugfs_dir(void);
2929

3030
extern struct mce_bank *mce_banks;

arch/x86/kernel/cpu/mcheck/mce-severity.c

Lines changed: 66 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -186,7 +186,62 @@ static int error_context(struct mce *m)
186186
return ((m->cs & 3) == 3) ? IN_USER : IN_KERNEL;
187187
}
188188

189-
int mce_severity(struct mce *m, int tolerant, char **msg, bool is_excp)
189+
/*
190+
* See AMD Error Scope Hierarchy table in a newer BKDG. For example
191+
* 49125_15h_Models_30h-3Fh_BKDG.pdf, section "RAS Features"
*
* Grade the machine check record @m using the AMD rules below.
*
* Only m->status and m->mcgstatus are inspected here, together with the
* execution context reported by error_context(). @tolerant, @msg and
* @is_excp are not used by this grader; they are present so the signature
* matches the mce_severity function pointer. In particular, *msg is never
* written, so callers must not expect a message from this path.
*
* Returns one of:
*   MCE_PANIC_SEVERITY    - PCC set; or uncorrected with overflow_recov and
*                           RIPV clear while in kernel context; or
*                           uncorrected without overflow_recov and OVER set
*   MCE_AR_SEVERITY       - any other uncorrected error when overflow_recov
*                           is set
*   MCE_UC_SEVERITY       - uncorrected, no overflow_recov, no overflow
*   MCE_DEFERRED_SEVERITY - not uncorrected, DEFERRED set
*   MCE_KEEP_SEVERITY     - everything else (corrected errors)
192+
*/
193+
static int mce_severity_amd(struct mce *m, int tolerant, char **msg, bool is_excp)
194+
{
195+
enum context ctx = error_context(m);
196+
197+
/* Processor Context Corrupt, no need to fumble too much, die! */
198+
if (m->status & MCI_STATUS_PCC)
199+
return MCE_PANIC_SEVERITY;
200+
201+
if (m->status & MCI_STATUS_UC) {
202+
203+
/*
204+
* On older systems where overflow_recov flag is not present, we
205+
* should simply panic if an error overflow occurs. If
206+
* overflow_recov flag is present and set, then software can try
207+
* to at least kill process to prolong system operation.
208+
*/
209+
if (mce_flags.overflow_recov) {
210+
/*
* software can try to contain, except when RIPV is clear and the
* error was taken in kernel context: both nested conditions must
* hold for the panic below.
*/
211+
if (!(m->mcgstatus & MCG_STATUS_RIPV))
212+
if (ctx == IN_KERNEL)
213+
return MCE_PANIC_SEVERITY;
214+
215+
/* kill current process */
216+
return MCE_AR_SEVERITY;
217+
} else {
218+
/* at least one error was not logged */
219+
if (m->status & MCI_STATUS_OVER)
220+
return MCE_PANIC_SEVERITY;
221+
}
222+
223+
/*
224+
* For any other case, return MCE_UC_SEVERITY so that we log the
225+
* error and exit #MC handler.
*
* Only reachable when overflow_recov is clear and no overflow was
* recorded; the overflow_recov branch above always returns.
226+
*/
227+
return MCE_UC_SEVERITY;
228+
}
229+
230+
/*
231+
* deferred error: poll handler catches these and adds to mce_ring so
232+
* memory-failure can take recovery actions.
233+
*/
234+
if (m->status & MCI_STATUS_DEFERRED)
235+
return MCE_DEFERRED_SEVERITY;
236+
237+
/*
238+
* corrected error: poll handler catches these and passes responsibility
239+
* of decoding the error to EDAC
240+
*/
241+
return MCE_KEEP_SEVERITY;
242+
}
243+
244+
static int mce_severity_intel(struct mce *m, int tolerant, char **msg, bool is_excp)
190245
{
191246
enum exception excp = (is_excp ? EXCP_CONTEXT : NO_EXCP);
192247
enum context ctx = error_context(m);
@@ -216,6 +271,16 @@ int mce_severity(struct mce *m, int tolerant, char **msg, bool is_excp)
216271
}
217272
}
218273

274+
/*
 * Active severity grader, called through this pointer.
 * Default to mce_severity_intel; mcheck_vendor_init_severity() switches it
 * to the AMD grader when the boot CPU is an AMD part.
 */
275+
int (*mce_severity)(struct mce *m, int tolerant, char **msg, bool is_excp) =
276+
mce_severity_intel;
277+
278+
/*
 * Pick the severity grader for this machine, based on the boot CPU's vendor.
 * AMD gets mce_severity_amd; every other vendor keeps whatever mce_severity
 * already points at. Init-only code, invoked from mcheck_init().
 */
void __init mcheck_vendor_init_severity(void)
279+
{
280+
if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
281+
mce_severity = mce_severity_amd;
282+
}
283+
219284
#ifdef CONFIG_DEBUG_FS
220285
static void *s_start(struct seq_file *f, loff_t *pos)
221286
{

arch/x86/kernel/cpu/mcheck/mce.c

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@ static DEFINE_MUTEX(mce_chrdev_read_mutex);
6464
DEFINE_PER_CPU(unsigned, mce_exception_count);
6565

6666
struct mce_bank *mce_banks __read_mostly;
67+
/* Vendor feature flags: filled in during per-CPU MCE setup, read by the AMD severity grader. */
struct mce_vendor_flags mce_flags __read_mostly;
6768

6869
struct mca_config mca_cfg __read_mostly = {
6970
.bootlog = -1,
@@ -1534,6 +1535,13 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
15341535
if (c->x86 == 6 && cfg->banks > 0)
15351536
mce_banks[0].ctl = 0;
15361537

1538+
/*
1539+
* overflow_recov is supported for F15h Models 00h-0fh
1540+
* even though we don't have a CPUID bit for it.
1541+
*/
1542+
if (c->x86 == 0x15 && c->x86_model <= 0xf)
1543+
mce_flags.overflow_recov = 1;
1544+
15371545
/*
15381546
* Turn off MC4_MISC thresholding banks on those models since
15391547
* they're not supported there.
@@ -1633,6 +1641,7 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
16331641
break;
16341642
case X86_VENDOR_AMD:
16351643
mce_amd_feature_init(c);
1644+
mce_flags.overflow_recov = cpuid_ebx(0x80000007) & 0x1;
16361645
break;
16371646
default:
16381647
break;
@@ -2017,6 +2026,7 @@ __setup("mce", mcheck_enable);
20172026
/*
 * Init-time machine check setup: runs mcheck_intel_therm_init() and then
 * selects the vendor-specific severity grader. Always returns 0.
 */
int __init mcheck_init(void)
20182027
{
20192028
mcheck_intel_therm_init();
2029+
/* Choose the AMD or default (Intel) grader based on the boot CPU. */
mcheck_vendor_init_severity();
20202030

20212031
return 0;
20222032
}

0 commit comments

Comments
 (0)