Skip to content

Commit bf80bbd

Browse files
Aravind Gopalakrishnan authored and suryasaimadhu committed
x86/mce: Add an AMD severities-grading function
Add a severities function that caters to AMD processors. This allows us to do some vendor-specific work within the function if necessary. Also, introduce a vendor flag bitfield for vendor-specific settings. The severities code uses this to define error scope based on the presence of the flags field. This is based off of work by Boris Petkov. Testing details: Fam10h, Model 9h (Greyhound) Fam15h: Models 0h-0fh (Orochi), 30h-3fh (Kaveri) and 60h-6fh (Carrizo), Fam16h Model 00h-0fh (Kabini) Boris: Intel SNB AMD K8 (JH-E0) Signed-off-by: Aravind Gopalakrishnan <aravind.gopalakrishnan@amd.com> Acked-by: Tony Luck <tony.luck@intel.com> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Ingo Molnar <mingo@kernel.org> Cc: H. Peter Anvin <hpa@zytor.com> Cc: Chen Yucong <slaoub@gmail.com> Cc: Andy Lutomirski <luto@amacapital.net> Cc: linux-edac@vger.kernel.org Link: http://lkml.kernel.org/r/1427125373-2918-2-git-send-email-Aravind.Gopalakrishnan@amd.com [ Fixup build, clean up comments. ] Signed-off-by: Borislav Petkov <bp@suse.de>
1 parent c9ce871 commit bf80bbd

File tree

3 files changed

+71
-0
lines changed

3 files changed

+71
-0
lines changed

arch/x86/include/asm/mce.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,12 @@ struct mca_config {
116116
u32 rip_msr;
117117
};
118118

119+
struct mce_vendor_flags {
120+
__u64 overflow_recov : 1, /* cpuid_ebx(80000007) */
121+
__reserved_0 : 63;
122+
};
123+
extern struct mce_vendor_flags mce_flags;
124+
119125
extern struct mca_config mca_cfg;
120126
extern void mce_register_decode_chain(struct notifier_block *nb);
121127
extern void mce_unregister_decode_chain(struct notifier_block *nb);

arch/x86/kernel/cpu/mcheck/mce-severity.c

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -186,12 +186,68 @@ static int error_context(struct mce *m)
186186
return ((m->cs & 3) == 3) ? IN_USER : IN_KERNEL;
187187
}
188188

189+
/*
190+
* See AMD Error Scope Hierarchy table in a newer BKDG. For example
191+
* 49125_15h_Models_30h-3Fh_BKDG.pdf, section "RAS Features"
192+
*/
193+
static int mce_severity_amd(struct mce *m, enum context ctx)
194+
{
195+
/* Processor Context Corrupt, no need to fumble too much, die! */
196+
if (m->status & MCI_STATUS_PCC)
197+
return MCE_PANIC_SEVERITY;
198+
199+
if (m->status & MCI_STATUS_UC) {
200+
201+
/*
202+
* On older systems where overflow_recov flag is not present, we
203+
* should simply panic if an error overflow occurs. If
204+
* overflow_recov flag is present and set, then software can try
205+
* to at least kill process to prolong system operation.
206+
*/
207+
if (mce_flags.overflow_recov) {
208+
/* software can try to contain */
209+
if (!(m->mcgstatus & MCG_STATUS_RIPV))
210+
if (ctx == IN_KERNEL)
211+
return MCE_PANIC_SEVERITY;
212+
213+
/* kill current process */
214+
return MCE_AR_SEVERITY;
215+
} else {
216+
/* at least one error was not logged */
217+
if (m->status & MCI_STATUS_OVER)
218+
return MCE_PANIC_SEVERITY;
219+
}
220+
221+
/*
222+
* For any other case, return MCE_UC_SEVERITY so that we log the
223+
* error and exit #MC handler.
224+
*/
225+
return MCE_UC_SEVERITY;
226+
}
227+
228+
/*
229+
* deferred error: poll handler catches these and adds to mce_ring so
230+
* memory-failure can take recovery actions.
231+
*/
232+
if (m->status & MCI_STATUS_DEFERRED)
233+
return MCE_DEFERRED_SEVERITY;
234+
235+
/*
236+
* corrected error: poll handler catches these and passes responsibility
237+
* of decoding the error to EDAC
238+
*/
239+
return MCE_KEEP_SEVERITY;
240+
}
241+
189242
int mce_severity(struct mce *m, int tolerant, char **msg, bool is_excp)
190243
{
191244
enum exception excp = (is_excp ? EXCP_CONTEXT : NO_EXCP);
192245
enum context ctx = error_context(m);
193246
struct severity *s;
194247

248+
if (m->cpuvendor == X86_VENDOR_AMD)
249+
return mce_severity_amd(m, ctx);
250+
195251
for (s = severities;; s++) {
196252
if ((m->status & s->mask) != s->result)
197253
continue;

arch/x86/kernel/cpu/mcheck/mce.c

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@ static DEFINE_MUTEX(mce_chrdev_read_mutex);
6464
DEFINE_PER_CPU(unsigned, mce_exception_count);
6565

6666
struct mce_bank *mce_banks __read_mostly;
67+
struct mce_vendor_flags mce_flags __read_mostly;
6768

6869
struct mca_config mca_cfg __read_mostly = {
6970
.bootlog = -1,
@@ -1534,6 +1535,13 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
15341535
if (c->x86 == 6 && cfg->banks > 0)
15351536
mce_banks[0].ctl = 0;
15361537

1538+
/*
1539+
* overflow_recov is supported for F15h Models 00h-0fh
1540+
* even though we don't have a CPUID bit for it.
1541+
*/
1542+
if (c->x86 == 0x15 && c->x86_model <= 0xf)
1543+
mce_flags.overflow_recov = 1;
1544+
15371545
/*
15381546
* Turn off MC4_MISC thresholding banks on those models since
15391547
* they're not supported there.
@@ -1633,6 +1641,7 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
16331641
break;
16341642
case X86_VENDOR_AMD:
16351643
mce_amd_feature_init(c);
1644+
mce_flags.overflow_recov = cpuid_ebx(0x80000007) & 0x1;
16361645
break;
16371646
default:
16381647
break;

0 commit comments

Comments
 (0)