Skip to content

Commit fab5669

Browse files
committed
Merge branch 'x86-ras-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86 RAS changes from Ingo Molnar: - SCI reporting for other error types not only correctable ones - GHES cleanups - Add the functionality to override error reporting agents as some machines are sporting a new extended error logging capability which, if done properly in the BIOS, makes a corresponding EDAC module redundant - PCIe AER tracepoint severity levels fix - Error path correction for the mce device init - MCE timer fix - Add more flexibility to the error injection (EINJ) debugfs interface * 'x86-ras-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: x86, mce: Fix mce_start_timer semantics ACPI, APEI, GHES: Cleanup ghes memory error handling ACPI, APEI: Cleanup alignment-aware accesses ACPI, APEI, GHES: Do not report only correctable errors with SCI ACPI, APEI, EINJ: Changes to the ACPI/APEI/EINJ debugfs interface ACPI, eMCA: Combine eMCA/EDAC event reporting priority EDAC, sb_edac: Modify H/W event reporting policy EDAC: Add an edac_report parameter to EDAC PCI, AER: Fix severity usage in aer trace event x86, mce: Call put_device on device_register failure
2 parents 74e8ee8 + b769e01 commit fab5669

File tree

13 files changed

+183
-54
lines changed

13 files changed

+183
-54
lines changed

Documentation/acpi/apei/einj.txt

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,11 +45,22 @@ directory apei/einj. The following files are provided.
4545
injection. Before this, please specify all necessary error
4646
parameters.
4747

48+
- flags
49+
Present for kernel version 3.13 and above. Used to specify which
50+
of param{1..4} are valid and should be used by BIOS during injection.
51+
Value is a bitmask as specified in the ACPI 5.0 spec for the
52+
SET_ERROR_TYPE_WITH_ADDRESS data structure:
53+
Bit 0 - Processor APIC field valid (see param3 below)
54+
Bit 1 - Memory address and mask valid (param1 and param2)
55+
Bit 2 - PCIe (seg,bus,dev,fn) valid (param4 below)
56+
If set to zero, legacy behaviour is used where the type of injection
57+
specifies just one bit set, and param1 is multiplexed.
58+
4859
- param1
4960
This file is used to set the first error parameter value. Effect of
5061
parameter depends on error_type specified. For example, if error
5162
type is memory related type, the param1 should be a valid physical
52-
memory address.
63+
memory address. [Unless "flags" is set - see above]
5364

5465
- param2
5566
This file is used to set the second error parameter value. Effect of
@@ -58,6 +69,12 @@ directory apei/einj. The following files are provided.
5869
address mask. Linux requires page or narrower granularity, say,
5970
0xfffffffffffff000.
6071

72+
- param3
73+
Used when the 0x1 bit is set in "flags" to specify the APIC id.
74+
75+
- param4
76+
Used when the 0x4 bit is set in "flags" to specify the target PCIe device.
77+
6178
- notrigger
6279
The EINJ mechanism is a two step process. First inject the error, then
6380
perform some actions to trigger it. Setting "notrigger" to 1 skips the

Documentation/kernel-parameters.txt

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -890,6 +890,14 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
890890

891891
The xen output can only be used by Xen PV guests.
892892

893+
edac_report= [HW,EDAC] Control how to report EDAC event
894+
Format: {"on" | "off" | "force"}
895+
on: enable EDAC to report H/W event. May be overridden
896+
by other higher priority error reporting module.
897+
off: disable H/W event reporting through EDAC.
898+
force: enforce the use of EDAC to report H/W event.
899+
default: on.
900+
893901
ekgdboc= [X86,KGDB] Allow early kernel console debugging
894902
ekgdboc=kbd
895903

arch/x86/kernel/cpu/mcheck/mce-apei.c

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -33,22 +33,28 @@
3333
#include <linux/acpi.h>
3434
#include <linux/cper.h>
3535
#include <acpi/apei.h>
36+
#include <acpi/ghes.h>
3637
#include <asm/mce.h>
3738

3839
#include "mce-internal.h"
3940

40-
void apei_mce_report_mem_error(int corrected, struct cper_sec_mem_err *mem_err)
41+
void apei_mce_report_mem_error(int severity, struct cper_sec_mem_err *mem_err)
4142
{
4243
struct mce m;
4344

44-
/* Only corrected MC is reported */
45-
if (!corrected || !(mem_err->validation_bits & CPER_MEM_VALID_PA))
45+
if (!(mem_err->validation_bits & CPER_MEM_VALID_PA))
4646
return;
4747

4848
mce_setup(&m);
4949
m.bank = 1;
50-
/* Fake a memory read corrected error with unknown channel */
50+
/* Fake a memory read error with unknown channel */
5151
m.status = MCI_STATUS_VAL | MCI_STATUS_EN | MCI_STATUS_ADDRV | 0x9f;
52+
53+
if (severity >= GHES_SEV_RECOVERABLE)
54+
m.status |= MCI_STATUS_UC;
55+
if (severity >= GHES_SEV_PANIC)
56+
m.status |= MCI_STATUS_PCC;
57+
5258
m.addr = mem_err->physical_addr;
5359
mce_log(&m);
5460
mce_notify_irq();

arch/x86/kernel/cpu/mcheck/mce.c

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1638,15 +1638,15 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
16381638

16391639
static void mce_start_timer(unsigned int cpu, struct timer_list *t)
16401640
{
1641-
unsigned long iv = mce_adjust_timer(check_interval * HZ);
1642-
1643-
__this_cpu_write(mce_next_interval, iv);
1641+
unsigned long iv = check_interval * HZ;
16441642

16451643
if (mca_cfg.ignore_ce || !iv)
16461644
return;
16471645

1646+
per_cpu(mce_next_interval, cpu) = iv;
1647+
16481648
t->expires = round_jiffies(jiffies + iv);
1649-
add_timer_on(t, smp_processor_id());
1649+
add_timer_on(t, cpu);
16501650
}
16511651

16521652
static void __mcheck_cpu_init_timer(void)
@@ -2272,8 +2272,10 @@ static int mce_device_create(unsigned int cpu)
22722272
dev->release = &mce_device_release;
22732273

22742274
err = device_register(dev);
2275-
if (err)
2275+
if (err) {
2276+
put_device(dev);
22762277
return err;
2278+
}
22772279

22782280
for (i = 0; mce_device_attrs[i]; i++) {
22792281
err = device_create_file(dev, mce_device_attrs[i]);

drivers/acpi/acpi_extlog.c

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
#include <acpi/acpi_bus.h>
1313
#include <linux/cper.h>
1414
#include <linux/ratelimit.h>
15+
#include <linux/edac.h>
1516
#include <asm/cpu.h>
1617
#include <asm/mce.h>
1718

@@ -43,6 +44,8 @@ struct extlog_l1_head {
4344
u8 rev1[12];
4445
};
4546

47+
static int old_edac_report_status;
48+
4649
static u8 extlog_dsm_uuid[] = "663E35AF-CC10-41A4-88EA-5470AF055295";
4750

4851
/* L1 table related physical address */
@@ -150,7 +153,7 @@ static int extlog_print(struct notifier_block *nb, unsigned long val,
150153

151154
rc = print_extlog_rcd(NULL, (struct acpi_generic_status *)elog_buf, cpu);
152155

153-
return NOTIFY_DONE;
156+
return NOTIFY_STOP;
154157
}
155158

156159
static int extlog_get_dsm(acpi_handle handle, int rev, int func, u64 *ret)
@@ -231,8 +234,12 @@ static int __init extlog_init(void)
231234
u64 cap;
232235
int rc;
233236

234-
rc = -ENODEV;
237+
if (get_edac_report_status() == EDAC_REPORTING_FORCE) {
238+
pr_warn("Not loading eMCA, error reporting force-enabled through EDAC.\n");
239+
return -EPERM;
240+
}
235241

242+
rc = -ENODEV;
236243
rdmsrl(MSR_IA32_MCG_CAP, cap);
237244
if (!(cap & MCG_ELOG_P))
238245
return rc;
@@ -287,6 +294,12 @@ static int __init extlog_init(void)
287294
if (elog_buf == NULL)
288295
goto err_release_elog;
289296

297+
/*
298+
* eMCA event report method has higher priority than EDAC method,
299+
* unless EDAC event report method is mandatory.
300+
*/
301+
old_edac_report_status = get_edac_report_status();
302+
set_edac_report_status(EDAC_REPORTING_DISABLED);
290303
mce_register_decode_chain(&extlog_mce_dec);
291304
/* enable OS to be involved to take over management from BIOS */
292305
((struct extlog_l1_head *)extlog_l1_addr)->flags |= FLAG_OS_OPTIN;
@@ -308,6 +321,7 @@ static int __init extlog_init(void)
308321

309322
static void __exit extlog_exit(void)
310323
{
324+
set_edac_report_status(old_edac_report_status);
311325
mce_unregister_decode_chain(&extlog_mce_dec);
312326
((struct extlog_l1_head *)extlog_l1_addr)->flags &= ~FLAG_OS_OPTIN;
313327
if (extlog_l1_addr)

drivers/acpi/apei/apei-base.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@
4141
#include <linux/rculist.h>
4242
#include <linux/interrupt.h>
4343
#include <linux/debugfs.h>
44+
#include <asm/unaligned.h>
4445

4546
#include "apei-internal.h"
4647

@@ -567,8 +568,7 @@ static int apei_check_gar(struct acpi_generic_address *reg, u64 *paddr,
567568
bit_offset = reg->bit_offset;
568569
access_size_code = reg->access_width;
569570
space_id = reg->space_id;
570-
/* Handle possible alignment issues */
571-
memcpy(paddr, &reg->address, sizeof(*paddr));
571+
*paddr = get_unaligned(&reg->address);
572572
if (!*paddr) {
573573
pr_warning(FW_BUG APEI_PFX
574574
"Invalid physical address in GAR [0x%llx/%u/%u/%u/%u]\n",

drivers/acpi/apei/einj.c

Lines changed: 43 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@
3434
#include <linux/delay.h>
3535
#include <linux/mm.h>
3636
#include <acpi/acpi.h>
37+
#include <asm/unaligned.h>
3738

3839
#include "apei-internal.h"
3940

@@ -216,7 +217,7 @@ static void check_vendor_extension(u64 paddr,
216217
static void *einj_get_parameter_address(void)
217218
{
218219
int i;
219-
u64 paddrv4 = 0, paddrv5 = 0;
220+
u64 pa_v4 = 0, pa_v5 = 0;
220221
struct acpi_whea_header *entry;
221222

222223
entry = EINJ_TAB_ENTRY(einj_tab);
@@ -225,30 +226,28 @@ static void *einj_get_parameter_address(void)
225226
entry->instruction == ACPI_EINJ_WRITE_REGISTER &&
226227
entry->register_region.space_id ==
227228
ACPI_ADR_SPACE_SYSTEM_MEMORY)
228-
memcpy(&paddrv4, &entry->register_region.address,
229-
sizeof(paddrv4));
229+
pa_v4 = get_unaligned(&entry->register_region.address);
230230
if (entry->action == ACPI_EINJ_SET_ERROR_TYPE_WITH_ADDRESS &&
231231
entry->instruction == ACPI_EINJ_WRITE_REGISTER &&
232232
entry->register_region.space_id ==
233233
ACPI_ADR_SPACE_SYSTEM_MEMORY)
234-
memcpy(&paddrv5, &entry->register_region.address,
235-
sizeof(paddrv5));
234+
pa_v5 = get_unaligned(&entry->register_region.address);
236235
entry++;
237236
}
238-
if (paddrv5) {
237+
if (pa_v5) {
239238
struct set_error_type_with_address *v5param;
240239

241-
v5param = acpi_os_map_memory(paddrv5, sizeof(*v5param));
240+
v5param = acpi_os_map_memory(pa_v5, sizeof(*v5param));
242241
if (v5param) {
243242
acpi5 = 1;
244-
check_vendor_extension(paddrv5, v5param);
243+
check_vendor_extension(pa_v5, v5param);
245244
return v5param;
246245
}
247246
}
248-
if (param_extension && paddrv4) {
247+
if (param_extension && pa_v4) {
249248
struct einj_parameter *v4param;
250249

251-
v4param = acpi_os_map_memory(paddrv4, sizeof(*v4param));
250+
v4param = acpi_os_map_memory(pa_v4, sizeof(*v4param));
252251
if (!v4param)
253252
return NULL;
254253
if (v4param->reserved1 || v4param->reserved2) {
@@ -416,7 +415,8 @@ static int __einj_error_trigger(u64 trigger_paddr, u32 type,
416415
return rc;
417416
}
418417

419-
static int __einj_error_inject(u32 type, u64 param1, u64 param2)
418+
static int __einj_error_inject(u32 type, u32 flags, u64 param1, u64 param2,
419+
u64 param3, u64 param4)
420420
{
421421
struct apei_exec_context ctx;
422422
u64 val, trigger_paddr, timeout = FIRMWARE_TIMEOUT;
@@ -446,6 +446,12 @@ static int __einj_error_inject(u32 type, u64 param1, u64 param2)
446446
break;
447447
}
448448
v5param->flags = vendor_flags;
449+
} else if (flags) {
450+
v5param->flags = flags;
451+
v5param->memory_address = param1;
452+
v5param->memory_address_range = param2;
453+
v5param->apicid = param3;
454+
v5param->pcie_sbdf = param4;
449455
} else {
450456
switch (type) {
451457
case ACPI_EINJ_PROCESSOR_CORRECTABLE:
@@ -514,11 +520,17 @@ static int __einj_error_inject(u32 type, u64 param1, u64 param2)
514520
}
515521

516522
/* Inject the specified hardware error */
517-
static int einj_error_inject(u32 type, u64 param1, u64 param2)
523+
static int einj_error_inject(u32 type, u32 flags, u64 param1, u64 param2,
524+
u64 param3, u64 param4)
518525
{
519526
int rc;
520527
unsigned long pfn;
521528

529+
/* If user manually set "flags", make sure it is legal */
530+
if (flags && (flags &
531+
~(SETWA_FLAGS_APICID|SETWA_FLAGS_MEM|SETWA_FLAGS_PCIE_SBDF)))
532+
return -EINVAL;
533+
522534
/*
523535
* We need extra sanity checks for memory errors.
524536
* Other types leap directly to injection.
@@ -532,7 +544,7 @@ static int einj_error_inject(u32 type, u64 param1, u64 param2)
532544
if (type & ACPI5_VENDOR_BIT) {
533545
if (vendor_flags != SETWA_FLAGS_MEM)
534546
goto inject;
535-
} else if (!(type & MEM_ERROR_MASK))
547+
} else if (!(type & MEM_ERROR_MASK) && !(flags & SETWA_FLAGS_MEM))
536548
goto inject;
537549

538550
/*
@@ -546,15 +558,18 @@ static int einj_error_inject(u32 type, u64 param1, u64 param2)
546558

547559
inject:
548560
mutex_lock(&einj_mutex);
549-
rc = __einj_error_inject(type, param1, param2);
561+
rc = __einj_error_inject(type, flags, param1, param2, param3, param4);
550562
mutex_unlock(&einj_mutex);
551563

552564
return rc;
553565
}
554566

555567
static u32 error_type;
568+
static u32 error_flags;
556569
static u64 error_param1;
557570
static u64 error_param2;
571+
static u64 error_param3;
572+
static u64 error_param4;
558573
static struct dentry *einj_debug_dir;
559574

560575
static int available_error_type_show(struct seq_file *m, void *v)
@@ -648,7 +663,8 @@ static int error_inject_set(void *data, u64 val)
648663
if (!error_type)
649664
return -EINVAL;
650665

651-
return einj_error_inject(error_type, error_param1, error_param2);
666+
return einj_error_inject(error_type, error_flags, error_param1, error_param2,
667+
error_param3, error_param4);
652668
}
653669

654670
DEFINE_SIMPLE_ATTRIBUTE(error_inject_fops, NULL,
@@ -729,6 +745,10 @@ static int __init einj_init(void)
729745
rc = -ENOMEM;
730746
einj_param = einj_get_parameter_address();
731747
if ((param_extension || acpi5) && einj_param) {
748+
fentry = debugfs_create_x32("flags", S_IRUSR | S_IWUSR,
749+
einj_debug_dir, &error_flags);
750+
if (!fentry)
751+
goto err_unmap;
732752
fentry = debugfs_create_x64("param1", S_IRUSR | S_IWUSR,
733753
einj_debug_dir, &error_param1);
734754
if (!fentry)
@@ -737,6 +757,14 @@ static int __init einj_init(void)
737757
einj_debug_dir, &error_param2);
738758
if (!fentry)
739759
goto err_unmap;
760+
fentry = debugfs_create_x64("param3", S_IRUSR | S_IWUSR,
761+
einj_debug_dir, &error_param3);
762+
if (!fentry)
763+
goto err_unmap;
764+
fentry = debugfs_create_x64("param4", S_IRUSR | S_IWUSR,
765+
einj_debug_dir, &error_param4);
766+
if (!fentry)
767+
goto err_unmap;
740768

741769
fentry = debugfs_create_x32("notrigger", S_IRUSR | S_IWUSR,
742770
einj_debug_dir, &notrigger);

drivers/acpi/apei/erst.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -611,7 +611,7 @@ static void __erst_record_id_cache_compact(void)
611611
if (entries[i] == APEI_ERST_INVALID_RECORD_ID)
612612
continue;
613613
if (wpos != i)
614-
memcpy(&entries[wpos], &entries[i], sizeof(entries[i]));
614+
entries[wpos] = entries[i];
615615
wpos++;
616616
}
617617
erst_record_id_cache.len = wpos;

0 commit comments

Comments
 (0)