Commit 0becc0a

x86/mce: Make timer handling more robust
Erik reported that on preproduction hardware a CMCI storm triggers the
BUG_ON in add_timer_on(). The reason is that the per CPU MCE timer is
started by the CMCI logic before the MCE CPU hotplug callback starts the
timer with add_timer_on(). So the timer is already queued, which triggers
the BUG.

Using add_timer_on() is pretty pointless in this code because the timer
is strictly per CPU, initialized as pinned, and all operations which arm
the timer happen on the CPU to which the timer belongs.

Simplify the whole machinery by using mod_timer() instead of
add_timer_on(), which avoids the problem because mod_timer() can handle
already queued timers. Use __start_timer() everywhere so the earliest
armed expiry time is preserved.

Reported-by: Erik Veijola <erik.veijola@intel.com>
Tested-by: Borislav Petkov <bp@alien8.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Borislav Petkov <bp@alien8.de>
Cc: Tony Luck <tony.luck@intel.com>
Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1701310936080.3457@nanos
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
1 parent 24c2503 commit 0becc0a
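
For reference, the reworked helper as it appears after this change is sketched below, copied from the diff that follows; the explanatory comments are editorial and not part of the patch:

/* Arm the pinned per-CPU MCE timer, keeping the earliest pending expiry. */
static void __start_timer(struct timer_list *t, unsigned long interval)
{
	unsigned long when = jiffies + interval;
	unsigned long flags;

	local_irq_save(flags);

	/*
	 * mod_timer() copes with a timer that is already queued, so a CMCI
	 * storm arming the timer before the hotplug callback runs no longer
	 * trips the BUG_ON() in add_timer_on(). The expiry is only ever
	 * moved earlier, never later, so the earliest armed expiry wins.
	 */
	if (!timer_pending(t) || time_before(when, t->expires))
		mod_timer(t, round_jiffies(when));

	local_irq_restore(flags);
}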

File tree

1 file changed: 12 additions, 19 deletions

  • arch/x86/kernel/cpu/mcheck/mce.c

arch/x86/kernel/cpu/mcheck/mce.c

Lines changed: 12 additions & 19 deletions
@@ -1373,20 +1373,15 @@ static unsigned long mce_adjust_timer_default(unsigned long interval)
 
 static unsigned long (*mce_adjust_timer)(unsigned long interval) = mce_adjust_timer_default;
 
-static void __restart_timer(struct timer_list *t, unsigned long interval)
+static void __start_timer(struct timer_list *t, unsigned long interval)
 {
 	unsigned long when = jiffies + interval;
 	unsigned long flags;
 
 	local_irq_save(flags);
 
-	if (timer_pending(t)) {
-		if (time_before(when, t->expires))
-			mod_timer(t, when);
-	} else {
-		t->expires = round_jiffies(when);
-		add_timer_on(t, smp_processor_id());
-	}
+	if (!timer_pending(t) || time_before(when, t->expires))
+		mod_timer(t, round_jiffies(when));
 
 	local_irq_restore(flags);
 }
@@ -1421,7 +1416,7 @@ static void mce_timer_fn(unsigned long data)
 
 done:
 	__this_cpu_write(mce_next_interval, iv);
-	__restart_timer(t, iv);
+	__start_timer(t, iv);
 }
 
 /*
@@ -1432,7 +1427,7 @@ void mce_timer_kick(unsigned long interval)
 	struct timer_list *t = this_cpu_ptr(&mce_timer);
 	unsigned long iv = __this_cpu_read(mce_next_interval);
 
-	__restart_timer(t, interval);
+	__start_timer(t, interval);
 
 	if (interval < iv)
 		__this_cpu_write(mce_next_interval, interval);
@@ -1779,17 +1774,15 @@ static void __mcheck_cpu_clear_vendor(struct cpuinfo_x86 *c)
 	}
 }
 
-static void mce_start_timer(unsigned int cpu, struct timer_list *t)
+static void mce_start_timer(struct timer_list *t)
 {
 	unsigned long iv = check_interval * HZ;
 
 	if (mca_cfg.ignore_ce || !iv)
 		return;
 
-	per_cpu(mce_next_interval, cpu) = iv;
-
-	t->expires = round_jiffies(jiffies + iv);
-	add_timer_on(t, cpu);
+	this_cpu_write(mce_next_interval, iv);
+	__start_timer(t, iv);
 }
 
 static void __mcheck_cpu_setup_timer(void)
@@ -1806,7 +1799,7 @@ static void __mcheck_cpu_init_timer(void)
 	unsigned int cpu = smp_processor_id();
 
 	setup_pinned_timer(t, mce_timer_fn, cpu);
-	mce_start_timer(cpu, t);
+	mce_start_timer(t);
 }
 
 /* Handle unconfigured int18 (should never happen) */
@@ -2566,7 +2559,7 @@ static int mce_cpu_dead(unsigned int cpu)
 
 static int mce_cpu_online(unsigned int cpu)
 {
-	struct timer_list *t = &per_cpu(mce_timer, cpu);
+	struct timer_list *t = this_cpu_ptr(&mce_timer);
 	int ret;
 
 	mce_device_create(cpu);
@@ -2577,13 +2570,13 @@ static int mce_cpu_online(unsigned int cpu)
 		return ret;
 	}
 	mce_reenable_cpu();
-	mce_start_timer(cpu, t);
+	mce_start_timer(t);
 	return 0;
 }
 
 static int mce_cpu_pre_down(unsigned int cpu)
 {
-	struct timer_list *t = &per_cpu(mce_timer, cpu);
+	struct timer_list *t = this_cpu_ptr(&mce_timer);
 
 	mce_disable_cpu();
 	del_timer_sync(t);
