Skip to content

Commit 58687ac

Browse files
dzickusrh authored and fweisbec committed
lockup_detector: Combine nmi_watchdog and softlockup detector
The new nmi_watchdog (which uses the perf event subsystem) is very similar in structure to the softlockup detector. Using Ingo's suggestion, I combined the two functionalities into one file: kernel/watchdog.c. Now both the nmi_watchdog (or hardlockup detector) and softlockup detector sit on top of the perf event subsystem, which is run every 60 seconds or so to see if there are any lockups. To detect hardlockups, cpus not responding to interrupts, I implemented an hrtimer that runs 5 times for every perf event overflow event. If that stops counting on a cpu, then the cpu is most likely in trouble. To detect softlockups, tasks not yielding to the scheduler, I used the previous kthread idea that now gets kicked every time the hrtimer fires. If the kthread isn't being scheduled, neither is anyone else and the warning is printed to the console. I tested this on x86_64 and both the softlockup and hardlockup paths work. V2: - cleaned up the Kconfig and softlockup combination - surrounded hardlockup cases with #ifdef CONFIG_PERF_EVENTS_NMI - separated out the softlockup case from perf event subsystem - re-arranged the enabling/disabling nmi watchdog from proc space - added cpumasks for hardlockup failure cases - removed fallback to soft events if no PMU exists for hard events V3: - comment cleanups - drop support for older softlockup code - per_cpu cleanups - completely remove software clock based hardlockup detector - use per_cpu masking on hard/soft lockup detection - #ifdef cleanups - rename config option NMI_WATCHDOG to LOCKUP_DETECTOR - documentation additions V4: - documentation fixes - convert per_cpu to __get_cpu_var - powerpc compile fixes V5: - split apart warn flags for hard and soft lockups TODO: - figure out how to make an arch-agnostic clock2cycles call (if possible) to feed into perf events as a sample period [fweisbec: merged conflict patch] Signed-off-by: Don Zickus <dzickus@redhat.com> Cc: Ingo Molnar <mingo@elte.hu> Cc: Peter Zijlstra 
<peterz@infradead.org> Cc: Cyrill Gorcunov <gorcunov@gmail.com> Cc: Eric Paris <eparis@redhat.com> Cc: Randy Dunlap <randy.dunlap@oracle.com> LKML-Reference: <1273266711-18706-2-git-send-email-dzickus@redhat.com> Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
1 parent a9aa1d0 commit 58687ac

File tree

12 files changed

+650
-29
lines changed

12 files changed

+650
-29
lines changed

Documentation/kernel-parameters.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1777,6 +1777,8 @@ and is between 256 and 4096 characters. It is defined in the file
17771777

17781778
nousb [USB] Disable the USB subsystem
17791779

1780+
nowatchdog [KNL] Disable the lockup detector.
1781+
17801782
nowb [ARM]
17811783

17821784
nox2apic [X86-64,APIC] Do not enable x2APIC mode.

arch/x86/include/asm/nmi.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ int do_nmi_callback(struct pt_regs *regs, int cpu);
1717

1818
extern void die_nmi(char *str, struct pt_regs *regs, int do_panic);
1919
extern int check_nmi_watchdog(void);
20-
#if !defined(CONFIG_NMI_WATCHDOG)
20+
#if !defined(CONFIG_LOCKUP_DETECTOR)
2121
extern int nmi_watchdog_enabled;
2222
#endif
2323
extern int avail_to_resrv_perfctr_nmi_bit(unsigned int);

arch/x86/kernel/apic/Makefile

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,10 @@
33
#
44

55
obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_noop.o probe_$(BITS).o ipi.o
6-
ifneq ($(CONFIG_NMI_WATCHDOG),y)
6+
ifneq ($(CONFIG_LOCKUP_DETECTOR),y)
77
obj-$(CONFIG_X86_LOCAL_APIC) += nmi.o
88
endif
9-
obj-$(CONFIG_NMI_WATCHDOG) += hw_nmi.o
9+
obj-$(CONFIG_LOCKUP_DETECTOR) += hw_nmi.o
1010

1111
obj-$(CONFIG_X86_IO_APIC) += io_apic.o
1212
obj-$(CONFIG_SMP) += ipi.o

arch/x86/kernel/apic/hw_nmi.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,7 @@ int hw_nmi_is_cpu_stuck(struct pt_regs *regs)
8989

9090
u64 hw_nmi_get_sample_period(void)
9191
{
92-
return cpu_khz * 1000;
92+
return (u64)(cpu_khz) * 1000 * 60;
9393
}
9494

9595
#ifdef ARCH_HAS_NMI_WATCHDOG

arch/x86/kernel/traps.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -406,15 +406,15 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
406406
== NOTIFY_STOP)
407407
return;
408408

409-
#ifndef CONFIG_NMI_WATCHDOG
409+
#ifndef CONFIG_LOCKUP_DETECTOR
410410
/*
411411
* Ok, so this is none of the documented NMI sources,
412412
* so it must be the NMI watchdog.
413413
*/
414414
if (nmi_watchdog_tick(regs, reason))
415415
return;
416416
if (!do_nmi_callback(regs, cpu))
417-
#endif /* !CONFIG_NMI_WATCHDOG */
417+
#endif /* !CONFIG_LOCKUP_DETECTOR */
418418
unknown_nmi_error(reason, regs);
419419
#else
420420
unknown_nmi_error(reason, regs);

include/linux/nmi.h

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ extern void touch_nmi_watchdog(void);
2020
extern void acpi_nmi_disable(void);
2121
extern void acpi_nmi_enable(void);
2222
#else
23-
#ifndef CONFIG_NMI_WATCHDOG
23+
#ifndef CONFIG_LOCKUP_DETECTOR
2424
static inline void touch_nmi_watchdog(void)
2525
{
2626
touch_softlockup_watchdog();
@@ -51,12 +51,12 @@ static inline bool trigger_all_cpu_backtrace(void)
5151
}
5252
#endif
5353

54-
#ifdef CONFIG_NMI_WATCHDOG
54+
#ifdef CONFIG_LOCKUP_DETECTOR
5555
int hw_nmi_is_cpu_stuck(struct pt_regs *);
5656
u64 hw_nmi_get_sample_period(void);
57-
extern int nmi_watchdog_enabled;
57+
extern int watchdog_enabled;
5858
struct ctl_table;
59-
extern int proc_nmi_enabled(struct ctl_table *, int ,
59+
extern int proc_dowatchdog_enabled(struct ctl_table *, int ,
6060
void __user *, size_t *, loff_t *);
6161
#endif
6262

include/linux/sched.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -346,6 +346,12 @@ extern int proc_dohung_task_timeout_secs(struct ctl_table *table, int write,
346346
size_t *lenp, loff_t *ppos);
347347
#endif
348348

349+
#ifdef CONFIG_LOCKUP_DETECTOR
350+
extern int proc_dowatchdog_thresh(struct ctl_table *table, int write,
351+
void __user *buffer,
352+
size_t *lenp, loff_t *ppos);
353+
#endif
354+
349355
/* Attach to any functions which should be ignored in wchan output. */
350356
#define __sched __attribute__((__section__(".sched.text")))
351357

init/Kconfig

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -944,8 +944,11 @@ config PERF_USE_VMALLOC
944944

945945
config PERF_EVENTS_NMI
946946
bool
947+
depends on PERF_EVENTS
947948
help
948-
Arch has support for nmi_watchdog
949+
System hardware can generate an NMI using the perf event
950+
subsystem. Also has support for calculating CPU cycle events
951+
to determine how many clock cycles in a given period.
949952

950953
menu "Kernel Performance Events And Counters"
951954

kernel/Makefile

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -76,9 +76,8 @@ obj-$(CONFIG_GCOV_KERNEL) += gcov/
7676
obj-$(CONFIG_AUDIT_TREE) += audit_tree.o
7777
obj-$(CONFIG_KPROBES) += kprobes.o
7878
obj-$(CONFIG_KGDB) += kgdb.o
79-
obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o
80-
obj-$(CONFIG_NMI_WATCHDOG) += nmi_watchdog.o
8179
obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o
80+
obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o
8281
obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
8382
obj-$(CONFIG_SECCOMP) += seccomp.o
8483
obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o

kernel/sysctl.c

Lines changed: 15 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,7 @@
7474
#include <scsi/sg.h>
7575
#endif
7676

77-
#ifdef CONFIG_NMI_WATCHDOG
77+
#ifdef CONFIG_LOCKUP_DETECTOR
7878
#include <linux/nmi.h>
7979
#endif
8080

@@ -686,16 +686,25 @@ static struct ctl_table kern_table[] = {
686686
.mode = 0444,
687687
.proc_handler = proc_dointvec,
688688
},
689-
#if defined(CONFIG_NMI_WATCHDOG)
689+
#if defined(CONFIG_LOCKUP_DETECTOR)
690690
{
691-
.procname = "nmi_watchdog",
692-
.data = &nmi_watchdog_enabled,
691+
.procname = "watchdog",
692+
.data = &watchdog_enabled,
693693
.maxlen = sizeof (int),
694694
.mode = 0644,
695-
.proc_handler = proc_nmi_enabled,
695+
.proc_handler = proc_dowatchdog_enabled,
696+
},
697+
{
698+
.procname = "watchdog_thresh",
699+
.data = &softlockup_thresh,
700+
.maxlen = sizeof(int),
701+
.mode = 0644,
702+
.proc_handler = proc_dowatchdog_thresh,
703+
.extra1 = &neg_one,
704+
.extra2 = &sixty,
696705
},
697706
#endif
698-
#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) && !defined(CONFIG_NMI_WATCHDOG)
707+
#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) && !defined(CONFIG_LOCKUP_DETECTOR)
699708
{
700709
.procname = "unknown_nmi_panic",
701710
.data = &unknown_nmi_panic,

0 commit comments

Comments
 (0)