26
26
#include <asm/paca.h>
27
27
28
28
/*
29
- * The watchdog has a simple timer that runs on each CPU, once per timer
30
- * period. This is the heartbeat.
29
+ * The powerpc watchdog ensures that each CPU is able to service timers.
30
+ * The watchdog sets up a simple timer on each CPU to run once per timer
31
+ * period, and updates a per-cpu timestamp and a "pending" cpumask. This is
32
+ * the heartbeat.
31
33
*
32
- * Then there are checks to see if the heartbeat has not triggered on a CPU
33
- * for the panic timeout period. Currently the watchdog only supports an
34
- * SMP check, so the heartbeat only turns on when we have 2 or more CPUs.
34
+ * Then there are two systems to check that the heartbeat is still running.
35
+ * The local soft-NMI, and the SMP checker.
35
36
*
36
- * This is not an NMI watchdog, but Linux uses that name for a generic
37
- * watchdog in some cases, so NMI gets used in some places.
37
+ * The soft-NMI checker can detect lockups on the local CPU. When interrupts
38
+ * are disabled with local_irq_disable(), platforms that use soft-masking
39
+ * can leave hardware interrupts enabled and handle them with a masked
40
+ * interrupt handler. The masked handler can send the timer interrupt to the
41
+ * watchdog's soft_nmi_interrupt(), which appears to Linux as an NMI
42
+ * interrupt, and can be used to detect CPUs stuck with IRQs disabled.
43
+ *
44
+ * The soft-NMI checker will compare the heartbeat timestamp for this CPU
45
+ * with the current time, and take action if the difference exceeds the
46
+ * watchdog threshold.
47
+ *
48
+ * The limitation of the soft-NMI watchdog is that it does not work when
49
+ * interrupts are hard disabled or otherwise not being serviced. This is
50
+ * solved by also having an SMP watchdog where all CPUs check all other
51
+ * CPUs' heartbeats.
52
+ *
53
+ * The SMP checker can detect lockups on other CPUs. A global "pending"
54
+ * cpumask is kept, containing all CPUs which enable the watchdog. Each
55
+ * CPU clears its pending bit in its heartbeat timer. When the bitmask
56
+ * becomes empty, the last CPU to clear its pending bit updates a global
57
+ * timestamp and refills the pending bitmask.
58
+ *
59
+ * In the heartbeat timer, if any CPU notices that the global timestamp has
60
+ * not been updated for a period exceeding the watchdog threshold, then it
61
+ * means the CPU(s) with their bit still set in the pending mask have had
62
+ * their heartbeat stop, and action is taken.
63
+ *
64
+ * Some platforms implement true NMI IPIs, which can be used by the SMP
65
+ * watchdog to detect an unresponsive CPU and pull it out of its stuck
66
+ * state with the NMI IPI, to get crash/debug data from it. This way the
67
+ * SMP watchdog can detect lockups that occur with hardware interrupts off.
38
68
*/
39
69
40
70
static cpumask_t wd_cpus_enabled __read_mostly ;
@@ -47,19 +77,7 @@ static u64 wd_timer_period_ms __read_mostly; /* interval between heartbeat */
47
77
static DEFINE_PER_CPU (struct timer_list , wd_timer ) ;
48
78
static DEFINE_PER_CPU (u64 , wd_timer_tb ) ;
49
79
50
- /*
51
- * These are for the SMP checker. CPUs clear their pending bit in their
52
- * heartbeat. If the bitmask becomes empty, the time is noted and the
53
- * bitmask is refilled.
54
- *
55
- * All CPUs clear their bit in the pending mask every timer period.
56
- * Once all have cleared, the time is noted and the bits are reset.
57
- * If the time since all clear was greater than the panic timeout,
58
- * we can panic with the list of stuck CPUs.
59
- *
60
- * This will work best with NMI IPIs for crash code so the stuck CPUs
61
- * can be pulled out to get their backtraces.
62
- */
80
+ /* SMP checker bits */
63
81
static unsigned long __wd_smp_lock ;
64
82
static cpumask_t wd_smp_cpus_pending ;
65
83
static cpumask_t wd_smp_cpus_stuck ;
0 commit comments