Skip to content

Commit 9b77ac1

Browse files
authored
wgengine: print in-flight operations on watchdog trigger (tailscale#5447)
In addition to printing goroutine stacks, explicitly track all in-flight operations and print them when the watchdog triggers (along with the time they were started at). This should make debugging watchdog failures easier, since we can look at the longest-running operation(s) first. Signed-off-by: Andrew Dunham <andrew@tailscale.com> Signed-off-by: Andrew Dunham <andrew@tailscale.com>
1 parent e1738ea commit 9b77ac1

File tree

2 files changed

+59
-6
lines changed

2 files changed

+59
-6
lines changed

wgengine/watchdog.go

Lines changed: 50 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8,10 +8,12 @@
88
package wgengine
99

1010
import (
11+
"fmt"
1112
"log"
1213
"net/netip"
1314
"runtime/pprof"
1415
"strings"
16+
"sync"
1517
"time"
1618

1719
"tailscale.com/envknob"
@@ -38,21 +40,49 @@ func NewWatchdog(e Engine) Engine {
3840
return e
3941
}
4042
return &watchdogEngine{
41-
wrap: e,
42-
logf: log.Printf,
43-
fatalf: log.Fatalf,
44-
maxWait: 45 * time.Second,
43+
wrap: e,
44+
logf: log.Printf,
45+
fatalf: log.Fatalf,
46+
maxWait: 45 * time.Second,
47+
inFlight: make(map[inFlightKey]time.Time),
4548
}
4649
}
4750

51+
type inFlightKey struct {
52+
op string
53+
ctr uint64
54+
}
55+
4856
type watchdogEngine struct {
4957
wrap Engine
5058
logf func(format string, args ...any)
5159
fatalf func(format string, args ...any)
5260
maxWait time.Duration
61+
62+
// Track the start time(s) of in-flight operations
63+
inFlightMu sync.Mutex
64+
inFlight map[inFlightKey]time.Time
65+
inFlightCtr uint64
5366
}
5467

5568
func (e *watchdogEngine) watchdogErr(name string, fn func() error) error {
69+
// Track all in-flight operations so we can print more useful error
70+
// messages on watchdog failure
71+
e.inFlightMu.Lock()
72+
key := inFlightKey{
73+
op: name,
74+
ctr: e.inFlightCtr,
75+
}
76+
e.inFlightCtr++
77+
e.inFlight[key] = time.Now()
78+
e.inFlightMu.Unlock()
79+
80+
defer func() {
81+
e.inFlightMu.Lock()
82+
defer e.inFlightMu.Unlock()
83+
delete(e.inFlight, key)
84+
}()
85+
5686
errCh := make(chan error)
5787
go func() {
5888
errCh <- fn()
@@ -66,6 +96,22 @@ func (e *watchdogEngine) watchdogErr(name string, fn func() error) error {
6696
buf := new(strings.Builder)
6797
pprof.Lookup("goroutine").WriteTo(buf, 1)
6898
e.logf("wgengine watchdog stacks:\n%s", buf.String())
99+
100+
// Collect the list of in-flight operations for debugging.
101+
var (
102+
b []byte
103+
now = time.Now()
104+
)
105+
e.inFlightMu.Lock()
106+
for k, t := range e.inFlight {
107+
dur := now.Sub(t).Round(time.Millisecond)
108+
b = fmt.Appendf(b, "in-flight[%d]: name=%s duration=%v start=%s\n", k.ctr, k.op, dur, t.Format(time.RFC3339Nano))
109+
}
110+
e.inFlightMu.Unlock()
111+
112+
// Print everything as a single string to avoid log
113+
// rate limits.
114+
e.logf("wgengine watchdog in-flight:\n%s", b)
69115
e.fatalf("wgengine: watchdog timeout on %s", name)
70116
return nil
71117
}

wgengine/watchdog_test.go

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ package wgengine
66

77
import (
88
"fmt"
9+
"regexp"
910
"runtime"
1011
"strings"
1112
"testing"
@@ -68,9 +69,15 @@ func TestWatchdog(t *testing.T) {
6869

6970
select {
7071
case <-fatalCalled:
71-
if !strings.Contains(logBuf.String(), "goroutine profile: total ") {
72-
t.Errorf("fatal called without watchdog stacks, got: %s", logBuf.String())
72+
logs := logBuf.String()
73+
if !strings.Contains(logs, "goroutine profile: total ") {
74+
t.Errorf("fatal called without watchdog stacks, got: %s", logs)
7375
}
76+
re := regexp.MustCompile(`(?m)^\s*in-flight\[\d+\]: name=RequestStatus duration=.* start=.*$`)
77+
if !re.MatchString(logs) {
78+
t.Errorf("fatal called without in-flight list, got: %s", logs)
79+
}
80+
7481
// expected
7582
case <-time.After(3 * time.Second):
7683
t.Fatalf("watchdog failed to fire")

0 commit comments

Comments
 (0)