Skip to content

Commit 81dba37

Browse files
committed
wgengine: remove all peer status from open timeout diagnostics
Avoid contention from fetching status for all peers, and instead fetch status for a single peer. Updates tailscale/coral#72 Signed-off-by: James Tucker <james@tailscale.com>
1 parent ad1cc6c commit 81dba37

File tree

3 files changed

+22
-97
lines changed

3 files changed

+22
-97
lines changed

wgengine/pendopen.go

+2-23
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,6 @@ import (
99
"runtime"
1010
"time"
1111

12-
"tailscale.com/ipn/ipnstate"
1312
"tailscale.com/net/flowtrack"
1413
"tailscale.com/net/packet"
1514
"tailscale.com/net/tsaddr"
@@ -157,28 +156,8 @@ func (e *userspaceEngine) onOpenTimeout(flow flowtrack.Tuple) {
157156
return
158157
}
159158

160-
// We don't care if this information is perfectly up-to-date, since
161-
// we're just using it to print debug information.
162-
//
163-
// In tailscale/coral#72, we see a goroutine profile with thousands of
164-
// goroutines blocked on the mutex in getStatus here, so we wrap it in
165-
// a singleflight and accept stale information to reduce contention.
166-
st, err, _ := e.getStatusSf.Do(struct{}{}, e.getStatus)
167-
168-
var ps *ipnstate.PeerStatusLite
169-
if err == nil {
170-
for _, v := range st.Peers {
171-
if v.NodeKey == n.Key {
172-
v := v // copy
173-
ps = &v
174-
break
175-
}
176-
}
177-
} else {
178-
e.logf("open-conn-track: timeout opening %v to node %v; failed to get engine status: %v", flow, n.Key.ShortString(), err)
179-
return
180-
}
181-
if ps == nil {
159+
ps, found := e.getPeerStatusLite(n.Key)
160+
if !found {
182161
onlyZeroRoute := true // whether peerForIP returned n only because its /0 route matched
183162
for _, r := range n.AllowedIPs {
184163
if r.Bits() != 0 && r.Contains(flow.Dst.Addr()) {

wgengine/userspace.go

+20-24
Original file line numberDiff line numberDiff line change
@@ -982,25 +982,30 @@ var singleNewline = []byte{'\n'}
982982

983983
var ErrEngineClosing = errors.New("engine closing; no status")
984984

985+
func (e *userspaceEngine) getPeerStatusLite(pk key.NodePublic) (status ipnstate.PeerStatusLite, ok bool) {
986+
e.wgLock.Lock()
987+
if e.wgdev == nil {
988+
e.wgLock.Unlock()
989+
return status, false
990+
}
991+
peer := e.wgdev.LookupPeer(pk.Raw32())
992+
e.wgLock.Unlock()
993+
if peer == nil {
994+
return status, false
995+
}
996+
status.NodeKey = pk
997+
status.RxBytes = int64(wgint.PeerRxBytes(peer))
998+
status.TxBytes = int64(wgint.PeerTxBytes(peer))
999+
status.LastHandshake = time.Unix(0, wgint.PeerLastHandshakeNano(peer))
1000+
return status, true
1001+
}
1002+
9851003
func (e *userspaceEngine) getStatus() (*Status, error) {
9861004
// Grab derpConns before acquiring wgLock to not violate lock ordering;
9871005
// the DERPs method acquires magicsock.Conn.mu.
9881006
// (See comment in userspaceEngine's declaration.)
9891007
derpConns := e.magicConn.DERPs()
9901008

991-
e.wgLock.Lock()
992-
wgdev := e.wgdev
993-
e.wgLock.Unlock()
994-
995-
// Assume that once created, wgdev is typically not replaced in-flight.
996-
if wgdev == nil {
997-
// RequestStatus was invoked before the wgengine has
998-
// finished initializing. This can happen when wgegine
999-
// provides a callback to magicsock for endpoint
1000-
// updates that calls RequestStatus.
1001-
return nil, nil
1002-
}
1003-
10041009
e.mu.Lock()
10051010
closing := e.closing
10061011
peerKeys := make([]key.NodePublic, len(e.peerSequence))
@@ -1013,18 +1018,9 @@ func (e *userspaceEngine) getStatus() (*Status, error) {
10131018

10141019
peers := make([]ipnstate.PeerStatusLite, 0, len(peerKeys))
10151020
for _, key := range peerKeys {
1016-
// LookupPeer is internally locked in wgdev.
1017-
peer := wgdev.LookupPeer(key.Raw32())
1018-
if peer == nil {
1019-
continue
1021+
if status, found := e.getPeerStatusLite(key); found {
1022+
peers = append(peers, status)
10201023
}
1021-
1022-
var p ipnstate.PeerStatusLite
1023-
p.NodeKey = key
1024-
p.RxBytes = int64(wgint.PeerRxBytes(peer))
1025-
p.TxBytes = int64(wgint.PeerTxBytes(peer))
1026-
p.LastHandshake = time.Unix(0, wgint.PeerLastHandshakeNano(peer))
1027-
peers = append(peers, p)
10281024
}
10291025

10301026
return &Status{

wgengine/watchdog_test.go

-50
Original file line numberDiff line numberDiff line change
@@ -5,14 +5,9 @@
55
package wgengine
66

77
import (
8-
"fmt"
9-
"regexp"
108
"runtime"
11-
"strings"
129
"testing"
1310
"time"
14-
15-
"tailscale.com/tstest"
1611
)
1712

1813
func TestWatchdog(t *testing.T) {
@@ -42,49 +37,4 @@ func TestWatchdog(t *testing.T) {
4237
e.RequestStatus()
4338
e.Close()
4439
})
45-
46-
t.Run("watchdog fires on blocked getStatus", func(t *testing.T) {
47-
t.Parallel()
48-
e, err := NewFakeUserspaceEngine(t.Logf, 0)
49-
if err != nil {
50-
t.Fatal(err)
51-
}
52-
t.Cleanup(e.Close)
53-
usEngine := e.(*userspaceEngine)
54-
e = NewWatchdog(e)
55-
wdEngine := e.(*watchdogEngine)
56-
wdEngine.maxWait = maxWaitMultiple * 100 * time.Millisecond
57-
58-
logBuf := new(tstest.MemLogger)
59-
fatalCalled := make(chan struct{})
60-
wdEngine.logf = logBuf.Logf
61-
wdEngine.fatalf = func(format string, args ...any) {
62-
t.Logf("FATAL: %s", fmt.Sprintf(format, args...))
63-
fatalCalled <- struct{}{}
64-
}
65-
66-
usEngine.wgLock.Lock() // blocks getStatus so the watchdog will fire
67-
68-
go e.RequestStatus()
69-
70-
select {
71-
case <-fatalCalled:
72-
logs := logBuf.String()
73-
if !strings.Contains(logs, "goroutine profile: total ") {
74-
t.Errorf("fatal called without watchdog stacks, got: %s", logs)
75-
}
76-
re := regexp.MustCompile(`(?m)^\s*in-flight\[\d+\]: name=RequestStatus duration=.* start=.*$`)
77-
if !re.MatchString(logs) {
78-
t.Errorf("fatal called without in-flight list, got: %s", logs)
79-
}
80-
81-
// expected
82-
case <-time.After(3 * time.Second):
83-
t.Fatalf("watchdog failed to fire")
84-
}
85-
86-
usEngine.wgLock.Unlock()
87-
wdEngine.fatalf = t.Fatalf
88-
wdEngine.Close()
89-
})
9040
}

0 commit comments

Comments
 (0)