Skip to content

Commit 5f5edb1

Browse files
authored
chore(healthcheck): fix DERP test flakes (#7211)
1 parent f60b557 commit 5f5edb1

File tree

6 files changed

+160
-40
lines changed

6 files changed

+160
-40
lines changed

coderd/apidoc/docs.go

Lines changed: 16 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

coderd/apidoc/swagger.json

Lines changed: 16 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

coderd/healthcheck/derp.go

Lines changed: 66 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@ import (
1111
"sync/atomic"
1212
"time"
1313

14-
"golang.org/x/sync/errgroup"
1514
"golang.org/x/xerrors"
1615
"tailscale.com/derp"
1716
"tailscale.com/derp/derphttp"
@@ -21,6 +20,8 @@ import (
2120
"tailscale.com/tailcfg"
2221
"tailscale.com/types/key"
2322
tslogger "tailscale.com/types/logger"
23+
24+
"github.com/coder/coder/coderd/util/ptr"
2425
)
2526

2627
type DERPReport struct {
@@ -48,11 +49,12 @@ type DERPNodeReport struct {
4849
Healthy bool `json:"healthy"`
4950
Node *tailcfg.DERPNode `json:"node"`
5051

51-
CanExchangeMessages bool `json:"can_exchange_messages"`
52-
RoundTripPing time.Duration `json:"round_trip_ping"`
53-
UsesWebsocket bool `json:"uses_websocket"`
54-
ClientLogs [][]string `json:"client_logs"`
55-
ClientErrs [][]error `json:"client_errs"`
52+
ServerInfo derp.ServerInfoMessage `json:"node_info"`
53+
CanExchangeMessages bool `json:"can_exchange_messages"`
54+
RoundTripPing time.Duration `json:"round_trip_ping"`
55+
UsesWebsocket bool `json:"uses_websocket"`
56+
ClientLogs [][]string `json:"client_logs"`
57+
ClientErrs [][]error `json:"client_errs"`
5658

5759
STUN DERPStunReport `json:"stun"`
5860
}
@@ -161,8 +163,19 @@ func (r *DERPNodeReport) Run(ctx context.Context) {
161163
r.ClientLogs = [][]string{}
162164
r.ClientErrs = [][]error{}
163165

164-
r.doExchangeMessage(ctx)
165-
r.doSTUNTest(ctx)
166+
wg := &sync.WaitGroup{}
167+
168+
wg.Add(2)
169+
go func() {
170+
defer wg.Done()
171+
r.doExchangeMessage(ctx)
172+
}()
173+
go func() {
174+
defer wg.Done()
175+
r.doSTUNTest(ctx)
176+
}()
177+
178+
wg.Wait()
166179

167180
// We can't exchange messages with the node,
168181
if (!r.CanExchangeMessages && !r.Node.STUNOnly) ||
@@ -181,60 +194,78 @@ func (r *DERPNodeReport) doExchangeMessage(ctx context.Context) {
181194
return
182195
}
183196

184-
var peerKey atomic.Pointer[key.NodePublic]
185-
eg, ctx := errgroup.WithContext(ctx)
197+
var (
198+
peerKey atomic.Pointer[key.NodePublic]
199+
lastSent atomic.Pointer[time.Time]
200+
)
201+
ctx, cancel := context.WithCancel(ctx)
202+
defer cancel()
203+
wg := &sync.WaitGroup{}
186204

187205
receive, receiveID, err := r.derpClient(ctx, r.derpURL())
188206
if err != nil {
189207
return
190208
}
191209
defer receive.Close()
192210

193-
eg.Go(func() error {
211+
wg.Add(2)
212+
go func() {
213+
defer wg.Done()
194214
defer receive.Close()
195215

196216
pkt, err := r.recvData(receive)
197217
if err != nil {
198218
r.writeClientErr(receiveID, xerrors.Errorf("recv derp message: %w", err))
199-
return err
219+
return
200220
}
201221

202222
if *peerKey.Load() != pkt.Source {
203223
r.writeClientErr(receiveID, xerrors.Errorf("received pkt from unknown peer: %s", pkt.Source.ShortString()))
204-
return err
224+
return
205225
}
206226

207-
t, err := time.Parse(time.RFC3339Nano, string(pkt.Data))
208-
if err != nil {
209-
r.writeClientErr(receiveID, xerrors.Errorf("parse time from peer: %w", err))
210-
return err
211-
}
227+
t := lastSent.Load()
212228

213229
r.mu.Lock()
214230
r.CanExchangeMessages = true
215-
r.RoundTripPing = time.Since(t)
231+
r.RoundTripPing = time.Since(*t)
216232
r.mu.Unlock()
217-
return nil
218-
})
219-
eg.Go(func() error {
233+
234+
cancel()
235+
}()
236+
go func() {
237+
defer wg.Done()
220238
send, sendID, err := r.derpClient(ctx, r.derpURL())
221239
if err != nil {
222-
return err
240+
return
223241
}
224242
defer send.Close()
225243

226244
key := send.SelfPublicKey()
227245
peerKey.Store(&key)
228246

229-
err = send.Send(receive.SelfPublicKey(), []byte(time.Now().Format(time.RFC3339Nano)))
230-
if err != nil {
231-
r.writeClientErr(sendID, xerrors.Errorf("send derp message: %w", err))
232-
return err
247+
ticker := time.NewTicker(time.Second)
248+
defer ticker.Stop()
249+
250+
var iter uint8
251+
for {
252+
lastSent.Store(ptr.Ref(time.Now()))
253+
err = send.Send(receive.SelfPublicKey(), []byte{iter})
254+
if err != nil {
255+
r.writeClientErr(sendID, xerrors.Errorf("send derp message: %w", err))
256+
return
257+
}
258+
iter++
259+
260+
select {
261+
case <-ctx.Done():
262+
return
263+
case <-ticker.C:
264+
}
233265
}
234-
return nil
235-
})
266+
}()
236267

237-
_ = eg.Wait()
268+
wg.Wait()
238269
}
239270

240271
func (r *DERPNodeReport) doSTUNTest(ctx context.Context) {
@@ -378,7 +409,7 @@ func (r *DERPNodeReport) derpClient(ctx context.Context, derpURL *url.URL) (*der
378409
return client, id, nil
379410
}
380411

381-
func (*DERPNodeReport) recvData(client *derphttp.Client) (derp.ReceivedPacket, error) {
412+
func (r *DERPNodeReport) recvData(client *derphttp.Client) (derp.ReceivedPacket, error) {
382413
for {
383414
msg, err := client.Recv()
384415
if err != nil {
@@ -388,6 +419,10 @@ func (*DERPNodeReport) recvData(client *derphttp.Client) (derp.ReceivedPacket, e
388419
switch msg := msg.(type) {
389420
case derp.ReceivedPacket:
390421
return msg, nil
422+
case derp.ServerInfoMessage:
423+
r.mu.Lock()
424+
r.ServerInfo = msg
425+
r.mu.Unlock()
391426
default:
392427
// Drop all others!
393428
}

coderd/healthcheck/derp_test.go

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ import (
1919

2020
"github.com/coder/coder/coderd/healthcheck"
2121
"github.com/coder/coder/tailnet"
22+
"github.com/coder/coder/testutil"
2223
)
2324

2425
//nolint:tparallel
@@ -66,8 +67,7 @@ func TestDERP(t *testing.T) {
6667
for _, node := range region.NodeReports {
6768
assert.True(t, node.Healthy)
6869
assert.True(t, node.CanExchangeMessages)
69-
// TODO: test this without serializing time.Time over the wire.
70-
// assert.Positive(t, node.RoundTripPing)
70+
assert.Positive(t, node.RoundTripPing)
7171
assert.Len(t, node.ClientLogs, 2)
7272
assert.Len(t, node.ClientLogs[0], 1)
7373
assert.Len(t, node.ClientErrs[0], 0)
@@ -81,9 +81,13 @@ func TestDERP(t *testing.T) {
8181
}
8282
})
8383

84-
t.Run("OK/Tailscale/Dallas", func(t *testing.T) {
84+
t.Run("Tailscale/Dallas/OK", func(t *testing.T) {
8585
t.Parallel()
8686

87+
if testutil.InCI() {
88+
t.Skip("This test depends on reaching out over the network to Tailscale servers, which is inherently flaky.")
89+
}
90+
8791
derpSrv := derp.NewServer(key.NewNode(), func(format string, args ...any) { t.Logf(format, args...) })
8892
defer derpSrv.Close()
8993
srv := httptest.NewServer(derphttp.Handler(derpSrv))
@@ -107,8 +111,7 @@ func TestDERP(t *testing.T) {
107111
for _, node := range region.NodeReports {
108112
assert.True(t, node.Healthy)
109113
assert.True(t, node.CanExchangeMessages)
110-
// TODO: test this without serializing time.Time over the wire.
111-
// assert.Positive(t, node.RoundTripPing)
114+
assert.Positive(t, node.RoundTripPing)
112115
assert.Len(t, node.ClientLogs, 2)
113116
assert.Len(t, node.ClientLogs[0], 1)
114117
assert.Len(t, node.ClientErrs[0], 0)
@@ -171,13 +174,12 @@ func TestDERP(t *testing.T) {
171174
for _, node := range region.NodeReports {
172175
assert.False(t, node.Healthy)
173176
assert.True(t, node.CanExchangeMessages)
174-
// TODO: test this without serializing time.Time over the wire.
175-
// assert.Positive(t, node.RoundTripPing)
177+
assert.Positive(t, node.RoundTripPing)
176178
assert.Len(t, node.ClientLogs, 2)
177179
assert.Len(t, node.ClientLogs[0], 3)
178180
assert.Len(t, node.ClientLogs[1], 3)
179181
assert.Len(t, node.ClientErrs, 2)
180-
assert.Len(t, node.ClientErrs[0], 1)
182+
assert.Len(t, node.ClientErrs[0], 1) // this
181183
assert.Len(t, node.ClientErrs[1], 1)
182184
assert.True(t, node.UsesWebsocket)
183185

@@ -188,7 +190,7 @@ func TestDERP(t *testing.T) {
188190
}
189191
})
190192

191-
t.Run("OK/STUNOnly", func(t *testing.T) {
193+
t.Run("STUNOnly/OK", func(t *testing.T) {
192194
t.Parallel()
193195

194196
var (

docs/api/debug.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,10 @@ curl -X GET http://coder-server:8080/api/v2/debug/health \
103103
"stunport": 0,
104104
"stuntestIP": "string"
105105
},
106+
"node_info": {
107+
"tokenBucketBytesBurst": 0,
108+
"tokenBucketBytesPerSecond": 0
109+
},
106110
"round_trip_ping": 0,
107111
"stun": {
108112
"canSTUN": true,
@@ -158,6 +162,10 @@ curl -X GET http://coder-server:8080/api/v2/debug/health \
158162
"stunport": 0,
159163
"stuntestIP": "string"
160164
},
165+
"node_info": {
166+
"tokenBucketBytesBurst": 0,
167+
"tokenBucketBytesPerSecond": 0
168+
},
161169
"round_trip_ping": 0,
162170
"stun": {
163171
"canSTUN": true,

0 commit comments

Comments
 (0)