Skip to content

Commit 9a4703a

Browse files
authored
feat(coderd/healthcheck): improve detection of STUN issues (#12951)
Adds checks to coderd/healthcheck/derphealth for STUN issues: - Alerts if there is not least one healthy STUN server, - Alerts if we see variable port mapping.
1 parent c13909a commit 9a4703a

File tree

5 files changed

+215
-3
lines changed

5 files changed

+215
-3
lines changed

cli/support.go

+3-2
Original file line numberDiff line numberDiff line change
@@ -101,7 +101,7 @@ func (r *RootCmd) supportBundle() *serpent.Command {
101101

102102
// Check if we're running inside a workspace
103103
if val, found := os.LookupEnv("CODER"); found && val == "true" {
104-
_, _ = fmt.Fprintln(inv.Stderr, "Running inside Coder workspace; this can affect results!")
104+
cliui.Warn(inv.Stderr, "Running inside Coder workspace; this can affect results!")
105105
cliLog.Debug(inv.Context(), "running inside coder workspace")
106106
}
107107

@@ -122,7 +122,7 @@ func (r *RootCmd) supportBundle() *serpent.Command {
122122

123123
if len(inv.Args) == 0 {
124124
cliLog.Warn(inv.Context(), "no workspace specified")
125-
_, _ = fmt.Fprintln(inv.Stderr, "Warning: no workspace specified. This will result in incomplete information.")
125+
cliui.Warn(inv.Stderr, "No workspace specified. This will result in incomplete information.")
126126
} else {
127127
ws, err := namedWorkspace(inv.Context(), client, inv.Args[0])
128128
if err != nil {
@@ -191,6 +191,7 @@ func (r *RootCmd) supportBundle() *serpent.Command {
191191
return xerrors.Errorf("write support bundle to %s: %w", outputPath, err)
192192
}
193193
_, _ = fmt.Fprintln(inv.Stderr, "Wrote support bundle to "+outputPath)
194+
194195
return nil
195196
},
196197
}

coderd/healthcheck/derphealth/derp.go

+23
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,8 @@ const (
3232
warningNodeUsesWebsocket = `Node uses WebSockets because the "Upgrade: DERP" header may be blocked on the load balancer.`
3333
oneNodeUnhealthy = "Region is operational, but performance might be degraded as one node is unhealthy."
3434
missingNodeReport = "Missing node health report, probably a developer error."
35+
noSTUN = "No STUN servers are available."
36+
stunMapVaryDest = "STUN returned different addresses; you may be behind a hard NAT."
3537
)
3638

3739
type ReportOptions struct {
@@ -107,9 +109,30 @@ func (r *Report) Run(ctx context.Context, opts *ReportOptions) {
107109
ncReport, netcheckErr := nc.GetReport(ctx, opts.DERPMap)
108110
r.Netcheck = ncReport
109111
r.NetcheckErr = convertError(netcheckErr)
112+
if mapVaryDest, _ := r.Netcheck.MappingVariesByDestIP.Get(); mapVaryDest {
113+
r.Warnings = append(r.Warnings, health.Messagef(health.CodeSTUNMapVaryDest, stunMapVaryDest))
114+
}
110115

111116
wg.Wait()
112117

118+
// Count the number of STUN-capable nodes.
119+
var stunCapableNodes int
120+
var stunTotalNodes int
121+
for _, region := range r.Regions {
122+
for _, node := range region.NodeReports {
123+
if node.STUN.Enabled {
124+
stunTotalNodes++
125+
}
126+
if node.STUN.CanSTUN {
127+
stunCapableNodes++
128+
}
129+
}
130+
}
131+
if stunCapableNodes == 0 && stunTotalNodes > 0 {
132+
r.Severity = health.SeverityWarning
133+
r.Warnings = append(r.Warnings, health.Messagef(health.CodeSTUNNoNodes, noSTUN))
134+
}
135+
113136
// Review region reports and select the highest severity.
114137
for _, regionReport := range r.Regions {
115138
if regionReport.Severity.Value() > r.Severity.Value() {

coderd/healthcheck/derphealth/derp_test.go

+162-1
Original file line numberDiff line numberDiff line change
@@ -129,9 +129,67 @@ func TestDERP(t *testing.T) {
129129
assert.True(t, report.Healthy)
130130
assert.Equal(t, health.SeverityWarning, report.Severity)
131131
assert.True(t, report.Dismissed)
132-
if assert.NotEmpty(t, report.Warnings) {
132+
if assert.Len(t, report.Warnings, 1) {
133133
assert.Contains(t, report.Warnings[0].Code, health.CodeDERPOneNodeUnhealthy)
134134
}
135+
for _, region := range report.Regions {
136+
assert.True(t, region.Healthy)
137+
assert.True(t, region.NodeReports[0].Healthy)
138+
assert.Empty(t, region.NodeReports[0].Warnings)
139+
assert.Equal(t, health.SeverityOK, region.NodeReports[0].Severity)
140+
assert.False(t, region.NodeReports[1].Healthy)
141+
assert.Equal(t, health.SeverityError, region.NodeReports[1].Severity)
142+
assert.Len(t, region.Warnings, 1)
143+
}
144+
})
145+
146+
t.Run("HealthyWithNoSTUN", func(t *testing.T) {
147+
t.Parallel()
148+
149+
healthyDerpSrv := derp.NewServer(key.NewNode(), func(format string, args ...any) { t.Logf(format, args...) })
150+
defer healthyDerpSrv.Close()
151+
healthySrv := httptest.NewServer(derphttp.Handler(healthyDerpSrv))
152+
defer healthySrv.Close()
153+
154+
var (
155+
ctx = context.Background()
156+
report = derphealth.Report{}
157+
derpURL, _ = url.Parse(healthySrv.URL)
158+
opts = &derphealth.ReportOptions{
159+
DERPMap: &tailcfg.DERPMap{Regions: map[int]*tailcfg.DERPRegion{
160+
1: {
161+
EmbeddedRelay: true,
162+
RegionID: 999,
163+
Nodes: []*tailcfg.DERPNode{{
164+
Name: "1a",
165+
RegionID: 999,
166+
HostName: derpURL.Host,
167+
IPv4: derpURL.Host,
168+
STUNPort: -1,
169+
InsecureForTests: true,
170+
ForceHTTP: true,
171+
}, {
172+
Name: "badstun",
173+
RegionID: 999,
174+
HostName: derpURL.Host,
175+
STUNPort: 19302,
176+
STUNOnly: true,
177+
InsecureForTests: true,
178+
ForceHTTP: true,
179+
}},
180+
},
181+
}},
182+
}
183+
)
184+
185+
report.Run(ctx, opts)
186+
187+
assert.True(t, report.Healthy)
188+
assert.Equal(t, health.SeverityWarning, report.Severity)
189+
if assert.Len(t, report.Warnings, 2) {
190+
assert.EqualValues(t, report.Warnings[1].Code, health.CodeSTUNNoNodes)
191+
assert.EqualValues(t, report.Warnings[0].Code, health.CodeDERPOneNodeUnhealthy)
192+
}
135193
for _, region := range report.Regions {
136194
assert.True(t, region.Healthy)
137195
assert.True(t, region.NodeReports[0].Healthy)
@@ -291,8 +349,10 @@ func TestDERP(t *testing.T) {
291349
report.Run(ctx, opts)
292350

293351
assert.True(t, report.Healthy)
352+
assert.Equal(t, health.SeverityOK, report.Severity)
294353
for _, region := range report.Regions {
295354
assert.True(t, region.Healthy)
355+
assert.Equal(t, health.SeverityOK, region.Severity)
296356
for _, node := range region.NodeReports {
297357
assert.True(t, node.Healthy)
298358
assert.False(t, node.CanExchangeMessages)
@@ -304,6 +364,107 @@ func TestDERP(t *testing.T) {
304364
}
305365
}
306366
})
367+
368+
t.Run("STUNOnly/OneBadOneGood", func(t *testing.T) {
369+
t.Parallel()
370+
371+
var (
372+
ctx = context.Background()
373+
report = derphealth.Report{}
374+
opts = &derphealth.ReportOptions{
375+
DERPMap: &tailcfg.DERPMap{
376+
Regions: map[int]*tailcfg.DERPRegion{
377+
1: {
378+
EmbeddedRelay: true,
379+
RegionID: 999,
380+
Nodes: []*tailcfg.DERPNode{{
381+
Name: "badstun",
382+
RegionID: 999,
383+
HostName: "badstun.example.com",
384+
STUNPort: 19302,
385+
STUNOnly: true,
386+
InsecureForTests: true,
387+
ForceHTTP: true,
388+
}, {
389+
Name: "goodstun",
390+
RegionID: 999,
391+
HostName: "stun.l.google.com",
392+
STUNPort: 19302,
393+
STUNOnly: true,
394+
InsecureForTests: true,
395+
ForceHTTP: true,
396+
}},
397+
},
398+
},
399+
},
400+
}
401+
)
402+
403+
report.Run(ctx, opts)
404+
assert.True(t, report.Healthy)
405+
assert.Equal(t, health.SeverityWarning, report.Severity)
406+
if assert.Len(t, report.Warnings, 1) {
407+
assert.Equal(t, health.CodeDERPOneNodeUnhealthy, report.Warnings[0].Code)
408+
}
409+
for _, region := range report.Regions {
410+
assert.True(t, region.Healthy)
411+
assert.Equal(t, health.SeverityWarning, region.Severity)
412+
// badstun
413+
assert.False(t, region.NodeReports[0].Healthy)
414+
assert.True(t, region.NodeReports[0].STUN.Enabled)
415+
assert.False(t, region.NodeReports[0].STUN.CanSTUN)
416+
assert.NotNil(t, region.NodeReports[0].STUN.Error)
417+
// goodstun
418+
assert.True(t, region.NodeReports[1].Healthy)
419+
assert.True(t, region.NodeReports[1].STUN.Enabled)
420+
assert.True(t, region.NodeReports[1].STUN.CanSTUN)
421+
assert.Nil(t, region.NodeReports[1].STUN.Error)
422+
}
423+
})
424+
425+
t.Run("STUNOnly/NoStun", func(t *testing.T) {
426+
t.Parallel()
427+
428+
var (
429+
ctx = context.Background()
430+
report = derphealth.Report{}
431+
opts = &derphealth.ReportOptions{
432+
DERPMap: &tailcfg.DERPMap{
433+
Regions: map[int]*tailcfg.DERPRegion{
434+
1: {
435+
EmbeddedRelay: true,
436+
RegionID: 999,
437+
Nodes: []*tailcfg.DERPNode{{
438+
Name: "badstun",
439+
RegionID: 999,
440+
HostName: "badstun.example.com",
441+
STUNPort: 19302,
442+
STUNOnly: true,
443+
InsecureForTests: true,
444+
ForceHTTP: true,
445+
}},
446+
},
447+
},
448+
},
449+
}
450+
)
451+
452+
report.Run(ctx, opts)
453+
assert.False(t, report.Healthy)
454+
assert.Equal(t, health.SeverityError, report.Severity)
455+
for _, region := range report.Regions {
456+
assert.False(t, region.Healthy)
457+
assert.Equal(t, health.SeverityError, region.Severity)
458+
for _, node := range region.NodeReports {
459+
assert.False(t, node.Healthy)
460+
assert.False(t, node.CanExchangeMessages)
461+
assert.Empty(t, node.ClientLogs)
462+
assert.True(t, node.STUN.Enabled)
463+
assert.False(t, node.STUN.CanSTUN)
464+
assert.NotNil(t, node.STUN.Error)
465+
}
466+
}
467+
})
307468
}
308469

309470
func tsDERPMap(ctx context.Context, t testing.TB) *tailcfg.DERPMap {

coderd/healthcheck/health/model.go

+2
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,8 @@ const (
3636

3737
CodeDERPNodeUsesWebsocket Code = `EDERP01`
3838
CodeDERPOneNodeUnhealthy Code = `EDERP02`
39+
CodeSTUNNoNodes = `ESTUN01`
40+
CodeSTUNMapVaryDest = `ESTUN02`
3941

4042
CodeProvisionerDaemonsNoProvisionerDaemons Code = `EPD01`
4143
CodeProvisionerDaemonVersionMismatch Code = `EPD02`

docs/admin/healthcheck.md

+25
Original file line numberDiff line numberDiff line change
@@ -170,6 +170,31 @@ curl -v "https://coder.company.com/derp"
170170
# DERP requires connection upgrade
171171
```
172172

173+
### ESTUN01
174+
175+
_No STUN servers available._
176+
177+
**Problem:** This is shown if no STUN servers are available. Coder will use STUN
178+
to establish [direct connections](../networking/stun.md). Without at least one
179+
working STUN server, direct connections may not be possible.
180+
181+
**Solution:** Ensure that the
182+
[configured STUN severs](../cli/server.md#derp-server-stun-addresses) are
183+
reachable from Coder and that UDP traffic can be sent/received on the configured
184+
port.
185+
186+
### ESTUN02
187+
188+
_STUN returned different addresses; you may be behind a hard NAT._
189+
190+
**Problem:** This is a warning shown when multiple attempts to determine our
191+
public IP address/port via STUN resulted in different `ip:port` combinations.
192+
This is a sign that you are behind a "hard NAT", and may result in difficulty
193+
establishing direct connections. However, it does not mean that direct
194+
connections are impossible.
195+
196+
**Solution:** Engage with your network administrator.
197+
173198
## Websocket
174199

175200
Coder makes heavy use of [WebSockets](https://datatracker.ietf.org/doc/rfc6455/)

0 commit comments

Comments
 (0)