Skip to content

feat(coderd/healthcheck): improve detection of STUN issues #12951

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 13 commits into from
Apr 15, 2024
5 changes: 3 additions & 2 deletions cli/support.go
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ func (r *RootCmd) supportBundle() *serpent.Command {

// Check if we're running inside a workspace
if val, found := os.LookupEnv("CODER"); found && val == "true" {
_, _ = fmt.Fprintln(inv.Stderr, "Running inside Coder workspace; this can affect results!")
cliui.Warn(inv.Stderr, "Running inside Coder workspace; this can affect results!")
cliLog.Debug(inv.Context(), "running inside coder workspace")
}

Expand All @@ -122,7 +122,7 @@ func (r *RootCmd) supportBundle() *serpent.Command {

if len(inv.Args) == 0 {
cliLog.Warn(inv.Context(), "no workspace specified")
_, _ = fmt.Fprintln(inv.Stderr, "Warning: no workspace specified. This will result in incomplete information.")
cliui.Warn(inv.Stderr, "No workspace specified. This will result in incomplete information.")
} else {
ws, err := namedWorkspace(inv.Context(), client, inv.Args[0])
if err != nil {
Expand Down Expand Up @@ -191,6 +191,7 @@ func (r *RootCmd) supportBundle() *serpent.Command {
return xerrors.Errorf("write support bundle to %s: %w", outputPath, err)
}
_, _ = fmt.Fprintln(inv.Stderr, "Wrote support bundle to "+outputPath)

return nil
},
}
Expand Down
23 changes: 23 additions & 0 deletions coderd/healthcheck/derphealth/derp.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@ const (
warningNodeUsesWebsocket = `Node uses WebSockets because the "Upgrade: DERP" header may be blocked on the load balancer.`
oneNodeUnhealthy = "Region is operational, but performance might be degraded as one node is unhealthy."
missingNodeReport = "Missing node health report, probably a developer error."
noSTUN = "No STUN servers are available."
stunMapVaryDest = "STUN returned different addresses; you may be behind a hard NAT."
)

type ReportOptions struct {
Expand Down Expand Up @@ -107,9 +109,30 @@ func (r *Report) Run(ctx context.Context, opts *ReportOptions) {
ncReport, netcheckErr := nc.GetReport(ctx, opts.DERPMap)
r.Netcheck = ncReport
r.NetcheckErr = convertError(netcheckErr)
if mapVaryDest, _ := r.Netcheck.MappingVariesByDestIP.Get(); mapVaryDest {
r.Warnings = append(r.Warnings, health.Messagef(health.CodeSTUNMapVaryDest, stunMapVaryDest))
}

wg.Wait()

// Count the number of STUN-capable nodes.
var stunCapableNodes int
var stunTotalNodes int
for _, region := range r.Regions {
for _, node := range region.NodeReports {
if node.STUN.Enabled {
stunTotalNodes++
}
if node.STUN.CanSTUN {
stunCapableNodes++
}
}
}
if stunCapableNodes == 0 && stunTotalNodes > 0 {
r.Severity = health.SeverityWarning
r.Warnings = append(r.Warnings, health.Messagef(health.CodeSTUNNoNodes, noSTUN))
}

// Review region reports and select the highest severity.
for _, regionReport := range r.Regions {
if regionReport.Severity.Value() > r.Severity.Value() {
Expand Down
163 changes: 162 additions & 1 deletion coderd/healthcheck/derphealth/derp_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -129,9 +129,67 @@ func TestDERP(t *testing.T) {
assert.True(t, report.Healthy)
assert.Equal(t, health.SeverityWarning, report.Severity)
assert.True(t, report.Dismissed)
if assert.NotEmpty(t, report.Warnings) {
if assert.Len(t, report.Warnings, 1) {
assert.Contains(t, report.Warnings[0].Code, health.CodeDERPOneNodeUnhealthy)
}
for _, region := range report.Regions {
assert.True(t, region.Healthy)
assert.True(t, region.NodeReports[0].Healthy)
assert.Empty(t, region.NodeReports[0].Warnings)
assert.Equal(t, health.SeverityOK, region.NodeReports[0].Severity)
assert.False(t, region.NodeReports[1].Healthy)
assert.Equal(t, health.SeverityError, region.NodeReports[1].Severity)
assert.Len(t, region.Warnings, 1)
}
})

t.Run("HealthyWithNoSTUN", func(t *testing.T) {
t.Parallel()

healthyDerpSrv := derp.NewServer(key.NewNode(), func(format string, args ...any) { t.Logf(format, args...) })
defer healthyDerpSrv.Close()
healthySrv := httptest.NewServer(derphttp.Handler(healthyDerpSrv))
defer healthySrv.Close()

var (
ctx = context.Background()
report = derphealth.Report{}
derpURL, _ = url.Parse(healthySrv.URL)
opts = &derphealth.ReportOptions{
DERPMap: &tailcfg.DERPMap{Regions: map[int]*tailcfg.DERPRegion{
1: {
EmbeddedRelay: true,
RegionID: 999,
Nodes: []*tailcfg.DERPNode{{
Name: "1a",
RegionID: 999,
HostName: derpURL.Host,
IPv4: derpURL.Host,
STUNPort: -1,
InsecureForTests: true,
ForceHTTP: true,
}, {
Name: "badstun",
RegionID: 999,
HostName: derpURL.Host,
STUNPort: 19302,
STUNOnly: true,
InsecureForTests: true,
ForceHTTP: true,
}},
},
}},
}
)

report.Run(ctx, opts)

assert.True(t, report.Healthy)
assert.Equal(t, health.SeverityWarning, report.Severity)
if assert.Len(t, report.Warnings, 2) {
assert.EqualValues(t, report.Warnings[1].Code, health.CodeSTUNNoNodes)
assert.EqualValues(t, report.Warnings[0].Code, health.CodeDERPOneNodeUnhealthy)
}
for _, region := range report.Regions {
assert.True(t, region.Healthy)
assert.True(t, region.NodeReports[0].Healthy)
Expand Down Expand Up @@ -291,8 +349,10 @@ func TestDERP(t *testing.T) {
report.Run(ctx, opts)

assert.True(t, report.Healthy)
assert.Equal(t, health.SeverityOK, report.Severity)
for _, region := range report.Regions {
assert.True(t, region.Healthy)
assert.Equal(t, health.SeverityOK, region.Severity)
for _, node := range region.NodeReports {
assert.True(t, node.Healthy)
assert.False(t, node.CanExchangeMessages)
Expand All @@ -304,6 +364,107 @@ func TestDERP(t *testing.T) {
}
}
})

t.Run("STUNOnly/OneBadOneGood", func(t *testing.T) {
t.Parallel()

var (
ctx = context.Background()
report = derphealth.Report{}
opts = &derphealth.ReportOptions{
DERPMap: &tailcfg.DERPMap{
Regions: map[int]*tailcfg.DERPRegion{
1: {
EmbeddedRelay: true,
RegionID: 999,
Nodes: []*tailcfg.DERPNode{{
Name: "badstun",
RegionID: 999,
HostName: "badstun.example.com",
STUNPort: 19302,
STUNOnly: true,
InsecureForTests: true,
ForceHTTP: true,
}, {
Name: "goodstun",
RegionID: 999,
HostName: "stun.l.google.com",
STUNPort: 19302,
STUNOnly: true,
InsecureForTests: true,
ForceHTTP: true,
}},
},
},
},
}
)

report.Run(ctx, opts)
assert.True(t, report.Healthy)
assert.Equal(t, health.SeverityWarning, report.Severity)
if assert.Len(t, report.Warnings, 1) {
assert.Equal(t, health.CodeDERPOneNodeUnhealthy, report.Warnings[0].Code)
}
for _, region := range report.Regions {
assert.True(t, region.Healthy)
assert.Equal(t, health.SeverityWarning, region.Severity)
// badstun
assert.False(t, region.NodeReports[0].Healthy)
assert.True(t, region.NodeReports[0].STUN.Enabled)
assert.False(t, region.NodeReports[0].STUN.CanSTUN)
assert.NotNil(t, region.NodeReports[0].STUN.Error)
// goodstun
assert.True(t, region.NodeReports[1].Healthy)
assert.True(t, region.NodeReports[1].STUN.Enabled)
assert.True(t, region.NodeReports[1].STUN.CanSTUN)
assert.Nil(t, region.NodeReports[1].STUN.Error)
}
})

t.Run("STUNOnly/NoStun", func(t *testing.T) {
t.Parallel()

var (
ctx = context.Background()
report = derphealth.Report{}
opts = &derphealth.ReportOptions{
DERPMap: &tailcfg.DERPMap{
Regions: map[int]*tailcfg.DERPRegion{
1: {
EmbeddedRelay: true,
RegionID: 999,
Nodes: []*tailcfg.DERPNode{{
Name: "badstun",
RegionID: 999,
HostName: "badstun.example.com",
STUNPort: 19302,
STUNOnly: true,
InsecureForTests: true,
ForceHTTP: true,
}},
},
},
},
}
)

report.Run(ctx, opts)
assert.False(t, report.Healthy)
assert.Equal(t, health.SeverityError, report.Severity)
for _, region := range report.Regions {
assert.False(t, region.Healthy)
assert.Equal(t, health.SeverityError, region.Severity)
for _, node := range region.NodeReports {
assert.False(t, node.Healthy)
assert.False(t, node.CanExchangeMessages)
assert.Empty(t, node.ClientLogs)
assert.True(t, node.STUN.Enabled)
assert.False(t, node.STUN.CanSTUN)
assert.NotNil(t, node.STUN.Error)
}
}
})
}

func tsDERPMap(ctx context.Context, t testing.TB) *tailcfg.DERPMap {
Expand Down
2 changes: 2 additions & 0 deletions coderd/healthcheck/health/model.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,8 @@ const (

CodeDERPNodeUsesWebsocket Code = `EDERP01`
CodeDERPOneNodeUnhealthy Code = `EDERP02`
CodeSTUNNoNodes = `ESTUN01`
CodeSTUNMapVaryDest = `ESTUN02`

CodeProvisionerDaemonsNoProvisionerDaemons Code = `EPD01`
CodeProvisionerDaemonVersionMismatch Code = `EPD02`
Expand Down
25 changes: 25 additions & 0 deletions docs/admin/healthcheck.md
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,31 @@ curl -v "https://coder.company.com/derp"
# DERP requires connection upgrade
```

### ESTUN01

_No STUN servers available._

**Problem:** This is shown if no STUN servers are available. Coder will use STUN
to establish [direct connections](../networking/stun.md). Without at least one
working STUN server, direct connections may not be possible.

**Solution:** Ensure that the
[configured STUN severs](../cli/server.md#derp-server-stun-addresses) are
reachable from Coder and that UDP traffic can be sent/received on the configured
port.

### ESTUN02

_STUN returned different addresses; you may be behind a hard NAT._

**Problem:** This is a warning shown when multiple attempts to determine our
public IP address/port via STUN resulted in different `ip:port` combinations.
This is a sign that you are behind a "hard NAT", and may result in difficulty
establishing direct connections. However, it does not mean that direct
connections are impossible.

**Solution:** Engage with your network administrator.

## Websocket

Coder makes heavy use of [WebSockets](https://datatracker.ietf.org/doc/rfc6455/)
Expand Down
Loading