Skip to content

Commit 048dc04

Browse files
authored
feat: ensure coder remains healthy with single degraded DERP server (#10813)
1 parent abafc08 commit 048dc04

File tree

2 files changed

+61
-3
lines changed

2 files changed

+61
-3
lines changed

coderd/healthcheck/derphealth/derp.go

+11-3
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ import (
2626

2727
const (
2828
warningNodeUsesWebsocket = `Node uses WebSockets because the "Upgrade: DERP" header may be blocked on the load balancer.`
29+
oneNodeUnhealthy = "Region is operational, but performance might be degraded as one node is unhealthy."
2930
)
3031

3132
// @typescript-generate Report
@@ -146,6 +147,7 @@ func (r *RegionReport) Run(ctx context.Context) {
146147
r.NodeReports = []*NodeReport{}
147148

148149
wg := &sync.WaitGroup{}
150+
var healthyNodes int // A plain int suffices (no atomic.Int64): it is only incremented while r.mu is held and only read after wg.Wait().
149151

150152
wg.Add(len(r.Region.Nodes))
151153
for _, node := range r.Region.Nodes {
@@ -169,8 +171,8 @@ func (r *RegionReport) Run(ctx context.Context) {
169171

170172
r.mu.Lock()
171173
r.NodeReports = append(r.NodeReports, &nodeReport)
172-
if !nodeReport.Healthy {
173-
r.Healthy = false
174+
if nodeReport.Healthy {
175+
healthyNodes++
174176
}
175177

176178
for _, w := range nodeReport.Warnings {
@@ -179,8 +181,14 @@ func (r *RegionReport) Run(ctx context.Context) {
179181
r.mu.Unlock()
180182
}()
181183
}
182-
183184
wg.Wait()
185+
186+
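// Coder is meant to tolerate 1 unhealthy node in a region, unless the region has only 1 node,
// in which case the region is healthy only if that node is healthy.
// NOTE(review): the multi-node branch below only appends a warning and does not touch r.Healthy,
// regardless of how many nodes are unhealthy — confirm this is intended.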
187+
if len(r.Region.Nodes) == 1 {
188+
r.Healthy = healthyNodes == len(r.Region.Nodes)
189+
} else if healthyNodes < len(r.Region.Nodes) {
190+
r.Warnings = append(r.Warnings, oneNodeUnhealthy)
191+
}
184192
}
185193

186194
func (r *NodeReport) derpURL() *url.URL {

coderd/healthcheck/derphealth/derp_test.go

+50
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,56 @@ func TestDERP(t *testing.T) {
8181
}
8282
})
8383

84+
t.Run("HealthyWithNodeDegraded", func(t *testing.T) {
85+
t.Parallel()
86+
87+
healthyDerpSrv := derp.NewServer(key.NewNode(), func(format string, args ...any) { t.Logf(format, args...) })
88+
defer healthyDerpSrv.Close()
89+
healthySrv := httptest.NewServer(derphttp.Handler(healthyDerpSrv))
90+
defer healthySrv.Close()
91+
92+
var (
93+
ctx = context.Background()
94+
report = derphealth.Report{}
95+
derpURL, _ = url.Parse(healthySrv.URL)
96+
opts = &derphealth.ReportOptions{
97+
DERPMap: &tailcfg.DERPMap{Regions: map[int]*tailcfg.DERPRegion{
98+
1: {
99+
EmbeddedRelay: true,
100+
RegionID: 999,
101+
Nodes: []*tailcfg.DERPNode{{
102+
Name: "1a",
103+
RegionID: 999,
104+
HostName: derpURL.Host,
105+
IPv4: derpURL.Host,
106+
STUNPort: -1,
107+
InsecureForTests: true,
108+
ForceHTTP: true,
109+
}, {
110+
Name: "1b",
111+
RegionID: 999,
112+
HostName: "derp.is.dead.tld",
113+
IPv4: "derp.is.dead.tld",
114+
STUNPort: -1,
115+
InsecureForTests: true,
116+
ForceHTTP: true,
117+
}},
118+
},
119+
}},
120+
}
121+
)
122+
123+
report.Run(ctx, opts)
124+
125+
assert.True(t, report.Healthy)
126+
for _, region := range report.Regions {
127+
assert.True(t, region.Healthy)
128+
assert.True(t, region.NodeReports[0].Healthy)
129+
assert.False(t, region.NodeReports[1].Healthy)
130+
assert.Len(t, region.Warnings, 1)
131+
}
132+
})
133+
84134
t.Run("Tailscale/Dallas/OK", func(t *testing.T) {
85135
t.Parallel()
86136

0 commit comments

Comments
 (0)