Skip to content

Commit b69c237

Browse files
authored
feat(coderd/healthcheck): allow configuring database hc threshold (#10623)
* feat(coderd/healthcheck): allow configuring database hc threshold
* feat(coderd): add database hc latency, plumb through
* feat(coderd): allow configuring healthcheck refresh interval
1 parent e4211cc commit b69c237

17 files changed

+288
-55
lines changed

cli/testdata/coder_server_--help.golden

+9
Original file line number | Diff line number | Diff line change
@@ -80,6 +80,15 @@ Use a YAML configuration file when your server launch becomes unwieldy.
8080

8181
Write out the current server config as YAML to stdout.
8282

83+
INTROSPECTION / HEALTH CHECK OPTIONS:
84+
--health-check-refresh duration, $CODER_HEALTH_CHECK_REFRESH (default: 10m0s)
85+
Refresh interval for healthchecks.
86+
87+
--health-check-threshold-database duration, $CODER_HEALTH_CHECK_THRESHOLD_DATABASE (default: 15ms)
88+
The threshold for the database health check. If the median latency of
89+
the database exceeds this threshold over 5 attempts, the database is
90+
considered unhealthy. The default value is 15ms.
91+
8392
INTROSPECTION / LOGGING OPTIONS:
8493
--enable-terraform-debug-mode bool, $CODER_ENABLE_TERRAFORM_DEBUG_MODE (default: false)
8594
Allow administrators to enable Terraform debug output.

cli/testdata/server-config.yaml.golden

+9
Original file line number | Diff line number | Diff line change
@@ -232,6 +232,15 @@ introspection:
232232
# Allow administrators to enable Terraform debug output.
233233
# (default: false, type: bool)
234234
enableTerraformDebugMode: false
235+
healthcheck:
236+
# Refresh interval for healthchecks.
237+
# (default: 10m0s, type: duration)
238+
refresh: 10m0s
239+
# The threshold for the database health check. If the median latency of the
240+
# database exceeds this threshold over 5 attempts, the database is considered
241+
# unhealthy. The default value is 15ms.
242+
# (default: 15ms, type: duration)
243+
thresholdDatabase: 15ms
235244
oauth2:
236245
github:
237246
# Client ID for Login with GitHub.

coderd/apidoc/docs.go

+17
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

coderd/apidoc/swagger.json

+17
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

coderd/coderd.go

+16-5
Original file line number | Diff line number | Diff line change
@@ -38,6 +38,7 @@ import (
3838
// Used for swagger docs.
3939
_ "github.com/coder/coder/v2/coderd/apidoc"
4040
"github.com/coder/coder/v2/coderd/externalauth"
41+
"github.com/coder/coder/v2/coderd/healthcheck/derphealth"
4142

4243
"cdr.dev/slog"
4344
"github.com/coder/coder/v2/buildinfo"
@@ -398,18 +399,28 @@ func New(options *Options) *API {
398399
if options.HealthcheckFunc == nil {
399400
options.HealthcheckFunc = func(ctx context.Context, apiKey string) *healthcheck.Report {
400401
return healthcheck.Run(ctx, &healthcheck.ReportOptions{
401-
DB: options.Database,
402-
AccessURL: options.AccessURL,
403-
DERPMap: api.DERPMap(),
404-
APIKey: apiKey,
402+
Database: healthcheck.DatabaseReportOptions{
403+
DB: options.Database,
404+
Threshold: options.DeploymentValues.Healthcheck.ThresholdDatabase.Value(),
405+
},
406+
Websocket: healthcheck.WebsocketReportOptions{
407+
AccessURL: options.AccessURL,
408+
APIKey: apiKey,
409+
},
410+
AccessURL: healthcheck.AccessURLReportOptions{
411+
AccessURL: options.AccessURL,
412+
},
413+
DerpHealth: derphealth.ReportOptions{
414+
DERPMap: api.DERPMap(),
415+
},
405416
})
406417
}
407418
}
408419
if options.HealthcheckTimeout == 0 {
409420
options.HealthcheckTimeout = 30 * time.Second
410421
}
411422
if options.HealthcheckRefresh == 0 {
412-
options.HealthcheckRefresh = 10 * time.Minute
423+
options.HealthcheckRefresh = options.DeploymentValues.Healthcheck.Refresh.Value()
413424
}
414425

415426
var oidcAuthURLParams map[string]string

coderd/debug.go

+3-3
Original file line number | Diff line number | Diff line change
@@ -32,20 +32,20 @@ func (api *API) debugCoordinator(rw http.ResponseWriter, r *http.Request) {
3232
// @Router /debug/health [get]
3333
func (api *API) debugDeploymentHealth(rw http.ResponseWriter, r *http.Request) {
3434
apiKey := httpmw.APITokenFromRequest(r)
35-
ctx, cancel := context.WithTimeout(r.Context(), api.HealthcheckTimeout)
35+
ctx, cancel := context.WithTimeout(r.Context(), api.Options.HealthcheckTimeout)
3636
defer cancel()
3737

3838
// Get cached report if it exists.
3939
if report := api.healthCheckCache.Load(); report != nil {
40-
if time.Since(report.Time) < api.HealthcheckRefresh {
40+
if time.Since(report.Time) < api.Options.HealthcheckRefresh {
4141
formatHealthcheck(ctx, rw, r, report)
4242
return
4343
}
4444
}
4545

4646
resChan := api.healthCheckGroup.DoChan("", func() (*healthcheck.Report, error) {
4747
// Create a new context not tied to the request.
48-
ctx, cancel := context.WithTimeout(context.Background(), api.HealthcheckTimeout)
48+
ctx, cancel := context.WithTimeout(context.Background(), api.Options.HealthcheckTimeout)
4949
defer cancel()
5050

5151
report := api.HealthcheckFunc(ctx, apiKey)

coderd/debug_test.go

+45
Original file line number | Diff line number | Diff line change
@@ -72,6 +72,51 @@ func TestDebugHealth(t *testing.T) {
7272
require.Equal(t, http.StatusNotFound, res.StatusCode)
7373
})
7474

75+
t.Run("Refresh", func(t *testing.T) {
76+
t.Parallel()
77+
78+
var (
79+
calls = make(chan struct{})
80+
callsDone = make(chan struct{})
81+
ctx, cancel = context.WithTimeout(context.Background(), testutil.WaitShort)
82+
client = coderdtest.New(t, &coderdtest.Options{
83+
HealthcheckRefresh: time.Microsecond,
84+
HealthcheckFunc: func(context.Context, string) *healthcheck.Report {
85+
calls <- struct{}{}
86+
return &healthcheck.Report{}
87+
},
88+
})
89+
_ = coderdtest.CreateFirstUser(t, client)
90+
)
91+
92+
defer cancel()
93+
94+
go func() {
95+
defer close(callsDone)
96+
<-calls
97+
<-time.After(testutil.IntervalFast)
98+
<-calls
99+
}()
100+
101+
res, err := client.Request(ctx, "GET", "/api/v2/debug/health", nil)
102+
require.NoError(t, err)
103+
defer res.Body.Close()
104+
_, _ = io.ReadAll(res.Body)
105+
require.Equal(t, http.StatusOK, res.StatusCode)
106+
107+
res, err = client.Request(ctx, "GET", "/api/v2/debug/health", nil)
108+
require.NoError(t, err)
109+
defer res.Body.Close()
110+
_, _ = io.ReadAll(res.Body)
111+
require.Equal(t, http.StatusOK, res.StatusCode)
112+
113+
select {
114+
case <-callsDone:
115+
case <-ctx.Done():
116+
t.Fatal("timed out waiting for calls to finish")
117+
}
118+
})
119+
75120
t.Run("Deduplicated", func(t *testing.T) {
76121
t.Parallel()
77122

coderd/healthcheck/database.go

+18-10
Original file line number | Diff line number | Diff line change
@@ -10,20 +10,30 @@ import (
1010
"github.com/coder/coder/v2/coderd/database"
1111
)
1212

13+
const (
14+
DatabaseDefaultThreshold = 15 * time.Millisecond
15+
)
16+
1317
// @typescript-generate DatabaseReport
1418
type DatabaseReport struct {
15-
Healthy bool `json:"healthy"`
16-
Reachable bool `json:"reachable"`
17-
Latency string `json:"latency"`
18-
LatencyMs int `json:"latency_ms"`
19-
Error *string `json:"error"`
19+
Healthy bool `json:"healthy"`
20+
Reachable bool `json:"reachable"`
21+
Latency string `json:"latency"`
22+
LatencyMS int64 `json:"latency_ms"`
23+
ThresholdMS int64 `json:"threshold_ms"`
24+
Error *string `json:"error"`
2025
}
2126

2227
type DatabaseReportOptions struct {
23-
DB database.Store
28+
DB database.Store
29+
Threshold time.Duration
2430
}
2531

2632
func (r *DatabaseReport) Run(ctx context.Context, opts *DatabaseReportOptions) {
33+
r.ThresholdMS = opts.Threshold.Milliseconds()
34+
if r.ThresholdMS == 0 {
35+
r.ThresholdMS = DatabaseDefaultThreshold.Milliseconds()
36+
}
2737
ctx, cancel := context.WithTimeout(ctx, 5*time.Second)
2838
defer cancel()
2939

@@ -43,10 +53,8 @@ func (r *DatabaseReport) Run(ctx context.Context, opts *DatabaseReportOptions) {
4353
// Take the median ping.
4454
latency := pings[pingCount/2]
4555
r.Latency = latency.String()
46-
r.LatencyMs = int(latency.Milliseconds())
47-
// Somewhat arbitrary, but if the latency is over 15ms, we consider it
48-
// unhealthy.
49-
if latency < 15*time.Millisecond {
56+
r.LatencyMS = latency.Milliseconds()
57+
if r.LatencyMS < r.ThresholdMS {
5058
r.Healthy = true
5159
}
5260
r.Reachable = true

coderd/healthcheck/database_test.go

+31-2
Original file line number | Diff line number | Diff line change
@@ -36,7 +36,8 @@ func TestDatabase(t *testing.T) {
3636
assert.True(t, report.Healthy)
3737
assert.True(t, report.Reachable)
3838
assert.Equal(t, ping.String(), report.Latency)
39-
assert.Equal(t, int(ping.Milliseconds()), report.LatencyMs)
39+
assert.Equal(t, ping.Milliseconds(), report.LatencyMS)
40+
assert.Equal(t, healthcheck.DatabaseDefaultThreshold.Milliseconds(), report.ThresholdMS)
4041
assert.Nil(t, report.Error)
4142
})
4243

@@ -59,6 +60,7 @@ func TestDatabase(t *testing.T) {
5960
assert.False(t, report.Reachable)
6061
assert.Zero(t, report.Latency)
6162
require.NotNil(t, report.Error)
63+
assert.Equal(t, healthcheck.DatabaseDefaultThreshold.Milliseconds(), report.ThresholdMS)
6264
assert.Contains(t, *report.Error, err.Error())
6365
})
6466

@@ -83,7 +85,34 @@ func TestDatabase(t *testing.T) {
8385
assert.True(t, report.Healthy)
8486
assert.True(t, report.Reachable)
8587
assert.Equal(t, time.Millisecond.String(), report.Latency)
86-
assert.Equal(t, 1, report.LatencyMs)
88+
assert.EqualValues(t, 1, report.LatencyMS)
89+
assert.Equal(t, healthcheck.DatabaseDefaultThreshold.Milliseconds(), report.ThresholdMS)
90+
assert.Nil(t, report.Error)
91+
})
92+
93+
t.Run("Threshold", func(t *testing.T) {
94+
t.Parallel()
95+
96+
var (
97+
ctx, cancel = context.WithTimeout(context.Background(), testutil.WaitShort)
98+
report = healthcheck.DatabaseReport{}
99+
db = dbmock.NewMockStore(gomock.NewController(t))
100+
)
101+
defer cancel()
102+
103+
db.EXPECT().Ping(gomock.Any()).Return(time.Second, nil)
104+
db.EXPECT().Ping(gomock.Any()).Return(time.Millisecond, nil)
105+
db.EXPECT().Ping(gomock.Any()).Return(time.Second, nil)
106+
db.EXPECT().Ping(gomock.Any()).Return(time.Millisecond, nil)
107+
db.EXPECT().Ping(gomock.Any()).Return(time.Second, nil)
108+
109+
report.Run(ctx, &healthcheck.DatabaseReportOptions{DB: db, Threshold: time.Second})
110+
111+
assert.False(t, report.Healthy)
112+
assert.True(t, report.Reachable)
113+
assert.Equal(t, time.Second.String(), report.Latency)
114+
assert.EqualValues(t, 1000, report.LatencyMS)
115+
assert.Equal(t, time.Second.Milliseconds(), report.ThresholdMS)
87116
assert.Nil(t, report.Error)
88117
})
89118
}

0 commit comments

Comments
 (0)