Skip to content

feat(coderd/healthcheck): allow configuring database hc threshold #10623

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 10 commits into from
Nov 13, 2023
9 changes: 9 additions & 0 deletions cli/testdata/coder_server_--help.golden
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,15 @@ Use a YAML configuration file when your server launch become unwieldy.

Write out the current server config as YAML to stdout.

INTROSPECTION / HEALTH CHECK OPTIONS:
--health-check-refresh duration, $CODER_HEALTH_CHECK_REFRESH (default: 10m0s)
Refresh interval for healthchecks.

--health-check-threshold-database duration, $CODER_HEALTH_CHECK_THRESHOLD_DATABASE (default: 15ms)
The threshold for the database health check. If the median latency of
the database exceeds this threshold over 5 attempts, the database is
considered unhealthy. The default value is 15ms.

INTROSPECTION / LOGGING OPTIONS:
--enable-terraform-debug-mode bool, $CODER_ENABLE_TERRAFORM_DEBUG_MODE (default: false)
Allow administrators to enable Terraform debug output.
Expand Down
9 changes: 9 additions & 0 deletions cli/testdata/server-config.yaml.golden
Original file line number Diff line number Diff line change
Expand Up @@ -232,6 +232,15 @@ introspection:
# Allow administrators to enable Terraform debug output.
# (default: false, type: bool)
enableTerraformDebugMode: false
healthcheck:
# Refresh interval for healthchecks.
# (default: 10m0s, type: duration)
refresh: 10m0s
# The threshold for the database health check. If the median latency of the
# database exceeds this threshold over 5 attempts, the database is considered
# unhealthy. The default value is 15ms.
# (default: 15ms, type: duration)
thresholdDatabase: 15ms
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this would be smoother as databaseThreshold vs thresholdDatabase (same with flags, etc)

oauth2:
github:
# Client ID for Login with GitHub.
Expand Down
17 changes: 17 additions & 0 deletions coderd/apidoc/docs.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

17 changes: 17 additions & 0 deletions coderd/apidoc/swagger.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

21 changes: 16 additions & 5 deletions coderd/coderd.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ import (
// Used for swagger docs.
_ "github.com/coder/coder/v2/coderd/apidoc"
"github.com/coder/coder/v2/coderd/externalauth"
"github.com/coder/coder/v2/coderd/healthcheck/derphealth"

"cdr.dev/slog"
"github.com/coder/coder/v2/buildinfo"
Expand Down Expand Up @@ -398,18 +399,28 @@ func New(options *Options) *API {
if options.HealthcheckFunc == nil {
options.HealthcheckFunc = func(ctx context.Context, apiKey string) *healthcheck.Report {
return healthcheck.Run(ctx, &healthcheck.ReportOptions{
DB: options.Database,
AccessURL: options.AccessURL,
DERPMap: api.DERPMap(),
APIKey: apiKey,
Database: healthcheck.DatabaseReportOptions{
DB: options.Database,
Threshold: options.DeploymentValues.Healthcheck.ThresholdDatabase.Value(),
},
Websocket: healthcheck.WebsocketReportOptions{
AccessURL: options.AccessURL,
APIKey: apiKey,
},
AccessURL: healthcheck.AccessURLReportOptions{
AccessURL: options.AccessURL,
},
DerpHealth: derphealth.ReportOptions{
DERPMap: api.DERPMap(),
},
})
}
}
if options.HealthcheckTimeout == 0 {
options.HealthcheckTimeout = 30 * time.Second
}
if options.HealthcheckRefresh == 0 {
options.HealthcheckRefresh = 10 * time.Minute
options.HealthcheckRefresh = options.DeploymentValues.Healthcheck.Refresh.Value()
}

var oidcAuthURLParams map[string]string
Expand Down
6 changes: 3 additions & 3 deletions coderd/debug.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,20 +32,20 @@ func (api *API) debugCoordinator(rw http.ResponseWriter, r *http.Request) {
// @Router /debug/health [get]
func (api *API) debugDeploymentHealth(rw http.ResponseWriter, r *http.Request) {
apiKey := httpmw.APITokenFromRequest(r)
ctx, cancel := context.WithTimeout(r.Context(), api.HealthcheckTimeout)
ctx, cancel := context.WithTimeout(r.Context(), api.Options.HealthcheckTimeout)
defer cancel()

// Get cached report if it exists.
if report := api.healthCheckCache.Load(); report != nil {
if time.Since(report.Time) < api.HealthcheckRefresh {
if time.Since(report.Time) < api.Options.HealthcheckRefresh {
formatHealthcheck(ctx, rw, r, report)
return
}
}

resChan := api.healthCheckGroup.DoChan("", func() (*healthcheck.Report, error) {
// Create a new context not tied to the request.
ctx, cancel := context.WithTimeout(context.Background(), api.HealthcheckTimeout)
ctx, cancel := context.WithTimeout(context.Background(), api.Options.HealthcheckTimeout)
defer cancel()

report := api.HealthcheckFunc(ctx, apiKey)
Expand Down
45 changes: 45 additions & 0 deletions coderd/debug_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,51 @@ func TestDebugHealth(t *testing.T) {
require.Equal(t, http.StatusNotFound, res.StatusCode)
})

t.Run("Refresh", func(t *testing.T) {
t.Parallel()

var (
calls = make(chan struct{})
callsDone = make(chan struct{})
ctx, cancel = context.WithTimeout(context.Background(), testutil.WaitShort)
client = coderdtest.New(t, &coderdtest.Options{
HealthcheckRefresh: time.Microsecond,
HealthcheckFunc: func(context.Context, string) *healthcheck.Report {
calls <- struct{}{}
return &healthcheck.Report{}
},
})
_ = coderdtest.CreateFirstUser(t, client)
)

defer cancel()

go func() {
defer close(callsDone)
<-calls
<-time.After(testutil.IntervalFast)
<-calls
}()

res, err := client.Request(ctx, "GET", "/api/v2/debug/health", nil)
require.NoError(t, err)
defer res.Body.Close()
_, _ = io.ReadAll(res.Body)
require.Equal(t, http.StatusOK, res.StatusCode)

res, err = client.Request(ctx, "GET", "/api/v2/debug/health", nil)
require.NoError(t, err)
defer res.Body.Close()
_, _ = io.ReadAll(res.Body)
require.Equal(t, http.StatusOK, res.StatusCode)

select {
case <-callsDone:
case <-ctx.Done():
t.Fatal("timed out waiting for calls to finish")
}
})

t.Run("Deduplicated", func(t *testing.T) {
t.Parallel()

Expand Down
28 changes: 18 additions & 10 deletions coderd/healthcheck/database.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,20 +10,30 @@ import (
"github.com/coder/coder/v2/coderd/database"
)

const (
DatabaseDefaultThreshold = 15 * time.Millisecond
)

// @typescript-generate DatabaseReport
type DatabaseReport struct {
Healthy bool `json:"healthy"`
Reachable bool `json:"reachable"`
Latency string `json:"latency"`
LatencyMs int `json:"latency_ms"`
Error *string `json:"error"`
Healthy bool `json:"healthy"`
Reachable bool `json:"reachable"`
Latency string `json:"latency"`
LatencyMS int64 `json:"latency_ms"`
ThresholdMS int64 `json:"threshold_ms"`
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we add the Threshold string for consistency? I don't know what is the original though behind string+int64 pairs.

Copy link
Member Author

@johnstcn johnstcn Nov 13, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Me either, and the fact that I don't know why this particular fence exists makes me pause before knocking it down. I do know that _ms was added post-hoc for the UI (8ee500c) so I suspect it was simply a case of not renaming a field in the API for compatibility purposes.

Instead of parsing the Go string representation (latency) it's probably better for the UI to just create the durations from millis and then display them however is deemed best. So I would argue in favour of leaving out threshold entirely.

Error *string `json:"error"`
}

type DatabaseReportOptions struct {
DB database.Store
DB database.Store
Threshold time.Duration
}

func (r *DatabaseReport) Run(ctx context.Context, opts *DatabaseReportOptions) {
r.ThresholdMS = opts.Threshold.Milliseconds()
if r.ThresholdMS == 0 {
r.ThresholdMS = DatabaseDefaultThreshold.Milliseconds()
}
ctx, cancel := context.WithTimeout(ctx, 5*time.Second)
defer cancel()

Expand All @@ -43,10 +53,8 @@ func (r *DatabaseReport) Run(ctx context.Context, opts *DatabaseReportOptions) {
// Take the median ping.
latency := pings[pingCount/2]
r.Latency = latency.String()
r.LatencyMs = int(latency.Milliseconds())
// Somewhat arbitrary, but if the latency is over 15ms, we consider it
// unhealthy.
if latency < 15*time.Millisecond {
r.LatencyMS = latency.Milliseconds()
if r.LatencyMS < r.ThresholdMS {
r.Healthy = true
}
r.Reachable = true
Expand Down
33 changes: 31 additions & 2 deletions coderd/healthcheck/database_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,8 @@ func TestDatabase(t *testing.T) {
assert.True(t, report.Healthy)
assert.True(t, report.Reachable)
assert.Equal(t, ping.String(), report.Latency)
assert.Equal(t, int(ping.Milliseconds()), report.LatencyMs)
assert.Equal(t, ping.Milliseconds(), report.LatencyMS)
assert.Equal(t, healthcheck.DatabaseDefaultThreshold.Milliseconds(), report.ThresholdMS)
assert.Nil(t, report.Error)
})

Expand All @@ -59,6 +60,7 @@ func TestDatabase(t *testing.T) {
assert.False(t, report.Reachable)
assert.Zero(t, report.Latency)
require.NotNil(t, report.Error)
assert.Equal(t, healthcheck.DatabaseDefaultThreshold.Milliseconds(), report.ThresholdMS)
assert.Contains(t, *report.Error, err.Error())
})

Expand All @@ -83,7 +85,34 @@ func TestDatabase(t *testing.T) {
assert.True(t, report.Healthy)
assert.True(t, report.Reachable)
assert.Equal(t, time.Millisecond.String(), report.Latency)
assert.Equal(t, 1, report.LatencyMs)
assert.EqualValues(t, 1, report.LatencyMS)
assert.Equal(t, healthcheck.DatabaseDefaultThreshold.Milliseconds(), report.ThresholdMS)
assert.Nil(t, report.Error)
})

t.Run("Threshold", func(t *testing.T) {
t.Parallel()

var (
ctx, cancel = context.WithTimeout(context.Background(), testutil.WaitShort)
report = healthcheck.DatabaseReport{}
db = dbmock.NewMockStore(gomock.NewController(t))
)
defer cancel()

db.EXPECT().Ping(gomock.Any()).Return(time.Second, nil)
db.EXPECT().Ping(gomock.Any()).Return(time.Millisecond, nil)
db.EXPECT().Ping(gomock.Any()).Return(time.Second, nil)
db.EXPECT().Ping(gomock.Any()).Return(time.Millisecond, nil)
db.EXPECT().Ping(gomock.Any()).Return(time.Second, nil)

report.Run(ctx, &healthcheck.DatabaseReportOptions{DB: db, Threshold: time.Second})

assert.False(t, report.Healthy)
assert.True(t, report.Reachable)
assert.Equal(t, time.Second.String(), report.Latency)
assert.EqualValues(t, 1000, report.LatencyMS)
assert.Equal(t, time.Second.Milliseconds(), report.ThresholdMS)
assert.Nil(t, report.Error)
})
}
Loading