Skip to content

feat(coderd/healthcheck): add health check for proxy #10846

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 24 commits into from
Nov 24, 2023
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
a17ec4c
feat(coderd/healthcheck): add health check for proxy
johnstcn Nov 21, 2023
49b6e19
rm logger
johnstcn Nov 23, 2023
c7e202c
wire it up
johnstcn Nov 23, 2023
89bae7e
add severity, convert to table-driven tests
johnstcn Nov 23, 2023
bcdc08c
dbauthz
johnstcn Nov 23, 2023
340a276
make gen; make fmt
johnstcn Nov 23, 2023
d3478c8
Merge remote-tracking branch 'origin/main' into cj/workspaceproxy-hea…
johnstcn Nov 24, 2023
c746021
fix linter import shadowing complaint
johnstcn Nov 24, 2023
05d4b09
fix generated report name
johnstcn Nov 24, 2023
56be002
gen harder
johnstcn Nov 24, 2023
1112b1c
address comments
johnstcn Nov 24, 2023
fed35bd
Merge remote-tracking branch 'origin/main' into cj/workspaceproxy-hea…
johnstcn Nov 24, 2023
7f8760a
rename WorkspaceProxies -> workspace_proxies
johnstcn Nov 24, 2023
5f5c486
fixup! rename WorkspaceProxies -> workspace_proxies
johnstcn Nov 24, 2023
b9cbdd8
use severity value instead of direct comparison
johnstcn Nov 24, 2023
ae30089
more tests
johnstcn Nov 24, 2023
21f52bb
use errors.Join instead of multierror.Append
johnstcn Nov 24, 2023
79dc190
refactor to use interface instead of atomic.Pointers
johnstcn Nov 24, 2023
2530320
rm unnecessary authz
johnstcn Nov 24, 2023
f192488
fix typegen
johnstcn Nov 24, 2023
b45e9c5
Merge remote-tracking branch 'origin/main' into cj/workspaceproxy-hea…
johnstcn Nov 24, 2023
63fbafb
make typescript stop whingeing
johnstcn Nov 24, 2023
6de94bc
Merge remote-tracking branch 'origin/main' into cj/workspaceproxy-hea…
johnstcn Nov 24, 2023
b57cdeb
remove need for gosimple nolint
johnstcn Nov 24, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
add severity, convert to table-driven tests
  • Loading branch information
johnstcn committed Nov 23, 2023
commit 89bae7effe51ceeb40f36249cf2d606bb7518ac0
4 changes: 2 additions & 2 deletions coderd/coderd.go
Original file line number Diff line number Diff line change
Expand Up @@ -430,8 +430,8 @@ func New(options *Options) *API {
},
WorkspaceProxy: healthcheck.WorkspaceProxyReportOptions{
CurrentVersion: buildinfo.Version(),
FetchWorkspaceProxies: *options.FetchWorkspaceProxiesFunc.Load(),
UpdateProxyHealth: *options.UpdateProxyHealthFunc.Load(),
FetchWorkspaceProxies: options.FetchWorkspaceProxiesFunc.Load(),
UpdateProxyHealth: options.UpdateProxyHealthFunc.Load(),
},
})
}
Expand Down
3 changes: 3 additions & 0 deletions coderd/healthcheck/healthcheck.go
Original file line number Diff line number Diff line change
Expand Up @@ -195,6 +195,9 @@ func Run(ctx context.Context, opts *ReportOptions) *Report {
if report.Database.Severity.Value() > report.Severity.Value() {
report.Severity = report.Database.Severity
}
if report.WorkspaceProxy.Severity.Value() > report.Severity.Value() {
report.Severity = report.WorkspaceProxy.Severity
}
return &report
}

Expand Down
93 changes: 77 additions & 16 deletions coderd/healthcheck/workspaceproxy.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,75 +2,136 @@ package healthcheck

import (
"context"
"fmt"

"golang.org/x/xerrors"

"github.com/coder/coder/v2/buildinfo"
"github.com/coder/coder/v2/coderd/healthcheck/health"
"github.com/coder/coder/v2/coderd/util/ptr"
"github.com/coder/coder/v2/codersdk"

"github.com/hashicorp/go-multierror"
)

type WorkspaceProxyReportOptions struct {
// CurrentVersion is the current server version.
// We pass this in to make it easier to test.
CurrentVersion string
// FetchWorkspaceProxies is a function that returns the available workspace proxies.
FetchWorkspaceProxies func(context.Context) (codersdk.RegionsResponse[codersdk.WorkspaceProxy], error)
FetchWorkspaceProxies *func(context.Context) (codersdk.RegionsResponse[codersdk.WorkspaceProxy], error)
// UpdateProxyHealth is a function called when healthcheck is run.
// This would normally be ProxyHealth.ForceUpdate().
// We do this because if someone mashes the healthcheck refresh button
// they would expect up-to-date data.
UpdateProxyHealth func(context.Context) error
UpdateProxyHealth *func(context.Context) error
}

// @typescript-generate Report
type WorkspaceProxyReport struct {
Healthy bool `json:"healthy"`
Warnings []string `json:"warnings"`
Error *string `json:"error"`
Healthy bool `json:"healthy"`
Severity health.Severity `json:"severity"`
Warnings []string `json:"warnings"`
Error *string `json:"error"`

WorkspaceProxies codersdk.RegionsResponse[codersdk.WorkspaceProxy]
}

func (r *WorkspaceProxyReport) Run(ctx context.Context, opts *WorkspaceProxyReportOptions) {
r.Healthy = true
r.Severity = health.SeverityOK
r.Warnings = []string{}

if opts.FetchWorkspaceProxies == nil {
return
}
fetchWorkspaceProxiesFunc := *opts.FetchWorkspaceProxies

if opts.UpdateProxyHealth == nil {
err := "opts.UpdateProxyHealth must not be nil if opts.FetchWorkspaceProxies is not nil"
r.Error = ptr.Ref(err)
return
}
updateProxyHealthFunc := *opts.UpdateProxyHealth

if err := opts.UpdateProxyHealth(ctx); err != nil {
r.Error = ptr.Ref(err.Error())
// If this fails, just mark it as a warning. It is still updated in the background.
if err := updateProxyHealthFunc(ctx); err != nil {
r.Severity = health.SeverityWarning
r.Warnings = append(r.Warnings, xerrors.Errorf("update proxy health: %w", err).Error())
return
}

proxies, err := opts.FetchWorkspaceProxies(ctx)
proxies, err := fetchWorkspaceProxiesFunc(ctx)
if err != nil {
r.Healthy = false
r.Severity = health.SeverityError
r.Error = ptr.Ref(err.Error())
return
}

r.WorkspaceProxies = proxies

var numProxies int
var healthyProxies int
var total, healthy int
for _, proxy := range r.WorkspaceProxies.Regions {
numProxies++
total++
if proxy.Healthy {
healthyProxies++
healthy++
}

if len(proxy.Status.Report.Errors) > 0 {
for _, err := range proxy.Status.Report.Errors {
r.appendError(xerrors.New(err))
}
}
}

r.Severity = calculateSeverity(total, healthy)
r.Healthy = r.Severity != health.SeverityError

// check versions
if !buildinfo.VersionsMatch(proxy.Version, opts.CurrentVersion) {
// Versions _must_ match. Perform this check last. This will clobber any other severity.
for _, proxy := range r.WorkspaceProxies.Regions {
if vErr := checkVersion(proxy, opts.CurrentVersion); vErr != nil {
r.Healthy = false
r.Warnings = append(r.Warnings, fmt.Sprintf("Proxy %q version %q does not match primary server version %q", proxy.Name, proxy.Version, opts.CurrentVersion))
r.Severity = health.SeverityError
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not suggesting, just asking: Is this always an error? Could it be a warning instead if it's a patch version difference, for instance?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Our current behaviour won't even let workspace proxies register if they don't match major.minor, but makes no further complaints if it's just a patch version difference.

Making it warn on a patch version difference sounds like a good idea in theory, but it will mean every time you upgrade, you'll get a short period of warning that there's a patch version diff. Although that's the current behaviour as well when you do a major version upgrade, so... 🤷

I'll see about adding it in a follow-up.

r.appendError(vErr)
}
}
}

// appendError multierror-appends err onto r.Error.
// We only have one error, so multiple errors need to be squashed in there.
func (r *WorkspaceProxyReport) appendError(errs ...error) {
var prevErr error
if r.Error != nil {
prevErr = xerrors.New(*r.Error)
}
r.Error = ptr.Ref(multierror.Append(prevErr, errs...).Error())
}

func checkVersion(proxy codersdk.WorkspaceProxy, currentVersion string) error {
if proxy.Version == "" {
return nil // may have not connected yet, this is OK
}
if buildinfo.VersionsMatch(proxy.Version, currentVersion) {
return nil
}

return xerrors.Errorf("proxy %q version %q does not match primary server version %q",
proxy.Name,
proxy.Version,
currentVersion,
)
}

// calculateSeverity returns:
// health.SeverityError if all proxies are unhealthy,
// health.SeverityOK if all proxies are healthy,
// health.SeverityWarning otherwise.
func calculateSeverity(total, healthy int) health.Severity {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👍

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

TBH this could probably be moved to a separate utils file

if total == 0 || total == healthy {
return health.SeverityOK
}
if total-healthy == total {
return health.SeverityError
}
return health.SeverityWarning
}
Loading