Skip to content

chore: add logging around agent app health reporting #12071

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Feb 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 16 additions & 1 deletion agent/apphealth.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,12 @@ type WorkspaceAppHealthReporter func(ctx context.Context)

// NewWorkspaceAppHealthReporter creates a WorkspaceAppHealthReporter that reports app health to coderd.
func NewWorkspaceAppHealthReporter(logger slog.Logger, apps []codersdk.WorkspaceApp, postWorkspaceAgentAppHealth PostWorkspaceAgentAppHealth) WorkspaceAppHealthReporter {
logger = logger.Named("apphealth")

runHealthcheckLoop := func(ctx context.Context) error {
ctx, cancel := context.WithCancel(ctx)
defer cancel()

// no need to run this loop if no apps for this workspace.
if len(apps) == 0 {
return nil
Expand Down Expand Up @@ -87,6 +92,7 @@ func NewWorkspaceAppHealthReporter(logger slog.Logger, apps []codersdk.Workspace
return nil
}()
if err != nil {
nowUnhealthy := false
mu.Lock()
if failures[app.ID] < int(app.Healthcheck.Threshold) {
// increment the failure count and keep status the same.
Expand All @@ -96,14 +102,21 @@ func NewWorkspaceAppHealthReporter(logger slog.Logger, apps []codersdk.Workspace
// set to unhealthy if we hit the failure threshold.
// we stop incrementing at the threshold to prevent the failure value from increasing forever.
health[app.ID] = codersdk.WorkspaceAppHealthUnhealthy
nowUnhealthy = true
}
mu.Unlock()
logger.Debug(ctx, "error checking app health",
slog.F("id", app.ID.String()),
slog.F("slug", app.Slug),
slog.F("now_unhealthy", nowUnhealthy), slog.Error(err),
)
} else {
mu.Lock()
// we only need one successful health check to be considered healthy.
health[app.ID] = codersdk.WorkspaceAppHealthHealthy
failures[app.ID] = 0
mu.Unlock()
logger.Debug(ctx, "workspace app healthy", slog.F("id", app.ID.String()), slog.F("slug", app.Slug))
}

t.Reset(time.Duration(app.Healthcheck.Interval) * time.Second)
Expand Down Expand Up @@ -137,7 +150,9 @@ func NewWorkspaceAppHealthReporter(logger slog.Logger, apps []codersdk.Workspace
Healths: lastHealth,
})
if err != nil {
logger.Error(ctx, "failed to report workspace app stat", slog.Error(err))
logger.Error(ctx, "failed to report workspace app health", slog.Error(err))
} else {
logger.Debug(ctx, "sent workspace app health", slog.F("health", lastHealth))
}
}
}
Expand Down
5 changes: 5 additions & 0 deletions coderd/agentapi/apps.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,11 @@ func (a *AppsAPI) BatchUpdateAppHealths(ctx context.Context, req *agentproto.Bat
return nil, err
}

a.Log.Debug(ctx, "got batch app health update",
slog.F("agent_id", workspaceAgent.ID.String()),
slog.F("updates", req.Updates),
)

if len(req.Updates) == 0 {
return &agentproto.BatchUpdateAppHealthResponse{}, nil
}
Expand Down