Skip to content

feat: add computed workspace and agent health fields to the api #8280

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
Jul 10, 2023
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion cli/testdata/coder_list_--output_json.golden
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,11 @@
"ttl_ms": 28800000,
"last_used_at": "[timestamp]",
"deleting_at": null,
"locked_at": null
"locked_at": null,
"health": {
"healthy": true,
"failing_sections": [],
"agents": {}
}
}
]
52 changes: 52 additions & 0 deletions coderd/apidoc/docs.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

52 changes: 52 additions & 0 deletions coderd/apidoc/swagger.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

18 changes: 18 additions & 0 deletions coderd/workspaceagents.go
Original file line number Diff line number Diff line change
Expand Up @@ -1262,6 +1262,24 @@ func convertWorkspaceAgent(derpMap *tailcfg.DERPMap, coordinator tailnet.Coordin
workspaceAgent.ReadyAt = &dbAgent.ReadyAt.Time
}

switch {
case workspaceAgent.Status != codersdk.WorkspaceAgentConnected && workspaceAgent.LifecycleState == codersdk.WorkspaceAgentLifecycleOff:
workspaceAgent.Health.Reason = "agent is not running"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I might not be in the loop, but what is the reason for returning the human-readable reason over the API? Did you consider putting this mapping in the site/UI code?

Let's say that we want the site/UI to take an action when the agent is taking too long to connect; does that mean that we have to compare strings?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a valid concern. I suppose this method allows us to skip the introduction of a new enum and simply surface the reason in the UI. This is similar to what we do with API errors currently, where we have a human-readable reason in the HTTP response.

If we make this an enum, it needs to be kept up-to-date across the UI, CLI, etc.

case workspaceAgent.Status == codersdk.WorkspaceAgentTimeout:
workspaceAgent.Health.Reason = "agent is taking too long to connect"
case workspaceAgent.Status == codersdk.WorkspaceAgentDisconnected:
workspaceAgent.Health.Reason = "agent has lost connection"
// Note: We could also handle codersdk.WorkspaceAgentLifecycleStartTimeout
// here, but it's more of a soft issue, so we don't want to mark the agent
// as unhealthy.
case workspaceAgent.LifecycleState == codersdk.WorkspaceAgentLifecycleStartError:
workspaceAgent.Health.Reason = "agent startup script exited with an error"
case workspaceAgent.LifecycleState.ShuttingDown():
workspaceAgent.Health.Reason = "agent is shutting down"
default:
workspaceAgent.Health.Healthy = true
}

return workspaceAgent, nil
}

Expand Down
3 changes: 3 additions & 0 deletions coderd/workspaceagents_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ func TestWorkspaceAgent(t *testing.T) {
require.Equal(t, tmpDir, workspace.LatestBuild.Resources[0].Agents[0].Directory)
_, err = client.WorkspaceAgent(ctx, workspace.LatestBuild.Resources[0].Agents[0].ID)
require.NoError(t, err)
require.True(t, workspace.LatestBuild.Resources[0].Agents[0].Health.Healthy)
})
t.Run("HasFallbackTroubleshootingURL", func(t *testing.T) {
t.Parallel()
Expand Down Expand Up @@ -167,6 +168,8 @@ func TestWorkspaceAgent(t *testing.T) {
}, testutil.IntervalMedium, "agent status timeout")

require.Equal(t, wantTroubleshootingURL, workspace.LatestBuild.Resources[0].Agents[0].TroubleshootingURL)
require.False(t, workspace.LatestBuild.Resources[0].Agents[0].Health.Healthy)
require.NotEmpty(t, workspace.LatestBuild.Resources[0].Agents[0].Health.Reason)
})
}

Expand Down
11 changes: 11 additions & 0 deletions coderd/workspaces.go
Original file line number Diff line number Diff line change
Expand Up @@ -1115,6 +1115,16 @@ func convertWorkspace(
deletingAt = calculateDeletingAt(workspace, template, workspaceBuild)
)

agentHealth := make(map[uuid.UUID]codersdk.WorkspaceAgentHealth)
for _, r := range workspaceBuild.Resources {
// For now, we only consider agent healths when the workspace is running.
if r.Transition == codersdk.WorkspaceTransitionStart {
for _, a := range r.Agents {
agentHealth[a.ID] = a.Health
}
}
}

return codersdk.Workspace{
ID: workspace.ID,
CreatedAt: workspace.CreatedAt,
Expand All @@ -1135,6 +1145,7 @@ func convertWorkspace(
LastUsedAt: workspace.LastUsedAt,
DeletingAt: deletingAt,
LockedAt: lockedAt,
Health: (codersdk.WorkspaceHealth{Agents: agentHealth}).Complete(),
}
}

Expand Down
1 change: 1 addition & 0 deletions coderd/workspaces_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ func TestWorkspace(t *testing.T) {
require.NoError(t, err)
require.Equal(t, user.UserID, ws.LatestBuild.InitiatorID)
require.Equal(t, codersdk.BuildReasonInitiator, ws.LatestBuild.Reason)
require.True(t, ws.Health.Healthy)
})

t.Run("Deleted", func(t *testing.T) {
Expand Down
14 changes: 10 additions & 4 deletions codersdk/workspaceagents.go
Original file line number Diff line number Diff line change
Expand Up @@ -164,10 +164,16 @@ type WorkspaceAgent struct {
ConnectionTimeoutSeconds int32 `json:"connection_timeout_seconds"`
TroubleshootingURL string `json:"troubleshooting_url"`
// Deprecated: Use StartupScriptBehavior instead.
LoginBeforeReady bool `json:"login_before_ready"`
ShutdownScript string `json:"shutdown_script,omitempty"`
ShutdownScriptTimeoutSeconds int32 `json:"shutdown_script_timeout_seconds"`
Subsystem AgentSubsystem `json:"subsystem"`
LoginBeforeReady bool `json:"login_before_ready"`
ShutdownScript string `json:"shutdown_script,omitempty"`
ShutdownScriptTimeoutSeconds int32 `json:"shutdown_script_timeout_seconds"`
Subsystem AgentSubsystem `json:"subsystem"`
Health WorkspaceAgentHealth `json:"health"` // Health reports the health of the agent.
}

// WorkspaceAgentHealth describes whether a single workspace agent is healthy
// and, when it is not, why. It is serialized as the "health" object of a
// WorkspaceAgent.
//
// The zero value represents an unhealthy agent with no reason given.
type WorkspaceAgentHealth struct {
Healthy bool `json:"healthy"` // Healthy is true if the agent is healthy.
Reason string `json:"reason,omitempty"` // Reason is a human-readable explanation of the agent's health. It is empty if Healthy is true.
}

type DERPRegion struct {
Expand Down
24 changes: 24 additions & 0 deletions codersdk/workspaces.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,30 @@ type Workspace struct {
// unlocked by an admin. It is subject to deletion if it breaches
// the duration of the locked_ttl field on its template.
LockedAt *time.Time `json:"locked_at" format:"date-time"`
// Health reports the health of the workspace and its agents.
Health WorkspaceHealth `json:"health"`
}

// WorkspaceHealth summarizes the health of a workspace: an overall flag, the
// list of sections that are currently failing, and the health of each agent
// keyed by agent ID.
//
// Healthy and FailingSections are derived values; the Complete method fills
// them in from Agents.
type WorkspaceHealth struct {
Healthy bool `json:"healthy"` // Healthy is true if the workspace and all of its agents are healthy.
FailingSections []string `json:"failing_sections"` // FailingSections is a list of sections that have failed their healthcheck.
Agents map[uuid.UUID]WorkspaceAgentHealth `json:"agents"` // Agents is a map of agent IDs to their health.
}

// Complete returns a copy of wh with Healthy and FailingSections derived
// from the per-agent entries in wh.Agents. Any values previously stored in
// those two fields are discarded. The Agents map is not cloned; the returned
// value refers to the same map as the receiver.
//
// Healthy is true only if every agent in wh.Agents is healthy, which includes
// the case where there are no agents at all. FailingSections holds one
// "agents.<id>" entry per unhealthy agent in ascending string order, and is
// always a non-nil slice so that it encodes as an empty JSON array rather
// than null.
//
//nolint:revive
func (wh WorkspaceHealth) Complete() WorkspaceHealth {
	wh.Healthy = true
	wh.FailingSections = []string{}
	for id, agent := range wh.Agents {
		if agent.Healthy {
			continue
		}
		wh.Healthy = false

		// Map iteration order is unspecified, so move each new entry to its
		// sorted position; this keeps the result identical between calls
		// for the same set of agents.
		section := "agents." + id.String()
		wh.FailingSections = append(wh.FailingSections, section)
		for i := len(wh.FailingSections) - 1; i > 0 && wh.FailingSections[i-1] > section; i-- {
			wh.FailingSections[i], wh.FailingSections[i-1] = wh.FailingSections[i-1], wh.FailingSections[i]
		}
	}
	return wh
}

type WorkspacesRequest struct {
Expand Down
4 changes: 4 additions & 0 deletions docs/api/agents.md
Original file line number Diff line number Diff line change
Expand Up @@ -450,6 +450,10 @@ curl -X GET http://coder-server:8080/api/v2/workspaceagents/{workspaceagent} \
},
"expanded_directory": "string",
"first_connected_at": "2019-08-24T14:15:22Z",
"health": {
"healthy": true,
"reason": "string"
},
"id": "497f6eca-6276-4993-bfeb-53cbbbba6f08",
"instance_id": "string",
"last_connected_at": "2019-08-24T14:15:22Z",
Expand Down
Loading