Skip to content

feat: add computed workspace and agent health fields to the api #8280

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
Jul 10, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion cli/testdata/coder_list_--output_json.golden
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,10 @@
"ttl_ms": 28800000,
"last_used_at": "[timestamp]",
"deleting_at": null,
"locked_at": null
"locked_at": null,
"health": {
"healthy": true,
"failing_agents": []
}
}
]
49 changes: 49 additions & 0 deletions coderd/apidoc/docs.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

49 changes: 49 additions & 0 deletions coderd/apidoc/swagger.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

18 changes: 18 additions & 0 deletions coderd/workspaceagents.go
Original file line number Diff line number Diff line change
Expand Up @@ -1262,6 +1262,24 @@ func convertWorkspaceAgent(derpMap *tailcfg.DERPMap, coordinator tailnet.Coordin
workspaceAgent.ReadyAt = &dbAgent.ReadyAt.Time
}

switch {
case workspaceAgent.Status != codersdk.WorkspaceAgentConnected && workspaceAgent.LifecycleState == codersdk.WorkspaceAgentLifecycleOff:
workspaceAgent.Health.Reason = "agent is not running"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I might be not in the loop, but what is the reason for returning the human-readable reason over API? Did you consider putting this mapping in the site/UI code?

Let's say that we want the site/UI to take an action on agent is taking too long to connect, does it mean that we have to compare strings?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a valid concern. I suppose this method allows us to skip the introduction of a new enum and simply surface the reason in the UI. This is similar to how we do with API errors currently where we have a human readable reason in the HTTP response.

If we make this an enum, it needs to be kept up-to-date over UI, CLI, etc.

case workspaceAgent.Status == codersdk.WorkspaceAgentTimeout:
workspaceAgent.Health.Reason = "agent is taking too long to connect"
case workspaceAgent.Status == codersdk.WorkspaceAgentDisconnected:
workspaceAgent.Health.Reason = "agent has lost connection"
// Note: We could also handle codersdk.WorkspaceAgentLifecycleStartTimeout
// here, but it's more of a soft issue, so we don't want to mark the agent
// as unhealthy.
case workspaceAgent.LifecycleState == codersdk.WorkspaceAgentLifecycleStartError:
workspaceAgent.Health.Reason = "agent startup script exited with an error"
case workspaceAgent.LifecycleState.ShuttingDown():
workspaceAgent.Health.Reason = "agent is shutting down"
default:
workspaceAgent.Health.Healthy = true
}

return workspaceAgent, nil
}

Expand Down
3 changes: 3 additions & 0 deletions coderd/workspaceagents_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ func TestWorkspaceAgent(t *testing.T) {
require.Equal(t, tmpDir, workspace.LatestBuild.Resources[0].Agents[0].Directory)
_, err = client.WorkspaceAgent(ctx, workspace.LatestBuild.Resources[0].Agents[0].ID)
require.NoError(t, err)
require.True(t, workspace.LatestBuild.Resources[0].Agents[0].Health.Healthy)
})
t.Run("HasFallbackTroubleshootingURL", func(t *testing.T) {
t.Parallel()
Expand Down Expand Up @@ -167,6 +168,8 @@ func TestWorkspaceAgent(t *testing.T) {
}, testutil.IntervalMedium, "agent status timeout")

require.Equal(t, wantTroubleshootingURL, workspace.LatestBuild.Resources[0].Agents[0].TroubleshootingURL)
require.False(t, workspace.LatestBuild.Resources[0].Agents[0].Health.Healthy)
require.NotEmpty(t, workspace.LatestBuild.Resources[0].Agents[0].Health.Reason)
})
}

Expand Down
13 changes: 13 additions & 0 deletions coderd/workspaces.go
Original file line number Diff line number Diff line change
Expand Up @@ -1110,6 +1110,15 @@ func convertWorkspace(
lockedAt = &workspace.LockedAt.Time
}

failingAgents := []uuid.UUID{}
for _, resource := range workspaceBuild.Resources {
for _, agent := range resource.Agents {
if !agent.Health.Healthy {
failingAgents = append(failingAgents, agent.ID)
}
}
}

var (
ttlMillis = convertWorkspaceTTLMillis(workspace.Ttl)
deletingAt = calculateDeletingAt(workspace, template, workspaceBuild)
Expand All @@ -1135,6 +1144,10 @@ func convertWorkspace(
LastUsedAt: workspace.LastUsedAt,
DeletingAt: deletingAt,
LockedAt: lockedAt,
Health: codersdk.WorkspaceHealth{
Healthy: len(failingAgents) == 0,
FailingAgents: failingAgents,
},
}
}

Expand Down
142 changes: 142 additions & 0 deletions coderd/workspaces_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,148 @@ func TestWorkspace(t *testing.T) {
assert.Equal(t, templateDisplayName, ws.TemplateDisplayName)
assert.Equal(t, templateAllowUserCancelWorkspaceJobs, ws.TemplateAllowUserCancelWorkspaceJobs)
})

t.Run("Health", func(t *testing.T) {
t.Parallel()

t.Run("Healthy", func(t *testing.T) {
t.Parallel()
client := coderdtest.New(t, &coderdtest.Options{IncludeProvisionerDaemon: true})
user := coderdtest.CreateFirstUser(t, client)
version := coderdtest.CreateTemplateVersion(t, client, user.OrganizationID, &echo.Responses{
Parse: echo.ParseComplete,
ProvisionApply: []*proto.Provision_Response{{
Type: &proto.Provision_Response_Complete{
Complete: &proto.Provision_Complete{
Resources: []*proto.Resource{{
Name: "some",
Type: "example",
Agents: []*proto.Agent{{
Id: uuid.NewString(),
Auth: &proto.Agent_Token{},
}},
}},
},
},
}},
})
coderdtest.AwaitTemplateVersionJob(t, client, version.ID)
template := coderdtest.CreateTemplate(t, client, user.OrganizationID, version.ID)
workspace := coderdtest.CreateWorkspace(t, client, user.OrganizationID, template.ID)
coderdtest.AwaitWorkspaceBuildJob(t, client, workspace.LatestBuild.ID)

ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitLong)
defer cancel()

workspace, err := client.Workspace(ctx, workspace.ID)
require.NoError(t, err)

agent := workspace.LatestBuild.Resources[0].Agents[0]

assert.True(t, workspace.Health.Healthy)
assert.Equal(t, []uuid.UUID{}, workspace.Health.FailingAgents)
assert.True(t, agent.Health.Healthy)
assert.Empty(t, agent.Health.Reason)
})

t.Run("Unhealthy", func(t *testing.T) {
t.Parallel()
client := coderdtest.New(t, &coderdtest.Options{IncludeProvisionerDaemon: true})
user := coderdtest.CreateFirstUser(t, client)
version := coderdtest.CreateTemplateVersion(t, client, user.OrganizationID, &echo.Responses{
Parse: echo.ParseComplete,
ProvisionApply: []*proto.Provision_Response{{
Type: &proto.Provision_Response_Complete{
Complete: &proto.Provision_Complete{
Resources: []*proto.Resource{{
Name: "some",
Type: "example",
Agents: []*proto.Agent{{
Id: uuid.NewString(),
Auth: &proto.Agent_Token{},
ConnectionTimeoutSeconds: 1,
}},
}},
},
},
}},
})
coderdtest.AwaitTemplateVersionJob(t, client, version.ID)
template := coderdtest.CreateTemplate(t, client, user.OrganizationID, version.ID)
workspace := coderdtest.CreateWorkspace(t, client, user.OrganizationID, template.ID)
coderdtest.AwaitWorkspaceBuildJob(t, client, workspace.LatestBuild.ID)

ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitLong)
defer cancel()

var err error
testutil.Eventually(ctx, t, func(ctx context.Context) bool {
workspace, err = client.Workspace(ctx, workspace.ID)
return assert.NoError(t, err) && !workspace.Health.Healthy
}, testutil.IntervalMedium)

agent := workspace.LatestBuild.Resources[0].Agents[0]

assert.False(t, workspace.Health.Healthy)
assert.Equal(t, []uuid.UUID{agent.ID}, workspace.Health.FailingAgents)
assert.False(t, agent.Health.Healthy)
assert.NotEmpty(t, agent.Health.Reason)
})

t.Run("Mixed health", func(t *testing.T) {
t.Parallel()
client := coderdtest.New(t, &coderdtest.Options{IncludeProvisionerDaemon: true})
user := coderdtest.CreateFirstUser(t, client)
version := coderdtest.CreateTemplateVersion(t, client, user.OrganizationID, &echo.Responses{
Parse: echo.ParseComplete,
ProvisionApply: []*proto.Provision_Response{{
Type: &proto.Provision_Response_Complete{
Complete: &proto.Provision_Complete{
Resources: []*proto.Resource{{
Name: "some",
Type: "example",
Agents: []*proto.Agent{{
Id: uuid.NewString(),
Name: "a1",
Auth: &proto.Agent_Token{},
}, {
Id: uuid.NewString(),
Name: "a2",
Auth: &proto.Agent_Token{},
ConnectionTimeoutSeconds: 1,
}},
}},
},
},
}},
})
coderdtest.AwaitTemplateVersionJob(t, client, version.ID)
template := coderdtest.CreateTemplate(t, client, user.OrganizationID, version.ID)
workspace := coderdtest.CreateWorkspace(t, client, user.OrganizationID, template.ID)
coderdtest.AwaitWorkspaceBuildJob(t, client, workspace.LatestBuild.ID)

ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitLong)
defer cancel()

var err error
testutil.Eventually(ctx, t, func(ctx context.Context) bool {
workspace, err = client.Workspace(ctx, workspace.ID)
return assert.NoError(t, err) && !workspace.Health.Healthy
}, testutil.IntervalMedium)

assert.False(t, workspace.Health.Healthy)
assert.Len(t, workspace.Health.FailingAgents, 1)

agent1 := workspace.LatestBuild.Resources[0].Agents[0]
agent2 := workspace.LatestBuild.Resources[0].Agents[1]

assert.Equal(t, []uuid.UUID{agent2.ID}, workspace.Health.FailingAgents)
assert.True(t, agent1.Health.Healthy)
assert.Empty(t, agent1.Health.Reason)
assert.False(t, agent2.Health.Healthy)
assert.NotEmpty(t, agent2.Health.Reason)
})
})
}

func TestAdminViewAllWorkspaces(t *testing.T) {
Expand Down
Loading