Skip to content

Commit d622a4d

Browse files
committed
feat: add support for stopping failed workspaces
1 parent 868e553 commit d622a4d

File tree

7 files changed

+74
-47
lines changed

7 files changed

+74
-47
lines changed

coderd/autobuild/lifecycle_executor.go

Lines changed: 40 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -13,9 +13,11 @@ import (
1313

1414
"cdr.dev/slog"
1515
"github.com/coder/coder/coderd/database"
16+
"github.com/coder/coder/coderd/database/db2sdk"
1617
"github.com/coder/coder/coderd/database/dbauthz"
1718
"github.com/coder/coder/coderd/schedule"
1819
"github.com/coder/coder/coderd/wsbuilder"
20+
"github.com/coder/coder/codersdk"
1921
)
2022

2123
// Executor automatically starts or stops workspaces.
@@ -108,7 +110,7 @@ func (e *Executor) runOnce(t time.Time) Stats {
108110
// NOTE: If a workspace build is created with a given TTL and then the user either
109111
// changes or unsets the TTL, the deadline for the workspace build will not
110112
// have changed. This behavior is as expected per #2229.
111-
workspaces, err := e.db.GetWorkspacesEligibleForAutoStartStop(e.ctx, t)
113+
workspaces, err := e.db.GetWorkspacesEligibleForTransition(e.ctx, t)
112114
if err != nil {
113115
e.log.Error(e.ctx, "get workspaces for autostart or autostop", slog.Error(err))
114116
return stats
@@ -198,24 +200,6 @@ func (e *Executor) runOnce(t time.Time) Stats {
198200
return stats
199201
}
200202

201-
// isEligibleForTransition returns true if the workspace meets basic criteria
202-
// for transitioning to a new state.
203-
func isEligibleForTransition(ws database.Workspace, latestBuild database.WorkspaceBuild, templateSchedule schedule.TemplateScheduleOptions) bool {
204-
if ws.Deleted {
205-
return false
206-
}
207-
if templateSchedule.UserAutostartEnabled && ws.AutostartSchedule.Valid && ws.AutostartSchedule.String != "" {
208-
return true
209-
}
210-
// Don't check the template schedule to see whether it allows autostop, this
211-
// is done during the build when determining the deadline.
212-
if latestBuild.Transition == database.WorkspaceTransitionStart && !latestBuild.Deadline.IsZero() {
213-
return true
214-
}
215-
216-
return false
217-
}
218-
219203
func getNextTransition(
220204
ws database.Workspace,
221205
latestBuild database.WorkspaceBuild,
@@ -227,32 +211,37 @@ func getNextTransition(
227211
database.BuildReason,
228212
error,
229213
) {
230-
if !isEligibleForTransition(ws, latestBuild, templateSchedule) {
231-
return "", "", xerrors.Errorf("workspace ineligible for transition")
232-
}
233-
234-
if !latestJob.CompletedAt.Valid || latestJob.Error.String != "" {
235-
return "", "", xerrors.Errorf("last workspace build did not complete successfully")
236-
}
237-
238214
switch {
239-
case isEligibleForAutostop(latestBuild, currentTick):
215+
case isEligibleForAutostop(latestBuild, latestJob, currentTick):
240216
return database.WorkspaceTransitionStop, database.BuildReasonAutostop, nil
241-
case isEligibleForAutostart(ws, latestBuild, currentTick):
217+
case isEligibleForAutostart(ws, latestBuild, latestJob, templateSchedule, currentTick):
242218
return database.WorkspaceTransitionStart, database.BuildReasonAutostart, nil
219+
case isEligibleForFailedStop(latestJob, templateSchedule):
220+
return database.WorkspaceTransitionStop, database.BuildReasonAutostop, nil
243221
default:
244222
return "", "", xerrors.Errorf("last transition not valid for autostart or autostop")
245223
}
246224
}
247225

248226
// isEligibleForAutostart returns true if the workspace should be autostarted.
249-
func isEligibleForAutostart(ws database.Workspace, build database.WorkspaceBuild, currentTick time.Time) bool {
227+
func isEligibleForAutostart(ws database.Workspace, build database.WorkspaceBuild, job database.ProvisionerJob, templateSchedule schedule.TemplateScheduleOptions, currentTick time.Time) bool {
228+
// Don't attempt to autostart failed workspaces.
229+
if !job.CompletedAt.Valid || job.Error.String != "" {
230+
return false
231+
}
232+
250233
// If the last transition for the workspace was not 'stop' then the workspace
251234
// cannot be started.
252235
if build.Transition != database.WorkspaceTransitionStop {
253236
return false
254237
}
255238

239+
// If autostart isn't enabled, or the schedule isn't valid/populated we can't
240+
// autostart the workspace.
241+
if !templateSchedule.UserAutostartEnabled || !ws.AutostartSchedule.Valid || ws.AutostartSchedule.String == "" {
242+
return false
243+
}
244+
256245
sched, err := schedule.Weekly(ws.AutostartSchedule.String)
257246
if err != nil {
258247
return false
@@ -265,10 +254,30 @@ func isEligibleForAutostart(ws database.Workspace, build database.WorkspaceBuild
265254
}
266255

267256
// isEligibleForAutostop returns true if the workspace should be autostopped.
268-
func isEligibleForAutostop(build database.WorkspaceBuild, currentTick time.Time) bool {
257+
func isEligibleForAutostop(build database.WorkspaceBuild, job database.ProvisionerJob, currentTick time.Time) bool {
258+
// Don't attempt to autostop failed workspaces.
259+
if !job.CompletedAt.Valid || job.Error.String != "" {
260+
return false
261+
}
262+
269263
// A workspace must be started in order for it to be auto-stopped.
270264
return build.Transition == database.WorkspaceTransitionStart &&
271265
!build.Deadline.IsZero() &&
272266
// We do not want to stop a workspace prior to it breaching its deadline.
273267
!currentTick.Before(build.Deadline)
274268
}
269+
270+
// isEligibleForFailedStop returns true if the workspace is eligible to be stopped
271+
// due to a failed build.
272+
func isEligibleForFailedStop(job database.ProvisionerJob, templateSchedule schedule.TemplateScheduleOptions) bool {
273+
// If the template has specified a failure TTL.
274+
return templateSchedule.FailureTTL > 0 &&
275+
// And the job resulted in failure.
276+
db2sdk.ProvisionerJobStatus(job) == codersdk.ProvisionerJobFailed &&
277+
// And sufficient time has elapsed since the job has completed.
278+
(job.CompletedAt.Valid && database.Now().Sub(job.CompletedAt.Time) > templateSchedule.FailureTTL ||
279+
// Or sufficient time has elapsed since the job was canceled.
280+
job.CanceledAt.Valid && database.Now().Sub(job.CanceledAt.Time) > templateSchedule.FailureTTL ||
281+
// Or the job is stuck/abandoned.
282+
database.Now().Sub(job.UpdatedAt) > 30*time.Second)
283+
}

coderd/database/dbfake/dbfake.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3373,7 +3373,7 @@ func (q *fakeQuerier) GetWorkspaces(ctx context.Context, arg database.GetWorkspa
33733373
return workspaceRows, err
33743374
}
33753375

3376-
func (q *fakeQuerier) GetWorkspacesEligibleForAutoStartStop(ctx context.Context, now time.Time) ([]database.Workspace, error) {
3376+
func (q *fakeQuerier) GetWorkspacesEligibleForTransition(ctx context.Context, now time.Time) ([]database.Workspace, error) {
33773377
q.mutex.RLock()
33783378
defer q.mutex.RUnlock()
33793379

coderd/database/dbmetrics/dbmetrics.go

Lines changed: 2 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

coderd/database/dbmock/dbmock.go

Lines changed: 6 additions & 6 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

coderd/database/querier.go

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

coderd/database/queries.sql.go

Lines changed: 13 additions & 4 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

coderd/database/queries/workspaces.sql

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -405,13 +405,15 @@ SELECT
405405
stopped_workspaces.count AS stopped_workspaces
406406
FROM pending_workspaces, building_workspaces, running_workspaces, failed_workspaces, stopped_workspaces;
407407

408-
-- name: GetWorkspacesEligibleForAutoStartStop :many
408+
-- name: GetWorkspacesEligibleForTransition :many
409409
SELECT
410410
workspaces.*
411411
FROM
412412
workspaces
413413
LEFT JOIN
414414
workspace_builds ON workspace_builds.workspace_id = workspaces.id
415+
INNER JOIN
416+
provisioner_jobs ON workspace_builds.job_id = provisioner_jobs.id
415417
WHERE
416418
workspace_builds.build_number = (
417419
SELECT
@@ -441,5 +443,12 @@ WHERE
441443
(
442444
workspace_builds.transition = 'stop'::workspace_transition AND
443445
workspaces.autostart_schedule IS NOT NULL
446+
) OR
447+
448+
-- If the workspace's most recent job resulted in an error
449+
-- it may be eligible for failed stop.
450+
(
451+
provisioner_jobs.error IS NOT NULL AND
452+
provisioner_jobs.error != ''
444453
)
445-
);
454+
) AND workspaces.deleted = 'false';

0 commit comments

Comments
 (0)