Skip to content

Commit b32d79e

Browse files
authored
fix: fix failed workspaces continuously auto-deleting (#10069)
- Fixes an issue where workspaces that are eligible for auto-deletion are retried every tick (1 minute) even if the previous deletion transition failed. The updated logic only attempts to delete workspaces that previously failed once a day (24 hours since last attempt).
1 parent 9126567 commit b32d79e

File tree

3 files changed

+118
-8
lines changed

3 files changed

+118
-8
lines changed

coderd/autobuild/lifecycle_executor.go

Lines changed: 15 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,6 @@ import (
2424
"github.com/coder/coder/v2/coderd/schedule"
2525
"github.com/coder/coder/v2/coderd/schedule/cron"
2626
"github.com/coder/coder/v2/coderd/wsbuilder"
27-
"github.com/coder/coder/v2/codersdk"
2827
)
2928

3029
// Executor automatically starts or stops workspaces.
@@ -310,7 +309,7 @@ func getNextTransition(
310309
// make it dormant.
311310
return "", database.BuildReasonAutolock, nil
312311

313-
case isEligibleForDelete(ws, templateSchedule, currentTick):
312+
case isEligibleForDelete(ws, templateSchedule, latestBuild, latestJob, currentTick):
314313
return database.WorkspaceTransitionDelete, database.BuildReasonAutodelete, nil
315314
default:
316315
return "", "", xerrors.Errorf("last transition not valid for autostart or autostop")
@@ -320,7 +319,7 @@ func getNextTransition(
320319
// isEligibleForAutostart returns true if the workspace should be autostarted.
321320
func isEligibleForAutostart(ws database.Workspace, build database.WorkspaceBuild, job database.ProvisionerJob, templateSchedule schedule.TemplateScheduleOptions, currentTick time.Time) bool {
322321
// Don't attempt to autostart failed workspaces.
323-
if codersdk.ProvisionerJobStatus(job.JobStatus) == codersdk.ProvisionerJobFailed {
322+
if job.JobStatus == database.ProvisionerJobStatusFailed {
324323
return false
325324
}
326325

@@ -354,7 +353,7 @@ func isEligibleForAutostart(ws database.Workspace, build database.WorkspaceBuild
354353

355354
// isEligibleForAutostart returns true if the workspace should be autostopped.
356355
func isEligibleForAutostop(ws database.Workspace, build database.WorkspaceBuild, job database.ProvisionerJob, currentTick time.Time) bool {
357-
if codersdk.ProvisionerJobStatus(job.JobStatus) == codersdk.ProvisionerJobFailed {
356+
if job.JobStatus == database.ProvisionerJobStatusFailed {
358357
return false
359358
}
360359

@@ -381,13 +380,21 @@ func isEligibleForDormantStop(ws database.Workspace, templateSchedule schedule.T
381380
currentTick.Sub(ws.LastUsedAt) > templateSchedule.TimeTilDormant
382381
}
383382

384-
func isEligibleForDelete(ws database.Workspace, templateSchedule schedule.TemplateScheduleOptions, currentTick time.Time) bool {
385-
// Only attempt to delete dormant workspaces.
386-
return ws.DormantAt.Valid && ws.DeletingAt.Valid &&
383+
func isEligibleForDelete(ws database.Workspace, templateSchedule schedule.TemplateScheduleOptions, lastBuild database.WorkspaceBuild, lastJob database.ProvisionerJob, currentTick time.Time) bool {
384+
eligible := ws.DormantAt.Valid && ws.DeletingAt.Valid &&
387385
// Dormant workspaces should only be deleted if a time_til_dormant_autodelete value is specified.
388386
templateSchedule.TimeTilDormantAutoDelete > 0 &&
389387
// The workspace must breach the time_til_dormant_autodelete value.
390388
currentTick.After(ws.DeletingAt.Time)
389+
390+
// If the last delete job failed we should wait 24 hours before trying again.
391+
// Builds are resource-intensive so retrying every minute is not productive
392+
// and will hold compute hostage.
393+
if lastBuild.Transition == database.WorkspaceTransitionDelete && lastJob.JobStatus == database.ProvisionerJobStatusFailed {
394+
return eligible && lastJob.Finished() && currentTick.Sub(lastJob.FinishedAt()) > time.Hour*24
395+
}
396+
397+
return eligible
391398
}
392399

393400
// isEligibleForFailedStop returns true if the workspace is eligible to be stopped
@@ -396,7 +403,7 @@ func isEligibleForFailedStop(build database.WorkspaceBuild, job database.Provisi
396403
// If the template has specified a failure TLL.
397404
return templateSchedule.FailureTTL > 0 &&
398405
// And the job resulted in failure.
399-
codersdk.ProvisionerJobStatus(job.JobStatus) == codersdk.ProvisionerJobFailed &&
406+
job.JobStatus == database.ProvisionerJobStatusFailed &&
400407
build.Transition == database.WorkspaceTransitionStart &&
401408
// And sufficient time has elapsed since the job has completed.
402409
job.CompletedAt.Valid &&

coderd/database/modelmethods.go

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -367,3 +367,19 @@ func ConvertWorkspaceRows(rows []GetWorkspacesRow) []Workspace {
367367
func (g Group) IsEveryone() bool {
368368
return g.ID == g.OrganizationID
369369
}
370+
371+
func (p ProvisionerJob) Finished() bool {
372+
return p.CanceledAt.Valid || p.CompletedAt.Valid
373+
}
374+
375+
func (p ProvisionerJob) FinishedAt() time.Time {
376+
if p.CompletedAt.Valid {
377+
return p.CompletedAt.Time
378+
}
379+
380+
if p.CanceledAt.Valid {
381+
return p.CanceledAt.Time
382+
}
383+
384+
return time.Time{}
385+
}

enterprise/coderd/workspaces_test.go

Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -649,6 +649,93 @@ func TestWorkspaceAutobuild(t *testing.T) {
649649
stats = <-statsCh
650650
require.Len(t, stats.Transitions, 0)
651651
})
652+
653+
// Test that failing to auto-delete a workspace will only retry
654+
// once a day.
655+
t.Run("FailedDeleteRetryDaily", func(t *testing.T) {
656+
t.Parallel()
657+
658+
var (
659+
ticker = make(chan time.Time)
660+
statCh = make(chan autobuild.Stats)
661+
transitionTTL = time.Minute
662+
ctx = testutil.Context(t, testutil.WaitMedium)
663+
)
664+
665+
client, user := coderdenttest.New(t, &coderdenttest.Options{
666+
Options: &coderdtest.Options{
667+
AutobuildTicker: ticker,
668+
IncludeProvisionerDaemon: true,
669+
AutobuildStats: statCh,
670+
TemplateScheduleStore: schedule.NewEnterpriseTemplateScheduleStore(agplUserQuietHoursScheduleStore()),
671+
},
672+
LicenseOptions: &coderdenttest.LicenseOptions{
673+
Features: license.Features{codersdk.FeatureAdvancedTemplateScheduling: 1},
674+
},
675+
})
676+
677+
// Create a template version that passes to get a functioning workspace.
678+
version := coderdtest.CreateTemplateVersion(t, client, user.OrganizationID, &echo.Responses{
679+
Parse: echo.ParseComplete,
680+
ProvisionPlan: echo.PlanComplete,
681+
ProvisionApply: echo.ApplyComplete,
682+
})
683+
coderdtest.AwaitTemplateVersionJobCompleted(t, client, version.ID)
684+
685+
template := coderdtest.CreateTemplate(t, client, user.OrganizationID, version.ID)
686+
687+
ws := coderdtest.CreateWorkspace(t, client, user.OrganizationID, template.ID)
688+
coderdtest.AwaitWorkspaceBuildJobCompleted(t, client, ws.LatestBuild.ID)
689+
690+
// Create a new version that will fail when we try to delete a workspace.
691+
version = coderdtest.CreateTemplateVersion(t, client, user.OrganizationID, &echo.Responses{
692+
Parse: echo.ParseComplete,
693+
ProvisionPlan: echo.PlanComplete,
694+
ProvisionApply: echo.ApplyFailed,
695+
}, func(ctvr *codersdk.CreateTemplateVersionRequest) {
696+
ctvr.TemplateID = template.ID
697+
})
698+
coderdtest.AwaitTemplateVersionJobCompleted(t, client, version.ID)
699+
700+
// Try to delete the workspace. This simulates a "failed" autodelete.
701+
build, err := client.CreateWorkspaceBuild(ctx, ws.ID, codersdk.CreateWorkspaceBuildRequest{
702+
Transition: codersdk.WorkspaceTransitionDelete,
703+
TemplateVersionID: version.ID,
704+
})
705+
require.NoError(t, err)
706+
707+
build = coderdtest.AwaitWorkspaceBuildJobCompleted(t, client, build.ID)
708+
require.NotEmpty(t, build.Job.Error)
709+
710+
// Update our workspace to be dormant so that it qualifies for auto-deletion.
711+
err = client.UpdateWorkspaceDormancy(ctx, ws.ID, codersdk.UpdateWorkspaceDormancy{
712+
Dormant: true,
713+
})
714+
require.NoError(t, err)
715+
716+
// Enable auto-deletion for the template.
717+
_, err = client.UpdateTemplateMeta(ctx, template.ID, codersdk.UpdateTemplateMeta{
718+
TimeTilDormantAutoDeleteMillis: transitionTTL.Milliseconds(),
719+
})
720+
require.NoError(t, err)
721+
722+
ws = coderdtest.MustWorkspace(t, client, ws.ID)
723+
require.NotNil(t, ws.DeletingAt)
724+
725+
// Simulate ticking an hour after the workspace is expected to be deleted.
726+
// Under normal circumstances this should result in a transition but
727+
// since our last build resulted in failure it should be skipped.
728+
ticker <- build.Job.CompletedAt.Add(time.Hour)
729+
stats := <-statCh
730+
require.Len(t, stats.Transitions, 0)
731+
732+
// Simulate ticking a day after the workspace was last attempted to
733+
// be deleted. This should result in an attempt.
734+
ticker <- build.Job.CompletedAt.Add(time.Hour * 25)
735+
stats = <-statCh
736+
require.Len(t, stats.Transitions, 1)
737+
require.Equal(t, database.WorkspaceTransitionDelete, stats.Transitions[ws.ID])
738+
})
652739
}
653740

654741
func TestWorkspacesFiltering(t *testing.T) {

0 commit comments

Comments
 (0)