Skip to content

Commit 1b0124e

Browse files
authored
feat: automatically stop workspaces based on failure_ttl (#7989)
1 parent d434181 commit 1b0124e

File tree

17 files changed

+419
-159
lines changed

17 files changed

+419
-159
lines changed

cli/server.go

+2-2
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@ import (
6262
"github.com/coder/coder/cli/cliui"
6363
"github.com/coder/coder/cli/config"
6464
"github.com/coder/coder/coderd"
65-
"github.com/coder/coder/coderd/autobuild/executor"
65+
"github.com/coder/coder/coderd/autobuild"
6666
"github.com/coder/coder/coderd/database"
6767
"github.com/coder/coder/coderd/database/dbfake"
6868
"github.com/coder/coder/coderd/database/dbmetrics"
@@ -900,7 +900,7 @@ func (r *RootCmd) Server(newAPI func(context.Context, *coderd.Options) (*coderd.
900900

901901
autobuildPoller := time.NewTicker(cfg.AutobuildPollInterval.Value())
902902
defer autobuildPoller.Stop()
903-
autobuildExecutor := executor.New(ctx, options.Database, coderAPI.TemplateScheduleStore, logger, autobuildPoller.C)
903+
autobuildExecutor := autobuild.NewExecutor(ctx, options.Database, coderAPI.TemplateScheduleStore, logger, autobuildPoller.C)
904904
autobuildExecutor.Run()
905905

906906
// Currently there is no way to ask the server to shut

coderd/autobuild/doc.go

+3
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
// Package autobuild contains logic for scheduling workspace
2+
// builds in the background.
3+
package autobuild

coderd/autobuild/executor/lifecycle_executor.go renamed to coderd/autobuild/lifecycle_executor.go

+89-80
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
package executor
1+
package autobuild
22

33
import (
44
"context"
@@ -13,9 +13,11 @@ import (
1313

1414
"cdr.dev/slog"
1515
"github.com/coder/coder/coderd/database"
16+
"github.com/coder/coder/coderd/database/db2sdk"
1617
"github.com/coder/coder/coderd/database/dbauthz"
1718
"github.com/coder/coder/coderd/schedule"
1819
"github.com/coder/coder/coderd/wsbuilder"
20+
"github.com/coder/coder/codersdk"
1921
)
2022

2123
// Executor automatically starts or stops workspaces.
@@ -35,8 +37,8 @@ type Stats struct {
3537
Error error
3638
}
3739

38-
// New returns a new autobuild executor.
39-
func New(ctx context.Context, db database.Store, tss *atomic.Pointer[schedule.TemplateScheduleStore], log slog.Logger, tick <-chan time.Time) *Executor {
40+
// NewExecutor returns a new autobuild executor.
41+
func NewExecutor(ctx context.Context, db database.Store, tss *atomic.Pointer[schedule.TemplateScheduleStore], log slog.Logger, tick <-chan time.Time) *Executor {
4042
le := &Executor{
4143
//nolint:gocritic // Autostart has a limited set of permissions.
4244
ctx: dbauthz.AsAutostart(ctx),
@@ -108,7 +110,7 @@ func (e *Executor) runOnce(t time.Time) Stats {
108110
// NOTE: If a workspace build is created with a given TTL and then the user either
109111
// changes or unsets the TTL, the deadline for the workspace build will not
110112
// have changed. This behavior is as expected per #2229.
111-
workspaces, err := e.db.GetWorkspacesEligibleForAutoStartStop(e.ctx, t)
113+
workspaces, err := e.db.GetWorkspacesEligibleForTransition(e.ctx, t)
112114
if err != nil {
113115
e.log.Error(e.ctx, "get workspaces for autostart or autostop", slog.Error(err))
114116
return stats
@@ -125,77 +127,56 @@ func (e *Executor) runOnce(t time.Time) Stats {
125127
log := e.log.With(slog.F("workspace_id", wsID))
126128

127129
eg.Go(func() error {
128-
err := e.db.InTx(func(db database.Store) error {
130+
err := e.db.InTx(func(tx database.Store) error {
129131
// Re-check eligibility since the first check was outside the
130132
// transaction and the workspace settings may have changed.
131-
ws, err := db.GetWorkspaceByID(e.ctx, wsID)
133+
ws, err := tx.GetWorkspaceByID(e.ctx, wsID)
132134
if err != nil {
133135
log.Error(e.ctx, "get workspace autostart failed", slog.Error(err))
134136
return nil
135137
}
136138

137139
// Determine the workspace state based on its latest build.
138-
priorHistory, err := db.GetLatestWorkspaceBuildByWorkspaceID(e.ctx, ws.ID)
140+
latestBuild, err := tx.GetLatestWorkspaceBuildByWorkspaceID(e.ctx, ws.ID)
139141
if err != nil {
140142
log.Warn(e.ctx, "get latest workspace build", slog.Error(err))
141143
return nil
142144
}
143-
144-
templateSchedule, err := (*(e.templateScheduleStore.Load())).GetTemplateScheduleOptions(e.ctx, db, ws.TemplateID)
145+
templateSchedule, err := (*(e.templateScheduleStore.Load())).GetTemplateScheduleOptions(e.ctx, tx, ws.TemplateID)
145146
if err != nil {
146147
log.Warn(e.ctx, "get template schedule options", slog.Error(err))
147148
return nil
148149
}
149150

150-
if !isEligibleForAutoStartStop(ws, priorHistory, templateSchedule) {
151-
return nil
152-
}
153-
154-
priorJob, err := db.GetProvisionerJobByID(e.ctx, priorHistory.JobID)
151+
latestJob, err := tx.GetProvisionerJobByID(e.ctx, latestBuild.JobID)
155152
if err != nil {
156153
log.Warn(e.ctx, "get last provisioner job for workspace %q: %w", slog.Error(err))
157154
return nil
158155
}
159156

160-
validTransition, nextTransition, err := getNextTransition(ws, priorHistory, priorJob)
157+
nextTransition, reason, err := getNextTransition(ws, latestBuild, latestJob, templateSchedule, currentTick)
161158
if err != nil {
162159
log.Debug(e.ctx, "skipping workspace", slog.Error(err))
163160
return nil
164161
}
165162

166-
if currentTick.Before(nextTransition) {
167-
log.Debug(e.ctx, "skipping workspace: too early",
168-
slog.F("next_transition_at", nextTransition),
169-
slog.F("transition", validTransition),
170-
slog.F("current_tick", currentTick),
171-
)
172-
return nil
173-
}
174-
builder := wsbuilder.New(ws, validTransition).
175-
SetLastWorkspaceBuildInTx(&priorHistory).
176-
SetLastWorkspaceBuildJobInTx(&priorJob)
177-
178-
switch validTransition {
179-
case database.WorkspaceTransitionStart:
180-
builder = builder.Reason(database.BuildReasonAutostart)
181-
case database.WorkspaceTransitionStop:
182-
builder = builder.Reason(database.BuildReasonAutostop)
183-
default:
184-
log.Error(e.ctx, "unsupported transition", slog.F("transition", validTransition))
185-
return nil
186-
}
187-
if _, _, err := builder.Build(e.ctx, db, nil); err != nil {
163+
builder := wsbuilder.New(ws, nextTransition).
164+
SetLastWorkspaceBuildInTx(&latestBuild).
165+
SetLastWorkspaceBuildJobInTx(&latestJob).
166+
Reason(reason)
167+
168+
if _, _, err := builder.Build(e.ctx, tx, nil); err != nil {
188169
log.Error(e.ctx, "unable to transition workspace",
189-
slog.F("transition", validTransition),
170+
slog.F("transition", nextTransition),
190171
slog.Error(err),
191172
)
192173
return nil
193174
}
194175
statsMu.Lock()
195-
stats.Transitions[ws.ID] = validTransition
176+
stats.Transitions[ws.ID] = nextTransition
196177
statsMu.Unlock()
197178

198-
log.Info(e.ctx, "scheduling workspace transition", slog.F("transition", validTransition))
179+
log.Info(e.ctx, "scheduling workspace transition", slog.F("transition", nextTransition))
199180

200181
return nil
201182

@@ -218,53 +199,81 @@ func (e *Executor) runOnce(t time.Time) Stats {
218199
return stats
219200
}
220201

221-
func isEligibleForAutoStartStop(ws database.Workspace, priorHistory database.WorkspaceBuild, templateSchedule schedule.TemplateScheduleOptions) bool {
222-
if ws.Deleted {
202+
func getNextTransition(
203+
ws database.Workspace,
204+
latestBuild database.WorkspaceBuild,
205+
latestJob database.ProvisionerJob,
206+
templateSchedule schedule.TemplateScheduleOptions,
207+
currentTick time.Time,
208+
) (
209+
database.WorkspaceTransition,
210+
database.BuildReason,
211+
error,
212+
) {
213+
switch {
214+
case isEligibleForAutostop(latestBuild, latestJob, currentTick):
215+
return database.WorkspaceTransitionStop, database.BuildReasonAutostop, nil
216+
case isEligibleForAutostart(ws, latestBuild, latestJob, templateSchedule, currentTick):
217+
return database.WorkspaceTransitionStart, database.BuildReasonAutostart, nil
218+
case isEligibleForFailedStop(latestBuild, latestJob, templateSchedule):
219+
return database.WorkspaceTransitionStop, database.BuildReasonAutostop, nil
220+
default:
221+
return "", "", xerrors.Errorf("last transition not valid for autostart or autostop")
222+
}
223+
}
224+
225+
// isEligibleForAutostart returns true if the workspace should be autostarted.
226+
func isEligibleForAutostart(ws database.Workspace, build database.WorkspaceBuild, job database.ProvisionerJob, templateSchedule schedule.TemplateScheduleOptions, currentTick time.Time) bool {
227+
// Don't attempt to autostart failed workspaces.
228+
if !job.CompletedAt.Valid || job.Error.String != "" {
223229
return false
224230
}
225-
if templateSchedule.UserAutostartEnabled && ws.AutostartSchedule.Valid && ws.AutostartSchedule.String != "" {
226-
return true
231+
232+
// If the last transition for the workspace was not 'stop' then the workspace
233+
// cannot be started.
234+
if build.Transition != database.WorkspaceTransitionStop {
235+
return false
227236
}
228-
// Don't check the template schedule to see whether it allows autostop, this
229-
// is done during the build when determining the deadline.
230-
if priorHistory.Transition == database.WorkspaceTransitionStart && !priorHistory.Deadline.IsZero() {
231-
return true
237+
238+
// If autostart isn't enabled, or the schedule isn't valid/populated we can't
239+
// autostart the workspace.
240+
if !templateSchedule.UserAutostartEnabled || !ws.AutostartSchedule.Valid || ws.AutostartSchedule.String == "" {
241+
return false
232242
}
233243

234-
return false
244+
sched, err := schedule.Weekly(ws.AutostartSchedule.String)
245+
if err != nil {
246+
return false
247+
}
248+
// Round down to the nearest minute, as this is the finest granularity cron supports.
249+
// Truncate is probably not necessary here, but doing it anyway to be sure.
250+
nextTransition := sched.Next(build.CreatedAt).Truncate(time.Minute)
251+
252+
return !currentTick.Before(nextTransition)
235253
}
236254

237-
func getNextTransition(
238-
ws database.Workspace,
239-
priorHistory database.WorkspaceBuild,
240-
priorJob database.ProvisionerJob,
241-
) (
242-
validTransition database.WorkspaceTransition,
243-
nextTransition time.Time,
244-
err error,
245-
) {
246-
if !priorJob.CompletedAt.Valid || priorJob.Error.String != "" {
247-
return "", time.Time{}, xerrors.Errorf("last workspace build did not complete successfully")
255+
// isEligibleForAutostop returns true if the workspace should be autostopped.
256+
func isEligibleForAutostop(build database.WorkspaceBuild, job database.ProvisionerJob, currentTick time.Time) bool {
257+
// Don't attempt to autostop failed workspaces.
258+
if !job.CompletedAt.Valid || job.Error.String != "" {
259+
return false
248260
}
249261

250-
switch priorHistory.Transition {
251-
case database.WorkspaceTransitionStart:
252-
if priorHistory.Deadline.IsZero() {
253-
return "", time.Time{}, xerrors.Errorf("latest workspace build has zero deadline")
254-
}
255-
// For stopping, do not truncate. This is inconsistent with autostart, but
256-
// it ensures we will not stop too early.
257-
return database.WorkspaceTransitionStop, priorHistory.Deadline, nil
258-
case database.WorkspaceTransitionStop:
259-
sched, err := schedule.Weekly(ws.AutostartSchedule.String)
260-
if err != nil {
261-
return "", time.Time{}, xerrors.Errorf("workspace has invalid autostart schedule: %w", err)
262-
}
263-
// Round down to the nearest minute, as this is the finest granularity cron supports.
264-
// Truncate is probably not necessary here, but doing it anyway to be sure.
265-
nextTransition = sched.Next(priorHistory.CreatedAt).Truncate(time.Minute)
266-
return database.WorkspaceTransitionStart, nextTransition, nil
267-
default:
268-
return "", time.Time{}, xerrors.Errorf("last transition not valid for autostart or autostop")
269-
}
262+
// A workspace must be started in order for it to be auto-stopped.
263+
return build.Transition == database.WorkspaceTransitionStart &&
264+
!build.Deadline.IsZero() &&
265+
// We do not want to stop a workspace prior to it breaching its deadline.
266+
!currentTick.Before(build.Deadline)
267+
}
268+
269+
// isEligibleForFailedStop returns true if the workspace is eligible to be stopped
270+
// due to a failed build.
271+
func isEligibleForFailedStop(build database.WorkspaceBuild, job database.ProvisionerJob, templateSchedule schedule.TemplateScheduleOptions) bool {
272+
// If the template has specified a failure TTL.
273+
return templateSchedule.FailureTTL > 0 &&
274+
// And the job resulted in failure.
275+
db2sdk.ProvisionerJobStatus(job) == codersdk.ProvisionerJobFailed &&
276+
build.Transition == database.WorkspaceTransitionStart &&
277+
// And sufficient time has elapsed since the job has completed.
278+
job.CompletedAt.Valid && database.Now().Sub(job.CompletedAt.Time) > templateSchedule.FailureTTL
270279
}

0 commit comments

Comments
 (0)