Skip to content

feat: automatically stop workspaces based on failure_ttl #7989

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 10 commits into from
Jun 22, 2023
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions cli/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ import (
"github.com/coder/coder/cli/cliui"
"github.com/coder/coder/cli/config"
"github.com/coder/coder/coderd"
"github.com/coder/coder/coderd/autobuild/executor"
"github.com/coder/coder/coderd/autobuild"
"github.com/coder/coder/coderd/database"
"github.com/coder/coder/coderd/database/dbfake"
"github.com/coder/coder/coderd/database/dbmetrics"
Expand Down Expand Up @@ -899,7 +899,7 @@ func (r *RootCmd) Server(newAPI func(context.Context, *coderd.Options) (*coderd.

autobuildPoller := time.NewTicker(cfg.AutobuildPollInterval.Value())
defer autobuildPoller.Stop()
autobuildExecutor := executor.New(ctx, options.Database, coderAPI.TemplateScheduleStore, logger, autobuildPoller.C)
autobuildExecutor := autobuild.NewExecutor(ctx, options.Database, coderAPI.TemplateScheduleStore, logger, autobuildPoller.C)
autobuildExecutor.Run()

// Currently there is no way to ask the server to shut
Expand Down
3 changes: 3 additions & 0 deletions coderd/autobuild/doc.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
// Package autobuild contains logic for scheduling workspace
// builds in the background.
package autobuild
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
package executor
package autobuild

import (
"context"
Expand All @@ -13,9 +13,11 @@ import (

"cdr.dev/slog"
"github.com/coder/coder/coderd/database"
"github.com/coder/coder/coderd/database/db2sdk"
"github.com/coder/coder/coderd/database/dbauthz"
"github.com/coder/coder/coderd/schedule"
"github.com/coder/coder/coderd/wsbuilder"
"github.com/coder/coder/codersdk"
)

// Executor automatically starts or stops workspaces.
Expand All @@ -35,8 +37,8 @@ type Stats struct {
Error error
}

// New returns a new autobuild executor.
func New(ctx context.Context, db database.Store, tss *atomic.Pointer[schedule.TemplateScheduleStore], log slog.Logger, tick <-chan time.Time) *Executor {
// New returns a new wsactions executor.
func NewExecutor(ctx context.Context, db database.Store, tss *atomic.Pointer[schedule.TemplateScheduleStore], log slog.Logger, tick <-chan time.Time) *Executor {
le := &Executor{
//nolint:gocritic // Autostart has a limited set of permissions.
ctx: dbauthz.AsAutostart(ctx),
Expand Down Expand Up @@ -108,7 +110,7 @@ func (e *Executor) runOnce(t time.Time) Stats {
// NOTE: If a workspace build is created with a given TTL and then the user either
// changes or unsets the TTL, the deadline for the workspace build will not
// have changed. This behavior is as expected per #2229.
workspaces, err := e.db.GetWorkspacesEligibleForAutoStartStop(e.ctx, t)
workspaces, err := e.db.GetWorkspacesEligibleForTransition(e.ctx, t)
if err != nil {
e.log.Error(e.ctx, "get workspaces for autostart or autostop", slog.Error(err))
return stats
Expand All @@ -125,77 +127,56 @@ func (e *Executor) runOnce(t time.Time) Stats {
log := e.log.With(slog.F("workspace_id", wsID))

eg.Go(func() error {
err := e.db.InTx(func(db database.Store) error {
err := e.db.InTx(func(tx database.Store) error {
// Re-check eligibility since the first check was outside the
// transaction and the workspace settings may have changed.
ws, err := db.GetWorkspaceByID(e.ctx, wsID)
ws, err := tx.GetWorkspaceByID(e.ctx, wsID)
if err != nil {
log.Error(e.ctx, "get workspace autostart failed", slog.Error(err))
return nil
}

// Determine the workspace state based on its latest build.
priorHistory, err := db.GetLatestWorkspaceBuildByWorkspaceID(e.ctx, ws.ID)
latestBuild, err := tx.GetLatestWorkspaceBuildByWorkspaceID(e.ctx, ws.ID)
if err != nil {
log.Warn(e.ctx, "get latest workspace build", slog.Error(err))
return nil
}

templateSchedule, err := (*(e.templateScheduleStore.Load())).GetTemplateScheduleOptions(e.ctx, db, ws.TemplateID)
templateSchedule, err := (*(e.templateScheduleStore.Load())).GetTemplateScheduleOptions(e.ctx, tx, ws.TemplateID)
if err != nil {
log.Warn(e.ctx, "get template schedule options", slog.Error(err))
return nil
}

if !isEligibleForAutoStartStop(ws, priorHistory, templateSchedule) {
return nil
}

priorJob, err := db.GetProvisionerJobByID(e.ctx, priorHistory.JobID)
latestJob, err := tx.GetProvisionerJobByID(e.ctx, latestBuild.JobID)
if err != nil {
log.Warn(e.ctx, "get last provisioner job for workspace %q: %w", slog.Error(err))
return nil
}

validTransition, nextTransition, err := getNextTransition(ws, priorHistory, priorJob)
nextTransition, reason, err := getNextTransition(ws, latestBuild, latestJob, templateSchedule, currentTick)
if err != nil {
log.Debug(e.ctx, "skipping workspace", slog.Error(err))
return nil
}

if currentTick.Before(nextTransition) {
log.Debug(e.ctx, "skipping workspace: too early",
slog.F("next_transition_at", nextTransition),
slog.F("transition", validTransition),
slog.F("current_tick", currentTick),
)
return nil
}
builder := wsbuilder.New(ws, validTransition).
SetLastWorkspaceBuildInTx(&priorHistory).
SetLastWorkspaceBuildJobInTx(&priorJob)

switch validTransition {
case database.WorkspaceTransitionStart:
builder = builder.Reason(database.BuildReasonAutostart)
case database.WorkspaceTransitionStop:
builder = builder.Reason(database.BuildReasonAutostop)
default:
log.Error(e.ctx, "unsupported transition", slog.F("transition", validTransition))
return nil
}
if _, _, err := builder.Build(e.ctx, db, nil); err != nil {
builder := wsbuilder.New(ws, nextTransition).
SetLastWorkspaceBuildInTx(&latestBuild).
SetLastWorkspaceBuildJobInTx(&latestJob).
Reason(reason)

if _, _, err := builder.Build(e.ctx, tx, nil); err != nil {
log.Error(e.ctx, "unable to transition workspace",
slog.F("transition", validTransition),
slog.F("transition", nextTransition),
slog.Error(err),
)
return nil
}
statsMu.Lock()
stats.Transitions[ws.ID] = validTransition
stats.Transitions[ws.ID] = nextTransition
statsMu.Unlock()

log.Info(e.ctx, "scheduling workspace transition", slog.F("transition", validTransition))
log.Info(e.ctx, "scheduling workspace transition", slog.F("transition", nextTransition))

return nil

Expand All @@ -218,53 +199,83 @@ func (e *Executor) runOnce(t time.Time) Stats {
return stats
}

func isEligibleForAutoStartStop(ws database.Workspace, priorHistory database.WorkspaceBuild, templateSchedule schedule.TemplateScheduleOptions) bool {
if ws.Deleted {
func getNextTransition(
ws database.Workspace,
latestBuild database.WorkspaceBuild,
latestJob database.ProvisionerJob,
templateSchedule schedule.TemplateScheduleOptions,
currentTick time.Time,
) (
database.WorkspaceTransition,
database.BuildReason,
error,
) {
switch {
case isEligibleForAutostop(latestBuild, latestJob, currentTick):
return database.WorkspaceTransitionStop, database.BuildReasonAutostop, nil
case isEligibleForAutostart(ws, latestBuild, latestJob, templateSchedule, currentTick):
return database.WorkspaceTransitionStart, database.BuildReasonAutostart, nil
case isEligibleForFailedStop(latestBuild, latestJob, templateSchedule):
return database.WorkspaceTransitionStop, database.BuildReasonAutostop, nil
default:
return "", "", xerrors.Errorf("last transition not valid for autostart or autostop")
}
}

// isEligibleForAutostart returns true if the workspace should be autostarted.
func isEligibleForAutostart(ws database.Workspace, build database.WorkspaceBuild, job database.ProvisionerJob, templateSchedule schedule.TemplateScheduleOptions, currentTick time.Time) bool {
// Don't attempt to autostart failed workspaces.
if !job.CompletedAt.Valid || job.Error.String != "" {
return false
}
if templateSchedule.UserAutostartEnabled && ws.AutostartSchedule.Valid && ws.AutostartSchedule.String != "" {
return true

// If the last transition for the workspace was not 'stop' then the workspace
// cannot be started.
if build.Transition != database.WorkspaceTransitionStop {
return false
}
// Don't check the template schedule to see whether it allows autostop, this
// is done during the build when determining the deadline.
if priorHistory.Transition == database.WorkspaceTransitionStart && !priorHistory.Deadline.IsZero() {
return true

// If autostart isn't enabled, or the schedule isn't valid/populated we can't
// autostart the workspace.
if !templateSchedule.UserAutostartEnabled || !ws.AutostartSchedule.Valid || ws.AutostartSchedule.String == "" {
return false
}

return false
sched, err := schedule.Weekly(ws.AutostartSchedule.String)
if err != nil {
return false
}
// Round down to the nearest minute, as this is the finest granularity cron supports.
// Truncate is probably not necessary here, but doing it anyway to be sure.
nextTransition := sched.Next(build.CreatedAt).Truncate(time.Minute)

return !currentTick.Before(nextTransition)
}

func getNextTransition(
ws database.Workspace,
priorHistory database.WorkspaceBuild,
priorJob database.ProvisionerJob,
) (
validTransition database.WorkspaceTransition,
nextTransition time.Time,
err error,
) {
if !priorJob.CompletedAt.Valid || priorJob.Error.String != "" {
return "", time.Time{}, xerrors.Errorf("last workspace build did not complete successfully")
// isEligibleForAutostart returns true if the workspace should be autostopped.
func isEligibleForAutostop(build database.WorkspaceBuild, job database.ProvisionerJob, currentTick time.Time) bool {
// Don't attempt to autostop failed workspaces.
if !job.CompletedAt.Valid || job.Error.String != "" {
return false
}

switch priorHistory.Transition {
case database.WorkspaceTransitionStart:
if priorHistory.Deadline.IsZero() {
return "", time.Time{}, xerrors.Errorf("latest workspace build has zero deadline")
}
// For stopping, do not truncate. This is inconsistent with autostart, but
// it ensures we will not stop too early.
return database.WorkspaceTransitionStop, priorHistory.Deadline, nil
case database.WorkspaceTransitionStop:
sched, err := schedule.Weekly(ws.AutostartSchedule.String)
if err != nil {
return "", time.Time{}, xerrors.Errorf("workspace has invalid autostart schedule: %w", err)
}
// Round down to the nearest minute, as this is the finest granularity cron supports.
// Truncate is probably not necessary here, but doing it anyway to be sure.
nextTransition = sched.Next(priorHistory.CreatedAt).Truncate(time.Minute)
return database.WorkspaceTransitionStart, nextTransition, nil
default:
return "", time.Time{}, xerrors.Errorf("last transition not valid for autostart or autostop")
}
// A workspace must be started in order for it to be auto-stopped.
return build.Transition == database.WorkspaceTransitionStart &&
!build.Deadline.IsZero() &&
// We do not want to stop a workspace prior to it breaching its deadline.
!currentTick.Before(build.Deadline)
}

// isEligibleForFailedStop returns true if the workspace is eligible to be stopped
// due to a failed build.
func isEligibleForFailedStop(build database.WorkspaceBuild, job database.ProvisionerJob, templateSchedule schedule.TemplateScheduleOptions) bool {
// If the template has specified a failure TLL.
return templateSchedule.FailureTTL > 0 &&
// And the job resulted in failure.
db2sdk.ProvisionerJobStatus(job) == codersdk.ProvisionerJobFailed &&
build.Transition == database.WorkspaceTransitionStart &&
// And sufficient time has elapsed since the job has completed.
(job.CompletedAt.Valid && database.Now().Sub(job.CompletedAt.Time) > templateSchedule.FailureTTL ||
// Or sufficient time has elapsed since the job was canceled.
job.CanceledAt.Valid && database.Now().Sub(job.CanceledAt.Time) > templateSchedule.FailureTTL)
}
Loading