Skip to content

fix: Prevent autobuild executor from slowing down API requests #3726

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Sep 2, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
172 changes: 104 additions & 68 deletions coderd/autobuild/executor/lifecycle_executor.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,14 @@ import (
"encoding/json"
"time"

"cdr.dev/slog"

"github.com/coder/coder/coderd/autobuild/schedule"
"github.com/coder/coder/coderd/database"

"github.com/google/uuid"
"github.com/moby/moby/pkg/namesgenerator"
"golang.org/x/sync/errgroup"
"golang.org/x/xerrors"

"cdr.dev/slog"
"github.com/coder/coder/coderd/autobuild/schedule"
"github.com/coder/coder/coderd/database"
)

// Executor automatically starts or stops workspaces.
Expand Down Expand Up @@ -89,80 +89,116 @@ func (e *Executor) runOnce(t time.Time) Stats {
stats.Error = err
}()
currentTick := t.Truncate(time.Minute)
err = e.db.InTx(func(db database.Store) error {
// TTL is set at the workspace level, and deadline at the workspace build level.
// When a workspace build is created, its deadline initially starts at zero.
// When provisionerd successfully completes a provision job, the deadline is
// set to now + TTL if the associated workspace has a TTL set. This deadline
// is what we compare against when performing autostop operations, rounded down
// to the minute.
//
// NOTE: If a workspace build is created with a given TTL and then the user either
// changes or unsets the TTL, the deadline for the workspace build will not
// have changed. This behavior is as expected per #2229.
eligibleWorkspaces, err := db.GetWorkspacesAutostart(e.ctx)
if err != nil {
return xerrors.Errorf("get eligible workspaces for autostart or autostop: %w", err)

// TTL is set at the workspace level, and deadline at the workspace build level.
// When a workspace build is created, its deadline initially starts at zero.
// When provisionerd successfully completes a provision job, the deadline is
// set to now + TTL if the associated workspace has a TTL set. This deadline
// is what we compare against when performing autostop operations, rounded down
// to the minute.
//
// NOTE: If a workspace build is created with a given TTL and then the user either
// changes or unsets the TTL, the deadline for the workspace build will not
// have changed. This behavior is as expected per #2229.
workspaces, err := e.db.GetWorkspaces(e.ctx, database.GetWorkspacesParams{
Deleted: false,
})
if err != nil {
e.log.Error(e.ctx, "get workspaces for autostart or autostop", slog.Error(err))
return stats
}

var eligibleWorkspaceIDs []uuid.UUID
for _, ws := range workspaces {
if isEligibleForAutoStartStop(ws) {
eligibleWorkspaceIDs = append(eligibleWorkspaceIDs, ws.ID)
}
}

for _, ws := range eligibleWorkspaces {
// Determine the workspace state based on its latest build.
priorHistory, err := db.GetLatestWorkspaceBuildByWorkspaceID(e.ctx, ws.ID)
if err != nil {
e.log.Warn(e.ctx, "get latest workspace build",
slog.F("workspace_id", ws.ID),
slog.Error(err),
)
continue
}
// We only use errgroup here for convenience of API, not for early
// cancellation. This means we only return nil errors in th eg.Go.
eg := errgroup.Group{}
// Limit the concurrency to avoid overloading the database.
eg.SetLimit(10)

priorJob, err := db.GetProvisionerJobByID(e.ctx, priorHistory.JobID)
if err != nil {
e.log.Warn(e.ctx, "get last provisioner job for workspace %q: %w",
slog.F("workspace_id", ws.ID),
slog.Error(err),
)
continue
}
for _, wsID := range eligibleWorkspaceIDs {
wsID := wsID
log := e.log.With(slog.F("workspace_id", wsID))

validTransition, nextTransition, err := getNextTransition(ws, priorHistory, priorJob)
if err != nil {
e.log.Debug(e.ctx, "skipping workspace",
slog.Error(err),
slog.F("workspace_id", ws.ID),
)
continue
}
eg.Go(func() error {
err := e.db.InTx(func(db database.Store) error {
// Re-check eligibility since the first check was outside the
// transaction and the workspace settings may have changed.
ws, err := db.GetWorkspaceByID(e.ctx, wsID)
if err != nil {
log.Error(e.ctx, "get workspace autostart failed", slog.Error(err))
return nil
}
if !isEligibleForAutoStartStop(ws) {
return nil
}

if currentTick.Before(nextTransition) {
e.log.Debug(e.ctx, "skipping workspace: too early",
slog.F("workspace_id", ws.ID),
slog.F("next_transition_at", nextTransition),
slog.F("transition", validTransition),
slog.F("current_tick", currentTick),
)
continue
}
// Determine the workspace state based on its latest build.
priorHistory, err := db.GetLatestWorkspaceBuildByWorkspaceID(e.ctx, ws.ID)
if err != nil {
log.Warn(e.ctx, "get latest workspace build", slog.Error(err))
return nil
}

priorJob, err := db.GetProvisionerJobByID(e.ctx, priorHistory.JobID)
if err != nil {
log.Warn(e.ctx, "get last provisioner job for workspace %q: %w", slog.Error(err))
return nil
}

validTransition, nextTransition, err := getNextTransition(ws, priorHistory, priorJob)
if err != nil {
log.Debug(e.ctx, "skipping workspace", slog.Error(err))
return nil
}

if currentTick.Before(nextTransition) {
log.Debug(e.ctx, "skipping workspace: too early",
slog.F("next_transition_at", nextTransition),
slog.F("transition", validTransition),
slog.F("current_tick", currentTick),
)
return nil
}

log.Info(e.ctx, "scheduling workspace transition", slog.F("transition", validTransition))

e.log.Info(e.ctx, "scheduling workspace transition",
slog.F("workspace_id", ws.ID),
slog.F("transition", validTransition),
)
stats.Transitions[ws.ID] = validTransition
if err := build(e.ctx, db, ws, validTransition, priorHistory, priorJob); err != nil {
log.Error(e.ctx, "unable to transition workspace",
slog.F("transition", validTransition),
slog.Error(err),
)
return nil
}

stats.Transitions[ws.ID] = validTransition
if err := build(e.ctx, db, ws, validTransition, priorHistory, priorJob); err != nil {
e.log.Error(e.ctx, "unable to transition workspace",
slog.F("workspace_id", ws.ID),
slog.F("transition", validTransition),
slog.Error(err),
)
return nil
})
if err != nil {
log.Error(e.ctx, "workspace scheduling failed", slog.Error(err))
}
}
return nil
})
return nil
})
}

// This should not happen since we don't want early cancellation.
err = eg.Wait()
if err != nil {
e.log.Error(e.ctx, "workspace scheduling errgroup failed", slog.Error(err))
}

return stats
}

func isEligibleForAutoStartStop(ws database.Workspace) bool {
return !ws.Deleted && (ws.AutostartSchedule.String != "" || ws.Ttl.Int64 > 0)
}

func getNextTransition(
ws database.Workspace,
priorHistory database.WorkspaceBuild,
Expand Down
17 changes: 2 additions & 15 deletions coderd/database/databasefake/databasefake.go
Original file line number Diff line number Diff line change
Expand Up @@ -588,20 +588,6 @@ func (q *fakeQuerier) GetWorkspaceAppsByAgentIDs(_ context.Context, ids []uuid.U
return apps, nil
}

func (q *fakeQuerier) GetWorkspacesAutostart(_ context.Context) ([]database.Workspace, error) {
q.mutex.RLock()
defer q.mutex.RUnlock()
workspaces := make([]database.Workspace, 0)
for _, ws := range q.workspaces {
if ws.AutostartSchedule.String != "" {
workspaces = append(workspaces, ws)
} else if ws.Ttl.Valid {
workspaces = append(workspaces, ws)
}
}
return workspaces, nil
}

func (q *fakeQuerier) GetWorkspaceOwnerCountsByTemplateIDs(_ context.Context, templateIDs []uuid.UUID) ([]database.GetWorkspaceOwnerCountsByTemplateIDsRow, error) {
q.mutex.RLock()
defer q.mutex.RUnlock()
Expand Down Expand Up @@ -2383,7 +2369,8 @@ func (q *fakeQuerier) GetDeploymentID(_ context.Context) (string, error) {
}

func (q *fakeQuerier) InsertLicense(
_ context.Context, arg database.InsertLicenseParams) (database.License, error) {
_ context.Context, arg database.InsertLicenseParams,
) (database.License, error) {
q.mutex.Lock()
defer q.mutex.Unlock()

Expand Down
1 change: 0 additions & 1 deletion coderd/database/querier.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

50 changes: 0 additions & 50 deletions coderd/database/queries.sql.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

14 changes: 0 additions & 14 deletions coderd/database/queries/workspaces.sql
Original file line number Diff line number Diff line change
Expand Up @@ -50,20 +50,6 @@ WHERE
END
;

-- name: GetWorkspacesAutostart :many
SELECT
*
FROM
workspaces
WHERE
deleted = false
AND
(
(autostart_schedule IS NOT NULL AND autostart_schedule <> '')
OR
(ttl IS NOT NULL AND ttl > 0)
);

-- name: GetWorkspaceByOwnerIDAndName :one
SELECT
*
Expand Down