Skip to content

fix: Prevent autobuild executor from slowing down API requests #3726

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Sep 2, 2022
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next Next commit
fix: Prevent autobuild executor from slowing down API requests
With just a few workspaces, the autobuild executor can slow down API
requests every time it runs. This is because we started a long running
transaction and checked all eligible (for autostart) workspaces inside
that transaction. PostgreSQL doesn't know if we're modifying rows and as
such is locking the tables for read operations.

This commit changes the behavior so each workspace is checked in its own
transaction reducing the time the table/rows needs to stay locked.

For now concurrency has been arbitrarily limited to 10 workspaces at a
time, this could be made configurable or adjusted as the need arises.
  • Loading branch information
mafredri committed Sep 2, 2022
commit 11b7eb8c0fe5ef48e5e86d8376edc66d9634223a
165 changes: 96 additions & 69 deletions coderd/autobuild/executor/lifecycle_executor.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,17 +2,18 @@ package executor

import (
"context"
"database/sql"
"encoding/json"
"time"

"cdr.dev/slog"

"github.com/coder/coder/coderd/autobuild/schedule"
"github.com/coder/coder/coderd/database"

"github.com/google/uuid"
"github.com/moby/moby/pkg/namesgenerator"
"golang.org/x/sync/errgroup"
"golang.org/x/xerrors"

"cdr.dev/slog"
"github.com/coder/coder/coderd/autobuild/schedule"
"github.com/coder/coder/coderd/database"
)

// Executor automatically starts or stops workspaces.
Expand Down Expand Up @@ -89,77 +90,103 @@ func (e *Executor) runOnce(t time.Time) Stats {
stats.Error = err
}()
currentTick := t.Truncate(time.Minute)
err = e.db.InTx(func(db database.Store) error {
// TTL is set at the workspace level, and deadline at the workspace build level.
// When a workspace build is created, its deadline initially starts at zero.
// When provisionerd successfully completes a provision job, the deadline is
// set to now + TTL if the associated workspace has a TTL set. This deadline
// is what we compare against when performing autostop operations, rounded down
// to the minute.
//
// NOTE: If a workspace build is created with a given TTL and then the user either
// changes or unsets the TTL, the deadline for the workspace build will not
// have changed. This behavior is as expected per #2229.
eligibleWorkspaces, err := db.GetWorkspacesAutostart(e.ctx)
if err != nil {
return xerrors.Errorf("get eligible workspaces for autostart or autostop: %w", err)
}

for _, ws := range eligibleWorkspaces {
// Determine the workspace state based on its latest build.
priorHistory, err := db.GetLatestWorkspaceBuildByWorkspaceID(e.ctx, ws.ID)
if err != nil {
e.log.Warn(e.ctx, "get latest workspace build",
slog.F("workspace_id", ws.ID),
slog.Error(err),
)
continue
}
// TTL is set at the workspace level, and deadline at the workspace build level.
// When a workspace build is created, its deadline initially starts at zero.
// When provisionerd successfully completes a provision job, the deadline is
// set to now + TTL if the associated workspace has a TTL set. This deadline
// is what we compare against when performing autostop operations, rounded down
// to the minute.
//
// NOTE: If a workspace build is created with a given TTL and then the user either
// changes or unsets the TTL, the deadline for the workspace build will not
// have changed. This behavior is as expected per #2229.
eligibleWorkspaces, err := e.db.GetWorkspacesAutostart(e.ctx)
if err != nil {
e.log.Error(e.ctx, "get eligible workspaces for autostart or autostop", slog.Error(err))
}

priorJob, err := db.GetProvisionerJobByID(e.ctx, priorHistory.JobID)
if err != nil {
e.log.Warn(e.ctx, "get last provisioner job for workspace %q: %w",
slog.F("workspace_id", ws.ID),
slog.Error(err),
)
continue
}
// We only use errgroup here for convenience of API, not for early
// cancellation. This means we only return nil errors in th eg.Go.
eg := errgroup.Group{}
// Limit the concurrency to avoid overloading the database.
eg.SetLimit(10)

validTransition, nextTransition, err := getNextTransition(ws, priorHistory, priorJob)
if err != nil {
e.log.Debug(e.ctx, "skipping workspace",
slog.Error(err),
slog.F("workspace_id", ws.ID),
)
continue
}
for _, ws := range eligibleWorkspaces {
ws := ws
log := e.log.With(slog.F("workspace_id", ws.ID))

if currentTick.Before(nextTransition) {
e.log.Debug(e.ctx, "skipping workspace: too early",
slog.F("workspace_id", ws.ID),
slog.F("next_transition_at", nextTransition),
slog.F("transition", validTransition),
slog.F("current_tick", currentTick),
)
continue
}
eg.Go(func() error {
err := e.db.InTx(func(db database.Store) error {
var err error

e.log.Info(e.ctx, "scheduling workspace transition",
slog.F("workspace_id", ws.ID),
slog.F("transition", validTransition),
)
// Re-check eligibility since the first check was outside the
// transaction and the workspace settings may have changed.
ws, err = db.GetWorkspaceAutostart(e.ctx, ws.ID)
if err != nil {
// Receiving ErrNoRows means the workspace settings changed
// and it is no longer eligible for autostart. Other errors
// means something went wrong.
if !xerrors.Is(err, sql.ErrNoRows) {
log.Error(e.ctx, "get workspace autostart failed", slog.Error(err))
}
return nil
}

// Determine the workspace state based on its latest build.
priorHistory, err := db.GetLatestWorkspaceBuildByWorkspaceID(e.ctx, ws.ID)
if err != nil {
log.Warn(e.ctx, "get latest workspace build", slog.Error(err))
return nil
}

priorJob, err := db.GetProvisionerJobByID(e.ctx, priorHistory.JobID)
if err != nil {
log.Warn(e.ctx, "get last provisioner job for workspace %q: %w", slog.Error(err))
return nil
}

validTransition, nextTransition, err := getNextTransition(ws, priorHistory, priorJob)
if err != nil {
log.Debug(e.ctx, "skipping workspace", slog.Error(err))
return nil
}

if currentTick.Before(nextTransition) {
log.Debug(e.ctx, "skipping workspace: too early",
slog.F("next_transition_at", nextTransition),
slog.F("transition", validTransition),
slog.F("current_tick", currentTick),
)
return nil
}

log.Info(e.ctx, "scheduling workspace transition", slog.F("transition", validTransition))

stats.Transitions[ws.ID] = validTransition
if err := build(e.ctx, db, ws, validTransition, priorHistory, priorJob); err != nil {
e.log.Error(e.ctx, "unable to transition workspace",
slog.F("workspace_id", ws.ID),
slog.F("transition", validTransition),
slog.Error(err),
)
stats.Transitions[ws.ID] = validTransition
if err := build(e.ctx, db, ws, validTransition, priorHistory, priorJob); err != nil {
log.Error(e.ctx, "unable to transition workspace",
slog.F("transition", validTransition),
slog.Error(err),
)
return nil
}

return nil
})
if err != nil {
log.Error(e.ctx, "workspace scheduling failed", slog.Error(err))
}
}
return nil
})
return nil
})
}

// This should not happen since we don't want early cancellation.
err = eg.Wait()
if err != nil {
e.log.Error(e.ctx, "workspace scheduling errgroup failed", slog.Error(err))
}

return stats
}

Expand Down
16 changes: 15 additions & 1 deletion coderd/database/databasefake/databasefake.go
Original file line number Diff line number Diff line change
Expand Up @@ -588,6 +588,19 @@ func (q *fakeQuerier) GetWorkspaceAppsByAgentIDs(_ context.Context, ids []uuid.U
return apps, nil
}

func (q *fakeQuerier) GetWorkspaceAutostart(_ context.Context, workspaceID uuid.UUID) (database.Workspace, error) {
q.mutex.RLock()
defer q.mutex.RUnlock()
for _, ws := range q.workspaces {
if ws.ID == workspaceID {
if ws.AutostartSchedule.String != "" || ws.Ttl.Valid {
return ws, nil
}
}
}
return database.Workspace{}, sql.ErrNoRows
}

func (q *fakeQuerier) GetWorkspacesAutostart(_ context.Context) ([]database.Workspace, error) {
q.mutex.RLock()
defer q.mutex.RUnlock()
Expand Down Expand Up @@ -2383,7 +2396,8 @@ func (q *fakeQuerier) GetDeploymentID(_ context.Context) (string, error) {
}

func (q *fakeQuerier) InsertLicense(
_ context.Context, arg database.InsertLicenseParams) (database.License, error) {
_ context.Context, arg database.InsertLicenseParams,
) (database.License, error) {
q.mutex.Lock()
defer q.mutex.Unlock()

Expand Down
1 change: 1 addition & 0 deletions coderd/database/querier.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

36 changes: 36 additions & 0 deletions coderd/database/queries.sql.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

17 changes: 17 additions & 0 deletions coderd/database/queries/workspaces.sql
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,23 @@ WHERE
END
;

-- name: GetWorkspaceAutostart :one
SELECT
*
FROM
workspaces
WHERE
deleted = false
AND
(
id = @workspace_id
AND (
(autostart_schedule IS NOT NULL AND autostart_schedule <> '')
OR
(ttl IS NOT NULL AND ttl > 0)
)
);

-- name: GetWorkspacesAutostart :many
SELECT
*
Expand Down