Skip to content

Commit 4c18034

Browse files
authored
fix: Prevent autobuild executor from slowing down API requests (#3726)
With just a few workspaces, the autobuild executor can slow down API requests every time it runs. This is because we started a long running transaction and checked all eligible (for autostart) workspaces inside that transaction. PostgreSQL doesn't know if we're modifying rows and as such is locking the tables for read operations. This commit changes the behavior so each workspace is checked in its own transaction reducing the time the table/rows needs to stay locked. For now concurrency has been arbitrarily limited to 10 workspaces at a time, this could be made configurable or adjusted as the need arises.
1 parent 3f73243 commit 4c18034

File tree

5 files changed

+106
-148
lines changed

5 files changed

+106
-148
lines changed

coderd/autobuild/executor/lifecycle_executor.go

+104-68
Original file line numberDiff line numberDiff line change
@@ -5,14 +5,14 @@ import (
55
"encoding/json"
66
"time"
77

8-
"cdr.dev/slog"
9-
10-
"github.com/coder/coder/coderd/autobuild/schedule"
11-
"github.com/coder/coder/coderd/database"
12-
138
"github.com/google/uuid"
149
"github.com/moby/moby/pkg/namesgenerator"
10+
"golang.org/x/sync/errgroup"
1511
"golang.org/x/xerrors"
12+
13+
"cdr.dev/slog"
14+
"github.com/coder/coder/coderd/autobuild/schedule"
15+
"github.com/coder/coder/coderd/database"
1616
)
1717

1818
// Executor automatically starts or stops workspaces.
@@ -89,80 +89,116 @@ func (e *Executor) runOnce(t time.Time) Stats {
8989
stats.Error = err
9090
}()
9191
currentTick := t.Truncate(time.Minute)
92-
err = e.db.InTx(func(db database.Store) error {
93-
// TTL is set at the workspace level, and deadline at the workspace build level.
94-
// When a workspace build is created, its deadline initially starts at zero.
95-
// When provisionerd successfully completes a provision job, the deadline is
96-
// set to now + TTL if the associated workspace has a TTL set. This deadline
97-
// is what we compare against when performing autostop operations, rounded down
98-
// to the minute.
99-
//
100-
// NOTE: If a workspace build is created with a given TTL and then the user either
101-
// changes or unsets the TTL, the deadline for the workspace build will not
102-
// have changed. This behavior is as expected per #2229.
103-
eligibleWorkspaces, err := db.GetWorkspacesAutostart(e.ctx)
104-
if err != nil {
105-
return xerrors.Errorf("get eligible workspaces for autostart or autostop: %w", err)
92+
93+
// TTL is set at the workspace level, and deadline at the workspace build level.
94+
// When a workspace build is created, its deadline initially starts at zero.
95+
// When provisionerd successfully completes a provision job, the deadline is
96+
// set to now + TTL if the associated workspace has a TTL set. This deadline
97+
// is what we compare against when performing autostop operations, rounded down
98+
// to the minute.
99+
//
100+
// NOTE: If a workspace build is created with a given TTL and then the user either
101+
// changes or unsets the TTL, the deadline for the workspace build will not
102+
// have changed. This behavior is as expected per #2229.
103+
workspaces, err := e.db.GetWorkspaces(e.ctx, database.GetWorkspacesParams{
104+
Deleted: false,
105+
})
106+
if err != nil {
107+
e.log.Error(e.ctx, "get workspaces for autostart or autostop", slog.Error(err))
108+
return stats
109+
}
110+
111+
var eligibleWorkspaceIDs []uuid.UUID
112+
for _, ws := range workspaces {
113+
if isEligibleForAutoStartStop(ws) {
114+
eligibleWorkspaceIDs = append(eligibleWorkspaceIDs, ws.ID)
106115
}
116+
}
107117

108-
for _, ws := range eligibleWorkspaces {
109-
// Determine the workspace state based on its latest build.
110-
priorHistory, err := db.GetLatestWorkspaceBuildByWorkspaceID(e.ctx, ws.ID)
111-
if err != nil {
112-
e.log.Warn(e.ctx, "get latest workspace build",
113-
slog.F("workspace_id", ws.ID),
114-
slog.Error(err),
115-
)
116-
continue
117-
}
118+
// We only use errgroup here for convenience of API, not for early
119+
// cancellation. This means we only return nil errors in th eg.Go.
120+
eg := errgroup.Group{}
121+
// Limit the concurrency to avoid overloading the database.
122+
eg.SetLimit(10)
118123

119-
priorJob, err := db.GetProvisionerJobByID(e.ctx, priorHistory.JobID)
120-
if err != nil {
121-
e.log.Warn(e.ctx, "get last provisioner job for workspace %q: %w",
122-
slog.F("workspace_id", ws.ID),
123-
slog.Error(err),
124-
)
125-
continue
126-
}
124+
for _, wsID := range eligibleWorkspaceIDs {
125+
wsID := wsID
126+
log := e.log.With(slog.F("workspace_id", wsID))
127127

128-
validTransition, nextTransition, err := getNextTransition(ws, priorHistory, priorJob)
129-
if err != nil {
130-
e.log.Debug(e.ctx, "skipping workspace",
131-
slog.Error(err),
132-
slog.F("workspace_id", ws.ID),
133-
)
134-
continue
135-
}
128+
eg.Go(func() error {
129+
err := e.db.InTx(func(db database.Store) error {
130+
// Re-check eligibility since the first check was outside the
131+
// transaction and the workspace settings may have changed.
132+
ws, err := db.GetWorkspaceByID(e.ctx, wsID)
133+
if err != nil {
134+
log.Error(e.ctx, "get workspace autostart failed", slog.Error(err))
135+
return nil
136+
}
137+
if !isEligibleForAutoStartStop(ws) {
138+
return nil
139+
}
136140

137-
if currentTick.Before(nextTransition) {
138-
e.log.Debug(e.ctx, "skipping workspace: too early",
139-
slog.F("workspace_id", ws.ID),
140-
slog.F("next_transition_at", nextTransition),
141-
slog.F("transition", validTransition),
142-
slog.F("current_tick", currentTick),
143-
)
144-
continue
145-
}
141+
// Determine the workspace state based on its latest build.
142+
priorHistory, err := db.GetLatestWorkspaceBuildByWorkspaceID(e.ctx, ws.ID)
143+
if err != nil {
144+
log.Warn(e.ctx, "get latest workspace build", slog.Error(err))
145+
return nil
146+
}
147+
148+
priorJob, err := db.GetProvisionerJobByID(e.ctx, priorHistory.JobID)
149+
if err != nil {
150+
log.Warn(e.ctx, "get last provisioner job for workspace %q: %w", slog.Error(err))
151+
return nil
152+
}
153+
154+
validTransition, nextTransition, err := getNextTransition(ws, priorHistory, priorJob)
155+
if err != nil {
156+
log.Debug(e.ctx, "skipping workspace", slog.Error(err))
157+
return nil
158+
}
159+
160+
if currentTick.Before(nextTransition) {
161+
log.Debug(e.ctx, "skipping workspace: too early",
162+
slog.F("next_transition_at", nextTransition),
163+
slog.F("transition", validTransition),
164+
slog.F("current_tick", currentTick),
165+
)
166+
return nil
167+
}
168+
169+
log.Info(e.ctx, "scheduling workspace transition", slog.F("transition", validTransition))
146170

147-
e.log.Info(e.ctx, "scheduling workspace transition",
148-
slog.F("workspace_id", ws.ID),
149-
slog.F("transition", validTransition),
150-
)
171+
stats.Transitions[ws.ID] = validTransition
172+
if err := build(e.ctx, db, ws, validTransition, priorHistory, priorJob); err != nil {
173+
log.Error(e.ctx, "unable to transition workspace",
174+
slog.F("transition", validTransition),
175+
slog.Error(err),
176+
)
177+
return nil
178+
}
151179

152-
stats.Transitions[ws.ID] = validTransition
153-
if err := build(e.ctx, db, ws, validTransition, priorHistory, priorJob); err != nil {
154-
e.log.Error(e.ctx, "unable to transition workspace",
155-
slog.F("workspace_id", ws.ID),
156-
slog.F("transition", validTransition),
157-
slog.Error(err),
158-
)
180+
return nil
181+
})
182+
if err != nil {
183+
log.Error(e.ctx, "workspace scheduling failed", slog.Error(err))
159184
}
160-
}
161-
return nil
162-
})
185+
return nil
186+
})
187+
}
188+
189+
// This should not happen since we don't want early cancellation.
190+
err = eg.Wait()
191+
if err != nil {
192+
e.log.Error(e.ctx, "workspace scheduling errgroup failed", slog.Error(err))
193+
}
194+
163195
return stats
164196
}
165197

198+
func isEligibleForAutoStartStop(ws database.Workspace) bool {
199+
return !ws.Deleted && (ws.AutostartSchedule.String != "" || ws.Ttl.Int64 > 0)
200+
}
201+
166202
func getNextTransition(
167203
ws database.Workspace,
168204
priorHistory database.WorkspaceBuild,

coderd/database/databasefake/databasefake.go

+2-15
Original file line numberDiff line numberDiff line change
@@ -588,20 +588,6 @@ func (q *fakeQuerier) GetWorkspaceAppsByAgentIDs(_ context.Context, ids []uuid.U
588588
return apps, nil
589589
}
590590

591-
func (q *fakeQuerier) GetWorkspacesAutostart(_ context.Context) ([]database.Workspace, error) {
592-
q.mutex.RLock()
593-
defer q.mutex.RUnlock()
594-
workspaces := make([]database.Workspace, 0)
595-
for _, ws := range q.workspaces {
596-
if ws.AutostartSchedule.String != "" {
597-
workspaces = append(workspaces, ws)
598-
} else if ws.Ttl.Valid {
599-
workspaces = append(workspaces, ws)
600-
}
601-
}
602-
return workspaces, nil
603-
}
604-
605591
func (q *fakeQuerier) GetWorkspaceOwnerCountsByTemplateIDs(_ context.Context, templateIDs []uuid.UUID) ([]database.GetWorkspaceOwnerCountsByTemplateIDsRow, error) {
606592
q.mutex.RLock()
607593
defer q.mutex.RUnlock()
@@ -2383,7 +2369,8 @@ func (q *fakeQuerier) GetDeploymentID(_ context.Context) (string, error) {
23832369
}
23842370

23852371
func (q *fakeQuerier) InsertLicense(
2386-
_ context.Context, arg database.InsertLicenseParams) (database.License, error) {
2372+
_ context.Context, arg database.InsertLicenseParams,
2373+
) (database.License, error) {
23872374
q.mutex.Lock()
23882375
defer q.mutex.Unlock()
23892376

coderd/database/querier.go

-1
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

coderd/database/queries.sql.go

-50
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

coderd/database/queries/workspaces.sql

-14
Original file line numberDiff line numberDiff line change
@@ -50,20 +50,6 @@ WHERE
5050
END
5151
;
5252

53-
-- name: GetWorkspacesAutostart :many
54-
SELECT
55-
*
56-
FROM
57-
workspaces
58-
WHERE
59-
deleted = false
60-
AND
61-
(
62-
(autostart_schedule IS NOT NULL AND autostart_schedule <> '')
63-
OR
64-
(ttl IS NOT NULL AND ttl > 0)
65-
);
66-
6753
-- name: GetWorkspaceByOwnerIDAndName :one
6854
SELECT
6955
*

0 commit comments

Comments
 (0)