Skip to content

Commit 6a57630

Browse files
committed
fix: Prevent autobuild executor from slowing down API requests
With just a few workspaces, the autobuild executor can slow down API requests every time it runs. This is because we started a long running transaction and checked all eligible (for autostart) workspaces inside that transaction. PostgreSQL doesn't know if we're modifying rows and as such is locking the tables for read operations. This commit changes the behavior so each workspace is checked in its own transaction reducing the time the table/rows needs to stay locked. For now concurrency has been arbitrarily limited to 10 workspaces at a time, this could be made configurable or adjusted as the need arises.
1 parent b9b9c2f commit 6a57630

File tree

5 files changed

+165
-70
lines changed

5 files changed

+165
-70
lines changed

coderd/autobuild/executor/lifecycle_executor.go

+96-69
Original file line numberDiff line numberDiff line change
@@ -2,17 +2,18 @@ package executor
22

33
import (
44
"context"
5+
"database/sql"
56
"encoding/json"
67
"time"
78

8-
"cdr.dev/slog"
9-
10-
"github.com/coder/coder/coderd/autobuild/schedule"
11-
"github.com/coder/coder/coderd/database"
12-
139
"github.com/google/uuid"
1410
"github.com/moby/moby/pkg/namesgenerator"
11+
"golang.org/x/sync/errgroup"
1512
"golang.org/x/xerrors"
13+
14+
"cdr.dev/slog"
15+
"github.com/coder/coder/coderd/autobuild/schedule"
16+
"github.com/coder/coder/coderd/database"
1617
)
1718

1819
// Executor automatically starts or stops workspaces.
@@ -89,77 +90,103 @@ func (e *Executor) runOnce(t time.Time) Stats {
8990
stats.Error = err
9091
}()
9192
currentTick := t.Truncate(time.Minute)
92-
err = e.db.InTx(func(db database.Store) error {
93-
// TTL is set at the workspace level, and deadline at the workspace build level.
94-
// When a workspace build is created, its deadline initially starts at zero.
95-
// When provisionerd successfully completes a provision job, the deadline is
96-
// set to now + TTL if the associated workspace has a TTL set. This deadline
97-
// is what we compare against when performing autostop operations, rounded down
98-
// to the minute.
99-
//
100-
// NOTE: If a workspace build is created with a given TTL and then the user either
101-
// changes or unsets the TTL, the deadline for the workspace build will not
102-
// have changed. This behavior is as expected per #2229.
103-
eligibleWorkspaces, err := db.GetWorkspacesAutostart(e.ctx)
104-
if err != nil {
105-
return xerrors.Errorf("get eligible workspaces for autostart or autostop: %w", err)
106-
}
10793

108-
for _, ws := range eligibleWorkspaces {
109-
// Determine the workspace state based on its latest build.
110-
priorHistory, err := db.GetLatestWorkspaceBuildByWorkspaceID(e.ctx, ws.ID)
111-
if err != nil {
112-
e.log.Warn(e.ctx, "get latest workspace build",
113-
slog.F("workspace_id", ws.ID),
114-
slog.Error(err),
115-
)
116-
continue
117-
}
94+
// TTL is set at the workspace level, and deadline at the workspace build level.
95+
// When a workspace build is created, its deadline initially starts at zero.
96+
// When provisionerd successfully completes a provision job, the deadline is
97+
// set to now + TTL if the associated workspace has a TTL set. This deadline
98+
// is what we compare against when performing autostop operations, rounded down
99+
// to the minute.
100+
//
101+
// NOTE: If a workspace build is created with a given TTL and then the user either
102+
// changes or unsets the TTL, the deadline for the workspace build will not
103+
// have changed. This behavior is as expected per #2229.
104+
eligibleWorkspaces, err := e.db.GetWorkspacesAutostart(e.ctx)
105+
if err != nil {
106+
e.log.Error(e.ctx, "get eligible workspaces for autostart or autostop", slog.Error(err))
107+
}
118108

119-
priorJob, err := db.GetProvisionerJobByID(e.ctx, priorHistory.JobID)
120-
if err != nil {
121-
e.log.Warn(e.ctx, "get last provisioner job for workspace %q: %w",
122-
slog.F("workspace_id", ws.ID),
123-
slog.Error(err),
124-
)
125-
continue
126-
}
109+
// We only use errgroup here for convenience of API, not for early
110+
// cancellation. This means we only return nil errors in th eg.Go.
111+
eg := errgroup.Group{}
112+
// Limit the concurrency to avoid overloading the database.
113+
eg.SetLimit(10)
127114

128-
validTransition, nextTransition, err := getNextTransition(ws, priorHistory, priorJob)
129-
if err != nil {
130-
e.log.Debug(e.ctx, "skipping workspace",
131-
slog.Error(err),
132-
slog.F("workspace_id", ws.ID),
133-
)
134-
continue
135-
}
115+
for _, ws := range eligibleWorkspaces {
116+
ws := ws
117+
log := e.log.With(slog.F("workspace_id", ws.ID))
136118

137-
if currentTick.Before(nextTransition) {
138-
e.log.Debug(e.ctx, "skipping workspace: too early",
139-
slog.F("workspace_id", ws.ID),
140-
slog.F("next_transition_at", nextTransition),
141-
slog.F("transition", validTransition),
142-
slog.F("current_tick", currentTick),
143-
)
144-
continue
145-
}
119+
eg.Go(func() error {
120+
err := e.db.InTx(func(db database.Store) error {
121+
var err error
146122

147-
e.log.Info(e.ctx, "scheduling workspace transition",
148-
slog.F("workspace_id", ws.ID),
149-
slog.F("transition", validTransition),
150-
)
123+
// Re-check eligibility since the first check was outside the
124+
// transaction and the workspace settings may have changed.
125+
ws, err = db.GetWorkspaceAutostart(e.ctx, ws.ID)
126+
if err != nil {
127+
// Receiving ErrNoRows means the workspace settings changed
128+
// and it is no longer eligible for autostart. Other errors
129+
// means something went wrong.
130+
if !xerrors.Is(err, sql.ErrNoRows) {
131+
log.Error(e.ctx, "get workspace autostart failed", slog.Error(err))
132+
}
133+
return nil
134+
}
135+
136+
// Determine the workspace state based on its latest build.
137+
priorHistory, err := db.GetLatestWorkspaceBuildByWorkspaceID(e.ctx, ws.ID)
138+
if err != nil {
139+
log.Warn(e.ctx, "get latest workspace build", slog.Error(err))
140+
return nil
141+
}
142+
143+
priorJob, err := db.GetProvisionerJobByID(e.ctx, priorHistory.JobID)
144+
if err != nil {
145+
log.Warn(e.ctx, "get last provisioner job for workspace %q: %w", slog.Error(err))
146+
return nil
147+
}
148+
149+
validTransition, nextTransition, err := getNextTransition(ws, priorHistory, priorJob)
150+
if err != nil {
151+
log.Debug(e.ctx, "skipping workspace", slog.Error(err))
152+
return nil
153+
}
154+
155+
if currentTick.Before(nextTransition) {
156+
log.Debug(e.ctx, "skipping workspace: too early",
157+
slog.F("next_transition_at", nextTransition),
158+
slog.F("transition", validTransition),
159+
slog.F("current_tick", currentTick),
160+
)
161+
return nil
162+
}
163+
164+
log.Info(e.ctx, "scheduling workspace transition", slog.F("transition", validTransition))
151165

152-
stats.Transitions[ws.ID] = validTransition
153-
if err := build(e.ctx, db, ws, validTransition, priorHistory, priorJob); err != nil {
154-
e.log.Error(e.ctx, "unable to transition workspace",
155-
slog.F("workspace_id", ws.ID),
156-
slog.F("transition", validTransition),
157-
slog.Error(err),
158-
)
166+
stats.Transitions[ws.ID] = validTransition
167+
if err := build(e.ctx, db, ws, validTransition, priorHistory, priorJob); err != nil {
168+
log.Error(e.ctx, "unable to transition workspace",
169+
slog.F("transition", validTransition),
170+
slog.Error(err),
171+
)
172+
return nil
173+
}
174+
175+
return nil
176+
})
177+
if err != nil {
178+
log.Error(e.ctx, "workspace scheduling failed", slog.Error(err))
159179
}
160-
}
161-
return nil
162-
})
180+
return nil
181+
})
182+
}
183+
184+
// This should not happen since we don't want early cancellation.
185+
err = eg.Wait()
186+
if err != nil {
187+
e.log.Error(e.ctx, "workspace scheduling errgroup failed", slog.Error(err))
188+
}
189+
163190
return stats
164191
}
165192

coderd/database/databasefake/databasefake.go

+15-1
Original file line numberDiff line numberDiff line change
@@ -514,6 +514,19 @@ func (q *fakeQuerier) GetWorkspaceAppsByAgentIDs(_ context.Context, ids []uuid.U
514514
return apps, nil
515515
}
516516

517+
func (q *fakeQuerier) GetWorkspaceAutostart(_ context.Context, workspaceID uuid.UUID) (database.Workspace, error) {
518+
q.mutex.RLock()
519+
defer q.mutex.RUnlock()
520+
for _, ws := range q.workspaces {
521+
if ws.ID == workspaceID {
522+
if ws.AutostartSchedule.String != "" || ws.Ttl.Valid {
523+
return ws, nil
524+
}
525+
}
526+
}
527+
return database.Workspace{}, sql.ErrNoRows
528+
}
529+
517530
func (q *fakeQuerier) GetWorkspacesAutostart(_ context.Context) ([]database.Workspace, error) {
518531
q.mutex.RLock()
519532
defer q.mutex.RUnlock()
@@ -2304,7 +2317,8 @@ func (q *fakeQuerier) GetDeploymentID(_ context.Context) (string, error) {
23042317
}
23052318

23062319
func (q *fakeQuerier) InsertLicense(
2307-
_ context.Context, arg database.InsertLicenseParams) (database.License, error) {
2320+
_ context.Context, arg database.InsertLicenseParams,
2321+
) (database.License, error) {
23082322
q.mutex.Lock()
23092323
defer q.mutex.Unlock()
23102324

coderd/database/querier.go

+1
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

coderd/database/queries.sql.go

+36
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

coderd/database/queries/workspaces.sql

+17
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,23 @@ WHERE
5050
END
5151
;
5252

53+
-- name: GetWorkspaceAutostart :one
54+
SELECT
55+
*
56+
FROM
57+
workspaces
58+
WHERE
59+
deleted = false
60+
AND
61+
(
62+
id = @workspace_id
63+
AND (
64+
(autostart_schedule IS NOT NULL AND autostart_schedule <> '')
65+
OR
66+
(ttl IS NOT NULL AND ttl > 0)
67+
)
68+
);
69+
5370
-- name: GetWorkspacesAutostart :many
5471
SELECT
5572
*

0 commit comments

Comments
 (0)