Skip to content

Commit 75ab16d

Browse files
authored
fix: prevent db deadlock when workspaces go dormant (#10618)
1 parent 76e7a1d commit 75ab16d

File tree

4 files changed

+113
-34
lines changed

4 files changed

+113
-34
lines changed

coderd/autobuild/lifecycle_executor.go

+32-30
Original file line numberDiff line numberDiff line change
@@ -140,42 +140,45 @@ func (e *Executor) runOnce(t time.Time) Stats {
140140

141141
eg.Go(func() error {
142142
var job *database.ProvisionerJob
143+
var auditLog *auditParams
143144
err := e.db.InTx(func(tx database.Store) error {
144145
// Re-check eligibility since the first check was outside the
145146
// transaction and the workspace settings may have changed.
146147
ws, err := tx.GetWorkspaceByID(e.ctx, wsID)
147148
if err != nil {
148-
log.Error(e.ctx, "get workspace autostart failed", slog.Error(err))
149-
return nil
149+
return xerrors.Errorf("get workspace by id: %w", err)
150150
}
151151

152152
// Determine the workspace state based on its latest build.
153153
latestBuild, err := tx.GetLatestWorkspaceBuildByWorkspaceID(e.ctx, ws.ID)
154154
if err != nil {
155-
log.Warn(e.ctx, "get latest workspace build", slog.Error(err))
156-
return nil
155+
return xerrors.Errorf("get latest workspace build: %w", err)
157156
}
158-
templateSchedule, err := (*(e.templateScheduleStore.Load())).Get(e.ctx, tx, ws.TemplateID)
157+
158+
latestJob, err := tx.GetProvisionerJobByID(e.ctx, latestBuild.JobID)
159159
if err != nil {
160-
log.Warn(e.ctx, "get template schedule options", slog.Error(err))
161-
return nil
160+
return xerrors.Errorf("get latest provisioner job: %w", err)
162161
}
163162

164-
template, err := tx.GetTemplateByID(e.ctx, ws.TemplateID)
163+
templateSchedule, err := (*(e.templateScheduleStore.Load())).Get(e.ctx, tx, ws.TemplateID)
165164
if err != nil {
166-
log.Warn(e.ctx, "get template by id", slog.Error(err))
165+
return xerrors.Errorf("get template scheduling options: %w", err)
167166
}
168-
accessControl := (*(e.accessControlStore.Load())).GetTemplateAccessControl(template)
169167

170-
latestJob, err := tx.GetProvisionerJobByID(e.ctx, latestBuild.JobID)
168+
template, err := tx.GetTemplateByID(e.ctx, ws.TemplateID)
171169
if err != nil {
172-
log.Warn(e.ctx, "get last provisioner job for workspace %q: %w", slog.Error(err))
173-
return nil
170+
return xerrors.Errorf("get template by ID: %w", err)
174171
}
175172

173+
accessControl := (*(e.accessControlStore.Load())).GetTemplateAccessControl(template)
174+
176175
nextTransition, reason, err := getNextTransition(ws, latestBuild, latestJob, templateSchedule, currentTick)
177176
if err != nil {
178177
log.Debug(e.ctx, "skipping workspace", slog.Error(err))
178+
// err is used to indicate that a workspace is not eligible
179+
// so returning nil here is ok although ultimately the distinction
180+
// doesn't matter since the transaction is read-only up to
181+
// this point.
179182
return nil
180183
}
181184

@@ -193,17 +196,16 @@ func (e *Executor) runOnce(t time.Time) Stats {
193196
}
194197

195198
build, job, err = builder.Build(e.ctx, tx, nil, audit.WorkspaceBuildBaggage{IP: "127.0.0.1"})
196-
197199
if err != nil {
198200
log.Error(e.ctx, "unable to transition workspace",
199201
slog.F("transition", nextTransition),
200202
slog.Error(err),
201203
)
202-
return nil
204+
return xerrors.Errorf("build workspace: %w", err)
203205
}
204206
}
205207

206-
// Transition the workspace to dormant if it has breached the template's
208+
// Transition the workspace to dormant if it has breached the template's
207209
// threshold for inactivity.
208210
if reason == database.BuildReasonAutolock {
209211
wsOld := ws
@@ -215,21 +217,15 @@ func (e *Executor) runOnce(t time.Time) Stats {
215217
},
216218
})
217219

218-
auditBuild(e.ctx, e.log, *e.auditor.Load(), auditParams{
219-
Build: build,
220-
Job: latestJob,
221-
Reason: reason,
222-
Old: wsOld,
223-
New: ws,
224-
Success: err == nil,
225-
})
226-
220+
auditLog = &auditParams{
221+
Build: build,
222+
Job: latestJob,
223+
Reason: reason,
224+
Old: wsOld,
225+
New: ws,
226+
}
227227
if err != nil {
228-
log.Error(e.ctx, "unable to transition workspace to dormant",
229-
slog.F("transition", nextTransition),
230-
slog.Error(err),
231-
)
232-
return nil
228+
return xerrors.Errorf("update workspace dormant deleting at: %w", err)
233229
}
234230

235231
log.Info(e.ctx, "dormant workspace",
@@ -267,6 +263,12 @@ func (e *Executor) runOnce(t time.Time) Stats {
267263
if err != nil {
268264
log.Error(e.ctx, "workspace scheduling failed", slog.Error(err))
269265
}
266+
if auditLog != nil {
267+
// If the transition didn't succeed then updating the workspace
268+
// to indicate dormant didn't either.
269+
auditLog.Success = err == nil
270+
auditBuild(e.ctx, e.log, *e.auditor.Load(), *auditLog)
271+
}
270272
if job != nil && err == nil {
271273
// Note that we can't refactor such that posting the job happens inside wsbuilder because it's called
272274
// with an outer transaction like this, and we need to make sure the outer transaction commits before

coderd/database/queries.sql.go

-2
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

coderd/database/queries/workspaces.sql

-2
Original file line numberDiff line numberDiff line change
@@ -488,8 +488,6 @@ SET
488488
FROM
489489
templates
490490
WHERE
491-
workspaces.template_id = templates.id
492-
AND
493491
workspaces.id = $1
494492
RETURNING workspaces.*;
495493

enterprise/coderd/workspaces_test.go

+81
Original file line numberDiff line numberDiff line change
@@ -16,11 +16,14 @@ import (
1616
"github.com/coder/coder/v2/coderd/autobuild"
1717
"github.com/coder/coder/v2/coderd/coderdtest"
1818
"github.com/coder/coder/v2/coderd/database"
19+
"github.com/coder/coder/v2/coderd/database/dbtestutil"
1920
"github.com/coder/coder/v2/coderd/rbac"
2021
agplschedule "github.com/coder/coder/v2/coderd/schedule"
2122
"github.com/coder/coder/v2/coderd/schedule/cron"
2223
"github.com/coder/coder/v2/coderd/util/ptr"
2324
"github.com/coder/coder/v2/codersdk"
25+
entaudit "github.com/coder/coder/v2/enterprise/audit"
26+
"github.com/coder/coder/v2/enterprise/audit/backends"
2427
"github.com/coder/coder/v2/enterprise/coderd/coderdenttest"
2528
"github.com/coder/coder/v2/enterprise/coderd/license"
2629
"github.com/coder/coder/v2/enterprise/coderd/schedule"
@@ -309,6 +312,84 @@ func TestWorkspaceAutobuild(t *testing.T) {
309312
require.True(t, ws.LastUsedAt.After(lastUsedAt))
310313
})
311314

315+
// This test serves as a regression prevention for generating
316+
// audit logs in the same transaction the transition workspaces to
317+
// the dormant state. The auditor that is passed to autobuild does
318+
// not use the transaction when inserting an audit log which can
319+
// cause a deadlock.
320+
t.Run("NoDeadlock", func(t *testing.T) {
321+
t.Parallel()
322+
323+
if !dbtestutil.WillUsePostgres() {
324+
t.Skipf("Skipping non-postgres run")
325+
}
326+
327+
var (
328+
ticker = make(chan time.Time)
329+
statCh = make(chan autobuild.Stats)
330+
inactiveTTL = time.Minute
331+
)
332+
333+
const (
334+
maxConns = 3
335+
numWorkspaces = maxConns * 5
336+
)
337+
// This is a bit bizarre but necessary so that we can
338+
// initialize our coderd with a real auditor and limit DB connections
339+
// to simulate deadlock conditions.
340+
db, pubsub, sdb := dbtestutil.NewDBWithSQLDB(t)
341+
// Set MaxOpenConns so we can ensure we aren't inadvertently acquiring
342+
// another connection from within a transaction.
343+
sdb.SetMaxOpenConns(maxConns)
344+
auditor := entaudit.NewAuditor(db, entaudit.DefaultFilter, backends.NewPostgres(db, true))
345+
346+
client, user := coderdenttest.New(t, &coderdenttest.Options{
347+
Options: &coderdtest.Options{
348+
AutobuildTicker: ticker,
349+
AutobuildStats: statCh,
350+
TemplateScheduleStore: schedule.NewEnterpriseTemplateScheduleStore(agplUserQuietHoursScheduleStore()),
351+
Database: db,
352+
Pubsub: pubsub,
353+
Auditor: auditor,
354+
IncludeProvisionerDaemon: true,
355+
},
356+
LicenseOptions: &coderdenttest.LicenseOptions{
357+
Features: license.Features{codersdk.FeatureAdvancedTemplateScheduling: 1},
358+
},
359+
})
360+
361+
version := coderdtest.CreateTemplateVersion(t, client, user.OrganizationID, &echo.Responses{
362+
Parse: echo.ParseComplete,
363+
ProvisionPlan: echo.PlanComplete,
364+
ProvisionApply: echo.ApplyComplete,
365+
})
366+
template := coderdtest.CreateTemplate(t, client, user.OrganizationID, version.ID, func(ctr *codersdk.CreateTemplateRequest) {
367+
ctr.TimeTilDormantMillis = ptr.Ref[int64](inactiveTTL.Milliseconds())
368+
})
369+
coderdtest.AwaitTemplateVersionJobCompleted(t, client, version.ID)
370+
371+
workspaces := make([]codersdk.Workspace, 0, numWorkspaces)
372+
for i := 0; i < numWorkspaces; i++ {
373+
ws := coderdtest.CreateWorkspace(t, client, user.OrganizationID, template.ID)
374+
build := coderdtest.AwaitWorkspaceBuildJobCompleted(t, client, ws.LatestBuild.ID)
375+
require.Equal(t, codersdk.WorkspaceStatusRunning, build.Status)
376+
workspaces = append(workspaces, ws)
377+
}
378+
379+
// Simulate being inactive.
380+
ticker <- time.Now().Add(time.Hour)
381+
stats := <-statCh
382+
383+
// Expect workspace to transition to stopped state for breaching
384+
// failure TTL.
385+
require.Len(t, stats.Transitions, numWorkspaces)
386+
for _, ws := range workspaces {
387+
// The workspace should be dormant.
388+
ws = coderdtest.MustWorkspace(t, client, ws.ID)
389+
require.NotNil(t, ws.DormantAt)
390+
}
391+
})
392+
312393
t.Run("InactiveTTLTooEarly", func(t *testing.T) {
313394
t.Parallel()
314395

0 commit comments

Comments
 (0)