Skip to content

fix: prevent db deadlock when workspaces go dormant #10618

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Nov 13, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 32 additions & 30 deletions coderd/autobuild/lifecycle_executor.go
Original file line number Diff line number Diff line change
Expand Up @@ -140,42 +140,45 @@ func (e *Executor) runOnce(t time.Time) Stats {

eg.Go(func() error {
var job *database.ProvisionerJob
var auditLog *auditParams
err := e.db.InTx(func(tx database.Store) error {
// Re-check eligibility since the first check was outside the
// transaction and the workspace settings may have changed.
ws, err := tx.GetWorkspaceByID(e.ctx, wsID)
if err != nil {
log.Error(e.ctx, "get workspace autostart failed", slog.Error(err))
return nil
return xerrors.Errorf("get workspace by id: %w", err)
Comment on lines -148 to +149
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

review: IIRC the reason this was written this way was to avoid a single error causing an entire iteration to fail. Did the previous logic negatively impact the issue here? If not, I'd advise keeping the scope of the change small.

Copy link
Collaborator Author

@sreya sreya Nov 13, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This only returns errors for the transaction not for the errgroup. Seems like we were trying to commit the transaction regardless of whether we encountered an error or not which is bizarre. That shouldn't cause an entire evaluation iteration to abort early.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah gotcha, I think I was operating on some stale cached assumptions about this code that are no longer true.

}

// Determine the workspace state based on its latest build.
latestBuild, err := tx.GetLatestWorkspaceBuildByWorkspaceID(e.ctx, ws.ID)
if err != nil {
log.Warn(e.ctx, "get latest workspace build", slog.Error(err))
return nil
return xerrors.Errorf("get latest workspace build: %w", err)
}
templateSchedule, err := (*(e.templateScheduleStore.Load())).Get(e.ctx, tx, ws.TemplateID)

latestJob, err := tx.GetProvisionerJobByID(e.ctx, latestBuild.JobID)
if err != nil {
log.Warn(e.ctx, "get template schedule options", slog.Error(err))
return nil
return xerrors.Errorf("get latest provisioner job: %w", err)
}

template, err := tx.GetTemplateByID(e.ctx, ws.TemplateID)
templateSchedule, err := (*(e.templateScheduleStore.Load())).Get(e.ctx, tx, ws.TemplateID)
if err != nil {
log.Warn(e.ctx, "get template by id", slog.Error(err))
return xerrors.Errorf("get template scheduling options: %w", err)
}
accessControl := (*(e.accessControlStore.Load())).GetTemplateAccessControl(template)

latestJob, err := tx.GetProvisionerJobByID(e.ctx, latestBuild.JobID)
template, err := tx.GetTemplateByID(e.ctx, ws.TemplateID)
if err != nil {
log.Warn(e.ctx, "get last provisioner job for workspace %q: %w", slog.Error(err))
return nil
return xerrors.Errorf("get template by ID: %w", err)
}

accessControl := (*(e.accessControlStore.Load())).GetTemplateAccessControl(template)

nextTransition, reason, err := getNextTransition(ws, latestBuild, latestJob, templateSchedule, currentTick)
if err != nil {
log.Debug(e.ctx, "skipping workspace", slog.Error(err))
// err is used to indicate that a workspace is not eligible
// so returning nil here is ok although ultimately the distinction
// doesn't matter since the transaction is read-only up to
// this point.
return nil
}

Expand All @@ -193,17 +196,16 @@ func (e *Executor) runOnce(t time.Time) Stats {
}

build, job, err = builder.Build(e.ctx, tx, nil, audit.WorkspaceBuildBaggage{IP: "127.0.0.1"})

if err != nil {
log.Error(e.ctx, "unable to transition workspace",
slog.F("transition", nextTransition),
slog.Error(err),
)
return nil
return xerrors.Errorf("build workspace: %w", err)
}
}

// Transition the workspace to dormant if it has breached the template's
// Transition the workspace to dormant if it has breached the template's
// threshold for inactivity.
if reason == database.BuildReasonAutolock {
wsOld := ws
Expand All @@ -215,21 +217,15 @@ func (e *Executor) runOnce(t time.Time) Stats {
},
})

auditBuild(e.ctx, e.log, *e.auditor.Load(), auditParams{
Build: build,
Job: latestJob,
Reason: reason,
Old: wsOld,
New: ws,
Success: err == nil,
})

auditLog = &auditParams{
Build: build,
Job: latestJob,
Reason: reason,
Old: wsOld,
New: ws,
}
if err != nil {
log.Error(e.ctx, "unable to transition workspace to dormant",
slog.F("transition", nextTransition),
slog.Error(err),
)
return nil
return xerrors.Errorf("update workspace dormant deleting at: %w", err)
}

log.Info(e.ctx, "dormant workspace",
Expand Down Expand Up @@ -267,6 +263,12 @@ func (e *Executor) runOnce(t time.Time) Stats {
if err != nil {
log.Error(e.ctx, "workspace scheduling failed", slog.Error(err))
}
if auditLog != nil {
// If the transition didn't succeed then updating the workspace
// to indicate dormant didn't either.
auditLog.Success = err == nil
auditBuild(e.ctx, e.log, *e.auditor.Load(), *auditLog)
}
if job != nil && err == nil {
// Note that we can't refactor such that posting the job happens inside wsbuilder because it's called
// with an outer transaction like this, and we need to make sure the outer transaction commits before
Expand Down
2 changes: 0 additions & 2 deletions coderd/database/queries.sql.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 0 additions & 2 deletions coderd/database/queries/workspaces.sql
Original file line number Diff line number Diff line change
Expand Up @@ -488,8 +488,6 @@ SET
FROM
templates
WHERE
workspaces.template_id = templates.id
AND
Comment on lines -491 to -492
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

review: already in the WHERE cond so 👍

workspaces.id = $1
RETURNING workspaces.*;

Expand Down
81 changes: 81 additions & 0 deletions enterprise/coderd/workspaces_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,14 @@ import (
"github.com/coder/coder/v2/coderd/autobuild"
"github.com/coder/coder/v2/coderd/coderdtest"
"github.com/coder/coder/v2/coderd/database"
"github.com/coder/coder/v2/coderd/database/dbtestutil"
"github.com/coder/coder/v2/coderd/rbac"
agplschedule "github.com/coder/coder/v2/coderd/schedule"
"github.com/coder/coder/v2/coderd/schedule/cron"
"github.com/coder/coder/v2/coderd/util/ptr"
"github.com/coder/coder/v2/codersdk"
entaudit "github.com/coder/coder/v2/enterprise/audit"
"github.com/coder/coder/v2/enterprise/audit/backends"
"github.com/coder/coder/v2/enterprise/coderd/coderdenttest"
"github.com/coder/coder/v2/enterprise/coderd/license"
"github.com/coder/coder/v2/enterprise/coderd/schedule"
Expand Down Expand Up @@ -309,6 +312,84 @@ func TestWorkspaceAutobuild(t *testing.T) {
require.True(t, ws.LastUsedAt.After(lastUsedAt))
})

// This test serves as a regression prevention for generating
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nice 👍 We could rewrite using the new shiny dbfake but this can be a follow-up.

// audit logs in the same transaction the transition workspaces to
// the dormant state. The auditor that is passed to autobuild does
// not use the transaction when inserting an audit log which can
// cause a deadlock.
t.Run("NoDeadlock", func(t *testing.T) {
t.Parallel()

if !dbtestutil.WillUsePostgres() {
t.Skipf("Skipping non-postgres run")
}

var (
ticker = make(chan time.Time)
statCh = make(chan autobuild.Stats)
inactiveTTL = time.Minute
)

const (
maxConns = 3
numWorkspaces = maxConns * 5
)
// This is a bit bizarre but necessary so that we can
// initialize our coderd with a real auditor and limit DB connections
// to simulate deadlock conditions.
db, pubsub, sdb := dbtestutil.NewDBWithSQLDB(t)
// Set MaxOpenConns so we can ensure we aren't inadvertently acquiring
// another connection from within a transaction.
sdb.SetMaxOpenConns(maxConns)
auditor := entaudit.NewAuditor(db, entaudit.DefaultFilter, backends.NewPostgres(db, true))

client, user := coderdenttest.New(t, &coderdenttest.Options{
Options: &coderdtest.Options{
AutobuildTicker: ticker,
AutobuildStats: statCh,
TemplateScheduleStore: schedule.NewEnterpriseTemplateScheduleStore(agplUserQuietHoursScheduleStore()),
Database: db,
Pubsub: pubsub,
Auditor: auditor,
IncludeProvisionerDaemon: true,
},
LicenseOptions: &coderdenttest.LicenseOptions{
Features: license.Features{codersdk.FeatureAdvancedTemplateScheduling: 1},
},
})

version := coderdtest.CreateTemplateVersion(t, client, user.OrganizationID, &echo.Responses{
Parse: echo.ParseComplete,
ProvisionPlan: echo.PlanComplete,
ProvisionApply: echo.ApplyComplete,
})
template := coderdtest.CreateTemplate(t, client, user.OrganizationID, version.ID, func(ctr *codersdk.CreateTemplateRequest) {
ctr.TimeTilDormantMillis = ptr.Ref[int64](inactiveTTL.Milliseconds())
})
coderdtest.AwaitTemplateVersionJobCompleted(t, client, version.ID)

workspaces := make([]codersdk.Workspace, 0, numWorkspaces)
for i := 0; i < numWorkspaces; i++ {
ws := coderdtest.CreateWorkspace(t, client, user.OrganizationID, template.ID)
build := coderdtest.AwaitWorkspaceBuildJobCompleted(t, client, ws.LatestBuild.ID)
require.Equal(t, codersdk.WorkspaceStatusRunning, build.Status)
workspaces = append(workspaces, ws)
}

// Simulate being inactive.
ticker <- time.Now().Add(time.Hour)
stats := <-statCh

// Expect workspace to transition to stopped state for breaching
// failure TTL.
require.Len(t, stats.Transitions, numWorkspaces)
for _, ws := range workspaces {
// The workspace should be dormant.
ws = coderdtest.MustWorkspace(t, client, ws.ID)
require.NotNil(t, ws.DormantAt)
}
})

t.Run("InactiveTTLTooEarly", func(t *testing.T) {
t.Parallel()

Expand Down