diff --git a/coderd/autobuild/lifecycle_executor.go b/coderd/autobuild/lifecycle_executor.go index d2fb16b033946..8d3bfb5478dc7 100644 --- a/coderd/autobuild/lifecycle_executor.go +++ b/coderd/autobuild/lifecycle_executor.go @@ -140,42 +140,45 @@ func (e *Executor) runOnce(t time.Time) Stats { eg.Go(func() error { var job *database.ProvisionerJob + var auditLog *auditParams err := e.db.InTx(func(tx database.Store) error { // Re-check eligibility since the first check was outside the // transaction and the workspace settings may have changed. ws, err := tx.GetWorkspaceByID(e.ctx, wsID) if err != nil { - log.Error(e.ctx, "get workspace autostart failed", slog.Error(err)) - return nil + return xerrors.Errorf("get workspace by id: %w", err) } // Determine the workspace state based on its latest build. latestBuild, err := tx.GetLatestWorkspaceBuildByWorkspaceID(e.ctx, ws.ID) if err != nil { - log.Warn(e.ctx, "get latest workspace build", slog.Error(err)) - return nil + return xerrors.Errorf("get latest workspace build: %w", err) } - templateSchedule, err := (*(e.templateScheduleStore.Load())).Get(e.ctx, tx, ws.TemplateID) + + latestJob, err := tx.GetProvisionerJobByID(e.ctx, latestBuild.JobID) if err != nil { - log.Warn(e.ctx, "get template schedule options", slog.Error(err)) - return nil + return xerrors.Errorf("get latest provisioner job: %w", err) } - template, err := tx.GetTemplateByID(e.ctx, ws.TemplateID) + templateSchedule, err := (*(e.templateScheduleStore.Load())).Get(e.ctx, tx, ws.TemplateID) if err != nil { - log.Warn(e.ctx, "get template by id", slog.Error(err)) + return xerrors.Errorf("get template scheduling options: %w", err) } - accessControl := (*(e.accessControlStore.Load())).GetTemplateAccessControl(template) - latestJob, err := tx.GetProvisionerJobByID(e.ctx, latestBuild.JobID) + template, err := tx.GetTemplateByID(e.ctx, ws.TemplateID) if err != nil { - log.Warn(e.ctx, "get last provisioner job for workspace %q: %w", slog.Error(err)) - return nil + return xerrors.Errorf("get template by ID: %w", err) } + accessControl := (*(e.accessControlStore.Load())).GetTemplateAccessControl(template) + nextTransition, reason, err := getNextTransition(ws, latestBuild, latestJob, templateSchedule, currentTick) if err != nil { log.Debug(e.ctx, "skipping workspace", slog.Error(err)) + // err is used to indicate that a workspace is not eligible + // so returning nil here is ok although ultimately the distinction + // doesn't matter since the transaction is read-only up to + // this point. return nil } @@ -193,17 +196,16 @@ func (e *Executor) runOnce(t time.Time) Stats { } build, job, err = builder.Build(e.ctx, tx, nil, audit.WorkspaceBuildBaggage{IP: "127.0.0.1"}) - if err != nil { log.Error(e.ctx, "unable to transition workspace", slog.F("transition", nextTransition), slog.Error(err), ) - return nil + return xerrors.Errorf("build workspace: %w", err) } } - // Transition the workspace to dormant if it has breached the template's + // Transition the workspace to dormant if it has breached the template's // threshold for inactivity. if reason == database.BuildReasonAutolock { wsOld := ws @@ -215,21 +217,15 @@ func (e *Executor) runOnce(t time.Time) Stats { }, }) - auditBuild(e.ctx, e.log, *e.auditor.Load(), auditParams{ - Build: build, - Job: latestJob, - Reason: reason, - Old: wsOld, - New: ws, - Success: err == nil, - }) - + auditLog = &auditParams{ + Build: build, + Job: latestJob, + Reason: reason, + Old: wsOld, + New: ws, + } if err != nil { - log.Error(e.ctx, "unable to transition workspace to dormant", - slog.F("transition", nextTransition), - slog.Error(err), - ) - return nil + return xerrors.Errorf("update workspace dormant deleting at: %w", err) } log.Info(e.ctx, "dormant workspace", @@ -267,6 +263,12 @@ func (e *Executor) runOnce(t time.Time) Stats { if err != nil { log.Error(e.ctx, "workspace scheduling failed", slog.Error(err)) } + if auditLog != nil { + // If the transition didn't succeed then updating the workspace + // to indicate dormant didn't either. + auditLog.Success = err == nil + auditBuild(e.ctx, e.log, *e.auditor.Load(), *auditLog) + } if job != nil && err == nil { // Note that we can't refactor such that posting the job happens inside wsbuilder because it's called // with an outer transaction like this, and we need to make sure the outer transaction commits before diff --git a/coderd/database/queries.sql.go b/coderd/database/queries.sql.go index 2c5bcedd390c5..03aff0ea801b2 100644 --- a/coderd/database/queries.sql.go +++ b/coderd/database/queries.sql.go @@ -10813,8 +10813,6 @@ SET FROM templates WHERE - workspaces.template_id = templates.id -AND workspaces.id = $1 RETURNING workspaces.id, workspaces.created_at, workspaces.updated_at, workspaces.owner_id, workspaces.organization_id, workspaces.template_id, workspaces.deleted, workspaces.name, workspaces.autostart_schedule, workspaces.ttl, workspaces.last_used_at, workspaces.dormant_at, workspaces.deleting_at, workspaces.automatic_updates ` diff --git a/coderd/database/queries/workspaces.sql b/coderd/database/queries/workspaces.sql index 805f64d70b927..f20c9edb88b28 100644 --- a/coderd/database/queries/workspaces.sql +++ b/coderd/database/queries/workspaces.sql @@ -488,8 +488,6 @@ SET FROM templates WHERE - workspaces.template_id = templates.id -AND workspaces.id = $1 RETURNING workspaces.*; diff --git a/enterprise/coderd/workspaces_test.go b/enterprise/coderd/workspaces_test.go index 09ba533b483db..ed6621b223a4b 100644 --- a/enterprise/coderd/workspaces_test.go +++ b/enterprise/coderd/workspaces_test.go @@ -16,11 +16,14 @@ import ( "github.com/coder/coder/v2/coderd/autobuild" "github.com/coder/coder/v2/coderd/coderdtest" "github.com/coder/coder/v2/coderd/database" + "github.com/coder/coder/v2/coderd/database/dbtestutil" "github.com/coder/coder/v2/coderd/rbac" agplschedule "github.com/coder/coder/v2/coderd/schedule" "github.com/coder/coder/v2/coderd/schedule/cron" "github.com/coder/coder/v2/coderd/util/ptr" "github.com/coder/coder/v2/codersdk" + entaudit "github.com/coder/coder/v2/enterprise/audit" + "github.com/coder/coder/v2/enterprise/audit/backends" "github.com/coder/coder/v2/enterprise/coderd/coderdenttest" "github.com/coder/coder/v2/enterprise/coderd/license" "github.com/coder/coder/v2/enterprise/coderd/schedule" @@ -309,6 +312,84 @@ func TestWorkspaceAutobuild(t *testing.T) { require.True(t, ws.LastUsedAt.After(lastUsedAt)) }) + // This test serves as a regression prevention for generating + // audit logs in the same transaction the transition workspaces to + // the dormant state. The auditor that is passed to autobuild does + // not use the transaction when inserting an audit log which can + // cause a deadlock. + t.Run("NoDeadlock", func(t *testing.T) { + t.Parallel() + + if !dbtestutil.WillUsePostgres() { + t.Skipf("Skipping non-postgres run") + } + + var ( + ticker = make(chan time.Time) + statCh = make(chan autobuild.Stats) + inactiveTTL = time.Minute + ) + + const ( + maxConns = 3 + numWorkspaces = maxConns * 5 + ) + // This is a bit bizarre but necessary so that we can + // initialize our coderd with a real auditor and limit DB connections + // to simulate deadlock conditions. + db, pubsub, sdb := dbtestutil.NewDBWithSQLDB(t) + // Set MaxOpenConns so we can ensure we aren't inadvertently acquiring + // another connection from within a transaction. + sdb.SetMaxOpenConns(maxConns) + auditor := entaudit.NewAuditor(db, entaudit.DefaultFilter, backends.NewPostgres(db, true)) + + client, user := coderdenttest.New(t, &coderdenttest.Options{ + Options: &coderdtest.Options{ + AutobuildTicker: ticker, + AutobuildStats: statCh, + TemplateScheduleStore: schedule.NewEnterpriseTemplateScheduleStore(agplUserQuietHoursScheduleStore()), + Database: db, + Pubsub: pubsub, + Auditor: auditor, + IncludeProvisionerDaemon: true, + }, + LicenseOptions: &coderdenttest.LicenseOptions{ + Features: license.Features{codersdk.FeatureAdvancedTemplateScheduling: 1}, + }, + }) + + version := coderdtest.CreateTemplateVersion(t, client, user.OrganizationID, &echo.Responses{ + Parse: echo.ParseComplete, + ProvisionPlan: echo.PlanComplete, + ProvisionApply: echo.ApplyComplete, + }) + template := coderdtest.CreateTemplate(t, client, user.OrganizationID, version.ID, func(ctr *codersdk.CreateTemplateRequest) { + ctr.TimeTilDormantMillis = ptr.Ref[int64](inactiveTTL.Milliseconds()) + }) + coderdtest.AwaitTemplateVersionJobCompleted(t, client, version.ID) + + workspaces := make([]codersdk.Workspace, 0, numWorkspaces) + for i := 0; i < numWorkspaces; i++ { + ws := coderdtest.CreateWorkspace(t, client, user.OrganizationID, template.ID) + build := coderdtest.AwaitWorkspaceBuildJobCompleted(t, client, ws.LatestBuild.ID) + require.Equal(t, codersdk.WorkspaceStatusRunning, build.Status) + workspaces = append(workspaces, ws) + } + + // Simulate being inactive. + ticker <- time.Now().Add(time.Hour) + stats := <-statCh + + // Expect workspace to transition to stopped state for breaching + // failure TTL. + require.Len(t, stats.Transitions, numWorkspaces) + for _, ws := range workspaces { + // The workspace should be dormant. + ws = coderdtest.MustWorkspace(t, client, ws.ID) + require.NotNil(t, ws.DormantAt) + } + }) + t.Run("InactiveTTLTooEarly", func(t *testing.T) { t.Parallel()