diff --git a/cli/server.go b/cli/server.go index d32ed51c06007..c04514f7c76e5 100644 --- a/cli/server.go +++ b/cli/server.go @@ -87,6 +87,7 @@ import ( "github.com/coder/coder/v2/coderd/externalauth" "github.com/coder/coder/v2/coderd/gitsshkey" "github.com/coder/coder/v2/coderd/httpmw" + "github.com/coder/coder/v2/coderd/jobreaper" "github.com/coder/coder/v2/coderd/notifications" "github.com/coder/coder/v2/coderd/oauthpki" "github.com/coder/coder/v2/coderd/prometheusmetrics" @@ -95,7 +96,6 @@ import ( "github.com/coder/coder/v2/coderd/schedule" "github.com/coder/coder/v2/coderd/telemetry" "github.com/coder/coder/v2/coderd/tracing" - "github.com/coder/coder/v2/coderd/unhanger" "github.com/coder/coder/v2/coderd/updatecheck" "github.com/coder/coder/v2/coderd/util/ptr" "github.com/coder/coder/v2/coderd/util/slice" @@ -1127,11 +1127,11 @@ func (r *RootCmd) Server(newAPI func(context.Context, *coderd.Options) (*coderd. ctx, options.Database, options.Pubsub, options.PrometheusRegistry, coderAPI.TemplateScheduleStore, &coderAPI.Auditor, coderAPI.AccessControlStore, logger, autobuildTicker.C, options.NotificationsEnqueuer) autobuildExecutor.Run() - hangDetectorTicker := time.NewTicker(vals.JobHangDetectorInterval.Value()) - defer hangDetectorTicker.Stop() - hangDetector := unhanger.New(ctx, options.Database, options.Pubsub, logger, hangDetectorTicker.C) - hangDetector.Start() - defer hangDetector.Close() + jobReaperTicker := time.NewTicker(vals.JobReaperDetectorInterval.Value()) + defer jobReaperTicker.Stop() + jobReaper := jobreaper.New(ctx, options.Database, options.Pubsub, logger, jobReaperTicker.C) + jobReaper.Start() + defer jobReaper.Close() waitForProvisionerJobs := false // Currently there is no way to ask the server to shut diff --git a/cli/testdata/server-config.yaml.golden b/cli/testdata/server-config.yaml.golden index fc76a6c2ec8a0..9995a7f389130 100644 --- a/cli/testdata/server-config.yaml.golden +++ b/cli/testdata/server-config.yaml.golden @@ -183,7 +183,7 @@ networking: # Interval to poll for scheduled workspace builds. # (default: 1m0s, type: duration) autobuildPollInterval: 1m0s -# Interval to poll for hung jobs and automatically terminate them. +# Interval to poll for hung and pending jobs and automatically terminate them. # (default: 1m0s, type: duration) jobHangDetectorInterval: 1m0s introspection: diff --git a/coderd/coderdtest/coderdtest.go b/coderd/coderdtest/coderdtest.go index e843d0d748578..1c36bfba97f28 100644 --- a/coderd/coderdtest/coderdtest.go +++ b/coderd/coderdtest/coderdtest.go @@ -68,6 +68,7 @@ import ( "github.com/coder/coder/v2/coderd/externalauth" "github.com/coder/coder/v2/coderd/gitsshkey" "github.com/coder/coder/v2/coderd/httpmw" + "github.com/coder/coder/v2/coderd/jobreaper" "github.com/coder/coder/v2/coderd/notifications" "github.com/coder/coder/v2/coderd/notifications/notificationstest" "github.com/coder/coder/v2/coderd/rbac" @@ -75,7 +76,6 @@ import ( "github.com/coder/coder/v2/coderd/runtimeconfig" "github.com/coder/coder/v2/coderd/schedule" "github.com/coder/coder/v2/coderd/telemetry" - "github.com/coder/coder/v2/coderd/unhanger" "github.com/coder/coder/v2/coderd/updatecheck" "github.com/coder/coder/v2/coderd/util/ptr" "github.com/coder/coder/v2/coderd/webpush" @@ -365,11 +365,11 @@ func NewOptions(t testing.TB, options *Options) (func(http.Handler), context.Can ).WithStatsChannel(options.AutobuildStats) lifecycleExecutor.Run() - hangDetectorTicker := time.NewTicker(options.DeploymentValues.JobHangDetectorInterval.Value()) - defer hangDetectorTicker.Stop() - hangDetector := unhanger.New(ctx, options.Database, options.Pubsub, options.Logger.Named("unhanger.detector"), hangDetectorTicker.C) - hangDetector.Start() - t.Cleanup(hangDetector.Close) + jobReaperTicker := time.NewTicker(options.DeploymentValues.JobReaperDetectorInterval.Value()) + defer jobReaperTicker.Stop() + jobReaper := jobreaper.New(ctx, options.Database, options.Pubsub, options.Logger.Named("reaper.detector"), jobReaperTicker.C) + jobReaper.Start() + t.Cleanup(jobReaper.Close) if options.TelemetryReporter == nil { options.TelemetryReporter = telemetry.NewNoop() diff --git a/coderd/database/dbauthz/dbauthz.go b/coderd/database/dbauthz/dbauthz.go index 732739b2f09a5..16d707c466104 100644 --- a/coderd/database/dbauthz/dbauthz.go +++ b/coderd/database/dbauthz/dbauthz.go @@ -170,10 +170,10 @@ var ( Identifier: rbac.RoleIdentifier{Name: "provisionerd"}, DisplayName: "Provisioner Daemon", Site: rbac.Permissions(map[string][]policy.Action{ - // TODO: Add ProvisionerJob resource type. - rbac.ResourceFile.Type: {policy.ActionRead}, - rbac.ResourceSystem.Type: {policy.WildcardSymbol}, - rbac.ResourceTemplate.Type: {policy.ActionRead, policy.ActionUpdate}, + rbac.ResourceProvisionerJobs.Type: {policy.ActionRead, policy.ActionUpdate, policy.ActionCreate}, + rbac.ResourceFile.Type: {policy.ActionRead}, + rbac.ResourceSystem.Type: {policy.WildcardSymbol}, + rbac.ResourceTemplate.Type: {policy.ActionRead, policy.ActionUpdate}, // Unsure why provisionerd needs update and read personal rbac.ResourceUser.Type: {policy.ActionRead, policy.ActionReadPersonal, policy.ActionUpdatePersonal}, rbac.ResourceWorkspaceDormant.Type: {policy.ActionDelete, policy.ActionRead, policy.ActionUpdate, policy.ActionWorkspaceStop}, @@ -219,19 +219,20 @@ var ( Scope: rbac.ScopeAll, }.WithCachedASTValue() - // See unhanger package. - subjectHangDetector = rbac.Subject{ - Type: rbac.SubjectTypeHangDetector, - FriendlyName: "Hang Detector", + // See reaper package. + subjectJobReaper = rbac.Subject{ + Type: rbac.SubjectTypeJobReaper, + FriendlyName: "Job Reaper", ID: uuid.Nil.String(), Roles: rbac.Roles([]rbac.Role{ { - Identifier: rbac.RoleIdentifier{Name: "hangdetector"}, - DisplayName: "Hang Detector Daemon", + Identifier: rbac.RoleIdentifier{Name: "jobreaper"}, + DisplayName: "Job Reaper Daemon", Site: rbac.Permissions(map[string][]policy.Action{ - rbac.ResourceSystem.Type: {policy.WildcardSymbol}, - rbac.ResourceTemplate.Type: {policy.ActionRead}, - rbac.ResourceWorkspace.Type: {policy.ActionRead, policy.ActionUpdate}, + rbac.ResourceSystem.Type: {policy.WildcardSymbol}, + rbac.ResourceTemplate.Type: {policy.ActionRead}, + rbac.ResourceWorkspace.Type: {policy.ActionRead, policy.ActionUpdate}, + rbac.ResourceProvisionerJobs.Type: {policy.ActionRead, policy.ActionUpdate}, }), Org: map[string][]rbac.Permission{}, User: []rbac.Permission{}, @@ -346,6 +347,7 @@ var ( rbac.ResourceNotificationTemplate.Type: {policy.ActionCreate, policy.ActionUpdate, policy.ActionDelete}, rbac.ResourceCryptoKey.Type: {policy.ActionCreate, policy.ActionUpdate, policy.ActionDelete}, rbac.ResourceFile.Type: {policy.ActionCreate, policy.ActionRead}, + rbac.ResourceProvisionerJobs.Type: {policy.ActionRead, policy.ActionUpdate, policy.ActionCreate}, }), Org: map[string][]rbac.Permission{}, User: []rbac.Permission{}, @@ -407,10 +409,10 @@ func AsAutostart(ctx context.Context) context.Context { return As(ctx, subjectAutostart) } -// AsHangDetector returns a context with an actor that has permissions required -// for unhanger.Detector to function. -func AsHangDetector(ctx context.Context) context.Context { - return As(ctx, subjectHangDetector) +// AsJobReaper returns a context with an actor that has permissions required +// for reaper.Detector to function. +func AsJobReaper(ctx context.Context) context.Context { + return As(ctx, subjectJobReaper) } // AsKeyRotator returns a context with an actor that has permissions required for rotating crypto keys. @@ -1085,11 +1087,10 @@ func (q *querier) AcquireNotificationMessages(ctx context.Context, arg database. return q.db.AcquireNotificationMessages(ctx, arg) } -// TODO: We need to create a ProvisionerJob resource type func (q *querier) AcquireProvisionerJob(ctx context.Context, arg database.AcquireProvisionerJobParams) (database.ProvisionerJob, error) { - // if err := q.authorizeContext(ctx, policy.ActionUpdate, rbac.ResourceSystem); err != nil { - // return database.ProvisionerJob{}, err - // } + if err := q.authorizeContext(ctx, policy.ActionUpdate, rbac.ResourceProvisionerJobs); err != nil { + return database.ProvisionerJob{}, err + } return q.db.AcquireProvisionerJob(ctx, arg) } @@ -1912,14 +1913,6 @@ func (q *querier) GetHealthSettings(ctx context.Context) (string, error) { return q.db.GetHealthSettings(ctx) } -// TODO: We need to create a ProvisionerJob resource type -func (q *querier) GetHungProvisionerJobs(ctx context.Context, hungSince time.Time) ([]database.ProvisionerJob, error) { - // if err := q.authorizeContext(ctx, policy.ActionCreate, rbac.ResourceSystem); err != nil { - // return nil, err - // } - return q.db.GetHungProvisionerJobs(ctx, hungSince) -} - func (q *querier) GetInboxNotificationByID(ctx context.Context, id uuid.UUID) (database.InboxNotification, error) { return fetchWithAction(q.log, q.auth, policy.ActionRead, q.db.GetInboxNotificationByID)(ctx, id) } @@ -2307,6 +2300,13 @@ func (q *querier) GetProvisionerJobByID(ctx context.Context, id uuid.UUID) (data return job, nil } +func (q *querier) GetProvisionerJobByIDForUpdate(ctx context.Context, id uuid.UUID) (database.ProvisionerJob, error) { + if err := q.authorizeContext(ctx, policy.ActionRead, rbac.ResourceProvisionerJobs); err != nil { + return database.ProvisionerJob{}, err + } + return q.db.GetProvisionerJobByIDForUpdate(ctx, id) +} + func (q *querier) GetProvisionerJobTimingsByJobID(ctx context.Context, jobID uuid.UUID) ([]database.ProvisionerJobTiming, error) { _, err := q.GetProvisionerJobByID(ctx, jobID) if err != nil { @@ -2315,31 +2315,49 @@ func (q *querier) GetProvisionerJobTimingsByJobID(ctx context.Context, jobID uui return q.db.GetProvisionerJobTimingsByJobID(ctx, jobID) } -// TODO: We have a ProvisionerJobs resource, but it hasn't been checked for this use-case. func (q *querier) GetProvisionerJobsByIDs(ctx context.Context, ids []uuid.UUID) ([]database.ProvisionerJob, error) { - // if err := q.authorizeContext(ctx, policy.ActionRead, rbac.ResourceSystem); err != nil { - // return nil, err - // } - return q.db.GetProvisionerJobsByIDs(ctx, ids) + provisionerJobs, err := q.db.GetProvisionerJobsByIDs(ctx, ids) + if err != nil { + return nil, err + } + orgIDs := make(map[uuid.UUID]struct{}) + for _, job := range provisionerJobs { + orgIDs[job.OrganizationID] = struct{}{} + } + for orgID := range orgIDs { + if err := q.authorizeContext(ctx, policy.ActionRead, rbac.ResourceProvisionerJobs.InOrg(orgID)); err != nil { + return nil, err + } + } + return provisionerJobs, nil } -// TODO: We have a ProvisionerJobs resource, but it hasn't been checked for this use-case. func (q *querier) GetProvisionerJobsByIDsWithQueuePosition(ctx context.Context, ids []uuid.UUID) ([]database.GetProvisionerJobsByIDsWithQueuePositionRow, error) { + // TODO: Remove this once we have a proper rbac check for provisioner jobs. + // Details in https://github.com/coder/coder/issues/16160 return q.db.GetProvisionerJobsByIDsWithQueuePosition(ctx, ids) } func (q *querier) GetProvisionerJobsByOrganizationAndStatusWithQueuePositionAndProvisioner(ctx context.Context, arg database.GetProvisionerJobsByOrganizationAndStatusWithQueuePositionAndProvisionerParams) ([]database.GetProvisionerJobsByOrganizationAndStatusWithQueuePositionAndProvisionerRow, error) { + // TODO: Remove this once we have a proper rbac check for provisioner jobs. + // Details in https://github.com/coder/coder/issues/16160 return fetchWithPostFilter(q.auth, policy.ActionRead, q.db.GetProvisionerJobsByOrganizationAndStatusWithQueuePositionAndProvisioner)(ctx, arg) } -// TODO: We have a ProvisionerJobs resource, but it hasn't been checked for this use-case. func (q *querier) GetProvisionerJobsCreatedAfter(ctx context.Context, createdAt time.Time) ([]database.ProvisionerJob, error) { - // if err := q.authorizeContext(ctx, policy.ActionRead, rbac.ResourceSystem); err != nil { - // return nil, err - // } + if err := q.authorizeContext(ctx, policy.ActionRead, rbac.ResourceProvisionerJobs); err != nil { + return nil, err + } return q.db.GetProvisionerJobsCreatedAfter(ctx, createdAt) } +func (q *querier) GetProvisionerJobsToBeReaped(ctx context.Context, arg database.GetProvisionerJobsToBeReapedParams) ([]database.ProvisionerJob, error) { + if err := q.authorizeContext(ctx, policy.ActionRead, rbac.ResourceProvisionerJobs); err != nil { + return nil, err + } + return q.db.GetProvisionerJobsToBeReaped(ctx, arg) +} + func (q *querier) GetProvisionerKeyByHashedSecret(ctx context.Context, hashedSecret []byte) (database.ProvisionerKey, error) { return fetch(q.log, q.auth, q.db.GetProvisionerKeyByHashedSecret)(ctx, hashedSecret) } @@ -3524,27 +3542,22 @@ func (q *querier) InsertPresetParameters(ctx context.Context, arg database.Inser return q.db.InsertPresetParameters(ctx, arg) } -// TODO: We need to create a ProvisionerJob resource type func (q *querier) InsertProvisionerJob(ctx context.Context, arg database.InsertProvisionerJobParams) (database.ProvisionerJob, error) { - // if err := q.authorizeContext(ctx, policy.ActionCreate, rbac.ResourceSystem); err != nil { - // return database.ProvisionerJob{}, err - // } + // TODO: Remove this once we have a proper rbac check for provisioner jobs. + // Details in https://github.com/coder/coder/issues/16160 return q.db.InsertProvisionerJob(ctx, arg) } -// TODO: We need to create a ProvisionerJob resource type func (q *querier) InsertProvisionerJobLogs(ctx context.Context, arg database.InsertProvisionerJobLogsParams) ([]database.ProvisionerJobLog, error) { - // if err := q.authorizeContext(ctx, policy.ActionCreate, rbac.ResourceSystem); err != nil { - // return nil, err - // } + // TODO: Remove this once we have a proper rbac check for provisioner jobs. + // Details in https://github.com/coder/coder/issues/16160 return q.db.InsertProvisionerJobLogs(ctx, arg) } -// TODO: We need to create a ProvisionerJob resource type func (q *querier) InsertProvisionerJobTimings(ctx context.Context, arg database.InsertProvisionerJobTimingsParams) ([]database.ProvisionerJobTiming, error) { - // if err := q.authorizeContext(ctx, policy.ActionCreate, rbac.ResourceSystem); err != nil { - // return nil, err - // } + if err := q.authorizeContext(ctx, policy.ActionUpdate, rbac.ResourceProvisionerJobs); err != nil { + return nil, err + } return q.db.InsertProvisionerJobTimings(ctx, arg) } @@ -4167,15 +4180,17 @@ func (q *querier) UpdateProvisionerDaemonLastSeenAt(ctx context.Context, arg dat return q.db.UpdateProvisionerDaemonLastSeenAt(ctx, arg) } -// TODO: We need to create a ProvisionerJob resource type func (q *querier) UpdateProvisionerJobByID(ctx context.Context, arg database.UpdateProvisionerJobByIDParams) error { - // if err := q.authorizeContext(ctx, policy.ActionUpdate, rbac.ResourceSystem); err != nil { - // return err - // } + if err := q.authorizeContext(ctx, policy.ActionUpdate, rbac.ResourceProvisionerJobs); err != nil { + return err + } return q.db.UpdateProvisionerJobByID(ctx, arg) } func (q *querier) UpdateProvisionerJobWithCancelByID(ctx context.Context, arg database.UpdateProvisionerJobWithCancelByIDParams) error { + // TODO: Remove this once we have a proper rbac check for provisioner jobs. + // Details in https://github.com/coder/coder/issues/16160 + job, err := q.db.GetProvisionerJobByID(ctx, arg.ID) if err != nil { return err @@ -4242,14 +4257,20 @@ func (q *querier) UpdateProvisionerJobWithCancelByID(ctx context.Context, arg da return q.db.UpdateProvisionerJobWithCancelByID(ctx, arg) } -// TODO: We need to create a ProvisionerJob resource type func (q *querier) UpdateProvisionerJobWithCompleteByID(ctx context.Context, arg database.UpdateProvisionerJobWithCompleteByIDParams) error { - // if err := q.authorizeContext(ctx, policy.ActionUpdate, rbac.ResourceSystem); err != nil { - // return err - // } + if err := q.authorizeContext(ctx, policy.ActionUpdate, rbac.ResourceProvisionerJobs); err != nil { + return err + } return q.db.UpdateProvisionerJobWithCompleteByID(ctx, arg) } +func (q *querier) UpdateProvisionerJobWithCompleteWithStartedAtByID(ctx context.Context, arg database.UpdateProvisionerJobWithCompleteWithStartedAtByIDParams) error { + if err := q.authorizeContext(ctx, policy.ActionUpdate, rbac.ResourceProvisionerJobs); err != nil { + return err + } + return q.db.UpdateProvisionerJobWithCompleteWithStartedAtByID(ctx, arg) +} + func (q *querier) UpdateReplica(ctx context.Context, arg database.UpdateReplicaParams) (database.Replica, error) { if err := q.authorizeContext(ctx, policy.ActionUpdate, rbac.ResourceSystem); err != nil { return database.Replica{}, err diff --git a/coderd/database/dbauthz/dbauthz_test.go b/coderd/database/dbauthz/dbauthz_test.go index 6dc9a32f03943..f851d0d6d1c08 100644 --- a/coderd/database/dbauthz/dbauthz_test.go +++ b/coderd/database/dbauthz/dbauthz_test.go @@ -694,9 +694,12 @@ func (s *MethodTestSuite) TestProvisionerJob() { Asserts(v.RBACObject(tpl), []policy.Action{policy.ActionRead, policy.ActionUpdate}).Returns() })) s.Run("GetProvisionerJobsByIDs", s.Subtest(func(db database.Store, check *expects) { - a := dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{}) - b := dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{}) - check.Args([]uuid.UUID{a.ID, b.ID}).Asserts().Returns(slice.New(a, b)) + o := dbgen.Organization(s.T(), db, database.Organization{}) + a := dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{OrganizationID: o.ID}) + b := dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{OrganizationID: o.ID}) + check.Args([]uuid.UUID{a.ID, b.ID}). + Asserts(rbac.ResourceProvisionerJobs.InOrg(o.ID), policy.ActionRead). + Returns(slice.New(a, b)) })) s.Run("GetProvisionerLogsAfterID", s.Subtest(func(db database.Store, check *expects) { u := dbgen.User(s.T(), db, database.User{}) @@ -3891,9 +3894,8 @@ func (s *MethodTestSuite) TestSystemFunctions() { check.Args().Asserts(rbac.ResourceSystem, policy.ActionDelete) })) s.Run("GetProvisionerJobsCreatedAfter", s.Subtest(func(db database.Store, check *expects) { - // TODO: add provisioner job resource type _ = dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{CreatedAt: time.Now().Add(-time.Hour)}) - check.Args(time.Now()).Asserts( /*rbac.ResourceSystem, policy.ActionRead*/ ) + check.Args(time.Now()).Asserts(rbac.ResourceProvisionerJobs, policy.ActionRead) })) s.Run("GetTemplateVersionsByIDs", s.Subtest(func(db database.Store, check *expects) { dbtestutil.DisableForeignKeysAndTriggers(s.T(), db) @@ -3976,11 +3978,11 @@ func (s *MethodTestSuite) TestSystemFunctions() { Returns([]database.WorkspaceAgent{agt}) })) s.Run("GetProvisionerJobsByIDs", s.Subtest(func(db database.Store, check *expects) { - // TODO: add a ProvisionerJob resource type - a := dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{}) - b := dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{}) + o := dbgen.Organization(s.T(), db, database.Organization{}) + a := dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{OrganizationID: o.ID}) + b := dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{OrganizationID: o.ID}) check.Args([]uuid.UUID{a.ID, b.ID}). - Asserts( /*rbac.ResourceSystem, policy.ActionRead*/ ). + Asserts(rbac.ResourceProvisionerJobs.InOrg(o.ID), policy.ActionRead). Returns(slice.New(a, b)) })) s.Run("InsertWorkspaceAgent", s.Subtest(func(db database.Store, check *expects) { @@ -4015,7 +4017,6 @@ func (s *MethodTestSuite) TestSystemFunctions() { }).Asserts(rbac.ResourceSystem, policy.ActionUpdate).Returns() })) s.Run("AcquireProvisionerJob", s.Subtest(func(db database.Store, check *expects) { - // TODO: we need to create a ProvisionerJob resource j := dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{ StartedAt: sql.NullTime{Valid: false}, UpdatedAt: time.Now(), @@ -4025,47 +4026,48 @@ func (s *MethodTestSuite) TestSystemFunctions() { OrganizationID: j.OrganizationID, Types: []database.ProvisionerType{j.Provisioner}, ProvisionerTags: must(json.Marshal(j.Tags)), - }).Asserts( /*rbac.ResourceSystem, policy.ActionUpdate*/ ) + }).Asserts(rbac.ResourceProvisionerJobs, policy.ActionUpdate) })) s.Run("UpdateProvisionerJobWithCompleteByID", s.Subtest(func(db database.Store, check *expects) { - // TODO: we need to create a ProvisionerJob resource j := dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{}) check.Args(database.UpdateProvisionerJobWithCompleteByIDParams{ ID: j.ID, - }).Asserts( /*rbac.ResourceSystem, policy.ActionUpdate*/ ) + }).Asserts(rbac.ResourceProvisionerJobs, policy.ActionUpdate) + })) + s.Run("UpdateProvisionerJobWithCompleteWithStartedAtByID", s.Subtest(func(db database.Store, check *expects) { + j := dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{}) + check.Args(database.UpdateProvisionerJobWithCompleteWithStartedAtByIDParams{ + ID: j.ID, + }).Asserts(rbac.ResourceProvisionerJobs, policy.ActionUpdate) })) s.Run("UpdateProvisionerJobByID", s.Subtest(func(db database.Store, check *expects) { - // TODO: we need to create a ProvisionerJob resource j := dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{}) check.Args(database.UpdateProvisionerJobByIDParams{ ID: j.ID, UpdatedAt: time.Now(), - }).Asserts( /*rbac.ResourceSystem, policy.ActionUpdate*/ ) + }).Asserts(rbac.ResourceProvisionerJobs, policy.ActionUpdate) })) s.Run("InsertProvisionerJob", s.Subtest(func(db database.Store, check *expects) { dbtestutil.DisableForeignKeysAndTriggers(s.T(), db) - // TODO: we need to create a ProvisionerJob resource check.Args(database.InsertProvisionerJobParams{ ID: uuid.New(), Provisioner: database.ProvisionerTypeEcho, StorageMethod: database.ProvisionerStorageMethodFile, Type: database.ProvisionerJobTypeWorkspaceBuild, Input: json.RawMessage("{}"), - }).Asserts( /*rbac.ResourceSystem, policy.ActionCreate*/ ) + }).Asserts( /* rbac.ResourceProvisionerJobs, policy.ActionCreate */ ) })) s.Run("InsertProvisionerJobLogs", s.Subtest(func(db database.Store, check *expects) { - // TODO: we need to create a ProvisionerJob resource j := dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{}) check.Args(database.InsertProvisionerJobLogsParams{ JobID: j.ID, - }).Asserts( /*rbac.ResourceSystem, policy.ActionCreate*/ ) + }).Asserts( /* rbac.ResourceProvisionerJobs, policy.ActionUpdate */ ) })) s.Run("InsertProvisionerJobTimings", s.Subtest(func(db database.Store, check *expects) { - // TODO: we need to create a ProvisionerJob resource j := dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{}) check.Args(database.InsertProvisionerJobTimingsParams{ JobID: j.ID, - }).Asserts( /*rbac.ResourceSystem, policy.ActionCreate*/ ) + }).Asserts(rbac.ResourceProvisionerJobs, policy.ActionUpdate) })) s.Run("UpsertProvisionerDaemon", s.Subtest(func(db database.Store, check *expects) { dbtestutil.DisableForeignKeysAndTriggers(s.T(), db) @@ -4201,8 +4203,8 @@ func (s *MethodTestSuite) TestSystemFunctions() { s.Run("GetFileTemplates", s.Subtest(func(db database.Store, check *expects) { check.Args(uuid.New()).Asserts(rbac.ResourceSystem, policy.ActionRead) })) - s.Run("GetHungProvisionerJobs", s.Subtest(func(db database.Store, check *expects) { - check.Args(time.Time{}).Asserts() + s.Run("GetProvisionerJobsToBeReaped", s.Subtest(func(db database.Store, check *expects) { + check.Args(database.GetProvisionerJobsToBeReapedParams{}).Asserts(rbac.ResourceProvisionerJobs, policy.ActionRead) })) s.Run("UpsertOAuthSigningKey", s.Subtest(func(db database.Store, check *expects) { check.Args("foo").Asserts(rbac.ResourceSystem, policy.ActionUpdate) @@ -4446,6 +4448,9 @@ func (s *MethodTestSuite) TestSystemFunctions() { VapidPrivateKey: "test", }).Asserts(rbac.ResourceDeploymentConfig, policy.ActionUpdate) })) + s.Run("GetProvisionerJobByIDForUpdate", s.Subtest(func(db database.Store, check *expects) { + check.Args(uuid.New()).Asserts(rbac.ResourceProvisionerJobs, policy.ActionRead).Errors(sql.ErrNoRows) + })) } func (s *MethodTestSuite) TestNotifications() { diff --git a/coderd/database/dbmem/dbmem.go b/coderd/database/dbmem/dbmem.go index 2760c0c929c58..794d46c0083d0 100644 --- a/coderd/database/dbmem/dbmem.go +++ b/coderd/database/dbmem/dbmem.go @@ -8,6 +8,7 @@ import ( "errors" "fmt" "math" + insecurerand "math/rand" //#nosec // this is only used for shuffling an array to pick random jobs to reap "reflect" "regexp" "slices" @@ -3707,23 +3708,6 @@ func (q *FakeQuerier) GetHealthSettings(_ context.Context) (string, error) { return string(q.healthSettings), nil } -func (q *FakeQuerier) GetHungProvisionerJobs(_ context.Context, hungSince time.Time) ([]database.ProvisionerJob, error) { - q.mutex.RLock() - defer q.mutex.RUnlock() - - hungJobs := []database.ProvisionerJob{} - for _, provisionerJob := range q.provisionerJobs { - if provisionerJob.StartedAt.Valid && !provisionerJob.CompletedAt.Valid && provisionerJob.UpdatedAt.Before(hungSince) { - // clone the Tags before appending, since maps are reference types and - // we don't want the caller to be able to mutate the map we have inside - // dbmem! - provisionerJob.Tags = maps.Clone(provisionerJob.Tags) - hungJobs = append(hungJobs, provisionerJob) - } - } - return hungJobs, nil -} - func (q *FakeQuerier) GetInboxNotificationByID(_ context.Context, id uuid.UUID) (database.InboxNotification, error) { q.mutex.RLock() defer q.mutex.RUnlock() @@ -4642,6 +4626,13 @@ func (q *FakeQuerier) GetProvisionerJobByID(ctx context.Context, id uuid.UUID) ( return q.getProvisionerJobByIDNoLock(ctx, id) } +func (q *FakeQuerier) GetProvisionerJobByIDForUpdate(ctx context.Context, id uuid.UUID) (database.ProvisionerJob, error) { + q.mutex.RLock() + defer q.mutex.RUnlock() + + return q.getProvisionerJobByIDNoLock(ctx, id) +} + func (q *FakeQuerier) GetProvisionerJobTimingsByJobID(_ context.Context, jobID uuid.UUID) ([]database.ProvisionerJobTiming, error) { q.mutex.RLock() defer q.mutex.RUnlock() @@ -4877,6 +4868,33 @@ func (q *FakeQuerier) GetProvisionerJobsCreatedAfter(_ context.Context, after ti return jobs, nil } +func (q *FakeQuerier) GetProvisionerJobsToBeReaped(_ context.Context, arg database.GetProvisionerJobsToBeReapedParams) ([]database.ProvisionerJob, error) { + q.mutex.RLock() + defer q.mutex.RUnlock() + maxJobs := arg.MaxJobs + + hungJobs := []database.ProvisionerJob{} + for _, provisionerJob := range q.provisionerJobs { + if !provisionerJob.CompletedAt.Valid { + if (provisionerJob.StartedAt.Valid && provisionerJob.UpdatedAt.Before(arg.HungSince)) || + (!provisionerJob.StartedAt.Valid && provisionerJob.UpdatedAt.Before(arg.PendingSince)) { + // clone the Tags before appending, since maps are reference types and + // we don't want the caller to be able to mutate the map we have inside + // dbmem! + provisionerJob.Tags = maps.Clone(provisionerJob.Tags) + hungJobs = append(hungJobs, provisionerJob) + if len(hungJobs) >= int(maxJobs) { + break + } + } + } + } + insecurerand.Shuffle(len(hungJobs), func(i, j int) { + hungJobs[i], hungJobs[j] = hungJobs[j], hungJobs[i] + }) + return hungJobs, nil +} + func (q *FakeQuerier) GetProvisionerKeyByHashedSecret(_ context.Context, hashedSecret []byte) (database.ProvisionerKey, error) { q.mutex.RLock() defer q.mutex.RUnlock() @@ -10925,6 +10943,30 @@ func (q *FakeQuerier) UpdateProvisionerJobWithCompleteByID(_ context.Context, ar return sql.ErrNoRows } +func (q *FakeQuerier) UpdateProvisionerJobWithCompleteWithStartedAtByID(_ context.Context, arg database.UpdateProvisionerJobWithCompleteWithStartedAtByIDParams) error { + if err := validateDatabaseType(arg); err != nil { + return err + } + + q.mutex.Lock() + defer q.mutex.Unlock() + + for index, job := range q.provisionerJobs { + if arg.ID != job.ID { + continue + } + job.UpdatedAt = arg.UpdatedAt + job.CompletedAt = arg.CompletedAt + job.Error = arg.Error + job.ErrorCode = arg.ErrorCode + job.StartedAt = arg.StartedAt + job.JobStatus = provisionerJobStatus(job) + q.provisionerJobs[index] = job + return nil + } + return sql.ErrNoRows +} + func (q *FakeQuerier) UpdateReplica(_ context.Context, arg database.UpdateReplicaParams) (database.Replica, error) { if err := validateDatabaseType(arg); err != nil { return database.Replica{}, err diff --git a/coderd/database/dbmetrics/querymetrics.go b/coderd/database/dbmetrics/querymetrics.go index 128e741da1d76..854d03fc666cf 100644 --- a/coderd/database/dbmetrics/querymetrics.go +++ b/coderd/database/dbmetrics/querymetrics.go @@ -865,13 +865,6 @@ func (m queryMetricsStore) GetHealthSettings(ctx context.Context) (string, error return r0, r1 } -func (m queryMetricsStore) GetHungProvisionerJobs(ctx context.Context, hungSince time.Time) ([]database.ProvisionerJob, error) { - start := time.Now() - jobs, err := m.s.GetHungProvisionerJobs(ctx, hungSince) - m.queryLatencies.WithLabelValues("GetHungProvisionerJobs").Observe(time.Since(start).Seconds()) - return jobs, err -} - func (m queryMetricsStore) GetInboxNotificationByID(ctx context.Context, id uuid.UUID) (database.InboxNotification, error) { start := time.Now() r0, r1 := m.s.GetInboxNotificationByID(ctx, id) @@ -1194,6 +1187,13 @@ func (m queryMetricsStore) GetProvisionerJobByID(ctx context.Context, id uuid.UU return job, err } +func (m queryMetricsStore) GetProvisionerJobByIDForUpdate(ctx context.Context, id uuid.UUID) (database.ProvisionerJob, error) { + start := time.Now() + r0, r1 := m.s.GetProvisionerJobByIDForUpdate(ctx, id) + m.queryLatencies.WithLabelValues("GetProvisionerJobByIDForUpdate").Observe(time.Since(start).Seconds()) + return r0, r1 +} + func (m queryMetricsStore) GetProvisionerJobTimingsByJobID(ctx context.Context, jobID uuid.UUID) ([]database.ProvisionerJobTiming, error) { start := time.Now() r0, r1 := m.s.GetProvisionerJobTimingsByJobID(ctx, jobID) @@ -1229,6 +1229,13 @@ func (m queryMetricsStore) GetProvisionerJobsCreatedAfter(ctx context.Context, c return jobs, err } +func (m queryMetricsStore) GetProvisionerJobsToBeReaped(ctx context.Context, arg database.GetProvisionerJobsToBeReapedParams) ([]database.ProvisionerJob, error) { + start := time.Now() + r0, r1 := m.s.GetProvisionerJobsToBeReaped(ctx, arg) + m.queryLatencies.WithLabelValues("GetProvisionerJobsToBeReaped").Observe(time.Since(start).Seconds()) + return r0, r1 +} + func (m queryMetricsStore) GetProvisionerKeyByHashedSecret(ctx context.Context, hashedSecret []byte) (database.ProvisionerKey, error) { start := time.Now() r0, r1 := m.s.GetProvisionerKeyByHashedSecret(ctx, hashedSecret) @@ -2699,6 +2706,13 @@ func (m queryMetricsStore) UpdateProvisionerJobWithCompleteByID(ctx context.Cont return err } +func (m queryMetricsStore) UpdateProvisionerJobWithCompleteWithStartedAtByID(ctx context.Context, arg database.UpdateProvisionerJobWithCompleteWithStartedAtByIDParams) error { + start := time.Now() + r0 := m.s.UpdateProvisionerJobWithCompleteWithStartedAtByID(ctx, arg) + m.queryLatencies.WithLabelValues("UpdateProvisionerJobWithCompleteWithStartedAtByID").Observe(time.Since(start).Seconds()) + return r0 +} + func (m queryMetricsStore) UpdateReplica(ctx context.Context, arg database.UpdateReplicaParams) (database.Replica, error) { start := time.Now() replica, err := m.s.UpdateReplica(ctx, arg) diff --git a/coderd/database/dbmock/dbmock.go b/coderd/database/dbmock/dbmock.go index 17b263dfb2e07..5dbe039b46758 100644 --- a/coderd/database/dbmock/dbmock.go +++ b/coderd/database/dbmock/dbmock.go @@ -1743,21 +1743,6 @@ func (mr *MockStoreMockRecorder) GetHealthSettings(ctx any) *gomock.Call { return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetHealthSettings", reflect.TypeOf((*MockStore)(nil).GetHealthSettings), ctx) } -// GetHungProvisionerJobs mocks base method. -func (m *MockStore) GetHungProvisionerJobs(ctx context.Context, updatedAt time.Time) ([]database.ProvisionerJob, error) { - m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "GetHungProvisionerJobs", ctx, updatedAt) - ret0, _ := ret[0].([]database.ProvisionerJob) - ret1, _ := ret[1].(error) - return ret0, ret1 -} - -// GetHungProvisionerJobs indicates an expected call of GetHungProvisionerJobs. -func (mr *MockStoreMockRecorder) GetHungProvisionerJobs(ctx, updatedAt any) *gomock.Call { - mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetHungProvisionerJobs", reflect.TypeOf((*MockStore)(nil).GetHungProvisionerJobs), ctx, updatedAt) -} - // GetInboxNotificationByID mocks base method. func (m *MockStore) GetInboxNotificationByID(ctx context.Context, id uuid.UUID) (database.InboxNotification, error) { m.ctrl.T.Helper() @@ -2448,6 +2433,21 @@ func (mr *MockStoreMockRecorder) GetProvisionerJobByID(ctx, id any) *gomock.Call return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetProvisionerJobByID", reflect.TypeOf((*MockStore)(nil).GetProvisionerJobByID), ctx, id) } +// GetProvisionerJobByIDForUpdate mocks base method. +func (m *MockStore) GetProvisionerJobByIDForUpdate(ctx context.Context, id uuid.UUID) (database.ProvisionerJob, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "GetProvisionerJobByIDForUpdate", ctx, id) + ret0, _ := ret[0].(database.ProvisionerJob) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// GetProvisionerJobByIDForUpdate indicates an expected call of GetProvisionerJobByIDForUpdate. +func (mr *MockStoreMockRecorder) GetProvisionerJobByIDForUpdate(ctx, id any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetProvisionerJobByIDForUpdate", reflect.TypeOf((*MockStore)(nil).GetProvisionerJobByIDForUpdate), ctx, id) +} + // GetProvisionerJobTimingsByJobID mocks base method. func (m *MockStore) GetProvisionerJobTimingsByJobID(ctx context.Context, jobID uuid.UUID) ([]database.ProvisionerJobTiming, error) { m.ctrl.T.Helper() @@ -2523,6 +2523,21 @@ func (mr *MockStoreMockRecorder) GetProvisionerJobsCreatedAfter(ctx, createdAt a return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetProvisionerJobsCreatedAfter", reflect.TypeOf((*MockStore)(nil).GetProvisionerJobsCreatedAfter), ctx, createdAt) } +// GetProvisionerJobsToBeReaped mocks base method. +func (m *MockStore) GetProvisionerJobsToBeReaped(ctx context.Context, arg database.GetProvisionerJobsToBeReapedParams) ([]database.ProvisionerJob, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "GetProvisionerJobsToBeReaped", ctx, arg) + ret0, _ := ret[0].([]database.ProvisionerJob) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// GetProvisionerJobsToBeReaped indicates an expected call of GetProvisionerJobsToBeReaped. +func (mr *MockStoreMockRecorder) GetProvisionerJobsToBeReaped(ctx, arg any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetProvisionerJobsToBeReaped", reflect.TypeOf((*MockStore)(nil).GetProvisionerJobsToBeReaped), ctx, arg) +} + // GetProvisionerKeyByHashedSecret mocks base method. func (m *MockStore) GetProvisionerKeyByHashedSecret(ctx context.Context, hashedSecret []byte) (database.ProvisionerKey, error) { m.ctrl.T.Helper() @@ -5717,6 +5732,20 @@ func (mr *MockStoreMockRecorder) UpdateProvisionerJobWithCompleteByID(ctx, arg a return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "UpdateProvisionerJobWithCompleteByID", reflect.TypeOf((*MockStore)(nil).UpdateProvisionerJobWithCompleteByID), ctx, arg) } +// UpdateProvisionerJobWithCompleteWithStartedAtByID mocks base method. +func (m *MockStore) UpdateProvisionerJobWithCompleteWithStartedAtByID(ctx context.Context, arg database.UpdateProvisionerJobWithCompleteWithStartedAtByIDParams) error { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "UpdateProvisionerJobWithCompleteWithStartedAtByID", ctx, arg) + ret0, _ := ret[0].(error) + return ret0 +} + +// UpdateProvisionerJobWithCompleteWithStartedAtByID indicates an expected call of UpdateProvisionerJobWithCompleteWithStartedAtByID. +func (mr *MockStoreMockRecorder) UpdateProvisionerJobWithCompleteWithStartedAtByID(ctx, arg any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "UpdateProvisionerJobWithCompleteWithStartedAtByID", reflect.TypeOf((*MockStore)(nil).UpdateProvisionerJobWithCompleteWithStartedAtByID), ctx, arg) +} + // UpdateReplica mocks base method. func (m *MockStore) UpdateReplica(ctx context.Context, arg database.UpdateReplicaParams) (database.Replica, error) { m.ctrl.T.Helper() diff --git a/coderd/database/querier.go b/coderd/database/querier.go index d0f74ee609724..e42f2b259cd08 100644 --- a/coderd/database/querier.go +++ b/coderd/database/querier.go @@ -196,7 +196,6 @@ type sqlcQuerier interface { GetGroupMembersCountByGroupID(ctx context.Context, arg GetGroupMembersCountByGroupIDParams) (int64, error) GetGroups(ctx context.Context, arg GetGroupsParams) ([]GetGroupsRow, error) GetHealthSettings(ctx context.Context) (string, error) - GetHungProvisionerJobs(ctx context.Context, updatedAt time.Time) ([]ProvisionerJob, error) GetInboxNotificationByID(ctx context.Context, id uuid.UUID) (InboxNotification, error) // Fetches inbox notifications for a user filtered by templates and targets // param user_id: The user ID @@ -265,11 +264,16 @@ type sqlcQuerier interface { // Previous job information. GetProvisionerDaemonsWithStatusByOrganization(ctx context.Context, arg GetProvisionerDaemonsWithStatusByOrganizationParams) ([]GetProvisionerDaemonsWithStatusByOrganizationRow, error) GetProvisionerJobByID(ctx context.Context, id uuid.UUID) (ProvisionerJob, error) + // Gets a single provisioner job by ID for update. + // This is used to securely reap jobs that have been hung/pending for a long time. + GetProvisionerJobByIDForUpdate(ctx context.Context, id uuid.UUID) (ProvisionerJob, error) GetProvisionerJobTimingsByJobID(ctx context.Context, jobID uuid.UUID) ([]ProvisionerJobTiming, error) GetProvisionerJobsByIDs(ctx context.Context, ids []uuid.UUID) ([]ProvisionerJob, error) GetProvisionerJobsByIDsWithQueuePosition(ctx context.Context, ids []uuid.UUID) ([]GetProvisionerJobsByIDsWithQueuePositionRow, error) GetProvisionerJobsByOrganizationAndStatusWithQueuePositionAndProvisioner(ctx context.Context, arg GetProvisionerJobsByOrganizationAndStatusWithQueuePositionAndProvisionerParams) ([]GetProvisionerJobsByOrganizationAndStatusWithQueuePositionAndProvisionerRow, error) GetProvisionerJobsCreatedAfter(ctx context.Context, createdAt time.Time) ([]ProvisionerJob, error) + // To avoid repeatedly attempting to reap the same jobs, we randomly order and limit to @max_jobs. + GetProvisionerJobsToBeReaped(ctx context.Context, arg GetProvisionerJobsToBeReapedParams) ([]ProvisionerJob, error) GetProvisionerKeyByHashedSecret(ctx context.Context, hashedSecret []byte) (ProvisionerKey, error) GetProvisionerKeyByID(ctx context.Context, id uuid.UUID) (ProvisionerKey, error) GetProvisionerKeyByName(ctx context.Context, arg GetProvisionerKeyByNameParams) (ProvisionerKey, error) @@ -566,6 +570,7 @@ type sqlcQuerier interface { UpdateProvisionerJobByID(ctx context.Context, arg UpdateProvisionerJobByIDParams) error UpdateProvisionerJobWithCancelByID(ctx context.Context, arg UpdateProvisionerJobWithCancelByIDParams) error UpdateProvisionerJobWithCompleteByID(ctx context.Context, arg UpdateProvisionerJobWithCompleteByIDParams) error + UpdateProvisionerJobWithCompleteWithStartedAtByID(ctx context.Context, arg UpdateProvisionerJobWithCompleteWithStartedAtByIDParams) error UpdateReplica(ctx context.Context, arg UpdateReplicaParams) (Replica, error) UpdateTailnetPeerStatusByCoordinator(ctx context.Context, arg UpdateTailnetPeerStatusByCoordinatorParams) error UpdateTemplateACLByID(ctx context.Context, arg UpdateTemplateACLByIDParams) error diff --git a/coderd/database/queries.sql.go b/coderd/database/queries.sql.go index e2e38c36fe10a..00922cc3f7ff0 100644 --- a/coderd/database/queries.sql.go +++ b/coderd/database/queries.sql.go @@ -7376,71 +7376,57 @@ func (q *sqlQuerier) AcquireProvisionerJob(ctx context.Context, arg AcquireProvi return i, err } -const getHungProvisionerJobs = `-- name: GetHungProvisionerJobs :many +const getProvisionerJobByID = `-- name: GetProvisionerJobByID :one SELECT id, created_at, updated_at, started_at, canceled_at, completed_at, error, organization_id, initiator_id, provisioner, storage_method, type, input, worker_id, file_id, tags, error_code, trace_metadata, job_status FROM provisioner_jobs WHERE - updated_at < $1 - AND started_at IS NOT NULL - AND completed_at IS NULL + id = $1 ` -func (q *sqlQuerier) GetHungProvisionerJobs(ctx context.Context, updatedAt time.Time) ([]ProvisionerJob, error) { - rows, err := q.db.QueryContext(ctx, getHungProvisionerJobs, updatedAt) - if err != nil { - return nil, err - } - defer rows.Close() - var items []ProvisionerJob - for rows.Next() { - var i ProvisionerJob - if err := rows.Scan( - &i.ID, - &i.CreatedAt, - &i.UpdatedAt, - &i.StartedAt, - &i.CanceledAt, - &i.CompletedAt, - &i.Error, - &i.OrganizationID, - &i.InitiatorID, - &i.Provisioner, - &i.StorageMethod, - &i.Type, - &i.Input, - &i.WorkerID, - &i.FileID, - &i.Tags, - &i.ErrorCode, - &i.TraceMetadata, - &i.JobStatus, - ); err != nil { - return nil, err - } - items = append(items, i) - } - if err := rows.Close(); err != nil { - return nil, err - } - if err := rows.Err(); err != nil { - return nil, err - } - return items, nil +func (q *sqlQuerier) GetProvisionerJobByID(ctx context.Context, id uuid.UUID) (ProvisionerJob, error) { + row := q.db.QueryRowContext(ctx, getProvisionerJobByID, id) + var i ProvisionerJob + err := row.Scan( + &i.ID, + &i.CreatedAt, + &i.UpdatedAt, + &i.StartedAt, + &i.CanceledAt, + &i.CompletedAt, + &i.Error, + &i.OrganizationID, + &i.InitiatorID, + &i.Provisioner, + &i.StorageMethod, + &i.Type, + &i.Input, + &i.WorkerID, + &i.FileID, + &i.Tags, + &i.ErrorCode, + &i.TraceMetadata, + &i.JobStatus, + ) + return i, err } -const getProvisionerJobByID = `-- name: GetProvisionerJobByID :one +const getProvisionerJobByIDForUpdate = `-- name: GetProvisionerJobByIDForUpdate :one SELECT id, created_at, updated_at, started_at, canceled_at, completed_at, error, organization_id, initiator_id, provisioner, storage_method, type, input, worker_id, file_id, tags, error_code, trace_metadata, job_status FROM provisioner_jobs WHERE id = $1 +FOR UPDATE +SKIP LOCKED ` -func (q *sqlQuerier) GetProvisionerJobByID(ctx context.Context, id uuid.UUID) (ProvisionerJob, error) { - row := q.db.QueryRowContext(ctx, getProvisionerJobByID, id) +// Gets a single provisioner job by ID for update. +// This is used to securely reap jobs that have been hung/pending for a long time. +func (q *sqlQuerier) GetProvisionerJobByIDForUpdate(ctx context.Context, id uuid.UUID) (ProvisionerJob, error) { + row := q.db.QueryRowContext(ctx, getProvisionerJobByIDForUpdate, id) var i ProvisionerJob err := row.Scan( &i.ID, @@ -7897,6 +7883,79 @@ func (q *sqlQuerier) GetProvisionerJobsCreatedAfter(ctx context.Context, created return items, nil } +const getProvisionerJobsToBeReaped = `-- name: GetProvisionerJobsToBeReaped :many +SELECT + id, created_at, updated_at, started_at, canceled_at, completed_at, error, organization_id, initiator_id, provisioner, storage_method, type, input, worker_id, file_id, tags, error_code, trace_metadata, job_status +FROM + provisioner_jobs +WHERE + ( + -- If the job has not been started before @pending_since, reap it. + updated_at < $1 + AND started_at IS NULL + AND completed_at IS NULL + ) + OR + ( + -- If the job has been started but not completed before @hung_since, reap it. + updated_at < $2 + AND started_at IS NOT NULL + AND completed_at IS NULL + ) +ORDER BY random() +LIMIT $3 +` + +type GetProvisionerJobsToBeReapedParams struct { + PendingSince time.Time `db:"pending_since" json:"pending_since"` + HungSince time.Time `db:"hung_since" json:"hung_since"` + MaxJobs int32 `db:"max_jobs" json:"max_jobs"` +} + +// To avoid repeatedly attempting to reap the same jobs, we randomly order and limit to @max_jobs. +func (q *sqlQuerier) GetProvisionerJobsToBeReaped(ctx context.Context, arg GetProvisionerJobsToBeReapedParams) ([]ProvisionerJob, error) { + rows, err := q.db.QueryContext(ctx, getProvisionerJobsToBeReaped, arg.PendingSince, arg.HungSince, arg.MaxJobs) + if err != nil { + return nil, err + } + defer rows.Close() + var items []ProvisionerJob + for rows.Next() { + var i ProvisionerJob + if err := rows.Scan( + &i.ID, + &i.CreatedAt, + &i.UpdatedAt, + &i.StartedAt, + &i.CanceledAt, + &i.CompletedAt, + &i.Error, + &i.OrganizationID, + &i.InitiatorID, + &i.Provisioner, + &i.StorageMethod, + &i.Type, + &i.Input, + &i.WorkerID, + &i.FileID, + &i.Tags, + &i.ErrorCode, + &i.TraceMetadata, + &i.JobStatus, + ); err != nil { + return nil, err + } + items = append(items, i) + } + if err := rows.Close(); err != nil { + return nil, err + } + if err := rows.Err(); err != nil { + return nil, err + } + return items, nil +} + const insertProvisionerJob = `-- name: InsertProvisionerJob :one INSERT INTO provisioner_jobs ( @@ -8105,6 +8164,40 @@ func (q *sqlQuerier) UpdateProvisionerJobWithCompleteByID(ctx context.Context, a return err } +const updateProvisionerJobWithCompleteWithStartedAtByID = `-- name: UpdateProvisionerJobWithCompleteWithStartedAtByID :exec +UPDATE + provisioner_jobs +SET + updated_at = $2, + completed_at = $3, + error = $4, + error_code = $5, + started_at = $6 +WHERE + id = $1 +` + +type UpdateProvisionerJobWithCompleteWithStartedAtByIDParams struct { + ID uuid.UUID `db:"id" json:"id"` + UpdatedAt time.Time `db:"updated_at" json:"updated_at"` + CompletedAt sql.NullTime `db:"completed_at" json:"completed_at"` + Error sql.NullString `db:"error" json:"error"` + ErrorCode sql.NullString `db:"error_code" json:"error_code"` + StartedAt sql.NullTime `db:"started_at" json:"started_at"` +} + +func (q *sqlQuerier) UpdateProvisionerJobWithCompleteWithStartedAtByID(ctx context.Context, arg UpdateProvisionerJobWithCompleteWithStartedAtByIDParams) error { + _, err := q.db.ExecContext(ctx, updateProvisionerJobWithCompleteWithStartedAtByID, + arg.ID, + arg.UpdatedAt, + arg.CompletedAt, + arg.Error, + arg.ErrorCode, + arg.StartedAt, + ) + return err +} + const deleteProvisionerKey = `-- name: DeleteProvisionerKey :exec DELETE FROM provisioner_keys diff --git a/coderd/database/queries/provisionerjobs.sql b/coderd/database/queries/provisionerjobs.sql index 2d544aedb9bd8..4096fa5b2755a 100644 --- a/coderd/database/queries/provisionerjobs.sql +++ b/coderd/database/queries/provisionerjobs.sql @@ -41,6 +41,18 @@ FROM WHERE id = $1; +-- name: GetProvisionerJobByIDForUpdate :one +-- Gets a single provisioner job by ID for update. +-- This is used to securely reap jobs that have been hung/pending for a long time. +SELECT + * +FROM + provisioner_jobs +WHERE + id = $1 +FOR UPDATE +SKIP LOCKED; + -- name: GetProvisionerJobsByIDs :many SELECT * @@ -256,15 +268,40 @@ SET WHERE id = $1; --- name: GetHungProvisionerJobs :many +-- name: UpdateProvisionerJobWithCompleteWithStartedAtByID :exec +UPDATE + provisioner_jobs +SET + updated_at = $2, + completed_at = $3, + error = $4, + error_code = $5, + started_at = $6 +WHERE + id = $1; + +-- name: GetProvisionerJobsToBeReaped :many SELECT * FROM provisioner_jobs WHERE - updated_at < $1 - AND started_at IS NOT NULL - AND completed_at IS NULL; + ( + -- If the job has not been started before @pending_since, reap it. + updated_at < @pending_since + AND started_at IS NULL + AND completed_at IS NULL + ) + OR + ( + -- If the job has been started but not completed before @hung_since, reap it. + updated_at < @hung_since + AND started_at IS NOT NULL + AND completed_at IS NULL + ) +-- To avoid repeatedly attempting to reap the same jobs, we randomly order and limit to @max_jobs. +ORDER BY random() +LIMIT @max_jobs; -- name: InsertProvisionerJobTimings :many INSERT INTO provisioner_job_timings (job_id, started_at, ended_at, stage, source, action, resource) diff --git a/coderd/httpmw/loggermw/logger.go b/coderd/httpmw/loggermw/logger.go index 9eeb07a5f10e5..30e5e2d811ad8 100644 --- a/coderd/httpmw/loggermw/logger.go +++ b/coderd/httpmw/loggermw/logger.go @@ -132,7 +132,7 @@ var actorLogOrder = []rbac.SubjectType{ rbac.SubjectTypeAutostart, rbac.SubjectTypeCryptoKeyReader, rbac.SubjectTypeCryptoKeyRotator, - rbac.SubjectTypeHangDetector, + rbac.SubjectTypeJobReaper, rbac.SubjectTypeNotifier, rbac.SubjectTypePrebuildsOrchestrator, rbac.SubjectTypeProvisionerd, diff --git a/coderd/unhanger/detector.go b/coderd/jobreaper/detector.go similarity index 72% rename from coderd/unhanger/detector.go rename to coderd/jobreaper/detector.go index 14383b1839363..ad5774ee6b95d 100644 --- a/coderd/unhanger/detector.go +++ b/coderd/jobreaper/detector.go @@ -1,11 +1,10 @@ -package unhanger +package jobreaper import ( "context" "database/sql" "encoding/json" - "fmt" - "math/rand" //#nosec // this is only used for shuffling an array to pick random jobs to unhang + "fmt" //#nosec // this is only used for shuffling an array to pick random jobs to unhang "time" "golang.org/x/xerrors" @@ -21,10 +20,14 @@ import ( ) const ( - // HungJobDuration is the duration of time since the last update to a job - // before it is considered hung. + // HungJobDuration is the duration of time since the last update + // to a RUNNING job before it is considered hung. HungJobDuration = 5 * time.Minute + // PendingJobDuration is the duration of time since last update + // to a PENDING job before it is considered dead. + PendingJobDuration = 30 * time.Minute + // HungJobExitTimeout is the duration of time that provisioners should allow // for a graceful exit upon cancellation due to failing to send an update to // a job. @@ -38,16 +41,30 @@ const ( MaxJobsPerRun = 10 ) -// HungJobLogMessages are written to provisioner job logs when a job is hung and -// terminated. -var HungJobLogMessages = []string{ - "", - "====================", - "Coder: Build has been detected as hung for 5 minutes and will be terminated.", - "====================", - "", +// jobLogMessages are written to provisioner job logs when a job is reaped +func JobLogMessages(reapType ReapType, threshold time.Duration) []string { + return []string{ + "", + "====================", + fmt.Sprintf("Coder: Build has been detected as %s for %.0f minutes and will be terminated.", reapType, threshold.Minutes()), + "====================", + "", + } +} + +type jobToReap struct { + ID uuid.UUID + Threshold time.Duration + Type ReapType } +type ReapType string + +const ( + Pending ReapType = "pending" + Hung ReapType = "hung" +) + // acquireLockError is returned when the detector fails to acquire a lock and // cancels the current run. type acquireLockError struct{} @@ -93,10 +110,10 @@ type Stats struct { Error error } -// New returns a new hang detector. +// New returns a new job reaper. func New(ctx context.Context, db database.Store, pub pubsub.Pubsub, log slog.Logger, tick <-chan time.Time) *Detector { - //nolint:gocritic // Hang detector has a limited set of permissions. - ctx, cancel := context.WithCancel(dbauthz.AsHangDetector(ctx)) + //nolint:gocritic // Job reaper has a limited set of permissions. + ctx, cancel := context.WithCancel(dbauthz.AsJobReaper(ctx)) d := &Detector{ ctx: ctx, cancel: cancel, @@ -172,34 +189,42 @@ func (d *Detector) run(t time.Time) Stats { Error: nil, } - // Find all provisioner jobs that are currently running but have not - // received an update in the last 5 minutes. - jobs, err := d.db.GetHungProvisionerJobs(ctx, t.Add(-HungJobDuration)) + // Find all provisioner jobs to be reaped + jobs, err := d.db.GetProvisionerJobsToBeReaped(ctx, database.GetProvisionerJobsToBeReapedParams{ + PendingSince: t.Add(-PendingJobDuration), + HungSince: t.Add(-HungJobDuration), + MaxJobs: MaxJobsPerRun, + }) if err != nil { - stats.Error = xerrors.Errorf("get hung provisioner jobs: %w", err) + stats.Error = xerrors.Errorf("get provisioner jobs to be reaped: %w", err) return stats } - // Limit the number of jobs we'll unhang in a single run to avoid - // timing out. - if len(jobs) > MaxJobsPerRun { - // Pick a random subset of the jobs to unhang. - rand.Shuffle(len(jobs), func(i, j int) { - jobs[i], jobs[j] = jobs[j], jobs[i] - }) - jobs = jobs[:MaxJobsPerRun] - } + jobsToReap := make([]*jobToReap, 0, len(jobs)) - // Send a message into the build log for each hung job saying that it - // has been detected and will be terminated, then mark the job as - // failed. for _, job := range jobs { + j := &jobToReap{ + ID: job.ID, + } + if job.JobStatus == database.ProvisionerJobStatusPending { + j.Threshold = PendingJobDuration + j.Type = Pending + } else { + j.Threshold = HungJobDuration + j.Type = Hung + } + jobsToReap = append(jobsToReap, j) + } + + // Send a message into the build log for each hung or pending job saying that it + // has been detected and will be terminated, then mark the job as failed. + for _, job := range jobsToReap { log := d.log.With(slog.F("job_id", job.ID)) - err := unhangJob(ctx, log, d.db, d.pubsub, job.ID) + err := reapJob(ctx, log, d.db, d.pubsub, job) if err != nil { if !(xerrors.As(err, &acquireLockError{}) || xerrors.As(err, &jobIneligibleError{})) { - log.Error(ctx, "error forcefully terminating hung provisioner job", slog.Error(err)) + log.Error(ctx, "error forcefully terminating provisioner job", slog.F("type", job.Type), slog.Error(err)) } continue } @@ -210,47 +235,34 @@ func (d *Detector) run(t time.Time) Stats { return stats } -func unhangJob(ctx context.Context, log slog.Logger, db database.Store, pub pubsub.Pubsub, jobID uuid.UUID) error { +func reapJob(ctx context.Context, log slog.Logger, db database.Store, pub pubsub.Pubsub, jobToReap *jobToReap) error { var lowestLogID int64 err := db.InTx(func(db database.Store) error { - locked, err := db.TryAcquireLock(ctx, database.GenLockID(fmt.Sprintf("hang-detector:%s", jobID))) - if err != nil { - return xerrors.Errorf("acquire lock: %w", err) - } - if !locked { - // This error is ignored. - return acquireLockError{} - } - // Refetch the job while we hold the lock. - job, err := db.GetProvisionerJobByID(ctx, jobID) + job, err := db.GetProvisionerJobByIDForUpdate(ctx, jobToReap.ID) if err != nil { + if xerrors.Is(err, sql.ErrNoRows) { + return acquireLockError{} + } return xerrors.Errorf("get provisioner job: %w", err) } - // Check if we should still unhang it. - if !job.StartedAt.Valid { - // This shouldn't be possible to hit because the query only selects - // started and not completed jobs, and a job can't be "un-started". - return jobIneligibleError{ - Err: xerrors.New("job is not started"), - } - } if job.CompletedAt.Valid { return jobIneligibleError{ Err: xerrors.Errorf("job is completed (status %s)", job.JobStatus), } } - if job.UpdatedAt.After(time.Now().Add(-HungJobDuration)) { + if job.UpdatedAt.After(time.Now().Add(-jobToReap.Threshold)) { return jobIneligibleError{ Err: xerrors.New("job has been updated recently"), } } log.Warn( - ctx, "detected hung provisioner job, forcefully terminating", - "threshold", HungJobDuration, + ctx, "forcefully terminating provisioner job", + "type", jobToReap.Type, + "threshold", jobToReap.Threshold, ) // First, get the latest logs from the build so we can make sure @@ -260,7 +272,7 @@ func unhangJob(ctx context.Context, log slog.Logger, db database.Store, pub pubs CreatedAfter: 0, }) if err != nil { - return xerrors.Errorf("get logs for hung job: %w", err) + return xerrors.Errorf("get logs for %s job: %w", jobToReap.Type, err) } logStage := "" if len(logs) != 0 { @@ -280,7 +292,7 @@ func unhangJob(ctx context.Context, log slog.Logger, db database.Store, pub pubs Output: nil, } now := dbtime.Now() - for i, msg := range HungJobLogMessages { + for i, msg := range JobLogMessages(jobToReap.Type, jobToReap.Threshold) { // Set the created at in a way that ensures each message has // a unique timestamp so they will be sorted correctly. insertParams.CreatedAt = append(insertParams.CreatedAt, now.Add(time.Millisecond*time.Duration(i))) @@ -291,13 +303,22 @@ func unhangJob(ctx context.Context, log slog.Logger, db database.Store, pub pubs } newLogs, err := db.InsertProvisionerJobLogs(ctx, insertParams) if err != nil { - return xerrors.Errorf("insert logs for hung job: %w", err) + return xerrors.Errorf("insert logs for %s job: %w", job.JobStatus, err) } lowestLogID = newLogs[0].ID // Mark the job as failed. now = dbtime.Now() - err = db.UpdateProvisionerJobWithCompleteByID(ctx, database.UpdateProvisionerJobWithCompleteByIDParams{ + + // If the job was never started (pending), set the StartedAt time to the current + // time so that the build duration is correct. + if job.JobStatus == database.ProvisionerJobStatusPending { + job.StartedAt = sql.NullTime{ + Time: now, + Valid: true, + } + } + err = db.UpdateProvisionerJobWithCompleteWithStartedAtByID(ctx, database.UpdateProvisionerJobWithCompleteWithStartedAtByIDParams{ ID: job.ID, UpdatedAt: now, CompletedAt: sql.NullTime{ @@ -305,12 +326,13 @@ func unhangJob(ctx context.Context, log slog.Logger, db database.Store, pub pubs Valid: true, }, Error: sql.NullString{ - String: "Coder: Build has been detected as hung for 5 minutes and has been terminated by hang detector.", + String: fmt.Sprintf("Coder: Build has been detected as %s for %.0f minutes and has been terminated by the reaper.", jobToReap.Type, jobToReap.Threshold.Minutes()), Valid: true, }, ErrorCode: sql.NullString{ Valid: false, }, + StartedAt: job.StartedAt, }) if err != nil { return xerrors.Errorf("mark job as failed: %w", err) @@ -364,7 +386,7 @@ func unhangJob(ctx context.Context, log slog.Logger, db database.Store, pub pubs if err != nil { return xerrors.Errorf("marshal log notification: %w", err) } - err = pub.Publish(provisionersdk.ProvisionerJobLogsNotifyChannel(jobID), data) + err = pub.Publish(provisionersdk.ProvisionerJobLogsNotifyChannel(jobToReap.ID), data) if err != nil { return xerrors.Errorf("publish log notification: %w", err) } diff --git a/coderd/unhanger/detector_test.go b/coderd/jobreaper/detector_test.go similarity index 73% rename from coderd/unhanger/detector_test.go rename to coderd/jobreaper/detector_test.go index 43eb62bfa884b..28457aeeca3a8 100644 --- a/coderd/unhanger/detector_test.go +++ b/coderd/jobreaper/detector_test.go @@ -1,4 +1,4 @@ -package unhanger_test +package jobreaper_test import ( "context" @@ -20,9 +20,9 @@ import ( "github.com/coder/coder/v2/coderd/database/dbauthz" "github.com/coder/coder/v2/coderd/database/dbgen" "github.com/coder/coder/v2/coderd/database/dbtestutil" + "github.com/coder/coder/v2/coderd/jobreaper" "github.com/coder/coder/v2/coderd/provisionerdserver" "github.com/coder/coder/v2/coderd/rbac" - "github.com/coder/coder/v2/coderd/unhanger" "github.com/coder/coder/v2/provisionersdk" "github.com/coder/coder/v2/testutil" ) @@ -39,10 +39,10 @@ func TestDetectorNoJobs(t *testing.T) { db, pubsub = dbtestutil.NewDB(t) log = testutil.Logger(t) tickCh = make(chan time.Time) - statsCh = make(chan unhanger.Stats) + statsCh = make(chan jobreaper.Stats) ) - detector := unhanger.New(ctx, wrapDBAuthz(db, log), pubsub, log, tickCh).WithStatsChannel(statsCh) + detector := jobreaper.New(ctx, wrapDBAuthz(db, log), pubsub, log, tickCh).WithStatsChannel(statsCh) detector.Start() tickCh <- time.Now() @@ -62,7 +62,7 @@ func TestDetectorNoHungJobs(t *testing.T) { db, pubsub = dbtestutil.NewDB(t) log = testutil.Logger(t) tickCh = make(chan time.Time) - statsCh = make(chan unhanger.Stats) + statsCh = make(chan jobreaper.Stats) ) // Insert some jobs that are running and haven't been updated in a while, @@ -89,7 +89,7 @@ func TestDetectorNoHungJobs(t *testing.T) { }) } - detector := unhanger.New(ctx, wrapDBAuthz(db, log), pubsub, log, tickCh).WithStatsChannel(statsCh) + detector := jobreaper.New(ctx, wrapDBAuthz(db, log), pubsub, log, tickCh).WithStatsChannel(statsCh) detector.Start() tickCh <- now @@ -109,7 +109,7 @@ func TestDetectorHungWorkspaceBuild(t *testing.T) { db, pubsub = dbtestutil.NewDB(t) log = testutil.Logger(t) tickCh = make(chan time.Time) - statsCh = make(chan unhanger.Stats) + statsCh = make(chan jobreaper.Stats) ) var ( @@ -195,7 +195,7 @@ func TestDetectorHungWorkspaceBuild(t *testing.T) { t.Log("previous job ID: ", previousWorkspaceBuildJob.ID) t.Log("current job ID: ", currentWorkspaceBuildJob.ID) - detector := unhanger.New(ctx, wrapDBAuthz(db, log), pubsub, log, tickCh).WithStatsChannel(statsCh) + detector := jobreaper.New(ctx, wrapDBAuthz(db, log), pubsub, log, tickCh).WithStatsChannel(statsCh) detector.Start() tickCh <- now @@ -231,7 +231,7 @@ func TestDetectorHungWorkspaceBuildNoOverrideState(t *testing.T) { db, pubsub = dbtestutil.NewDB(t) log = testutil.Logger(t) tickCh = make(chan time.Time) - statsCh = make(chan unhanger.Stats) + statsCh = make(chan jobreaper.Stats) ) var ( @@ -318,7 +318,7 @@ func TestDetectorHungWorkspaceBuildNoOverrideState(t *testing.T) { t.Log("previous job ID: ", previousWorkspaceBuildJob.ID) t.Log("current job ID: ", currentWorkspaceBuildJob.ID) - detector := unhanger.New(ctx, wrapDBAuthz(db, log), pubsub, log, tickCh).WithStatsChannel(statsCh) + detector := jobreaper.New(ctx, wrapDBAuthz(db, log), pubsub, log, tickCh).WithStatsChannel(statsCh) detector.Start() tickCh <- now @@ -354,7 +354,7 @@ func TestDetectorHungWorkspaceBuildNoOverrideStateIfNoExistingBuild(t *testing.T db, pubsub = dbtestutil.NewDB(t) log = testutil.Logger(t) tickCh = make(chan time.Time) - statsCh = make(chan unhanger.Stats) + statsCh = make(chan jobreaper.Stats) ) var ( @@ -411,7 +411,7 @@ func TestDetectorHungWorkspaceBuildNoOverrideStateIfNoExistingBuild(t *testing.T t.Log("current job ID: ", currentWorkspaceBuildJob.ID) - detector := unhanger.New(ctx, wrapDBAuthz(db, log), pubsub, log, tickCh).WithStatsChannel(statsCh) + detector := jobreaper.New(ctx, wrapDBAuthz(db, log), pubsub, log, tickCh).WithStatsChannel(statsCh) detector.Start() tickCh <- now @@ -439,6 +439,100 @@ func TestDetectorHungWorkspaceBuildNoOverrideStateIfNoExistingBuild(t *testing.T detector.Wait() } +func TestDetectorPendingWorkspaceBuildNoOverrideStateIfNoExistingBuild(t *testing.T) { + t.Parallel() + + var ( + ctx = testutil.Context(t, testutil.WaitLong) + db, pubsub = dbtestutil.NewDB(t) + log = testutil.Logger(t) + tickCh = make(chan time.Time) + statsCh = make(chan jobreaper.Stats) + ) + + var ( + now = time.Now() + thirtyFiveMinAgo = now.Add(-time.Minute * 35) + org = dbgen.Organization(t, db, database.Organization{}) + user = dbgen.User(t, db, database.User{}) + file = dbgen.File(t, db, database.File{}) + template = dbgen.Template(t, db, database.Template{ + OrganizationID: org.ID, + CreatedBy: user.ID, + }) + templateVersion = dbgen.TemplateVersion(t, db, database.TemplateVersion{ + OrganizationID: org.ID, + TemplateID: uuid.NullUUID{ + UUID: template.ID, + Valid: true, + }, + CreatedBy: user.ID, + }) + workspace = dbgen.Workspace(t, db, database.WorkspaceTable{ + OwnerID: user.ID, + OrganizationID: org.ID, + TemplateID: template.ID, + }) + + // First build. + expectedWorkspaceBuildState = []byte(`{"dean":"cool","colin":"also cool"}`) + currentWorkspaceBuildJob = dbgen.ProvisionerJob(t, db, pubsub, database.ProvisionerJob{ + CreatedAt: thirtyFiveMinAgo, + UpdatedAt: thirtyFiveMinAgo, + StartedAt: sql.NullTime{ + Time: time.Time{}, + Valid: false, + }, + OrganizationID: org.ID, + InitiatorID: user.ID, + Provisioner: database.ProvisionerTypeEcho, + StorageMethod: database.ProvisionerStorageMethodFile, + FileID: file.ID, + Type: database.ProvisionerJobTypeWorkspaceBuild, + Input: []byte("{}"), + }) + currentWorkspaceBuild = dbgen.WorkspaceBuild(t, db, database.WorkspaceBuild{ + WorkspaceID: workspace.ID, + TemplateVersionID: templateVersion.ID, + BuildNumber: 1, + JobID: currentWorkspaceBuildJob.ID, + // Should not be overridden. + ProvisionerState: expectedWorkspaceBuildState, + }) + ) + + t.Log("current job ID: ", currentWorkspaceBuildJob.ID) + + detector := jobreaper.New(ctx, wrapDBAuthz(db, log), pubsub, log, tickCh).WithStatsChannel(statsCh) + detector.Start() + tickCh <- now + + stats := <-statsCh + require.NoError(t, stats.Error) + require.Len(t, stats.TerminatedJobIDs, 1) + require.Equal(t, currentWorkspaceBuildJob.ID, stats.TerminatedJobIDs[0]) + + // Check that the current provisioner job was updated. + job, err := db.GetProvisionerJobByID(ctx, currentWorkspaceBuildJob.ID) + require.NoError(t, err) + require.WithinDuration(t, now, job.UpdatedAt, 30*time.Second) + require.True(t, job.CompletedAt.Valid) + require.WithinDuration(t, now, job.CompletedAt.Time, 30*time.Second) + require.True(t, job.StartedAt.Valid) + require.WithinDuration(t, now, job.StartedAt.Time, 30*time.Second) + require.True(t, job.Error.Valid) + require.Contains(t, job.Error.String, "Build has been detected as pending") + require.False(t, job.ErrorCode.Valid) + + // Check that the provisioner state was NOT updated. + build, err := db.GetWorkspaceBuildByID(ctx, currentWorkspaceBuild.ID) + require.NoError(t, err) + require.Equal(t, expectedWorkspaceBuildState, build.ProvisionerState) + + detector.Close() + detector.Wait() +} + func TestDetectorHungOtherJobTypes(t *testing.T) { t.Parallel() @@ -447,7 +541,7 @@ func TestDetectorHungOtherJobTypes(t *testing.T) { db, pubsub = dbtestutil.NewDB(t) log = testutil.Logger(t) tickCh = make(chan time.Time) - statsCh = make(chan unhanger.Stats) + statsCh = make(chan jobreaper.Stats) ) var ( @@ -509,7 +603,7 @@ func TestDetectorHungOtherJobTypes(t *testing.T) { t.Log("template import job ID: ", templateImportJob.ID) t.Log("template dry-run job ID: ", templateDryRunJob.ID) - detector := unhanger.New(ctx, wrapDBAuthz(db, log), pubsub, log, tickCh).WithStatsChannel(statsCh) + detector := jobreaper.New(ctx, wrapDBAuthz(db, log), pubsub, log, tickCh).WithStatsChannel(statsCh) detector.Start() tickCh <- now @@ -543,6 +637,113 @@ func TestDetectorHungOtherJobTypes(t *testing.T) { detector.Wait() } +func TestDetectorPendingOtherJobTypes(t *testing.T) { + t.Parallel() + + var ( + ctx = testutil.Context(t, testutil.WaitLong) + db, pubsub = dbtestutil.NewDB(t) + log = testutil.Logger(t) + tickCh = make(chan time.Time) + statsCh = make(chan jobreaper.Stats) + ) + + var ( + now = time.Now() + thirtyFiveMinAgo = now.Add(-time.Minute * 35) + org = dbgen.Organization(t, db, database.Organization{}) + user = dbgen.User(t, db, database.User{}) + file = dbgen.File(t, db, database.File{}) + + // Template import job. + templateImportJob = dbgen.ProvisionerJob(t, db, pubsub, database.ProvisionerJob{ + CreatedAt: thirtyFiveMinAgo, + UpdatedAt: thirtyFiveMinAgo, + StartedAt: sql.NullTime{ + Time: time.Time{}, + Valid: false, + }, + OrganizationID: org.ID, + InitiatorID: user.ID, + Provisioner: database.ProvisionerTypeEcho, + StorageMethod: database.ProvisionerStorageMethodFile, + FileID: file.ID, + Type: database.ProvisionerJobTypeTemplateVersionImport, + Input: []byte("{}"), + }) + _ = dbgen.TemplateVersion(t, db, database.TemplateVersion{ + OrganizationID: org.ID, + JobID: templateImportJob.ID, + CreatedBy: user.ID, + }) + ) + + // Template dry-run job. + dryRunVersion := dbgen.TemplateVersion(t, db, database.TemplateVersion{ + OrganizationID: org.ID, + CreatedBy: user.ID, + }) + input, err := json.Marshal(provisionerdserver.TemplateVersionDryRunJob{ + TemplateVersionID: dryRunVersion.ID, + }) + require.NoError(t, err) + templateDryRunJob := dbgen.ProvisionerJob(t, db, pubsub, database.ProvisionerJob{ + CreatedAt: thirtyFiveMinAgo, + UpdatedAt: thirtyFiveMinAgo, + StartedAt: sql.NullTime{ + Time: time.Time{}, + Valid: false, + }, + OrganizationID: org.ID, + InitiatorID: user.ID, + Provisioner: database.ProvisionerTypeEcho, + StorageMethod: database.ProvisionerStorageMethodFile, + FileID: file.ID, + Type: database.ProvisionerJobTypeTemplateVersionDryRun, + Input: input, + }) + + t.Log("template import job ID: ", templateImportJob.ID) + t.Log("template dry-run job ID: ", templateDryRunJob.ID) + + detector := jobreaper.New(ctx, wrapDBAuthz(db, log), pubsub, log, tickCh).WithStatsChannel(statsCh) + detector.Start() + tickCh <- now + + stats := <-statsCh + require.NoError(t, stats.Error) + require.Len(t, stats.TerminatedJobIDs, 2) + require.Contains(t, stats.TerminatedJobIDs, templateImportJob.ID) + require.Contains(t, stats.TerminatedJobIDs, templateDryRunJob.ID) + + // Check that the template import job was updated. + job, err := db.GetProvisionerJobByID(ctx, templateImportJob.ID) + require.NoError(t, err) + require.WithinDuration(t, now, job.UpdatedAt, 30*time.Second) + require.True(t, job.CompletedAt.Valid) + require.WithinDuration(t, now, job.CompletedAt.Time, 30*time.Second) + require.True(t, job.StartedAt.Valid) + require.WithinDuration(t, now, job.StartedAt.Time, 30*time.Second) + require.True(t, job.Error.Valid) + require.Contains(t, job.Error.String, "Build has been detected as pending") + require.False(t, job.ErrorCode.Valid) + + // Check that the template dry-run job was updated. + job, err = db.GetProvisionerJobByID(ctx, templateDryRunJob.ID) + require.NoError(t, err) + require.WithinDuration(t, now, job.UpdatedAt, 30*time.Second) + require.True(t, job.CompletedAt.Valid) + require.WithinDuration(t, now, job.CompletedAt.Time, 30*time.Second) + require.True(t, job.StartedAt.Valid) + require.WithinDuration(t, now, job.StartedAt.Time, 30*time.Second) + require.True(t, job.Error.Valid) + require.Contains(t, job.Error.String, "Build has been detected as pending") + require.False(t, job.ErrorCode.Valid) + + detector.Close() + detector.Wait() +} + func TestDetectorHungCanceledJob(t *testing.T) { t.Parallel() @@ -551,7 +752,7 @@ func TestDetectorHungCanceledJob(t *testing.T) { db, pubsub = dbtestutil.NewDB(t) log = testutil.Logger(t) tickCh = make(chan time.Time) - statsCh = make(chan unhanger.Stats) + statsCh = make(chan jobreaper.Stats) ) var ( @@ -591,7 +792,7 @@ func TestDetectorHungCanceledJob(t *testing.T) { t.Log("template import job ID: ", templateImportJob.ID) - detector := unhanger.New(ctx, wrapDBAuthz(db, log), pubsub, log, tickCh).WithStatsChannel(statsCh) + detector := jobreaper.New(ctx, wrapDBAuthz(db, log), pubsub, log, tickCh).WithStatsChannel(statsCh) detector.Start() tickCh <- now @@ -653,7 +854,7 @@ func TestDetectorPushesLogs(t *testing.T) { db, pubsub = dbtestutil.NewDB(t) log = testutil.Logger(t) tickCh = make(chan time.Time) - statsCh = make(chan unhanger.Stats) + statsCh = make(chan jobreaper.Stats) ) var ( @@ -706,7 +907,7 @@ func TestDetectorPushesLogs(t *testing.T) { require.Len(t, logs, 10) } - detector := unhanger.New(ctx, wrapDBAuthz(db, log), pubsub, log, tickCh).WithStatsChannel(statsCh) + detector := jobreaper.New(ctx, wrapDBAuthz(db, log), pubsub, log, tickCh).WithStatsChannel(statsCh) detector.Start() // Create pubsub subscription to listen for new log events. @@ -741,12 +942,19 @@ func TestDetectorPushesLogs(t *testing.T) { CreatedAfter: after, }) require.NoError(t, err) - require.Len(t, logs, len(unhanger.HungJobLogMessages)) + threshold := jobreaper.HungJobDuration + jobType := jobreaper.Hung + if templateImportJob.JobStatus == database.ProvisionerJobStatusPending { + threshold = jobreaper.PendingJobDuration + jobType = jobreaper.Pending + } + expectedLogs := jobreaper.JobLogMessages(jobType, threshold) + require.Len(t, logs, len(expectedLogs)) for i, log := range logs { assert.Equal(t, database.LogLevelError, log.Level) assert.Equal(t, c.expectStage, log.Stage) assert.Equal(t, database.LogSourceProvisionerDaemon, log.Source) - assert.Equal(t, unhanger.HungJobLogMessages[i], log.Output) + assert.Equal(t, expectedLogs[i], log.Output) } // Double check the full log count. @@ -755,7 +963,7 @@ func TestDetectorPushesLogs(t *testing.T) { CreatedAfter: 0, }) require.NoError(t, err) - require.Len(t, logs, c.preLogCount+len(unhanger.HungJobLogMessages)) + require.Len(t, logs, c.preLogCount+len(expectedLogs)) detector.Close() detector.Wait() @@ -771,15 +979,15 @@ func TestDetectorMaxJobsPerRun(t *testing.T) { db, pubsub = dbtestutil.NewDB(t) log = testutil.Logger(t) tickCh = make(chan time.Time) - statsCh = make(chan unhanger.Stats) + statsCh = make(chan jobreaper.Stats) org = dbgen.Organization(t, db, database.Organization{}) user = dbgen.User(t, db, database.User{}) file = dbgen.File(t, db, database.File{}) ) - // Create unhanger.MaxJobsPerRun + 1 hung jobs. + // Create MaxJobsPerRun + 1 hung jobs. now := time.Now() - for i := 0; i < unhanger.MaxJobsPerRun+1; i++ { + for i := 0; i < jobreaper.MaxJobsPerRun+1; i++ { pj := dbgen.ProvisionerJob(t, db, pubsub, database.ProvisionerJob{ CreatedAt: now.Add(-time.Hour), UpdatedAt: now.Add(-time.Hour), @@ -802,14 +1010,14 @@ func TestDetectorMaxJobsPerRun(t *testing.T) { }) } - detector := unhanger.New(ctx, wrapDBAuthz(db, log), pubsub, log, tickCh).WithStatsChannel(statsCh) + detector := jobreaper.New(ctx, wrapDBAuthz(db, log), pubsub, log, tickCh).WithStatsChannel(statsCh) detector.Start() tickCh <- now - // Make sure that only unhanger.MaxJobsPerRun jobs are terminated. + // Make sure that only MaxJobsPerRun jobs are terminated. stats := <-statsCh require.NoError(t, stats.Error) - require.Len(t, stats.TerminatedJobIDs, unhanger.MaxJobsPerRun) + require.Len(t, stats.TerminatedJobIDs, jobreaper.MaxJobsPerRun) // Run the detector again and make sure that only the remaining job is // terminated. @@ -823,7 +1031,7 @@ func TestDetectorMaxJobsPerRun(t *testing.T) { } // wrapDBAuthz adds our Authorization/RBAC around the given database store, to -// ensure the unhanger has the right permissions to do its work. +// ensure the reaper has the right permissions to do its work. func wrapDBAuthz(db database.Store, logger slog.Logger) database.Store { return dbauthz.New( db, diff --git a/coderd/rbac/authz.go b/coderd/rbac/authz.go index d2c6d5d0675be..c63042a2a1363 100644 --- a/coderd/rbac/authz.go +++ b/coderd/rbac/authz.go @@ -65,7 +65,7 @@ const ( SubjectTypeUser SubjectType = "user" SubjectTypeProvisionerd SubjectType = "provisionerd" SubjectTypeAutostart SubjectType = "autostart" - SubjectTypeHangDetector SubjectType = "hang_detector" + SubjectTypeJobReaper SubjectType = "job_reaper" SubjectTypeResourceMonitor SubjectType = "resource_monitor" SubjectTypeCryptoKeyRotator SubjectType = "crypto_key_rotator" SubjectTypeCryptoKeyReader SubjectType = "crypto_key_reader" diff --git a/coderd/rbac/object_gen.go b/coderd/rbac/object_gen.go index 40b7dc87a56f8..ad1a510fd44bd 100644 --- a/coderd/rbac/object_gen.go +++ b/coderd/rbac/object_gen.go @@ -234,7 +234,9 @@ var ( // ResourceProvisionerJobs // Valid Actions + // - "ActionCreate" :: create provisioner jobs // - "ActionRead" :: read provisioner jobs + // - "ActionUpdate" :: update provisioner jobs ResourceProvisionerJobs = Object{ Type: "provisioner_jobs", } diff --git a/coderd/rbac/policy/policy.go b/coderd/rbac/policy/policy.go index 35da0892abfdb..c37e84c48f964 100644 --- a/coderd/rbac/policy/policy.go +++ b/coderd/rbac/policy/policy.go @@ -182,7 +182,9 @@ var RBACPermissions = map[string]PermissionDefinition{ }, "provisioner_jobs": { Actions: map[Action]ActionDefinition{ - ActionRead: actDef("read provisioner jobs"), + ActionRead: actDef("read provisioner jobs"), + ActionUpdate: actDef("update provisioner jobs"), + ActionCreate: actDef("create provisioner jobs"), }, }, "organization": { diff --git a/coderd/rbac/roles.go b/coderd/rbac/roles.go index 56124faee44e2..0b94a74201b16 100644 --- a/coderd/rbac/roles.go +++ b/coderd/rbac/roles.go @@ -503,7 +503,7 @@ func ReloadBuiltinRoles(opts *RoleOptions) { // the ability to create templates and provisioners has // a lot of overlap. ResourceProvisionerDaemon.Type: {policy.ActionCreate, policy.ActionRead, policy.ActionUpdate, policy.ActionDelete}, - ResourceProvisionerJobs.Type: {policy.ActionRead}, + ResourceProvisionerJobs.Type: {policy.ActionRead, policy.ActionUpdate, policy.ActionCreate}, }), }, User: []Permission{}, diff --git a/coderd/rbac/roles_test.go b/coderd/rbac/roles_test.go index e90c89914fdec..6d42a01474d1a 100644 --- a/coderd/rbac/roles_test.go +++ b/coderd/rbac/roles_test.go @@ -580,7 +580,7 @@ func TestRolePermissions(t *testing.T) { }, { Name: "ProvisionerJobs", - Actions: []policy.Action{policy.ActionRead}, + Actions: []policy.Action{policy.ActionRead, policy.ActionUpdate, policy.ActionCreate}, Resource: rbac.ResourceProvisionerJobs.InOrg(orgID), AuthorizeMap: map[bool][]hasAuthSubjects{ true: {owner, orgTemplateAdmin, orgAdmin}, diff --git a/codersdk/deployment.go b/codersdk/deployment.go index 0741bf9e3844a..39b67feb2c73a 100644 --- a/codersdk/deployment.go +++ b/codersdk/deployment.go @@ -345,7 +345,7 @@ type DeploymentValues struct { // HTTPAddress is a string because it may be set to zero to disable. HTTPAddress serpent.String `json:"http_address,omitempty" typescript:",notnull"` AutobuildPollInterval serpent.Duration `json:"autobuild_poll_interval,omitempty"` - JobHangDetectorInterval serpent.Duration `json:"job_hang_detector_interval,omitempty"` + JobReaperDetectorInterval serpent.Duration `json:"job_hang_detector_interval,omitempty"` DERP DERP `json:"derp,omitempty" typescript:",notnull"` Prometheus PrometheusConfig `json:"prometheus,omitempty" typescript:",notnull"` Pprof PprofConfig `json:"pprof,omitempty" typescript:",notnull"` @@ -1287,13 +1287,13 @@ func (c *DeploymentValues) Options() serpent.OptionSet { Annotations: serpent.Annotations{}.Mark(annotationFormatDuration, "true"), }, { - Name: "Job Hang Detector Interval", - Description: "Interval to poll for hung jobs and automatically terminate them.", + Name: "Job Reaper Detect Interval", + Description: "Interval to poll for hung and pending jobs and automatically terminate them.", Flag: "job-hang-detector-interval", Env: "CODER_JOB_HANG_DETECTOR_INTERVAL", Hidden: true, Default: time.Minute.String(), - Value: &c.JobHangDetectorInterval, + Value: &c.JobReaperDetectorInterval, YAML: "jobHangDetectorInterval", Annotations: serpent.Annotations{}.Mark(annotationFormatDuration, "true"), }, diff --git a/codersdk/rbacresources_gen.go b/codersdk/rbacresources_gen.go index 54f65767928d6..6157281f21356 100644 --- a/codersdk/rbacresources_gen.go +++ b/codersdk/rbacresources_gen.go @@ -90,7 +90,7 @@ var RBACResourceActions = map[RBACResource][]RBACAction{ ResourceOrganization: {ActionCreate, ActionDelete, ActionRead, ActionUpdate}, ResourceOrganizationMember: {ActionCreate, ActionDelete, ActionRead, ActionUpdate}, ResourceProvisionerDaemon: {ActionCreate, ActionDelete, ActionRead, ActionUpdate}, - ResourceProvisionerJobs: {ActionRead}, + ResourceProvisionerJobs: {ActionCreate, ActionRead, ActionUpdate}, ResourceReplicas: {ActionRead}, ResourceSystem: {ActionCreate, ActionDelete, ActionRead, ActionUpdate}, ResourceTailnetCoordinator: {ActionCreate, ActionDelete, ActionRead, ActionUpdate}, diff --git a/provisioner/terraform/serve.go b/provisioner/terraform/serve.go index 562946d8ef92e..3e671b0c68e56 100644 --- a/provisioner/terraform/serve.go +++ b/provisioner/terraform/serve.go @@ -16,7 +16,7 @@ import ( "cdr.dev/slog" "github.com/coder/coder/v2/coderd/database" - "github.com/coder/coder/v2/coderd/unhanger" + "github.com/coder/coder/v2/coderd/jobreaper" "github.com/coder/coder/v2/provisionersdk" ) @@ -39,9 +39,9 @@ type ServeOptions struct { // // This is a no-op on Windows where the process can't be interrupted. // - // Default value: 3 minutes (unhanger.HungJobExitTimeout). This value should + // Default value: 3 minutes (jobreaper.HungJobExitTimeout). This value should // be kept less than the value that Coder uses to mark hung jobs as failed, - // which is 5 minutes (see unhanger package). + // which is 5 minutes (see jobreaper package). ExitTimeout time.Duration } @@ -131,7 +131,7 @@ func Serve(ctx context.Context, options *ServeOptions) error { options.Tracer = trace.NewNoopTracerProvider().Tracer("noop") } if options.ExitTimeout == 0 { - options.ExitTimeout = unhanger.HungJobExitTimeout + options.ExitTimeout = jobreaper.HungJobExitTimeout } return provisionersdk.Serve(ctx, &server{ execMut: &sync.Mutex{}, diff --git a/site/src/api/rbacresourcesGenerated.ts b/site/src/api/rbacresourcesGenerated.ts index 079dcb4a87a61..3acb86c079908 100644 --- a/site/src/api/rbacresourcesGenerated.ts +++ b/site/src/api/rbacresourcesGenerated.ts @@ -130,7 +130,9 @@ export const RBACResourceActions: Partial< update: "update a provisioner daemon", }, provisioner_jobs: { + create: "create provisioner jobs", read: "read provisioner jobs", + update: "update provisioner jobs", }, replicas: { read: "read replicas",