Skip to content

feat: cancel stuck pending jobs #17803

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 24 commits into from
May 20, 2025
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
0f51f35
added queries for fetching NotStartedProvisionerJobs
ibetitsmike Apr 1, 2025
2f3d606
added detector handling of not started jobs
ibetitsmike May 9, 2025
4b252eb
filling out started_at when unhanging not started jobs
ibetitsmike May 9, 2025
ca49519
WIP
ibetitsmike May 13, 2025
af994c2
refactored to reaper & added tests
ibetitsmike May 13, 2025
3815727
Revert "filling out started_at when unhanging not started jobs"
ibetitsmike May 13, 2025
b65f620
created new ORM update to avoid forcing setting StartedAt on every Co…
ibetitsmike May 13, 2025
3c7c323
added missing dbauthz tests
ibetitsmike May 13, 2025
35df01f
added checks for StartedAt value in the updated jobs
ibetitsmike May 13, 2025
8aa1ee2
refactor from reaper to jobreaper
ibetitsmike May 14, 2025
4385933
WIP
ibetitsmike May 14, 2025
96fee51
WIP
ibetitsmike May 14, 2025
d8db119
WIP
ibetitsmike May 15, 2025
5120fb1
WIP
ibetitsmike May 15, 2025
8d4fa5a
fixed sql comments
ibetitsmike May 15, 2025
18b809c
taking a step back with RBAC
ibetitsmike May 16, 2025
0fe1404
WIP
ibetitsmike May 16, 2025
77be34e
WIP
ibetitsmike May 16, 2025
4351529
WIP
ibetitsmike May 16, 2025
c03bfa3
fixed InOrg check for provisionerjob resource
ibetitsmike May 19, 2025
a15bd1c
PR review; naming in the comments, added comments for SQL, less verbo…
ibetitsmike May 19, 2025
5b9348f
fixes to tests after lint remove rand
ibetitsmike May 19, 2025
91d2d32
readded rand to fix gen failing in CI
ibetitsmike May 19, 2025
767cb77
adjusted TODOs
ibetitsmike May 20, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
WIP
  • Loading branch information
ibetitsmike committed May 14, 2025
commit 4385933bd0f1c5d0741f3fc8f4909d1a1d3e336e
10 changes: 5 additions & 5 deletions cli/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -1127,11 +1127,11 @@ func (r *RootCmd) Server(newAPI func(context.Context, *coderd.Options) (*coderd.
ctx, options.Database, options.Pubsub, options.PrometheusRegistry, coderAPI.TemplateScheduleStore, &coderAPI.Auditor, coderAPI.AccessControlStore, logger, autobuildTicker.C, options.NotificationsEnqueuer)
autobuildExecutor.Run()

hangDetectorTicker := time.NewTicker(vals.JobHangDetectorInterval.Value())
defer hangDetectorTicker.Stop()
hangDetector := jobreaper.New(ctx, options.Database, options.Pubsub, logger, hangDetectorTicker.C)
hangDetector.Start()
defer hangDetector.Close()
jobReaperTicker := time.NewTicker(vals.JobReaperDetectorInterval.Value())
defer jobReaperTicker.Stop()
jobReaper := jobreaper.New(ctx, options.Database, options.Pubsub, logger, jobReaperTicker.C)
jobReaper.Start()
defer jobReaper.Close()

waitForProvisionerJobs := false
// Currently there is no way to ask the server to shut
Expand Down
10 changes: 5 additions & 5 deletions coderd/coderdtest/coderdtest.go
Original file line number Diff line number Diff line change
Expand Up @@ -365,11 +365,11 @@ func NewOptions(t testing.TB, options *Options) (func(http.Handler), context.Can
).WithStatsChannel(options.AutobuildStats)
lifecycleExecutor.Run()

hangDetectorTicker := time.NewTicker(options.DeploymentValues.JobHangDetectorInterval.Value())
defer hangDetectorTicker.Stop()
hangDetector := jobreaper.New(ctx, options.Database, options.Pubsub, options.Logger.Named("reaper.detector"), hangDetectorTicker.C)
hangDetector.Start()
t.Cleanup(hangDetector.Close)
jobReaperTicker := time.NewTicker(options.DeploymentValues.JobReaperDetectorInterval.Value())
defer jobReaperTicker.Stop()
jobReaper := jobreaper.New(ctx, options.Database, options.Pubsub, options.Logger.Named("reaper.detector"), jobReaperTicker.C)
jobReaper.Start()
t.Cleanup(jobReaper.Close)

if options.TelemetryReporter == nil {
options.TelemetryReporter = telemetry.NewNoop()
Expand Down
55 changes: 32 additions & 23 deletions coderd/database/dbauthz/dbauthz.go
Original file line number Diff line number Diff line change
Expand Up @@ -220,18 +220,19 @@ var (
}.WithCachedASTValue()

// See jobreaper package.
subjectHangDetector = rbac.Subject{
Type: rbac.SubjectTypeHangDetector,
FriendlyName: "Hang Detector",
subjectJobReaper = rbac.Subject{
Type: rbac.SubjectTypeJobReaper,
FriendlyName: "Job Reaper",
ID: uuid.Nil.String(),
Roles: rbac.Roles([]rbac.Role{
{
Identifier: rbac.RoleIdentifier{Name: "hangdetector"},
DisplayName: "Hang Detector Daemon",
Identifier: rbac.RoleIdentifier{Name: "jobreaper"},
DisplayName: "Job Reaper Daemon",
Site: rbac.Permissions(map[string][]policy.Action{
rbac.ResourceSystem.Type: {policy.WildcardSymbol},
rbac.ResourceTemplate.Type: {policy.ActionRead},
rbac.ResourceWorkspace.Type: {policy.ActionRead, policy.ActionUpdate},
rbac.ResourceSystem.Type: {policy.WildcardSymbol},
rbac.ResourceTemplate.Type: {policy.ActionRead},
rbac.ResourceWorkspace.Type: {policy.ActionRead, policy.ActionUpdate},
rbac.ResourceProvisionerJobs.Type: {policy.ActionRead, policy.ActionUpdate},
}),
Org: map[string][]rbac.Permission{},
User: []rbac.Permission{},
Expand Down Expand Up @@ -407,10 +408,10 @@ func AsAutostart(ctx context.Context) context.Context {
return As(ctx, subjectAutostart)
}

// AsHangDetector returns a context with an actor that has permissions required
// AsJobReaper returns a context with an actor that has permissions required
// for jobreaper.Detector to function.
func AsHangDetector(ctx context.Context) context.Context {
return As(ctx, subjectHangDetector)
func AsJobReaper(ctx context.Context) context.Context {
return As(ctx, subjectJobReaper)
}

// AsKeyRotator returns a context with an actor that has permissions required for rotating crypto keys.
Expand Down Expand Up @@ -1074,6 +1075,13 @@ func (q *querier) customRoleCheck(ctx context.Context, role database.CustomRole)
return nil
}

// GetPendingProvisionerJobs checks that the actor in ctx may read
// rbac.ResourceProvisionerJobs and, when permitted, returns whatever the
// wrapped store reports for lastUpdatedSince. A failed authorization is
// returned as-is together with a nil slice.
func (q *querier) GetPendingProvisionerJobs(ctx context.Context, lastUpdatedSince time.Time) ([]database.ProvisionerJob, error) {
	authErr := q.authorizeContext(ctx, policy.ActionRead, rbac.ResourceProvisionerJobs)
	if authErr != nil {
		return nil, authErr
	}

	return q.db.GetPendingProvisionerJobs(ctx, lastUpdatedSince)
}

// AcquireLock forwards the lock request for id to the wrapped store and
// returns its error unchanged. No authorization check is performed in this
// method.
func (q *querier) AcquireLock(ctx context.Context, id int64) error {
return q.db.AcquireLock(ctx, id)
}
Expand Down Expand Up @@ -1912,11 +1920,10 @@ func (q *querier) GetHealthSettings(ctx context.Context) (string, error) {
return q.db.GetHealthSettings(ctx)
}

// GetHungProvisionerJobs authorizes policy.ActionRead on
// rbac.ResourceProvisionerJobs and, when permitted, returns the wrapped
// store's result for hungSince unchanged. An authorization failure is
// returned as-is together with a nil slice.
func (q *querier) GetHungProvisionerJobs(ctx context.Context, hungSince time.Time) ([]database.ProvisionerJob, error) {
if err := q.authorizeContext(ctx, policy.ActionRead, rbac.ResourceProvisionerJobs); err != nil {
return nil, err
}
return q.db.GetHungProvisionerJobs(ctx, hungSince)
}

Expand Down Expand Up @@ -1992,10 +1999,6 @@ func (q *querier) GetLogoURL(ctx context.Context) (string, error) {
return q.db.GetLogoURL(ctx)
}

// GetNotStartedProvisionerJobs forwards the call to the wrapped store and
// returns its result unchanged. No authorization check is performed in this
// method.
func (q *querier) GetNotStartedProvisionerJobs(ctx context.Context, notStartedSince time.Time) ([]database.ProvisionerJob, error) {
return q.db.GetNotStartedProvisionerJobs(ctx, notStartedSince)
}

func (q *querier) GetNotificationMessagesByStatus(ctx context.Context, arg database.GetNotificationMessagesByStatusParams) ([]database.NotificationMessage, error) {
if err := q.authorizeContext(ctx, policy.ActionRead, rbac.ResourceNotificationMessage); err != nil {
return nil, err
Expand Down Expand Up @@ -4180,6 +4183,10 @@ func (q *querier) UpdateProvisionerJobByID(ctx context.Context, arg database.Upd
}

func (q *querier) UpdateProvisionerJobWithCancelByID(ctx context.Context, arg database.UpdateProvisionerJobWithCancelByIDParams) error {
if err := q.authorizeContext(ctx, policy.ActionUpdate, rbac.ResourceProvisionerJobs); err != nil {
return err
}

job, err := q.db.GetProvisionerJobByID(ctx, arg.ID)
if err != nil {
return err
Expand Down Expand Up @@ -4246,15 +4253,17 @@ func (q *querier) UpdateProvisionerJobWithCancelByID(ctx context.Context, arg da
return q.db.UpdateProvisionerJobWithCancelByID(ctx, arg)
}

// UpdateProvisionerJobWithCompleteByID authorizes policy.ActionUpdate on
// rbac.ResourceProvisionerJobs before delegating the update to the wrapped
// store. An authorization failure is returned without calling the store.
func (q *querier) UpdateProvisionerJobWithCompleteByID(ctx context.Context, arg database.UpdateProvisionerJobWithCompleteByIDParams) error {
if err := q.authorizeContext(ctx, policy.ActionUpdate, rbac.ResourceProvisionerJobs); err != nil {
return err
}
return q.db.UpdateProvisionerJobWithCompleteByID(ctx, arg)
}

// UpdateProvisionerJobWithCompleteWithStartedAtByID requires
// policy.ActionUpdate on rbac.ResourceProvisionerJobs. When the check passes
// the update is handed to the wrapped store and its error is returned
// unchanged; otherwise the authorization error is returned and the store is
// not called.
func (q *querier) UpdateProvisionerJobWithCompleteWithStartedAtByID(ctx context.Context, arg database.UpdateProvisionerJobWithCompleteWithStartedAtByIDParams) error {
	authErr := q.authorizeContext(ctx, policy.ActionUpdate, rbac.ResourceProvisionerJobs)
	if authErr != nil {
		return authErr
	}

	return q.db.UpdateProvisionerJobWithCompleteWithStartedAtByID(ctx, arg)
}

Expand Down
29 changes: 10 additions & 19 deletions coderd/database/dbauthz/dbauthz_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -3891,9 +3891,8 @@ func (s *MethodTestSuite) TestSystemFunctions() {
check.Args().Asserts(rbac.ResourceSystem, policy.ActionDelete)
}))
s.Run("GetProvisionerJobsCreatedAfter", s.Subtest(func(db database.Store, check *expects) {
// TODO: add provisioner job resource type
_ = dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{CreatedAt: time.Now().Add(-time.Hour)})
check.Args(time.Now()).Asserts( /*rbac.ResourceSystem, policy.ActionRead*/ )
check.Args(time.Now()).Asserts(rbac.ResourceSystem, policy.ActionRead)
}))
s.Run("GetTemplateVersionsByIDs", s.Subtest(func(db database.Store, check *expects) {
dbtestutil.DisableForeignKeysAndTriggers(s.T(), db)
Expand Down Expand Up @@ -3976,11 +3975,10 @@ func (s *MethodTestSuite) TestSystemFunctions() {
Returns([]database.WorkspaceAgent{agt})
}))
s.Run("GetProvisionerJobsByIDs", s.Subtest(func(db database.Store, check *expects) {
// TODO: add a ProvisionerJob resource type
a := dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{})
b := dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{})
check.Args([]uuid.UUID{a.ID, b.ID}).
Asserts( /*rbac.ResourceSystem, policy.ActionRead*/ ).
Asserts(rbac.ResourceSystem, policy.ActionRead).
Returns(slice.New(a, b))
}))
s.Run("InsertWorkspaceAgent", s.Subtest(func(db database.Store, check *expects) {
Expand Down Expand Up @@ -4015,7 +4013,6 @@ func (s *MethodTestSuite) TestSystemFunctions() {
}).Asserts(rbac.ResourceSystem, policy.ActionUpdate).Returns()
}))
s.Run("AcquireProvisionerJob", s.Subtest(func(db database.Store, check *expects) {
// TODO: we need to create a ProvisionerJob resource
j := dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{
StartedAt: sql.NullTime{Valid: false},
UpdatedAt: time.Now(),
Expand All @@ -4025,54 +4022,48 @@ func (s *MethodTestSuite) TestSystemFunctions() {
OrganizationID: j.OrganizationID,
Types: []database.ProvisionerType{j.Provisioner},
ProvisionerTags: must(json.Marshal(j.Tags)),
}).Asserts( /*rbac.ResourceSystem, policy.ActionUpdate*/ )
}).Asserts(rbac.ResourceSystem, policy.ActionUpdate)
}))
s.Run("UpdateProvisionerJobWithCompleteByID", s.Subtest(func(db database.Store, check *expects) {
// TODO: we need to create a ProvisionerJob resource
j := dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{})
check.Args(database.UpdateProvisionerJobWithCompleteByIDParams{
ID: j.ID,
}).Asserts( /*rbac.ResourceSystem, policy.ActionUpdate*/ )
}).Asserts(rbac.ResourceSystem, policy.ActionUpdate)
}))
s.Run("UpdateProvisionerJobWithCompleteWithStartedAtByID", s.Subtest(func(db database.Store, check *expects) {
// TODO: we need to create a ProvisionerJob resource
j := dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{})
check.Args(database.UpdateProvisionerJobWithCompleteWithStartedAtByIDParams{
ID: j.ID,
}).Asserts( /*rbac.ResourceSystem, policy.ActionUpdate*/ )
}).Asserts(rbac.ResourceSystem, policy.ActionUpdate)
}))
s.Run("UpdateProvisionerJobByID", s.Subtest(func(db database.Store, check *expects) {
// TODO: we need to create a ProvisionerJob resource
j := dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{})
check.Args(database.UpdateProvisionerJobByIDParams{
ID: j.ID,
UpdatedAt: time.Now(),
}).Asserts( /*rbac.ResourceSystem, policy.ActionUpdate*/ )
}).Asserts(rbac.ResourceSystem, policy.ActionUpdate)
}))
s.Run("InsertProvisionerJob", s.Subtest(func(db database.Store, check *expects) {
dbtestutil.DisableForeignKeysAndTriggers(s.T(), db)
// TODO: we need to create a ProvisionerJob resource
check.Args(database.InsertProvisionerJobParams{
ID: uuid.New(),
Provisioner: database.ProvisionerTypeEcho,
StorageMethod: database.ProvisionerStorageMethodFile,
Type: database.ProvisionerJobTypeWorkspaceBuild,
Input: json.RawMessage("{}"),
}).Asserts( /*rbac.ResourceSystem, policy.ActionCreate*/ )
}).Asserts(rbac.ResourceSystem, policy.ActionCreate)
}))
s.Run("InsertProvisionerJobLogs", s.Subtest(func(db database.Store, check *expects) {
// TODO: we need to create a ProvisionerJob resource
j := dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{})
check.Args(database.InsertProvisionerJobLogsParams{
JobID: j.ID,
}).Asserts( /*rbac.ResourceSystem, policy.ActionCreate*/ )
}).Asserts(rbac.ResourceSystem, policy.ActionCreate)
}))
s.Run("InsertProvisionerJobTimings", s.Subtest(func(db database.Store, check *expects) {
// TODO: we need to create a ProvisionerJob resource
j := dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{})
check.Args(database.InsertProvisionerJobTimingsParams{
JobID: j.ID,
}).Asserts( /*rbac.ResourceSystem, policy.ActionCreate*/ )
}).Asserts(rbac.ResourceSystem, policy.ActionCreate)
}))
s.Run("UpsertProvisionerDaemon", s.Subtest(func(db database.Store, check *expects) {
dbtestutil.DisableForeignKeysAndTriggers(s.T(), db)
Expand Down Expand Up @@ -4211,7 +4202,7 @@ func (s *MethodTestSuite) TestSystemFunctions() {
s.Run("GetHungProvisionerJobs", s.Subtest(func(db database.Store, check *expects) {
check.Args(time.Time{}).Asserts()
}))
s.Run("GetNotStartedProvisionerJobs", s.Subtest(func(db database.Store, check *expects) {
s.Run("GetPendingProvisionerJobs", s.Subtest(func(db database.Store, check *expects) {
check.Args(time.Time{}).Asserts()
}))
s.Run("UpsertOAuthSigningKey", s.Subtest(func(db database.Store, check *expects) {
Expand Down
38 changes: 21 additions & 17 deletions coderd/database/dbmem/dbmem.go
Original file line number Diff line number Diff line change
Expand Up @@ -1386,6 +1386,10 @@ func isDeprecated(template database.Template) bool {
return template.Deprecated != ""
}

// GetProvisionerJobsToBeReaped is not implemented by the in-memory store.
// It always returns a nil slice and ErrUnimplemented, matching the other
// unimplemented queries in this file, instead of panicking and taking down
// the calling process.
func (*FakeQuerier) GetProvisionerJobsToBeReaped(_ context.Context, _ time.Time) ([]database.ProvisionerJob, error) {
	return nil, ErrUnimplemented
}

// AcquireLock always fails on this receiver: it ignores both arguments and
// returns an error stating that the lock may only be taken within a
// transaction.
func (*FakeQuerier) AcquireLock(_ context.Context, _ int64) error {
return xerrors.New("AcquireLock must only be called within a transaction")
}
Expand Down Expand Up @@ -3897,23 +3901,6 @@ func (q *FakeQuerier) GetLogoURL(_ context.Context) (string, error) {
return q.logoURL, nil
}

// GetNotStartedProvisionerJobs returns every stored provisioner job that has
// neither a valid StartedAt nor a valid CompletedAt and whose UpdatedAt is
// before notStartedSince. The store is read under the read lock. The result
// is always a non-nil slice, and the error is always nil.
func (q *FakeQuerier) GetNotStartedProvisionerJobs(ctx context.Context, notStartedSince time.Time) ([]database.ProvisionerJob, error) {
	q.mutex.RLock()
	defer q.mutex.RUnlock()

	matches := []database.ProvisionerJob{}
	for _, job := range q.provisionerJobs {
		if job.StartedAt.Valid || job.CompletedAt.Valid {
			continue
		}
		if !job.UpdatedAt.Before(notStartedSince) {
			continue
		}
		// Maps are reference types: hand the caller its own copy of Tags so
		// it cannot mutate the map held inside dbmem.
		job.Tags = maps.Clone(job.Tags)
		matches = append(matches, job)
	}
	return matches, nil
}

func (q *FakeQuerier) GetNotificationMessagesByStatus(_ context.Context, arg database.GetNotificationMessagesByStatusParams) ([]database.NotificationMessage, error) {
err := validateDatabaseType(arg)
if err != nil {
Expand Down Expand Up @@ -4291,6 +4278,23 @@ func (q *FakeQuerier) GetParameterSchemasByJobID(_ context.Context, jobID uuid.U
return parameters, nil
}

// GetPendingProvisionerJobs scans the stored provisioner jobs under the read
// lock and collects those that were never started, never completed, and last
// updated before lastUpdatedSince. It always returns a non-nil slice and a
// nil error.
func (q *FakeQuerier) GetPendingProvisionerJobs(_ context.Context, lastUpdatedSince time.Time) ([]database.ProvisionerJob, error) {
	q.mutex.RLock()
	defer q.mutex.RUnlock()

	found := []database.ProvisionerJob{}
	for _, candidate := range q.provisionerJobs {
		neverStarted := !candidate.StartedAt.Valid
		unfinished := !candidate.CompletedAt.Valid
		stale := candidate.UpdatedAt.Before(lastUpdatedSince)
		if !(neverStarted && unfinished && stale) {
			continue
		}

		// Clone Tags before appending so the caller cannot mutate the map
		// that dbmem keeps internally.
		candidate.Tags = maps.Clone(candidate.Tags)
		found = append(found, candidate)
	}
	return found, nil
}

// GetPrebuildMetrics is not implemented on this receiver: it ignores its
// context and always returns a nil slice together with ErrUnimplemented.
func (*FakeQuerier) GetPrebuildMetrics(_ context.Context) ([]database.GetPrebuildMetricsRow, error) {
return nil, ErrUnimplemented
}
Expand Down
14 changes: 7 additions & 7 deletions coderd/database/dbmetrics/querymetrics.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

30 changes: 15 additions & 15 deletions coderd/database/dbmock/dbmock.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion coderd/database/querier.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading