From 0f51f357f76adc7bc48582dae0a32023469b0a64 Mon Sep 17 00:00:00 2001 From: Michael Suchacz <203725896+ibetitsmike@users.noreply.github.com> Date: Tue, 1 Apr 2025 16:06:20 +0000 Subject: [PATCH 01/24] added queries for fetching NotStartedProvisionerJobs --- coderd/database/dbauthz/dbauthz.go | 4 ++ coderd/database/dbmem/dbmem.go | 17 +++++++ coderd/database/dbmetrics/querymetrics.go | 7 +++ coderd/database/dbmock/dbmock.go | 15 ++++++ coderd/database/querier.go | 1 + coderd/database/queries.sql.go | 54 +++++++++++++++++++++ coderd/database/queries/provisionerjobs.sql | 10 ++++ 7 files changed, 108 insertions(+) diff --git a/coderd/database/dbauthz/dbauthz.go b/coderd/database/dbauthz/dbauthz.go index 732739b2f09a5..5be38ce940ccb 100644 --- a/coderd/database/dbauthz/dbauthz.go +++ b/coderd/database/dbauthz/dbauthz.go @@ -1992,6 +1992,10 @@ func (q *querier) GetLogoURL(ctx context.Context) (string, error) { return q.db.GetLogoURL(ctx) } +func (q *querier) GetNotStartedProvisionerJobs(ctx context.Context, notStartedSince time.Time) ([]database.ProvisionerJob, error) { + return q.db.GetNotStartedProvisionerJobs(ctx, notStartedSince) +} + func (q *querier) GetNotificationMessagesByStatus(ctx context.Context, arg database.GetNotificationMessagesByStatusParams) ([]database.NotificationMessage, error) { if err := q.authorizeContext(ctx, policy.ActionRead, rbac.ResourceNotificationMessage); err != nil { return nil, err diff --git a/coderd/database/dbmem/dbmem.go b/coderd/database/dbmem/dbmem.go index 2760c0c929c58..01eb2d66ed5de 100644 --- a/coderd/database/dbmem/dbmem.go +++ b/coderd/database/dbmem/dbmem.go @@ -3897,6 +3897,23 @@ func (q *FakeQuerier) GetLogoURL(_ context.Context) (string, error) { return q.logoURL, nil } +func (q *FakeQuerier) GetNotStartedProvisionerJobs(ctx context.Context, notStartedSince time.Time) ([]database.ProvisionerJob, error) { + q.mutex.RLock() + defer q.mutex.RUnlock() + + notStartedJobs := []database.ProvisionerJob{} + for _, provisionerJob := range q.provisionerJobs { + if !provisionerJob.StartedAt.Valid && !provisionerJob.CompletedAt.Valid && provisionerJob.UpdatedAt.Before(notStartedSince) { + // clone the Tags before appending, since maps are reference types and + // we don't want the caller to be able to mutate the map we have inside + // dbmem! + provisionerJob.Tags = maps.Clone(provisionerJob.Tags) + notStartedJobs = append(notStartedJobs, provisionerJob) + } + } + return notStartedJobs, nil +} + func (q *FakeQuerier) GetNotificationMessagesByStatus(_ context.Context, arg database.GetNotificationMessagesByStatusParams) ([]database.NotificationMessage, error) { err := validateDatabaseType(arg) if err != nil { diff --git a/coderd/database/dbmetrics/querymetrics.go b/coderd/database/dbmetrics/querymetrics.go index 128e741da1d76..1761c0c6b9960 100644 --- a/coderd/database/dbmetrics/querymetrics.go +++ b/coderd/database/dbmetrics/querymetrics.go @@ -949,6 +949,13 @@ func (m queryMetricsStore) GetLogoURL(ctx context.Context) (string, error) { return url, err } +func (m queryMetricsStore) GetNotStartedProvisionerJobs(ctx context.Context, updatedAt time.Time) ([]database.ProvisionerJob, error) { + start := time.Now() + r0, r1 := m.s.GetNotStartedProvisionerJobs(ctx, updatedAt) + m.queryLatencies.WithLabelValues("GetNotStartedProvisionerJobs").Observe(time.Since(start).Seconds()) + return r0, r1 +} + func (m queryMetricsStore) GetNotificationMessagesByStatus(ctx context.Context, arg database.GetNotificationMessagesByStatusParams) ([]database.NotificationMessage, error) { start := time.Now() r0, r1 := m.s.GetNotificationMessagesByStatus(ctx, arg) diff --git a/coderd/database/dbmock/dbmock.go b/coderd/database/dbmock/dbmock.go index 17b263dfb2e07..1e3a6cf15cfa8 100644 --- a/coderd/database/dbmock/dbmock.go +++ b/coderd/database/dbmock/dbmock.go @@ -1923,6 +1923,21 @@ func (mr *MockStoreMockRecorder) GetLogoURL(ctx any) *gomock.Call { return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetLogoURL", reflect.TypeOf((*MockStore)(nil).GetLogoURL), ctx) } +// GetNotStartedProvisionerJobs mocks base method. +func (m *MockStore) GetNotStartedProvisionerJobs(ctx context.Context, updatedAt time.Time) ([]database.ProvisionerJob, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "GetNotStartedProvisionerJobs", ctx, updatedAt) + ret0, _ := ret[0].([]database.ProvisionerJob) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// GetNotStartedProvisionerJobs indicates an expected call of GetNotStartedProvisionerJobs. +func (mr *MockStoreMockRecorder) GetNotStartedProvisionerJobs(ctx, updatedAt any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetNotStartedProvisionerJobs", reflect.TypeOf((*MockStore)(nil).GetNotStartedProvisionerJobs), ctx, updatedAt) +} + // GetNotificationMessagesByStatus mocks base method. func (m *MockStore) GetNotificationMessagesByStatus(ctx context.Context, arg database.GetNotificationMessagesByStatusParams) ([]database.NotificationMessage, error) { m.ctrl.T.Helper() diff --git a/coderd/database/querier.go b/coderd/database/querier.go index d0f74ee609724..86647a834d20a 100644 --- a/coderd/database/querier.go +++ b/coderd/database/querier.go @@ -213,6 +213,7 @@ type sqlcQuerier interface { GetLicenseByID(ctx context.Context, id int32) (License, error) GetLicenses(ctx context.Context) ([]License, error) GetLogoURL(ctx context.Context) (string, error) + GetNotStartedProvisionerJobs(ctx context.Context, updatedAt time.Time) ([]ProvisionerJob, error) GetNotificationMessagesByStatus(ctx context.Context, arg GetNotificationMessagesByStatusParams) ([]NotificationMessage, error) // Fetch the notification report generator log indicating recent activity. GetNotificationReportGeneratorLogByTemplate(ctx context.Context, templateID uuid.UUID) (NotificationReportGeneratorLog, error) diff --git a/coderd/database/queries.sql.go b/coderd/database/queries.sql.go index e2e38c36fe10a..c525f7895b410 100644 --- a/coderd/database/queries.sql.go +++ b/coderd/database/queries.sql.go @@ -7430,6 +7430,60 @@ func (q *sqlQuerier) GetHungProvisionerJobs(ctx context.Context, updatedAt time. return items, nil } +const getNotStartedProvisionerJobs = `-- name: GetNotStartedProvisionerJobs :many +SELECT + id, created_at, updated_at, started_at, canceled_at, completed_at, error, organization_id, initiator_id, provisioner, storage_method, type, input, worker_id, file_id, tags, error_code, trace_metadata, job_status +FROM + provisioner_jobs +WHERE + updated_at < $1 + AND started_at IS NULL + AND completed_at IS NULL +` + +func (q *sqlQuerier) GetNotStartedProvisionerJobs(ctx context.Context, updatedAt time.Time) ([]ProvisionerJob, error) { + rows, err := q.db.QueryContext(ctx, getNotStartedProvisionerJobs, updatedAt) + if err != nil { + return nil, err + } + defer rows.Close() + var items []ProvisionerJob + for rows.Next() { + var i ProvisionerJob + if err := rows.Scan( + &i.ID, + &i.CreatedAt, + &i.UpdatedAt, + &i.StartedAt, + &i.CanceledAt, + &i.CompletedAt, + &i.Error, + &i.OrganizationID, + &i.InitiatorID, + &i.Provisioner, + &i.StorageMethod, + &i.Type, + &i.Input, + &i.WorkerID, + &i.FileID, + &i.Tags, + &i.ErrorCode, + &i.TraceMetadata, + &i.JobStatus, + ); err != nil { + return nil, err + } + items = append(items, i) + } + if err := rows.Close(); err != nil { + return nil, err + } + if err := rows.Err(); err != nil { + return nil, err + } + return items, nil +} + const getProvisionerJobByID = `-- name: GetProvisionerJobByID :one SELECT id, created_at, updated_at, started_at, canceled_at, completed_at, error, organization_id, initiator_id, provisioner, storage_method, type, input, worker_id, file_id, tags, error_code, trace_metadata, job_status diff --git a/coderd/database/queries/provisionerjobs.sql b/coderd/database/queries/provisionerjobs.sql index 2d544aedb9bd8..d125b195b88b0 100644 --- a/coderd/database/queries/provisionerjobs.sql +++ b/coderd/database/queries/provisionerjobs.sql @@ -266,6 +266,16 @@ WHERE AND started_at IS NOT NULL AND completed_at IS NULL; +-- name: GetNotStartedProvisionerJobs :many +SELECT + * +FROM + provisioner_jobs +WHERE + updated_at < $1 + AND started_at IS NULL + AND completed_at IS NULL; + -- name: InsertProvisionerJobTimings :many INSERT INTO provisioner_job_timings (job_id, started_at, ended_at, stage, source, action, resource) SELECT From 2f3d6065141f87dab893e8ae5c2d550752070cb7 Mon Sep 17 00:00:00 2001 From: Michael Suchacz <203725896+ibetitsmike@users.noreply.github.com> Date: Fri, 9 May 2025 13:12:15 +0000 Subject: [PATCH 02/24] added detector handling of not started jobs --- coderd/unhanger/detector.go | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/coderd/unhanger/detector.go b/coderd/unhanger/detector.go index 14383b1839363..ce3b1bb49800f 100644 --- a/coderd/unhanger/detector.go +++ b/coderd/unhanger/detector.go @@ -179,6 +179,14 @@ func (d *Detector) run(t time.Time) Stats { stats.Error = xerrors.Errorf("get hung provisioner jobs: %w", err) return stats } + // Find all provisioner jobs that are currently running but have not + // received an update in the last 5 minutes. + if err != nil { + stats.Error = xerrors.Errorf("get not started provisioner jobs: %w", err) + return stats + } + jobsUnstarted, err := d.db.GetNotStartedProvisionerJobs(ctx, t.Add(-HungJobDuration)) + jobs = append(jobs, jobsUnstarted...) // Limit the number of jobs we'll unhang in a single run to avoid // timing out. @@ -229,14 +237,6 @@ func unhangJob(ctx context.Context, log slog.Logger, db database.Store, pub pubs return xerrors.Errorf("get provisioner job: %w", err) } - // Check if we should still unhang it. - if !job.StartedAt.Valid { - // This shouldn't be possible to hit because the query only selects - // started and not completed jobs, and a job can't be "un-started". - return jobIneligibleError{ - Err: xerrors.New("job is not started"), - } - } if job.CompletedAt.Valid { return jobIneligibleError{ Err: xerrors.Errorf("job is completed (status %s)", job.JobStatus), From 4b252eb0137c773e127866dc56a4698d20cb5919 Mon Sep 17 00:00:00 2001 From: Michael Suchacz <203725896+ibetitsmike@users.noreply.github.com> Date: Fri, 9 May 2025 13:56:08 +0000 Subject: [PATCH 03/24] filling out started_at when unhanging not started jobs --- coderd/database/queries.sql.go | 5 ++++- coderd/database/queries/provisionerjobs.sql | 3 ++- coderd/unhanger/detector.go | 12 +++++++++++- 3 files changed, 17 insertions(+), 3 deletions(-) diff --git a/coderd/database/queries.sql.go b/coderd/database/queries.sql.go index c525f7895b410..b46e62554a404 100644 --- a/coderd/database/queries.sql.go +++ b/coderd/database/queries.sql.go @@ -8135,7 +8135,8 @@ SET updated_at = $2, completed_at = $3, error = $4, - error_code = $5 + error_code = $5, + started_at = $6 WHERE id = $1 ` @@ -8146,6 +8147,7 @@ type UpdateProvisionerJobWithCompleteByIDParams struct { CompletedAt sql.NullTime `db:"completed_at" json:"completed_at"` Error sql.NullString `db:"error" json:"error"` ErrorCode sql.NullString `db:"error_code" json:"error_code"` + StartedAt sql.NullTime `db:"started_at" json:"started_at"` } func (q *sqlQuerier) UpdateProvisionerJobWithCompleteByID(ctx context.Context, arg UpdateProvisionerJobWithCompleteByIDParams) error { @@ -8155,6 +8157,7 @@ func (q *sqlQuerier) UpdateProvisionerJobWithCompleteByID(ctx context.Context, a arg.CompletedAt, arg.Error, arg.ErrorCode, + arg.StartedAt, ) return err } diff --git a/coderd/database/queries/provisionerjobs.sql b/coderd/database/queries/provisionerjobs.sql index d125b195b88b0..9e9273bb05352 100644 --- a/coderd/database/queries/provisionerjobs.sql +++ b/coderd/database/queries/provisionerjobs.sql @@ -252,7 +252,8 @@ SET updated_at = $2, completed_at = $3, error = $4, - error_code = $5 + error_code = $5, + started_at = $6 WHERE id = $1; diff --git a/coderd/unhanger/detector.go b/coderd/unhanger/detector.go index ce3b1bb49800f..d00b8a1de50bf 100644 --- a/coderd/unhanger/detector.go +++ b/coderd/unhanger/detector.go @@ -295,11 +295,21 @@ func unhangJob(ctx context.Context, log slog.Logger, db database.Store, pub pubs } lowestLogID = newLogs[0].ID - // Mark the job as failed. now = dbtime.Now() + // If we are unhanging a job that was never picked up by the + // provisioner, we need to set the started_at time to the current + // time so that the build duration is correct. + if !job.StartedAt.Valid { + job.StartedAt = sql.NullTime{ + Time: now, + Valid: true, + } + } + // Mark the job as failed. err = db.UpdateProvisionerJobWithCompleteByID(ctx, database.UpdateProvisionerJobWithCompleteByIDParams{ ID: job.ID, UpdatedAt: now, + StartedAt: job.StartedAt, CompletedAt: sql.NullTime{ Time: now, Valid: true, From ca4951995b44bb6beac3f0b0ac4574d01f90a804 Mon Sep 17 00:00:00 2001 From: Michael Suchacz <203725896+ibetitsmike@users.noreply.github.com> Date: Tue, 13 May 2025 17:00:40 +0000 Subject: [PATCH 04/24] WIP --- coderd/unhanger/detector.go | 54 ++++++++++++++++++++++++++----------- 1 file changed, 39 insertions(+), 15 deletions(-) diff --git a/coderd/unhanger/detector.go b/coderd/unhanger/detector.go index d00b8a1de50bf..2e0c410aa0bee 100644 --- a/coderd/unhanger/detector.go +++ b/coderd/unhanger/detector.go @@ -25,6 +25,10 @@ const ( // before it is considered hung. HungJobDuration = 5 * time.Minute + // NotStartedTimeElapsed is the duration of time since the last update to a job + // before it is considered hung. + NotStartedTimeElapsed = 30 * time.Minute + // HungJobExitTimeout is the duration of time that provisioners should allow // for a graceful exit upon cancellation due to failing to send an update to // a job. @@ -38,6 +42,13 @@ const ( MaxJobsPerRun = 10 ) +type jobType string + +const ( + hungJobType jobType = "hung" + notStartedJobType jobType = "not started" +) + // HungJobLogMessages are written to provisioner job logs when a job is hung and // terminated. var HungJobLogMessages = []string{ @@ -176,17 +187,17 @@ func (d *Detector) run(t time.Time) Stats { // received an update in the last 5 minutes. jobs, err := d.db.GetHungProvisionerJobs(ctx, t.Add(-HungJobDuration)) if err != nil { - stats.Error = xerrors.Errorf("get hung provisioner jobs: %w", err) + stats.Error = xerrors.Errorf("get %s provisioner jobs: %w", hungJobType, err) return stats } - // Find all provisioner jobs that are currently running but have not - // received an update in the last 5 minutes. + // Find all provisioner jobs that have not been started yet and have not + // received an update in the last 30 minutes. + jobsNotStarted, err := d.db.GetNotStartedProvisionerJobs(ctx, t.Add(-NotStartedTimeElapsed)) if err != nil { - stats.Error = xerrors.Errorf("get not started provisioner jobs: %w", err) + stats.Error = xerrors.Errorf("get %s provisioner jobs: %w", notStartedJobType, err) return stats } - jobsUnstarted, err := d.db.GetNotStartedProvisionerJobs(ctx, t.Add(-HungJobDuration)) - jobs = append(jobs, jobsUnstarted...) + jobs = append(jobs, jobsNotStarted...) // Limit the number of jobs we'll unhang in a single run to avoid // timing out. @@ -198,7 +209,7 @@ func (d *Detector) run(t time.Time) Stats { jobs = jobs[:MaxJobsPerRun] } - // Send a message into the build log for each hung job saying that it + // Send a message into the build log for each hung or not startedjob saying that it // has been detected and will be terminated, then mark the job as // failed. for _, job := range jobs { @@ -206,8 +217,12 @@ func (d *Detector) run(t time.Time) Stats { err := unhangJob(ctx, log, d.db, d.pubsub, job.ID) if err != nil { + jobType := notStartedJobType + if job.StartedAt.Valid { + jobType = hungJobType + } if !(xerrors.As(err, &acquireLockError{}) || xerrors.As(err, &jobIneligibleError{})) { - log.Error(ctx, "error forcefully terminating hung provisioner job", slog.Error(err)) + log.Error(ctx, fmt.Sprintf("error forcefully terminating %s provisioner job", jobType), slog.Error(err)) } continue } @@ -222,7 +237,7 @@ func unhangJob(ctx context.Context, log slog.Logger, db database.Store, pub pubs var lowestLogID int64 err := db.InTx(func(db database.Store) error { - locked, err := db.TryAcquireLock(ctx, database.GenLockID(fmt.Sprintf("hang-detector:%s", jobID))) + locked, err := db.TryAcquireLock(ctx, database.GenLockID(fmt.Sprintf("unhanger:%s", jobID))) if err != nil { return xerrors.Errorf("acquire lock: %w", err) } @@ -237,6 +252,14 @@ func unhangJob(ctx context.Context, log slog.Logger, db database.Store, pub pubs return xerrors.Errorf("get provisioner job: %w", err) } + jobType := hungJobType + threshold := HungJobDuration.Minutes() + + if !job.StartedAt.Valid { + jobType = notStartedJobType + threshold = NotStartedTimeElapsed.Minutes() + } + if job.CompletedAt.Valid { return jobIneligibleError{ Err: xerrors.Errorf("job is completed (status %s)", job.JobStatus), @@ -249,8 +272,8 @@ func unhangJob(ctx context.Context, log slog.Logger, db database.Store, pub pubs } log.Warn( - ctx, "detected hung provisioner job, forcefully terminating", - "threshold", HungJobDuration, + ctx, fmt.Sprintf("detected %s provisioner job, forcefully terminating", jobType), + "threshold", threshold, ) // First, get the latest logs from the build so we can make sure @@ -260,7 +283,7 @@ func unhangJob(ctx context.Context, log slog.Logger, db database.Store, pub pubs CreatedAfter: 0, }) if err != nil { - return xerrors.Errorf("get logs for hung job: %w", err) + return xerrors.Errorf("get logs for %s job: %w", jobType, err) } logStage := "" if len(logs) != 0 { @@ -291,12 +314,13 @@ func unhangJob(ctx context.Context, log slog.Logger, db database.Store, pub pubs } newLogs, err := db.InsertProvisionerJobLogs(ctx, insertParams) if err != nil { - return xerrors.Errorf("insert logs for hung job: %w", err) + return xerrors.Errorf("insert logs for %s job: %w", jobType, err) } lowestLogID = newLogs[0].ID now = dbtime.Now() - // If we are unhanging a job that was never picked up by the + + // If we are failing a job that was never picked up by the // provisioner, we need to set the started_at time to the current // time so that the build duration is correct. if !job.StartedAt.Valid { @@ -315,7 +339,7 @@ func unhangJob(ctx context.Context, log slog.Logger, db database.Store, pub pubs Valid: true, }, Error: sql.NullString{ - String: "Coder: Build has been detected as hung for 5 minutes and has been terminated by hang detector.", + String: fmt.Sprintf("Coder: Build has been detected as %s for %.0f minutes and has been terminated by hang detector.", jobType, threshold), Valid: true, }, ErrorCode: sql.NullString{ From af994c2fc1bc1f25e6609fd096626fdcdf6d9986 Mon Sep 17 00:00:00 2001 From: Michael Suchacz <203725896+ibetitsmike@users.noreply.github.com> Date: Tue, 13 May 2025 18:19:38 +0000 Subject: [PATCH 05/24] refactored to reaper & added tests --- cli/server.go | 4 +- coderd/coderdtest/coderdtest.go | 4 +- coderd/database/dbauthz/dbauthz.go | 4 +- coderd/{unhanger => reaper}/detector.go | 57 ++-- coderd/{unhanger => reaper}/detector_test.go | 283 +++++++++++++++++-- provisioner/terraform/serve.go | 8 +- 6 files changed, 294 insertions(+), 66 deletions(-) rename coderd/{unhanger => reaper}/detector.go (91%) rename coderd/{unhanger => reaper}/detector_test.go (72%) diff --git a/cli/server.go b/cli/server.go index d32ed51c06007..e924439ec26b2 100644 --- a/cli/server.go +++ b/cli/server.go @@ -92,10 +92,10 @@ import ( "github.com/coder/coder/v2/coderd/prometheusmetrics" "github.com/coder/coder/v2/coderd/prometheusmetrics/insights" "github.com/coder/coder/v2/coderd/promoauth" + "github.com/coder/coder/v2/coderd/reaper" "github.com/coder/coder/v2/coderd/schedule" "github.com/coder/coder/v2/coderd/telemetry" "github.com/coder/coder/v2/coderd/tracing" - "github.com/coder/coder/v2/coderd/unhanger" "github.com/coder/coder/v2/coderd/updatecheck" "github.com/coder/coder/v2/coderd/util/ptr" "github.com/coder/coder/v2/coderd/util/slice" @@ -1129,7 +1129,7 @@ func (r *RootCmd) Server(newAPI func(context.Context, *coderd.Options) (*coderd. hangDetectorTicker := time.NewTicker(vals.JobHangDetectorInterval.Value()) defer hangDetectorTicker.Stop() - hangDetector := unhanger.New(ctx, options.Database, options.Pubsub, logger, hangDetectorTicker.C) + hangDetector := reaper.New(ctx, options.Database, options.Pubsub, logger, hangDetectorTicker.C) hangDetector.Start() defer hangDetector.Close() diff --git a/coderd/coderdtest/coderdtest.go b/coderd/coderdtest/coderdtest.go index e843d0d748578..ff418f3abaff2 100644 --- a/coderd/coderdtest/coderdtest.go +++ b/coderd/coderdtest/coderdtest.go @@ -72,10 +72,10 @@ import ( "github.com/coder/coder/v2/coderd/notifications/notificationstest" "github.com/coder/coder/v2/coderd/rbac" "github.com/coder/coder/v2/coderd/rbac/policy" + "github.com/coder/coder/v2/coderd/reaper" "github.com/coder/coder/v2/coderd/runtimeconfig" "github.com/coder/coder/v2/coderd/schedule" "github.com/coder/coder/v2/coderd/telemetry" - "github.com/coder/coder/v2/coderd/unhanger" "github.com/coder/coder/v2/coderd/updatecheck" "github.com/coder/coder/v2/coderd/util/ptr" "github.com/coder/coder/v2/coderd/webpush" @@ -367,7 +367,7 @@ func NewOptions(t testing.TB, options *Options) (func(http.Handler), context.Can hangDetectorTicker := time.NewTicker(options.DeploymentValues.JobHangDetectorInterval.Value()) defer hangDetectorTicker.Stop() - hangDetector := unhanger.New(ctx, options.Database, options.Pubsub, options.Logger.Named("unhanger.detector"), hangDetectorTicker.C) + hangDetector := reaper.New(ctx, options.Database, options.Pubsub, options.Logger.Named("reaper.detector"), hangDetectorTicker.C) hangDetector.Start() t.Cleanup(hangDetector.Close) diff --git a/coderd/database/dbauthz/dbauthz.go b/coderd/database/dbauthz/dbauthz.go index 5be38ce940ccb..4bcb7076a5a69 100644 --- a/coderd/database/dbauthz/dbauthz.go +++ b/coderd/database/dbauthz/dbauthz.go @@ -219,7 +219,7 @@ var ( Scope: rbac.ScopeAll, }.WithCachedASTValue() - // See unhanger package. + // See reaper package. subjectHangDetector = rbac.Subject{ Type: rbac.SubjectTypeHangDetector, FriendlyName: "Hang Detector", @@ -408,7 +408,7 @@ func AsAutostart(ctx context.Context) context.Context { } // AsHangDetector returns a context with an actor that has permissions required -// for unhanger.Detector to function. +// for reaper.Detector to function. func AsHangDetector(ctx context.Context) context.Context { return As(ctx, subjectHangDetector) } diff --git a/coderd/unhanger/detector.go b/coderd/reaper/detector.go similarity index 91% rename from coderd/unhanger/detector.go rename to coderd/reaper/detector.go index 2e0c410aa0bee..a7736f2a2a070 100644 --- a/coderd/unhanger/detector.go +++ b/coderd/reaper/detector.go @@ -1,4 +1,4 @@ -package unhanger +package reaper import ( "context" @@ -49,14 +49,15 @@ const ( notStartedJobType jobType = "not started" ) -// HungJobLogMessages are written to provisioner job logs when a job is hung and -// terminated. -var HungJobLogMessages = []string{ - "", - "====================", - "Coder: Build has been detected as hung for 5 minutes and will be terminated.", - "====================", - "", +// jobLogMessages are written to provisioner job logs when a job is reaped +func jobLogMessages(jobType jobType, threshold float64) []string { + return []string{ + "", + "====================", + fmt.Sprintf("Coder: Build has been detected as %s for %.0f minutes and will be terminated.", jobType, threshold), + "====================", + "", + } } // acquireLockError is returned when the detector fails to acquire a lock and @@ -209,18 +210,14 @@ func (d *Detector) run(t time.Time) Stats { jobs = jobs[:MaxJobsPerRun] } - // Send a message into the build log for each hung or not startedjob saying that it - // has been detected and will be terminated, then mark the job as - // failed. + // Send a message into the build log for each hung or not started job saying that it + // has been detected and will be terminated, then mark the job as failed. for _, job := range jobs { log := d.log.With(slog.F("job_id", job.ID)) - err := unhangJob(ctx, log, d.db, d.pubsub, job.ID) + err := reapJob(ctx, log, d.db, d.pubsub, job.ID) if err != nil { - jobType := notStartedJobType - if job.StartedAt.Valid { - jobType = hungJobType - } + jobType, _ := reapParamsFromJob(job) if !(xerrors.As(err, &acquireLockError{}) || xerrors.As(err, &jobIneligibleError{})) { log.Error(ctx, fmt.Sprintf("error forcefully terminating %s provisioner job", jobType), slog.Error(err)) } @@ -233,11 +230,11 @@ func (d *Detector) run(t time.Time) Stats { return stats } -func unhangJob(ctx context.Context, log slog.Logger, db database.Store, pub pubsub.Pubsub, jobID uuid.UUID) error { +func reapJob(ctx context.Context, log slog.Logger, db database.Store, pub pubsub.Pubsub, jobID uuid.UUID) error { var lowestLogID int64 err := db.InTx(func(db database.Store) error { - locked, err := db.TryAcquireLock(ctx, database.GenLockID(fmt.Sprintf("unhanger:%s", jobID))) + locked, err := db.TryAcquireLock(ctx, database.GenLockID(fmt.Sprintf("reaper:%s", jobID))) if err != nil { return xerrors.Errorf("acquire lock: %w", err) } @@ -252,20 +249,14 @@ func unhangJob(ctx context.Context, log slog.Logger, db database.Store, pub pubs return xerrors.Errorf("get provisioner job: %w", err) } - jobType := hungJobType - threshold := HungJobDuration.Minutes() - - if !job.StartedAt.Valid { - jobType = notStartedJobType - threshold = NotStartedTimeElapsed.Minutes() - } + jobType, threshold := reapParamsFromJob(job) if job.CompletedAt.Valid { return jobIneligibleError{ Err: xerrors.Errorf("job is completed (status %s)", job.JobStatus), } } - if job.UpdatedAt.After(time.Now().Add(-HungJobDuration)) { + if job.UpdatedAt.After(time.Now().Add(-time.Duration(threshold) * time.Minute)) { return jobIneligibleError{ Err: xerrors.New("job has been updated recently"), } @@ -303,7 +294,7 @@ func unhangJob(ctx context.Context, log slog.Logger, db database.Store, pub pubs Output: nil, } now := dbtime.Now() - for i, msg := range HungJobLogMessages { + for i, msg := range jobLogMessages(jobType, threshold) { // Set the created at in a way that ensures each message has // a unique timestamp so they will be sorted correctly. insertParams.CreatedAt = append(insertParams.CreatedAt, now.Add(time.Millisecond*time.Duration(i))) @@ -405,3 +396,13 @@ func unhangJob(ctx context.Context, log slog.Logger, db database.Store, pub pubs return nil } + +func reapParamsFromJob(job database.ProvisionerJob) (jobType, float64) { + jobType := hungJobType + threshold := HungJobDuration.Minutes() + if !job.StartedAt.Valid { + jobType = notStartedJobType + threshold = NotStartedTimeElapsed.Minutes() + } + return jobType, threshold +} diff --git a/coderd/unhanger/detector_test.go b/coderd/reaper/detector_test.go similarity index 72% rename from coderd/unhanger/detector_test.go rename to coderd/reaper/detector_test.go index 43eb62bfa884b..e57e0bdccd1be 100644 --- a/coderd/unhanger/detector_test.go +++ b/coderd/reaper/detector_test.go @@ -1,4 +1,4 @@ -package unhanger_test +package reaper_test import ( "context" @@ -22,11 +22,41 @@ import ( "github.com/coder/coder/v2/coderd/database/dbtestutil" "github.com/coder/coder/v2/coderd/provisionerdserver" "github.com/coder/coder/v2/coderd/rbac" - "github.com/coder/coder/v2/coderd/unhanger" + "github.com/coder/coder/v2/coderd/reaper" "github.com/coder/coder/v2/provisionersdk" "github.com/coder/coder/v2/testutil" ) +// jobType represents the type of job being reaped +type jobType string + +const ( + hungJobType jobType = "hung" + notStartedJobType jobType = "not started" +) + +// jobLogMessages returns the messages to be written to provisioner job logs when a job is reaped +func jobLogMessages(jobType jobType, threshold float64) []string { + return []string{ + "", + "====================", + fmt.Sprintf("Coder: Build has been detected as %s for %.0f minutes and will be terminated.", jobType, threshold), + "====================", + "", + } +} + +// reapParamsFromJob determines the type and threshold for a job being reaped +func reapParamsFromJob(job database.ProvisionerJob) (jobType, float64) { + jobType := hungJobType + threshold := reaper.HungJobDuration.Minutes() + if !job.StartedAt.Valid { + jobType = notStartedJobType + threshold = reaper.NotStartedTimeElapsed.Minutes() + } + return jobType, threshold +} + func TestMain(m *testing.M) { goleak.VerifyTestMain(m, testutil.GoleakOptions...) } @@ -39,10 +69,10 @@ func TestDetectorNoJobs(t *testing.T) { db, pubsub = dbtestutil.NewDB(t) log = testutil.Logger(t) tickCh = make(chan time.Time) - statsCh = make(chan unhanger.Stats) + statsCh = make(chan reaper.Stats) ) - detector := unhanger.New(ctx, wrapDBAuthz(db, log), pubsub, log, tickCh).WithStatsChannel(statsCh) + detector := reaper.New(ctx, wrapDBAuthz(db, log), pubsub, log, tickCh).WithStatsChannel(statsCh) detector.Start() tickCh <- time.Now() @@ -62,7 +92,7 @@ func TestDetectorNoHungJobs(t *testing.T) { db, pubsub = dbtestutil.NewDB(t) log = testutil.Logger(t) tickCh = make(chan time.Time) - statsCh = make(chan unhanger.Stats) + statsCh = make(chan reaper.Stats) ) // Insert some jobs that are running and haven't been updated in a while, @@ -89,7 +119,7 @@ func TestDetectorNoHungJobs(t *testing.T) { }) } - detector := unhanger.New(ctx, wrapDBAuthz(db, log), pubsub, log, tickCh).WithStatsChannel(statsCh) + detector := reaper.New(ctx, wrapDBAuthz(db, log), pubsub, log, tickCh).WithStatsChannel(statsCh) detector.Start() tickCh <- now @@ -109,7 +139,7 @@ func TestDetectorHungWorkspaceBuild(t *testing.T) { db, pubsub = dbtestutil.NewDB(t) log = testutil.Logger(t) tickCh = make(chan time.Time) - statsCh = make(chan unhanger.Stats) + statsCh = make(chan reaper.Stats) ) var ( @@ -195,7 +225,7 @@ func TestDetectorHungWorkspaceBuild(t *testing.T) { t.Log("previous job ID: ", previousWorkspaceBuildJob.ID) t.Log("current job ID: ", currentWorkspaceBuildJob.ID) - detector := unhanger.New(ctx, wrapDBAuthz(db, log), pubsub, log, tickCh).WithStatsChannel(statsCh) + detector := reaper.New(ctx, wrapDBAuthz(db, log), pubsub, log, tickCh).WithStatsChannel(statsCh) detector.Start() tickCh <- now @@ -231,7 +261,7 @@ func TestDetectorHungWorkspaceBuildNoOverrideState(t *testing.T) { db, pubsub = dbtestutil.NewDB(t) log = testutil.Logger(t) tickCh = make(chan time.Time) - statsCh = make(chan unhanger.Stats) + statsCh = make(chan reaper.Stats) ) var ( @@ -318,7 +348,7 @@ func TestDetectorHungWorkspaceBuildNoOverrideState(t *testing.T) { t.Log("previous job ID: ", previousWorkspaceBuildJob.ID) t.Log("current job ID: ", currentWorkspaceBuildJob.ID) - detector := unhanger.New(ctx, wrapDBAuthz(db, log), pubsub, log, tickCh).WithStatsChannel(statsCh) + detector := reaper.New(ctx, wrapDBAuthz(db, log), pubsub, log, tickCh).WithStatsChannel(statsCh) detector.Start() tickCh <- now @@ -354,7 +384,7 @@ func TestDetectorHungWorkspaceBuildNoOverrideStateIfNoExistingBuild(t *testing.T db, pubsub = dbtestutil.NewDB(t) log = testutil.Logger(t) tickCh = make(chan time.Time) - statsCh = make(chan unhanger.Stats) + statsCh = make(chan reaper.Stats) ) var ( @@ -411,7 +441,7 @@ func TestDetectorHungWorkspaceBuildNoOverrideStateIfNoExistingBuild(t *testing.T t.Log("current job ID: ", currentWorkspaceBuildJob.ID) - detector := unhanger.New(ctx, wrapDBAuthz(db, log), pubsub, log, tickCh).WithStatsChannel(statsCh) + detector := reaper.New(ctx, wrapDBAuthz(db, log), pubsub, log, tickCh).WithStatsChannel(statsCh) detector.Start() tickCh <- now @@ -439,6 +469,98 @@ func TestDetectorHungWorkspaceBuildNoOverrideStateIfNoExistingBuild(t *testing.T detector.Wait() } +func TestDetectorNotStartedWorkspaceBuildNoOverrideStateIfNoExistingBuild(t *testing.T) { + t.Parallel() + + var ( + ctx = testutil.Context(t, testutil.WaitLong) + db, pubsub = dbtestutil.NewDB(t) + log = testutil.Logger(t) + tickCh = make(chan time.Time) + statsCh = make(chan reaper.Stats) + ) + + var ( + now = time.Now() + thirtyFiveMinAgo = now.Add(-time.Minute * 35) + org = dbgen.Organization(t, db, database.Organization{}) + user = dbgen.User(t, db, database.User{}) + file = dbgen.File(t, db, database.File{}) + template = dbgen.Template(t, db, database.Template{ + OrganizationID: org.ID, + CreatedBy: user.ID, + }) + templateVersion = dbgen.TemplateVersion(t, db, database.TemplateVersion{ + OrganizationID: org.ID, + TemplateID: uuid.NullUUID{ + UUID: template.ID, + Valid: true, + }, + CreatedBy: user.ID, + }) + workspace = dbgen.Workspace(t, db, database.WorkspaceTable{ + OwnerID: user.ID, + OrganizationID: org.ID, + TemplateID: template.ID, + }) + + // First build. + expectedWorkspaceBuildState = []byte(`{"dean":"cool","colin":"also cool"}`) + currentWorkspaceBuildJob = dbgen.ProvisionerJob(t, db, pubsub, database.ProvisionerJob{ + CreatedAt: thirtyFiveMinAgo, + UpdatedAt: thirtyFiveMinAgo, + StartedAt: sql.NullTime{ + Time: time.Time{}, + Valid: false, + }, + OrganizationID: org.ID, + InitiatorID: user.ID, + Provisioner: database.ProvisionerTypeEcho, + StorageMethod: database.ProvisionerStorageMethodFile, + FileID: file.ID, + Type: database.ProvisionerJobTypeWorkspaceBuild, + Input: []byte("{}"), + }) + currentWorkspaceBuild = dbgen.WorkspaceBuild(t, db, database.WorkspaceBuild{ + WorkspaceID: workspace.ID, + TemplateVersionID: templateVersion.ID, + BuildNumber: 1, + JobID: currentWorkspaceBuildJob.ID, + // Should not be overridden. + ProvisionerState: expectedWorkspaceBuildState, + }) + ) + + t.Log("current job ID: ", currentWorkspaceBuildJob.ID) + + detector := reaper.New(ctx, wrapDBAuthz(db, log), pubsub, log, tickCh).WithStatsChannel(statsCh) + detector.Start() + tickCh <- now + + stats := <-statsCh + require.NoError(t, stats.Error) + require.Len(t, stats.TerminatedJobIDs, 1) + require.Equal(t, currentWorkspaceBuildJob.ID, stats.TerminatedJobIDs[0]) + + // Check that the current provisioner job was updated. + job, err := db.GetProvisionerJobByID(ctx, currentWorkspaceBuildJob.ID) + require.NoError(t, err) + require.WithinDuration(t, now, job.UpdatedAt, 30*time.Second) + require.True(t, job.CompletedAt.Valid) + require.WithinDuration(t, now, job.CompletedAt.Time, 30*time.Second) + require.True(t, job.Error.Valid) + require.Contains(t, job.Error.String, "Build has been detected as not started") + require.False(t, job.ErrorCode.Valid) + + // Check that the provisioner state was NOT updated. + build, err := db.GetWorkspaceBuildByID(ctx, currentWorkspaceBuild.ID) + require.NoError(t, err) + require.Equal(t, expectedWorkspaceBuildState, build.ProvisionerState) + + detector.Close() + detector.Wait() +} + func TestDetectorHungOtherJobTypes(t *testing.T) { t.Parallel() @@ -447,7 +569,7 @@ func TestDetectorHungOtherJobTypes(t *testing.T) { db, pubsub = dbtestutil.NewDB(t) log = testutil.Logger(t) tickCh = make(chan time.Time) - statsCh = make(chan unhanger.Stats) + statsCh = make(chan reaper.Stats) ) var ( @@ -509,7 +631,7 @@ func TestDetectorHungOtherJobTypes(t *testing.T) { t.Log("template import job ID: ", templateImportJob.ID) t.Log("template dry-run job ID: ", templateDryRunJob.ID) - detector := unhanger.New(ctx, wrapDBAuthz(db, log), pubsub, log, tickCh).WithStatsChannel(statsCh) + detector := reaper.New(ctx, wrapDBAuthz(db, log), pubsub, log, tickCh).WithStatsChannel(statsCh) detector.Start() tickCh <- now @@ -543,6 +665,109 @@ func TestDetectorHungOtherJobTypes(t *testing.T) { detector.Wait() } +func TestDetectorNotStartedOtherJobTypes(t *testing.T) { + t.Parallel() + + var ( + ctx = testutil.Context(t, testutil.WaitLong) + db, pubsub = dbtestutil.NewDB(t) + log = testutil.Logger(t) + tickCh = make(chan time.Time) + statsCh = make(chan reaper.Stats) + ) + + var ( + now = time.Now() + thirtyFiveMinAgo = now.Add(-time.Minute * 35) + org = dbgen.Organization(t, db, database.Organization{}) + user = dbgen.User(t, db, database.User{}) + file = dbgen.File(t, db, database.File{}) + + // Template import job. + templateImportJob = dbgen.ProvisionerJob(t, db, pubsub, database.ProvisionerJob{ + CreatedAt: thirtyFiveMinAgo, + UpdatedAt: thirtyFiveMinAgo, + StartedAt: sql.NullTime{ + Time: time.Time{}, + Valid: false, + }, + OrganizationID: org.ID, + InitiatorID: user.ID, + Provisioner: database.ProvisionerTypeEcho, + StorageMethod: database.ProvisionerStorageMethodFile, + FileID: file.ID, + Type: database.ProvisionerJobTypeTemplateVersionImport, + Input: []byte("{}"), + }) + _ = dbgen.TemplateVersion(t, db, database.TemplateVersion{ + OrganizationID: org.ID, + JobID: templateImportJob.ID, + CreatedBy: user.ID, + }) + ) + + // Template dry-run job. + dryRunVersion := dbgen.TemplateVersion(t, db, database.TemplateVersion{ + OrganizationID: org.ID, + CreatedBy: user.ID, + }) + input, err := json.Marshal(provisionerdserver.TemplateVersionDryRunJob{ + TemplateVersionID: dryRunVersion.ID, + }) + require.NoError(t, err) + templateDryRunJob := dbgen.ProvisionerJob(t, db, pubsub, database.ProvisionerJob{ + CreatedAt: thirtyFiveMinAgo, + UpdatedAt: thirtyFiveMinAgo, + StartedAt: sql.NullTime{ + Time: time.Time{}, + Valid: false, + }, + OrganizationID: org.ID, + InitiatorID: user.ID, + Provisioner: database.ProvisionerTypeEcho, + StorageMethod: database.ProvisionerStorageMethodFile, + FileID: file.ID, + Type: database.ProvisionerJobTypeTemplateVersionDryRun, + Input: input, + }) + + t.Log("template import job ID: ", templateImportJob.ID) + t.Log("template dry-run job ID: ", templateDryRunJob.ID) + + detector := reaper.New(ctx, wrapDBAuthz(db, log), pubsub, log, tickCh).WithStatsChannel(statsCh) + detector.Start() + tickCh <- now + + stats := <-statsCh + require.NoError(t, stats.Error) + require.Len(t, stats.TerminatedJobIDs, 2) + require.Contains(t, stats.TerminatedJobIDs, templateImportJob.ID) + require.Contains(t, stats.TerminatedJobIDs, templateDryRunJob.ID) + + // Check that the template import job was updated. + job, err := db.GetProvisionerJobByID(ctx, templateImportJob.ID) + require.NoError(t, err) + require.WithinDuration(t, now, job.UpdatedAt, 30*time.Second) + require.True(t, job.CompletedAt.Valid) + require.WithinDuration(t, now, job.CompletedAt.Time, 30*time.Second) + require.True(t, job.Error.Valid) + require.Contains(t, job.Error.String, "Build has been detected as not started") + require.False(t, job.ErrorCode.Valid) + + // Check that the template dry-run job was updated. + job, err = db.GetProvisionerJobByID(ctx, templateDryRunJob.ID) + require.NoError(t, err) + require.WithinDuration(t, now, job.UpdatedAt, 30*time.Second) + require.True(t, job.CompletedAt.Valid) + require.WithinDuration(t, now, job.CompletedAt.Time, 30*time.Second) + require.True(t, job.Error.Valid) + require.Contains(t, job.Error.String, "Build has been detected as not started") + require.False(t, job.ErrorCode.Valid) + + detector.Close() + detector.Wait() +} + func TestDetectorHungCanceledJob(t *testing.T) { t.Parallel() @@ -551,7 +776,7 @@ func TestDetectorHungCanceledJob(t *testing.T) { db, pubsub = dbtestutil.NewDB(t) log = testutil.Logger(t) tickCh = make(chan time.Time) - statsCh = make(chan unhanger.Stats) + statsCh = make(chan reaper.Stats) ) var ( @@ -591,7 +816,7 @@ func TestDetectorHungCanceledJob(t *testing.T) { t.Log("template import job ID: ", templateImportJob.ID) - detector := unhanger.New(ctx, wrapDBAuthz(db, log), pubsub, log, tickCh).WithStatsChannel(statsCh) + detector := reaper.New(ctx, wrapDBAuthz(db, log), pubsub, log, tickCh).WithStatsChannel(statsCh) detector.Start() tickCh <- now @@ -653,7 +878,7 @@ func TestDetectorPushesLogs(t *testing.T) { db, pubsub = dbtestutil.NewDB(t) log = testutil.Logger(t) tickCh = make(chan time.Time) - statsCh = make(chan unhanger.Stats) + statsCh = make(chan reaper.Stats) ) var ( @@ -706,7 +931,7 @@ func TestDetectorPushesLogs(t *testing.T) { require.Len(t, logs, 10) } - detector := unhanger.New(ctx, wrapDBAuthz(db, log), pubsub, log, tickCh).WithStatsChannel(statsCh) + detector := reaper.New(ctx, wrapDBAuthz(db, log), pubsub, log, tickCh).WithStatsChannel(statsCh) detector.Start() // Create pubsub subscription to listen for new log events. @@ -741,12 +966,14 @@ func TestDetectorPushesLogs(t *testing.T) { CreatedAfter: after, }) require.NoError(t, err) - require.Len(t, logs, len(unhanger.HungJobLogMessages)) + jobType, threshold := reapParamsFromJob(templateImportJob) + expectedLogs := jobLogMessages(jobType, threshold) + require.Len(t, logs, len(expectedLogs)) for i, log := range logs { assert.Equal(t, database.LogLevelError, log.Level) assert.Equal(t, c.expectStage, log.Stage) assert.Equal(t, database.LogSourceProvisionerDaemon, log.Source) - assert.Equal(t, unhanger.HungJobLogMessages[i], log.Output) + assert.Equal(t, expectedLogs[i], log.Output) } // Double check the full log count. @@ -755,7 +982,7 @@ func TestDetectorPushesLogs(t *testing.T) { CreatedAfter: 0, }) require.NoError(t, err) - require.Len(t, logs, c.preLogCount+len(unhanger.HungJobLogMessages)) + require.Len(t, logs, c.preLogCount+len(expectedLogs)) detector.Close() detector.Wait() @@ -771,15 +998,15 @@ func TestDetectorMaxJobsPerRun(t *testing.T) { db, pubsub = dbtestutil.NewDB(t) log = testutil.Logger(t) tickCh = make(chan time.Time) - statsCh = make(chan unhanger.Stats) + statsCh = make(chan reaper.Stats) org = dbgen.Organization(t, db, database.Organization{}) user = dbgen.User(t, db, database.User{}) file = dbgen.File(t, db, database.File{}) ) - // Create unhanger.MaxJobsPerRun + 1 hung jobs. + // Create MaxJobsPerRun + 1 hung jobs. now := time.Now() - for i := 0; i < unhanger.MaxJobsPerRun+1; i++ { + for i := 0; i < reaper.MaxJobsPerRun+1; i++ { pj := dbgen.ProvisionerJob(t, db, pubsub, database.ProvisionerJob{ CreatedAt: now.Add(-time.Hour), UpdatedAt: now.Add(-time.Hour), @@ -802,14 +1029,14 @@ func TestDetectorMaxJobsPerRun(t *testing.T) { }) } - detector := unhanger.New(ctx, wrapDBAuthz(db, log), pubsub, log, tickCh).WithStatsChannel(statsCh) + detector := reaper.New(ctx, wrapDBAuthz(db, log), pubsub, log, tickCh).WithStatsChannel(statsCh) detector.Start() tickCh <- now - // Make sure that only unhanger.MaxJobsPerRun jobs are terminated. + // Make sure that only MaxJobsPerRun jobs are terminated. stats := <-statsCh require.NoError(t, stats.Error) - require.Len(t, stats.TerminatedJobIDs, unhanger.MaxJobsPerRun) + require.Len(t, stats.TerminatedJobIDs, reaper.MaxJobsPerRun) // Run the detector again and make sure that only the remaining job is // terminated. @@ -823,7 +1050,7 @@ func TestDetectorMaxJobsPerRun(t *testing.T) { } // wrapDBAuthz adds our Authorization/RBAC around the given database store, to -// ensure the unhanger has the right permissions to do its work. +// ensure the reaper has the right permissions to do its work. func wrapDBAuthz(db database.Store, logger slog.Logger) database.Store { return dbauthz.New( db, diff --git a/provisioner/terraform/serve.go b/provisioner/terraform/serve.go index 562946d8ef92e..01d63b3b90086 100644 --- a/provisioner/terraform/serve.go +++ b/provisioner/terraform/serve.go @@ -16,7 +16,7 @@ import ( "cdr.dev/slog" "github.com/coder/coder/v2/coderd/database" - "github.com/coder/coder/v2/coderd/unhanger" + "github.com/coder/coder/v2/coderd/reaper" "github.com/coder/coder/v2/provisionersdk" ) @@ -39,9 +39,9 @@ type ServeOptions struct { // // This is a no-op on Windows where the process can't be interrupted. // - // Default value: 3 minutes (unhanger.HungJobExitTimeout). This value should + // Default value: 3 minutes (reaper.HungJobExitTimeout). This value should // be kept less than the value that Coder uses to mark hung jobs as failed, - // which is 5 minutes (see unhanger package). + // which is 5 minutes (see reaper package). ExitTimeout time.Duration } @@ -131,7 +131,7 @@ func Serve(ctx context.Context, options *ServeOptions) error { options.Tracer = trace.NewNoopTracerProvider().Tracer("noop") } if options.ExitTimeout == 0 { - options.ExitTimeout = unhanger.HungJobExitTimeout + options.ExitTimeout = reaper.HungJobExitTimeout } return provisionersdk.Serve(ctx, &server{ execMut: &sync.Mutex{}, From 38157271858139038a15c48129539eb55b81f55f Mon Sep 17 00:00:00 2001 From: Michael Suchacz <203725896+ibetitsmike@users.noreply.github.com> Date: Tue, 13 May 2025 18:20:48 +0000 Subject: [PATCH 06/24] Revert "filling out started_at when unhanging not started jobs" This reverts commit 4b252eb0137c773e127866dc56a4698d20cb5919. --- coderd/database/queries.sql.go | 5 +---- coderd/database/queries/provisionerjobs.sql | 3 +-- coderd/reaper/detector.go | 13 +------------ 3 files changed, 3 insertions(+), 18 deletions(-) diff --git a/coderd/database/queries.sql.go b/coderd/database/queries.sql.go index b46e62554a404..c525f7895b410 100644 --- a/coderd/database/queries.sql.go +++ b/coderd/database/queries.sql.go @@ -8135,8 +8135,7 @@ SET updated_at = $2, completed_at = $3, error = $4, - error_code = $5, - started_at = $6 + error_code = $5 WHERE id = $1 ` @@ -8147,7 +8146,6 @@ type UpdateProvisionerJobWithCompleteByIDParams struct { CompletedAt sql.NullTime `db:"completed_at" json:"completed_at"` Error sql.NullString `db:"error" json:"error"` ErrorCode sql.NullString `db:"error_code" json:"error_code"` - StartedAt sql.NullTime `db:"started_at" json:"started_at"` } func (q *sqlQuerier) UpdateProvisionerJobWithCompleteByID(ctx context.Context, arg UpdateProvisionerJobWithCompleteByIDParams) error { @@ -8157,7 +8155,6 @@ func (q *sqlQuerier) UpdateProvisionerJobWithCompleteByID(ctx context.Context, a arg.CompletedAt, arg.Error, arg.ErrorCode, - arg.StartedAt, ) return err } diff --git a/coderd/database/queries/provisionerjobs.sql b/coderd/database/queries/provisionerjobs.sql index 9e9273bb05352..d125b195b88b0 100644 --- a/coderd/database/queries/provisionerjobs.sql +++ b/coderd/database/queries/provisionerjobs.sql @@ -252,8 +252,7 @@ SET updated_at = $2, completed_at = $3, error = $4, - error_code = $5, - started_at = $6 + error_code = $5 WHERE id = $1; diff --git a/coderd/reaper/detector.go b/coderd/reaper/detector.go index a7736f2a2a070..3041f8e7b9961 100644 --- a/coderd/reaper/detector.go +++ b/coderd/reaper/detector.go @@ -309,22 +309,11 @@ func reapJob(ctx context.Context, log slog.Logger, db database.Store, pub pubsub } lowestLogID = newLogs[0].ID - now = dbtime.Now() - - // If we are failing a job that was never picked up by the - // provisioner, we need to set the started_at time to the current - // time so that the build duration is correct. - if !job.StartedAt.Valid { - job.StartedAt = sql.NullTime{ - Time: now, - Valid: true, - } - } // Mark the job as failed. + now = dbtime.Now() err = db.UpdateProvisionerJobWithCompleteByID(ctx, database.UpdateProvisionerJobWithCompleteByIDParams{ ID: job.ID, UpdatedAt: now, - StartedAt: job.StartedAt, CompletedAt: sql.NullTime{ Time: now, Valid: true, From b65f620d556dd290c61af2a4530a8302f0c7fe42 Mon Sep 17 00:00:00 2001 From: Michael Suchacz <203725896+ibetitsmike@users.noreply.github.com> Date: Tue, 13 May 2025 18:31:15 +0000 Subject: [PATCH 07/24] created new ORM update to avoid forcing setting StartedAt on every Complete update --- coderd/database/dbauthz/dbauthz.go | 4 +++ coderd/database/dbmem/dbmem.go | 24 +++++++++++++++ coderd/database/dbmetrics/querymetrics.go | 7 +++++ coderd/database/dbmock/dbmock.go | 14 +++++++++ coderd/database/querier.go | 1 + coderd/database/queries.sql.go | 34 +++++++++++++++++++++ coderd/database/queries/provisionerjobs.sql | 12 ++++++++ coderd/reaper/detector.go | 12 +++++++- 8 files changed, 107 insertions(+), 1 deletion(-) diff --git a/coderd/database/dbauthz/dbauthz.go b/coderd/database/dbauthz/dbauthz.go index 4bcb7076a5a69..5db272f62a856 100644 --- a/coderd/database/dbauthz/dbauthz.go +++ b/coderd/database/dbauthz/dbauthz.go @@ -4254,6 +4254,10 @@ func (q *querier) UpdateProvisionerJobWithCompleteByID(ctx context.Context, arg return q.db.UpdateProvisionerJobWithCompleteByID(ctx, arg) } +func (q *querier) UpdateProvisionerJobWithCompleteWithStartedAtByID(ctx context.Context, arg database.UpdateProvisionerJobWithCompleteWithStartedAtByIDParams) error { + return q.db.UpdateProvisionerJobWithCompleteWithStartedAtByID(ctx, arg) +} + func (q *querier) UpdateReplica(ctx context.Context, arg database.UpdateReplicaParams) (database.Replica, error) { if err := q.authorizeContext(ctx, policy.ActionUpdate, rbac.ResourceSystem); err != nil { return database.Replica{}, err diff --git a/coderd/database/dbmem/dbmem.go b/coderd/database/dbmem/dbmem.go index 01eb2d66ed5de..43c804d1fdb46 100644 --- a/coderd/database/dbmem/dbmem.go +++ b/coderd/database/dbmem/dbmem.go @@ -10942,6 +10942,30 @@ func (q *FakeQuerier) UpdateProvisionerJobWithCompleteByID(_ context.Context, ar return sql.ErrNoRows } +func (q *FakeQuerier) UpdateProvisionerJobWithCompleteWithStartedAtByID(_ context.Context, arg database.UpdateProvisionerJobWithCompleteWithStartedAtByIDParams) error { + if err := validateDatabaseType(arg); err != nil { + return err + } + + q.mutex.Lock() + defer q.mutex.Unlock() + + for index, job := range q.provisionerJobs { + if arg.ID != job.ID { + continue + } + job.UpdatedAt = arg.UpdatedAt + job.CompletedAt = arg.CompletedAt + job.Error = arg.Error + job.ErrorCode = arg.ErrorCode + job.JobStatus = provisionerJobStatus(job) + job.StartedAt = arg.StartedAt + q.provisionerJobs[index] = job + return nil + } + return sql.ErrNoRows +} + func (q *FakeQuerier) UpdateReplica(_ context.Context, arg database.UpdateReplicaParams) (database.Replica, error) { if err := validateDatabaseType(arg); err != nil { return database.Replica{}, err diff --git a/coderd/database/dbmetrics/querymetrics.go b/coderd/database/dbmetrics/querymetrics.go index 1761c0c6b9960..2d3aea66a6ad8 100644 --- a/coderd/database/dbmetrics/querymetrics.go +++ b/coderd/database/dbmetrics/querymetrics.go @@ -2706,6 +2706,13 @@ func (m queryMetricsStore) UpdateProvisionerJobWithCompleteByID(ctx context.Cont return err } +func (m queryMetricsStore) UpdateProvisionerJobWithCompleteWithStartedAtByID(ctx context.Context, arg database.UpdateProvisionerJobWithCompleteWithStartedAtByIDParams) error { + start := time.Now() + r0 := m.s.UpdateProvisionerJobWithCompleteWithStartedAtByID(ctx, arg) + m.queryLatencies.WithLabelValues("UpdateProvisionerJobWithCompleteWithStartedAtByID").Observe(time.Since(start).Seconds()) + return r0 +} + func (m queryMetricsStore) UpdateReplica(ctx context.Context, arg database.UpdateReplicaParams) (database.Replica, error) { start := time.Now() replica, err := m.s.UpdateReplica(ctx, arg) diff --git a/coderd/database/dbmock/dbmock.go b/coderd/database/dbmock/dbmock.go index 1e3a6cf15cfa8..0e1abb4d9dca6 100644 --- a/coderd/database/dbmock/dbmock.go +++ b/coderd/database/dbmock/dbmock.go @@ -5732,6 +5732,20 @@ func (mr *MockStoreMockRecorder) UpdateProvisionerJobWithCompleteByID(ctx, arg a return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "UpdateProvisionerJobWithCompleteByID", reflect.TypeOf((*MockStore)(nil).UpdateProvisionerJobWithCompleteByID), ctx, arg) } +// UpdateProvisionerJobWithCompleteWithStartedAtByID mocks base method. +func (m *MockStore) UpdateProvisionerJobWithCompleteWithStartedAtByID(ctx context.Context, arg database.UpdateProvisionerJobWithCompleteWithStartedAtByIDParams) error { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "UpdateProvisionerJobWithCompleteWithStartedAtByID", ctx, arg) + ret0, _ := ret[0].(error) + return ret0 +} + +// UpdateProvisionerJobWithCompleteWithStartedAtByID indicates an expected call of UpdateProvisionerJobWithCompleteWithStartedAtByID. +func (mr *MockStoreMockRecorder) UpdateProvisionerJobWithCompleteWithStartedAtByID(ctx, arg any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "UpdateProvisionerJobWithCompleteWithStartedAtByID", reflect.TypeOf((*MockStore)(nil).UpdateProvisionerJobWithCompleteWithStartedAtByID), ctx, arg) +} + // UpdateReplica mocks base method. func (m *MockStore) UpdateReplica(ctx context.Context, arg database.UpdateReplicaParams) (database.Replica, error) { m.ctrl.T.Helper() diff --git a/coderd/database/querier.go b/coderd/database/querier.go index 86647a834d20a..5167a7099dae3 100644 --- a/coderd/database/querier.go +++ b/coderd/database/querier.go @@ -567,6 +567,7 @@ type sqlcQuerier interface { UpdateProvisionerJobByID(ctx context.Context, arg UpdateProvisionerJobByIDParams) error UpdateProvisionerJobWithCancelByID(ctx context.Context, arg UpdateProvisionerJobWithCancelByIDParams) error UpdateProvisionerJobWithCompleteByID(ctx context.Context, arg UpdateProvisionerJobWithCompleteByIDParams) error + UpdateProvisionerJobWithCompleteWithStartedAtByID(ctx context.Context, arg UpdateProvisionerJobWithCompleteWithStartedAtByIDParams) error UpdateReplica(ctx context.Context, arg UpdateReplicaParams) (Replica, error) UpdateTailnetPeerStatusByCoordinator(ctx context.Context, arg UpdateTailnetPeerStatusByCoordinatorParams) error UpdateTemplateACLByID(ctx context.Context, arg UpdateTemplateACLByIDParams) error diff --git a/coderd/database/queries.sql.go b/coderd/database/queries.sql.go index c525f7895b410..b8ac4452beb19 100644 --- a/coderd/database/queries.sql.go +++ b/coderd/database/queries.sql.go @@ -8159,6 +8159,40 @@ func (q *sqlQuerier) UpdateProvisionerJobWithCompleteByID(ctx context.Context, a return err } +const updateProvisionerJobWithCompleteWithStartedAtByID = `-- name: UpdateProvisionerJobWithCompleteWithStartedAtByID :exec +UPDATE + provisioner_jobs +SET + updated_at = $2, + completed_at = $3, + error = $4, + error_code = $5, + started_at = $6 +WHERE + id = $1 +` + +type UpdateProvisionerJobWithCompleteWithStartedAtByIDParams struct { + ID uuid.UUID `db:"id" json:"id"` + UpdatedAt time.Time `db:"updated_at" json:"updated_at"` + CompletedAt sql.NullTime `db:"completed_at" json:"completed_at"` + Error sql.NullString `db:"error" json:"error"` + ErrorCode sql.NullString `db:"error_code" json:"error_code"` + StartedAt sql.NullTime `db:"started_at" json:"started_at"` +} + +func (q *sqlQuerier) UpdateProvisionerJobWithCompleteWithStartedAtByID(ctx context.Context, arg UpdateProvisionerJobWithCompleteWithStartedAtByIDParams) error { + _, err := q.db.ExecContext(ctx, updateProvisionerJobWithCompleteWithStartedAtByID, + arg.ID, + arg.UpdatedAt, + arg.CompletedAt, + arg.Error, + arg.ErrorCode, + arg.StartedAt, + ) + return err +} + const deleteProvisionerKey = `-- name: DeleteProvisionerKey :exec DELETE FROM provisioner_keys diff --git a/coderd/database/queries/provisionerjobs.sql b/coderd/database/queries/provisionerjobs.sql index d125b195b88b0..d464bdfa4d587 100644 --- a/coderd/database/queries/provisionerjobs.sql +++ b/coderd/database/queries/provisionerjobs.sql @@ -256,6 +256,18 @@ SET WHERE id = $1; +-- name: UpdateProvisionerJobWithCompleteWithStartedAtByID :exec +UPDATE + provisioner_jobs +SET + updated_at = $2, + completed_at = $3, + error = $4, + error_code = $5, + started_at = $6 +WHERE + id = $1; + -- name: GetHungProvisionerJobs :many SELECT * diff --git a/coderd/reaper/detector.go b/coderd/reaper/detector.go index 3041f8e7b9961..92d229205adc8 100644 --- a/coderd/reaper/detector.go +++ b/coderd/reaper/detector.go @@ -311,7 +311,16 @@ func reapJob(ctx context.Context, log slog.Logger, db database.Store, pub pubsub // Mark the job as failed. now = dbtime.Now() - err = db.UpdateProvisionerJobWithCompleteByID(ctx, database.UpdateProvisionerJobWithCompleteByIDParams{ + + // If the job was never started, set the StartedAt time to the current + // time so that the build duration is correct. + if !job.StartedAt.Valid { + job.StartedAt = sql.NullTime{ + Time: now, + Valid: true, + } + } + err = db.UpdateProvisionerJobWithCompleteWithStartedAtByID(ctx, database.UpdateProvisionerJobWithCompleteWithStartedAtByIDParams{ ID: job.ID, UpdatedAt: now, CompletedAt: sql.NullTime{ @@ -325,6 +334,7 @@ func reapJob(ctx context.Context, log slog.Logger, db database.Store, pub pubsub ErrorCode: sql.NullString{ Valid: false, }, + StartedAt: job.StartedAt, }) if err != nil { return xerrors.Errorf("mark job as failed: %w", err) From 3c7c323ea8978b984a48feab25a9b01103f19e4a Mon Sep 17 00:00:00 2001 From: Michael Suchacz <203725896+ibetitsmike@users.noreply.github.com> Date: Tue, 13 May 2025 18:40:38 +0000 Subject: [PATCH 08/24] added missing dbauthz tests --- coderd/database/dbauthz/dbauthz_test.go | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/coderd/database/dbauthz/dbauthz_test.go b/coderd/database/dbauthz/dbauthz_test.go index 6dc9a32f03943..01181dd6adc38 100644 --- a/coderd/database/dbauthz/dbauthz_test.go +++ b/coderd/database/dbauthz/dbauthz_test.go @@ -4034,6 +4034,13 @@ func (s *MethodTestSuite) TestSystemFunctions() { ID: j.ID, }).Asserts( /*rbac.ResourceSystem, policy.ActionUpdate*/ ) })) + s.Run("UpdateProvisionerJobWithCompleteWithStartedAtByID", s.Subtest(func(db database.Store, check *expects) { + // TODO: we need to create a ProvisionerJob resource + j := dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{}) + check.Args(database.UpdateProvisionerJobWithCompleteWithStartedAtByIDParams{ + ID: j.ID, + }).Asserts( /*rbac.ResourceSystem, policy.ActionUpdate*/ ) + })) s.Run("UpdateProvisionerJobByID", s.Subtest(func(db database.Store, check *expects) { // TODO: we need to create a ProvisionerJob resource j := dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{}) @@ -4204,6 +4211,9 @@ func (s *MethodTestSuite) TestSystemFunctions() { s.Run("GetHungProvisionerJobs", s.Subtest(func(db database.Store, check *expects) { check.Args(time.Time{}).Asserts() })) + s.Run("GetNotStartedProvisionerJobs", s.Subtest(func(db database.Store, check *expects) { + check.Args(time.Time{}).Asserts() + })) s.Run("UpsertOAuthSigningKey", s.Subtest(func(db database.Store, check *expects) { check.Args("foo").Asserts(rbac.ResourceSystem, policy.ActionUpdate) })) From 35df01f247423ad544892c5696309c61d1b16dcb Mon Sep 17 00:00:00 2001 From: Michael Suchacz <203725896+ibetitsmike@users.noreply.github.com> Date: Tue, 13 May 2025 18:45:40 +0000 Subject: [PATCH 09/24] added checks for StartedAt value in the updated jobs --- coderd/reaper/detector_test.go | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/coderd/reaper/detector_test.go b/coderd/reaper/detector_test.go index e57e0bdccd1be..1fec8a1a726e7 100644 --- a/coderd/reaper/detector_test.go +++ b/coderd/reaper/detector_test.go @@ -548,6 +548,8 @@ func TestDetectorNotStartedWorkspaceBuildNoOverrideStateIfNoExistingBuild(t *tes require.WithinDuration(t, now, job.UpdatedAt, 30*time.Second) require.True(t, job.CompletedAt.Valid) require.WithinDuration(t, now, job.CompletedAt.Time, 30*time.Second) + require.True(t, job.StartedAt.Valid) + require.WithinDuration(t, now, job.StartedAt.Time, 30*time.Second) require.True(t, job.Error.Valid) require.Contains(t, job.Error.String, "Build has been detected as not started") require.False(t, job.ErrorCode.Valid) @@ -750,6 +752,8 @@ func TestDetectorNotStartedOtherJobTypes(t *testing.T) { require.WithinDuration(t, now, job.UpdatedAt, 30*time.Second) require.True(t, job.CompletedAt.Valid) require.WithinDuration(t, now, job.CompletedAt.Time, 30*time.Second) + require.True(t, job.StartedAt.Valid) + require.WithinDuration(t, now, job.StartedAt.Time, 30*time.Second) require.True(t, job.Error.Valid) require.Contains(t, job.Error.String, "Build has been detected as not started") require.False(t, job.ErrorCode.Valid) @@ -760,6 +764,8 @@ func TestDetectorNotStartedOtherJobTypes(t *testing.T) { require.WithinDuration(t, now, job.UpdatedAt, 30*time.Second) require.True(t, job.CompletedAt.Valid) require.WithinDuration(t, now, job.CompletedAt.Time, 30*time.Second) + require.True(t, job.StartedAt.Valid) + require.WithinDuration(t, now, job.StartedAt.Time, 30*time.Second) require.True(t, job.Error.Valid) require.Contains(t, job.Error.String, "Build has been detected as not started") require.False(t, job.ErrorCode.Valid) From 8aa1ee247b5eeeb0613af6d114bf9c37c304579f Mon Sep 17 00:00:00 2001 From: Michael Suchacz <203725896+ibetitsmike@users.noreply.github.com> Date: Wed, 14 May 2025 09:42:48 +0000 Subject: [PATCH 10/24] refactor from reaper to jobreaper --- cli/server.go | 4 +- coderd/coderdtest/coderdtest.go | 4 +- coderd/{reaper => jobreaper}/detector.go | 2 +- coderd/{reaper => jobreaper}/detector_test.go | 56 +++++++++---------- provisioner/terraform/serve.go | 4 +- 5 files changed, 35 insertions(+), 35 deletions(-) rename coderd/{reaper => jobreaper}/detector.go (99%) rename coderd/{reaper => jobreaper}/detector_test.go (94%) diff --git a/cli/server.go b/cli/server.go index e924439ec26b2..994886697ff64 100644 --- a/cli/server.go +++ b/cli/server.go @@ -87,12 +87,12 @@ import ( "github.com/coder/coder/v2/coderd/externalauth" "github.com/coder/coder/v2/coderd/gitsshkey" "github.com/coder/coder/v2/coderd/httpmw" + "github.com/coder/coder/v2/coderd/jobreaper" "github.com/coder/coder/v2/coderd/notifications" "github.com/coder/coder/v2/coderd/oauthpki" "github.com/coder/coder/v2/coderd/prometheusmetrics" "github.com/coder/coder/v2/coderd/prometheusmetrics/insights" "github.com/coder/coder/v2/coderd/promoauth" - "github.com/coder/coder/v2/coderd/reaper" "github.com/coder/coder/v2/coderd/schedule" "github.com/coder/coder/v2/coderd/telemetry" "github.com/coder/coder/v2/coderd/tracing" @@ -1129,7 +1129,7 @@ func (r *RootCmd) Server(newAPI func(context.Context, *coderd.Options) (*coderd. hangDetectorTicker := time.NewTicker(vals.JobHangDetectorInterval.Value()) defer hangDetectorTicker.Stop() - hangDetector := reaper.New(ctx, options.Database, options.Pubsub, logger, hangDetectorTicker.C) + hangDetector := jobreaper.New(ctx, options.Database, options.Pubsub, logger, hangDetectorTicker.C) hangDetector.Start() defer hangDetector.Close() diff --git a/coderd/coderdtest/coderdtest.go b/coderd/coderdtest/coderdtest.go index ff418f3abaff2..f8c9cedbed99e 100644 --- a/coderd/coderdtest/coderdtest.go +++ b/coderd/coderdtest/coderdtest.go @@ -68,11 +68,11 @@ import ( "github.com/coder/coder/v2/coderd/externalauth" "github.com/coder/coder/v2/coderd/gitsshkey" "github.com/coder/coder/v2/coderd/httpmw" + "github.com/coder/coder/v2/coderd/jobreaper" "github.com/coder/coder/v2/coderd/notifications" "github.com/coder/coder/v2/coderd/notifications/notificationstest" "github.com/coder/coder/v2/coderd/rbac" "github.com/coder/coder/v2/coderd/rbac/policy" - "github.com/coder/coder/v2/coderd/reaper" "github.com/coder/coder/v2/coderd/runtimeconfig" "github.com/coder/coder/v2/coderd/schedule" "github.com/coder/coder/v2/coderd/telemetry" @@ -367,7 +367,7 @@ func NewOptions(t testing.TB, options *Options) (func(http.Handler), context.Can hangDetectorTicker := time.NewTicker(options.DeploymentValues.JobHangDetectorInterval.Value()) defer hangDetectorTicker.Stop() - hangDetector := reaper.New(ctx, options.Database, options.Pubsub, options.Logger.Named("reaper.detector"), hangDetectorTicker.C) + hangDetector := jobreaper.New(ctx, options.Database, options.Pubsub, options.Logger.Named("reaper.detector"), hangDetectorTicker.C) hangDetector.Start() t.Cleanup(hangDetector.Close) diff --git a/coderd/reaper/detector.go b/coderd/jobreaper/detector.go similarity index 99% rename from coderd/reaper/detector.go rename to coderd/jobreaper/detector.go index 92d229205adc8..14d8068eadae6 100644 --- a/coderd/reaper/detector.go +++ b/coderd/jobreaper/detector.go @@ -1,4 +1,4 @@ -package reaper +package jobreaper import ( "context" diff --git a/coderd/reaper/detector_test.go b/coderd/jobreaper/detector_test.go similarity index 94% rename from coderd/reaper/detector_test.go rename to coderd/jobreaper/detector_test.go index 1fec8a1a726e7..dba1c013b0766 100644 --- a/coderd/reaper/detector_test.go +++ b/coderd/jobreaper/detector_test.go @@ -1,4 +1,4 @@ -package reaper_test +package jobreaper_test import ( "context" @@ -20,9 +20,9 @@ import ( "github.com/coder/coder/v2/coderd/database/dbauthz" "github.com/coder/coder/v2/coderd/database/dbgen" "github.com/coder/coder/v2/coderd/database/dbtestutil" + "github.com/coder/coder/v2/coderd/jobreaper" "github.com/coder/coder/v2/coderd/provisionerdserver" "github.com/coder/coder/v2/coderd/rbac" - "github.com/coder/coder/v2/coderd/reaper" "github.com/coder/coder/v2/provisionersdk" "github.com/coder/coder/v2/testutil" ) @@ -49,10 +49,10 @@ func jobLogMessages(jobType jobType, threshold float64) []string { // reapParamsFromJob determines the type and threshold for a job being reaped func reapParamsFromJob(job database.ProvisionerJob) (jobType, float64) { jobType := hungJobType - threshold := reaper.HungJobDuration.Minutes() + threshold := jobreaper.HungJobDuration.Minutes() if !job.StartedAt.Valid { jobType = notStartedJobType - threshold = reaper.NotStartedTimeElapsed.Minutes() + threshold = jobreaper.NotStartedTimeElapsed.Minutes() } return jobType, threshold } @@ -69,10 +69,10 @@ func TestDetectorNoJobs(t *testing.T) { db, pubsub = dbtestutil.NewDB(t) log = testutil.Logger(t) tickCh = make(chan time.Time) - statsCh = make(chan reaper.Stats) + statsCh = make(chan jobreaper.Stats) ) - detector := reaper.New(ctx, wrapDBAuthz(db, log), pubsub, log, tickCh).WithStatsChannel(statsCh) + detector := jobreaper.New(ctx, wrapDBAuthz(db, log), pubsub, log, tickCh).WithStatsChannel(statsCh) detector.Start() tickCh <- time.Now() @@ -92,7 +92,7 @@ func TestDetectorNoHungJobs(t *testing.T) { db, pubsub = dbtestutil.NewDB(t) log = testutil.Logger(t) tickCh = make(chan time.Time) - statsCh = make(chan reaper.Stats) + statsCh = make(chan jobreaper.Stats) ) // Insert some jobs that are running and haven't been updated in a while, @@ -119,7 +119,7 @@ func TestDetectorNoHungJobs(t *testing.T) { }) } - detector := reaper.New(ctx, wrapDBAuthz(db, log), pubsub, log, tickCh).WithStatsChannel(statsCh) + detector := jobreaper.New(ctx, wrapDBAuthz(db, log), pubsub, log, tickCh).WithStatsChannel(statsCh) detector.Start() tickCh <- now @@ -139,7 +139,7 @@ func TestDetectorHungWorkspaceBuild(t *testing.T) { db, pubsub = dbtestutil.NewDB(t) log = testutil.Logger(t) tickCh = make(chan time.Time) - statsCh = make(chan reaper.Stats) + statsCh = make(chan jobreaper.Stats) ) var ( @@ -225,7 +225,7 @@ func TestDetectorHungWorkspaceBuild(t *testing.T) { t.Log("previous job ID: ", previousWorkspaceBuildJob.ID) t.Log("current job ID: ", currentWorkspaceBuildJob.ID) - detector := reaper.New(ctx, wrapDBAuthz(db, log), pubsub, log, tickCh).WithStatsChannel(statsCh) + detector := jobreaper.New(ctx, wrapDBAuthz(db, log), pubsub, log, tickCh).WithStatsChannel(statsCh) detector.Start() tickCh <- now @@ -261,7 +261,7 @@ func TestDetectorHungWorkspaceBuildNoOverrideState(t *testing.T) { db, pubsub = dbtestutil.NewDB(t) log = testutil.Logger(t) tickCh = make(chan time.Time) - statsCh = make(chan reaper.Stats) + statsCh = make(chan jobreaper.Stats) ) var ( @@ -348,7 +348,7 @@ func TestDetectorHungWorkspaceBuildNoOverrideState(t *testing.T) { t.Log("previous job ID: ", previousWorkspaceBuildJob.ID) t.Log("current job ID: ", currentWorkspaceBuildJob.ID) - detector := reaper.New(ctx, wrapDBAuthz(db, log), pubsub, log, tickCh).WithStatsChannel(statsCh) + detector := jobreaper.New(ctx, wrapDBAuthz(db, log), pubsub, log, tickCh).WithStatsChannel(statsCh) detector.Start() tickCh <- now @@ -384,7 +384,7 @@ func TestDetectorHungWorkspaceBuildNoOverrideStateIfNoExistingBuild(t *testing.T db, pubsub = dbtestutil.NewDB(t) log = testutil.Logger(t) tickCh = make(chan time.Time) - statsCh = make(chan reaper.Stats) + statsCh = make(chan jobreaper.Stats) ) var ( @@ -441,7 +441,7 @@ func TestDetectorHungWorkspaceBuildNoOverrideStateIfNoExistingBuild(t *testing.T t.Log("current job ID: ", currentWorkspaceBuildJob.ID) - detector := reaper.New(ctx, wrapDBAuthz(db, log), pubsub, log, tickCh).WithStatsChannel(statsCh) + detector := jobreaper.New(ctx, wrapDBAuthz(db, log), pubsub, log, tickCh).WithStatsChannel(statsCh) detector.Start() tickCh <- now @@ -477,7 +477,7 @@ func TestDetectorNotStartedWorkspaceBuildNoOverrideStateIfNoExistingBuild(t *tes db, pubsub = dbtestutil.NewDB(t) log = testutil.Logger(t) tickCh = make(chan time.Time) - statsCh = make(chan reaper.Stats) + statsCh = make(chan jobreaper.Stats) ) var ( @@ -533,7 +533,7 @@ func TestDetectorNotStartedWorkspaceBuildNoOverrideStateIfNoExistingBuild(t *tes t.Log("current job ID: ", currentWorkspaceBuildJob.ID) - detector := reaper.New(ctx, wrapDBAuthz(db, log), pubsub, log, tickCh).WithStatsChannel(statsCh) + detector := jobreaper.New(ctx, wrapDBAuthz(db, log), pubsub, log, tickCh).WithStatsChannel(statsCh) detector.Start() tickCh <- now @@ -571,7 +571,7 @@ func TestDetectorHungOtherJobTypes(t *testing.T) { db, pubsub = dbtestutil.NewDB(t) log = testutil.Logger(t) tickCh = make(chan time.Time) - statsCh = make(chan reaper.Stats) + statsCh = make(chan jobreaper.Stats) ) var ( @@ -633,7 +633,7 @@ func TestDetectorHungOtherJobTypes(t *testing.T) { t.Log("template import job ID: ", templateImportJob.ID) t.Log("template dry-run job ID: ", templateDryRunJob.ID) - detector := reaper.New(ctx, wrapDBAuthz(db, log), pubsub, log, tickCh).WithStatsChannel(statsCh) + detector := jobreaper.New(ctx, wrapDBAuthz(db, log), pubsub, log, tickCh).WithStatsChannel(statsCh) detector.Start() tickCh <- now @@ -675,7 +675,7 @@ func TestDetectorNotStartedOtherJobTypes(t *testing.T) { db, pubsub = dbtestutil.NewDB(t) log = testutil.Logger(t) tickCh = make(chan time.Time) - statsCh = make(chan reaper.Stats) + statsCh = make(chan jobreaper.Stats) ) var ( @@ -736,7 +736,7 @@ func TestDetectorNotStartedOtherJobTypes(t *testing.T) { t.Log("template import job ID: ", templateImportJob.ID) t.Log("template dry-run job ID: ", templateDryRunJob.ID) - detector := reaper.New(ctx, wrapDBAuthz(db, log), pubsub, log, tickCh).WithStatsChannel(statsCh) + detector := jobreaper.New(ctx, wrapDBAuthz(db, log), pubsub, log, tickCh).WithStatsChannel(statsCh) detector.Start() tickCh <- now @@ -782,7 +782,7 @@ func TestDetectorHungCanceledJob(t *testing.T) { db, pubsub = dbtestutil.NewDB(t) log = testutil.Logger(t) tickCh = make(chan time.Time) - statsCh = make(chan reaper.Stats) + statsCh = make(chan jobreaper.Stats) ) var ( @@ -822,7 +822,7 @@ func TestDetectorHungCanceledJob(t *testing.T) { t.Log("template import job ID: ", templateImportJob.ID) - detector := reaper.New(ctx, wrapDBAuthz(db, log), pubsub, log, tickCh).WithStatsChannel(statsCh) + detector := jobreaper.New(ctx, wrapDBAuthz(db, log), pubsub, log, tickCh).WithStatsChannel(statsCh) detector.Start() tickCh <- now @@ -884,7 +884,7 @@ func TestDetectorPushesLogs(t *testing.T) { db, pubsub = dbtestutil.NewDB(t) log = testutil.Logger(t) tickCh = make(chan time.Time) - statsCh = make(chan reaper.Stats) + statsCh = make(chan jobreaper.Stats) ) var ( @@ -937,7 +937,7 @@ func TestDetectorPushesLogs(t *testing.T) { require.Len(t, logs, 10) } - detector := reaper.New(ctx, wrapDBAuthz(db, log), pubsub, log, tickCh).WithStatsChannel(statsCh) + detector := jobreaper.New(ctx, wrapDBAuthz(db, log), pubsub, log, tickCh).WithStatsChannel(statsCh) detector.Start() // Create pubsub subscription to listen for new log events. @@ -1004,7 +1004,7 @@ func TestDetectorMaxJobsPerRun(t *testing.T) { db, pubsub = dbtestutil.NewDB(t) log = testutil.Logger(t) tickCh = make(chan time.Time) - statsCh = make(chan reaper.Stats) + statsCh = make(chan jobreaper.Stats) org = dbgen.Organization(t, db, database.Organization{}) user = dbgen.User(t, db, database.User{}) file = dbgen.File(t, db, database.File{}) @@ -1012,7 +1012,7 @@ func TestDetectorMaxJobsPerRun(t *testing.T) { // Create MaxJobsPerRun + 1 hung jobs. now := time.Now() - for i := 0; i < reaper.MaxJobsPerRun+1; i++ { + for i := 0; i < jobreaper.MaxJobsPerRun+1; i++ { pj := dbgen.ProvisionerJob(t, db, pubsub, database.ProvisionerJob{ CreatedAt: now.Add(-time.Hour), UpdatedAt: now.Add(-time.Hour), @@ -1035,14 +1035,14 @@ func TestDetectorMaxJobsPerRun(t *testing.T) { }) } - detector := reaper.New(ctx, wrapDBAuthz(db, log), pubsub, log, tickCh).WithStatsChannel(statsCh) + detector := jobreaper.New(ctx, wrapDBAuthz(db, log), pubsub, log, tickCh).WithStatsChannel(statsCh) detector.Start() tickCh <- now // Make sure that only MaxJobsPerRun jobs are terminated. stats := <-statsCh require.NoError(t, stats.Error) - require.Len(t, stats.TerminatedJobIDs, reaper.MaxJobsPerRun) + require.Len(t, stats.TerminatedJobIDs, jobreaper.MaxJobsPerRun) // Run the detector again and make sure that only the remaining job is // terminated. diff --git a/provisioner/terraform/serve.go b/provisioner/terraform/serve.go index 01d63b3b90086..67eeaccaed05d 100644 --- a/provisioner/terraform/serve.go +++ b/provisioner/terraform/serve.go @@ -16,7 +16,7 @@ import ( "cdr.dev/slog" "github.com/coder/coder/v2/coderd/database" - "github.com/coder/coder/v2/coderd/reaper" + "github.com/coder/coder/v2/coderd/jobreaper" "github.com/coder/coder/v2/provisionersdk" ) @@ -131,7 +131,7 @@ func Serve(ctx context.Context, options *ServeOptions) error { options.Tracer = trace.NewNoopTracerProvider().Tracer("noop") } if options.ExitTimeout == 0 { - options.ExitTimeout = reaper.HungJobExitTimeout + options.ExitTimeout = jobreaper.HungJobExitTimeout } return provisionersdk.Serve(ctx, &server{ execMut: &sync.Mutex{}, From 4385933bd0f1c5d0741f3fc8f4909d1a1d3e336e Mon Sep 17 00:00:00 2001 From: Michael Suchacz <203725896+ibetitsmike@users.noreply.github.com> Date: Wed, 14 May 2025 11:47:24 +0000 Subject: [PATCH 11/24] WIP --- cli/server.go | 10 +- coderd/coderdtest/coderdtest.go | 10 +- coderd/database/dbauthz/dbauthz.go | 55 +++++----- coderd/database/dbauthz/dbauthz_test.go | 29 ++---- coderd/database/dbmem/dbmem.go | 38 +++---- coderd/database/dbmetrics/querymetrics.go | 14 +-- coderd/database/dbmock/dbmock.go | 30 +++--- coderd/database/querier.go | 2 +- coderd/database/queries.sql.go | 6 +- coderd/database/queries/provisionerjobs.sql | 2 +- coderd/httpmw/loggermw/logger.go | 2 +- coderd/jobreaper/detector.go | 106 +++++++++++--------- coderd/jobreaper/detector_test.go | 18 ++-- coderd/rbac/authz.go | 2 +- coderd/rbac/object_gen.go | 1 + coderd/rbac/policy/policy.go | 3 +- codersdk/deployment.go | 8 +- codersdk/rbacresources_gen.go | 2 +- 18 files changed, 177 insertions(+), 161 deletions(-) diff --git a/cli/server.go b/cli/server.go index 994886697ff64..c04514f7c76e5 100644 --- a/cli/server.go +++ b/cli/server.go @@ -1127,11 +1127,11 @@ func (r *RootCmd) Server(newAPI func(context.Context, *coderd.Options) (*coderd. ctx, options.Database, options.Pubsub, options.PrometheusRegistry, coderAPI.TemplateScheduleStore, &coderAPI.Auditor, coderAPI.AccessControlStore, logger, autobuildTicker.C, options.NotificationsEnqueuer) autobuildExecutor.Run() - hangDetectorTicker := time.NewTicker(vals.JobHangDetectorInterval.Value()) - defer hangDetectorTicker.Stop() - hangDetector := jobreaper.New(ctx, options.Database, options.Pubsub, logger, hangDetectorTicker.C) - hangDetector.Start() - defer hangDetector.Close() + jobReaperTicker := time.NewTicker(vals.JobReaperDetectorInterval.Value()) + defer jobReaperTicker.Stop() + jobReaper := jobreaper.New(ctx, options.Database, options.Pubsub, logger, jobReaperTicker.C) + jobReaper.Start() + defer jobReaper.Close() waitForProvisionerJobs := false // Currently there is no way to ask the server to shut diff --git a/coderd/coderdtest/coderdtest.go b/coderd/coderdtest/coderdtest.go index f8c9cedbed99e..1c36bfba97f28 100644 --- a/coderd/coderdtest/coderdtest.go +++ b/coderd/coderdtest/coderdtest.go @@ -365,11 +365,11 @@ func NewOptions(t testing.TB, options *Options) (func(http.Handler), context.Can ).WithStatsChannel(options.AutobuildStats) lifecycleExecutor.Run() - hangDetectorTicker := time.NewTicker(options.DeploymentValues.JobHangDetectorInterval.Value()) - defer hangDetectorTicker.Stop() - hangDetector := jobreaper.New(ctx, options.Database, options.Pubsub, options.Logger.Named("reaper.detector"), hangDetectorTicker.C) - hangDetector.Start() - t.Cleanup(hangDetector.Close) + jobReaperTicker := time.NewTicker(options.DeploymentValues.JobReaperDetectorInterval.Value()) + defer jobReaperTicker.Stop() + jobReaper := jobreaper.New(ctx, options.Database, options.Pubsub, options.Logger.Named("reaper.detector"), jobReaperTicker.C) + jobReaper.Start() + t.Cleanup(jobReaper.Close) if options.TelemetryReporter == nil { options.TelemetryReporter = telemetry.NewNoop() diff --git a/coderd/database/dbauthz/dbauthz.go b/coderd/database/dbauthz/dbauthz.go index 5db272f62a856..90ee7560776d0 100644 --- a/coderd/database/dbauthz/dbauthz.go +++ b/coderd/database/dbauthz/dbauthz.go @@ -220,18 +220,19 @@ var ( }.WithCachedASTValue() // See reaper package. - subjectHangDetector = rbac.Subject{ - Type: rbac.SubjectTypeHangDetector, - FriendlyName: "Hang Detector", + subjectJobReaper = rbac.Subject{ + Type: rbac.SubjectTypeJobReaper, + FriendlyName: "Job Reaper", ID: uuid.Nil.String(), Roles: rbac.Roles([]rbac.Role{ { - Identifier: rbac.RoleIdentifier{Name: "hangdetector"}, - DisplayName: "Hang Detector Daemon", + Identifier: rbac.RoleIdentifier{Name: "jobreaper"}, + DisplayName: "Job Reaper Daemon", Site: rbac.Permissions(map[string][]policy.Action{ - rbac.ResourceSystem.Type: {policy.WildcardSymbol}, - rbac.ResourceTemplate.Type: {policy.ActionRead}, - rbac.ResourceWorkspace.Type: {policy.ActionRead, policy.ActionUpdate}, + rbac.ResourceSystem.Type: {policy.WildcardSymbol}, + rbac.ResourceTemplate.Type: {policy.ActionRead}, + rbac.ResourceWorkspace.Type: {policy.ActionRead, policy.ActionUpdate}, + rbac.ResourceProvisionerJobs.Type: {policy.ActionRead, policy.ActionUpdate}, }), Org: map[string][]rbac.Permission{}, User: []rbac.Permission{}, @@ -407,10 +408,10 @@ func AsAutostart(ctx context.Context) context.Context { return As(ctx, subjectAutostart) } -// AsHangDetector returns a context with an actor that has permissions required +// AsJobReaper returns a context with an actor that has permissions required // for reaper.Detector to function. -func AsHangDetector(ctx context.Context) context.Context { - return As(ctx, subjectHangDetector) +func AsJobReaper(ctx context.Context) context.Context { + return As(ctx, subjectJobReaper) } // AsKeyRotator returns a context with an actor that has permissions required for rotating crypto keys. @@ -1074,6 +1075,13 @@ func (q *querier) customRoleCheck(ctx context.Context, role database.CustomRole) return nil } +func (q *querier) GetPendingProvisionerJobs(ctx context.Context, lastUpdatedSince time.Time) ([]database.ProvisionerJob, error) { + if err := q.authorizeContext(ctx, policy.ActionRead, rbac.ResourceProvisionerJobs); err != nil { + return nil, err + } + return q.db.GetPendingProvisionerJobs(ctx, lastUpdatedSince) +} + func (q *querier) AcquireLock(ctx context.Context, id int64) error { return q.db.AcquireLock(ctx, id) } @@ -1912,11 +1920,10 @@ func (q *querier) GetHealthSettings(ctx context.Context) (string, error) { return q.db.GetHealthSettings(ctx) } -// TODO: We need to create a ProvisionerJob resource type func (q *querier) GetHungProvisionerJobs(ctx context.Context, hungSince time.Time) ([]database.ProvisionerJob, error) { - // if err := q.authorizeContext(ctx, policy.ActionCreate, rbac.ResourceSystem); err != nil { - // return nil, err - // } + if err := q.authorizeContext(ctx, policy.ActionRead, rbac.ResourceProvisionerJobs); err != nil { + return nil, err + } return q.db.GetHungProvisionerJobs(ctx, hungSince) } @@ -1992,10 +1999,6 @@ func (q *querier) GetLogoURL(ctx context.Context) (string, error) { return q.db.GetLogoURL(ctx) } -func (q *querier) GetNotStartedProvisionerJobs(ctx context.Context, notStartedSince time.Time) ([]database.ProvisionerJob, error) { - return q.db.GetNotStartedProvisionerJobs(ctx, notStartedSince) -} - func (q *querier) GetNotificationMessagesByStatus(ctx context.Context, arg database.GetNotificationMessagesByStatusParams) ([]database.NotificationMessage, error) { if err := q.authorizeContext(ctx, policy.ActionRead, rbac.ResourceNotificationMessage); err != nil { return nil, err @@ -4180,6 +4183,10 @@ func (q *querier) UpdateProvisionerJobByID(ctx context.Context, arg database.Upd } func (q *querier) UpdateProvisionerJobWithCancelByID(ctx context.Context, arg database.UpdateProvisionerJobWithCancelByIDParams) error { + if err := q.authorizeContext(ctx, policy.ActionUpdate, rbac.ResourceProvisionerJobs); err != nil { + return err + } + job, err := q.db.GetProvisionerJobByID(ctx, arg.ID) if err != nil { return err @@ -4246,15 +4253,17 @@ func (q *querier) UpdateProvisionerJobWithCancelByID(ctx context.Context, arg da return q.db.UpdateProvisionerJobWithCancelByID(ctx, arg) } -// TODO: We need to create a ProvisionerJob resource type func (q *querier) UpdateProvisionerJobWithCompleteByID(ctx context.Context, arg database.UpdateProvisionerJobWithCompleteByIDParams) error { - // if err := q.authorizeContext(ctx, policy.ActionUpdate, rbac.ResourceSystem); err != nil { - // return err - // } + if err := q.authorizeContext(ctx, policy.ActionUpdate, rbac.ResourceProvisionerJobs); err != nil { + return err + } return q.db.UpdateProvisionerJobWithCompleteByID(ctx, arg) } func (q *querier) UpdateProvisionerJobWithCompleteWithStartedAtByID(ctx context.Context, arg database.UpdateProvisionerJobWithCompleteWithStartedAtByIDParams) error { + if err := q.authorizeContext(ctx, policy.ActionUpdate, rbac.ResourceProvisionerJobs); err != nil { + return err + } return q.db.UpdateProvisionerJobWithCompleteWithStartedAtByID(ctx, arg) } diff --git a/coderd/database/dbauthz/dbauthz_test.go b/coderd/database/dbauthz/dbauthz_test.go index 01181dd6adc38..0bc116d63987d 100644 --- a/coderd/database/dbauthz/dbauthz_test.go +++ b/coderd/database/dbauthz/dbauthz_test.go @@ -3891,9 +3891,8 @@ func (s *MethodTestSuite) TestSystemFunctions() { check.Args().Asserts(rbac.ResourceSystem, policy.ActionDelete) })) s.Run("GetProvisionerJobsCreatedAfter", s.Subtest(func(db database.Store, check *expects) { - // TODO: add provisioner job resource type _ = dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{CreatedAt: time.Now().Add(-time.Hour)}) - check.Args(time.Now()).Asserts( /*rbac.ResourceSystem, policy.ActionRead*/ ) + check.Args(time.Now()).Asserts(rbac.ResourceSystem, policy.ActionRead) })) s.Run("GetTemplateVersionsByIDs", s.Subtest(func(db database.Store, check *expects) { dbtestutil.DisableForeignKeysAndTriggers(s.T(), db) @@ -3976,11 +3975,10 @@ func (s *MethodTestSuite) TestSystemFunctions() { Returns([]database.WorkspaceAgent{agt}) })) s.Run("GetProvisionerJobsByIDs", s.Subtest(func(db database.Store, check *expects) { - // TODO: add a ProvisionerJob resource type a := dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{}) b := dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{}) check.Args([]uuid.UUID{a.ID, b.ID}). - Asserts( /*rbac.ResourceSystem, policy.ActionRead*/ ). + Asserts(rbac.ResourceSystem, policy.ActionRead). Returns(slice.New(a, b)) })) s.Run("InsertWorkspaceAgent", s.Subtest(func(db database.Store, check *expects) { @@ -4015,7 +4013,6 @@ func (s *MethodTestSuite) TestSystemFunctions() { }).Asserts(rbac.ResourceSystem, policy.ActionUpdate).Returns() })) s.Run("AcquireProvisionerJob", s.Subtest(func(db database.Store, check *expects) { - // TODO: we need to create a ProvisionerJob resource j := dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{ StartedAt: sql.NullTime{Valid: false}, UpdatedAt: time.Now(), @@ -4025,54 +4022,48 @@ func (s *MethodTestSuite) TestSystemFunctions() { OrganizationID: j.OrganizationID, Types: []database.ProvisionerType{j.Provisioner}, ProvisionerTags: must(json.Marshal(j.Tags)), - }).Asserts( /*rbac.ResourceSystem, policy.ActionUpdate*/ ) + }).Asserts(rbac.ResourceSystem, policy.ActionUpdate) })) s.Run("UpdateProvisionerJobWithCompleteByID", s.Subtest(func(db database.Store, check *expects) { - // TODO: we need to create a ProvisionerJob resource j := dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{}) check.Args(database.UpdateProvisionerJobWithCompleteByIDParams{ ID: j.ID, - }).Asserts( /*rbac.ResourceSystem, policy.ActionUpdate*/ ) + }).Asserts(rbac.ResourceSystem, policy.ActionUpdate) })) s.Run("UpdateProvisionerJobWithCompleteWithStartedAtByID", s.Subtest(func(db database.Store, check *expects) { - // TODO: we need to create a ProvisionerJob resource j := dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{}) check.Args(database.UpdateProvisionerJobWithCompleteWithStartedAtByIDParams{ ID: j.ID, - }).Asserts( /*rbac.ResourceSystem, policy.ActionUpdate*/ ) + }).Asserts(rbac.ResourceSystem, policy.ActionUpdate) })) s.Run("UpdateProvisionerJobByID", s.Subtest(func(db database.Store, check *expects) { - // TODO: we need to create a ProvisionerJob resource j := dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{}) check.Args(database.UpdateProvisionerJobByIDParams{ ID: j.ID, UpdatedAt: time.Now(), - }).Asserts( /*rbac.ResourceSystem, policy.ActionUpdate*/ ) + }).Asserts(rbac.ResourceSystem, policy.ActionUpdate) })) s.Run("InsertProvisionerJob", s.Subtest(func(db database.Store, check *expects) { dbtestutil.DisableForeignKeysAndTriggers(s.T(), db) - // TODO: we need to create a ProvisionerJob resource check.Args(database.InsertProvisionerJobParams{ ID: uuid.New(), Provisioner: database.ProvisionerTypeEcho, StorageMethod: database.ProvisionerStorageMethodFile, Type: database.ProvisionerJobTypeWorkspaceBuild, Input: json.RawMessage("{}"), - }).Asserts( /*rbac.ResourceSystem, policy.ActionCreate*/ ) + }).Asserts(rbac.ResourceSystem, policy.ActionCreate) })) s.Run("InsertProvisionerJobLogs", s.Subtest(func(db database.Store, check *expects) { - // TODO: we need to create a ProvisionerJob resource j := dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{}) check.Args(database.InsertProvisionerJobLogsParams{ JobID: j.ID, - }).Asserts( /*rbac.ResourceSystem, policy.ActionCreate*/ ) + }).Asserts(rbac.ResourceSystem, policy.ActionCreate) })) s.Run("InsertProvisionerJobTimings", s.Subtest(func(db database.Store, check *expects) { - // TODO: we need to create a ProvisionerJob resource j := dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{}) check.Args(database.InsertProvisionerJobTimingsParams{ JobID: j.ID, - }).Asserts( /*rbac.ResourceSystem, policy.ActionCreate*/ ) + }).Asserts(rbac.ResourceSystem, policy.ActionCreate) })) s.Run("UpsertProvisionerDaemon", s.Subtest(func(db database.Store, check *expects) { dbtestutil.DisableForeignKeysAndTriggers(s.T(), db) @@ -4211,7 +4202,7 @@ func (s *MethodTestSuite) TestSystemFunctions() { s.Run("GetHungProvisionerJobs", s.Subtest(func(db database.Store, check *expects) { check.Args(time.Time{}).Asserts() })) - s.Run("GetNotStartedProvisionerJobs", s.Subtest(func(db database.Store, check *expects) { + s.Run("GetPendingProvisionerJobs", s.Subtest(func(db database.Store, check *expects) { check.Args(time.Time{}).Asserts() })) s.Run("UpsertOAuthSigningKey", s.Subtest(func(db database.Store, check *expects) { diff --git a/coderd/database/dbmem/dbmem.go b/coderd/database/dbmem/dbmem.go index 43c804d1fdb46..ffc862d221c7d 100644 --- a/coderd/database/dbmem/dbmem.go +++ b/coderd/database/dbmem/dbmem.go @@ -1386,6 +1386,10 @@ func isDeprecated(template database.Template) bool { return template.Deprecated != "" } +func (q *FakeQuerier) GetProvisionerJobsToBeReaped(ctx context.Context, updatedAt time.Time) ([]database.ProvisionerJob, error) { + panic("not implemented") +} + func (*FakeQuerier) AcquireLock(_ context.Context, _ int64) error { return xerrors.New("AcquireLock must only be called within a transaction") } @@ -3897,23 +3901,6 @@ func (q *FakeQuerier) GetLogoURL(_ context.Context) (string, error) { return q.logoURL, nil } -func (q *FakeQuerier) GetNotStartedProvisionerJobs(ctx context.Context, notStartedSince time.Time) ([]database.ProvisionerJob, error) { - q.mutex.RLock() - defer q.mutex.RUnlock() - - notStartedJobs := []database.ProvisionerJob{} - for _, provisionerJob := range q.provisionerJobs { - if !provisionerJob.StartedAt.Valid && !provisionerJob.CompletedAt.Valid && provisionerJob.UpdatedAt.Before(notStartedSince) { - // clone the Tags before appending, since maps are reference types and - // we don't want the caller to be able to mutate the map we have inside - // dbmem! - provisionerJob.Tags = maps.Clone(provisionerJob.Tags) - notStartedJobs = append(notStartedJobs, provisionerJob) - } - } - return notStartedJobs, nil -} - func (q *FakeQuerier) GetNotificationMessagesByStatus(_ context.Context, arg database.GetNotificationMessagesByStatusParams) ([]database.NotificationMessage, error) { err := validateDatabaseType(arg) if err != nil { @@ -4291,6 +4278,23 @@ func (q *FakeQuerier) GetParameterSchemasByJobID(_ context.Context, jobID uuid.U return parameters, nil } +func (q *FakeQuerier) GetPendingProvisionerJobs(_ context.Context, lastUpdatedSince time.Time) ([]database.ProvisionerJob, error) { + q.mutex.RLock() + defer q.mutex.RUnlock() + + pendingJobs := []database.ProvisionerJob{} + for _, provisionerJob := range q.provisionerJobs { + if !provisionerJob.StartedAt.Valid && !provisionerJob.CompletedAt.Valid && provisionerJob.UpdatedAt.Before(lastUpdatedSince) { + // clone the Tags before appending, since maps are reference types and + // we don't want the caller to be able to mutate the map we have inside + // dbmem! + provisionerJob.Tags = maps.Clone(provisionerJob.Tags) + pendingJobs = append(pendingJobs, provisionerJob) + } + } + return pendingJobs, nil +} + func (*FakeQuerier) GetPrebuildMetrics(_ context.Context) ([]database.GetPrebuildMetricsRow, error) { return nil, ErrUnimplemented } diff --git a/coderd/database/dbmetrics/querymetrics.go b/coderd/database/dbmetrics/querymetrics.go index 2d3aea66a6ad8..7e31679d88a50 100644 --- a/coderd/database/dbmetrics/querymetrics.go +++ b/coderd/database/dbmetrics/querymetrics.go @@ -949,13 +949,6 @@ func (m queryMetricsStore) GetLogoURL(ctx context.Context) (string, error) { return url, err } -func (m queryMetricsStore) GetNotStartedProvisionerJobs(ctx context.Context, updatedAt time.Time) ([]database.ProvisionerJob, error) { - start := time.Now() - r0, r1 := m.s.GetNotStartedProvisionerJobs(ctx, updatedAt) - m.queryLatencies.WithLabelValues("GetNotStartedProvisionerJobs").Observe(time.Since(start).Seconds()) - return r0, r1 -} - func (m queryMetricsStore) GetNotificationMessagesByStatus(ctx context.Context, arg database.GetNotificationMessagesByStatusParams) ([]database.NotificationMessage, error) { start := time.Now() r0, r1 := m.s.GetNotificationMessagesByStatus(ctx, arg) @@ -1117,6 +1110,13 @@ func (m queryMetricsStore) GetParameterSchemasByJobID(ctx context.Context, jobID return schemas, err } +func (m queryMetricsStore) GetPendingProvisionerJobs(ctx context.Context, updatedAt time.Time) ([]database.ProvisionerJob, error) { + start := time.Now() + r0, r1 := m.s.GetPendingProvisionerJobs(ctx, updatedAt) + m.queryLatencies.WithLabelValues("GetPendingProvisionerJobs").Observe(time.Since(start).Seconds()) + return r0, r1 +} + func (m queryMetricsStore) GetPrebuildMetrics(ctx context.Context) ([]database.GetPrebuildMetricsRow, error) { start := time.Now() r0, r1 := m.s.GetPrebuildMetrics(ctx) diff --git a/coderd/database/dbmock/dbmock.go b/coderd/database/dbmock/dbmock.go index 0e1abb4d9dca6..38c0f7b35f3e9 100644 --- a/coderd/database/dbmock/dbmock.go +++ b/coderd/database/dbmock/dbmock.go @@ -1923,21 +1923,6 @@ func (mr *MockStoreMockRecorder) GetLogoURL(ctx any) *gomock.Call { return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetLogoURL", reflect.TypeOf((*MockStore)(nil).GetLogoURL), ctx) } -// GetNotStartedProvisionerJobs mocks base method. -func (m *MockStore) GetNotStartedProvisionerJobs(ctx context.Context, updatedAt time.Time) ([]database.ProvisionerJob, error) { - m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "GetNotStartedProvisionerJobs", ctx, updatedAt) - ret0, _ := ret[0].([]database.ProvisionerJob) - ret1, _ := ret[1].(error) - return ret0, ret1 -} - -// GetNotStartedProvisionerJobs indicates an expected call of GetNotStartedProvisionerJobs. -func (mr *MockStoreMockRecorder) GetNotStartedProvisionerJobs(ctx, updatedAt any) *gomock.Call { - mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetNotStartedProvisionerJobs", reflect.TypeOf((*MockStore)(nil).GetNotStartedProvisionerJobs), ctx, updatedAt) -} - // GetNotificationMessagesByStatus mocks base method. func (m *MockStore) GetNotificationMessagesByStatus(ctx context.Context, arg database.GetNotificationMessagesByStatusParams) ([]database.NotificationMessage, error) { m.ctrl.T.Helper() @@ -2283,6 +2268,21 @@ func (mr *MockStoreMockRecorder) GetParameterSchemasByJobID(ctx, jobID any) *gom return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetParameterSchemasByJobID", reflect.TypeOf((*MockStore)(nil).GetParameterSchemasByJobID), ctx, jobID) } +// GetPendingProvisionerJobs mocks base method. +func (m *MockStore) GetPendingProvisionerJobs(ctx context.Context, updatedAt time.Time) ([]database.ProvisionerJob, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "GetPendingProvisionerJobs", ctx, updatedAt) + ret0, _ := ret[0].([]database.ProvisionerJob) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// GetPendingProvisionerJobs indicates an expected call of GetPendingProvisionerJobs. +func (mr *MockStoreMockRecorder) GetPendingProvisionerJobs(ctx, updatedAt any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetPendingProvisionerJobs", reflect.TypeOf((*MockStore)(nil).GetPendingProvisionerJobs), ctx, updatedAt) +} + // GetPrebuildMetrics mocks base method. func (m *MockStore) GetPrebuildMetrics(ctx context.Context) ([]database.GetPrebuildMetricsRow, error) { m.ctrl.T.Helper() diff --git a/coderd/database/querier.go b/coderd/database/querier.go index 5167a7099dae3..52a7ff22daa98 100644 --- a/coderd/database/querier.go +++ b/coderd/database/querier.go @@ -213,7 +213,6 @@ type sqlcQuerier interface { GetLicenseByID(ctx context.Context, id int32) (License, error) GetLicenses(ctx context.Context) ([]License, error) GetLogoURL(ctx context.Context) (string, error) - GetNotStartedProvisionerJobs(ctx context.Context, updatedAt time.Time) ([]ProvisionerJob, error) GetNotificationMessagesByStatus(ctx context.Context, arg GetNotificationMessagesByStatusParams) ([]NotificationMessage, error) // Fetch the notification report generator log indicating recent activity. GetNotificationReportGeneratorLogByTemplate(ctx context.Context, templateID uuid.UUID) (NotificationReportGeneratorLog, error) @@ -238,6 +237,7 @@ type sqlcQuerier interface { GetOrganizations(ctx context.Context, arg GetOrganizationsParams) ([]Organization, error) GetOrganizationsByUserID(ctx context.Context, arg GetOrganizationsByUserIDParams) ([]Organization, error) GetParameterSchemasByJobID(ctx context.Context, jobID uuid.UUID) ([]ParameterSchema, error) + GetPendingProvisionerJobs(ctx context.Context, updatedAt time.Time) ([]ProvisionerJob, error) GetPrebuildMetrics(ctx context.Context) ([]GetPrebuildMetricsRow, error) GetPresetByID(ctx context.Context, presetID uuid.UUID) (GetPresetByIDRow, error) GetPresetByWorkspaceBuildID(ctx context.Context, workspaceBuildID uuid.UUID) (TemplateVersionPreset, error) diff --git a/coderd/database/queries.sql.go b/coderd/database/queries.sql.go index b8ac4452beb19..dce0903e84b83 100644 --- a/coderd/database/queries.sql.go +++ b/coderd/database/queries.sql.go @@ -7430,7 +7430,7 @@ func (q *sqlQuerier) GetHungProvisionerJobs(ctx context.Context, updatedAt time. return items, nil } -const getNotStartedProvisionerJobs = `-- name: GetNotStartedProvisionerJobs :many +const getPendingProvisionerJobs = `-- name: GetPendingProvisionerJobs :many SELECT id, created_at, updated_at, started_at, canceled_at, completed_at, error, organization_id, initiator_id, provisioner, storage_method, type, input, worker_id, file_id, tags, error_code, trace_metadata, job_status FROM @@ -7441,8 +7441,8 @@ WHERE AND completed_at IS NULL ` -func (q *sqlQuerier) GetNotStartedProvisionerJobs(ctx context.Context, updatedAt time.Time) ([]ProvisionerJob, error) { - rows, err := q.db.QueryContext(ctx, getNotStartedProvisionerJobs, updatedAt) +func (q *sqlQuerier) GetPendingProvisionerJobs(ctx context.Context, updatedAt time.Time) ([]ProvisionerJob, error) { + rows, err := q.db.QueryContext(ctx, getPendingProvisionerJobs, updatedAt) if err != nil { return nil, err } diff --git a/coderd/database/queries/provisionerjobs.sql b/coderd/database/queries/provisionerjobs.sql index d464bdfa4d587..02654eb4ddc4e 100644 --- a/coderd/database/queries/provisionerjobs.sql +++ b/coderd/database/queries/provisionerjobs.sql @@ -278,7 +278,7 @@ WHERE AND started_at IS NOT NULL AND completed_at IS NULL; --- name: GetNotStartedProvisionerJobs :many +-- name: GetPendingProvisionerJobs :many SELECT * FROM diff --git a/coderd/httpmw/loggermw/logger.go b/coderd/httpmw/loggermw/logger.go index 9eeb07a5f10e5..30e5e2d811ad8 100644 --- a/coderd/httpmw/loggermw/logger.go +++ b/coderd/httpmw/loggermw/logger.go @@ -132,7 +132,7 @@ var actorLogOrder = []rbac.SubjectType{ rbac.SubjectTypeAutostart, rbac.SubjectTypeCryptoKeyReader, rbac.SubjectTypeCryptoKeyRotator, - rbac.SubjectTypeHangDetector, + rbac.SubjectTypeJobReaper, rbac.SubjectTypeNotifier, rbac.SubjectTypePrebuildsOrchestrator, rbac.SubjectTypeProvisionerd, diff --git a/coderd/jobreaper/detector.go b/coderd/jobreaper/detector.go index 14d8068eadae6..3ff1f6f6ddf86 100644 --- a/coderd/jobreaper/detector.go +++ b/coderd/jobreaper/detector.go @@ -25,9 +25,9 @@ const ( // before it is considered hung. HungJobDuration = 5 * time.Minute - // NotStartedTimeElapsed is the duration of time since the last update to a job + // PendingJobTimeElapsed is the duration of time since the last update to a job // before it is considered hung. - NotStartedTimeElapsed = 30 * time.Minute + PendingJobTimeElapsed = 30 * time.Minute // HungJobExitTimeout is the duration of time that provisioners should allow // for a graceful exit upon cancellation due to failing to send an update to @@ -42,24 +42,23 @@ const ( MaxJobsPerRun = 10 ) -type jobType string - -const ( - hungJobType jobType = "hung" - notStartedJobType jobType = "not started" -) - // jobLogMessages are written to provisioner job logs when a job is reaped -func jobLogMessages(jobType jobType, threshold float64) []string { +func jobLogMessages(jobType database.ProvisionerJobStatus, threshold time.Duration) []string { return []string{ "", "====================", - fmt.Sprintf("Coder: Build has been detected as %s for %.0f minutes and will be terminated.", jobType, threshold), + fmt.Sprintf("Coder: Build has been detected as %s for %.0f minutes and will be terminated.", jobType, threshold.Minutes()), "====================", "", } } +type jobToReap struct { + ID uuid.UUID + Threshold time.Duration + Status database.ProvisionerJobStatus +} + // acquireLockError is returned when the detector fails to acquire a lock and // cancels the current run. type acquireLockError struct{} @@ -108,7 +107,7 @@ type Stats struct { // New returns a new hang detector. func New(ctx context.Context, db database.Store, pub pubsub.Pubsub, log slog.Logger, tick <-chan time.Time) *Detector { //nolint:gocritic // Hang detector has a limited set of permissions. - ctx, cancel := context.WithCancel(dbauthz.AsHangDetector(ctx)) + ctx, cancel := context.WithCancel(dbauthz.AsJobReaper(ctx)) d := &Detector{ ctx: ctx, cancel: cancel, @@ -184,42 +183,55 @@ func (d *Detector) run(t time.Time) Stats { Error: nil, } + jobsToReap := make([]*jobToReap, 0, MaxJobsPerRun) + // Find all provisioner jobs that are currently running but have not // received an update in the last 5 minutes. - jobs, err := d.db.GetHungProvisionerJobs(ctx, t.Add(-HungJobDuration)) + hungJobs, err := d.db.GetHungProvisionerJobs(ctx, t.Add(-HungJobDuration)) if err != nil { - stats.Error = xerrors.Errorf("get %s provisioner jobs: %w", hungJobType, err) + stats.Error = xerrors.Errorf("get hung provisioner jobs: %w", err) return stats } + for _, job := range hungJobs { + jobsToReap = append(jobsToReap, &jobToReap{ + ID: job.ID, + Threshold: HungJobDuration, + }) + } + // Find all provisioner jobs that have not been started yet and have not // received an update in the last 30 minutes. - jobsNotStarted, err := d.db.GetNotStartedProvisionerJobs(ctx, t.Add(-NotStartedTimeElapsed)) + jobsPending, err := d.db.GetPendingProvisionerJobs(ctx, t.Add(-PendingJobTimeElapsed)) if err != nil { - stats.Error = xerrors.Errorf("get %s provisioner jobs: %w", notStartedJobType, err) + stats.Error = xerrors.Errorf("get pending provisioner jobs: %w", err) return stats } - jobs = append(jobs, jobsNotStarted...) + for _, job := range jobsPending { + jobsToReap = append(jobsToReap, &jobToReap{ + ID: job.ID, + Threshold: PendingJobTimeElapsed, + }) + } - // Limit the number of jobs we'll unhang in a single run to avoid + // Limit the number of jobs we'll reap in a single run to avoid // timing out. - if len(jobs) > MaxJobsPerRun { - // Pick a random subset of the jobs to unhang. - rand.Shuffle(len(jobs), func(i, j int) { - jobs[i], jobs[j] = jobs[j], jobs[i] + if len(jobsToReap) > MaxJobsPerRun { + // Pick a random subset of the jobs to reap. + rand.Shuffle(len(jobsToReap), func(i, j int) { + jobsToReap[i], jobsToReap[j] = jobsToReap[j], jobsToReap[i] }) - jobs = jobs[:MaxJobsPerRun] + jobsToReap = jobsToReap[:MaxJobsPerRun] } - // Send a message into the build log for each hung or not started job saying that it + // Send a message into the build log for each hung or pending job saying that it // has been detected and will be terminated, then mark the job as failed. - for _, job := range jobs { + for _, job := range jobsToReap { log := d.log.With(slog.F("job_id", job.ID)) - err := reapJob(ctx, log, d.db, d.pubsub, job.ID) + err := reapJob(ctx, log, d.db, d.pubsub, job) if err != nil { - jobType, _ := reapParamsFromJob(job) if !(xerrors.As(err, &acquireLockError{}) || xerrors.As(err, &jobIneligibleError{})) { - log.Error(ctx, fmt.Sprintf("error forcefully terminating %s provisioner job", jobType), slog.Error(err)) + log.Error(ctx, fmt.Sprintf("error forcefully terminating %s provisioner job", job.Status), slog.Error(err)) } continue } @@ -230,11 +242,11 @@ func (d *Detector) run(t time.Time) Stats { return stats } -func reapJob(ctx context.Context, log slog.Logger, db database.Store, pub pubsub.Pubsub, jobID uuid.UUID) error { +func reapJob(ctx context.Context, log slog.Logger, db database.Store, pub pubsub.Pubsub, jobToReap *jobToReap) error { var lowestLogID int64 err := db.InTx(func(db database.Store) error { - locked, err := db.TryAcquireLock(ctx, database.GenLockID(fmt.Sprintf("reaper:%s", jobID))) + locked, err := db.TryAcquireLock(ctx, database.GenLockID(fmt.Sprintf("reaper:%s", jobToReap.ID))) if err != nil { return xerrors.Errorf("acquire lock: %w", err) } @@ -244,26 +256,26 @@ func reapJob(ctx context.Context, log slog.Logger, db database.Store, pub pubsub } // Refetch the job while we hold the lock. - job, err := db.GetProvisionerJobByID(ctx, jobID) + job, err := db.GetProvisionerJobByID(ctx, jobToReap.ID) if err != nil { return xerrors.Errorf("get provisioner job: %w", err) } - jobType, threshold := reapParamsFromJob(job) + threshold := reapParamsFromJob(job) if job.CompletedAt.Valid { return jobIneligibleError{ Err: xerrors.Errorf("job is completed (status %s)", job.JobStatus), } } - if job.UpdatedAt.After(time.Now().Add(-time.Duration(threshold) * time.Minute)) { + if job.UpdatedAt.After(time.Now().Add(-threshold)) { return jobIneligibleError{ Err: xerrors.New("job has been updated recently"), } } log.Warn( - ctx, fmt.Sprintf("detected %s provisioner job, forcefully terminating", jobType), + ctx, fmt.Sprintf("detected %s provisioner job, forcefully terminating", job.JobStatus), "threshold", threshold, ) @@ -274,7 +286,7 @@ func reapJob(ctx context.Context, log slog.Logger, db database.Store, pub pubsub CreatedAfter: 0, }) if err != nil { - return xerrors.Errorf("get logs for %s job: %w", jobType, err) + return xerrors.Errorf("get logs for %s job: %w", job.JobStatus, err) } logStage := "" if len(logs) != 0 { @@ -294,7 +306,7 @@ func reapJob(ctx context.Context, log slog.Logger, db database.Store, pub pubsub Output: nil, } now := dbtime.Now() - for i, msg := range jobLogMessages(jobType, threshold) { + for i, msg := range jobLogMessages(job.JobStatus, threshold) { // Set the created at in a way that ensures each message has // a unique timestamp so they will be sorted correctly. insertParams.CreatedAt = append(insertParams.CreatedAt, now.Add(time.Millisecond*time.Duration(i))) @@ -305,16 +317,16 @@ func reapJob(ctx context.Context, log slog.Logger, db database.Store, pub pubsub } newLogs, err := db.InsertProvisionerJobLogs(ctx, insertParams) if err != nil { - return xerrors.Errorf("insert logs for %s job: %w", jobType, err) + return xerrors.Errorf("insert logs for %s job: %w", job.JobStatus, err) } lowestLogID = newLogs[0].ID // Mark the job as failed. now = dbtime.Now() - // If the job was never started, set the StartedAt time to the current + // If the job was never started (pending), set the StartedAt time to the current // time so that the build duration is correct. - if !job.StartedAt.Valid { + if job.JobStatus == database.ProvisionerJobStatusPending { job.StartedAt = sql.NullTime{ Time: now, Valid: true, @@ -328,7 +340,7 @@ func reapJob(ctx context.Context, log slog.Logger, db database.Store, pub pubsub Valid: true, }, Error: sql.NullString{ - String: fmt.Sprintf("Coder: Build has been detected as %s for %.0f minutes and has been terminated by hang detector.", jobType, threshold), + String: fmt.Sprintf("Coder: Build has been detected as %s for %.0f minutes and has been terminated by hang detector.", job.JobStatus, threshold.Minutes()), Valid: true, }, ErrorCode: sql.NullString{ @@ -388,7 +400,7 @@ func reapJob(ctx context.Context, log slog.Logger, db database.Store, pub pubsub if err != nil { return xerrors.Errorf("marshal log notification: %w", err) } - err = pub.Publish(provisionersdk.ProvisionerJobLogsNotifyChannel(jobID), data) + err = pub.Publish(provisionersdk.ProvisionerJobLogsNotifyChannel(jobToReap.ID), data) if err != nil { return xerrors.Errorf("publish log notification: %w", err) } @@ -396,12 +408,10 @@ func reapJob(ctx context.Context, log slog.Logger, db database.Store, pub pubsub return nil } -func reapParamsFromJob(job database.ProvisionerJob) (jobType, float64) { - jobType := hungJobType - threshold := HungJobDuration.Minutes() - if !job.StartedAt.Valid { - jobType = notStartedJobType - threshold = NotStartedTimeElapsed.Minutes() +func reapParamsFromJob(job database.ProvisionerJob) time.Duration { + threshold := HungJobDuration + if job.JobStatus == database.ProvisionerJobStatusPending { + threshold = PendingJobTimeElapsed } - return jobType, threshold + return threshold } diff --git a/coderd/jobreaper/detector_test.go b/coderd/jobreaper/detector_test.go index dba1c013b0766..ada9a9853f032 100644 --- a/coderd/jobreaper/detector_test.go +++ b/coderd/jobreaper/detector_test.go @@ -31,8 +31,8 @@ import ( type jobType string const ( - hungJobType jobType = "hung" - notStartedJobType jobType = "not started" + hungJobType jobType = "hung" + pendingJobType jobType = "pending" ) // jobLogMessages returns the messages to be written to provisioner job logs when a job is reaped @@ -51,8 +51,8 @@ func reapParamsFromJob(job database.ProvisionerJob) (jobType, float64) { jobType := hungJobType threshold := jobreaper.HungJobDuration.Minutes() if !job.StartedAt.Valid { - jobType = notStartedJobType - threshold = jobreaper.NotStartedTimeElapsed.Minutes() + jobType = pendingJobType + threshold = jobreaper.PendingJobTimeElapsed.Minutes() } return jobType, threshold } @@ -469,7 +469,7 @@ func TestDetectorHungWorkspaceBuildNoOverrideStateIfNoExistingBuild(t *testing.T detector.Wait() } -func TestDetectorNotStartedWorkspaceBuildNoOverrideStateIfNoExistingBuild(t *testing.T) { +func TestDetectorPendingWorkspaceBuildNoOverrideStateIfNoExistingBuild(t *testing.T) { t.Parallel() var ( @@ -551,7 +551,7 @@ func TestDetectorNotStartedWorkspaceBuildNoOverrideStateIfNoExistingBuild(t *tes require.True(t, job.StartedAt.Valid) require.WithinDuration(t, now, job.StartedAt.Time, 30*time.Second) require.True(t, job.Error.Valid) - require.Contains(t, job.Error.String, "Build has been detected as not started") + require.Contains(t, job.Error.String, "Build has been detected as pending") require.False(t, job.ErrorCode.Valid) // Check that the provisioner state was NOT updated. @@ -667,7 +667,7 @@ func TestDetectorHungOtherJobTypes(t *testing.T) { detector.Wait() } -func TestDetectorNotStartedOtherJobTypes(t *testing.T) { +func TestDetectorPendingOtherJobTypes(t *testing.T) { t.Parallel() var ( @@ -755,7 +755,7 @@ func TestDetectorNotStartedOtherJobTypes(t *testing.T) { require.True(t, job.StartedAt.Valid) require.WithinDuration(t, now, job.StartedAt.Time, 30*time.Second) require.True(t, job.Error.Valid) - require.Contains(t, job.Error.String, "Build has been detected as not started") + require.Contains(t, job.Error.String, "Build has been detected as pending") require.False(t, job.ErrorCode.Valid) // Check that the template dry-run job was updated. @@ -767,7 +767,7 @@ func TestDetectorNotStartedOtherJobTypes(t *testing.T) { require.True(t, job.StartedAt.Valid) require.WithinDuration(t, now, job.StartedAt.Time, 30*time.Second) require.True(t, job.Error.Valid) - require.Contains(t, job.Error.String, "Build has been detected as not started") + require.Contains(t, job.Error.String, "Build has been detected as pending") require.False(t, job.ErrorCode.Valid) detector.Close() diff --git a/coderd/rbac/authz.go b/coderd/rbac/authz.go index d2c6d5d0675be..c63042a2a1363 100644 --- a/coderd/rbac/authz.go +++ b/coderd/rbac/authz.go @@ -65,7 +65,7 @@ const ( SubjectTypeUser SubjectType = "user" SubjectTypeProvisionerd SubjectType = "provisionerd" SubjectTypeAutostart SubjectType = "autostart" - SubjectTypeHangDetector SubjectType = "hang_detector" + SubjectTypeJobReaper SubjectType = "job_reaper" SubjectTypeResourceMonitor SubjectType = "resource_monitor" SubjectTypeCryptoKeyRotator SubjectType = "crypto_key_rotator" SubjectTypeCryptoKeyReader SubjectType = "crypto_key_reader" diff --git a/coderd/rbac/object_gen.go b/coderd/rbac/object_gen.go index 40b7dc87a56f8..0336616b5e550 100644 --- a/coderd/rbac/object_gen.go +++ b/coderd/rbac/object_gen.go @@ -235,6 +235,7 @@ var ( // ResourceProvisionerJobs // Valid Actions // - "ActionRead" :: read provisioner jobs + // - "ActionUpdate" :: update provisioner jobs ResourceProvisionerJobs = Object{ Type: "provisioner_jobs", } diff --git a/coderd/rbac/policy/policy.go b/coderd/rbac/policy/policy.go index 35da0892abfdb..070692c5e50c6 100644 --- a/coderd/rbac/policy/policy.go +++ b/coderd/rbac/policy/policy.go @@ -182,7 +182,8 @@ var RBACPermissions = map[string]PermissionDefinition{ }, "provisioner_jobs": { Actions: map[Action]ActionDefinition{ - ActionRead: actDef("read provisioner jobs"), + ActionRead: actDef("read provisioner jobs"), + ActionUpdate: actDef("update provisioner jobs"), }, }, "organization": { diff --git a/codersdk/deployment.go b/codersdk/deployment.go index 0741bf9e3844a..39b67feb2c73a 100644 --- a/codersdk/deployment.go +++ b/codersdk/deployment.go @@ -345,7 +345,7 @@ type DeploymentValues struct { // HTTPAddress is a string because it may be set to zero to disable. HTTPAddress serpent.String `json:"http_address,omitempty" typescript:",notnull"` AutobuildPollInterval serpent.Duration `json:"autobuild_poll_interval,omitempty"` - JobHangDetectorInterval serpent.Duration `json:"job_hang_detector_interval,omitempty"` + JobReaperDetectorInterval serpent.Duration `json:"job_hang_detector_interval,omitempty"` DERP DERP `json:"derp,omitempty" typescript:",notnull"` Prometheus PrometheusConfig `json:"prometheus,omitempty" typescript:",notnull"` Pprof PprofConfig `json:"pprof,omitempty" typescript:",notnull"` @@ -1287,13 +1287,13 @@ func (c *DeploymentValues) Options() serpent.OptionSet { Annotations: serpent.Annotations{}.Mark(annotationFormatDuration, "true"), }, { - Name: "Job Hang Detector Interval", - Description: "Interval to poll for hung jobs and automatically terminate them.", + Name: "Job Reaper Detect Interval", + Description: "Interval to poll for hung and pending jobs and automatically terminate them.", Flag: "job-hang-detector-interval", Env: "CODER_JOB_HANG_DETECTOR_INTERVAL", Hidden: true, Default: time.Minute.String(), - Value: &c.JobHangDetectorInterval, + Value: &c.JobReaperDetectorInterval, YAML: "jobHangDetectorInterval", Annotations: serpent.Annotations{}.Mark(annotationFormatDuration, "true"), }, diff --git a/codersdk/rbacresources_gen.go b/codersdk/rbacresources_gen.go index 54f65767928d6..737b7beeba29d 100644 --- a/codersdk/rbacresources_gen.go +++ b/codersdk/rbacresources_gen.go @@ -90,7 +90,7 @@ var RBACResourceActions = map[RBACResource][]RBACAction{ ResourceOrganization: {ActionCreate, ActionDelete, ActionRead, ActionUpdate}, ResourceOrganizationMember: {ActionCreate, ActionDelete, ActionRead, ActionUpdate}, ResourceProvisionerDaemon: {ActionCreate, ActionDelete, ActionRead, ActionUpdate}, - ResourceProvisionerJobs: {ActionRead}, + ResourceProvisionerJobs: {ActionRead, ActionUpdate}, ResourceReplicas: {ActionRead}, ResourceSystem: {ActionCreate, ActionDelete, ActionRead, ActionUpdate}, ResourceTailnetCoordinator: {ActionCreate, ActionDelete, ActionRead, ActionUpdate}, From 96fee5183e7848f01970c30709d56d13bf2a0e11 Mon Sep 17 00:00:00 2001 From: Michael Suchacz <203725896+ibetitsmike@users.noreply.github.com> Date: Wed, 14 May 2025 13:12:31 +0000 Subject: [PATCH 12/24] WIP --- coderd/database/dbauthz/dbauthz.go | 61 ++++++++++++++---------------- coderd/rbac/roles.go | 2 +- coderd/rbac/roles_test.go | 2 +- codersdk/rbacresources_gen.go | 2 +- 4 files changed, 31 insertions(+), 36 deletions(-) diff --git a/coderd/database/dbauthz/dbauthz.go b/coderd/database/dbauthz/dbauthz.go index 90ee7560776d0..d01d86bd35f01 100644 --- a/coderd/database/dbauthz/dbauthz.go +++ b/coderd/database/dbauthz/dbauthz.go @@ -170,10 +170,10 @@ var ( Identifier: rbac.RoleIdentifier{Name: "provisionerd"}, DisplayName: "Provisioner Daemon", Site: rbac.Permissions(map[string][]policy.Action{ - // TODO: Add ProvisionerJob resource type. - rbac.ResourceFile.Type: {policy.ActionRead}, - rbac.ResourceSystem.Type: {policy.WildcardSymbol}, - rbac.ResourceTemplate.Type: {policy.ActionRead, policy.ActionUpdate}, + rbac.ResourceProvisionerJobs.Type: {policy.ActionRead, policy.ActionUpdate}, + rbac.ResourceFile.Type: {policy.ActionRead}, + rbac.ResourceSystem.Type: {policy.WildcardSymbol}, + rbac.ResourceTemplate.Type: {policy.ActionRead, policy.ActionUpdate}, // Unsure why provisionerd needs update and read personal rbac.ResourceUser.Type: {policy.ActionRead, policy.ActionReadPersonal, policy.ActionUpdatePersonal}, rbac.ResourceWorkspaceDormant.Type: {policy.ActionDelete, policy.ActionRead, policy.ActionUpdate, policy.ActionWorkspaceStop}, @@ -1093,11 +1093,10 @@ func (q *querier) AcquireNotificationMessages(ctx context.Context, arg database. return q.db.AcquireNotificationMessages(ctx, arg) } -// TODO: We need to create a ProvisionerJob resource type func (q *querier) AcquireProvisionerJob(ctx context.Context, arg database.AcquireProvisionerJobParams) (database.ProvisionerJob, error) { - // if err := q.authorizeContext(ctx, policy.ActionUpdate, rbac.ResourceSystem); err != nil { - // return database.ProvisionerJob{}, err - // } + if err := q.authorizeContext(ctx, policy.ActionUpdate, rbac.ResourceProvisionerJobs); err != nil { + return database.ProvisionerJob{}, err + } return q.db.AcquireProvisionerJob(ctx, arg) } @@ -2322,16 +2321,17 @@ func (q *querier) GetProvisionerJobTimingsByJobID(ctx context.Context, jobID uui return q.db.GetProvisionerJobTimingsByJobID(ctx, jobID) } -// TODO: We have a ProvisionerJobs resource, but it hasn't been checked for this use-case. func (q *querier) GetProvisionerJobsByIDs(ctx context.Context, ids []uuid.UUID) ([]database.ProvisionerJob, error) { - // if err := q.authorizeContext(ctx, policy.ActionRead, rbac.ResourceSystem); err != nil { - // return nil, err - // } + if err := q.authorizeContext(ctx, policy.ActionRead, rbac.ResourceProvisionerJobs); err != nil { + return nil, err + } return q.db.GetProvisionerJobsByIDs(ctx, ids) } -// TODO: We have a ProvisionerJobs resource, but it hasn't been checked for this use-case. func (q *querier) GetProvisionerJobsByIDsWithQueuePosition(ctx context.Context, ids []uuid.UUID) ([]database.GetProvisionerJobsByIDsWithQueuePositionRow, error) { + if err := q.authorizeContext(ctx, policy.ActionRead, rbac.ResourceProvisionerJobs); err != nil { + return nil, err + } return q.db.GetProvisionerJobsByIDsWithQueuePosition(ctx, ids) } @@ -2339,11 +2339,10 @@ func (q *querier) GetProvisionerJobsByOrganizationAndStatusWithQueuePositionAndP return fetchWithPostFilter(q.auth, policy.ActionRead, q.db.GetProvisionerJobsByOrganizationAndStatusWithQueuePositionAndProvisioner)(ctx, arg) } -// TODO: We have a ProvisionerJobs resource, but it hasn't been checked for this use-case. func (q *querier) GetProvisionerJobsCreatedAfter(ctx context.Context, createdAt time.Time) ([]database.ProvisionerJob, error) { - // if err := q.authorizeContext(ctx, policy.ActionRead, rbac.ResourceSystem); err != nil { - // return nil, err - // } + if err := q.authorizeContext(ctx, policy.ActionRead, rbac.ResourceProvisionerJobs); err != nil { + return nil, err + } return q.db.GetProvisionerJobsCreatedAfter(ctx, createdAt) } @@ -3531,27 +3530,24 @@ func (q *querier) InsertPresetParameters(ctx context.Context, arg database.Inser return q.db.InsertPresetParameters(ctx, arg) } -// TODO: We need to create a ProvisionerJob resource type func (q *querier) InsertProvisionerJob(ctx context.Context, arg database.InsertProvisionerJobParams) (database.ProvisionerJob, error) { - // if err := q.authorizeContext(ctx, policy.ActionCreate, rbac.ResourceSystem); err != nil { - // return database.ProvisionerJob{}, err - // } + if err := q.authorizeContext(ctx, policy.ActionCreate, rbac.ResourceProvisionerJobs); err != nil { + return database.ProvisionerJob{}, err + } return q.db.InsertProvisionerJob(ctx, arg) } -// TODO: We need to create a ProvisionerJob resource type func (q *querier) InsertProvisionerJobLogs(ctx context.Context, arg database.InsertProvisionerJobLogsParams) ([]database.ProvisionerJobLog, error) { - // if err := q.authorizeContext(ctx, policy.ActionCreate, rbac.ResourceSystem); err != nil { - // return nil, err - // } + if err := q.authorizeContext(ctx, policy.ActionCreate, rbac.ResourceProvisionerJobs); err != nil { + return nil, err + } return q.db.InsertProvisionerJobLogs(ctx, arg) } -// TODO: We need to create a ProvisionerJob resource type func (q *querier) InsertProvisionerJobTimings(ctx context.Context, arg database.InsertProvisionerJobTimingsParams) ([]database.ProvisionerJobTiming, error) { - // if err := q.authorizeContext(ctx, policy.ActionCreate, rbac.ResourceSystem); err != nil { - // return nil, err - // } + if err := q.authorizeContext(ctx, policy.ActionCreate, rbac.ResourceProvisionerJobs); err != nil { + return nil, err + } return q.db.InsertProvisionerJobTimings(ctx, arg) } @@ -4174,11 +4170,10 @@ func (q *querier) UpdateProvisionerDaemonLastSeenAt(ctx context.Context, arg dat return q.db.UpdateProvisionerDaemonLastSeenAt(ctx, arg) } -// TODO: We need to create a ProvisionerJob resource type func (q *querier) UpdateProvisionerJobByID(ctx context.Context, arg database.UpdateProvisionerJobByIDParams) error { - // if err := q.authorizeContext(ctx, policy.ActionUpdate, rbac.ResourceSystem); err != nil { - // return err - // } + if err := q.authorizeContext(ctx, policy.ActionUpdate, rbac.ResourceProvisionerJobs); err != nil { + return err + } return q.db.UpdateProvisionerJobByID(ctx, arg) } diff --git a/coderd/rbac/roles.go b/coderd/rbac/roles.go index 56124faee44e2..0b94a74201b16 100644 --- a/coderd/rbac/roles.go +++ b/coderd/rbac/roles.go @@ -503,7 +503,7 @@ func ReloadBuiltinRoles(opts *RoleOptions) { // the ability to create templates and provisioners has // a lot of overlap. ResourceProvisionerDaemon.Type: {policy.ActionCreate, policy.ActionRead, policy.ActionUpdate, policy.ActionDelete}, - ResourceProvisionerJobs.Type: {policy.ActionRead}, + ResourceProvisionerJobs.Type: {policy.ActionRead, policy.ActionUpdate, policy.ActionCreate}, }), }, User: []Permission{}, diff --git a/coderd/rbac/roles_test.go b/coderd/rbac/roles_test.go index e90c89914fdec..5f1381caedf27 100644 --- a/coderd/rbac/roles_test.go +++ b/coderd/rbac/roles_test.go @@ -580,7 +580,7 @@ func TestRolePermissions(t *testing.T) { }, { Name: "ProvisionerJobs", - Actions: []policy.Action{policy.ActionRead}, + Actions: []policy.Action{policy.ActionRead, policy.ActionUpdate}, Resource: rbac.ResourceProvisionerJobs.InOrg(orgID), AuthorizeMap: map[bool][]hasAuthSubjects{ true: {owner, orgTemplateAdmin, orgAdmin}, diff --git a/codersdk/rbacresources_gen.go b/codersdk/rbacresources_gen.go index 737b7beeba29d..8a89ac6d5e507 100644 --- a/codersdk/rbacresources_gen.go +++ b/codersdk/rbacresources_gen.go @@ -90,7 +90,7 @@ var RBACResourceActions = map[RBACResource][]RBACAction{ ResourceOrganization: {ActionCreate, ActionDelete, ActionRead, ActionUpdate}, ResourceOrganizationMember: {ActionCreate, ActionDelete, ActionRead, ActionUpdate}, ResourceProvisionerDaemon: {ActionCreate, ActionDelete, ActionRead, ActionUpdate}, - ResourceProvisionerJobs: {ActionRead, ActionUpdate}, + ResourceProvisionerJobs: {ActionRead, ActionUpdate, ActionCreate}, ResourceReplicas: {ActionRead}, ResourceSystem: {ActionCreate, ActionDelete, ActionRead, ActionUpdate}, ResourceTailnetCoordinator: {ActionCreate, ActionDelete, ActionRead, ActionUpdate}, From d8db119c6bcf3337d7c68b1740a176c8aa951558 Mon Sep 17 00:00:00 2001 From: Michael Suchacz <203725896+ibetitsmike@users.noreply.github.com> Date: Thu, 15 May 2025 16:46:41 +0000 Subject: [PATCH 13/24] WIP --- cli/testdata/server-config.yaml.golden | 2 +- coderd/database/dbauthz/dbauthz.go | 30 ++-- coderd/database/dbmem/dbmem.go | 62 +++---- coderd/database/dbmetrics/querymetrics.go | 21 +-- coderd/database/dbmock/dbmock.go | 45 ++--- coderd/database/querier.go | 3 +- coderd/database/queries.sql.go | 181 ++++++++------------ coderd/database/queries/provisionerjobs.sql | 31 ++-- coderd/jobreaper/detector.go | 97 +++++------ coderd/jobreaper/detector_test.go | 2 +- coderd/rbac/roles.go | 1 + site/src/api/rbacresourcesGenerated.ts | 1 + 12 files changed, 194 insertions(+), 282 deletions(-) diff --git a/cli/testdata/server-config.yaml.golden b/cli/testdata/server-config.yaml.golden index fc76a6c2ec8a0..9995a7f389130 100644 --- a/cli/testdata/server-config.yaml.golden +++ b/cli/testdata/server-config.yaml.golden @@ -183,7 +183,7 @@ networking: # Interval to poll for scheduled workspace builds. # (default: 1m0s, type: duration) autobuildPollInterval: 1m0s -# Interval to poll for hung jobs and automatically terminate them. +# Interval to poll for hung and pending jobs and automatically terminate them. # (default: 1m0s, type: duration) jobHangDetectorInterval: 1m0s introspection: diff --git a/coderd/database/dbauthz/dbauthz.go b/coderd/database/dbauthz/dbauthz.go index d01d86bd35f01..31e21a0f7788e 100644 --- a/coderd/database/dbauthz/dbauthz.go +++ b/coderd/database/dbauthz/dbauthz.go @@ -170,7 +170,7 @@ var ( Identifier: rbac.RoleIdentifier{Name: "provisionerd"}, DisplayName: "Provisioner Daemon", Site: rbac.Permissions(map[string][]policy.Action{ - rbac.ResourceProvisionerJobs.Type: {policy.ActionRead, policy.ActionUpdate}, + rbac.ResourceProvisionerJobs.Type: {policy.ActionRead, policy.ActionUpdate, policy.ActionCreate}, rbac.ResourceFile.Type: {policy.ActionRead}, rbac.ResourceSystem.Type: {policy.WildcardSymbol}, rbac.ResourceTemplate.Type: {policy.ActionRead, policy.ActionUpdate}, @@ -1075,13 +1075,6 @@ func (q *querier) customRoleCheck(ctx context.Context, role database.CustomRole) return nil } -func (q *querier) GetPendingProvisionerJobs(ctx context.Context, lastUpdatedSince time.Time) ([]database.ProvisionerJob, error) { - if err := q.authorizeContext(ctx, policy.ActionRead, rbac.ResourceProvisionerJobs); err != nil { - return nil, err - } - return q.db.GetPendingProvisionerJobs(ctx, lastUpdatedSince) -} - func (q *querier) AcquireLock(ctx context.Context, id int64) error { return q.db.AcquireLock(ctx, id) } @@ -1919,13 +1912,6 @@ func (q *querier) GetHealthSettings(ctx context.Context) (string, error) { return q.db.GetHealthSettings(ctx) } -func (q *querier) GetHungProvisionerJobs(ctx context.Context, hungSince time.Time) ([]database.ProvisionerJob, error) { - if err := q.authorizeContext(ctx, policy.ActionRead, rbac.ResourceProvisionerJobs); err != nil { - return nil, err - } - return q.db.GetHungProvisionerJobs(ctx, hungSince) -} - func (q *querier) GetInboxNotificationByID(ctx context.Context, id uuid.UUID) (database.InboxNotification, error) { return fetchWithAction(q.log, q.auth, policy.ActionRead, q.db.GetInboxNotificationByID)(ctx, id) } @@ -2336,6 +2322,9 @@ func (q *querier) GetProvisionerJobsByIDsWithQueuePosition(ctx context.Context, } func (q *querier) GetProvisionerJobsByOrganizationAndStatusWithQueuePositionAndProvisioner(ctx context.Context, arg database.GetProvisionerJobsByOrganizationAndStatusWithQueuePositionAndProvisionerParams) ([]database.GetProvisionerJobsByOrganizationAndStatusWithQueuePositionAndProvisionerRow, error) { + if err := q.authorizeContext(ctx, policy.ActionRead, rbac.ResourceProvisionerJobs); err != nil { + return nil, err + } return fetchWithPostFilter(q.auth, policy.ActionRead, q.db.GetProvisionerJobsByOrganizationAndStatusWithQueuePositionAndProvisioner)(ctx, arg) } @@ -2346,6 +2335,13 @@ func (q *querier) GetProvisionerJobsCreatedAfter(ctx context.Context, createdAt return q.db.GetProvisionerJobsCreatedAfter(ctx, createdAt) } +func (q *querier) GetProvisionerJobsToBeReaped(ctx context.Context, arg database.GetProvisionerJobsToBeReapedParams) ([]database.ProvisionerJob, error) { + if err := q.authorizeContext(ctx, policy.ActionRead, rbac.ResourceProvisionerJobs); err != nil { + return nil, err + } + return q.db.GetProvisionerJobsToBeReaped(ctx, arg) +} + func (q *querier) GetProvisionerKeyByHashedSecret(ctx context.Context, hashedSecret []byte) (database.ProvisionerKey, error) { return fetch(q.log, q.auth, q.db.GetProvisionerKeyByHashedSecret)(ctx, hashedSecret) } @@ -3538,14 +3534,14 @@ func (q *querier) InsertProvisionerJob(ctx context.Context, arg database.InsertP } func (q *querier) InsertProvisionerJobLogs(ctx context.Context, arg database.InsertProvisionerJobLogsParams) ([]database.ProvisionerJobLog, error) { - if err := q.authorizeContext(ctx, policy.ActionCreate, rbac.ResourceProvisionerJobs); err != nil { + if err := q.authorizeContext(ctx, policy.ActionUpdate, rbac.ResourceProvisionerJobs); err != nil { return nil, err } return q.db.InsertProvisionerJobLogs(ctx, arg) } func (q *querier) InsertProvisionerJobTimings(ctx context.Context, arg database.InsertProvisionerJobTimingsParams) ([]database.ProvisionerJobTiming, error) { - if err := q.authorizeContext(ctx, policy.ActionCreate, rbac.ResourceProvisionerJobs); err != nil { + if err := q.authorizeContext(ctx, policy.ActionUpdate, rbac.ResourceProvisionerJobs); err != nil { return nil, err } return q.db.InsertProvisionerJobTimings(ctx, arg) diff --git a/coderd/database/dbmem/dbmem.go b/coderd/database/dbmem/dbmem.go index ffc862d221c7d..33eeaf5f5f936 100644 --- a/coderd/database/dbmem/dbmem.go +++ b/coderd/database/dbmem/dbmem.go @@ -1386,10 +1386,6 @@ func isDeprecated(template database.Template) bool { return template.Deprecated != "" } -func (q *FakeQuerier) GetProvisionerJobsToBeReaped(ctx context.Context, updatedAt time.Time) ([]database.ProvisionerJob, error) { - panic("not implemented") -} - func (*FakeQuerier) AcquireLock(_ context.Context, _ int64) error { return xerrors.New("AcquireLock must only be called within a transaction") } @@ -3711,23 +3707,6 @@ func (q *FakeQuerier) GetHealthSettings(_ context.Context) (string, error) { return string(q.healthSettings), nil } -func (q *FakeQuerier) GetHungProvisionerJobs(_ context.Context, hungSince time.Time) ([]database.ProvisionerJob, error) { - q.mutex.RLock() - defer q.mutex.RUnlock() - - hungJobs := []database.ProvisionerJob{} - for _, provisionerJob := range q.provisionerJobs { - if provisionerJob.StartedAt.Valid && !provisionerJob.CompletedAt.Valid && provisionerJob.UpdatedAt.Before(hungSince) { - // clone the Tags before appending, since maps are reference types and - // we don't want the caller to be able to mutate the map we have inside - // dbmem! - provisionerJob.Tags = maps.Clone(provisionerJob.Tags) - hungJobs = append(hungJobs, provisionerJob) - } - } - return hungJobs, nil -} - func (q *FakeQuerier) GetInboxNotificationByID(_ context.Context, id uuid.UUID) (database.InboxNotification, error) { q.mutex.RLock() defer q.mutex.RUnlock() @@ -4278,23 +4257,6 @@ func (q *FakeQuerier) GetParameterSchemasByJobID(_ context.Context, jobID uuid.U return parameters, nil } -func (q *FakeQuerier) GetPendingProvisionerJobs(_ context.Context, lastUpdatedSince time.Time) ([]database.ProvisionerJob, error) { - q.mutex.RLock() - defer q.mutex.RUnlock() - - pendingJobs := []database.ProvisionerJob{} - for _, provisionerJob := range q.provisionerJobs { - if !provisionerJob.StartedAt.Valid && !provisionerJob.CompletedAt.Valid && provisionerJob.UpdatedAt.Before(lastUpdatedSince) { - // clone the Tags before appending, since maps are reference types and - // we don't want the caller to be able to mutate the map we have inside - // dbmem! - provisionerJob.Tags = maps.Clone(provisionerJob.Tags) - pendingJobs = append(pendingJobs, provisionerJob) - } - } - return pendingJobs, nil -} - func (*FakeQuerier) GetPrebuildMetrics(_ context.Context) ([]database.GetPrebuildMetricsRow, error) { return nil, ErrUnimplemented } @@ -4898,6 +4860,30 @@ func (q *FakeQuerier) GetProvisionerJobsCreatedAfter(_ context.Context, after ti return jobs, nil } +func (q *FakeQuerier) GetProvisionerJobsToBeReaped(_ context.Context, arg database.GetProvisionerJobsToBeReapedParams) ([]database.ProvisionerJob, error) { + q.mutex.RLock() + defer q.mutex.RUnlock() + maxJobs := arg.MaxJobs + + hungJobs := []database.ProvisionerJob{} + for _, provisionerJob := range q.provisionerJobs { + if !provisionerJob.CompletedAt.Valid { + if (provisionerJob.StartedAt.Valid && provisionerJob.UpdatedAt.Before(arg.HungSince)) || + (!provisionerJob.StartedAt.Valid && provisionerJob.UpdatedAt.Before(arg.PendingSince)) { + // clone the Tags before appending, since maps are reference types and + // we don't want the caller to be able to mutate the map we have inside + // dbmem! + provisionerJob.Tags = maps.Clone(provisionerJob.Tags) + hungJobs = append(hungJobs, provisionerJob) + if len(hungJobs) >= int(maxJobs) { + return hungJobs, nil + } + } + } + } + return hungJobs, nil +} + func (q *FakeQuerier) GetProvisionerKeyByHashedSecret(_ context.Context, hashedSecret []byte) (database.ProvisionerKey, error) { q.mutex.RLock() defer q.mutex.RUnlock() diff --git a/coderd/database/dbmetrics/querymetrics.go b/coderd/database/dbmetrics/querymetrics.go index 7e31679d88a50..4be271533a97c 100644 --- a/coderd/database/dbmetrics/querymetrics.go +++ b/coderd/database/dbmetrics/querymetrics.go @@ -865,13 +865,6 @@ func (m queryMetricsStore) GetHealthSettings(ctx context.Context) (string, error return r0, r1 } -func (m queryMetricsStore) GetHungProvisionerJobs(ctx context.Context, hungSince time.Time) ([]database.ProvisionerJob, error) { - start := time.Now() - jobs, err := m.s.GetHungProvisionerJobs(ctx, hungSince) - m.queryLatencies.WithLabelValues("GetHungProvisionerJobs").Observe(time.Since(start).Seconds()) - return jobs, err -} - func (m queryMetricsStore) GetInboxNotificationByID(ctx context.Context, id uuid.UUID) (database.InboxNotification, error) { start := time.Now() r0, r1 := m.s.GetInboxNotificationByID(ctx, id) @@ -1110,13 +1103,6 @@ func (m queryMetricsStore) GetParameterSchemasByJobID(ctx context.Context, jobID return schemas, err } -func (m queryMetricsStore) GetPendingProvisionerJobs(ctx context.Context, updatedAt time.Time) ([]database.ProvisionerJob, error) { - start := time.Now() - r0, r1 := m.s.GetPendingProvisionerJobs(ctx, updatedAt) - m.queryLatencies.WithLabelValues("GetPendingProvisionerJobs").Observe(time.Since(start).Seconds()) - return r0, r1 -} - func (m queryMetricsStore) GetPrebuildMetrics(ctx context.Context) ([]database.GetPrebuildMetricsRow, error) { start := time.Now() r0, r1 := m.s.GetPrebuildMetrics(ctx) @@ -1236,6 +1222,13 @@ func (m queryMetricsStore) GetProvisionerJobsCreatedAfter(ctx context.Context, c return jobs, err } +func (m queryMetricsStore) GetProvisionerJobsToBeReaped(ctx context.Context, arg database.GetProvisionerJobsToBeReapedParams) ([]database.ProvisionerJob, error) { + start := time.Now() + r0, r1 := m.s.GetProvisionerJobsToBeReaped(ctx, arg) + m.queryLatencies.WithLabelValues("GetProvisionerJobsToBeReaped").Observe(time.Since(start).Seconds()) + return r0, r1 +} + func (m queryMetricsStore) GetProvisionerKeyByHashedSecret(ctx context.Context, hashedSecret []byte) (database.ProvisionerKey, error) { start := time.Now() r0, r1 := m.s.GetProvisionerKeyByHashedSecret(ctx, hashedSecret) diff --git a/coderd/database/dbmock/dbmock.go b/coderd/database/dbmock/dbmock.go index 38c0f7b35f3e9..5f0024aa22c42 100644 --- a/coderd/database/dbmock/dbmock.go +++ b/coderd/database/dbmock/dbmock.go @@ -1743,21 +1743,6 @@ func (mr *MockStoreMockRecorder) GetHealthSettings(ctx any) *gomock.Call { return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetHealthSettings", reflect.TypeOf((*MockStore)(nil).GetHealthSettings), ctx) } -// GetHungProvisionerJobs mocks base method. -func (m *MockStore) GetHungProvisionerJobs(ctx context.Context, updatedAt time.Time) ([]database.ProvisionerJob, error) { - m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "GetHungProvisionerJobs", ctx, updatedAt) - ret0, _ := ret[0].([]database.ProvisionerJob) - ret1, _ := ret[1].(error) - return ret0, ret1 -} - -// GetHungProvisionerJobs indicates an expected call of GetHungProvisionerJobs. -func (mr *MockStoreMockRecorder) GetHungProvisionerJobs(ctx, updatedAt any) *gomock.Call { - mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetHungProvisionerJobs", reflect.TypeOf((*MockStore)(nil).GetHungProvisionerJobs), ctx, updatedAt) -} - // GetInboxNotificationByID mocks base method. func (m *MockStore) GetInboxNotificationByID(ctx context.Context, id uuid.UUID) (database.InboxNotification, error) { m.ctrl.T.Helper() @@ -2268,21 +2253,6 @@ func (mr *MockStoreMockRecorder) GetParameterSchemasByJobID(ctx, jobID any) *gom return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetParameterSchemasByJobID", reflect.TypeOf((*MockStore)(nil).GetParameterSchemasByJobID), ctx, jobID) } -// GetPendingProvisionerJobs mocks base method. -func (m *MockStore) GetPendingProvisionerJobs(ctx context.Context, updatedAt time.Time) ([]database.ProvisionerJob, error) { - m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "GetPendingProvisionerJobs", ctx, updatedAt) - ret0, _ := ret[0].([]database.ProvisionerJob) - ret1, _ := ret[1].(error) - return ret0, ret1 -} - -// GetPendingProvisionerJobs indicates an expected call of GetPendingProvisionerJobs. -func (mr *MockStoreMockRecorder) GetPendingProvisionerJobs(ctx, updatedAt any) *gomock.Call { - mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetPendingProvisionerJobs", reflect.TypeOf((*MockStore)(nil).GetPendingProvisionerJobs), ctx, updatedAt) -} - // GetPrebuildMetrics mocks base method. func (m *MockStore) GetPrebuildMetrics(ctx context.Context) ([]database.GetPrebuildMetricsRow, error) { m.ctrl.T.Helper() @@ -2538,6 +2508,21 @@ func (mr *MockStoreMockRecorder) GetProvisionerJobsCreatedAfter(ctx, createdAt a return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetProvisionerJobsCreatedAfter", reflect.TypeOf((*MockStore)(nil).GetProvisionerJobsCreatedAfter), ctx, createdAt) } +// GetProvisionerJobsToBeReaped mocks base method. +func (m *MockStore) GetProvisionerJobsToBeReaped(ctx context.Context, arg database.GetProvisionerJobsToBeReapedParams) ([]database.ProvisionerJob, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "GetProvisionerJobsToBeReaped", ctx, arg) + ret0, _ := ret[0].([]database.ProvisionerJob) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// GetProvisionerJobsToBeReaped indicates an expected call of GetProvisionerJobsToBeReaped. +func (mr *MockStoreMockRecorder) GetProvisionerJobsToBeReaped(ctx, arg any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetProvisionerJobsToBeReaped", reflect.TypeOf((*MockStore)(nil).GetProvisionerJobsToBeReaped), ctx, arg) +} + // GetProvisionerKeyByHashedSecret mocks base method. func (m *MockStore) GetProvisionerKeyByHashedSecret(ctx context.Context, hashedSecret []byte) (database.ProvisionerKey, error) { m.ctrl.T.Helper() diff --git a/coderd/database/querier.go b/coderd/database/querier.go index 52a7ff22daa98..af85e9d0d9c22 100644 --- a/coderd/database/querier.go +++ b/coderd/database/querier.go @@ -196,7 +196,6 @@ type sqlcQuerier interface { GetGroupMembersCountByGroupID(ctx context.Context, arg GetGroupMembersCountByGroupIDParams) (int64, error) GetGroups(ctx context.Context, arg GetGroupsParams) ([]GetGroupsRow, error) GetHealthSettings(ctx context.Context) (string, error) - GetHungProvisionerJobs(ctx context.Context, updatedAt time.Time) ([]ProvisionerJob, error) GetInboxNotificationByID(ctx context.Context, id uuid.UUID) (InboxNotification, error) // Fetches inbox notifications for a user filtered by templates and targets // param user_id: The user ID @@ -237,7 +236,6 @@ type sqlcQuerier interface { GetOrganizations(ctx context.Context, arg GetOrganizationsParams) ([]Organization, error) GetOrganizationsByUserID(ctx context.Context, arg GetOrganizationsByUserIDParams) ([]Organization, error) GetParameterSchemasByJobID(ctx context.Context, jobID uuid.UUID) ([]ParameterSchema, error) - GetPendingProvisionerJobs(ctx context.Context, updatedAt time.Time) ([]ProvisionerJob, error) GetPrebuildMetrics(ctx context.Context) ([]GetPrebuildMetricsRow, error) GetPresetByID(ctx context.Context, presetID uuid.UUID) (GetPresetByIDRow, error) GetPresetByWorkspaceBuildID(ctx context.Context, workspaceBuildID uuid.UUID) (TemplateVersionPreset, error) @@ -271,6 +269,7 @@ type sqlcQuerier interface { GetProvisionerJobsByIDsWithQueuePosition(ctx context.Context, ids []uuid.UUID) ([]GetProvisionerJobsByIDsWithQueuePositionRow, error) GetProvisionerJobsByOrganizationAndStatusWithQueuePositionAndProvisioner(ctx context.Context, arg GetProvisionerJobsByOrganizationAndStatusWithQueuePositionAndProvisionerParams) ([]GetProvisionerJobsByOrganizationAndStatusWithQueuePositionAndProvisionerRow, error) GetProvisionerJobsCreatedAfter(ctx context.Context, createdAt time.Time) ([]ProvisionerJob, error) + GetProvisionerJobsToBeReaped(ctx context.Context, arg GetProvisionerJobsToBeReapedParams) ([]ProvisionerJob, error) GetProvisionerKeyByHashedSecret(ctx context.Context, hashedSecret []byte) (ProvisionerKey, error) GetProvisionerKeyByID(ctx context.Context, id uuid.UUID) (ProvisionerKey, error) GetProvisionerKeyByName(ctx context.Context, arg GetProvisionerKeyByNameParams) (ProvisionerKey, error) diff --git a/coderd/database/queries.sql.go b/coderd/database/queries.sql.go index dce0903e84b83..10727fa75dec6 100644 --- a/coderd/database/queries.sql.go +++ b/coderd/database/queries.sql.go @@ -7376,114 +7376,6 @@ func (q *sqlQuerier) AcquireProvisionerJob(ctx context.Context, arg AcquireProvi return i, err } -const getHungProvisionerJobs = `-- name: GetHungProvisionerJobs :many -SELECT - id, created_at, updated_at, started_at, canceled_at, completed_at, error, organization_id, initiator_id, provisioner, storage_method, type, input, worker_id, file_id, tags, error_code, trace_metadata, job_status -FROM - provisioner_jobs -WHERE - updated_at < $1 - AND started_at IS NOT NULL - AND completed_at IS NULL -` - -func (q *sqlQuerier) GetHungProvisionerJobs(ctx context.Context, updatedAt time.Time) ([]ProvisionerJob, error) { - rows, err := q.db.QueryContext(ctx, getHungProvisionerJobs, updatedAt) - if err != nil { - return nil, err - } - defer rows.Close() - var items []ProvisionerJob - for rows.Next() { - var i ProvisionerJob - if err := rows.Scan( - &i.ID, - &i.CreatedAt, - &i.UpdatedAt, - &i.StartedAt, - &i.CanceledAt, - &i.CompletedAt, - &i.Error, - &i.OrganizationID, - &i.InitiatorID, - &i.Provisioner, - &i.StorageMethod, - &i.Type, - &i.Input, - &i.WorkerID, - &i.FileID, - &i.Tags, - &i.ErrorCode, - &i.TraceMetadata, - &i.JobStatus, - ); err != nil { - return nil, err - } - items = append(items, i) - } - if err := rows.Close(); err != nil { - return nil, err - } - if err := rows.Err(); err != nil { - return nil, err - } - return items, nil -} - -const getPendingProvisionerJobs = `-- name: GetPendingProvisionerJobs :many -SELECT - id, created_at, updated_at, started_at, canceled_at, completed_at, error, organization_id, initiator_id, provisioner, storage_method, type, input, worker_id, file_id, tags, error_code, trace_metadata, job_status -FROM - provisioner_jobs -WHERE - updated_at < $1 - AND started_at IS NULL - AND completed_at IS NULL -` - -func (q *sqlQuerier) GetPendingProvisionerJobs(ctx context.Context, updatedAt time.Time) ([]ProvisionerJob, error) { - rows, err := q.db.QueryContext(ctx, getPendingProvisionerJobs, updatedAt) - if err != nil { - return nil, err - } - defer rows.Close() - var items []ProvisionerJob - for rows.Next() { - var i ProvisionerJob - if err := rows.Scan( - &i.ID, - &i.CreatedAt, - &i.UpdatedAt, - &i.StartedAt, - &i.CanceledAt, - &i.CompletedAt, - &i.Error, - &i.OrganizationID, - &i.InitiatorID, - &i.Provisioner, - &i.StorageMethod, - &i.Type, - &i.Input, - &i.WorkerID, - &i.FileID, - &i.Tags, - &i.ErrorCode, - &i.TraceMetadata, - &i.JobStatus, - ); err != nil { - return nil, err - } - items = append(items, i) - } - if err := rows.Close(); err != nil { - return nil, err - } - if err := rows.Err(); err != nil { - return nil, err - } - return items, nil -} - const getProvisionerJobByID = `-- name: GetProvisionerJobByID :one SELECT id, created_at, updated_at, started_at, canceled_at, completed_at, error, organization_id, initiator_id, provisioner, storage_method, type, input, worker_id, file_id, tags, error_code, trace_metadata, job_status @@ -7951,6 +7843,79 @@ func (q *sqlQuerier) GetProvisionerJobsCreatedAfter(ctx context.Context, created return items, nil } +const getProvisionerJobsToBeReaped = `-- name: GetProvisionerJobsToBeReaped :many +SELECT + id, created_at, updated_at, started_at, canceled_at, completed_at, error, organization_id, initiator_id, provisioner, storage_method, type, input, worker_id, file_id, tags, error_code, trace_metadata, job_status +FROM + provisioner_jobs +WHERE + ( + -- If the job has not been started within $1, reap it. + updated_at < $1 + AND started_at IS NULL + AND completed_at IS NULL + ) + OR + ( + -- If the job has been started but not completed within $2, reap it. + updated_at < $2 + AND started_at IS NOT NULL + AND completed_at IS NULL + ) +ORDER BY random() +LIMIT $3 +FOR UPDATE SKIP LOCKED +` + +type GetProvisionerJobsToBeReapedParams struct { + PendingSince time.Time `db:"pending_since" json:"pending_since"` + HungSince time.Time `db:"hung_since" json:"hung_since"` + MaxJobs int32 `db:"max_jobs" json:"max_jobs"` +} + +func (q *sqlQuerier) GetProvisionerJobsToBeReaped(ctx context.Context, arg GetProvisionerJobsToBeReapedParams) ([]ProvisionerJob, error) { + rows, err := q.db.QueryContext(ctx, getProvisionerJobsToBeReaped, arg.PendingSince, arg.HungSince, arg.MaxJobs) + if err != nil { + return nil, err + } + defer rows.Close() + var items []ProvisionerJob + for rows.Next() { + var i ProvisionerJob + if err := rows.Scan( + &i.ID, + &i.CreatedAt, + &i.UpdatedAt, + &i.StartedAt, + &i.CanceledAt, + &i.CompletedAt, + &i.Error, + &i.OrganizationID, + &i.InitiatorID, + &i.Provisioner, + &i.StorageMethod, + &i.Type, + &i.Input, + &i.WorkerID, + &i.FileID, + &i.Tags, + &i.ErrorCode, + &i.TraceMetadata, + &i.JobStatus, + ); err != nil { + return nil, err + } + items = append(items, i) + } + if err := rows.Close(); err != nil { + return nil, err + } + if err := rows.Err(); err != nil { + return nil, err + } + return items, nil +} + const insertProvisionerJob = `-- name: InsertProvisionerJob :one INSERT INTO provisioner_jobs ( diff --git a/coderd/database/queries/provisionerjobs.sql b/coderd/database/queries/provisionerjobs.sql index 02654eb4ddc4e..b455babf95be6 100644 --- a/coderd/database/queries/provisionerjobs.sql +++ b/coderd/database/queries/provisionerjobs.sql @@ -268,25 +268,28 @@ SET WHERE id = $1; --- name: GetHungProvisionerJobs :many +-- name: GetProvisionerJobsToBeReaped :many SELECT * FROM provisioner_jobs WHERE - updated_at < $1 - AND started_at IS NOT NULL - AND completed_at IS NULL; - --- name: GetPendingProvisionerJobs :many -SELECT - * -FROM - provisioner_jobs -WHERE - updated_at < $1 - AND started_at IS NULL - AND completed_at IS NULL; + ( + -- If the job has not been started within $1, reap it. + updated_at < @pending_since + AND started_at IS NULL + AND completed_at IS NULL + ) + OR + ( + -- If the job has been started but not completed within $2, reap it. + updated_at < @hung_since + AND started_at IS NOT NULL + AND completed_at IS NULL + ) +ORDER BY random() +LIMIT @max_jobs +FOR UPDATE SKIP LOCKED; -- name: InsertProvisionerJobTimings :many INSERT INTO provisioner_job_timings (job_id, started_at, ended_at, stage, source, action, resource) diff --git a/coderd/jobreaper/detector.go b/coderd/jobreaper/detector.go index 3ff1f6f6ddf86..762a04c819230 100644 --- a/coderd/jobreaper/detector.go +++ b/coderd/jobreaper/detector.go @@ -4,8 +4,7 @@ import ( "context" "database/sql" "encoding/json" - "fmt" - "math/rand" //#nosec // this is only used for shuffling an array to pick random jobs to unhang + "fmt" //#nosec // this is only used for shuffling an array to pick random jobs to unhang "time" "golang.org/x/xerrors" @@ -25,9 +24,9 @@ const ( // before it is considered hung. HungJobDuration = 5 * time.Minute - // PendingJobTimeElapsed is the duration of time since the last update to a job + // PendingJobDuration is the duration of time since the last update to a job // before it is considered hung. - PendingJobTimeElapsed = 30 * time.Minute + PendingJobDuration = 30 * time.Minute // HungJobExitTimeout is the duration of time that provisioners should allow // for a graceful exit upon cancellation due to failing to send an update to @@ -43,11 +42,11 @@ const ( ) // jobLogMessages are written to provisioner job logs when a job is reaped -func jobLogMessages(jobType database.ProvisionerJobStatus, threshold time.Duration) []string { +func jobLogMessages(reapType ReapType, threshold time.Duration) []string { return []string{ "", "====================", - fmt.Sprintf("Coder: Build has been detected as %s for %.0f minutes and will be terminated.", jobType, threshold.Minutes()), + fmt.Sprintf("Coder: Build has been detected as %s for %.0f minutes and will be terminated.", reapType, threshold.Minutes()), "====================", "", } @@ -56,9 +55,16 @@ func jobLogMessages(jobType database.ProvisionerJobStatus, threshold time.Durati type jobToReap struct { ID uuid.UUID Threshold time.Duration - Status database.ProvisionerJobStatus + Type ReapType } +type ReapType string + +const ( + Pending ReapType = "pending" + Hung ReapType = "hung" +) + // acquireLockError is returned when the detector fails to acquire a lock and // cancels the current run. type acquireLockError struct{} @@ -183,44 +189,31 @@ func (d *Detector) run(t time.Time) Stats { Error: nil, } - jobsToReap := make([]*jobToReap, 0, MaxJobsPerRun) - - // Find all provisioner jobs that are currently running but have not - // received an update in the last 5 minutes. - hungJobs, err := d.db.GetHungProvisionerJobs(ctx, t.Add(-HungJobDuration)) + // Find all provisioner jobs to be reaped + jobs, err := d.db.GetProvisionerJobsToBeReaped(ctx, database.GetProvisionerJobsToBeReapedParams{ + PendingSince: t.Add(-PendingJobDuration), + HungSince: t.Add(-HungJobDuration), + MaxJobs: MaxJobsPerRun, + }) if err != nil { - stats.Error = xerrors.Errorf("get hung provisioner jobs: %w", err) + stats.Error = xerrors.Errorf("get provisioner jobs to be reaped: %w", err) return stats } - for _, job := range hungJobs { - jobsToReap = append(jobsToReap, &jobToReap{ - ID: job.ID, - Threshold: HungJobDuration, - }) - } - // Find all provisioner jobs that have not been started yet and have not - // received an update in the last 30 minutes. - jobsPending, err := d.db.GetPendingProvisionerJobs(ctx, t.Add(-PendingJobTimeElapsed)) - if err != nil { - stats.Error = xerrors.Errorf("get pending provisioner jobs: %w", err) - return stats - } - for _, job := range jobsPending { - jobsToReap = append(jobsToReap, &jobToReap{ - ID: job.ID, - Threshold: PendingJobTimeElapsed, - }) - } + jobsToReap := make([]*jobToReap, 0, len(jobs)) - // Limit the number of jobs we'll reap in a single run to avoid - // timing out. - if len(jobsToReap) > MaxJobsPerRun { - // Pick a random subset of the jobs to reap. - rand.Shuffle(len(jobsToReap), func(i, j int) { - jobsToReap[i], jobsToReap[j] = jobsToReap[j], jobsToReap[i] - }) - jobsToReap = jobsToReap[:MaxJobsPerRun] + for _, job := range jobs { + j := &jobToReap{ + ID: job.ID, + } + if job.JobStatus == database.ProvisionerJobStatusPending { + j.Threshold = PendingJobDuration + j.Type = Pending + } else { + j.Threshold = HungJobDuration + j.Type = Hung + } + jobsToReap = append(jobsToReap, j) } // Send a message into the build log for each hung or pending job saying that it @@ -231,7 +224,7 @@ func (d *Detector) run(t time.Time) Stats { err := reapJob(ctx, log, d.db, d.pubsub, job) if err != nil { if !(xerrors.As(err, &acquireLockError{}) || xerrors.As(err, &jobIneligibleError{})) { - log.Error(ctx, fmt.Sprintf("error forcefully terminating %s provisioner job", job.Status), slog.Error(err)) + log.Error(ctx, fmt.Sprintf("error forcefully terminating %s provisioner job", job.Type), slog.Error(err)) } continue } @@ -261,22 +254,20 @@ func reapJob(ctx context.Context, log slog.Logger, db database.Store, pub pubsub return xerrors.Errorf("get provisioner job: %w", err) } - threshold := reapParamsFromJob(job) - if job.CompletedAt.Valid { return jobIneligibleError{ Err: xerrors.Errorf("job is completed (status %s)", job.JobStatus), } } - if job.UpdatedAt.After(time.Now().Add(-threshold)) { + if job.UpdatedAt.After(time.Now().Add(-jobToReap.Threshold)) { return jobIneligibleError{ Err: xerrors.New("job has been updated recently"), } } log.Warn( - ctx, fmt.Sprintf("detected %s provisioner job, forcefully terminating", job.JobStatus), - "threshold", threshold, + ctx, fmt.Sprintf("detected %s provisioner job, forcefully terminating", jobToReap.Type), + "threshold", jobToReap.Threshold, ) // First, get the latest logs from the build so we can make sure @@ -286,7 +277,7 @@ func reapJob(ctx context.Context, log slog.Logger, db database.Store, pub pubsub CreatedAfter: 0, }) if err != nil { - return xerrors.Errorf("get logs for %s job: %w", job.JobStatus, err) + return xerrors.Errorf("get logs for %s job: %w", jobToReap.Type, err) } logStage := "" if len(logs) != 0 { @@ -306,7 +297,7 @@ func reapJob(ctx context.Context, log slog.Logger, db database.Store, pub pubsub Output: nil, } now := dbtime.Now() - for i, msg := range jobLogMessages(job.JobStatus, threshold) { + for i, msg := range jobLogMessages(jobToReap.Type, jobToReap.Threshold) { // Set the created at in a way that ensures each message has // a unique timestamp so they will be sorted correctly. insertParams.CreatedAt = append(insertParams.CreatedAt, now.Add(time.Millisecond*time.Duration(i))) @@ -340,7 +331,7 @@ func reapJob(ctx context.Context, log slog.Logger, db database.Store, pub pubsub Valid: true, }, Error: sql.NullString{ - String: fmt.Sprintf("Coder: Build has been detected as %s for %.0f minutes and has been terminated by hang detector.", job.JobStatus, threshold.Minutes()), + String: fmt.Sprintf("Coder: Build has been detected as %s for %.0f minutes and has been terminated by hang detector.", jobToReap.Type, jobToReap.Threshold.Minutes()), Valid: true, }, ErrorCode: sql.NullString{ @@ -407,11 +398,3 @@ func reapJob(ctx context.Context, log slog.Logger, db database.Store, pub pubsub return nil } - -func reapParamsFromJob(job database.ProvisionerJob) time.Duration { - threshold := HungJobDuration - if job.JobStatus == database.ProvisionerJobStatusPending { - threshold = PendingJobTimeElapsed - } - return threshold -} diff --git a/coderd/jobreaper/detector_test.go b/coderd/jobreaper/detector_test.go index ada9a9853f032..5d2a57b0fb47c 100644 --- a/coderd/jobreaper/detector_test.go +++ b/coderd/jobreaper/detector_test.go @@ -52,7 +52,7 @@ func reapParamsFromJob(job database.ProvisionerJob) (jobType, float64) { threshold := jobreaper.HungJobDuration.Minutes() if !job.StartedAt.Valid { jobType = pendingJobType - threshold = jobreaper.PendingJobTimeElapsed.Minutes() + threshold = jobreaper.PendingJobDuration.Minutes() } return jobType, threshold } diff --git a/coderd/rbac/roles.go b/coderd/rbac/roles.go index 0b94a74201b16..b9bcff5d42bdf 100644 --- a/coderd/rbac/roles.go +++ b/coderd/rbac/roles.go @@ -273,6 +273,7 @@ func ReloadBuiltinRoles(opts *RoleOptions) { Permissions(map[string][]policy.Action{ ResourceWorkspace.Type: ownerWorkspaceActions, ResourceWorkspaceDormant.Type: {policy.ActionRead, policy.ActionDelete, policy.ActionCreate, policy.ActionUpdate, policy.ActionWorkspaceStop}, + ResourceProvisionerJobs.Type: {policy.ActionRead, policy.ActionUpdate, policy.ActionCreate}, })...), Org: map[string][]Permission{}, User: []Permission{}, diff --git a/site/src/api/rbacresourcesGenerated.ts b/site/src/api/rbacresourcesGenerated.ts index 079dcb4a87a61..7016b78d876e0 100644 --- a/site/src/api/rbacresourcesGenerated.ts +++ b/site/src/api/rbacresourcesGenerated.ts @@ -131,6 +131,7 @@ export const RBACResourceActions: Partial< }, provisioner_jobs: { read: "read provisioner jobs", + update: "update provisioner jobs", }, replicas: { read: "read replicas", From 5120fb1d66852d2e0c32a0c893b648e3dc4aab1a Mon Sep 17 00:00:00 2001 From: Michael Suchacz <203725896+ibetitsmike@users.noreply.github.com> Date: Thu, 15 May 2025 19:19:04 +0000 Subject: [PATCH 14/24] WIP --- coderd/database/dbauthz/dbauthz.go | 1 + coderd/database/dbauthz/dbauthz_test.go | 27 +++++++++++-------------- coderd/rbac/object_gen.go | 1 + coderd/rbac/policy/policy.go | 1 + coderd/rbac/roles_test.go | 2 +- codersdk/rbacresources_gen.go | 2 +- 6 files changed, 17 insertions(+), 17 deletions(-) diff --git a/coderd/database/dbauthz/dbauthz.go b/coderd/database/dbauthz/dbauthz.go index 31e21a0f7788e..24adf526f7010 100644 --- a/coderd/database/dbauthz/dbauthz.go +++ b/coderd/database/dbauthz/dbauthz.go @@ -347,6 +347,7 @@ var ( rbac.ResourceNotificationTemplate.Type: {policy.ActionCreate, policy.ActionUpdate, policy.ActionDelete}, rbac.ResourceCryptoKey.Type: {policy.ActionCreate, policy.ActionUpdate, policy.ActionDelete}, rbac.ResourceFile.Type: {policy.ActionCreate, policy.ActionRead}, + rbac.ResourceProvisionerJobs.Type: {policy.ActionRead, policy.ActionUpdate, policy.ActionCreate}, }), Org: map[string][]rbac.Permission{}, User: []rbac.Permission{}, diff --git a/coderd/database/dbauthz/dbauthz_test.go b/coderd/database/dbauthz/dbauthz_test.go index 0bc116d63987d..517ef486c54a7 100644 --- a/coderd/database/dbauthz/dbauthz_test.go +++ b/coderd/database/dbauthz/dbauthz_test.go @@ -3892,7 +3892,7 @@ func (s *MethodTestSuite) TestSystemFunctions() { })) s.Run("GetProvisionerJobsCreatedAfter", s.Subtest(func(db database.Store, check *expects) { _ = dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{CreatedAt: time.Now().Add(-time.Hour)}) - check.Args(time.Now()).Asserts(rbac.ResourceSystem, policy.ActionRead) + check.Args(time.Now()).Asserts(rbac.ResourceProvisionerJobs, policy.ActionRead) })) s.Run("GetTemplateVersionsByIDs", s.Subtest(func(db database.Store, check *expects) { dbtestutil.DisableForeignKeysAndTriggers(s.T(), db) @@ -3978,7 +3978,7 @@ func (s *MethodTestSuite) TestSystemFunctions() { a := dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{}) b := dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{}) check.Args([]uuid.UUID{a.ID, b.ID}). - Asserts(rbac.ResourceSystem, policy.ActionRead). + Asserts(rbac.ResourceProvisionerJobs, policy.ActionRead). Returns(slice.New(a, b)) })) s.Run("InsertWorkspaceAgent", s.Subtest(func(db database.Store, check *expects) { @@ -4022,26 +4022,26 @@ func (s *MethodTestSuite) TestSystemFunctions() { OrganizationID: j.OrganizationID, Types: []database.ProvisionerType{j.Provisioner}, ProvisionerTags: must(json.Marshal(j.Tags)), - }).Asserts(rbac.ResourceSystem, policy.ActionUpdate) + }).Asserts(rbac.ResourceProvisionerJobs, policy.ActionUpdate) })) s.Run("UpdateProvisionerJobWithCompleteByID", s.Subtest(func(db database.Store, check *expects) { j := dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{}) check.Args(database.UpdateProvisionerJobWithCompleteByIDParams{ ID: j.ID, - }).Asserts(rbac.ResourceSystem, policy.ActionUpdate) + }).Asserts(rbac.ResourceProvisionerJobs, policy.ActionUpdate) })) s.Run("UpdateProvisionerJobWithCompleteWithStartedAtByID", s.Subtest(func(db database.Store, check *expects) { j := dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{}) check.Args(database.UpdateProvisionerJobWithCompleteWithStartedAtByIDParams{ ID: j.ID, - }).Asserts(rbac.ResourceSystem, policy.ActionUpdate) + }).Asserts(rbac.ResourceProvisionerJobs, policy.ActionUpdate) })) s.Run("UpdateProvisionerJobByID", s.Subtest(func(db database.Store, check *expects) { j := dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{}) check.Args(database.UpdateProvisionerJobByIDParams{ ID: j.ID, UpdatedAt: time.Now(), - }).Asserts(rbac.ResourceSystem, policy.ActionUpdate) + }).Asserts(rbac.ResourceProvisionerJobs, policy.ActionUpdate) })) s.Run("InsertProvisionerJob", s.Subtest(func(db database.Store, check *expects) { dbtestutil.DisableForeignKeysAndTriggers(s.T(), db) @@ -4051,19 +4051,19 @@ func (s *MethodTestSuite) TestSystemFunctions() { StorageMethod: database.ProvisionerStorageMethodFile, Type: database.ProvisionerJobTypeWorkspaceBuild, Input: json.RawMessage("{}"), - }).Asserts(rbac.ResourceSystem, policy.ActionCreate) + }).Asserts(rbac.ResourceProvisionerJobs, policy.ActionCreate) })) s.Run("InsertProvisionerJobLogs", s.Subtest(func(db database.Store, check *expects) { j := dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{}) check.Args(database.InsertProvisionerJobLogsParams{ JobID: j.ID, - }).Asserts(rbac.ResourceSystem, policy.ActionCreate) + }).Asserts(rbac.ResourceProvisionerJobs, policy.ActionUpdate) })) s.Run("InsertProvisionerJobTimings", s.Subtest(func(db database.Store, check *expects) { j := dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{}) check.Args(database.InsertProvisionerJobTimingsParams{ JobID: j.ID, - }).Asserts(rbac.ResourceSystem, policy.ActionCreate) + }).Asserts(rbac.ResourceProvisionerJobs, policy.ActionUpdate) })) s.Run("UpsertProvisionerDaemon", s.Subtest(func(db database.Store, check *expects) { dbtestutil.DisableForeignKeysAndTriggers(s.T(), db) @@ -4199,11 +4199,8 @@ func (s *MethodTestSuite) TestSystemFunctions() { s.Run("GetFileTemplates", s.Subtest(func(db database.Store, check *expects) { check.Args(uuid.New()).Asserts(rbac.ResourceSystem, policy.ActionRead) })) - s.Run("GetHungProvisionerJobs", s.Subtest(func(db database.Store, check *expects) { - check.Args(time.Time{}).Asserts() - })) - s.Run("GetPendingProvisionerJobs", s.Subtest(func(db database.Store, check *expects) { - check.Args(time.Time{}).Asserts() + s.Run("GetProvisionerJobsToBeReaped", s.Subtest(func(db database.Store, check *expects) { + check.Args(database.GetProvisionerJobsToBeReapedParams{}).Asserts(rbac.ResourceProvisionerJobs, policy.ActionRead) })) s.Run("UpsertOAuthSigningKey", s.Subtest(func(db database.Store, check *expects) { check.Args("foo").Asserts(rbac.ResourceSystem, policy.ActionUpdate) @@ -4282,7 +4279,7 @@ func (s *MethodTestSuite) TestSystemFunctions() { check.Args([]uuid.UUID{uuid.New()}).Asserts(rbac.ResourceSystem, policy.ActionRead) })) s.Run("GetProvisionerJobsByIDsWithQueuePosition", s.Subtest(func(db database.Store, check *expects) { - check.Args([]uuid.UUID{}).Asserts() + check.Args([]uuid.UUID{}).Asserts(rbac.ResourceProvisionerJobs, policy.ActionRead) })) s.Run("GetReplicaByID", s.Subtest(func(db database.Store, check *expects) { check.Args(uuid.New()).Asserts(rbac.ResourceSystem, policy.ActionRead).Errors(sql.ErrNoRows) diff --git a/coderd/rbac/object_gen.go b/coderd/rbac/object_gen.go index 0336616b5e550..ad1a510fd44bd 100644 --- a/coderd/rbac/object_gen.go +++ b/coderd/rbac/object_gen.go @@ -234,6 +234,7 @@ var ( // ResourceProvisionerJobs // Valid Actions + // - "ActionCreate" :: create provisioner jobs // - "ActionRead" :: read provisioner jobs // - "ActionUpdate" :: update provisioner jobs ResourceProvisionerJobs = Object{ diff --git a/coderd/rbac/policy/policy.go b/coderd/rbac/policy/policy.go index 070692c5e50c6..c37e84c48f964 100644 --- a/coderd/rbac/policy/policy.go +++ b/coderd/rbac/policy/policy.go @@ -184,6 +184,7 @@ var RBACPermissions = map[string]PermissionDefinition{ Actions: map[Action]ActionDefinition{ ActionRead: actDef("read provisioner jobs"), ActionUpdate: actDef("update provisioner jobs"), + ActionCreate: actDef("create provisioner jobs"), }, }, "organization": { diff --git a/coderd/rbac/roles_test.go b/coderd/rbac/roles_test.go index 5f1381caedf27..6d42a01474d1a 100644 --- a/coderd/rbac/roles_test.go +++ b/coderd/rbac/roles_test.go @@ -580,7 +580,7 @@ func TestRolePermissions(t *testing.T) { }, { Name: "ProvisionerJobs", - Actions: []policy.Action{policy.ActionRead, policy.ActionUpdate}, + Actions: []policy.Action{policy.ActionRead, policy.ActionUpdate, policy.ActionCreate}, Resource: rbac.ResourceProvisionerJobs.InOrg(orgID), AuthorizeMap: map[bool][]hasAuthSubjects{ true: {owner, orgTemplateAdmin, orgAdmin}, diff --git a/codersdk/rbacresources_gen.go b/codersdk/rbacresources_gen.go index 8a89ac6d5e507..6157281f21356 100644 --- a/codersdk/rbacresources_gen.go +++ b/codersdk/rbacresources_gen.go @@ -90,7 +90,7 @@ var RBACResourceActions = map[RBACResource][]RBACAction{ ResourceOrganization: {ActionCreate, ActionDelete, ActionRead, ActionUpdate}, ResourceOrganizationMember: {ActionCreate, ActionDelete, ActionRead, ActionUpdate}, ResourceProvisionerDaemon: {ActionCreate, ActionDelete, ActionRead, ActionUpdate}, - ResourceProvisionerJobs: {ActionRead, ActionUpdate, ActionCreate}, + ResourceProvisionerJobs: {ActionCreate, ActionRead, ActionUpdate}, ResourceReplicas: {ActionRead}, ResourceSystem: {ActionCreate, ActionDelete, ActionRead, ActionUpdate}, ResourceTailnetCoordinator: {ActionCreate, ActionDelete, ActionRead, ActionUpdate}, From 8d4fa5aaaf7af956917d81d58760d721c7c4b058 Mon Sep 17 00:00:00 2001 From: Michael Suchacz <203725896+ibetitsmike@users.noreply.github.com> Date: Thu, 15 May 2025 19:20:53 +0000 Subject: [PATCH 15/24] fixed sql comments --- coderd/database/queries/provisionerjobs.sql | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/coderd/database/queries/provisionerjobs.sql b/coderd/database/queries/provisionerjobs.sql index b455babf95be6..2317ea38ac683 100644 --- a/coderd/database/queries/provisionerjobs.sql +++ b/coderd/database/queries/provisionerjobs.sql @@ -275,14 +275,14 @@ FROM provisioner_jobs WHERE ( - -- If the job has not been started within $1, reap it. + -- If the job has not been started before @pending_since, reap it. updated_at < @pending_since AND started_at IS NULL AND completed_at IS NULL ) OR ( - -- If the job has been started but not completed within $2, reap it. + -- If the job has been started but not completed before @hung_since, reap it. updated_at < @hung_since AND started_at IS NOT NULL AND completed_at IS NULL From 18b809c6438850ad0d808074b6d0f2098d7bee0c Mon Sep 17 00:00:00 2001 From: Michael Suchacz <203725896+ibetitsmike@users.noreply.github.com> Date: Fri, 16 May 2025 14:16:49 +0000 Subject: [PATCH 16/24] taking a step back with RBAC --- coderd/database/dbauthz/dbauthz.go | 77 +++++++++++++------------ coderd/database/dbauthz/dbauthz_test.go | 20 +++---- coderd/database/queries.sql.go | 4 +- site/src/api/rbacresourcesGenerated.ts | 1 + 4 files changed, 54 insertions(+), 48 deletions(-) diff --git a/coderd/database/dbauthz/dbauthz.go b/coderd/database/dbauthz/dbauthz.go index 24adf526f7010..56f0479457cbf 100644 --- a/coderd/database/dbauthz/dbauthz.go +++ b/coderd/database/dbauthz/dbauthz.go @@ -1088,9 +1088,9 @@ func (q *querier) AcquireNotificationMessages(ctx context.Context, arg database. } func (q *querier) AcquireProvisionerJob(ctx context.Context, arg database.AcquireProvisionerJobParams) (database.ProvisionerJob, error) { - if err := q.authorizeContext(ctx, policy.ActionUpdate, rbac.ResourceProvisionerJobs); err != nil { - return database.ProvisionerJob{}, err - } + // if err := q.authorizeContext(ctx, policy.ActionUpdate, rbac.ResourceProvisionerJobs); err != nil { + // return database.ProvisionerJob{}, err + // } return q.db.AcquireProvisionerJob(ctx, arg) } @@ -2309,30 +2309,31 @@ func (q *querier) GetProvisionerJobTimingsByJobID(ctx context.Context, jobID uui } func (q *querier) GetProvisionerJobsByIDs(ctx context.Context, ids []uuid.UUID) ([]database.ProvisionerJob, error) { - if err := q.authorizeContext(ctx, policy.ActionRead, rbac.ResourceProvisionerJobs); err != nil { - return nil, err - } + // if err := q.authorizeContext(ctx, policy.ActionRead, rbac.ResourceProvisionerJobs); err != nil { + // return nil, err + // } return q.db.GetProvisionerJobsByIDs(ctx, ids) } func (q *querier) GetProvisionerJobsByIDsWithQueuePosition(ctx context.Context, ids []uuid.UUID) ([]database.GetProvisionerJobsByIDsWithQueuePositionRow, error) { - if err := q.authorizeContext(ctx, policy.ActionRead, rbac.ResourceProvisionerJobs); err != nil { - return nil, err - } + // if err := q.authorizeContext(ctx, policy.ActionRead, rbac.ResourceProvisionerJobs); err != nil { + // return nil, err + // } + // policy.ActionRead, rbac.ResourceProvisionerJobs.InOrg(org.ID) return q.db.GetProvisionerJobsByIDsWithQueuePosition(ctx, ids) } func (q *querier) GetProvisionerJobsByOrganizationAndStatusWithQueuePositionAndProvisioner(ctx context.Context, arg database.GetProvisionerJobsByOrganizationAndStatusWithQueuePositionAndProvisionerParams) ([]database.GetProvisionerJobsByOrganizationAndStatusWithQueuePositionAndProvisionerRow, error) { - if err := q.authorizeContext(ctx, policy.ActionRead, rbac.ResourceProvisionerJobs); err != nil { - return nil, err - } + // if err := q.authorizeContext(ctx, policy.ActionRead, rbac.ResourceProvisionerJobs); err != nil { + // return nil, err + // } return fetchWithPostFilter(q.auth, policy.ActionRead, q.db.GetProvisionerJobsByOrganizationAndStatusWithQueuePositionAndProvisioner)(ctx, arg) } func (q *querier) GetProvisionerJobsCreatedAfter(ctx context.Context, createdAt time.Time) ([]database.ProvisionerJob, error) { - if err := q.authorizeContext(ctx, policy.ActionRead, rbac.ResourceProvisionerJobs); err != nil { - return nil, err - } + // if err := q.authorizeContext(ctx, policy.ActionRead, rbac.ResourceProvisionerJobs); err != nil { + // return nil, err + // } return q.db.GetProvisionerJobsCreatedAfter(ctx, createdAt) } @@ -3528,23 +3529,27 @@ func (q *querier) InsertPresetParameters(ctx context.Context, arg database.Inser } func (q *querier) InsertProvisionerJob(ctx context.Context, arg database.InsertProvisionerJobParams) (database.ProvisionerJob, error) { - if err := q.authorizeContext(ctx, policy.ActionCreate, rbac.ResourceProvisionerJobs); err != nil { - return database.ProvisionerJob{}, err - } + // TODO: Remove this once we have a proper rbac check for provisioner jobs. + // Currently ProvisionerJobs are not associated with a user, so we can't + // check for a user's permissions. We'd need to check for the associated workspace + // and verify ownership through that. + // if err := q.authorizeContext(ctx, policy.ActionCreate, rbac.ResourceProvisionerJobs); err != nil { + // return database.ProvisionerJob{}, err + // } return q.db.InsertProvisionerJob(ctx, arg) } func (q *querier) InsertProvisionerJobLogs(ctx context.Context, arg database.InsertProvisionerJobLogsParams) ([]database.ProvisionerJobLog, error) { - if err := q.authorizeContext(ctx, policy.ActionUpdate, rbac.ResourceProvisionerJobs); err != nil { - return nil, err - } + // if err := q.authorizeContext(ctx, policy.ActionUpdate, rbac.ResourceProvisionerJobs); err != nil { + // return nil, err + // } return q.db.InsertProvisionerJobLogs(ctx, arg) } func (q *querier) InsertProvisionerJobTimings(ctx context.Context, arg database.InsertProvisionerJobTimingsParams) ([]database.ProvisionerJobTiming, error) { - if err := q.authorizeContext(ctx, policy.ActionUpdate, rbac.ResourceProvisionerJobs); err != nil { - return nil, err - } + // if err := q.authorizeContext(ctx, policy.ActionUpdate, rbac.ResourceProvisionerJobs); err != nil { + // return nil, err + // } return q.db.InsertProvisionerJobTimings(ctx, arg) } @@ -4168,16 +4173,16 @@ func (q *querier) UpdateProvisionerDaemonLastSeenAt(ctx context.Context, arg dat } func (q *querier) UpdateProvisionerJobByID(ctx context.Context, arg database.UpdateProvisionerJobByIDParams) error { - if err := q.authorizeContext(ctx, policy.ActionUpdate, rbac.ResourceProvisionerJobs); err != nil { - return err - } + // if err := q.authorizeContext(ctx, policy.ActionUpdate, rbac.ResourceProvisionerJobs); err != nil { + // return err + // } return q.db.UpdateProvisionerJobByID(ctx, arg) } func (q *querier) UpdateProvisionerJobWithCancelByID(ctx context.Context, arg database.UpdateProvisionerJobWithCancelByIDParams) error { - if err := q.authorizeContext(ctx, policy.ActionUpdate, rbac.ResourceProvisionerJobs); err != nil { - return err - } + // if err := q.authorizeContext(ctx, policy.ActionUpdate, rbac.ResourceProvisionerJobs); err != nil { + // return err + // } job, err := q.db.GetProvisionerJobByID(ctx, arg.ID) if err != nil { @@ -4246,16 +4251,16 @@ func (q *querier) UpdateProvisionerJobWithCancelByID(ctx context.Context, arg da } func (q *querier) UpdateProvisionerJobWithCompleteByID(ctx context.Context, arg database.UpdateProvisionerJobWithCompleteByIDParams) error { - if err := q.authorizeContext(ctx, policy.ActionUpdate, rbac.ResourceProvisionerJobs); err != nil { - return err - } + // if err := q.authorizeContext(ctx, policy.ActionUpdate, rbac.ResourceProvisionerJobs); err != nil { + // return err + // } return q.db.UpdateProvisionerJobWithCompleteByID(ctx, arg) } func (q *querier) UpdateProvisionerJobWithCompleteWithStartedAtByID(ctx context.Context, arg database.UpdateProvisionerJobWithCompleteWithStartedAtByIDParams) error { - if err := q.authorizeContext(ctx, policy.ActionUpdate, rbac.ResourceProvisionerJobs); err != nil { - return err - } + // if err := q.authorizeContext(ctx, policy.ActionUpdate, rbac.ResourceProvisionerJobs); err != nil { + // return err + // } return q.db.UpdateProvisionerJobWithCompleteWithStartedAtByID(ctx, arg) } diff --git a/coderd/database/dbauthz/dbauthz_test.go b/coderd/database/dbauthz/dbauthz_test.go index 517ef486c54a7..3fb5e3d62f135 100644 --- a/coderd/database/dbauthz/dbauthz_test.go +++ b/coderd/database/dbauthz/dbauthz_test.go @@ -3892,7 +3892,7 @@ func (s *MethodTestSuite) TestSystemFunctions() { })) s.Run("GetProvisionerJobsCreatedAfter", s.Subtest(func(db database.Store, check *expects) { _ = dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{CreatedAt: time.Now().Add(-time.Hour)}) - check.Args(time.Now()).Asserts(rbac.ResourceProvisionerJobs, policy.ActionRead) + check.Args(time.Now()).Asserts( /* rbac.ResourceProvisionerJobs, policy.ActionRead */ ) })) s.Run("GetTemplateVersionsByIDs", s.Subtest(func(db database.Store, check *expects) { dbtestutil.DisableForeignKeysAndTriggers(s.T(), db) @@ -3978,7 +3978,7 @@ func (s *MethodTestSuite) TestSystemFunctions() { a := dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{}) b := dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{}) check.Args([]uuid.UUID{a.ID, b.ID}). - Asserts(rbac.ResourceProvisionerJobs, policy.ActionRead). + Asserts( /* rbac.ResourceProvisionerJobs, policy.ActionRead */ ). Returns(slice.New(a, b)) })) s.Run("InsertWorkspaceAgent", s.Subtest(func(db database.Store, check *expects) { @@ -4022,26 +4022,26 @@ func (s *MethodTestSuite) TestSystemFunctions() { OrganizationID: j.OrganizationID, Types: []database.ProvisionerType{j.Provisioner}, ProvisionerTags: must(json.Marshal(j.Tags)), - }).Asserts(rbac.ResourceProvisionerJobs, policy.ActionUpdate) + }).Asserts( /* rbac.ResourceProvisionerJobs, policy.ActionUpdate */ ) })) s.Run("UpdateProvisionerJobWithCompleteByID", s.Subtest(func(db database.Store, check *expects) { j := dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{}) check.Args(database.UpdateProvisionerJobWithCompleteByIDParams{ ID: j.ID, - }).Asserts(rbac.ResourceProvisionerJobs, policy.ActionUpdate) + }).Asserts( /* rbac.ResourceProvisionerJobs, policy.ActionUpdate */ ) })) s.Run("UpdateProvisionerJobWithCompleteWithStartedAtByID", s.Subtest(func(db database.Store, check *expects) { j := dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{}) check.Args(database.UpdateProvisionerJobWithCompleteWithStartedAtByIDParams{ ID: j.ID, - }).Asserts(rbac.ResourceProvisionerJobs, policy.ActionUpdate) + }).Asserts( /* rbac.ResourceProvisionerJobs, policy.ActionUpdate */ ) })) s.Run("UpdateProvisionerJobByID", s.Subtest(func(db database.Store, check *expects) { j := dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{}) check.Args(database.UpdateProvisionerJobByIDParams{ ID: j.ID, UpdatedAt: time.Now(), - }).Asserts(rbac.ResourceProvisionerJobs, policy.ActionUpdate) + }).Asserts( /* rbac.ResourceProvisionerJobs, policy.ActionUpdate */ ) })) s.Run("InsertProvisionerJob", s.Subtest(func(db database.Store, check *expects) { dbtestutil.DisableForeignKeysAndTriggers(s.T(), db) @@ -4051,19 +4051,19 @@ func (s *MethodTestSuite) TestSystemFunctions() { StorageMethod: database.ProvisionerStorageMethodFile, Type: database.ProvisionerJobTypeWorkspaceBuild, Input: json.RawMessage("{}"), - }).Asserts(rbac.ResourceProvisionerJobs, policy.ActionCreate) + }).Asserts( /* rbac.ResourceProvisionerJobs, policy.ActionCreate */ ) })) s.Run("InsertProvisionerJobLogs", s.Subtest(func(db database.Store, check *expects) { j := dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{}) check.Args(database.InsertProvisionerJobLogsParams{ JobID: j.ID, - }).Asserts(rbac.ResourceProvisionerJobs, policy.ActionUpdate) + }).Asserts( /* rbac.ResourceProvisionerJobs, policy.ActionUpdate */ ) })) s.Run("InsertProvisionerJobTimings", s.Subtest(func(db database.Store, check *expects) { j := dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{}) check.Args(database.InsertProvisionerJobTimingsParams{ JobID: j.ID, - }).Asserts(rbac.ResourceProvisionerJobs, policy.ActionUpdate) + }).Asserts( /* rbac.ResourceProvisionerJobs, policy.ActionUpdate */ ) })) s.Run("UpsertProvisionerDaemon", s.Subtest(func(db database.Store, check *expects) { dbtestutil.DisableForeignKeysAndTriggers(s.T(), db) @@ -4279,7 +4279,7 @@ func (s *MethodTestSuite) TestSystemFunctions() { check.Args([]uuid.UUID{uuid.New()}).Asserts(rbac.ResourceSystem, policy.ActionRead) })) s.Run("GetProvisionerJobsByIDsWithQueuePosition", s.Subtest(func(db database.Store, check *expects) { - check.Args([]uuid.UUID{}).Asserts(rbac.ResourceProvisionerJobs, policy.ActionRead) + check.Args([]uuid.UUID{}).Asserts( /* rbac.ResourceProvisionerJobs, policy.ActionRead */ ) })) s.Run("GetReplicaByID", s.Subtest(func(db database.Store, check *expects) { check.Args(uuid.New()).Asserts(rbac.ResourceSystem, policy.ActionRead).Errors(sql.ErrNoRows) diff --git a/coderd/database/queries.sql.go b/coderd/database/queries.sql.go index 10727fa75dec6..06aba2e412071 100644 --- a/coderd/database/queries.sql.go +++ b/coderd/database/queries.sql.go @@ -7850,14 +7850,14 @@ FROM provisioner_jobs WHERE ( - -- If the job has not been started within $1, reap it. + -- If the job has not been started before @pending_since, reap it. updated_at < $1 AND started_at IS NULL AND completed_at IS NULL ) OR ( - -- If the job has been started but not completed within $2, reap it. + -- If the job has been started but not completed before @hung_since, reap it. updated_at < $2 AND started_at IS NOT NULL AND completed_at IS NULL diff --git a/site/src/api/rbacresourcesGenerated.ts b/site/src/api/rbacresourcesGenerated.ts index 7016b78d876e0..3acb86c079908 100644 --- a/site/src/api/rbacresourcesGenerated.ts +++ b/site/src/api/rbacresourcesGenerated.ts @@ -130,6 +130,7 @@ export const RBACResourceActions: Partial< update: "update a provisioner daemon", }, provisioner_jobs: { + create: "create provisioner jobs", read: "read provisioner jobs", update: "update provisioner jobs", }, From 0fe1404bb3c5777fa9983fe7bf268b05873e25f8 Mon Sep 17 00:00:00 2001 From: Michael Suchacz <203725896+ibetitsmike@users.noreply.github.com> Date: Fri, 16 May 2025 14:40:32 +0000 Subject: [PATCH 17/24] WIP --- coderd/database/dbauthz/dbauthz.go | 7 ++++ coderd/database/dbauthz/dbauthz_test.go | 3 ++ coderd/database/dbmem/dbmem.go | 7 ++++ coderd/database/dbmetrics/querymetrics.go | 7 ++++ coderd/database/dbmock/dbmock.go | 15 ++++++++ coderd/database/querier.go | 3 ++ coderd/database/queries.sql.go | 41 ++++++++++++++++++++- coderd/database/queries/provisionerjobs.sql | 15 +++++++- coderd/jobreaper/detector.go | 14 ++----- 9 files changed, 99 insertions(+), 13 deletions(-) diff --git a/coderd/database/dbauthz/dbauthz.go b/coderd/database/dbauthz/dbauthz.go index 56f0479457cbf..43210084eec5e 100644 --- a/coderd/database/dbauthz/dbauthz.go +++ b/coderd/database/dbauthz/dbauthz.go @@ -2300,6 +2300,13 @@ func (q *querier) GetProvisionerJobByID(ctx context.Context, id uuid.UUID) (data return job, nil } +func (q *querier) GetProvisionerJobByIDForUpdate(ctx context.Context, id uuid.UUID) (database.ProvisionerJob, error) { + if err := q.authorizeContext(ctx, policy.ActionRead, rbac.ResourceProvisionerJobs); err != nil { + return database.ProvisionerJob{}, err + } + return q.db.GetProvisionerJobByIDForUpdate(ctx, id) +} + func (q *querier) GetProvisionerJobTimingsByJobID(ctx context.Context, jobID uuid.UUID) ([]database.ProvisionerJobTiming, error) { _, err := q.GetProvisionerJobByID(ctx, jobID) if err != nil { diff --git a/coderd/database/dbauthz/dbauthz_test.go b/coderd/database/dbauthz/dbauthz_test.go index 3fb5e3d62f135..57532855955ee 100644 --- a/coderd/database/dbauthz/dbauthz_test.go +++ b/coderd/database/dbauthz/dbauthz_test.go @@ -4444,6 +4444,9 @@ func (s *MethodTestSuite) TestSystemFunctions() { VapidPrivateKey: "test", }).Asserts(rbac.ResourceDeploymentConfig, policy.ActionUpdate) })) + s.Run("GetProvisionerJobByIDForUpdate", s.Subtest(func(db database.Store, check *expects) { + check.Args(uuid.New()).Asserts(rbac.ResourceProvisionerJobs, policy.ActionRead).Errors(sql.ErrNoRows) + })) } func (s *MethodTestSuite) TestNotifications() { diff --git a/coderd/database/dbmem/dbmem.go b/coderd/database/dbmem/dbmem.go index 33eeaf5f5f936..d6725b066b732 100644 --- a/coderd/database/dbmem/dbmem.go +++ b/coderd/database/dbmem/dbmem.go @@ -4625,6 +4625,13 @@ func (q *FakeQuerier) GetProvisionerJobByID(ctx context.Context, id uuid.UUID) ( return q.getProvisionerJobByIDNoLock(ctx, id) } +func (q *FakeQuerier) GetProvisionerJobByIDForUpdate(ctx context.Context, id uuid.UUID) (database.ProvisionerJob, error) { + q.mutex.RLock() + defer q.mutex.RUnlock() + + return q.getProvisionerJobByIDNoLock(ctx, id) +} + func (q *FakeQuerier) GetProvisionerJobTimingsByJobID(_ context.Context, jobID uuid.UUID) ([]database.ProvisionerJobTiming, error) { q.mutex.RLock() defer q.mutex.RUnlock() diff --git a/coderd/database/dbmetrics/querymetrics.go b/coderd/database/dbmetrics/querymetrics.go index 4be271533a97c..854d03fc666cf 100644 --- a/coderd/database/dbmetrics/querymetrics.go +++ b/coderd/database/dbmetrics/querymetrics.go @@ -1187,6 +1187,13 @@ func (m queryMetricsStore) GetProvisionerJobByID(ctx context.Context, id uuid.UU return job, err } +func (m queryMetricsStore) GetProvisionerJobByIDForUpdate(ctx context.Context, id uuid.UUID) (database.ProvisionerJob, error) { + start := time.Now() + r0, r1 := m.s.GetProvisionerJobByIDForUpdate(ctx, id) + m.queryLatencies.WithLabelValues("GetProvisionerJobByIDForUpdate").Observe(time.Since(start).Seconds()) + return r0, r1 +} + func (m queryMetricsStore) GetProvisionerJobTimingsByJobID(ctx context.Context, jobID uuid.UUID) ([]database.ProvisionerJobTiming, error) { start := time.Now() r0, r1 := m.s.GetProvisionerJobTimingsByJobID(ctx, jobID) diff --git a/coderd/database/dbmock/dbmock.go b/coderd/database/dbmock/dbmock.go index 5f0024aa22c42..5dbe039b46758 100644 --- a/coderd/database/dbmock/dbmock.go +++ b/coderd/database/dbmock/dbmock.go @@ -2433,6 +2433,21 @@ func (mr *MockStoreMockRecorder) GetProvisionerJobByID(ctx, id any) *gomock.Call return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetProvisionerJobByID", reflect.TypeOf((*MockStore)(nil).GetProvisionerJobByID), ctx, id) } +// GetProvisionerJobByIDForUpdate mocks base method. +func (m *MockStore) GetProvisionerJobByIDForUpdate(ctx context.Context, id uuid.UUID) (database.ProvisionerJob, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "GetProvisionerJobByIDForUpdate", ctx, id) + ret0, _ := ret[0].(database.ProvisionerJob) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// GetProvisionerJobByIDForUpdate indicates an expected call of GetProvisionerJobByIDForUpdate. +func (mr *MockStoreMockRecorder) GetProvisionerJobByIDForUpdate(ctx, id any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetProvisionerJobByIDForUpdate", reflect.TypeOf((*MockStore)(nil).GetProvisionerJobByIDForUpdate), ctx, id) +} + // GetProvisionerJobTimingsByJobID mocks base method. func (m *MockStore) GetProvisionerJobTimingsByJobID(ctx context.Context, jobID uuid.UUID) ([]database.ProvisionerJobTiming, error) { m.ctrl.T.Helper() diff --git a/coderd/database/querier.go b/coderd/database/querier.go index af85e9d0d9c22..b956aea67c130 100644 --- a/coderd/database/querier.go +++ b/coderd/database/querier.go @@ -264,6 +264,9 @@ type sqlcQuerier interface { // Previous job information. GetProvisionerDaemonsWithStatusByOrganization(ctx context.Context, arg GetProvisionerDaemonsWithStatusByOrganizationParams) ([]GetProvisionerDaemonsWithStatusByOrganizationRow, error) GetProvisionerJobByID(ctx context.Context, id uuid.UUID) (ProvisionerJob, error) + // Gets a single provisioner job by ID for update. + // This is used to securely reap jobs that have been hung/pending for a long time. + GetProvisionerJobByIDForUpdate(ctx context.Context, id uuid.UUID) (ProvisionerJob, error) GetProvisionerJobTimingsByJobID(ctx context.Context, jobID uuid.UUID) ([]ProvisionerJobTiming, error) GetProvisionerJobsByIDs(ctx context.Context, ids []uuid.UUID) ([]ProvisionerJob, error) GetProvisionerJobsByIDsWithQueuePosition(ctx context.Context, ids []uuid.UUID) ([]GetProvisionerJobsByIDsWithQueuePositionRow, error) diff --git a/coderd/database/queries.sql.go b/coderd/database/queries.sql.go index 06aba2e412071..e119c0f002ea2 100644 --- a/coderd/database/queries.sql.go +++ b/coderd/database/queries.sql.go @@ -7412,6 +7412,46 @@ func (q *sqlQuerier) GetProvisionerJobByID(ctx context.Context, id uuid.UUID) (P return i, err } +const getProvisionerJobByIDForUpdate = `-- name: GetProvisionerJobByIDForUpdate :one +SELECT + id, created_at, updated_at, started_at, canceled_at, completed_at, error, organization_id, initiator_id, provisioner, storage_method, type, input, worker_id, file_id, tags, error_code, trace_metadata, job_status +FROM + provisioner_jobs +WHERE + id = $1 +FOR UPDATE +SKIP LOCKED +` + +// Gets a single provisioner job by ID for update. +// This is used to securely reap jobs that have been hung/pending for a long time. +func (q *sqlQuerier) GetProvisionerJobByIDForUpdate(ctx context.Context, id uuid.UUID) (ProvisionerJob, error) { + row := q.db.QueryRowContext(ctx, getProvisionerJobByIDForUpdate, id) + var i ProvisionerJob + err := row.Scan( + &i.ID, + &i.CreatedAt, + &i.UpdatedAt, + &i.StartedAt, + &i.CanceledAt, + &i.CompletedAt, + &i.Error, + &i.OrganizationID, + &i.InitiatorID, + &i.Provisioner, + &i.StorageMethod, + &i.Type, + &i.Input, + &i.WorkerID, + &i.FileID, + &i.Tags, + &i.ErrorCode, + &i.TraceMetadata, + &i.JobStatus, + ) + return i, err +} + const getProvisionerJobTimingsByJobID = `-- name: GetProvisionerJobTimingsByJobID :many SELECT job_id, started_at, ended_at, stage, source, action, resource FROM provisioner_job_timings WHERE job_id = $1 @@ -7864,7 +7904,6 @@ WHERE ) ORDER BY random() LIMIT $3 -FOR UPDATE SKIP LOCKED ` type GetProvisionerJobsToBeReapedParams struct { diff --git a/coderd/database/queries/provisionerjobs.sql b/coderd/database/queries/provisionerjobs.sql index 2317ea38ac683..1a01cbdf8ddd4 100644 --- a/coderd/database/queries/provisionerjobs.sql +++ b/coderd/database/queries/provisionerjobs.sql @@ -41,6 +41,18 @@ FROM WHERE id = $1; +-- name: GetProvisionerJobByIDForUpdate :one +-- Gets a single provisioner job by ID for update. +-- This is used to securely reap jobs that have been hung/pending for a long time. +SELECT + * +FROM + provisioner_jobs +WHERE + id = $1 +FOR UPDATE +SKIP LOCKED; + -- name: GetProvisionerJobsByIDs :many SELECT * @@ -288,8 +300,7 @@ WHERE AND completed_at IS NULL ) ORDER BY random() -LIMIT @max_jobs -FOR UPDATE SKIP LOCKED; +LIMIT @max_jobs; -- name: InsertProvisionerJobTimings :many INSERT INTO provisioner_job_timings (job_id, started_at, ended_at, stage, source, action, resource) diff --git a/coderd/jobreaper/detector.go b/coderd/jobreaper/detector.go index 762a04c819230..da33d2f537905 100644 --- a/coderd/jobreaper/detector.go +++ b/coderd/jobreaper/detector.go @@ -239,18 +239,12 @@ func reapJob(ctx context.Context, log slog.Logger, db database.Store, pub pubsub var lowestLogID int64 err := db.InTx(func(db database.Store) error { - locked, err := db.TryAcquireLock(ctx, database.GenLockID(fmt.Sprintf("reaper:%s", jobToReap.ID))) - if err != nil { - return xerrors.Errorf("acquire lock: %w", err) - } - if !locked { - // This error is ignored. - return acquireLockError{} - } - // Refetch the job while we hold the lock. - job, err := db.GetProvisionerJobByID(ctx, jobToReap.ID) + job, err := db.GetProvisionerJobByIDForUpdate(ctx, jobToReap.ID) if err != nil { + if xerrors.Is(err, sql.ErrNoRows) { + return acquireLockError{} + } return xerrors.Errorf("get provisioner job: %w", err) } From 77be34e62b1e0f498d5873f1fe20bad4f2c4f2b2 Mon Sep 17 00:00:00 2001 From: Michael Suchacz <203725896+ibetitsmike@users.noreply.github.com> Date: Fri, 16 May 2025 16:32:46 +0000 Subject: [PATCH 18/24] WIP --- coderd/database/dbauthz/dbauthz.go | 19 ++++++++++++------- coderd/database/dbauthz/dbauthz_test.go | 20 ++++++++++++-------- coderd/rbac/roles.go | 1 - 3 files changed, 24 insertions(+), 16 deletions(-) diff --git a/coderd/database/dbauthz/dbauthz.go b/coderd/database/dbauthz/dbauthz.go index 43210084eec5e..a2aee40ba801d 100644 --- a/coderd/database/dbauthz/dbauthz.go +++ b/coderd/database/dbauthz/dbauthz.go @@ -2316,9 +2316,15 @@ func (q *querier) GetProvisionerJobTimingsByJobID(ctx context.Context, jobID uui } func (q *querier) GetProvisionerJobsByIDs(ctx context.Context, ids []uuid.UUID) ([]database.ProvisionerJob, error) { - // if err := q.authorizeContext(ctx, policy.ActionRead, rbac.ResourceProvisionerJobs); err != nil { - // return nil, err - // } + provisionerJobs, err := q.db.GetProvisionerJobsByIDs(ctx, ids) + if err != nil { + return nil, err + } + for _, job := range provisionerJobs { + if err := q.authorizeContext(ctx, policy.ActionRead, rbac.ResourceProvisionerJobs.InOrg(job.OrganizationID)); err != nil { + return nil, err + } + } return q.db.GetProvisionerJobsByIDs(ctx, ids) } @@ -2326,7 +2332,6 @@ func (q *querier) GetProvisionerJobsByIDsWithQueuePosition(ctx context.Context, // if err := q.authorizeContext(ctx, policy.ActionRead, rbac.ResourceProvisionerJobs); err != nil { // return nil, err // } - // policy.ActionRead, rbac.ResourceProvisionerJobs.InOrg(org.ID) return q.db.GetProvisionerJobsByIDsWithQueuePosition(ctx, ids) } @@ -2338,9 +2343,9 @@ func (q *querier) GetProvisionerJobsByOrganizationAndStatusWithQueuePositionAndP } func (q *querier) GetProvisionerJobsCreatedAfter(ctx context.Context, createdAt time.Time) ([]database.ProvisionerJob, error) { - // if err := q.authorizeContext(ctx, policy.ActionRead, rbac.ResourceProvisionerJobs); err != nil { - // return nil, err - // } + if err := q.authorizeContext(ctx, policy.ActionRead, rbac.ResourceProvisionerJobs); err != nil { + return nil, err + } return q.db.GetProvisionerJobsCreatedAfter(ctx, createdAt) } diff --git a/coderd/database/dbauthz/dbauthz_test.go b/coderd/database/dbauthz/dbauthz_test.go index 57532855955ee..381c9b55b8352 100644 --- a/coderd/database/dbauthz/dbauthz_test.go +++ b/coderd/database/dbauthz/dbauthz_test.go @@ -694,9 +694,12 @@ func (s *MethodTestSuite) TestProvisionerJob() { Asserts(v.RBACObject(tpl), []policy.Action{policy.ActionRead, policy.ActionUpdate}).Returns() })) s.Run("GetProvisionerJobsByIDs", s.Subtest(func(db database.Store, check *expects) { - a := dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{}) - b := dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{}) - check.Args([]uuid.UUID{a.ID, b.ID}).Asserts().Returns(slice.New(a, b)) + orgID := uuid.New() + a := dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{OrganizationID: orgID}) + b := dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{OrganizationID: orgID}) + check.Args([]uuid.UUID{a.ID, b.ID}). + Asserts(rbac.ResourceProvisionerJobs.InOrg(orgID), policy.ActionRead, rbac.ResourceProvisionerJobs.InOrg(orgID), policy.ActionRead). + Returns(slice.New(a, b)) })) s.Run("GetProvisionerLogsAfterID", s.Subtest(func(db database.Store, check *expects) { u := dbgen.User(s.T(), db, database.User{}) @@ -3892,7 +3895,7 @@ func (s *MethodTestSuite) TestSystemFunctions() { })) s.Run("GetProvisionerJobsCreatedAfter", s.Subtest(func(db database.Store, check *expects) { _ = dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{CreatedAt: time.Now().Add(-time.Hour)}) - check.Args(time.Now()).Asserts( /* rbac.ResourceProvisionerJobs, policy.ActionRead */ ) + check.Args(time.Now()).Asserts(rbac.ResourceProvisionerJobs, policy.ActionRead) })) s.Run("GetTemplateVersionsByIDs", s.Subtest(func(db database.Store, check *expects) { dbtestutil.DisableForeignKeysAndTriggers(s.T(), db) @@ -3975,10 +3978,11 @@ func (s *MethodTestSuite) TestSystemFunctions() { Returns([]database.WorkspaceAgent{agt}) })) s.Run("GetProvisionerJobsByIDs", s.Subtest(func(db database.Store, check *expects) { - a := dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{}) - b := dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{}) + orgID := uuid.New() + a := dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{OrganizationID: orgID}) + b := dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{OrganizationID: orgID}) check.Args([]uuid.UUID{a.ID, b.ID}). - Asserts( /* rbac.ResourceProvisionerJobs, policy.ActionRead */ ). + Asserts(rbac.ResourceProvisionerJobs.InOrg(orgID), policy.ActionRead, rbac.ResourceProvisionerJobs.InOrg(orgID), policy.ActionRead). Returns(slice.New(a, b)) })) s.Run("InsertWorkspaceAgent", s.Subtest(func(db database.Store, check *expects) { @@ -4279,7 +4283,7 @@ func (s *MethodTestSuite) TestSystemFunctions() { check.Args([]uuid.UUID{uuid.New()}).Asserts(rbac.ResourceSystem, policy.ActionRead) })) s.Run("GetProvisionerJobsByIDsWithQueuePosition", s.Subtest(func(db database.Store, check *expects) { - check.Args([]uuid.UUID{}).Asserts( /* rbac.ResourceProvisionerJobs, policy.ActionRead */ ) + check.Args([]uuid.UUID{}).Asserts( /* rbac.ResourceProvisionerJobs.InOrg(orgID), policy.ActionRead */ ) })) s.Run("GetReplicaByID", s.Subtest(func(db database.Store, check *expects) { check.Args(uuid.New()).Asserts(rbac.ResourceSystem, policy.ActionRead).Errors(sql.ErrNoRows) diff --git a/coderd/rbac/roles.go b/coderd/rbac/roles.go index b9bcff5d42bdf..0b94a74201b16 100644 --- a/coderd/rbac/roles.go +++ b/coderd/rbac/roles.go @@ -273,7 +273,6 @@ func ReloadBuiltinRoles(opts *RoleOptions) { Permissions(map[string][]policy.Action{ ResourceWorkspace.Type: ownerWorkspaceActions, ResourceWorkspaceDormant.Type: {policy.ActionRead, policy.ActionDelete, policy.ActionCreate, policy.ActionUpdate, policy.ActionWorkspaceStop}, - ResourceProvisionerJobs.Type: {policy.ActionRead, policy.ActionUpdate, policy.ActionCreate}, })...), Org: map[string][]Permission{}, User: []Permission{}, From 4351529183f4c507a9ef8d35e073ad5573a5e33f Mon Sep 17 00:00:00 2001 From: Michael Suchacz <203725896+ibetitsmike@users.noreply.github.com> Date: Fri, 16 May 2025 17:02:49 +0000 Subject: [PATCH 19/24] WIP --- coderd/database/dbauthz/dbauthz.go | 30 ++++++++++++------------- coderd/database/dbauthz/dbauthz_test.go | 10 ++++----- 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/coderd/database/dbauthz/dbauthz.go b/coderd/database/dbauthz/dbauthz.go index a2aee40ba801d..e2b72a544595e 100644 --- a/coderd/database/dbauthz/dbauthz.go +++ b/coderd/database/dbauthz/dbauthz.go @@ -1088,9 +1088,9 @@ func (q *querier) AcquireNotificationMessages(ctx context.Context, arg database. } func (q *querier) AcquireProvisionerJob(ctx context.Context, arg database.AcquireProvisionerJobParams) (database.ProvisionerJob, error) { - // if err := q.authorizeContext(ctx, policy.ActionUpdate, rbac.ResourceProvisionerJobs); err != nil { - // return database.ProvisionerJob{}, err - // } + if err := q.authorizeContext(ctx, policy.ActionUpdate, rbac.ResourceProvisionerJobs); err != nil { + return database.ProvisionerJob{}, err + } return q.db.AcquireProvisionerJob(ctx, arg) } @@ -3559,9 +3559,9 @@ func (q *querier) InsertProvisionerJobLogs(ctx context.Context, arg database.Ins } func (q *querier) InsertProvisionerJobTimings(ctx context.Context, arg database.InsertProvisionerJobTimingsParams) ([]database.ProvisionerJobTiming, error) { - // if err := q.authorizeContext(ctx, policy.ActionUpdate, rbac.ResourceProvisionerJobs); err != nil { - // return nil, err - // } + if err := q.authorizeContext(ctx, policy.ActionUpdate, rbac.ResourceProvisionerJobs); err != nil { + return nil, err + } return q.db.InsertProvisionerJobTimings(ctx, arg) } @@ -4185,9 +4185,9 @@ func (q *querier) UpdateProvisionerDaemonLastSeenAt(ctx context.Context, arg dat } func (q *querier) UpdateProvisionerJobByID(ctx context.Context, arg database.UpdateProvisionerJobByIDParams) error { - // if err := q.authorizeContext(ctx, policy.ActionUpdate, rbac.ResourceProvisionerJobs); err != nil { - // return err - // } + if err := q.authorizeContext(ctx, policy.ActionUpdate, rbac.ResourceProvisionerJobs); err != nil { + return err + } return q.db.UpdateProvisionerJobByID(ctx, arg) } @@ -4263,16 +4263,16 @@ func (q *querier) UpdateProvisionerJobWithCancelByID(ctx context.Context, arg da } func (q *querier) UpdateProvisionerJobWithCompleteByID(ctx context.Context, arg database.UpdateProvisionerJobWithCompleteByIDParams) error { - // if err := q.authorizeContext(ctx, policy.ActionUpdate, rbac.ResourceProvisionerJobs); err != nil { - // return err - // } + if err := q.authorizeContext(ctx, policy.ActionUpdate, rbac.ResourceProvisionerJobs); err != nil { + return err + } return q.db.UpdateProvisionerJobWithCompleteByID(ctx, arg) } func (q *querier) UpdateProvisionerJobWithCompleteWithStartedAtByID(ctx context.Context, arg database.UpdateProvisionerJobWithCompleteWithStartedAtByIDParams) error { - // if err := q.authorizeContext(ctx, policy.ActionUpdate, rbac.ResourceProvisionerJobs); err != nil { - // return err - // } + if err := q.authorizeContext(ctx, policy.ActionUpdate, rbac.ResourceProvisionerJobs); err != nil { + return err + } return q.db.UpdateProvisionerJobWithCompleteWithStartedAtByID(ctx, arg) } diff --git a/coderd/database/dbauthz/dbauthz_test.go b/coderd/database/dbauthz/dbauthz_test.go index 381c9b55b8352..07344b30261ad 100644 --- a/coderd/database/dbauthz/dbauthz_test.go +++ b/coderd/database/dbauthz/dbauthz_test.go @@ -4026,26 +4026,26 @@ func (s *MethodTestSuite) TestSystemFunctions() { OrganizationID: j.OrganizationID, Types: []database.ProvisionerType{j.Provisioner}, ProvisionerTags: must(json.Marshal(j.Tags)), - }).Asserts( /* rbac.ResourceProvisionerJobs, policy.ActionUpdate */ ) + }).Asserts(rbac.ResourceProvisionerJobs, policy.ActionUpdate) })) s.Run("UpdateProvisionerJobWithCompleteByID", s.Subtest(func(db database.Store, check *expects) { j := dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{}) check.Args(database.UpdateProvisionerJobWithCompleteByIDParams{ ID: j.ID, - }).Asserts( /* rbac.ResourceProvisionerJobs, policy.ActionUpdate */ ) + }).Asserts(rbac.ResourceProvisionerJobs, policy.ActionUpdate) })) s.Run("UpdateProvisionerJobWithCompleteWithStartedAtByID", s.Subtest(func(db database.Store, check *expects) { j := dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{}) check.Args(database.UpdateProvisionerJobWithCompleteWithStartedAtByIDParams{ ID: j.ID, - }).Asserts( /* rbac.ResourceProvisionerJobs, policy.ActionUpdate */ ) + }).Asserts(rbac.ResourceProvisionerJobs, policy.ActionUpdate) })) s.Run("UpdateProvisionerJobByID", s.Subtest(func(db database.Store, check *expects) { j := dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{}) check.Args(database.UpdateProvisionerJobByIDParams{ ID: j.ID, UpdatedAt: time.Now(), - }).Asserts( /* rbac.ResourceProvisionerJobs, policy.ActionUpdate */ ) + }).Asserts(rbac.ResourceProvisionerJobs, policy.ActionUpdate) })) s.Run("InsertProvisionerJob", s.Subtest(func(db database.Store, check *expects) { dbtestutil.DisableForeignKeysAndTriggers(s.T(), db) @@ -4067,7 +4067,7 @@ func (s *MethodTestSuite) TestSystemFunctions() { j := dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{}) check.Args(database.InsertProvisionerJobTimingsParams{ JobID: j.ID, - }).Asserts( /* rbac.ResourceProvisionerJobs, policy.ActionUpdate */ ) + }).Asserts(rbac.ResourceProvisionerJobs, policy.ActionUpdate) })) s.Run("UpsertProvisionerDaemon", s.Subtest(func(db database.Store, check *expects) { dbtestutil.DisableForeignKeysAndTriggers(s.T(), db) From c03bfa39ac791e899a931c0c31f9bd0e7c2fe916 Mon Sep 17 00:00:00 2001 From: Michael Suchacz <203725896+ibetitsmike@users.noreply.github.com> Date: Mon, 19 May 2025 07:57:25 +0000 Subject: [PATCH 20/24] fixed InOrg check for provisionerjob resource --- coderd/database/dbauthz/dbauthz.go | 2 +- coderd/database/dbauthz/dbauthz_test.go | 18 +++++++++--------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/coderd/database/dbauthz/dbauthz.go b/coderd/database/dbauthz/dbauthz.go index e2b72a544595e..032f37339c21a 100644 --- a/coderd/database/dbauthz/dbauthz.go +++ b/coderd/database/dbauthz/dbauthz.go @@ -2325,7 +2325,7 @@ func (q *querier) GetProvisionerJobsByIDs(ctx context.Context, ids []uuid.UUID) return nil, err } } - return q.db.GetProvisionerJobsByIDs(ctx, ids) + return provisionerJobs, nil } func (q *querier) GetProvisionerJobsByIDsWithQueuePosition(ctx context.Context, ids []uuid.UUID) ([]database.GetProvisionerJobsByIDsWithQueuePositionRow, error) { diff --git a/coderd/database/dbauthz/dbauthz_test.go b/coderd/database/dbauthz/dbauthz_test.go index 07344b30261ad..5eb5115c51ef2 100644 --- a/coderd/database/dbauthz/dbauthz_test.go +++ b/coderd/database/dbauthz/dbauthz_test.go @@ -694,11 +694,11 @@ func (s *MethodTestSuite) TestProvisionerJob() { Asserts(v.RBACObject(tpl), []policy.Action{policy.ActionRead, policy.ActionUpdate}).Returns() })) s.Run("GetProvisionerJobsByIDs", s.Subtest(func(db database.Store, check *expects) { - orgID := uuid.New() - a := dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{OrganizationID: orgID}) - b := dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{OrganizationID: orgID}) + o := dbgen.Organization(s.T(), db, database.Organization{}) + a := dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{OrganizationID: o.ID}) + b := dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{OrganizationID: o.ID}) check.Args([]uuid.UUID{a.ID, b.ID}). - Asserts(rbac.ResourceProvisionerJobs.InOrg(orgID), policy.ActionRead, rbac.ResourceProvisionerJobs.InOrg(orgID), policy.ActionRead). + Asserts(rbac.ResourceProvisionerJobs.InOrg(o.ID), policy.ActionRead, rbac.ResourceProvisionerJobs.InOrg(o.ID), policy.ActionRead). Returns(slice.New(a, b)) })) s.Run("GetProvisionerLogsAfterID", s.Subtest(func(db database.Store, check *expects) { @@ -3978,11 +3978,11 @@ func (s *MethodTestSuite) TestSystemFunctions() { Returns([]database.WorkspaceAgent{agt}) })) s.Run("GetProvisionerJobsByIDs", s.Subtest(func(db database.Store, check *expects) { - orgID := uuid.New() - a := dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{OrganizationID: orgID}) - b := dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{OrganizationID: orgID}) + o := dbgen.Organization(s.T(), db, database.Organization{}) + a := dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{OrganizationID: o.ID}) + b := dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{OrganizationID: o.ID}) check.Args([]uuid.UUID{a.ID, b.ID}). - Asserts(rbac.ResourceProvisionerJobs.InOrg(orgID), policy.ActionRead, rbac.ResourceProvisionerJobs.InOrg(orgID), policy.ActionRead). + Asserts(rbac.ResourceProvisionerJobs.InOrg(o.ID), policy.ActionRead, rbac.ResourceProvisionerJobs.InOrg(o.ID), policy.ActionRead). Returns(slice.New(a, b)) })) s.Run("InsertWorkspaceAgent", s.Subtest(func(db database.Store, check *expects) { @@ -4283,7 +4283,7 @@ func (s *MethodTestSuite) TestSystemFunctions() { check.Args([]uuid.UUID{uuid.New()}).Asserts(rbac.ResourceSystem, policy.ActionRead) })) s.Run("GetProvisionerJobsByIDsWithQueuePosition", s.Subtest(func(db database.Store, check *expects) { - check.Args([]uuid.UUID{}).Asserts( /* rbac.ResourceProvisionerJobs.InOrg(orgID), policy.ActionRead */ ) + check.Args([]uuid.UUID{}).Asserts() })) s.Run("GetReplicaByID", s.Subtest(func(db database.Store, check *expects) { check.Args(uuid.New()).Asserts(rbac.ResourceSystem, policy.ActionRead).Errors(sql.ErrNoRows) From a15bd1cff30f2916ffbfd07c5b34623e2e1a52fb Mon Sep 17 00:00:00 2001 From: Michael Suchacz <203725896+ibetitsmike@users.noreply.github.com> Date: Mon, 19 May 2025 14:31:33 +0000 Subject: [PATCH 21/24] PR review; naming in the comments, added comments for SQL, less verbose logging --- coderd/database/dbauthz/dbauthz.go | 7 +--- coderd/database/dbmem/dbmem.go | 8 +++-- coderd/database/queries/provisionerjobs.sql | 1 + coderd/jobreaper/detector.go | 23 ++++++------ coderd/jobreaper/detector_test.go | 39 ++++----------------- provisioner/terraform/serve.go | 4 +-- 6 files changed, 29 insertions(+), 53 deletions(-) diff --git a/coderd/database/dbauthz/dbauthz.go b/coderd/database/dbauthz/dbauthz.go index 032f37339c21a..35727f14b8665 100644 --- a/coderd/database/dbauthz/dbauthz.go +++ b/coderd/database/dbauthz/dbauthz.go @@ -3542,12 +3542,7 @@ func (q *querier) InsertPresetParameters(ctx context.Context, arg database.Inser func (q *querier) InsertProvisionerJob(ctx context.Context, arg database.InsertProvisionerJobParams) (database.ProvisionerJob, error) { // TODO: Remove this once we have a proper rbac check for provisioner jobs. - // Currently ProvisionerJobs are not associated with a user, so we can't - // check for a user's permissions. We'd need to check for the associated workspace - // and verify ownership through that. - // if err := q.authorizeContext(ctx, policy.ActionCreate, rbac.ResourceProvisionerJobs); err != nil { - // return database.ProvisionerJob{}, err - // } + // Details in https://github.com/coder/coder/issues/16160 return q.db.InsertProvisionerJob(ctx, arg) } diff --git a/coderd/database/dbmem/dbmem.go b/coderd/database/dbmem/dbmem.go index d6725b066b732..3475fd3bf9fbe 100644 --- a/coderd/database/dbmem/dbmem.go +++ b/coderd/database/dbmem/dbmem.go @@ -8,6 +8,7 @@ import ( "errors" "fmt" "math" + "math/rand/v2" "reflect" "regexp" "slices" @@ -4883,11 +4884,14 @@ func (q *FakeQuerier) GetProvisionerJobsToBeReaped(_ context.Context, arg databa provisionerJob.Tags = maps.Clone(provisionerJob.Tags) hungJobs = append(hungJobs, provisionerJob) if len(hungJobs) >= int(maxJobs) { - return hungJobs, nil + break } } } } + rand.Shuffle(len(hungJobs), func(i, j int) { + hungJobs[i], hungJobs[j] = hungJobs[j], hungJobs[i] + }) return hungJobs, nil } @@ -10955,8 +10959,8 @@ func (q *FakeQuerier) UpdateProvisionerJobWithCompleteWithStartedAtByID(_ contex job.CompletedAt = arg.CompletedAt job.Error = arg.Error job.ErrorCode = arg.ErrorCode - job.JobStatus = provisionerJobStatus(job) job.StartedAt = arg.StartedAt + job.JobStatus = provisionerJobStatus(job) q.provisionerJobs[index] = job return nil } diff --git a/coderd/database/queries/provisionerjobs.sql b/coderd/database/queries/provisionerjobs.sql index 1a01cbdf8ddd4..4096fa5b2755a 100644 --- a/coderd/database/queries/provisionerjobs.sql +++ b/coderd/database/queries/provisionerjobs.sql @@ -299,6 +299,7 @@ WHERE AND started_at IS NOT NULL AND completed_at IS NULL ) +-- To avoid repeatedly attempting to reap the same jobs, we randomly order and limit to @max_jobs. ORDER BY random() LIMIT @max_jobs; diff --git a/coderd/jobreaper/detector.go b/coderd/jobreaper/detector.go index da33d2f537905..ad5774ee6b95d 100644 --- a/coderd/jobreaper/detector.go +++ b/coderd/jobreaper/detector.go @@ -20,12 +20,12 @@ import ( ) const ( - // HungJobDuration is the duration of time since the last update to a job - // before it is considered hung. + // HungJobDuration is the duration of time since the last update + // to a RUNNING job before it is considered hung. HungJobDuration = 5 * time.Minute - // PendingJobDuration is the duration of time since the last update to a job - // before it is considered hung. + // PendingJobDuration is the duration of time since last update + // to a PENDING job before it is considered dead. PendingJobDuration = 30 * time.Minute // HungJobExitTimeout is the duration of time that provisioners should allow @@ -42,7 +42,7 @@ const ( ) // jobLogMessages are written to provisioner job logs when a job is reaped -func jobLogMessages(reapType ReapType, threshold time.Duration) []string { +func JobLogMessages(reapType ReapType, threshold time.Duration) []string { return []string{ "", "====================", @@ -110,9 +110,9 @@ type Stats struct { Error error } -// New returns a new hang detector. +// New returns a new job reaper. func New(ctx context.Context, db database.Store, pub pubsub.Pubsub, log slog.Logger, tick <-chan time.Time) *Detector { - //nolint:gocritic // Hang detector has a limited set of permissions. + //nolint:gocritic // Job reaper has a limited set of permissions. ctx, cancel := context.WithCancel(dbauthz.AsJobReaper(ctx)) d := &Detector{ ctx: ctx, @@ -224,7 +224,7 @@ func (d *Detector) run(t time.Time) Stats { err := reapJob(ctx, log, d.db, d.pubsub, job) if err != nil { if !(xerrors.As(err, &acquireLockError{}) || xerrors.As(err, &jobIneligibleError{})) { - log.Error(ctx, fmt.Sprintf("error forcefully terminating %s provisioner job", job.Type), slog.Error(err)) + log.Error(ctx, "error forcefully terminating provisioner job", slog.F("type", job.Type), slog.Error(err)) } continue } @@ -260,7 +260,8 @@ func reapJob(ctx context.Context, log slog.Logger, db database.Store, pub pubsub } log.Warn( - ctx, fmt.Sprintf("detected %s provisioner job, forcefully terminating", jobToReap.Type), + ctx, "forcefully terminating provisioner job", + "type", jobToReap.Type, "threshold", jobToReap.Threshold, ) @@ -291,7 +292,7 @@ func reapJob(ctx context.Context, log slog.Logger, db database.Store, pub pubsub Output: nil, } now := dbtime.Now() - for i, msg := range jobLogMessages(jobToReap.Type, jobToReap.Threshold) { + for i, msg := range JobLogMessages(jobToReap.Type, jobToReap.Threshold) { // Set the created at in a way that ensures each message has // a unique timestamp so they will be sorted correctly. insertParams.CreatedAt = append(insertParams.CreatedAt, now.Add(time.Millisecond*time.Duration(i))) @@ -325,7 +326,7 @@ func reapJob(ctx context.Context, log slog.Logger, db database.Store, pub pubsub Valid: true, }, Error: sql.NullString{ - String: fmt.Sprintf("Coder: Build has been detected as %s for %.0f minutes and has been terminated by hang detector.", jobToReap.Type, jobToReap.Threshold.Minutes()), + String: fmt.Sprintf("Coder: Build has been detected as %s for %.0f minutes and has been terminated by the reaper.", jobToReap.Type, jobToReap.Threshold.Minutes()), Valid: true, }, ErrorCode: sql.NullString{ diff --git a/coderd/jobreaper/detector_test.go b/coderd/jobreaper/detector_test.go index 5d2a57b0fb47c..28457aeeca3a8 100644 --- a/coderd/jobreaper/detector_test.go +++ b/coderd/jobreaper/detector_test.go @@ -27,36 +27,6 @@ import ( "github.com/coder/coder/v2/testutil" ) -// jobType represents the type of job being reaped -type jobType string - -const ( - hungJobType jobType = "hung" - pendingJobType jobType = "pending" -) - -// jobLogMessages returns the messages to be written to provisioner job logs when a job is reaped -func jobLogMessages(jobType jobType, threshold float64) []string { - return []string{ - "", - "====================", - fmt.Sprintf("Coder: Build has been detected as %s for %.0f minutes and will be terminated.", jobType, threshold), - "====================", - "", - } -} - -// reapParamsFromJob determines the type and threshold for a job being reaped -func reapParamsFromJob(job database.ProvisionerJob) (jobType, float64) { - jobType := hungJobType - threshold := jobreaper.HungJobDuration.Minutes() - if !job.StartedAt.Valid { - jobType = pendingJobType - threshold = jobreaper.PendingJobDuration.Minutes() - } - return jobType, threshold -} - func TestMain(m *testing.M) { goleak.VerifyTestMain(m, testutil.GoleakOptions...) } @@ -972,8 +942,13 @@ func TestDetectorPushesLogs(t *testing.T) { CreatedAfter: after, }) require.NoError(t, err) - jobType, threshold := reapParamsFromJob(templateImportJob) - expectedLogs := jobLogMessages(jobType, threshold) + threshold := jobreaper.HungJobDuration + jobType := jobreaper.Hung + if templateImportJob.JobStatus == database.ProvisionerJobStatusPending { + threshold = jobreaper.PendingJobDuration + jobType = jobreaper.Pending + } + expectedLogs := jobreaper.JobLogMessages(jobType, threshold) require.Len(t, logs, len(expectedLogs)) for i, log := range logs { assert.Equal(t, database.LogLevelError, log.Level) diff --git a/provisioner/terraform/serve.go b/provisioner/terraform/serve.go index 67eeaccaed05d..3e671b0c68e56 100644 --- a/provisioner/terraform/serve.go +++ b/provisioner/terraform/serve.go @@ -39,9 +39,9 @@ type ServeOptions struct { // // This is a no-op on Windows where the process can't be interrupted. // - // Default value: 3 minutes (reaper.HungJobExitTimeout). This value should + // Default value: 3 minutes (jobreaper.HungJobExitTimeout). This value should // be kept less than the value that Coder uses to mark hung jobs as failed, - // which is 5 minutes (see reaper package). + // which is 5 minutes (see jobreaper package). ExitTimeout time.Duration } From 5b9348f40a8a4e0189066977cb5de30202fd2bbf Mon Sep 17 00:00:00 2001 From: Michael Suchacz <203725896+ibetitsmike@users.noreply.github.com> Date: Mon, 19 May 2025 15:00:58 +0000 Subject: [PATCH 22/24] fixes to tests after lint remove rand --- coderd/database/dbmem/dbmem.go | 2 +- coderd/database/querier.go | 1 + coderd/database/queries.sql.go | 1 + 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/coderd/database/dbmem/dbmem.go b/coderd/database/dbmem/dbmem.go index 3475fd3bf9fbe..5486f930b8321 100644 --- a/coderd/database/dbmem/dbmem.go +++ b/coderd/database/dbmem/dbmem.go @@ -8,7 +8,7 @@ import ( "errors" "fmt" "math" - "math/rand/v2" + "math/rand/v2" //nolint:staticcheck // used only in tests to randomize list order "reflect" "regexp" "slices" diff --git a/coderd/database/querier.go b/coderd/database/querier.go index b956aea67c130..e42f2b259cd08 100644 --- a/coderd/database/querier.go +++ b/coderd/database/querier.go @@ -272,6 +272,7 @@ type sqlcQuerier interface { GetProvisionerJobsByIDsWithQueuePosition(ctx context.Context, ids []uuid.UUID) ([]GetProvisionerJobsByIDsWithQueuePositionRow, error) GetProvisionerJobsByOrganizationAndStatusWithQueuePositionAndProvisioner(ctx context.Context, arg GetProvisionerJobsByOrganizationAndStatusWithQueuePositionAndProvisionerParams) ([]GetProvisionerJobsByOrganizationAndStatusWithQueuePositionAndProvisionerRow, error) GetProvisionerJobsCreatedAfter(ctx context.Context, createdAt time.Time) ([]ProvisionerJob, error) + // To avoid repeatedly attempting to reap the same jobs, we randomly order and limit to @max_jobs. GetProvisionerJobsToBeReaped(ctx context.Context, arg GetProvisionerJobsToBeReapedParams) ([]ProvisionerJob, error) GetProvisionerKeyByHashedSecret(ctx context.Context, hashedSecret []byte) (ProvisionerKey, error) GetProvisionerKeyByID(ctx context.Context, id uuid.UUID) (ProvisionerKey, error) diff --git a/coderd/database/queries.sql.go b/coderd/database/queries.sql.go index e119c0f002ea2..00922cc3f7ff0 100644 --- a/coderd/database/queries.sql.go +++ b/coderd/database/queries.sql.go @@ -7912,6 +7912,7 @@ type GetProvisionerJobsToBeReapedParams struct { MaxJobs int32 `db:"max_jobs" json:"max_jobs"` } +// To avoid repeatedly attempting to reap the same jobs, we randomly order and limit to @max_jobs. func (q *sqlQuerier) GetProvisionerJobsToBeReaped(ctx context.Context, arg GetProvisionerJobsToBeReapedParams) ([]ProvisionerJob, error) { rows, err := q.db.QueryContext(ctx, getProvisionerJobsToBeReaped, arg.PendingSince, arg.HungSince, arg.MaxJobs) if err != nil { From 91d2d3201c86531c121c7981cfc0d76c19e26afd Mon Sep 17 00:00:00 2001 From: Michael Suchacz <203725896+ibetitsmike@users.noreply.github.com> Date: Mon, 19 May 2025 15:22:38 +0000 Subject: [PATCH 23/24] readded rand to fix gen failing in CI --- coderd/database/dbmem/dbmem.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/coderd/database/dbmem/dbmem.go b/coderd/database/dbmem/dbmem.go index 5486f930b8321..d3b518b2a6460 100644 --- a/coderd/database/dbmem/dbmem.go +++ b/coderd/database/dbmem/dbmem.go @@ -8,7 +8,7 @@ import ( "errors" "fmt" "math" - "math/rand/v2" //nolint:staticcheck // used only in tests to randomize list order + "math/rand" //#nosec // this is only used for shuffling an array to pick random jobs to reap "reflect" "regexp" "slices" From 767cb77cf4bdac0827cf9b90a9e781d3bb6f32b4 Mon Sep 17 00:00:00 2001 From: Michael Suchacz <203725896+ibetitsmike@users.noreply.github.com> Date: Tue, 20 May 2025 13:04:35 +0000 Subject: [PATCH 24/24] adjusted TODOs --- coderd/database/dbauthz/dbauthz.go | 26 ++++++++++++------------- coderd/database/dbauthz/dbauthz_test.go | 4 ++-- coderd/database/dbmem/dbmem.go | 4 ++-- 3 files changed, 17 insertions(+), 17 deletions(-) diff --git a/coderd/database/dbauthz/dbauthz.go b/coderd/database/dbauthz/dbauthz.go index 35727f14b8665..16d707c466104 100644 --- a/coderd/database/dbauthz/dbauthz.go +++ b/coderd/database/dbauthz/dbauthz.go @@ -2320,8 +2320,12 @@ func (q *querier) GetProvisionerJobsByIDs(ctx context.Context, ids []uuid.UUID) if err != nil { return nil, err } + orgIDs := make(map[uuid.UUID]struct{}) for _, job := range provisionerJobs { - if err := q.authorizeContext(ctx, policy.ActionRead, rbac.ResourceProvisionerJobs.InOrg(job.OrganizationID)); err != nil { + orgIDs[job.OrganizationID] = struct{}{} + } + for orgID := range orgIDs { + if err := q.authorizeContext(ctx, policy.ActionRead, rbac.ResourceProvisionerJobs.InOrg(orgID)); err != nil { return nil, err } } @@ -2329,16 +2333,14 @@ func (q *querier) GetProvisionerJobsByIDs(ctx context.Context, ids []uuid.UUID) } func (q *querier) GetProvisionerJobsByIDsWithQueuePosition(ctx context.Context, ids []uuid.UUID) ([]database.GetProvisionerJobsByIDsWithQueuePositionRow, error) { - // if err := q.authorizeContext(ctx, policy.ActionRead, rbac.ResourceProvisionerJobs); err != nil { - // return nil, err - // } + // TODO: Remove this once we have a proper rbac check for provisioner jobs. + // Details in https://github.com/coder/coder/issues/16160 return q.db.GetProvisionerJobsByIDsWithQueuePosition(ctx, ids) } func (q *querier) GetProvisionerJobsByOrganizationAndStatusWithQueuePositionAndProvisioner(ctx context.Context, arg database.GetProvisionerJobsByOrganizationAndStatusWithQueuePositionAndProvisionerParams) ([]database.GetProvisionerJobsByOrganizationAndStatusWithQueuePositionAndProvisionerRow, error) { - // if err := q.authorizeContext(ctx, policy.ActionRead, rbac.ResourceProvisionerJobs); err != nil { - // return nil, err - // } + // TODO: Remove this once we have a proper rbac check for provisioner jobs. + // Details in https://github.com/coder/coder/issues/16160 return fetchWithPostFilter(q.auth, policy.ActionRead, q.db.GetProvisionerJobsByOrganizationAndStatusWithQueuePositionAndProvisioner)(ctx, arg) } @@ -3547,9 +3549,8 @@ func (q *querier) InsertProvisionerJob(ctx context.Context, arg database.InsertP } func (q *querier) InsertProvisionerJobLogs(ctx context.Context, arg database.InsertProvisionerJobLogsParams) ([]database.ProvisionerJobLog, error) { - // if err := q.authorizeContext(ctx, policy.ActionUpdate, rbac.ResourceProvisionerJobs); err != nil { - // return nil, err - // } + // TODO: Remove this once we have a proper rbac check for provisioner jobs. + // Details in https://github.com/coder/coder/issues/16160 return q.db.InsertProvisionerJobLogs(ctx, arg) } @@ -4187,9 +4188,8 @@ func (q *querier) UpdateProvisionerJobByID(ctx context.Context, arg database.Upd } func (q *querier) UpdateProvisionerJobWithCancelByID(ctx context.Context, arg database.UpdateProvisionerJobWithCancelByIDParams) error { - // if err := q.authorizeContext(ctx, policy.ActionUpdate, rbac.ResourceProvisionerJobs); err != nil { - // return err - // } + // TODO: Remove this once we have a proper rbac check for provisioner jobs. + // Details in https://github.com/coder/coder/issues/16160 job, err := q.db.GetProvisionerJobByID(ctx, arg.ID) if err != nil { diff --git a/coderd/database/dbauthz/dbauthz_test.go b/coderd/database/dbauthz/dbauthz_test.go index 5eb5115c51ef2..f851d0d6d1c08 100644 --- a/coderd/database/dbauthz/dbauthz_test.go +++ b/coderd/database/dbauthz/dbauthz_test.go @@ -698,7 +698,7 @@ func (s *MethodTestSuite) TestProvisionerJob() { a := dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{OrganizationID: o.ID}) b := dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{OrganizationID: o.ID}) check.Args([]uuid.UUID{a.ID, b.ID}). - Asserts(rbac.ResourceProvisionerJobs.InOrg(o.ID), policy.ActionRead, rbac.ResourceProvisionerJobs.InOrg(o.ID), policy.ActionRead). + Asserts(rbac.ResourceProvisionerJobs.InOrg(o.ID), policy.ActionRead). Returns(slice.New(a, b)) })) s.Run("GetProvisionerLogsAfterID", s.Subtest(func(db database.Store, check *expects) { @@ -3982,7 +3982,7 @@ func (s *MethodTestSuite) TestSystemFunctions() { a := dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{OrganizationID: o.ID}) b := dbgen.ProvisionerJob(s.T(), db, nil, database.ProvisionerJob{OrganizationID: o.ID}) check.Args([]uuid.UUID{a.ID, b.ID}). - Asserts(rbac.ResourceProvisionerJobs.InOrg(o.ID), policy.ActionRead, rbac.ResourceProvisionerJobs.InOrg(o.ID), policy.ActionRead). + Asserts(rbac.ResourceProvisionerJobs.InOrg(o.ID), policy.ActionRead). Returns(slice.New(a, b)) })) s.Run("InsertWorkspaceAgent", s.Subtest(func(db database.Store, check *expects) { diff --git a/coderd/database/dbmem/dbmem.go b/coderd/database/dbmem/dbmem.go index d3b518b2a6460..794d46c0083d0 100644 --- a/coderd/database/dbmem/dbmem.go +++ b/coderd/database/dbmem/dbmem.go @@ -8,7 +8,7 @@ import ( "errors" "fmt" "math" - "math/rand" //#nosec // this is only used for shuffling an array to pick random jobs to reap + insecurerand "math/rand" //#nosec // this is only used for shuffling an array to pick random jobs to reap "reflect" "regexp" "slices" @@ -4889,7 +4889,7 @@ func (q *FakeQuerier) GetProvisionerJobsToBeReaped(_ context.Context, arg databa } } } - rand.Shuffle(len(hungJobs), func(i, j int) { + insecurerand.Shuffle(len(hungJobs), func(i, j int) { hungJobs[i], hungJobs[j] = hungJobs[j], hungJobs[i] }) return hungJobs, nil