Skip to content

feat: cancel stuck pending jobs #17803

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 24 commits into from
May 20, 2025
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
0f51f35
added queries for fetching NotStartedProvisionerJobs
ibetitsmike Apr 1, 2025
2f3d606
added detector handling of not started jobs
ibetitsmike May 9, 2025
4b252eb
filling out started_at when unhanging not started jobs
ibetitsmike May 9, 2025
ca49519
WIP
ibetitsmike May 13, 2025
af994c2
refactored to reaper & added tests
ibetitsmike May 13, 2025
3815727
Revert "filling out started_at when unhanging not started jobs"
ibetitsmike May 13, 2025
b65f620
created new ORM update to avoid forcing setting StartedAt on every Co…
ibetitsmike May 13, 2025
3c7c323
added missing dbauthz tests
ibetitsmike May 13, 2025
35df01f
added checks for StartedAt value in the updated jobs
ibetitsmike May 13, 2025
8aa1ee2
refactor from reaper to jobreaper
ibetitsmike May 14, 2025
4385933
WIP
ibetitsmike May 14, 2025
96fee51
WIP
ibetitsmike May 14, 2025
d8db119
WIP
ibetitsmike May 15, 2025
5120fb1
WIP
ibetitsmike May 15, 2025
8d4fa5a
fixed sql comments
ibetitsmike May 15, 2025
18b809c
taking a step back with RBAC
ibetitsmike May 16, 2025
0fe1404
WIP
ibetitsmike May 16, 2025
77be34e
WIP
ibetitsmike May 16, 2025
4351529
WIP
ibetitsmike May 16, 2025
c03bfa3
fixed InOrg check for provisionerjob resource
ibetitsmike May 19, 2025
a15bd1c
PR review; naming in the comments, added comments for SQL, less verbo…
ibetitsmike May 19, 2025
5b9348f
fixes to tests after lint remove rand
ibetitsmike May 19, 2025
91d2d32
readded rand to fix gen failing in CI
ibetitsmike May 19, 2025
767cb77
adjusted TODOs
ibetitsmike May 20, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
PR review; naming in the comments, added comments for SQL, less verbo…
…se logging
  • Loading branch information
ibetitsmike committed May 19, 2025
commit a15bd1cff30f2916ffbfd07c5b34623e2e1a52fb
7 changes: 1 addition & 6 deletions coderd/database/dbauthz/dbauthz.go
Original file line number Diff line number Diff line change
Expand Up @@ -3542,12 +3542,7 @@ func (q *querier) InsertPresetParameters(ctx context.Context, arg database.Inser

// InsertProvisionerJob inserts a provisioner job by passing arg straight
// through to q.db.InsertProvisionerJob and returning its result unchanged.
// No authorization check is performed in this method: the authorizeContext
// call below is commented out (see the TODO and the linked issue).
func (q *querier) InsertProvisionerJob(ctx context.Context, arg database.InsertProvisionerJobParams) (database.ProvisionerJob, error) {
// TODO: Remove this once we have a proper rbac check for provisioner jobs.
// Currently ProvisionerJobs are not associated with a user, so we can't
// check for a user's permissions. We'd need to check for the associated workspace
// and verify ownership through that.
// if err := q.authorizeContext(ctx, policy.ActionCreate, rbac.ResourceProvisionerJobs); err != nil {
// return database.ProvisionerJob{}, err
// }
// Details in https://github.com/coder/coder/issues/16160
return q.db.InsertProvisionerJob(ctx, arg)
}

Expand Down
8 changes: 6 additions & 2 deletions coderd/database/dbmem/dbmem.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
"errors"
"fmt"
"math"
"math/rand/v2"
"reflect"
"regexp"
"slices"
Expand Down Expand Up @@ -4883,11 +4884,14 @@
provisionerJob.Tags = maps.Clone(provisionerJob.Tags)
hungJobs = append(hungJobs, provisionerJob)
if len(hungJobs) >= int(maxJobs) {
return hungJobs, nil
break
}
}
}
}

Check failure on line 4891 in coderd/database/dbmem/dbmem.go

View workflow job for this annotation

GitHub Actions / gen

undefined: rand
rand.Shuffle(len(hungJobs), func(i, j int) {
hungJobs[i], hungJobs[j] = hungJobs[j], hungJobs[i]
})
return hungJobs, nil
}

Expand Down Expand Up @@ -10955,8 +10959,8 @@
job.CompletedAt = arg.CompletedAt
job.Error = arg.Error
job.ErrorCode = arg.ErrorCode
job.JobStatus = provisionerJobStatus(job)
job.StartedAt = arg.StartedAt
job.JobStatus = provisionerJobStatus(job)
q.provisionerJobs[index] = job
return nil
}
Expand Down
1 change: 1 addition & 0 deletions coderd/database/queries/provisionerjobs.sql
Original file line number Diff line number Diff line change
Expand Up @@ -299,6 +299,7 @@ WHERE
AND started_at IS NOT NULL
AND completed_at IS NULL
)
-- To avoid repeatedly attempting to reap the same jobs, we randomly order and limit to @max_jobs.
ORDER BY random()
LIMIT @max_jobs;

Expand Down
23 changes: 12 additions & 11 deletions coderd/jobreaper/detector.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,12 @@ import (
)

const (
// HungJobDuration is the duration of time since the last update to a job
// before it is considered hung.
// HungJobDuration is the duration of time since the last update
// to a RUNNING job before it is considered hung.
HungJobDuration = 5 * time.Minute

// PendingJobDuration is the duration of time since the last update to a job
// before it is considered hung.
// PendingJobDuration is the duration of time since the last update
// to a PENDING job before it is considered dead.
PendingJobDuration = 30 * time.Minute

// HungJobExitTimeout is the duration of time that provisioners should allow
Expand All @@ -42,7 +42,7 @@ const (
)

// JobLogMessages returns the messages written to provisioner job logs when a job is reaped.
func jobLogMessages(reapType ReapType, threshold time.Duration) []string {
func JobLogMessages(reapType ReapType, threshold time.Duration) []string {
return []string{
"",
"====================",
Expand Down Expand Up @@ -110,9 +110,9 @@ type Stats struct {
Error error
}

// New returns a new hang detector.
// New returns a new job reaper.
func New(ctx context.Context, db database.Store, pub pubsub.Pubsub, log slog.Logger, tick <-chan time.Time) *Detector {
//nolint:gocritic // Hang detector has a limited set of permissions.
//nolint:gocritic // Job reaper has a limited set of permissions.
ctx, cancel := context.WithCancel(dbauthz.AsJobReaper(ctx))
d := &Detector{
ctx: ctx,
Expand Down Expand Up @@ -224,7 +224,7 @@ func (d *Detector) run(t time.Time) Stats {
err := reapJob(ctx, log, d.db, d.pubsub, job)
if err != nil {
if !(xerrors.As(err, &acquireLockError{}) || xerrors.As(err, &jobIneligibleError{})) {
log.Error(ctx, fmt.Sprintf("error forcefully terminating %s provisioner job", job.Type), slog.Error(err))
log.Error(ctx, "error forcefully terminating provisioner job", slog.F("type", job.Type), slog.Error(err))
}
continue
}
Expand Down Expand Up @@ -260,7 +260,8 @@ func reapJob(ctx context.Context, log slog.Logger, db database.Store, pub pubsub
}

log.Warn(
ctx, fmt.Sprintf("detected %s provisioner job, forcefully terminating", jobToReap.Type),
ctx, "forcefully terminating provisioner job",
"type", jobToReap.Type,
"threshold", jobToReap.Threshold,
)

Expand Down Expand Up @@ -291,7 +292,7 @@ func reapJob(ctx context.Context, log slog.Logger, db database.Store, pub pubsub
Output: nil,
}
now := dbtime.Now()
for i, msg := range jobLogMessages(jobToReap.Type, jobToReap.Threshold) {
for i, msg := range JobLogMessages(jobToReap.Type, jobToReap.Threshold) {
// Set the created at in a way that ensures each message has
// a unique timestamp so they will be sorted correctly.
insertParams.CreatedAt = append(insertParams.CreatedAt, now.Add(time.Millisecond*time.Duration(i)))
Expand Down Expand Up @@ -325,7 +326,7 @@ func reapJob(ctx context.Context, log slog.Logger, db database.Store, pub pubsub
Valid: true,
},
Error: sql.NullString{
String: fmt.Sprintf("Coder: Build has been detected as %s for %.0f minutes and has been terminated by hang detector.", jobToReap.Type, jobToReap.Threshold.Minutes()),
String: fmt.Sprintf("Coder: Build has been detected as %s for %.0f minutes and has been terminated by the reaper.", jobToReap.Type, jobToReap.Threshold.Minutes()),
Valid: true,
},
ErrorCode: sql.NullString{
Expand Down
39 changes: 7 additions & 32 deletions coderd/jobreaper/detector_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,36 +27,6 @@ import (
"github.com/coder/coder/v2/testutil"
)

// jobType represents the type of job being reaped
type jobType string

const (
// hungJobType is the label reapParamsFromJob returns for a job whose
// StartedAt is valid.
hungJobType jobType = "hung"
// pendingJobType is the label reapParamsFromJob returns for a job whose
// StartedAt is not valid, i.e. one that never started.
pendingJobType jobType = "pending"
)

// jobLogMessages builds the five-line banner expected in a provisioner job's
// logs after the job has been reaped: a blank line, a separator, the notice
// naming the job type and the threshold in whole minutes, a second separator,
// and a trailing blank line.
func jobLogMessages(jobType jobType, threshold float64) []string {
	const separator = "===================="

	notice := fmt.Sprintf("Coder: Build has been detected as %s for %.0f minutes and will be terminated.", jobType, threshold)

	return []string{"", separator, notice, separator, ""}
}

// reapParamsFromJob classifies a job for the reaper tests. A job with a valid
// StartedAt is reported as hung with jobreaper.HungJobDuration as its
// threshold; any other job is reported as pending with
// jobreaper.PendingJobDuration. The threshold is returned in minutes.
func reapParamsFromJob(job database.ProvisionerJob) (jobType, float64) {
	if job.StartedAt.Valid {
		return hungJobType, jobreaper.HungJobDuration.Minutes()
	}
	return pendingJobType, jobreaper.PendingJobDuration.Minutes()
}

// TestMain is the package's test entry point. It hands the test run to
// goleak.VerifyTestMain, configured with testutil.GoleakOptions.
func TestMain(m *testing.M) {
goleak.VerifyTestMain(m, testutil.GoleakOptions...)
}
Expand Down Expand Up @@ -972,8 +942,13 @@ func TestDetectorPushesLogs(t *testing.T) {
CreatedAfter: after,
})
require.NoError(t, err)
jobType, threshold := reapParamsFromJob(templateImportJob)
expectedLogs := jobLogMessages(jobType, threshold)
threshold := jobreaper.HungJobDuration
jobType := jobreaper.Hung
if templateImportJob.JobStatus == database.ProvisionerJobStatusPending {
threshold = jobreaper.PendingJobDuration
jobType = jobreaper.Pending
}
expectedLogs := jobreaper.JobLogMessages(jobType, threshold)
require.Len(t, logs, len(expectedLogs))
for i, log := range logs {
assert.Equal(t, database.LogLevelError, log.Level)
Expand Down
4 changes: 2 additions & 2 deletions provisioner/terraform/serve.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,9 +39,9 @@ type ServeOptions struct {
//
// This is a no-op on Windows where the process can't be interrupted.
//
// Default value: 3 minutes (reaper.HungJobExitTimeout). This value should
// Default value: 3 minutes (jobreaper.HungJobExitTimeout). This value should
// be kept less than the value that Coder uses to mark hung jobs as failed,
// which is 5 minutes (see reaper package).
// which is 5 minutes (see jobreaper package).
ExitTimeout time.Duration
}

Expand Down
Loading