Skip to content

feat: add provisioner job hang detector #7927

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 13 commits into main from dean/hang-detector
Jun 25, 2023
Merged
Previous commit
Next commit
fixup! Merge branch 'main' into dean/hang-detector
  • Loading branch information
deansheather committed Jun 21, 2023
commit f25938a15f7e75dda859c8bb28bcbb1924cbf125
25 changes: 16 additions & 9 deletions coderd/unhanger/detector.go
Original file line number Diff line number Diff line change
Expand Up @@ -58,15 +58,15 @@ func (acquireLockError) Error() string {
return "lock is held by another client"
}

// jobNotRunningError is returned when the detector attempts to terminate a job
// that is not running.
type jobNotRunningError struct {
Status codersdk.ProvisionerJobStatus
// jobInelligibleError is returned when a job is not eligible to be terminated
// anymore.
type jobInelligibleError struct {
Err error
}

// Error implements error.
func (e jobNotRunningError) Error() string {
return fmt.Sprintf("job is not running (status: %s)", e.Status)
func (e jobInelligibleError) Error() string {
return fmt.Sprintf("job is no longer eligible to be terminated: %s", e.Err)
}

// Detector automatically detects hung provisioner jobs, sends messages into the
Expand Down Expand Up @@ -201,7 +201,7 @@ func (d *Detector) run(t time.Time) Stats {
log := d.log.With(slog.F("job_id", job.ID))

err := unhangJob(ctx, log, d.db, d.pubsub, job.ID)
if err != nil && !(xerrors.As(err, &acquireLockError{}) || xerrors.As(err, &jobNotRunningError{})) {
if err != nil && !(xerrors.As(err, &acquireLockError{}) || xerrors.As(err, &jobInelligibleError{})) {
log.Error(ctx, "error forcefully terminating hung provisioner job", slog.Error(err))
continue
}
Expand Down Expand Up @@ -230,10 +230,17 @@ func unhangJob(ctx context.Context, log slog.Logger, db database.Store, pub pubs
if err != nil {
return xerrors.Errorf("get provisioner job: %w", err)
}

// Check if we should still unhang it.
jobStatus := db2sdk.ProvisionerJobStatus(job)
if jobStatus != codersdk.ProvisionerJobRunning {
return jobNotRunningError{
Status: jobStatus,
return jobInelligibleError{
Err: xerrors.Errorf("job is not running (status %s)", jobStatus),
}
}
if job.UpdatedAt.After(time.Now().Add(-HungJobDuration)) {
return jobInelligibleError{
Err: xerrors.New("job has been updated recently"),
}
}

Expand Down