Skip to content

Commit 7a62534

Browse files
authored
fix: allow unhanger to unhang canceling jobs (#8529)
1 parent 7ed17b2 commit 7a62534

File tree

2 files changed

+79
-6
lines changed

2 files changed

+79
-6
lines changed

coderd/unhanger/detector.go

+13-6
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,6 @@ import (
1717
"github.com/coder/coder/coderd/database/db2sdk"
1818
"github.com/coder/coder/coderd/database/dbauthz"
1919
"github.com/coder/coder/coderd/database/pubsub"
20-
"github.com/coder/coder/codersdk"
2120
"github.com/coder/coder/provisionersdk"
2221
)
2322

@@ -201,8 +200,10 @@ func (d *Detector) run(t time.Time) Stats {
201200
log := d.log.With(slog.F("job_id", job.ID))
202201

203202
err := unhangJob(ctx, log, d.db, d.pubsub, job.ID)
204-
if err != nil && !(xerrors.As(err, &acquireLockError{}) || xerrors.As(err, &jobInelligibleError{})) {
205-
log.Error(ctx, "error forcefully terminating hung provisioner job", slog.Error(err))
203+
if err != nil {
204+
if !(xerrors.As(err, &acquireLockError{}) || xerrors.As(err, &jobInelligibleError{})) {
205+
log.Error(ctx, "error forcefully terminating hung provisioner job", slog.Error(err))
206+
}
206207
continue
207208
}
208209

@@ -232,10 +233,16 @@ func unhangJob(ctx context.Context, log slog.Logger, db database.Store, pub pubs
232233
}
233234

234235
// Check if we should still unhang it.
235-
jobStatus := db2sdk.ProvisionerJobStatus(job)
236-
if jobStatus != codersdk.ProvisionerJobRunning {
236+
if !job.StartedAt.Valid {
237+
// This shouldn't be possible to hit because the query only selects
238+
// started and not completed jobs, and a job can't be "un-started".
239+
return jobInelligibleError{
240+
Err: xerrors.New("job is not started"),
241+
}
242+
}
243+
if job.CompletedAt.Valid {
237244
return jobInelligibleError{
238-
Err: xerrors.Errorf("job is not running (status %s)", jobStatus),
245+
Err: xerrors.Errorf("job is completed (status %s)", db2sdk.ProvisionerJobStatus(job)),
239246
}
240247
}
241248
if job.UpdatedAt.After(time.Now().Add(-HungJobDuration)) {

coderd/unhanger/detector_test.go

+66
Original file line numberDiff line numberDiff line change
@@ -525,6 +525,72 @@ func TestDetectorHungOtherJobTypes(t *testing.T) {
525525
detector.Wait()
526526
}
527527

528+
func TestDetectorHungCanceledJob(t *testing.T) {
529+
t.Parallel()
530+
531+
var (
532+
ctx = testutil.Context(t, testutil.WaitLong)
533+
db, pubsub = dbtestutil.NewDB(t)
534+
log = slogtest.Make(t, nil)
535+
tickCh = make(chan time.Time)
536+
statsCh = make(chan unhanger.Stats)
537+
)
538+
539+
var (
540+
now = time.Now()
541+
tenMinAgo = now.Add(-time.Minute * 10)
542+
sixMinAgo = now.Add(-time.Minute * 6)
543+
org = dbgen.Organization(t, db, database.Organization{})
544+
user = dbgen.User(t, db, database.User{})
545+
file = dbgen.File(t, db, database.File{})
546+
547+
// Template import job.
548+
templateImportJob = dbgen.ProvisionerJob(t, db, database.ProvisionerJob{
549+
CreatedAt: tenMinAgo,
550+
CanceledAt: sql.NullTime{
551+
Time: tenMinAgo,
552+
Valid: true,
553+
},
554+
UpdatedAt: sixMinAgo,
555+
StartedAt: sql.NullTime{
556+
Time: tenMinAgo,
557+
Valid: true,
558+
},
559+
OrganizationID: org.ID,
560+
InitiatorID: user.ID,
561+
Provisioner: database.ProvisionerTypeEcho,
562+
StorageMethod: database.ProvisionerStorageMethodFile,
563+
FileID: file.ID,
564+
Type: database.ProvisionerJobTypeTemplateVersionImport,
565+
Input: []byte("{}"),
566+
})
567+
)
568+
569+
t.Log("template import job ID: ", templateImportJob.ID)
570+
571+
detector := unhanger.New(ctx, db, pubsub, log, tickCh).WithStatsChannel(statsCh)
572+
detector.Start()
573+
tickCh <- now
574+
575+
stats := <-statsCh
576+
require.NoError(t, stats.Error)
577+
require.Len(t, stats.TerminatedJobIDs, 1)
578+
require.Contains(t, stats.TerminatedJobIDs, templateImportJob.ID)
579+
580+
// Check that the job was updated.
581+
job, err := db.GetProvisionerJobByID(ctx, templateImportJob.ID)
582+
require.NoError(t, err)
583+
require.WithinDuration(t, now, job.UpdatedAt, 30*time.Second)
584+
require.True(t, job.CompletedAt.Valid)
585+
require.WithinDuration(t, now, job.CompletedAt.Time, 30*time.Second)
586+
require.True(t, job.Error.Valid)
587+
require.Contains(t, job.Error.String, "Build has been detected as hung")
588+
require.False(t, job.ErrorCode.Valid)
589+
590+
detector.Close()
591+
detector.Wait()
592+
}
593+
528594
func TestDetectorPushesLogs(t *testing.T) {
529595
t.Parallel()
530596

0 commit comments

Comments
 (0)