Skip to content

Commit ca49519

Browse files
committed
WIP
1 parent 4b252eb commit ca49519

File tree

1 file changed

+39
-15
lines changed

1 file changed

+39
-15
lines changed

coderd/unhanger/detector.go

Lines changed: 39 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,10 @@ const (
2525
// before it is considered hung.
2626
HungJobDuration = 5 * time.Minute
2727

28+
// NotStartedTimeElapsed is the duration of time since the last update to a job
29+
// before it is considered hung.
30+
NotStartedTimeElapsed = 30 * time.Minute
31+
2832
// HungJobExitTimeout is the duration of time that provisioners should allow
2933
// for a graceful exit upon cancellation due to failing to send an update to
3034
// a job.
@@ -38,6 +42,13 @@ const (
3842
MaxJobsPerRun = 10
3943
)
4044

45+
type jobType string
46+
47+
const (
48+
hungJobType jobType = "hung"
49+
notStartedJobType jobType = "not started"
50+
)
51+
4152
// HungJobLogMessages are written to provisioner job logs when a job is hung and
4253
// terminated.
4354
var HungJobLogMessages = []string{
@@ -176,17 +187,17 @@ func (d *Detector) run(t time.Time) Stats {
176187
// received an update in the last 5 minutes.
177188
jobs, err := d.db.GetHungProvisionerJobs(ctx, t.Add(-HungJobDuration))
178189
if err != nil {
179-
stats.Error = xerrors.Errorf("get hung provisioner jobs: %w", err)
190+
stats.Error = xerrors.Errorf("get %s provisioner jobs: %w", hungJobType, err)
180191
return stats
181192
}
182-
// Find all provisioner jobs that are currently running but have not
183-
// received an update in the last 5 minutes.
193+
// Find all provisioner jobs that have not been started yet and have not
194+
// received an update in the last 30 minutes.
195+
jobsNotStarted, err := d.db.GetNotStartedProvisionerJobs(ctx, t.Add(-NotStartedTimeElapsed))
184196
if err != nil {
185-
stats.Error = xerrors.Errorf("get not started provisioner jobs: %w", err)
197+
stats.Error = xerrors.Errorf("get %s provisioner jobs: %w", notStartedJobType, err)
186198
return stats
187199
}
188-
jobsUnstarted, err := d.db.GetNotStartedProvisionerJobs(ctx, t.Add(-HungJobDuration))
189-
jobs = append(jobs, jobsUnstarted...)
200+
jobs = append(jobs, jobsNotStarted...)
190201

191202
// Limit the number of jobs we'll unhang in a single run to avoid
192203
// timing out.
@@ -198,16 +209,20 @@ func (d *Detector) run(t time.Time) Stats {
198209
jobs = jobs[:MaxJobsPerRun]
199210
}
200211

201-
// Send a message into the build log for each hung job saying that it
212+
// Send a message into the build log for each hung or not startedjob saying that it
202213
// has been detected and will be terminated, then mark the job as
203214
// failed.
204215
for _, job := range jobs {
205216
log := d.log.With(slog.F("job_id", job.ID))
206217

207218
err := unhangJob(ctx, log, d.db, d.pubsub, job.ID)
208219
if err != nil {
220+
jobType := notStartedJobType
221+
if job.StartedAt.Valid {
222+
jobType = hungJobType
223+
}
209224
if !(xerrors.As(err, &acquireLockError{}) || xerrors.As(err, &jobIneligibleError{})) {
210-
log.Error(ctx, "error forcefully terminating hung provisioner job", slog.Error(err))
225+
log.Error(ctx, fmt.Sprintf("error forcefully terminating %s provisioner job", jobType), slog.Error(err))
211226
}
212227
continue
213228
}
@@ -222,7 +237,7 @@ func unhangJob(ctx context.Context, log slog.Logger, db database.Store, pub pubs
222237
var lowestLogID int64
223238

224239
err := db.InTx(func(db database.Store) error {
225-
locked, err := db.TryAcquireLock(ctx, database.GenLockID(fmt.Sprintf("hang-detector:%s", jobID)))
240+
locked, err := db.TryAcquireLock(ctx, database.GenLockID(fmt.Sprintf("unhanger:%s", jobID)))
226241
if err != nil {
227242
return xerrors.Errorf("acquire lock: %w", err)
228243
}
@@ -237,6 +252,14 @@ func unhangJob(ctx context.Context, log slog.Logger, db database.Store, pub pubs
237252
return xerrors.Errorf("get provisioner job: %w", err)
238253
}
239254

255+
jobType := hungJobType
256+
threshold := HungJobDuration.Minutes()
257+
258+
if !job.StartedAt.Valid {
259+
jobType = notStartedJobType
260+
threshold = NotStartedTimeElapsed.Minutes()
261+
}
262+
240263
if job.CompletedAt.Valid {
241264
return jobIneligibleError{
242265
Err: xerrors.Errorf("job is completed (status %s)", job.JobStatus),
@@ -249,8 +272,8 @@ func unhangJob(ctx context.Context, log slog.Logger, db database.Store, pub pubs
249272
}
250273

251274
log.Warn(
252-
ctx, "detected hung provisioner job, forcefully terminating",
253-
"threshold", HungJobDuration,
275+
ctx, fmt.Sprintf("detected %s provisioner job, forcefully terminating", jobType),
276+
"threshold", threshold,
254277
)
255278

256279
// First, get the latest logs from the build so we can make sure
@@ -260,7 +283,7 @@ func unhangJob(ctx context.Context, log slog.Logger, db database.Store, pub pubs
260283
CreatedAfter: 0,
261284
})
262285
if err != nil {
263-
return xerrors.Errorf("get logs for hung job: %w", err)
286+
return xerrors.Errorf("get logs for %s job: %w", jobType, err)
264287
}
265288
logStage := ""
266289
if len(logs) != 0 {
@@ -291,12 +314,13 @@ func unhangJob(ctx context.Context, log slog.Logger, db database.Store, pub pubs
291314
}
292315
newLogs, err := db.InsertProvisionerJobLogs(ctx, insertParams)
293316
if err != nil {
294-
return xerrors.Errorf("insert logs for hung job: %w", err)
317+
return xerrors.Errorf("insert logs for %s job: %w", jobType, err)
295318
}
296319
lowestLogID = newLogs[0].ID
297320

298321
now = dbtime.Now()
299-
// If we are unhanging a job that was never picked up by the
322+
323+
// If we are failing a job that was never picked up by the
300324
// provisioner, we need to set the started_at time to the current
301325
// time so that the build duration is correct.
302326
if !job.StartedAt.Valid {
@@ -315,7 +339,7 @@ func unhangJob(ctx context.Context, log slog.Logger, db database.Store, pub pubs
315339
Valid: true,
316340
},
317341
Error: sql.NullString{
318-
String: "Coder: Build has been detected as hung for 5 minutes and has been terminated by hang detector.",
342+
String: fmt.Sprintf("Coder: Build has been detected as %s for %.0f minutes and has been terminated by hang detector.", jobType, threshold),
319343
Valid: true,
320344
},
321345
ErrorCode: sql.NullString{

0 commit comments

Comments
 (0)