@@ -25,6 +25,10 @@ const (
25
25
// before it is considered hung.
26
26
HungJobDuration = 5 * time .Minute
27
27
28
+ // NotStartedTimeElapsed is the duration of time since the last update to a job
29
+ // that has not been started yet before it is considered stuck.
30
+ NotStartedTimeElapsed = 30 * time .Minute
31
+
28
32
// HungJobExitTimeout is the duration of time that provisioners should allow
29
33
// for a graceful exit upon cancellation due to failing to send an update to
30
34
// a job.
@@ -38,6 +42,13 @@ const (
38
42
MaxJobsPerRun = 10
39
43
)
40
44
45
+ type jobType string
46
+
47
+ const (
48
+ hungJobType jobType = "hung"
49
+ notStartedJobType jobType = "not started"
50
+ )
51
+
41
52
// HungJobLogMessages are written to provisioner job logs when a job is hung and
42
53
// terminated.
43
54
var HungJobLogMessages = []string {
@@ -176,17 +187,17 @@ func (d *Detector) run(t time.Time) Stats {
176
187
// received an update in the last 5 minutes.
177
188
jobs , err := d .db .GetHungProvisionerJobs (ctx , t .Add (- HungJobDuration ))
178
189
if err != nil {
179
- stats .Error = xerrors .Errorf ("get hung provisioner jobs: %w" , err )
190
+ stats .Error = xerrors .Errorf ("get %s provisioner jobs: %w" , hungJobType , err )
180
191
return stats
181
192
}
182
- // Find all provisioner jobs that are currently running but have not
183
- // received an update in the last 5 minutes.
193
+ // Find all provisioner jobs that have not been started yet and have not
194
+ // received an update in the last 30 minutes.
195
+ jobsNotStarted , err := d .db .GetNotStartedProvisionerJobs (ctx , t .Add (- NotStartedTimeElapsed ))
184
196
if err != nil {
185
- stats .Error = xerrors .Errorf ("get not started provisioner jobs: %w" , err )
197
+ stats .Error = xerrors .Errorf ("get %s provisioner jobs: %w" , notStartedJobType , err )
186
198
return stats
187
199
}
188
- jobsUnstarted , err := d .db .GetNotStartedProvisionerJobs (ctx , t .Add (- HungJobDuration ))
189
- jobs = append (jobs , jobsUnstarted ... )
200
+ jobs = append (jobs , jobsNotStarted ... )
190
201
191
202
// Limit the number of jobs we'll unhang in a single run to avoid
192
203
// timing out.
@@ -198,16 +209,20 @@ func (d *Detector) run(t time.Time) Stats {
198
209
jobs = jobs [:MaxJobsPerRun ]
199
210
}
200
211
201
- // Send a message into the build log for each hung job saying that it
212
+ // Send a message into the build log for each hung or not started job saying that it
202
213
// has been detected and will be terminated, then mark the job as
203
214
// failed.
204
215
for _ , job := range jobs {
205
216
log := d .log .With (slog .F ("job_id" , job .ID ))
206
217
207
218
err := unhangJob (ctx , log , d .db , d .pubsub , job .ID )
208
219
if err != nil {
220
+ jobType := notStartedJobType
221
+ if job .StartedAt .Valid {
222
+ jobType = hungJobType
223
+ }
209
224
if ! (xerrors .As (err , & acquireLockError {}) || xerrors .As (err , & jobIneligibleError {})) {
210
- log .Error (ctx , "error forcefully terminating hung provisioner job" , slog .Error (err ))
225
+ log .Error (ctx , fmt . Sprintf ( "error forcefully terminating %s provisioner job" , jobType ) , slog .Error (err ))
211
226
}
212
227
continue
213
228
}
@@ -222,7 +237,7 @@ func unhangJob(ctx context.Context, log slog.Logger, db database.Store, pub pubs
222
237
var lowestLogID int64
223
238
224
239
err := db .InTx (func (db database.Store ) error {
225
- locked , err := db .TryAcquireLock (ctx , database .GenLockID (fmt .Sprintf ("hang-detector :%s" , jobID )))
240
+ locked , err := db .TryAcquireLock (ctx , database .GenLockID (fmt .Sprintf ("unhanger :%s" , jobID )))
226
241
if err != nil {
227
242
return xerrors .Errorf ("acquire lock: %w" , err )
228
243
}
@@ -237,6 +252,14 @@ func unhangJob(ctx context.Context, log slog.Logger, db database.Store, pub pubs
237
252
return xerrors .Errorf ("get provisioner job: %w" , err )
238
253
}
239
254
255
+ jobType := hungJobType
256
+ threshold := HungJobDuration .Minutes ()
257
+
258
+ if ! job .StartedAt .Valid {
259
+ jobType = notStartedJobType
260
+ threshold = NotStartedTimeElapsed .Minutes ()
261
+ }
262
+
240
263
if job .CompletedAt .Valid {
241
264
return jobIneligibleError {
242
265
Err : xerrors .Errorf ("job is completed (status %s)" , job .JobStatus ),
@@ -249,8 +272,8 @@ func unhangJob(ctx context.Context, log slog.Logger, db database.Store, pub pubs
249
272
}
250
273
251
274
log .Warn (
252
- ctx , "detected hung provisioner job, forcefully terminating" ,
253
- "threshold" , HungJobDuration ,
275
+ ctx , fmt . Sprintf ( "detected %s provisioner job, forcefully terminating" , jobType ) ,
276
+ "threshold" , threshold ,
254
277
)
255
278
256
279
// First, get the latest logs from the build so we can make sure
@@ -260,7 +283,7 @@ func unhangJob(ctx context.Context, log slog.Logger, db database.Store, pub pubs
260
283
CreatedAfter : 0 ,
261
284
})
262
285
if err != nil {
263
- return xerrors .Errorf ("get logs for hung job: %w" , err )
286
+ return xerrors .Errorf ("get logs for %s job: %w" , jobType , err )
264
287
}
265
288
logStage := ""
266
289
if len (logs ) != 0 {
@@ -291,12 +314,13 @@ func unhangJob(ctx context.Context, log slog.Logger, db database.Store, pub pubs
291
314
}
292
315
newLogs , err := db .InsertProvisionerJobLogs (ctx , insertParams )
293
316
if err != nil {
294
- return xerrors .Errorf ("insert logs for hung job: %w" , err )
317
+ return xerrors .Errorf ("insert logs for %s job: %w" , jobType , err )
295
318
}
296
319
lowestLogID = newLogs [0 ].ID
297
320
298
321
now = dbtime .Now ()
299
- // If we are unhanging a job that was never picked up by the
322
+
323
+ // If we are failing a job that was never picked up by the
300
324
// provisioner, we need to set the started_at time to the current
301
325
// time so that the build duration is correct.
302
326
if ! job .StartedAt .Valid {
@@ -315,7 +339,7 @@ func unhangJob(ctx context.Context, log slog.Logger, db database.Store, pub pubs
315
339
Valid : true ,
316
340
},
317
341
Error : sql.NullString {
318
- String : "Coder: Build has been detected as hung for 5 minutes and has been terminated by hang detector." ,
342
+ String : fmt . Sprintf ( "Coder: Build has been detected as %s for %.0f minutes and has been terminated by hang detector." , jobType , threshold ) ,
319
343
Valid : true ,
320
344
},
321
345
ErrorCode : sql.NullString {
0 commit comments