@@ -20,12 +20,12 @@ import (
20
20
)
21
21
22
22
const (
23
- // HungJobDuration is the duration of time since the last update to a job
24
- // before it is considered hung.
23
+ // HungJobDuration is the duration of time since the last update
24
+ // to a RUNNING job before it is considered hung.
25
25
HungJobDuration = 5 * time .Minute
26
26
27
- // PendingJobDuration is the duration of time since the last update to a job
28
- // before it is considered hung .
27
+ // PendingJobDuration is the duration of time since the last update
28
+ // to a PENDING job before it is considered dead .
29
29
PendingJobDuration = 30 * time .Minute
30
30
31
31
// HungJobExitTimeout is the duration of time that provisioners should allow
@@ -42,7 +42,7 @@ const (
42
42
)
43
43
44
44
// JobLogMessages are written to provisioner job logs when a job is reaped
45
- func jobLogMessages (reapType ReapType , threshold time.Duration ) []string {
45
+ func JobLogMessages (reapType ReapType , threshold time.Duration ) []string {
46
46
return []string {
47
47
"" ,
48
48
"====================" ,
@@ -110,9 +110,9 @@ type Stats struct {
110
110
Error error
111
111
}
112
112
113
- // New returns a new hang detector .
113
+ // New returns a new job reaper .
114
114
func New (ctx context.Context , db database.Store , pub pubsub.Pubsub , log slog.Logger , tick <- chan time.Time ) * Detector {
115
- //nolint:gocritic // Hang detector has a limited set of permissions.
115
+ //nolint:gocritic // Job reaper has a limited set of permissions.
116
116
ctx , cancel := context .WithCancel (dbauthz .AsJobReaper (ctx ))
117
117
d := & Detector {
118
118
ctx : ctx ,
@@ -224,7 +224,7 @@ func (d *Detector) run(t time.Time) Stats {
224
224
err := reapJob (ctx , log , d .db , d .pubsub , job )
225
225
if err != nil {
226
226
if ! (xerrors .As (err , & acquireLockError {}) || xerrors .As (err , & jobIneligibleError {})) {
227
- log .Error (ctx , fmt . Sprintf ( "error forcefully terminating %s provisioner job" , job .Type ), slog .Error (err ))
227
+ log .Error (ctx , "error forcefully terminating provisioner job" , slog . F ( "type " , job .Type ), slog .Error (err ))
228
228
}
229
229
continue
230
230
}
@@ -260,7 +260,8 @@ func reapJob(ctx context.Context, log slog.Logger, db database.Store, pub pubsub
260
260
}
261
261
262
262
log .Warn (
263
- ctx , fmt .Sprintf ("detected %s provisioner job, forcefully terminating" , jobToReap .Type ),
263
+ ctx , "forcefully terminating provisioner job" ,
264
+ "type" , jobToReap .Type ,
264
265
"threshold" , jobToReap .Threshold ,
265
266
)
266
267
@@ -291,7 +292,7 @@ func reapJob(ctx context.Context, log slog.Logger, db database.Store, pub pubsub
291
292
Output : nil ,
292
293
}
293
294
now := dbtime .Now ()
294
- for i , msg := range jobLogMessages (jobToReap .Type , jobToReap .Threshold ) {
295
+ for i , msg := range JobLogMessages (jobToReap .Type , jobToReap .Threshold ) {
295
296
// Set the created at in a way that ensures each message has
296
297
// a unique timestamp so they will be sorted correctly.
297
298
insertParams .CreatedAt = append (insertParams .CreatedAt , now .Add (time .Millisecond * time .Duration (i )))
@@ -325,7 +326,7 @@ func reapJob(ctx context.Context, log slog.Logger, db database.Store, pub pubsub
325
326
Valid : true ,
326
327
},
327
328
Error : sql.NullString {
328
- String : fmt .Sprintf ("Coder: Build has been detected as %s for %.0f minutes and has been terminated by hang detector ." , jobToReap .Type , jobToReap .Threshold .Minutes ()),
329
+ String : fmt .Sprintf ("Coder: Build has been detected as %s for %.0f minutes and has been terminated by the reaper ." , jobToReap .Type , jobToReap .Threshold .Minutes ()),
329
330
Valid : true ,
330
331
},
331
332
ErrorCode : sql.NullString {
0 commit comments