Skip to content

Commit a15bd1c

Browse files
committed
PR review; naming in the comments, added comments for SQL, less verbose logging
1 parent c03bfa3 commit a15bd1c

File tree

6 files changed

+29
-53
lines changed

6 files changed

+29
-53
lines changed

coderd/database/dbauthz/dbauthz.go

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3542,12 +3542,7 @@ func (q *querier) InsertPresetParameters(ctx context.Context, arg database.Inser
35423542

35433543
func (q *querier) InsertProvisionerJob(ctx context.Context, arg database.InsertProvisionerJobParams) (database.ProvisionerJob, error) {
35443544
// TODO: Remove this once we have a proper rbac check for provisioner jobs.
3545-
// Currently ProvisionerJobs are not associated with a user, so we can't
3546-
// check for a user's permissions. We'd need to check for the associated workspace
3547-
// and verify ownership through that.
3548-
// if err := q.authorizeContext(ctx, policy.ActionCreate, rbac.ResourceProvisionerJobs); err != nil {
3549-
// return database.ProvisionerJob{}, err
3550-
// }
3545+
// Details in https://github.com/coder/coder/issues/16160
35513546
return q.db.InsertProvisionerJob(ctx, arg)
35523547
}
35533548

coderd/database/dbmem/dbmem.go

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ import (
88
"errors"
99
"fmt"
1010
"math"
11+
"math/rand/v2"
1112
"reflect"
1213
"regexp"
1314
"slices"
@@ -4883,11 +4884,14 @@ func (q *FakeQuerier) GetProvisionerJobsToBeReaped(_ context.Context, arg databa
48834884
provisionerJob.Tags = maps.Clone(provisionerJob.Tags)
48844885
hungJobs = append(hungJobs, provisionerJob)
48854886
if len(hungJobs) >= int(maxJobs) {
4886-
return hungJobs, nil
4887+
break
48874888
}
48884889
}
48894890
}
48904891
}
4892+
rand.Shuffle(len(hungJobs), func(i, j int) {
4893+
hungJobs[i], hungJobs[j] = hungJobs[j], hungJobs[i]
4894+
})
48914895
return hungJobs, nil
48924896
}
48934897

@@ -10955,8 +10959,8 @@ func (q *FakeQuerier) UpdateProvisionerJobWithCompleteWithStartedAtByID(_ contex
1095510959
job.CompletedAt = arg.CompletedAt
1095610960
job.Error = arg.Error
1095710961
job.ErrorCode = arg.ErrorCode
10958-
job.JobStatus = provisionerJobStatus(job)
1095910962
job.StartedAt = arg.StartedAt
10963+
job.JobStatus = provisionerJobStatus(job)
1096010964
q.provisionerJobs[index] = job
1096110965
return nil
1096210966
}

coderd/database/queries/provisionerjobs.sql

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -299,6 +299,7 @@ WHERE
299299
AND started_at IS NOT NULL
300300
AND completed_at IS NULL
301301
)
302+
-- To avoid repeatedly attempting to reap the same jobs, we randomly order and limit to @max_jobs.
302303
ORDER BY random()
303304
LIMIT @max_jobs;
304305

coderd/jobreaper/detector.go

Lines changed: 12 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -20,12 +20,12 @@ import (
2020
)
2121

2222
const (
23-
// HungJobDuration is the duration of time since the last update to a job
24-
// before it is considered hung.
23+
// HungJobDuration is the duration of time since the last update
24+
// to a RUNNING job before it is considered hung.
2525
HungJobDuration = 5 * time.Minute
2626

27-
// PendingJobDuration is the duration of time since the last update to a job
28-
// before it is considered hung.
27+
// PendingJobDuration is the duration of time since last update
28+
// to a PENDING job before it is considered dead.
2929
PendingJobDuration = 30 * time.Minute
3030

3131
// HungJobExitTimeout is the duration of time that provisioners should allow
@@ -42,7 +42,7 @@ const (
4242
)
4343

4444
// jobLogMessages are written to provisioner job logs when a job is reaped
45-
func jobLogMessages(reapType ReapType, threshold time.Duration) []string {
45+
func JobLogMessages(reapType ReapType, threshold time.Duration) []string {
4646
return []string{
4747
"",
4848
"====================",
@@ -110,9 +110,9 @@ type Stats struct {
110110
Error error
111111
}
112112

113-
// New returns a new hang detector.
113+
// New returns a new job reaper.
114114
func New(ctx context.Context, db database.Store, pub pubsub.Pubsub, log slog.Logger, tick <-chan time.Time) *Detector {
115-
//nolint:gocritic // Hang detector has a limited set of permissions.
115+
//nolint:gocritic // Job reaper has a limited set of permissions.
116116
ctx, cancel := context.WithCancel(dbauthz.AsJobReaper(ctx))
117117
d := &Detector{
118118
ctx: ctx,
@@ -224,7 +224,7 @@ func (d *Detector) run(t time.Time) Stats {
224224
err := reapJob(ctx, log, d.db, d.pubsub, job)
225225
if err != nil {
226226
if !(xerrors.As(err, &acquireLockError{}) || xerrors.As(err, &jobIneligibleError{})) {
227-
log.Error(ctx, fmt.Sprintf("error forcefully terminating %s provisioner job", job.Type), slog.Error(err))
227+
log.Error(ctx, "error forcefully terminating provisioner job", slog.F("type", job.Type), slog.Error(err))
228228
}
229229
continue
230230
}
@@ -260,7 +260,8 @@ func reapJob(ctx context.Context, log slog.Logger, db database.Store, pub pubsub
260260
}
261261

262262
log.Warn(
263-
ctx, fmt.Sprintf("detected %s provisioner job, forcefully terminating", jobToReap.Type),
263+
ctx, "forcefully terminating provisioner job",
264+
"type", jobToReap.Type,
264265
"threshold", jobToReap.Threshold,
265266
)
266267

@@ -291,7 +292,7 @@ func reapJob(ctx context.Context, log slog.Logger, db database.Store, pub pubsub
291292
Output: nil,
292293
}
293294
now := dbtime.Now()
294-
for i, msg := range jobLogMessages(jobToReap.Type, jobToReap.Threshold) {
295+
for i, msg := range JobLogMessages(jobToReap.Type, jobToReap.Threshold) {
295296
// Set the created at in a way that ensures each message has
296297
// a unique timestamp so they will be sorted correctly.
297298
insertParams.CreatedAt = append(insertParams.CreatedAt, now.Add(time.Millisecond*time.Duration(i)))
@@ -325,7 +326,7 @@ func reapJob(ctx context.Context, log slog.Logger, db database.Store, pub pubsub
325326
Valid: true,
326327
},
327328
Error: sql.NullString{
328-
String: fmt.Sprintf("Coder: Build has been detected as %s for %.0f minutes and has been terminated by hang detector.", jobToReap.Type, jobToReap.Threshold.Minutes()),
329+
String: fmt.Sprintf("Coder: Build has been detected as %s for %.0f minutes and has been terminated by the reaper.", jobToReap.Type, jobToReap.Threshold.Minutes()),
329330
Valid: true,
330331
},
331332
ErrorCode: sql.NullString{

coderd/jobreaper/detector_test.go

Lines changed: 7 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -27,36 +27,6 @@ import (
2727
"github.com/coder/coder/v2/testutil"
2828
)
2929

30-
// jobType represents the type of job being reaped
31-
type jobType string
32-
33-
const (
34-
hungJobType jobType = "hung"
35-
pendingJobType jobType = "pending"
36-
)
37-
38-
// jobLogMessages returns the messages to be written to provisioner job logs when a job is reaped
39-
func jobLogMessages(jobType jobType, threshold float64) []string {
40-
return []string{
41-
"",
42-
"====================",
43-
fmt.Sprintf("Coder: Build has been detected as %s for %.0f minutes and will be terminated.", jobType, threshold),
44-
"====================",
45-
"",
46-
}
47-
}
48-
49-
// reapParamsFromJob determines the type and threshold for a job being reaped
50-
func reapParamsFromJob(job database.ProvisionerJob) (jobType, float64) {
51-
jobType := hungJobType
52-
threshold := jobreaper.HungJobDuration.Minutes()
53-
if !job.StartedAt.Valid {
54-
jobType = pendingJobType
55-
threshold = jobreaper.PendingJobDuration.Minutes()
56-
}
57-
return jobType, threshold
58-
}
59-
6030
func TestMain(m *testing.M) {
6131
goleak.VerifyTestMain(m, testutil.GoleakOptions...)
6232
}
@@ -972,8 +942,13 @@ func TestDetectorPushesLogs(t *testing.T) {
972942
CreatedAfter: after,
973943
})
974944
require.NoError(t, err)
975-
jobType, threshold := reapParamsFromJob(templateImportJob)
976-
expectedLogs := jobLogMessages(jobType, threshold)
945+
threshold := jobreaper.HungJobDuration
946+
jobType := jobreaper.Hung
947+
if templateImportJob.JobStatus == database.ProvisionerJobStatusPending {
948+
threshold = jobreaper.PendingJobDuration
949+
jobType = jobreaper.Pending
950+
}
951+
expectedLogs := jobreaper.JobLogMessages(jobType, threshold)
977952
require.Len(t, logs, len(expectedLogs))
978953
for i, log := range logs {
979954
assert.Equal(t, database.LogLevelError, log.Level)

provisioner/terraform/serve.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -39,9 +39,9 @@ type ServeOptions struct {
3939
//
4040
// This is a no-op on Windows where the process can't be interrupted.
4141
//
42-
// Default value: 3 minutes (reaper.HungJobExitTimeout). This value should
42+
// Default value: 3 minutes (jobreaper.HungJobExitTimeout). This value should
4343
// be kept less than the value that Coder uses to mark hung jobs as failed,
44-
// which is 5 minutes (see reaper package).
44+
// which is 5 minutes (see jobreaper package).
4545
ExitTimeout time.Duration
4646
}
4747

0 commit comments

Comments
 (0)