Skip to content

Commit a38e949

Browse files
committed
feat: add provisioner job hang detector
1 parent 9ec1fcf commit a38e949

23 files changed

+1106
-32
lines changed

cli/server.go

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,7 @@ import (
7777
"github.com/coder/coder/coderd/schedule"
7878
"github.com/coder/coder/coderd/telemetry"
7979
"github.com/coder/coder/coderd/tracing"
80+
"github.com/coder/coder/coderd/unhanger"
8081
"github.com/coder/coder/coderd/updatecheck"
8182
"github.com/coder/coder/coderd/util/slice"
8283
"github.com/coder/coder/coderd/workspaceapps"
@@ -892,11 +893,16 @@ func (r *RootCmd) Server(newAPI func(context.Context, *coderd.Options) (*coderd.
892893
return xerrors.Errorf("notify systemd: %w", err)
893894
}
894895

895-
autobuildPoller := time.NewTicker(cfg.AutobuildPollInterval.Value())
896-
defer autobuildPoller.Stop()
897-
autobuildExecutor := executor.New(ctx, options.Database, coderAPI.TemplateScheduleStore, logger, autobuildPoller.C)
896+
autobuildTicker := time.NewTicker(cfg.AutobuildPollInterval.Value())
897+
defer autobuildTicker.Stop()
898+
autobuildExecutor := executor.New(ctx, options.Database, coderAPI.TemplateScheduleStore, logger, autobuildTicker.C)
898899
autobuildExecutor.Run()
899900

901+
hangDetectorTicker := time.NewTicker(cfg.JobHangDetectorInterval.Value())
902+
defer hangDetectorTicker.Stop()
903+
hangDetector := unhanger.New(ctx, options.Database, options.Pubsub, logger, hangDetectorTicker.C)
904+
hangDetector.Run()
905+
900906
// Currently there is no way to ask the server to shut
901907
// itself down, so any exit signal will result in a non-zero
902908
// exit of the server.

cli/testdata/server-config.yaml.golden

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -138,6 +138,9 @@ networking:
138138
# Interval to poll for scheduled workspace builds.
139139
# (default: 1m0s, type: duration)
140140
autobuildPollInterval: 1m0s
141+
# Interval to poll for hung jobs and automatically terminate them.
142+
# (default: 1m0s, type: duration)
143+
jobHangDetectorInterval: 1m0s
141144
introspection:
142145
prometheus:
143146
# Serve prometheus metrics on the address defined by prometheus address.

coderd/apidoc/docs.go

Lines changed: 3 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

coderd/apidoc/swagger.json

Lines changed: 3 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

coderd/coderdtest/coderdtest.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@ import (
6666
"github.com/coder/coder/coderd/rbac"
6767
"github.com/coder/coder/coderd/schedule"
6868
"github.com/coder/coder/coderd/telemetry"
69+
"github.com/coder/coder/coderd/unhanger"
6970
"github.com/coder/coder/coderd/updatecheck"
7071
"github.com/coder/coder/coderd/util/ptr"
7172
"github.com/coder/coder/coderd/workspaceapps"
@@ -239,6 +240,11 @@ func NewOptions(t testing.TB, options *Options) (func(http.Handler), context.Can
239240
).WithStatsChannel(options.AutobuildStats)
240241
lifecycleExecutor.Run()
241242

243+
hangDetectorTicker := time.NewTicker(options.DeploymentValues.JobHangDetectorInterval.Value())
244+
defer hangDetectorTicker.Stop()
245+
hangDetector := unhanger.New(ctx, options.Database, options.Pubsub, slogtest.Make(t, nil).Named("unhanger.detector"), hangDetectorTicker.C)
246+
hangDetector.Run()
247+
242248
var mutex sync.RWMutex
243249
var handler http.Handler
244250
srv := httptest.NewUnstartedServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {

coderd/database/db2sdk/db2sdk.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@ package db2sdk
33

44
import (
55
"encoding/json"
6-
"time"
76

87
"github.com/coder/coder/coderd/database"
98
"github.com/coder/coder/coderd/parameter"
@@ -78,6 +77,9 @@ func TemplateVersionParameter(param database.TemplateVersionParameter) (codersdk
7877
}
7978

8079
func ProvisionerJobStatus(provisionerJob database.ProvisionerJob) codersdk.ProvisionerJobStatus {
80+
// The case where jobs are hung is handled by the unhang package. We can't
81+
// just return Failed here when it's hung because that doesn't reflect in
82+
// the database.
8183
switch {
8284
case provisionerJob.CanceledAt.Valid:
8385
if !provisionerJob.CompletedAt.Valid {
@@ -94,8 +96,6 @@ func ProvisionerJobStatus(provisionerJob database.ProvisionerJob) codersdk.Provi
9496
return codersdk.ProvisionerJobSucceeded
9597
}
9698
return codersdk.ProvisionerJobFailed
97-
case database.Now().Sub(provisionerJob.UpdatedAt) > 30*time.Second:
98-
return codersdk.ProvisionerJobFailed
9999
default:
100100
return codersdk.ProvisionerJobRunning
101101
}

coderd/database/db2sdk/db2sdk_test.go

Lines changed: 0 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -96,17 +96,6 @@ func TestProvisionerJobStatus(t *testing.T) {
9696
},
9797
status: codersdk.ProvisionerJobFailed,
9898
},
99-
{
100-
name: "not_updated",
101-
job: database.ProvisionerJob{
102-
StartedAt: sql.NullTime{
103-
Time: database.Now().Add(-time.Minute),
104-
Valid: true,
105-
},
106-
UpdatedAt: database.Now().Add(-31 * time.Second),
107-
},
108-
status: codersdk.ProvisionerJobFailed,
109-
},
11099
{
111100
name: "updated",
112101
job: database.ProvisionerJob{

coderd/database/dbauthz/dbauthz.go

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -172,6 +172,25 @@ var (
172172
Scope: rbac.ScopeAll,
173173
}.WithCachedASTValue()
174174

175+
// See unhanger package.
176+
subjectHangDetector = rbac.Subject{
177+
ID: uuid.Nil.String(),
178+
Roles: rbac.Roles([]rbac.Role{
179+
{
180+
Name: "hangdetector",
181+
DisplayName: "Hang Detector Daemon",
182+
Site: rbac.Permissions(map[string][]rbac.Action{
183+
rbac.ResourceSystem.Type: {rbac.WildcardSymbol},
184+
rbac.ResourceTemplate.Type: {rbac.ActionRead},
185+
rbac.ResourceWorkspace.Type: {rbac.ActionRead, rbac.ActionUpdate},
186+
}),
187+
Org: map[string][]rbac.Permission{},
188+
User: []rbac.Permission{},
189+
},
190+
}),
191+
Scope: rbac.ScopeAll,
192+
}.WithCachedASTValue()
193+
175194
subjectSystemRestricted = rbac.Subject{
176195
ID: uuid.Nil.String(),
177196
Roles: rbac.Roles([]rbac.Role{
@@ -213,6 +232,12 @@ func AsAutostart(ctx context.Context) context.Context {
213232
return context.WithValue(ctx, authContextKey{}, subjectAutostart)
214233
}
215234

235+
// AsHangDetector returns a context with an actor that has permissions required
236+
// for unhanger.Detector to function.
237+
func AsHangDetector(ctx context.Context) context.Context {
238+
return context.WithValue(ctx, authContextKey{}, subjectHangDetector)
239+
}
240+
216241
// AsSystemRestricted returns a context with an actor that has permissions
217242
// required for various system operations (login, logout, metrics cache).
218243
func AsSystemRestricted(ctx context.Context) context.Context {

coderd/database/dbauthz/system.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -392,6 +392,14 @@ func (q *querier) InsertProvisionerJobLogs(ctx context.Context, arg database.Ins
392392
return q.db.InsertProvisionerJobLogs(ctx, arg)
393393
}
394394

395+
// TODO: We need to create a ProvisionerJob resource type
396+
func (q *querier) GetHungProvisionerJobs(ctx context.Context, hungSince time.Time) ([]database.ProvisionerJob, error) {
397+
// if err := q.authorizeContext(ctx, rbac.ActionCreate, rbac.ResourceSystem); err != nil {
398+
// return nil, err
399+
// }
400+
return q.db.GetHungProvisionerJobs(ctx, hungSince)
401+
}
402+
395403
func (q *querier) InsertWorkspaceAgentStartupLogs(ctx context.Context, arg database.InsertWorkspaceAgentStartupLogsParams) ([]database.WorkspaceAgentStartupLog, error) {
396404
return q.db.InsertWorkspaceAgentStartupLogs(ctx, arg)
397405
}

coderd/database/dbfake/databasefake.go

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2556,6 +2556,19 @@ func (q *fakeQuerier) getProvisionerJobByIDNoLock(_ context.Context, id uuid.UUI
25562556
return database.ProvisionerJob{}, sql.ErrNoRows
25572557
}
25582558

2559+
func (q *fakeQuerier) GetHungProvisionerJobs(_ context.Context, hungSince time.Time) ([]database.ProvisionerJob, error) {
2560+
q.mutex.RLock()
2561+
defer q.mutex.RUnlock()
2562+
2563+
hungJobs := []database.ProvisionerJob{}
2564+
for _, provisionerJob := range q.provisionerJobs {
2565+
if provisionerJob.StartedAt.Valid && !provisionerJob.CompletedAt.Valid && provisionerJob.UpdatedAt.Before(hungSince) {
2566+
hungJobs = append(hungJobs, provisionerJob)
2567+
}
2568+
}
2569+
return hungJobs, nil
2570+
}
2571+
25592572
func (q *fakeQuerier) GetWorkspaceResourceByID(_ context.Context, id uuid.UUID) (database.WorkspaceResource, error) {
25602573
q.mutex.RLock()
25612574
defer q.mutex.RUnlock()

coderd/database/dbmetrics/dbmetrics.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1518,3 +1518,10 @@ func (m metricsStore) GetAuthorizedUserCount(ctx context.Context, arg database.G
15181518
m.queryLatencies.WithLabelValues("GetAuthorizedUserCount").Observe(time.Since(start).Seconds())
15191519
return count, err
15201520
}
1521+
1522+
func (m metricsStore) GetHungProvisionerJobs(ctx context.Context, hungSince time.Time) ([]database.ProvisionerJob, error) {
1523+
start := time.Now()
1524+
jobs, err := m.s.GetHungProvisionerJobs(ctx, hungSince)
1525+
m.queryLatencies.WithLabelValues("GetHungProvisionerJobs").Observe(time.Since(start).Seconds())
1526+
return jobs, err
1527+
}

coderd/database/dbmock/store.go

Lines changed: 15 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

coderd/database/lock.go

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,10 @@
11
package database
22

33
// Well-known lock IDs for lock functions in the database. These should not
4-
// change. If locks are deprecated, they should be kept to avoid reusing the
5-
// same ID.
4+
// change. If locks are deprecated, they should be kept in this list to avoid
5+
// reusing the same ID.
66
const (
7-
LockIDDeploymentSetup = iota + 1
7+
lockIDUnused = iota
8+
LockIDDeploymentSetup = iota
9+
LockIDHangDetector = iota
810
)

coderd/database/querier.go

Lines changed: 1 addition & 4 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

coderd/database/queries.sql.go

Lines changed: 53 additions & 4 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

coderd/database/queries/lock.sql

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,15 +3,11 @@
33
--
44
-- This must be called from within a transaction. The lock will be automatically
55
-- released when the transaction ends.
6-
--
7-
-- Use database.LockID() to generate a unique lock ID from a string.
86
SELECT pg_advisory_xact_lock($1);
97

108
-- name: TryAcquireLock :one
119
-- Non blocking lock. Returns true if the lock was acquired, false otherwise.
1210
--
1311
-- This must be called from within a transaction. The lock will be automatically
1412
-- released when the transaction ends.
15-
--
16-
-- Use database.LockID() to generate a unique lock ID from a string.
1713
SELECT pg_try_advisory_xact_lock($1);

coderd/database/queries/provisionerjobs.sql

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,3 +96,13 @@ SET
9696
error_code = $5
9797
WHERE
9898
id = $1;
99+
100+
-- name: GetHungProvisionerJobs :many
101+
SELECT
102+
*
103+
FROM
104+
provisioner_jobs
105+
WHERE
106+
updated_at < $1
107+
AND started_at IS NOT NULL
108+
AND completed_at IS NULL;

0 commit comments

Comments
 (0)