Skip to content

Commit 98a5ae7

Browse files
authored
feat: add provisioner job hang detector (coder#7927)
1 parent 3671846 commit 98a5ae7

File tree

28 files changed

+1414
-54
lines changed

28 files changed

+1414
-54
lines changed

cli/server.go

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,7 @@ import (
7878
"github.com/coder/coder/coderd/schedule"
7979
"github.com/coder/coder/coderd/telemetry"
8080
"github.com/coder/coder/coderd/tracing"
81+
"github.com/coder/coder/coderd/unhanger"
8182
"github.com/coder/coder/coderd/updatecheck"
8283
"github.com/coder/coder/coderd/util/slice"
8384
"github.com/coder/coder/coderd/workspaceapps"
@@ -898,11 +899,17 @@ func (r *RootCmd) Server(newAPI func(context.Context, *coderd.Options) (*coderd.
898899
return xerrors.Errorf("notify systemd: %w", err)
899900
}
900901

901-
autobuildPoller := time.NewTicker(cfg.AutobuildPollInterval.Value())
902-
defer autobuildPoller.Stop()
903-
autobuildExecutor := autobuild.NewExecutor(ctx, options.Database, coderAPI.TemplateScheduleStore, logger, autobuildPoller.C)
902+
autobuildTicker := time.NewTicker(cfg.AutobuildPollInterval.Value())
903+
defer autobuildTicker.Stop()
904+
autobuildExecutor := autobuild.NewExecutor(ctx, options.Database, coderAPI.TemplateScheduleStore, logger, autobuildTicker.C)
904905
autobuildExecutor.Run()
905906

907+
hangDetectorTicker := time.NewTicker(cfg.JobHangDetectorInterval.Value())
908+
defer hangDetectorTicker.Stop()
909+
hangDetector := unhanger.New(ctx, options.Database, options.Pubsub, logger, hangDetectorTicker.C)
910+
hangDetector.Start()
911+
defer hangDetector.Close()
912+
906913
// Currently there is no way to ask the server to shut
907914
// itself down, so any exit signal will result in a non-zero
908915
// exit of the server.

cli/testdata/server-config.yaml.golden

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -148,6 +148,9 @@ networking:
148148
# Interval to poll for scheduled workspace builds.
149149
# (default: 1m0s, type: duration)
150150
autobuildPollInterval: 1m0s
151+
# Interval to poll for hung jobs and automatically terminate them.
152+
# (default: 1m0s, type: duration)
153+
jobHangDetectorInterval: 1m0s
151154
introspection:
152155
prometheus:
153156
# Serve prometheus metrics on the address defined by prometheus address.

coderd/apidoc/docs.go

Lines changed: 3 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

coderd/apidoc/swagger.json

Lines changed: 3 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

coderd/coderdtest/coderdtest.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,7 @@ import (
6868
"github.com/coder/coder/coderd/rbac"
6969
"github.com/coder/coder/coderd/schedule"
7070
"github.com/coder/coder/coderd/telemetry"
71+
"github.com/coder/coder/coderd/unhanger"
7172
"github.com/coder/coder/coderd/updatecheck"
7273
"github.com/coder/coder/coderd/util/ptr"
7374
"github.com/coder/coder/coderd/workspaceapps"
@@ -256,6 +257,12 @@ func NewOptions(t testing.TB, options *Options) (func(http.Handler), context.Can
256257
).WithStatsChannel(options.AutobuildStats)
257258
lifecycleExecutor.Run()
258259

260+
hangDetectorTicker := time.NewTicker(options.DeploymentValues.JobHangDetectorInterval.Value())
261+
defer hangDetectorTicker.Stop()
262+
hangDetector := unhanger.New(ctx, options.Database, options.Pubsub, slogtest.Make(t, nil).Named("unhanger.detector"), hangDetectorTicker.C)
263+
hangDetector.Start()
264+
t.Cleanup(hangDetector.Close)
265+
259266
var mutex sync.RWMutex
260267
var handler http.Handler
261268
srv := httptest.NewUnstartedServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {

coderd/database/db.go

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -18,13 +18,6 @@ import (
1818
"golang.org/x/xerrors"
1919
)
2020

21-
// Well-known lock IDs for lock functions in the database. These should not
22-
// change. If locks are deprecated, they should be kept to avoid reusing the
23-
// same ID.
24-
const (
25-
LockIDDeploymentSetup = iota + 1
26-
)
27-
2821
// Store contains all queryable database functions.
2922
// It extends the generated interface to add transaction support.
3023
type Store interface {

coderd/database/db2sdk/db2sdk.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@ package db2sdk
33

44
import (
55
"encoding/json"
6-
"time"
76

87
"github.com/google/uuid"
98

@@ -81,6 +80,9 @@ func TemplateVersionParameter(param database.TemplateVersionParameter) (codersdk
8180
}
8281

8382
func ProvisionerJobStatus(provisionerJob database.ProvisionerJob) codersdk.ProvisionerJobStatus {
83+
// The case where jobs are hung is handled by the unhang package. We can't
84+
// just return Failed here when it's hung because that doesn't reflect in
85+
// the database.
8486
switch {
8587
case provisionerJob.CanceledAt.Valid:
8688
if !provisionerJob.CompletedAt.Valid {
@@ -97,8 +99,6 @@ func ProvisionerJobStatus(provisionerJob database.ProvisionerJob) codersdk.Provi
9799
return codersdk.ProvisionerJobSucceeded
98100
}
99101
return codersdk.ProvisionerJobFailed
100-
case database.Now().Sub(provisionerJob.UpdatedAt) > 30*time.Second:
101-
return codersdk.ProvisionerJobFailed
102102
default:
103103
return codersdk.ProvisionerJobRunning
104104
}

coderd/database/db2sdk/db2sdk_test.go

Lines changed: 0 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -96,17 +96,6 @@ func TestProvisionerJobStatus(t *testing.T) {
9696
},
9797
status: codersdk.ProvisionerJobFailed,
9898
},
99-
{
100-
name: "not_updated",
101-
job: database.ProvisionerJob{
102-
StartedAt: sql.NullTime{
103-
Time: database.Now().Add(-time.Minute),
104-
Valid: true,
105-
},
106-
UpdatedAt: database.Now().Add(-31 * time.Second),
107-
},
108-
status: codersdk.ProvisionerJobFailed,
109-
},
11099
{
111100
name: "updated",
112101
job: database.ProvisionerJob{

coderd/database/dbauthz/dbauthz.go

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -176,6 +176,25 @@ var (
176176
Scope: rbac.ScopeAll,
177177
}.WithCachedASTValue()
178178

179+
// See unhanger package.
180+
subjectHangDetector = rbac.Subject{
181+
ID: uuid.Nil.String(),
182+
Roles: rbac.Roles([]rbac.Role{
183+
{
184+
Name: "hangdetector",
185+
DisplayName: "Hang Detector Daemon",
186+
Site: rbac.Permissions(map[string][]rbac.Action{
187+
rbac.ResourceSystem.Type: {rbac.WildcardSymbol},
188+
rbac.ResourceTemplate.Type: {rbac.ActionRead},
189+
rbac.ResourceWorkspace.Type: {rbac.ActionRead, rbac.ActionUpdate},
190+
}),
191+
Org: map[string][]rbac.Permission{},
192+
User: []rbac.Permission{},
193+
},
194+
}),
195+
Scope: rbac.ScopeAll,
196+
}.WithCachedASTValue()
197+
179198
subjectSystemRestricted = rbac.Subject{
180199
ID: uuid.Nil.String(),
181200
Roles: rbac.Roles([]rbac.Role{
@@ -217,6 +236,12 @@ func AsAutostart(ctx context.Context) context.Context {
217236
return context.WithValue(ctx, authContextKey{}, subjectAutostart)
218237
}
219238

239+
// AsHangDetector returns a context with an actor that has permissions required
240+
// for unhanger.Detector to function.
241+
func AsHangDetector(ctx context.Context) context.Context {
242+
return context.WithValue(ctx, authContextKey{}, subjectHangDetector)
243+
}
244+
220245
// AsSystemRestricted returns a context with an actor that has permissions
221246
// required for various system operations (login, logout, metrics cache).
222247
func AsSystemRestricted(ctx context.Context) context.Context {
@@ -950,6 +975,14 @@ func (q *querier) GetGroupsByOrganizationID(ctx context.Context, organizationID
950975
return fetchWithPostFilter(q.auth, q.db.GetGroupsByOrganizationID)(ctx, organizationID)
951976
}
952977

978+
// TODO: We need to create a ProvisionerJob resource type
979+
func (q *querier) GetHungProvisionerJobs(ctx context.Context, hungSince time.Time) ([]database.ProvisionerJob, error) {
980+
// if err := q.authorizeContext(ctx, rbac.ActionCreate, rbac.ResourceSystem); err != nil {
981+
// return nil, err
982+
// }
983+
return q.db.GetHungProvisionerJobs(ctx, hungSince)
984+
}
985+
953986
func (q *querier) GetLastUpdateCheck(ctx context.Context) (string, error) {
954987
if err := q.authorizeContext(ctx, rbac.ActionRead, rbac.ResourceSystem); err != nil {
955988
return "", err

coderd/database/dbfake/dbfake.go

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1753,6 +1753,19 @@ func (q *fakeQuerier) GetGroupsByOrganizationID(_ context.Context, organizationI
17531753
return groups, nil
17541754
}
17551755

1756+
func (q *fakeQuerier) GetHungProvisionerJobs(_ context.Context, hungSince time.Time) ([]database.ProvisionerJob, error) {
1757+
q.mutex.RLock()
1758+
defer q.mutex.RUnlock()
1759+
1760+
hungJobs := []database.ProvisionerJob{}
1761+
for _, provisionerJob := range q.provisionerJobs {
1762+
if provisionerJob.StartedAt.Valid && !provisionerJob.CompletedAt.Valid && provisionerJob.UpdatedAt.Before(hungSince) {
1763+
hungJobs = append(hungJobs, provisionerJob)
1764+
}
1765+
}
1766+
return hungJobs, nil
1767+
}
1768+
17561769
func (q *fakeQuerier) GetLastUpdateCheck(_ context.Context) (string, error) {
17571770
q.mutex.RLock()
17581771
defer q.mutex.RUnlock()
@@ -2135,7 +2148,7 @@ func (q *fakeQuerier) GetProvisionerLogsAfterID(_ context.Context, arg database.
21352148
if jobLog.JobID != arg.JobID {
21362149
continue
21372150
}
2138-
if arg.CreatedAfter != 0 && jobLog.ID < arg.CreatedAfter {
2151+
if jobLog.ID <= arg.CreatedAfter {
21392152
continue
21402153
}
21412154
logs = append(logs, jobLog)

coderd/database/dbmetrics/dbmetrics.go

Lines changed: 7 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

coderd/database/dbmock/dbmock.go

Lines changed: 15 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

coderd/database/lock.go

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
package database
2+
3+
import "hash/fnv"
4+
5+
// Well-known lock IDs for lock functions in the database. These should not
6+
// change. If locks are deprecated, they should be kept in this list to avoid
7+
// reusing the same ID.
8+
const (
9+
// Keep the unused iota here so we don't need + 1 every time
10+
lockIDUnused = iota
11+
LockIDDeploymentSetup
12+
)
13+
14+
// GenLockID generates a unique and consistent lock ID from a given string.
15+
func GenLockID(name string) int64 {
16+
hash := fnv.New64()
17+
_, _ = hash.Write([]byte(name))
18+
return int64(hash.Sum64())
19+
}

coderd/database/querier.go

Lines changed: 1 addition & 4 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

coderd/database/queries.sql.go

Lines changed: 53 additions & 4 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

coderd/database/queries/lock.sql

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,15 +3,11 @@
33
--
44
-- This must be called from within a transaction. The lock will be automatically
55
-- released when the transaction ends.
6-
--
7-
-- Use database.LockID() to generate a unique lock ID from a string.
86
SELECT pg_advisory_xact_lock($1);
97

108
-- name: TryAcquireLock :one
119
-- Non blocking lock. Returns true if the lock was acquired, false otherwise.
1210
--
1311
-- This must be called from within a transaction. The lock will be automatically
1412
-- released when the transaction ends.
15-
--
16-
-- Use database.LockID() to generate a unique lock ID from a string.
1713
SELECT pg_try_advisory_xact_lock($1);

0 commit comments

Comments
 (0)