Skip to content

feat: add provisioner job hang detector #7927

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 13 commits into from
Jun 25, 2023
Merged
13 changes: 10 additions & 3 deletions cli/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ import (
"github.com/coder/coder/coderd/schedule"
"github.com/coder/coder/coderd/telemetry"
"github.com/coder/coder/coderd/tracing"
"github.com/coder/coder/coderd/unhanger"
"github.com/coder/coder/coderd/updatecheck"
"github.com/coder/coder/coderd/util/slice"
"github.com/coder/coder/coderd/workspaceapps"
Expand Down Expand Up @@ -898,11 +899,17 @@ func (r *RootCmd) Server(newAPI func(context.Context, *coderd.Options) (*coderd.
return xerrors.Errorf("notify systemd: %w", err)
}

autobuildPoller := time.NewTicker(cfg.AutobuildPollInterval.Value())
defer autobuildPoller.Stop()
autobuildExecutor := autobuild.NewExecutor(ctx, options.Database, coderAPI.TemplateScheduleStore, logger, autobuildPoller.C)
autobuildTicker := time.NewTicker(cfg.AutobuildPollInterval.Value())
defer autobuildTicker.Stop()
autobuildExecutor := autobuild.NewExecutor(ctx, options.Database, coderAPI.TemplateScheduleStore, logger, autobuildTicker.C)
autobuildExecutor.Run()

hangDetectorTicker := time.NewTicker(cfg.JobHangDetectorInterval.Value())
defer hangDetectorTicker.Stop()
hangDetector := unhanger.New(ctx, options.Database, options.Pubsub, logger, hangDetectorTicker.C)
hangDetector.Start()
defer hangDetector.Close()

// Currently there is no way to ask the server to shut
// itself down, so any exit signal will result in a non-zero
// exit of the server.
Expand Down
3 changes: 3 additions & 0 deletions cli/testdata/server-config.yaml.golden
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,9 @@ networking:
# Interval to poll for scheduled workspace builds.
# (default: 1m0s, type: duration)
autobuildPollInterval: 1m0s
# Interval to poll for hung jobs and automatically terminate them.
# (default: 1m0s, type: duration)
jobHangDetectorInterval: 1m0s
introspection:
prometheus:
# Serve prometheus metrics on the address defined by prometheus address.
Expand Down
3 changes: 3 additions & 0 deletions coderd/apidoc/docs.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 3 additions & 0 deletions coderd/apidoc/swagger.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

7 changes: 7 additions & 0 deletions coderd/coderdtest/coderdtest.go
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ import (
"github.com/coder/coder/coderd/rbac"
"github.com/coder/coder/coderd/schedule"
"github.com/coder/coder/coderd/telemetry"
"github.com/coder/coder/coderd/unhanger"
"github.com/coder/coder/coderd/updatecheck"
"github.com/coder/coder/coderd/util/ptr"
"github.com/coder/coder/coderd/workspaceapps"
Expand Down Expand Up @@ -256,6 +257,12 @@ func NewOptions(t testing.TB, options *Options) (func(http.Handler), context.Can
).WithStatsChannel(options.AutobuildStats)
lifecycleExecutor.Run()

hangDetectorTicker := time.NewTicker(options.DeploymentValues.JobHangDetectorInterval.Value())
defer hangDetectorTicker.Stop()
hangDetector := unhanger.New(ctx, options.Database, options.Pubsub, slogtest.Make(t, nil).Named("unhanger.detector"), hangDetectorTicker.C)
hangDetector.Start()
t.Cleanup(hangDetector.Close)

var mutex sync.RWMutex
var handler http.Handler
srv := httptest.NewUnstartedServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
Expand Down
7 changes: 0 additions & 7 deletions coderd/database/db.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,13 +18,6 @@ import (
"golang.org/x/xerrors"
)

// Well-known lock IDs for lock functions in the database. These should not
// change. If locks are deprecated, they should be kept to avoid reusing the
// same ID.
const (
LockIDDeploymentSetup = iota + 1
)

// Store contains all queryable database functions.
// It extends the generated interface to add transaction support.
type Store interface {
Expand Down
6 changes: 3 additions & 3 deletions coderd/database/db2sdk/db2sdk.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@ package db2sdk

import (
"encoding/json"
"time"

"github.com/google/uuid"

Expand Down Expand Up @@ -81,6 +80,9 @@ func TemplateVersionParameter(param database.TemplateVersionParameter) (codersdk
}

func ProvisionerJobStatus(provisionerJob database.ProvisionerJob) codersdk.ProvisionerJobStatus {
// The case where jobs are hung is handled by the unhang package. We can't
// just return Failed here when it's hung because that doesn't reflect in
// the database.
switch {
case provisionerJob.CanceledAt.Valid:
if !provisionerJob.CompletedAt.Valid {
Expand All @@ -97,8 +99,6 @@ func ProvisionerJobStatus(provisionerJob database.ProvisionerJob) codersdk.Provi
return codersdk.ProvisionerJobSucceeded
}
return codersdk.ProvisionerJobFailed
case database.Now().Sub(provisionerJob.UpdatedAt) > 30*time.Second:
return codersdk.ProvisionerJobFailed
default:
return codersdk.ProvisionerJobRunning
}
Expand Down
11 changes: 0 additions & 11 deletions coderd/database/db2sdk/db2sdk_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -96,17 +96,6 @@ func TestProvisionerJobStatus(t *testing.T) {
},
status: codersdk.ProvisionerJobFailed,
},
{
name: "not_updated",
job: database.ProvisionerJob{
StartedAt: sql.NullTime{
Time: database.Now().Add(-time.Minute),
Valid: true,
},
UpdatedAt: database.Now().Add(-31 * time.Second),
},
status: codersdk.ProvisionerJobFailed,
},
{
name: "updated",
job: database.ProvisionerJob{
Expand Down
33 changes: 33 additions & 0 deletions coderd/database/dbauthz/dbauthz.go
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,25 @@ var (
Scope: rbac.ScopeAll,
}.WithCachedASTValue()

// See unhanger package.
subjectHangDetector = rbac.Subject{
ID: uuid.Nil.String(),
Roles: rbac.Roles([]rbac.Role{
{
Name: "hangdetector",
DisplayName: "Hang Detector Daemon",
Site: rbac.Permissions(map[string][]rbac.Action{
rbac.ResourceSystem.Type: {rbac.WildcardSymbol},
rbac.ResourceTemplate.Type: {rbac.ActionRead},
rbac.ResourceWorkspace.Type: {rbac.ActionRead, rbac.ActionUpdate},
}),
Org: map[string][]rbac.Permission{},
User: []rbac.Permission{},
},
}),
Scope: rbac.ScopeAll,
}.WithCachedASTValue()

subjectSystemRestricted = rbac.Subject{
ID: uuid.Nil.String(),
Roles: rbac.Roles([]rbac.Role{
Expand Down Expand Up @@ -217,6 +236,12 @@ func AsAutostart(ctx context.Context) context.Context {
return context.WithValue(ctx, authContextKey{}, subjectAutostart)
}

// AsHangDetector returns a context with an actor that has permissions required
// for unhanger.Detector to function.
func AsHangDetector(ctx context.Context) context.Context {
return context.WithValue(ctx, authContextKey{}, subjectHangDetector)
}

// AsSystemRestricted returns a context with an actor that has permissions
// required for various system operations (login, logout, metrics cache).
func AsSystemRestricted(ctx context.Context) context.Context {
Expand Down Expand Up @@ -950,6 +975,14 @@ func (q *querier) GetGroupsByOrganizationID(ctx context.Context, organizationID
return fetchWithPostFilter(q.auth, q.db.GetGroupsByOrganizationID)(ctx, organizationID)
}

// TODO: We need to create a ProvisionerJob resource type
func (q *querier) GetHungProvisionerJobs(ctx context.Context, hungSince time.Time) ([]database.ProvisionerJob, error) {
// if err := q.authorizeContext(ctx, rbac.ActionCreate, rbac.ResourceSystem); err != nil {
// return nil, err
// }
return q.db.GetHungProvisionerJobs(ctx, hungSince)
}

func (q *querier) GetLastUpdateCheck(ctx context.Context) (string, error) {
if err := q.authorizeContext(ctx, rbac.ActionRead, rbac.ResourceSystem); err != nil {
return "", err
Expand Down
15 changes: 14 additions & 1 deletion coderd/database/dbfake/dbfake.go
Original file line number Diff line number Diff line change
Expand Up @@ -1753,6 +1753,19 @@ func (q *fakeQuerier) GetGroupsByOrganizationID(_ context.Context, organizationI
return groups, nil
}

func (q *fakeQuerier) GetHungProvisionerJobs(_ context.Context, hungSince time.Time) ([]database.ProvisionerJob, error) {
q.mutex.RLock()
defer q.mutex.RUnlock()

hungJobs := []database.ProvisionerJob{}
for _, provisionerJob := range q.provisionerJobs {
if provisionerJob.StartedAt.Valid && !provisionerJob.CompletedAt.Valid && provisionerJob.UpdatedAt.Before(hungSince) {
hungJobs = append(hungJobs, provisionerJob)
}
}
return hungJobs, nil
}

func (q *fakeQuerier) GetLastUpdateCheck(_ context.Context) (string, error) {
q.mutex.RLock()
defer q.mutex.RUnlock()
Expand Down Expand Up @@ -2135,7 +2148,7 @@ func (q *fakeQuerier) GetProvisionerLogsAfterID(_ context.Context, arg database.
if jobLog.JobID != arg.JobID {
continue
}
if arg.CreatedAfter != 0 && jobLog.ID < arg.CreatedAfter {
if jobLog.ID <= arg.CreatedAfter {
continue
}
logs = append(logs, jobLog)
Expand Down
7 changes: 7 additions & 0 deletions coderd/database/dbmetrics/dbmetrics.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

15 changes: 15 additions & 0 deletions coderd/database/dbmock/dbmock.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

19 changes: 19 additions & 0 deletions coderd/database/lock.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
package database

import "hash/fnv"

// Well-known lock IDs for lock functions in the database. These should not
// change. If locks are deprecated, they should be kept in this list to avoid
// reusing the same ID.
const (
// Keep the unused iota here so we don't need + 1 every time
lockIDUnused = iota
LockIDDeploymentSetup
)

// GenLockID generates a unique and consistent lock ID from a given string.
func GenLockID(name string) int64 {
hash := fnv.New64()
_, _ = hash.Write([]byte(name))
return int64(hash.Sum64())
}
5 changes: 1 addition & 4 deletions coderd/database/querier.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

57 changes: 53 additions & 4 deletions coderd/database/queries.sql.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 0 additions & 4 deletions coderd/database/queries/lock.sql
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,11 @@
--
-- This must be called from within a transaction. The lock will be automatically
-- released when the transaction ends.
--
-- Use database.LockID() to generate a unique lock ID from a string.
SELECT pg_advisory_xact_lock($1);

-- name: TryAcquireLock :one
-- Non blocking lock. Returns true if the lock was acquired, false otherwise.
--
-- This must be called from within a transaction. The lock will be automatically
-- released when the transaction ends.
--
-- Use database.LockID() to generate a unique lock ID from a string.
SELECT pg_try_advisory_xact_lock($1);
Loading