Skip to content

feat: add provisioner job hang detector #7927

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 13 commits into from
Jun 25, 2023
Merged
12 changes: 9 additions & 3 deletions cli/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ import (
"github.com/coder/coder/coderd/schedule"
"github.com/coder/coder/coderd/telemetry"
"github.com/coder/coder/coderd/tracing"
"github.com/coder/coder/coderd/unhanger"
"github.com/coder/coder/coderd/updatecheck"
"github.com/coder/coder/coderd/util/slice"
"github.com/coder/coder/coderd/workspaceapps"
Expand Down Expand Up @@ -892,11 +893,16 @@ func (r *RootCmd) Server(newAPI func(context.Context, *coderd.Options) (*coderd.
return xerrors.Errorf("notify systemd: %w", err)
}

autobuildPoller := time.NewTicker(cfg.AutobuildPollInterval.Value())
defer autobuildPoller.Stop()
autobuildExecutor := executor.New(ctx, options.Database, coderAPI.TemplateScheduleStore, logger, autobuildPoller.C)
autobuildTicker := time.NewTicker(cfg.AutobuildPollInterval.Value())
defer autobuildTicker.Stop()
autobuildExecutor := executor.New(ctx, options.Database, coderAPI.TemplateScheduleStore, logger, autobuildTicker.C)
autobuildExecutor.Run()

hangDetectorTicker := time.NewTicker(cfg.JobHangDetectorInterval.Value())
defer hangDetectorTicker.Stop()
hangDetector := unhanger.New(ctx, options.Database, options.Pubsub, logger, hangDetectorTicker.C)
hangDetector.Run()
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since unhanger has no Close method, nor is Run blocking, we have no way to ensure this service is actually shut down on exit. Ideally we would replace ctx dependence outside New function with a synchronous Close method to ensure cleanup. Alternatively we could defer <-hangDetector.Wait() but then the caller must be careful to cancel the context later in the stack (easier to make mistakes).

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I added a Close function which cancels the context


// Currently there is no way to ask the server to shut
// itself down, so any exit signal will result in a non-zero
// exit of the server.
Expand Down
3 changes: 3 additions & 0 deletions cli/testdata/server-config.yaml.golden
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,9 @@ networking:
# Interval to poll for scheduled workspace builds.
# (default: 1m0s, type: duration)
autobuildPollInterval: 1m0s
# Interval to poll for hung jobs and automatically terminate them.
# (default: 1m0s, type: duration)
jobHangDetectorInterval: 1m0s
introspection:
prometheus:
# Serve prometheus metrics on the address defined by prometheus address.
Expand Down
3 changes: 3 additions & 0 deletions coderd/apidoc/docs.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 3 additions & 0 deletions coderd/apidoc/swagger.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 6 additions & 0 deletions coderd/coderdtest/coderdtest.go
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ import (
"github.com/coder/coder/coderd/rbac"
"github.com/coder/coder/coderd/schedule"
"github.com/coder/coder/coderd/telemetry"
"github.com/coder/coder/coderd/unhanger"
"github.com/coder/coder/coderd/updatecheck"
"github.com/coder/coder/coderd/util/ptr"
"github.com/coder/coder/coderd/workspaceapps"
Expand Down Expand Up @@ -250,6 +251,11 @@ func NewOptions(t testing.TB, options *Options) (func(http.Handler), context.Can
).WithStatsChannel(options.AutobuildStats)
lifecycleExecutor.Run()

hangDetectorTicker := time.NewTicker(options.DeploymentValues.JobHangDetectorInterval.Value())
defer hangDetectorTicker.Stop()
hangDetector := unhanger.New(ctx, options.Database, options.Pubsub, slogtest.Make(t, nil).Named("unhanger.detector"), hangDetectorTicker.C)
hangDetector.Run()

var mutex sync.RWMutex
var handler http.Handler
srv := httptest.NewUnstartedServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
Expand Down
6 changes: 3 additions & 3 deletions coderd/database/db2sdk/db2sdk.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@ package db2sdk

import (
"encoding/json"
"time"

"github.com/coder/coder/coderd/database"
"github.com/coder/coder/coderd/parameter"
Expand Down Expand Up @@ -78,6 +77,9 @@ func TemplateVersionParameter(param database.TemplateVersionParameter) (codersdk
}

func ProvisionerJobStatus(provisionerJob database.ProvisionerJob) codersdk.ProvisionerJobStatus {
// The case where jobs are hung is handled by the unhang package. We can't
// just return Failed here when it's hung because that doesn't reflect in
// the database.
switch {
case provisionerJob.CanceledAt.Valid:
if !provisionerJob.CompletedAt.Valid {
Expand All @@ -94,8 +96,6 @@ func ProvisionerJobStatus(provisionerJob database.ProvisionerJob) codersdk.Provi
return codersdk.ProvisionerJobSucceeded
}
return codersdk.ProvisionerJobFailed
case database.Now().Sub(provisionerJob.UpdatedAt) > 30*time.Second:
return codersdk.ProvisionerJobFailed
default:
return codersdk.ProvisionerJobRunning
}
Expand Down
11 changes: 0 additions & 11 deletions coderd/database/db2sdk/db2sdk_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -96,17 +96,6 @@ func TestProvisionerJobStatus(t *testing.T) {
},
status: codersdk.ProvisionerJobFailed,
},
{
name: "not_updated",
job: database.ProvisionerJob{
StartedAt: sql.NullTime{
Time: database.Now().Add(-time.Minute),
Valid: true,
},
UpdatedAt: database.Now().Add(-31 * time.Second),
},
status: codersdk.ProvisionerJobFailed,
},
{
name: "updated",
job: database.ProvisionerJob{
Expand Down
25 changes: 25 additions & 0 deletions coderd/database/dbauthz/dbauthz.go
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,25 @@ var (
Scope: rbac.ScopeAll,
}.WithCachedASTValue()

// See unhanger package.
subjectHangDetector = rbac.Subject{
ID: uuid.Nil.String(),
Roles: rbac.Roles([]rbac.Role{
{
Name: "hangdetector",
DisplayName: "Hang Detector Daemon",
Site: rbac.Permissions(map[string][]rbac.Action{
rbac.ResourceSystem.Type: {rbac.WildcardSymbol},
rbac.ResourceTemplate.Type: {rbac.ActionRead},
rbac.ResourceWorkspace.Type: {rbac.ActionRead, rbac.ActionUpdate},
}),
Org: map[string][]rbac.Permission{},
User: []rbac.Permission{},
},
}),
Scope: rbac.ScopeAll,
}.WithCachedASTValue()

subjectSystemRestricted = rbac.Subject{
ID: uuid.Nil.String(),
Roles: rbac.Roles([]rbac.Role{
Expand Down Expand Up @@ -213,6 +232,12 @@ func AsAutostart(ctx context.Context) context.Context {
return context.WithValue(ctx, authContextKey{}, subjectAutostart)
}

// AsHangDetector returns a context with an actor that has permissions required
// for unhanger.Detector to function.
func AsHangDetector(ctx context.Context) context.Context {
return context.WithValue(ctx, authContextKey{}, subjectHangDetector)
}

// AsSystemRestricted returns a context with an actor that has permissions
// required for various system operations (login, logout, metrics cache).
func AsSystemRestricted(ctx context.Context) context.Context {
Expand Down
8 changes: 8 additions & 0 deletions coderd/database/dbauthz/system.go
Original file line number Diff line number Diff line change
Expand Up @@ -392,6 +392,14 @@ func (q *querier) InsertProvisionerJobLogs(ctx context.Context, arg database.Ins
return q.db.InsertProvisionerJobLogs(ctx, arg)
}

// TODO: We need to create a ProvisionerJob resource type
func (q *querier) GetHungProvisionerJobs(ctx context.Context, hungSince time.Time) ([]database.ProvisionerJob, error) {
// if err := q.authorizeContext(ctx, rbac.ActionCreate, rbac.ResourceSystem); err != nil {
// return nil, err
// }
return q.db.GetHungProvisionerJobs(ctx, hungSince)
}

func (q *querier) InsertWorkspaceAgentStartupLogs(ctx context.Context, arg database.InsertWorkspaceAgentStartupLogsParams) ([]database.WorkspaceAgentStartupLog, error) {
return q.db.InsertWorkspaceAgentStartupLogs(ctx, arg)
}
Expand Down
13 changes: 13 additions & 0 deletions coderd/database/dbfake/databasefake.go
Original file line number Diff line number Diff line change
Expand Up @@ -2561,6 +2561,19 @@ func (q *fakeQuerier) getProvisionerJobByIDNoLock(_ context.Context, id uuid.UUI
return database.ProvisionerJob{}, sql.ErrNoRows
}

func (q *fakeQuerier) GetHungProvisionerJobs(_ context.Context, hungSince time.Time) ([]database.ProvisionerJob, error) {
q.mutex.RLock()
defer q.mutex.RUnlock()

hungJobs := []database.ProvisionerJob{}
for _, provisionerJob := range q.provisionerJobs {
if provisionerJob.StartedAt.Valid && !provisionerJob.CompletedAt.Valid && provisionerJob.UpdatedAt.Before(hungSince) {
hungJobs = append(hungJobs, provisionerJob)
}
}
return hungJobs, nil
}

func (q *fakeQuerier) GetWorkspaceResourceByID(_ context.Context, id uuid.UUID) (database.WorkspaceResource, error) {
q.mutex.RLock()
defer q.mutex.RUnlock()
Expand Down
7 changes: 7 additions & 0 deletions coderd/database/dbmetrics/dbmetrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -1532,3 +1532,10 @@ func (m metricsStore) GetDefaultProxyConfig(ctx context.Context) (database.GetDe
m.queryLatencies.WithLabelValues("GetDefaultProxyConfig").Observe(time.Since(start).Seconds())
return resp, err
}

func (m metricsStore) GetHungProvisionerJobs(ctx context.Context, hungSince time.Time) ([]database.ProvisionerJob, error) {
start := time.Now()
jobs, err := m.s.GetHungProvisionerJobs(ctx, hungSince)
m.queryLatencies.WithLabelValues("GetHungProvisionerJobs").Observe(time.Since(start).Seconds())
return jobs, err
}
15 changes: 15 additions & 0 deletions coderd/database/dbmock/store.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 5 additions & 3 deletions coderd/database/lock.go
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
package database

// Well-known lock IDs for lock functions in the database. These should not
// change. If locks are deprecated, they should be kept to avoid reusing the
// same ID.
// change. If locks are deprecated, they should be kept in this list to avoid
// reusing the same ID.
const (
LockIDDeploymentSetup = iota + 1
lockIDUnused = iota
LockIDDeploymentSetup = iota
LockIDHangDetector = iota
)
5 changes: 1 addition & 4 deletions coderd/database/querier.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

57 changes: 53 additions & 4 deletions coderd/database/queries.sql.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 0 additions & 4 deletions coderd/database/queries/lock.sql
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,11 @@
--
-- This must be called from within a transaction. The lock will be automatically
-- released when the transaction ends.
--
-- Use database.LockID() to generate a unique lock ID from a string.
SELECT pg_advisory_xact_lock($1);

-- name: TryAcquireLock :one
-- Non blocking lock. Returns true if the lock was acquired, false otherwise.
--
-- This must be called from within a transaction. The lock will be automatically
-- released when the transaction ends.
--
-- Use database.LockID() to generate a unique lock ID from a string.
SELECT pg_try_advisory_xact_lock($1);
10 changes: 10 additions & 0 deletions coderd/database/queries/provisionerjobs.sql
Original file line number Diff line number Diff line change
Expand Up @@ -96,3 +96,13 @@ SET
error_code = $5
WHERE
id = $1;

-- name: GetHungProvisionerJobs :many
SELECT
*
FROM
provisioner_jobs
WHERE
updated_at < $1
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Minor semantic sanity check, do we want inclusive or exclusive check here? I do feel like <= would make sense here considering the terminology (hung since).

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think it matters since it's time comparison, so the precision is millisecond (or smaller) and the chance of a collision is tiny (and doesn't have any consequence)

AND started_at IS NOT NULL
AND completed_at IS NULL;
Loading