
feat: add provisioner job hang detector #7927


Merged: 13 commits, Jun 25, 2023
chore: ensure provisioner kills upon failed update
deansheather committed Jun 9, 2023
commit 530e8f20bea70e9b97330ff5068ebdb8c24cb73c
13 changes: 10 additions & 3 deletions coderd/unhanger/detector.go
@@ -19,9 +19,16 @@ import (
"github.com/coder/coder/provisionersdk"
)

// HungJobDuration is the duration of time since the last update to a job before
// it is considered hung.
const HungJobDuration = 5 * time.Minute
const (
// HungJobDuration is the duration of time since the last update to a job
// before it is considered hung.
HungJobDuration = 5 * time.Minute

// HungJobExitTimeout is the duration of time that provisioners should allow
// for a graceful exit upon cancellation due to failing to send an update to
// a job.
HungJobExitTimeout = 3 * time.Minute
)

// HungJobLogMessages are written to provisioner job logs when a job is hung and
// terminated.
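
These two constants define the timing contract between coderd and the provisioner: a job that has not been updated for HungJobDuration is treated as hung, and a provisioner cancelled because its updates stopped going through gets HungJobExitTimeout to exit cleanly. Below is a minimal sketch of how a detector loop could apply the five-minute threshold; the job type and detectHungJobs helper are hypothetical stand-ins, not the actual unhanger implementation.

package main

import (
	"fmt"
	"time"
)

// hungJobDuration mirrors unhanger.HungJobDuration for this sketch.
const hungJobDuration = 5 * time.Minute

// job is a hypothetical stand-in for a provisioner job row.
type job struct {
	ID        string
	UpdatedAt time.Time
}

// detectHungJobs returns the IDs of jobs whose last update is older than
// hungJobDuration and which should therefore be terminated.
func detectHungJobs(now time.Time, jobs []job) []string {
	var hung []string
	for _, j := range jobs {
		if now.Sub(j.UpdatedAt) >= hungJobDuration {
			hung = append(hung, j.ID)
		}
	}
	return hung
}

func main() {
	now := time.Now()
	jobs := []job{
		{ID: "healthy", UpdatedAt: now.Add(-time.Minute)},
		{ID: "stuck", UpdatedAt: now.Add(-10 * time.Minute)},
	}
	fmt.Println(detectHungJobs(now, jobs)) // prints [stuck]
}
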
8 changes: 5 additions & 3 deletions provisioner/terraform/provision.go
@@ -49,21 +49,23 @@ func (s *server) Provision(stream proto.DRPCProvisioner_ProvisionStream) error {
ctx, cancel := context.WithCancel(ctx)
defer cancel()

// Create a separate context for forcefull cancellation not tied to
// Create a separate context for forceful cancellation not tied to
// the stream so that we can control when to terminate the process.
killCtx, kill := context.WithCancel(context.Background())
defer kill()

// Ensure processes are eventually cleaned up on graceful
// cancellation or disconnect.
go func() {
<-stream.Context().Done()
<-ctx.Done()
Member

I need to ask a (perhaps stupid) question since we're relying on exit/hung timeouts to be predictable.

Do we have any way to ensure that this specific context is cancelled in the event that heartbeats or updates are hanging/failing/timing out? Let's say network conditions are such that the stream doesn't die and this stream context remains open, but provisioner heartbeats to coderd are not coming through (perhaps stream writes simply hang).

Or, let's say it takes 3 minutes longer for this context to be cancelled than what the hang detector is expecting. We would then be waiting 3 + 3 minutes and thus still potentially be canceling (SIGINT) the terraform apply for a minute after the job is marked as terminated.

Member Author

I added a 30-second timeout to updates, and failed heartbeats will cause the stream context to be canceled, which should result in graceful cancellation starting immediately.


// TODO(mafredri): We should track this provision request as
// part of graceful server shutdown procedure. Waiting on a
// process here should delay provisioner/coder shutdown.
t := time.NewTimer(s.exitTimeout)
defer t.Stop()
select {
case <-time.After(s.exitTimeout):
case <-t.C:
kill()
case <-killCtx.Done():
}
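
Two things change in the hunk above: the cleanup goroutine now waits on ctx.Done(), which fires on RPC cancellation as well as stream closure, and the exit deadline uses a stoppable time.Timer instead of time.After so the timer is released once the process exits on its own. A self-contained sketch of the resulting graceful-then-forceful pattern follows; provision and runCommand are illustrative names, not the actual server methods. Per the review thread above, failed updates or heartbeats are expected to cancel the stream context, so this countdown starts as soon as coderd stops acknowledging the job.

package main

import (
	"context"
	"fmt"
	"time"
)

// provision starts a long-running command and kills it if it has not exited
// within exitTimeout after graceful cancellation was requested.
func provision(ctx context.Context, exitTimeout time.Duration) {
	// killCtx is deliberately detached from ctx so we control when to kill.
	killCtx, kill := context.WithCancel(context.Background())
	defer kill()

	go func() {
		<-ctx.Done() // graceful cancellation: RPC cancel or stream closed
		t := time.NewTimer(exitTimeout)
		defer t.Stop()
		select {
		case <-t.C:
			kill() // command did not exit cleanly in time
		case <-killCtx.Done():
		}
	}()

	runCommand(ctx, killCtx)
}

// runCommand stands in for a terraform apply: it starts cleanup when ctx is
// cancelled (the real code sends SIGINT) and stops abruptly when killCtx fires.
func runCommand(ctx, killCtx context.Context) {
	select {
	case <-time.After(10 * time.Second):
		fmt.Println("finished")
		return
	case <-ctx.Done():
		fmt.Println("interrupt received, cleaning up")
	}
	select {
	case <-time.After(10 * time.Second): // cleanup that takes too long
		fmt.Println("cleanup finished")
	case <-killCtx.Done():
		fmt.Println("killed")
	}
}

func main() {
	ctx, cancel := context.WithTimeout(context.Background(), time.Second)
	defer cancel()
	provision(ctx, 2*time.Second) // prints "killed" about 3 seconds in
}
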
53 changes: 51 additions & 2 deletions provisioner/terraform/provision_test.go
@@ -130,8 +130,7 @@ func TestProvision_Cancel(t *testing.T) {
require.NoError(t, err)

ctx, api := setupProvisioner(t, &provisionerServeOptions{
binaryPath: binPath,
exitTimeout: time.Nanosecond,
binaryPath: binPath,
})

response, err := api.Provision(ctx)
@@ -187,6 +186,56 @@
}
}

func TestProvision_CancelTimeout(t *testing.T) {
t.Parallel()
if runtime.GOOS == "windows" {
t.Skip("This test uses interrupts and is not supported on Windows")
}

dir := t.TempDir()
binPath := filepath.Join(dir, "terraform")

content := "#!/bin/sh\nset -eu\nsleep 15"
err := os.WriteFile(binPath, []byte(content), 0o755) //#nosec
require.NoError(t, err)

ctx, api := setupProvisioner(t, &provisionerServeOptions{
binaryPath: binPath,
exitTimeout: time.Second,
})

response, err := api.Provision(ctx)
require.NoError(t, err)
err = response.Send(&proto.Provision_Request{
Type: &proto.Provision_Request_Apply{
Apply: &proto.Provision_Apply{
Config: &proto.Provision_Config{
Directory: dir,
Metadata: &proto.Provision_Metadata{},
},
},
},
})
require.NoError(t, err)

err = response.Send(&proto.Provision_Request{
Type: &proto.Provision_Request_Cancel{
Cancel: &proto.Provision_Cancel{},
},
})
require.NoError(t, err)

for {
msg, err := response.Recv()
require.NoError(t, err)

if c := msg.GetComplete(); c != nil {
require.Contains(t, c.Error, "killed")
break
}
}
}

func TestProvision(t *testing.T) {
t.Parallel()

18 changes: 8 additions & 10 deletions provisioner/terraform/serve.go
@@ -12,13 +12,10 @@ import (
"golang.org/x/xerrors"

"cdr.dev/slog"
"github.com/coder/coder/coderd/unhanger"
"github.com/coder/coder/provisionersdk"
)

const (
defaultExitTimeout = 5 * time.Minute
)

type ServeOptions struct {
*provisionersdk.ServeOptions

@@ -31,14 +28,15 @@ type ServeOptions struct {
Tracer trace.Tracer

// ExitTimeout defines how long we will wait for a running Terraform
// command to exit (cleanly) if the provision was stopped. This only
// happens when the command is still running after the provision
// stream is closed. If the provision is canceled via RPC, this
// timeout will not be used.
// command to exit (cleanly) if the provision was stopped. This
// happens when the provision is canceled via RPC and when the command is
// still running after the provision stream is closed.
//
// This is a no-op on Windows where the process can't be interrupted.
//
// Default value: 5 minutes.
// Default value: 3 minutes. This value should be kept less than the value
// that Coder uses to mark hung jobs as failed, which is 5 minutes (see
// unhanger package).
ExitTimeout time.Duration
}

@@ -96,7 +94,7 @@ func Serve(ctx context.Context, options *ServeOptions) error {
options.Tracer = trace.NewNoopTracerProvider().Tracer("noop")
}
if options.ExitTimeout == 0 {
options.ExitTimeout = defaultExitTimeout
options.ExitTimeout = unhanger.HungJobExitTimeout
}
return provisionersdk.Serve(ctx, &server{
execMut: &sync.Mutex{},
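
Tying the default to unhanger.HungJobExitTimeout keeps the provisioner's graceful-exit window (3 minutes) inside coderd's hang threshold (5 minutes), as the updated doc comment requires. A trivial runnable check of that headroom, with the constants copied locally for illustration:

package main

import (
	"fmt"
	"time"
)

const (
	hungJobDuration    = 5 * time.Minute // unhanger.HungJobDuration
	hungJobExitTimeout = 3 * time.Minute // unhanger.HungJobExitTimeout, the new default ExitTimeout
)

func main() {
	if hungJobExitTimeout >= hungJobDuration {
		fmt.Println("exit timeout must be kept below the hang threshold")
		return
	}
	fmt.Printf("headroom before a silent job is marked hung: %s\n", hungJobDuration-hungJobExitTimeout)
}
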
5 changes: 3 additions & 2 deletions provisionerd/runner/runner.go
@@ -537,6 +537,7 @@ func (r *Runner) heartbeatRoutine(ctx context.Context) {

resp, err := r.sendHeartbeat(ctx)
if err != nil {
// Calling Fail starts cancellation so the process will exit.
err = r.Fail(ctx, r.failedJobf("send periodic update: %s", err))
if err != nil {
r.logger.Error(ctx, "failed to call FailJob", slog.Error(err))
@@ -547,9 +548,9 @@ func (r *Runner) heartbeatRoutine(ctx context.Context) {
ticker.Reset(r.updateInterval)
continue
}
r.logger.Info(ctx, "attempting graceful cancelation")
r.logger.Info(ctx, "attempting graceful cancellation")
r.Cancel()
// Hard-cancel the job after a minute of pending cancelation.
// Mark the job as failed after a minute of pending cancellation.
timer := time.NewTimer(r.forceCancelInterval)
select {
case <-timer.C:
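
The heartbeat changes make the failure path explicit: a failed periodic update now calls Fail, which starts cancellation so the process exits, while the existing cancellation branch waits forceCancelInterval before marking the job as failed rather than hard-cancelling it. A minimal sketch of that second branch, using simplified stand-ins rather than the real Runner type:

package main

import (
	"fmt"
	"time"
)

type runner struct {
	forceCancelInterval time.Duration
	done                chan struct{} // closed when the job winds down gracefully
}

// cancel stands in for Runner.Cancel: it asks the job to stop and lets
// cleanup finish in the background.
func (r *runner) cancel() {
	go func() {
		time.Sleep(500 * time.Millisecond) // pretend cleanup takes a moment
		close(r.done)
	}()
}

// onCancelRequested mirrors the graceful-cancellation flow in heartbeatRoutine.
func (r *runner) onCancelRequested() {
	fmt.Println("attempting graceful cancellation")
	r.cancel()

	timer := time.NewTimer(r.forceCancelInterval)
	defer timer.Stop()
	select {
	case <-timer.C:
		fmt.Println("cancellation timed out; marking the job as failed")
	case <-r.done:
		fmt.Println("job exited gracefully")
	}
}

func main() {
	r := &runner{forceCancelInterval: time.Second, done: make(chan struct{})}
	r.onCancelRequested() // prints "job exited gracefully" after ~500ms
}
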