feat: add provisioner job hang detector #7927
@@ -49,21 +49,23 @@ func (s *server) Provision(stream proto.DRPCProvisioner_ProvisionStream) error {
 	ctx, cancel := context.WithCancel(ctx)
 	defer cancel()

-	// Create a separate context for forcefull cancellation not tied to
+	// Create a separate context for forceful cancellation not tied to
 	// the stream so that we can control when to terminate the process.
 	killCtx, kill := context.WithCancel(context.Background())
 	defer kill()

 	// Ensure processes are eventually cleaned up on graceful
 	// cancellation or disconnect.
 	go func() {
-		<-stream.Context().Done()
+		<-ctx.Done()
Review comment on the <-ctx.Done() change:

I need to ask a (perhaps stupid) question, since we're relying on exit/hang timeouts being predictable. Do we have any way to ensure that this specific context is cancelled in the event that heartbeats or updates are hanging, failing, or timing out? Let's say network conditions are such that the stream doesn't die and this stream context remains open, but provisioner heartbeats to coderd are not coming through (perhaps stream writes simply hang). Or let's say it takes 3 minutes longer for this context to be cancelled than the hang detector expects. We would then be waiting 3 + 3 minutes, and thus could still be canceling (SIGINT) the terraform apply for a minute after the job is marked as terminated.

Reply:

I added a 30-second timeout to updates, and failed heartbeats will cause the stream context to be canceled, which should result in graceful cancellation starting immediately.
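The reply above refers to bounding each update send with its own deadline, so a hung stream write fails fast instead of keeping the stream context alive. A minimal sketch of that pattern, with an illustrative sendUpdate callback and the 30-second figure taken from the reply (the actual call sites and names in the PR may differ):

```go
package main

import (
	"context"
	"errors"
	"fmt"
	"time"
)

// updateWithTimeout bounds a single update send with a 30-second deadline so
// a hanging write returns an error instead of blocking forever. sendUpdate is
// a placeholder for the real update call, not the PR's actual API.
func updateWithTimeout(ctx context.Context, sendUpdate func(context.Context) error) error {
	ctx, cancel := context.WithTimeout(ctx, 30*time.Second)
	defer cancel()
	return sendUpdate(ctx)
}

func main() {
	// Simulate a hung write: it only returns once its context expires.
	hungSend := func(ctx context.Context) error {
		<-ctx.Done()
		return ctx.Err()
	}
	err := updateWithTimeout(context.Background(), hungSend)
	fmt.Println(errors.Is(err, context.DeadlineExceeded)) // true, after ~30s
}
```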
 		// TODO(mafredri): We should track this provision request as
 		// part of graceful server shutdown procedure. Waiting on a
 		// process here should delay provisioner/coder shutdown.
+		t := time.NewTimer(s.exitTimeout)
+		defer t.Stop()
 		select {
-		case <-time.After(s.exitTimeout):
+		case <-t.C:
 			kill()
 		case <-killCtx.Done():
 		}
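The hunk also swaps time.After for an explicit time.NewTimer with a deferred Stop: a timer created by time.After cannot be stopped and (in older Go releases) is not collected until it fires, so a long exitTimeout could keep it alive well after killCtx is done, whereas the explicit timer is released as soon as the goroutine is finished with it. A minimal, self-contained sketch of the graceful-then-forceful cancellation pattern the hunk implements (runProcess, exitTimeout, and the durations are placeholders, not the coder codebase):

```go
package main

import (
	"context"
	"fmt"
	"time"
)

// runProcess waits for graceful cancellation via ctx, gives the process
// exitTimeout to exit on its own, then force-kills it via killCtx. Names and
// durations are illustrative, not taken from the PR.
func runProcess(ctx context.Context, exitTimeout time.Duration) {
	killCtx, kill := context.WithCancel(context.Background())
	defer kill()

	go func() {
		// Graceful cancellation: stream closed, job canceled, shutdown, ...
		<-ctx.Done()

		// Stoppable timer instead of time.After, so it is released as soon
		// as killCtx is done rather than lingering until it fires.
		t := time.NewTimer(exitTimeout)
		defer t.Stop()
		select {
		case <-t.C:
			kill() // the process did not exit in time; terminate it
		case <-killCtx.Done():
		}
	}()

	// Stand-in for starting the process and waiting on it, bounded by killCtx.
	<-killCtx.Done()
	fmt.Println("process terminated:", killCtx.Err())
}

func main() {
	ctx, cancel := context.WithCancel(context.Background())
	go func() {
		time.Sleep(100 * time.Millisecond)
		cancel() // simulate graceful cancellation of the job
	}()
	runProcess(ctx, 200*time.Millisecond)
}
```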