
Commit 530e8f2

chore: ensure provisioner kills upon failed update
1 parent dfaf836 commit 530e8f2


5 files changed: +77 −20 lines


coderd/unhanger/detector.go

Lines changed: 10 additions & 3 deletions
@@ -19,9 +19,16 @@ import (
 	"github.com/coder/coder/provisionersdk"
 )

-// HungJobDuration is the duration of time since the last update to a job before
-// it is considered hung.
-const HungJobDuration = 5 * time.Minute
+const (
+	// HungJobDuration is the duration of time since the last update to a job
+	// before it is considered hung.
+	HungJobDuration = 5 * time.Minute
+
+	// HungJobExitTimeout is the duration of time that provisioners should allow
+	// for a graceful exit upon cancellation due to failing to send an update to
+	// a job.
+	HungJobExitTimeout = 3 * time.Minute
+)

 // HungJobLogMessages are written to provisioner job logs when a job is hung and
 // terminated.
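
The new HungJobExitTimeout (3 minutes) is deliberately shorter than HungJobDuration (5 minutes), so a provisioner told to exit still has time to clean up and report failure before the detector treats the job as hung. A minimal sketch of a test that would pin this invariant (hypothetical, not part of this commit):

package unhanger_test

import (
	"testing"

	"github.com/coder/coder/coderd/unhanger"
)

// TestHungJobTimeoutOrdering pins the intended ordering of the two constants:
// the graceful-exit window must fit inside the hung-job detection window.
func TestHungJobTimeoutOrdering(t *testing.T) {
	t.Parallel()
	if unhanger.HungJobExitTimeout >= unhanger.HungJobDuration {
		t.Fatalf("HungJobExitTimeout (%s) must be less than HungJobDuration (%s)",
			unhanger.HungJobExitTimeout, unhanger.HungJobDuration)
	}
}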

provisioner/terraform/provision.go

Lines changed: 5 additions & 3 deletions
@@ -49,21 +49,23 @@ func (s *server) Provision(stream proto.DRPCProvisioner_ProvisionStream) error {
 	ctx, cancel := context.WithCancel(ctx)
 	defer cancel()

-	// Create a separate context for forcefull cancellation not tied to
+	// Create a separate context for forceful cancellation not tied to
 	// the stream so that we can control when to terminate the process.
 	killCtx, kill := context.WithCancel(context.Background())
 	defer kill()

 	// Ensure processes are eventually cleaned up on graceful
 	// cancellation or disconnect.
 	go func() {
-		<-stream.Context().Done()
+		<-ctx.Done()

 		// TODO(mafredri): We should track this provision request as
 		// part of graceful server shutdown procedure. Waiting on a
 		// process here should delay provisioner/coder shutdown.
+		t := time.NewTimer(s.exitTimeout)
+		defer t.Stop()
 		select {
-		case <-time.After(s.exitTimeout):
+		case <-t.C:
 			kill()
 		case <-killCtx.Done():
 		}
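
Two behavioral details in this hunk are easy to miss: waiting on ctx.Done() (rather than only the stream context) means an RPC cancel now also starts the exit countdown, and time.NewTimer with a deferred Stop releases the timer as soon as killCtx wins, whereas time.After would keep it alive until it fired. A standalone sketch of the same pattern, with illustrative names rather than the provisioner's real API:

package main

import (
	"context"
	"fmt"
	"time"
)

// waitThenKill waits for graceful cancellation, then gives the process
// exitTimeout to stop on its own before forcing a kill.
func waitThenKill(ctx, killCtx context.Context, kill func(), exitTimeout time.Duration) {
	<-ctx.Done()

	t := time.NewTimer(exitTimeout)
	defer t.Stop() // Released immediately if killCtx wins the race below.
	select {
	case <-t.C:
		kill()
	case <-killCtx.Done():
		// The process already exited or was killed elsewhere; nothing to do.
	}
}

func main() {
	ctx, cancel := context.WithCancel(context.Background())
	killCtx, kill := context.WithCancel(context.Background())

	done := make(chan struct{})
	go func() {
		waitThenKill(ctx, killCtx, func() {
			fmt.Println("exit timeout elapsed, force killing process")
			kill()
		}, 50*time.Millisecond)
		close(done)
	}()

	cancel() // Simulate graceful cancellation (stream closed or RPC cancel).
	<-done
}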

provisioner/terraform/provision_test.go

Lines changed: 51 additions & 2 deletions
@@ -130,8 +130,7 @@ func TestProvision_Cancel(t *testing.T) {
 	require.NoError(t, err)

 	ctx, api := setupProvisioner(t, &provisionerServeOptions{
-		binaryPath:  binPath,
-		exitTimeout: time.Nanosecond,
+		binaryPath: binPath,
 	})

 	response, err := api.Provision(ctx)
@@ -187,6 +186,56 @@ func TestProvision_Cancel(t *testing.T) {
 	}
 }

+func TestProvision_CancelTimeout(t *testing.T) {
+	t.Parallel()
+	if runtime.GOOS == "windows" {
+		t.Skip("This test uses interrupts and is not supported on Windows")
+	}
+
+	dir := t.TempDir()
+	binPath := filepath.Join(dir, "terraform")
+
+	content := "#!/bin/sh\nset -eu\nsleep 15"
+	err := os.WriteFile(binPath, []byte(content), 0o755) //#nosec
+	require.NoError(t, err)
+
+	ctx, api := setupProvisioner(t, &provisionerServeOptions{
+		binaryPath:  binPath,
+		exitTimeout: time.Second,
+	})
+
+	response, err := api.Provision(ctx)
+	require.NoError(t, err)
+	err = response.Send(&proto.Provision_Request{
+		Type: &proto.Provision_Request_Apply{
+			Apply: &proto.Provision_Apply{
+				Config: &proto.Provision_Config{
+					Directory: dir,
+					Metadata:  &proto.Provision_Metadata{},
+				},
+			},
+		},
+	})
+	require.NoError(t, err)
+
+	err = response.Send(&proto.Provision_Request{
+		Type: &proto.Provision_Request_Cancel{
+			Cancel: &proto.Provision_Cancel{},
+		},
+	})
+	require.NoError(t, err)
+
+	for {
+		msg, err := response.Recv()
+		require.NoError(t, err)
+
+		if c := msg.GetComplete(); c != nil {
+			require.Contains(t, c.Error, "killed")
+			break
+		}
+	}
+}
+
 func TestProvision(t *testing.T) {
 	t.Parallel()


provisioner/terraform/serve.go

Lines changed: 8 additions & 10 deletions
@@ -12,13 +12,10 @@ import (
 	"golang.org/x/xerrors"

 	"cdr.dev/slog"
+	"github.com/coder/coder/coderd/unhanger"
 	"github.com/coder/coder/provisionersdk"
 )

-const (
-	defaultExitTimeout = 5 * time.Minute
-)
-
 type ServeOptions struct {
 	*provisionersdk.ServeOptions

@@ -31,14 +28,15 @@ type ServeOptions struct {
 	Tracer trace.Tracer

 	// ExitTimeout defines how long we will wait for a running Terraform
-	// command to exit (cleanly) if the provision was stopped. This only
-	// happens when the command is still running after the provision
-	// stream is closed. If the provision is canceled via RPC, this
-	// timeout will not be used.
+	// command to exit (cleanly) if the provision was stopped. This
+	// happens when the provision is canceled via RPC and when the command is
+	// still running after the provision stream is closed.
 	//
 	// This is a no-op on Windows where the process can't be interrupted.
 	//
-	// Default value: 5 minutes.
+	// Default value: 3 minutes. This value should be kept less than the value
+	// that Coder uses to mark hung jobs as failed, which is 5 minutes (see
+	// unhanger package).
 	ExitTimeout time.Duration
 }

@@ -96,7 +94,7 @@ func Serve(ctx context.Context, options *ServeOptions) error {
 		options.Tracer = trace.NewNoopTracerProvider().Tracer("noop")
 	}
 	if options.ExitTimeout == 0 {
-		options.ExitTimeout = defaultExitTimeout
+		options.ExitTimeout = unhanger.HungJobExitTimeout
 	}
 	return provisionersdk.Serve(ctx, &server{
 		execMut: &sync.Mutex{},
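
With the default now tied to unhanger.HungJobExitTimeout, callers who override ExitTimeout should keep it below the 5-minute hung-job threshold, as the updated doc comment notes. A hedged sketch of such a guard (validateExitTimeout is hypothetical and not part of this commit):

package main

import (
	"fmt"
	"time"

	"github.com/coder/coder/coderd/unhanger"
)

// validateExitTimeout rejects overrides that would outlive the hung-job
// detector, since the job could be marked failed before Terraform is given
// up on. Zero means "use the default" (unhanger.HungJobExitTimeout).
func validateExitTimeout(d time.Duration) error {
	if d == 0 {
		return nil
	}
	if d >= unhanger.HungJobDuration {
		return fmt.Errorf("exit timeout %s must be less than the hung-job threshold %s",
			d, unhanger.HungJobDuration)
	}
	return nil
}

func main() {
	fmt.Println(validateExitTimeout(0))                // <nil> (default applies)
	fmt.Println(validateExitTimeout(10 * time.Minute)) // error
}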

provisionerd/runner/runner.go

Lines changed: 3 additions & 2 deletions
@@ -537,6 +537,7 @@ func (r *Runner) heartbeatRoutine(ctx context.Context) {

 		resp, err := r.sendHeartbeat(ctx)
 		if err != nil {
+			// Calling Fail starts cancellation so the process will exit.
 			err = r.Fail(ctx, r.failedJobf("send periodic update: %s", err))
 			if err != nil {
 				r.logger.Error(ctx, "failed to call FailJob", slog.Error(err))
@@ -547,9 +548,9 @@ func (r *Runner) heartbeatRoutine(ctx context.Context) {
 			ticker.Reset(r.updateInterval)
 			continue
 		}
-		r.logger.Info(ctx, "attempting graceful cancelation")
+		r.logger.Info(ctx, "attempting graceful cancellation")
 		r.Cancel()
-		// Hard-cancel the job after a minute of pending cancelation.
+		// Mark the job as failed after a minute of pending cancellation.
 		timer := time.NewTimer(r.forceCancelInterval)
 		select {
 		case <-timer.C:
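
Taken together with the heartbeat change above, the flow when an update can no longer be sent is: fail the job, request graceful cancellation, and mark the job as failed once forceCancelInterval elapses, while the provisioner's own exit timeout ensures the process is eventually killed. A simplified, self-contained sketch of that escalation (names are illustrative, not the runner's real API):

package main

import (
	"fmt"
	"time"
)

// escalate mirrors the heartbeat failure path in spirit: ask the job to cancel
// gracefully, then mark it failed if it is still running after
// forceCancelInterval.
func escalate(cancel func(), done <-chan struct{}, forceCancelInterval time.Duration) {
	fmt.Println("attempting graceful cancellation")
	cancel()

	timer := time.NewTimer(forceCancelInterval)
	defer timer.Stop()
	select {
	case <-timer.C:
		fmt.Println("cancellation still pending, marking job as failed")
	case <-done:
		fmt.Println("job exited before the force-cancel deadline")
	}
}

func main() {
	done := make(chan struct{})
	// Simulate a provisioner that never acknowledges the cancellation; the
	// failure path fires after the interval elapses.
	escalate(func() {}, done, 100*time.Millisecond)
}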
