Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next Next commit
fix: separate signals for passive, active, and forced shutdown
`SIGTERM`: Passive shutdown stopping provisioner daemons from accepting new
jobs but waiting for existing jobs to successfully complete.

`SIGINT` (old existing behavior): Notify provisioner daemons to cancel in-flight jobs, wait 5s for jobs to be exited, then force quit.

`SIGKILL`: Untouched from before, will force-quit.
  • Loading branch information
kylecarbs committed Feb 28, 2024
commit 8d160ae4dd1495dd4b7407bd91d14b533b3278a6
37 changes: 31 additions & 6 deletions cli/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -337,16 +337,18 @@ func (r *RootCmd) Server(newAPI func(context.Context, *coderd.Options) (*coderd.

// Register signals early on so that graceful shutdown can't
// be interrupted by additional signals. Note that we avoid
// shadowing cancel() (from above) here because notifyStop()
// shadowing cancel() (from above) here because stopCancel()
// restores default behavior for the signals. This protects
// the shutdown sequence from abruptly terminating things
// like: database migrations, provisioner work, workspace
// cleanup in dev-mode, etc.
//
// To get out of a graceful shutdown, the user can send
// SIGQUIT with ctrl+\ or SIGKILL with `kill -9`.
notifyCtx, notifyStop := inv.SignalNotifyContext(ctx, InterruptSignals...)
defer notifyStop()
stopCtx, stopCancel := signalNotifyContext(ctx, inv, StopSignals...)
defer stopCancel()
interruptCtx, interruptCancel := signalNotifyContext(ctx, inv, InterruptSignals...)
defer interruptCancel()

cacheDir := vals.CacheDir.String()
err = os.MkdirAll(cacheDir, 0o700)
Expand Down Expand Up @@ -1028,13 +1030,18 @@ func (r *RootCmd) Server(newAPI func(context.Context, *coderd.Options) (*coderd.
hangDetector.Start()
defer hangDetector.Close()

graceful := false
// Currently there is no way to ask the server to shut
// itself down, so any exit signal will result in a non-zero
// exit of the server.
var exitErr error
select {
case <-notifyCtx.Done():
exitErr = notifyCtx.Err()
case <-stopCtx.Done():
exitErr = stopCtx.Err()
graceful = true
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I feel both methods are graceful, so perhaps a rename here is appropriate?

Suggested change
graceful = true
waitForProvisionerJobs = true

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think that's reasonable.

_, _ = io.WriteString(inv.Stdout, cliui.Bold("Stop caught, waiting for provisioner jobs to complete and gracefully exiting. Use ctrl+\\ to force quit"))
case <-interruptCtx.Done():
exitErr = interruptCtx.Err()
_, _ = io.WriteString(inv.Stdout, cliui.Bold("Interrupt caught, gracefully exiting. Use ctrl+\\ to force quit"))
case <-tunnelDone:
exitErr = xerrors.New("dev tunnel closed unexpectedly")
Expand Down Expand Up @@ -1082,7 +1089,16 @@ func (r *RootCmd) Server(newAPI func(context.Context, *coderd.Options) (*coderd.
defer wg.Done()

r.Verbosef(inv, "Shutting down provisioner daemon %d...", id)
err := shutdownWithTimeout(provisionerDaemon.Shutdown, 5*time.Second)
timeout := 5 * time.Second
if graceful {
// It can last for a long time...
timeout = 30 * time.Minute
}

err := shutdownWithTimeout(func(ctx context.Context) error {
// We only want to cancel active jobs if we aren't exiting gracefully.
return provisionerDaemon.Shutdown(ctx, !graceful)
}, timeout)
if err != nil {
cliui.Errorf(inv.Stderr, "Failed to shut down provisioner daemon %d: %s\n", id, err)
return
Expand Down Expand Up @@ -2512,3 +2528,12 @@ func escapePostgresURLUserInfo(v string) (string, error) {

return v, nil
}

func signalNotifyContext(ctx context.Context, inv *clibase.Invocation, sig ...os.Signal) (context.Context, context.CancelFunc) {
// On Windows, some of our signal functions lack support.
// If we pass in no signals, we should just return the context as-is.
if len(sig) == 0 {
return context.WithCancel(ctx)
}
return inv.SignalNotifyContext(ctx, sig...)
}
18 changes: 17 additions & 1 deletion cli/signal_unix.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,24 @@ import (
"syscall"
)

// StopSignals are used to gracefully exit.
// An example is exiting provisioner daemons but not canceling
// jobs, allowing a successful and clean exit.
var StopSignals = []os.Signal{
syscall.SIGTERM,
}

// InterruptSignals are used to less gracefully exit.
// An example is canceling a job, waiting for a timeout,
// and then exiting.
var InterruptSignals = []os.Signal{
// SIGINT
os.Interrupt,
syscall.SIGTERM,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a breaking change in all CLI commands that handle signals. They used to handle TERM but no longer do.

syscall.SIGHUP,
}

// KillSignals will force exit.
var KillSignals = []os.Signal{
// SIGKILL
os.Kill,
}
12 changes: 11 additions & 1 deletion cli/signal_windows.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,14 @@ import (
"os"
)

var InterruptSignals = []os.Signal{os.Interrupt}
// Check the UNIX file for the comments on the signals

var StopSignals = []os.Signal{}

var InterruptSignals = []os.Signal{
os.Interrupt,
}

var KillSignals = []os.Signal{
os.Kill,
}
2 changes: 1 addition & 1 deletion coderd/coderdtest/coderdtest.go
Original file line number Diff line number Diff line change
Expand Up @@ -498,7 +498,7 @@ func (c *provisionerdCloser) Close() error {
c.closed = true
ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitShort)
defer cancel()
shutdownErr := c.d.Shutdown(ctx)
shutdownErr := c.d.Shutdown(ctx, true)
closeErr := c.d.Close()
if shutdownErr != nil {
return shutdownErr
Expand Down
2 changes: 1 addition & 1 deletion enterprise/cli/provisionerdaemons.go
Original file line number Diff line number Diff line change
Expand Up @@ -225,7 +225,7 @@ func (r *RootCmd) provisionerDaemonStart() *clibase.Cmd {
cliui.Errorf(inv.Stderr, "Unexpected error, shutting down server: %s\n", exitErr)
}

err = srv.Shutdown(ctx)
err = srv.Shutdown(ctx, false)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Here we're changing the default behavior for interrupt, and since we split interrupt and stop, we're also not handling SIGTERM here anymore.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed

if err != nil {
return xerrors.Errorf("shutdown: %w", err)
}
Expand Down
2 changes: 1 addition & 1 deletion enterprise/coderd/provisionerdaemons_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -440,7 +440,7 @@ func TestProvisionerDaemonServe(t *testing.T) {
build := coderdtest.AwaitWorkspaceBuildJobCompleted(t, client, workspace.LatestBuild.ID)
require.Equal(t, codersdk.WorkspaceStatusRunning, build.Status)

err = pd.Shutdown(ctx)
err = pd.Shutdown(ctx, false)
require.NoError(t, err)
err = terraformServer.Close()
require.NoError(t, err)
Expand Down
9 changes: 6 additions & 3 deletions provisionerd/provisionerd.go
Original file line number Diff line number Diff line change
Expand Up @@ -474,15 +474,18 @@ func (p *Server) isClosed() bool {
}
}

// Shutdown triggers a graceful exit of each registered provisioner.
func (p *Server) Shutdown(ctx context.Context) error {
// Shutdown gracefully exists with the option to cancel the active job.
// If false, it will wait for the job to complete.
//
//nolint:revive
func (p *Server) Shutdown(ctx context.Context, cancelActiveJob bool) error {
p.mutex.Lock()
p.opts.Logger.Info(ctx, "attempting graceful shutdown")
if !p.shuttingDownB {
close(p.shuttingDownCh)
p.shuttingDownB = true
}
if p.activeJob != nil {
if cancelActiveJob && p.activeJob != nil {
p.activeJob.Cancel()
}
p.mutex.Unlock()
Expand Down
12 changes: 6 additions & 6 deletions provisionerd/provisionerd_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -671,7 +671,7 @@ func TestProvisionerd(t *testing.T) {
}),
})
require.Condition(t, closedWithin(updateChan, testutil.WaitShort))
err := server.Shutdown(context.Background())
err := server.Shutdown(context.Background(), false)
require.NoError(t, err)
require.Condition(t, closedWithin(completeChan, testutil.WaitShort))
require.NoError(t, server.Close())
Expand Down Expand Up @@ -762,7 +762,7 @@ func TestProvisionerd(t *testing.T) {
require.Condition(t, closedWithin(completeChan, testutil.WaitShort))
ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitShort)
defer cancel()
require.NoError(t, server.Shutdown(ctx))
require.NoError(t, server.Shutdown(ctx, false))
require.NoError(t, server.Close())
})

Expand Down Expand Up @@ -853,7 +853,7 @@ func TestProvisionerd(t *testing.T) {
require.Condition(t, closedWithin(completeChan, testutil.WaitShort))
ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitShort)
defer cancel()
require.NoError(t, server.Shutdown(ctx))
require.NoError(t, server.Shutdown(ctx, false))
require.NoError(t, server.Close())
})

Expand Down Expand Up @@ -944,7 +944,7 @@ func TestProvisionerd(t *testing.T) {
t.Log("completeChan closed")
ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitShort)
defer cancel()
require.NoError(t, server.Shutdown(ctx))
require.NoError(t, server.Shutdown(ctx, false))
require.NoError(t, server.Close())
})

Expand Down Expand Up @@ -1039,7 +1039,7 @@ func TestProvisionerd(t *testing.T) {
require.Condition(t, closedWithin(completeChan, testutil.WaitShort))
ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitShort)
defer cancel()
require.NoError(t, server.Shutdown(ctx))
require.NoError(t, server.Shutdown(ctx, false))
require.NoError(t, server.Close())
assert.Equal(t, ops[len(ops)-1], "CompleteJob")
assert.Contains(t, ops[0:len(ops)-1], "Log: Cleaning Up | ")
Expand Down Expand Up @@ -1076,7 +1076,7 @@ func createProvisionerd(t *testing.T, dialer provisionerd.Dialer, connector prov
t.Cleanup(func() {
ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitShort)
defer cancel()
_ = server.Shutdown(ctx)
_ = server.Shutdown(ctx, false)
_ = server.Close()
})
return server
Expand Down