Skip to content

Commit 895df54

Browse files
authored
fix: separate signals for passive, active, and forced shutdown (#12358)
* fix: separate signals for passive, active, and forced shutdown `SIGTERM`: Passive shutdown stopping provisioner daemons from accepting new jobs but waiting for existing jobs to successfully complete. `SIGINT` (old existing behavior): Notify provisioner daemons to cancel in-flight jobs, wait 5s for jobs to be exited, then force quit. `SIGKILL`: Untouched from before, will force-quit. * Revert dramatic signal changes * Rename * Fix shutdown behavior for provisioner daemons * Add test for graceful shutdown
1 parent 2c947c1 commit 895df54

18 files changed

+136
-35
lines changed

cli/agent.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -125,7 +125,7 @@ func (r *RootCmd) workspaceAgent() *clibase.Cmd {
125125
args := append(os.Args, "--no-reap")
126126
err := reaper.ForkReap(
127127
reaper.WithExecArgs(args...),
128-
reaper.WithCatchSignals(InterruptSignals...),
128+
reaper.WithCatchSignals(StopSignals...),
129129
)
130130
if err != nil {
131131
logger.Error(ctx, "agent process reaper unable to fork", slog.Error(err))
@@ -144,7 +144,7 @@ func (r *RootCmd) workspaceAgent() *clibase.Cmd {
144144
// Note that we don't want to handle these signals in the
145145
// process that runs as PID 1, that's why we do this after
146146
// the reaper forked.
147-
ctx, stopNotify := inv.SignalNotifyContext(ctx, InterruptSignals...)
147+
ctx, stopNotify := inv.SignalNotifyContext(ctx, StopSignals...)
148148
defer stopNotify()
149149

150150
// DumpHandler does signal handling, so we call it after the

cli/exp_scaletest.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -890,7 +890,7 @@ func (r *RootCmd) scaletestWorkspaceTraffic() *clibase.Cmd {
890890
Handler: func(inv *clibase.Invocation) (err error) {
891891
ctx := inv.Context()
892892

893-
notifyCtx, stop := signal.NotifyContext(ctx, InterruptSignals...) // Checked later.
893+
notifyCtx, stop := signal.NotifyContext(ctx, StopSignals...) // Checked later.
894894
defer stop()
895895
ctx = notifyCtx
896896

cli/externalauth.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@ fi
6565
Handler: func(inv *clibase.Invocation) error {
6666
ctx := inv.Context()
6767

68-
ctx, stop := inv.SignalNotifyContext(ctx, InterruptSignals...)
68+
ctx, stop := inv.SignalNotifyContext(ctx, StopSignals...)
6969
defer stop()
7070

7171
client, err := r.createAgentClient()

cli/gitaskpass.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ func (r *RootCmd) gitAskpass() *clibase.Cmd {
2525
Handler: func(inv *clibase.Invocation) error {
2626
ctx := inv.Context()
2727

28-
ctx, stop := inv.SignalNotifyContext(ctx, InterruptSignals...)
28+
ctx, stop := inv.SignalNotifyContext(ctx, StopSignals...)
2929
defer stop()
3030

3131
user, host, err := gitauth.ParseAskpass(inv.Args[0])

cli/gitssh.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ func (r *RootCmd) gitssh() *clibase.Cmd {
2929

3030
// Catch interrupt signals to ensure the temporary private
3131
// key file is cleaned up on most cases.
32-
ctx, stop := inv.SignalNotifyContext(ctx, InterruptSignals...)
32+
ctx, stop := inv.SignalNotifyContext(ctx, StopSignals...)
3333
defer stop()
3434

3535
// Early check so errors are reported immediately.

cli/server.go

Lines changed: 31 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -337,16 +337,18 @@ func (r *RootCmd) Server(newAPI func(context.Context, *coderd.Options) (*coderd.
337337

338338
// Register signals early on so that graceful shutdown can't
339339
// be interrupted by additional signals. Note that we avoid
340-
// shadowing cancel() (from above) here because notifyStop()
340+
// shadowing cancel() (from above) here because stopCancel()
341341
// restores default behavior for the signals. This protects
342342
// the shutdown sequence from abruptly terminating things
343343
// like: database migrations, provisioner work, workspace
344344
// cleanup in dev-mode, etc.
345345
//
346346
// To get out of a graceful shutdown, the user can send
347347
// SIGQUIT with ctrl+\ or SIGKILL with `kill -9`.
348-
notifyCtx, notifyStop := inv.SignalNotifyContext(ctx, InterruptSignals...)
349-
defer notifyStop()
348+
stopCtx, stopCancel := signalNotifyContext(ctx, inv, StopSignalsNoInterrupt...)
349+
defer stopCancel()
350+
interruptCtx, interruptCancel := signalNotifyContext(ctx, inv, InterruptSignals...)
351+
defer interruptCancel()
350352

351353
cacheDir := vals.CacheDir.String()
352354
err = os.MkdirAll(cacheDir, 0o700)
@@ -1028,13 +1030,18 @@ func (r *RootCmd) Server(newAPI func(context.Context, *coderd.Options) (*coderd.
10281030
hangDetector.Start()
10291031
defer hangDetector.Close()
10301032

1033+
waitForProvisionerJobs := false
10311034
// Currently there is no way to ask the server to shut
10321035
// itself down, so any exit signal will result in a non-zero
10331036
// exit of the server.
10341037
var exitErr error
10351038
select {
1036-
case <-notifyCtx.Done():
1037-
exitErr = notifyCtx.Err()
1039+
case <-stopCtx.Done():
1040+
exitErr = stopCtx.Err()
1041+
waitForProvisionerJobs = true
1042+
_, _ = io.WriteString(inv.Stdout, cliui.Bold("Stop caught, waiting for provisioner jobs to complete and gracefully exiting. Use ctrl+\\ to force quit"))
1043+
case <-interruptCtx.Done():
1044+
exitErr = interruptCtx.Err()
10381045
_, _ = io.WriteString(inv.Stdout, cliui.Bold("Interrupt caught, gracefully exiting. Use ctrl+\\ to force quit"))
10391046
case <-tunnelDone:
10401047
exitErr = xerrors.New("dev tunnel closed unexpectedly")
@@ -1082,7 +1089,16 @@ func (r *RootCmd) Server(newAPI func(context.Context, *coderd.Options) (*coderd.
10821089
defer wg.Done()
10831090

10841091
r.Verbosef(inv, "Shutting down provisioner daemon %d...", id)
1085-
err := shutdownWithTimeout(provisionerDaemon.Shutdown, 5*time.Second)
1092+
timeout := 5 * time.Second
1093+
if waitForProvisionerJobs {
1094+
// It can last for a long time...
1095+
timeout = 30 * time.Minute
1096+
}
1097+
1098+
err := shutdownWithTimeout(func(ctx context.Context) error {
1099+
// We only want to cancel active jobs if we aren't exiting gracefully.
1100+
return provisionerDaemon.Shutdown(ctx, !waitForProvisionerJobs)
1101+
}, timeout)
10861102
if err != nil {
10871103
cliui.Errorf(inv.Stderr, "Failed to shut down provisioner daemon %d: %s\n", id, err)
10881104
return
@@ -2512,3 +2528,12 @@ func escapePostgresURLUserInfo(v string) (string, error) {
25122528

25132529
return v, nil
25142530
}
2531+
2532+
func signalNotifyContext(ctx context.Context, inv *clibase.Invocation, sig ...os.Signal) (context.Context, context.CancelFunc) {
2533+
// On Windows, some of our signal functions lack support.
2534+
// If we pass in no signals, we should just return the context as-is.
2535+
if len(sig) == 0 {
2536+
return context.WithCancel(ctx)
2537+
}
2538+
return inv.SignalNotifyContext(ctx, sig...)
2539+
}

cli/server_createadminuser.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ func (r *RootCmd) newCreateAdminUserCommand() *clibase.Cmd {
4747
logger = logger.Leveled(slog.LevelDebug)
4848
}
4949

50-
ctx, cancel := inv.SignalNotifyContext(ctx, InterruptSignals...)
50+
ctx, cancel := inv.SignalNotifyContext(ctx, StopSignals...)
5151
defer cancel()
5252

5353
if newUserDBURL == "" {

cli/server_test.go

Lines changed: 42 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ import (
2121
"net/url"
2222
"os"
2323
"path/filepath"
24+
"reflect"
2425
"runtime"
2526
"strconv"
2627
"strings"
@@ -1605,7 +1606,7 @@ func TestServer_Production(t *testing.T) {
16051606
}
16061607

16071608
//nolint:tparallel,paralleltest // This test cannot be run in parallel due to signal handling.
1608-
func TestServer_Shutdown(t *testing.T) {
1609+
func TestServer_InterruptShutdown(t *testing.T) {
16091610
t.Skip("This test issues an interrupt signal which will propagate to the test runner.")
16101611

16111612
if runtime.GOOS == "windows" {
@@ -1638,6 +1639,46 @@ func TestServer_Shutdown(t *testing.T) {
16381639
require.NoError(t, err)
16391640
}
16401641

1642+
func TestServer_GracefulShutdown(t *testing.T) {
1643+
t.Parallel()
1644+
if runtime.GOOS == "windows" {
1645+
// Sending interrupt signal isn't supported on Windows!
1646+
t.SkipNow()
1647+
}
1648+
ctx, cancelFunc := context.WithCancel(context.Background())
1649+
defer cancelFunc()
1650+
1651+
root, cfg := clitest.New(t,
1652+
"server",
1653+
"--in-memory",
1654+
"--http-address", ":0",
1655+
"--access-url", "http://example.com",
1656+
"--provisioner-daemons", "1",
1657+
"--cache-dir", t.TempDir(),
1658+
)
1659+
var stopFunc context.CancelFunc
1660+
root = root.WithTestSignalNotifyContext(t, func(parent context.Context, signals ...os.Signal) (context.Context, context.CancelFunc) {
1661+
if !reflect.DeepEqual(cli.StopSignalsNoInterrupt, signals) {
1662+
return context.WithCancel(ctx)
1663+
}
1664+
var ctx context.Context
1665+
ctx, stopFunc = context.WithCancel(parent)
1666+
return ctx, stopFunc
1667+
})
1668+
serverErr := make(chan error, 1)
1669+
pty := ptytest.New(t).Attach(root)
1670+
go func() {
1671+
serverErr <- root.WithContext(ctx).Run()
1672+
}()
1673+
_ = waitAccessURL(t, cfg)
1674+
// It's fair to assume `stopFunc` isn't nil here, because the server
1675+
// has started and access URL is propagated.
1676+
stopFunc()
1677+
pty.ExpectMatch("waiting for provisioner jobs to complete")
1678+
err := <-serverErr
1679+
require.NoError(t, err)
1680+
}
1681+
16411682
func BenchmarkServerHelp(b *testing.B) {
16421683
// server --help is a good proxy for measuring the
16431684
// constant overhead of each command.

cli/signal_unix.go

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,23 @@ import (
77
"syscall"
88
)
99

10-
var InterruptSignals = []os.Signal{
10+
// StopSignals is the list of signals that are used for handling
11+
// shutdown behavior.
12+
var StopSignals = []os.Signal{
1113
os.Interrupt,
1214
syscall.SIGTERM,
1315
syscall.SIGHUP,
1416
}
17+
18+
// StopSignals is the list of signals that are used for handling
19+
// graceful shutdown behavior.
20+
var StopSignalsNoInterrupt = []os.Signal{
21+
syscall.SIGTERM,
22+
syscall.SIGHUP,
23+
}
24+
25+
// InterruptSignals is the list of signals that are used for handling
26+
// immediate shutdown behavior.
27+
var InterruptSignals = []os.Signal{
28+
os.Interrupt,
29+
}

cli/signal_windows.go

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,4 +6,12 @@ import (
66
"os"
77
)
88

9-
var InterruptSignals = []os.Signal{os.Interrupt}
9+
var StopSignals = []os.Signal{
10+
os.Interrupt,
11+
}
12+
13+
var StopSignalsNoInterrupt = []os.Signal{}
14+
15+
var InterruptSignals = []os.Signal{
16+
os.Interrupt,
17+
}

0 commit comments

Comments
 (0)