From c0836086e611822ac89842180357c3bdab6466ab Mon Sep 17 00:00:00 2001 From: Mathias Fredriksson Date: Mon, 30 Jan 2023 10:10:27 +0000 Subject: [PATCH 1/2] feat(agent): Handle signals and shutdown gracefully This change allows the agent to handle common shutdown signals like interrupt, hangup and terminate and initiate a graceful shutdown. As long as terraform providers initiate graceful shutdowns via the aforementioned signals, things like SSH connections will be closed immediately on shutdown instead of being left hanging/timing out due to the agent being abruptly killed. Refs: #4677, #5901 --- cli/agent.go | 40 ++++++++++++++++++++++++++++++---------- 1 file changed, 30 insertions(+), 10 deletions(-) diff --git a/cli/agent.go b/cli/agent.go index fe334c2c2de3f..da010617ebe39 100644 --- a/cli/agent.go +++ b/cli/agent.go @@ -7,6 +7,7 @@ import ( "net/http/pprof" "net/url" "os" + "os/signal" "path/filepath" "runtime" "time" @@ -35,12 +36,10 @@ func workspaceAgent() *cobra.Command { Use: "agent", // This command isn't useful to manually execute. Hidden: true, - RunE: func(cmd *cobra.Command, args []string) error { + RunE: func(cmd *cobra.Command, _ []string) error { ctx, cancel := context.WithCancel(cmd.Context()) defer cancel() - go dumpHandler(ctx) - rawURL, err := cmd.Flags().GetString(varAgentURL) if err != nil { return xerrors.Errorf("CODER_AGENT_URL must be set: %w", err) @@ -50,18 +49,18 @@ func workspaceAgent() *cobra.Command { return xerrors.Errorf("parse %q: %w", rawURL, err) } - logWriter := &lumberjack.Logger{ - Filename: filepath.Join(os.TempDir(), "coder-agent.log"), - MaxSize: 5, // MB - } - defer logWriter.Close() - logger := slog.Make(sloghuman.Sink(cmd.ErrOrStderr()), sloghuman.Sink(logWriter)).Leveled(slog.LevelDebug) - isLinux := runtime.GOOS == "linux" // Spawn a reaper so that we don't accumulate a ton // of zombie processes. if reaper.IsInitProcess() && !noReap && isLinux { + logWriter := &lumberjack.Logger{ + Filename: filepath.Join(os.TempDir(), "coder-agent-init.log"), + MaxSize: 5, // MB + } + defer logWriter.Close() + logger := slog.Make(sloghuman.Sink(cmd.ErrOrStderr()), sloghuman.Sink(logWriter)).Leveled(slog.LevelDebug) + logger.Info(ctx, "spawning reaper process") // Do not start a reaper on the child process. It's important // to do this else we fork bomb ourselves. @@ -76,6 +75,27 @@ func workspaceAgent() *cobra.Command { return nil } + // Handle interrupt signals to allow for graceful shutdown, + // note that calling stopNotify disables the signal handler + // and the next interrupt will terminate the program. + // + // Note that we don't want to handle these signals in the + // process that runs as PID 1, that's why we do this after + // the reaper forked. + ctx, stopNotify := signal.NotifyContext(ctx, InterruptSignals...) + defer stopNotify() + + // dumpHandler does signal handling, so we call it after the + // reaper. + go dumpHandler(ctx) + + logWriter := &lumberjack.Logger{ + Filename: filepath.Join(os.TempDir(), "coder-agent.log"), + MaxSize: 5, // MB + } + defer logWriter.Close() + logger := slog.Make(sloghuman.Sink(cmd.ErrOrStderr()), sloghuman.Sink(logWriter)).Leveled(slog.LevelDebug) + version := buildinfo.Version() logger.Info(ctx, "starting agent", slog.F("url", coderURL), From e125637aba3e77b8d45f695797d411634a078e50 Mon Sep 17 00:00:00 2001 From: Mathias Fredriksson Date: Mon, 30 Jan 2023 10:19:53 +0000 Subject: [PATCH 2/2] fixup! feat(agent): Handle signals and shutdown gracefully --- cli/agent.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cli/agent.go b/cli/agent.go index da010617ebe39..89a103464029d 100644 --- a/cli/agent.go +++ b/cli/agent.go @@ -77,7 +77,8 @@ func workspaceAgent() *cobra.Command { // Handle interrupt signals to allow for graceful shutdown, // note that calling stopNotify disables the signal handler - // and the next interrupt will terminate the program. + // and the next interrupt will terminate the program (you + // probably want cancel instead). // // Note that we don't want to handle these signals in the // process that runs as PID 1, that's why we do this after