From b6741b1ff167cb024240400ece59afd838d42d6a Mon Sep 17 00:00:00 2001 From: Dean Sheather Date: Wed, 11 Jan 2023 00:23:08 +0000 Subject: [PATCH 1/4] feat: add SIGQUIT handler for the CLI --- cmd/coder/main.go | 47 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/cmd/coder/main.go b/cmd/coder/main.go index 177b3a469a21c..08908994a565c 100644 --- a/cmd/coder/main.go +++ b/cmd/coder/main.go @@ -1,10 +1,15 @@ package main import ( + "context" "errors" "fmt" "math/rand" "os" + "os/signal" + "path/filepath" + "runtime" + "syscall" "time" _ "time/tzdata" @@ -15,6 +20,48 @@ import ( func main() { rand.Seed(time.Now().UnixMicro()) + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + // Add a custom SIGQUIT handler that outputs to stderr and a well-known file + // in the home directory. This also prevents SIGQUITs from killing the CLI. + go func() { + sigs := make(chan os.Signal, 1) + signal.Notify(sigs, syscall.SIGQUIT) + for { + select { + case <-ctx.Done(): + return + case <-sigs: + } + + buf := make([]byte, 10_000_000) + stacklen := runtime.Stack(buf, true) + + _, _ = fmt.Fprintf(os.Stderr, "SIGQUIT:\n%s\n", buf[:stacklen]) + + // Write to a well-known file. 
+ dir, err := os.UserHomeDir() + if err != nil { + dir = os.TempDir() + } + fpath := filepath.Join(dir, fmt.Sprintf("coder-agent-%s.dump", time.Now().Format("2006-01-02T15:04:05.000Z"))) + _, _ = fmt.Fprintf(os.Stderr, "writing dump to %q\n", fpath) + + f, err := os.Create(fpath) + if err != nil { + _, _ = fmt.Fprintf(os.Stderr, "failed to open dump file: %v\n", err.Error()) + continue + } + _, err = f.Write(buf[:stacklen]) + _ = f.Close() + if err != nil { + _, _ = fmt.Fprintf(os.Stderr, "failed to open dump file: %v\n", err.Error()) + continue + } + } + }() + cmd, err := cli.Root(cli.AGPL()).ExecuteC() if err != nil { if errors.Is(err, cliui.Canceled) { From e026aa3a7670d435ff0c558e343eabd2f5d980a5 Mon Sep 17 00:00:00 2001 From: Dean Sheather Date: Wed, 11 Jan 2023 13:52:41 +0000 Subject: [PATCH 2/4] fixup! feat: add SIGQUIT handler for the CLI --- cli/agent.go | 2 ++ cli/root.go | 91 +++++++++++++++++++++++++++++++++++++++++++++++ cli/server.go | 12 ++++--- cmd/coder/main.go | 47 ------------------------ 4 files changed, 100 insertions(+), 52 deletions(-) diff --git a/cli/agent.go b/cli/agent.go index cbe62c77a6b4e..baf55eccea222 100644 --- a/cli/agent.go +++ b/cli/agent.go @@ -39,6 +39,8 @@ func workspaceAgent() *cobra.Command { ctx, cancel := context.WithCancel(cmd.Context()) defer cancel() + go dumpHandler(ctx) + rawURL, err := cmd.Flags().GetString(varAgentURL) if err != nil { return xerrors.Errorf("CODER_AGENT_URL must be set: %w", err) diff --git a/cli/root.go b/cli/root.go index 190bb35ac8a63..6728253a02aa4 100644 --- a/cli/root.go +++ b/cli/root.go @@ -8,8 +8,11 @@ import ( "net/http" "net/url" "os" + "os/signal" + "path/filepath" "runtime" "strings" + "syscall" "text/template" "time" @@ -631,3 +634,91 @@ func (h *headerTransport) RoundTrip(req *http.Request) (*http.Response, error) { } return h.transport.RoundTrip(req) } + +// dumpHandler provides a custom SIGQUIT and SIGTRAP handler that dumps the +// stacktrace of all goroutines to stderr and a 
well-known file in the home +// directory. This is useful for debugging deadlock issues that may occur in +// production in workspaces, since the default Go runtime will only dump to +// stderr (which is often difficult/impossible to read in a workspace). +// +// SIGQUITs will still cause the program to exit (similarly to the default Go +// runtime behavior). +// +// A SIGQUIT handler will not be registered if GOTRACEBACK=crash. +// +// On Windows this immediately returns. +func dumpHandler(ctx context.Context) { + if runtime.GOOS == "windows" { + // free up the goroutine since it'll be permanently blocked anyways + return + } + + listenSignals := []os.Signal{syscall.SIGTRAP} + if os.Getenv("GOTRACEBACK") != "crash" { + listenSignals = append(listenSignals, syscall.SIGQUIT) + } + + sigs := make(chan os.Signal, 1) + signal.Notify(sigs, listenSignals...) + for { + sigStr := "" + select { + case <-ctx.Done(): + return + case sig := <-sigs: + switch sig { + case syscall.SIGQUIT: + sigStr = "SIGQUIT" + case syscall.SIGTRAP: + sigStr = "SIGTRAP" + } + } + + // Start with a 1MB buffer and keep doubling it until we can fit the + // entire stacktrace, stopping early once we reach 64MB. + buf := make([]byte, 1_000_000) + stacklen := 0 + for { + stacklen = runtime.Stack(buf, true) + if stacklen < len(buf) { + break + } + if 2*len(buf) > 64_000_000 { + // Write a message to the end of the buffer saying that it was + // truncated. + const truncatedMsg = "\n\n\nstack trace truncated due to size\n" + copy(buf[len(buf)-len(truncatedMsg):], truncatedMsg) + break + } + buf = make([]byte, 2*len(buf)) + } + + _, _ = fmt.Fprintf(os.Stderr, "%s:\n%s\n", sigStr, buf[:stacklen]) + + // Write to a well-known file. 
+ dir, err := os.UserHomeDir() + if err != nil { + dir = os.TempDir() + } + fpath := filepath.Join(dir, fmt.Sprintf("coder-agent-%s.dump", time.Now().Format("2006-01-02T15:04:05.000Z"))) + _, _ = fmt.Fprintf(os.Stderr, "writing dump to %q\n", fpath) + + f, err := os.Create(fpath) + if err != nil { + _, _ = fmt.Fprintf(os.Stderr, "failed to open dump file: %v\n", err.Error()) + goto done + } + _, err = f.Write(buf[:stacklen]) + _ = f.Close() + if err != nil { + _, _ = fmt.Fprintf(os.Stderr, "failed to open dump file: %v\n", err.Error()) + goto done + } + + done: + if sigStr == "SIGQUIT" { + //nolint:revive + os.Exit(1) + } + } +} diff --git a/cli/server.go b/cli/server.go index 2b4e32c17484b..a543f55a0e8c6 100644 --- a/cli/server.go +++ b/cli/server.go @@ -81,6 +81,13 @@ func Server(vip *viper.Viper, newAPI func(context.Context, *coderd.Options) (*co Use: "server", Short: "Start a Coder server", RunE: func(cmd *cobra.Command, args []string) error { + // Main command context for managing cancellation of running + // services. + ctx, cancel := context.WithCancel(cmd.Context()) + defer cancel() + + go dumpHandler(ctx) + cfg, err := deployment.Config(cmd.Flags(), vip) if err != nil { return xerrors.Errorf("getting deployment config: %w", err) @@ -123,11 +130,6 @@ func Server(vip *viper.Viper, newAPI func(context.Context, *coderd.Options) (*co logger = logger.AppendSinks(tracing.SlogSink{}) } - // Main command context for managing cancellation - // of running services. - ctx, cancel := context.WithCancel(cmd.Context()) - defer cancel() - // Register signals early on so that graceful shutdown can't // be interrupted by additional signals. 
Note that we avoid // shadowing cancel() (from above) here because notifyStop() diff --git a/cmd/coder/main.go b/cmd/coder/main.go index 08908994a565c..177b3a469a21c 100644 --- a/cmd/coder/main.go +++ b/cmd/coder/main.go @@ -1,15 +1,10 @@ package main import ( - "context" "errors" "fmt" "math/rand" "os" - "os/signal" - "path/filepath" - "runtime" - "syscall" "time" _ "time/tzdata" @@ -20,48 +15,6 @@ import ( func main() { rand.Seed(time.Now().UnixMicro()) - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - - // Add a custom SIGQUIT handler that outputs to stderr and a well-known file - // in the home directory. This also prevents SIGQUITs from killing the CLI. - go func() { - sigs := make(chan os.Signal, 1) - signal.Notify(sigs, syscall.SIGQUIT) - for { - select { - case <-ctx.Done(): - return - case <-sigs: - } - - buf := make([]byte, 10_000_000) - stacklen := runtime.Stack(buf, true) - - _, _ = fmt.Fprintf(os.Stderr, "SIGQUIT:\n%s\n", buf[:stacklen]) - - // Write to a well-known file. - dir, err := os.UserHomeDir() - if err != nil { - dir = os.TempDir() - } - fpath := filepath.Join(dir, fmt.Sprintf("coder-agent-%s.dump", time.Now().Format("2006-01-02T15:04:05.000Z"))) - _, _ = fmt.Fprintf(os.Stderr, "writing dump to %q\n", fpath) - - f, err := os.Create(fpath) - if err != nil { - _, _ = fmt.Fprintf(os.Stderr, "failed to open dump file: %v\n", err.Error()) - continue - } - _, err = f.Write(buf[:stacklen]) - _ = f.Close() - if err != nil { - _, _ = fmt.Fprintf(os.Stderr, "failed to open dump file: %v\n", err.Error()) - continue - } - } - }() - cmd, err := cli.Root(cli.AGPL()).ExecuteC() if err != nil { if errors.Is(err, cliui.Canceled) { From ac2d645e1a3d833fea4a0ebd5033164ab1124e8a Mon Sep 17 00:00:00 2001 From: Dean Sheather Date: Wed, 11 Jan 2023 13:54:30 +0000 Subject: [PATCH 3/4] fixup! 
feat: add SIGQUIT handler for the CLI --- cli/root.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cli/root.go b/cli/root.go index 6728253a02aa4..6bbc469dbbcc9 100644 --- a/cli/root.go +++ b/cli/root.go @@ -660,6 +660,8 @@ func dumpHandler(ctx context.Context) { sigs := make(chan os.Signal, 1) signal.Notify(sigs, listenSignals...) + defer signal.Stop(sigs) + for { sigStr := "" select { From 1d5e71de61f84b98ff96cba0541e3ec9aa67f5e3 Mon Sep 17 00:00:00 2001 From: Dean Sheather Date: Thu, 12 Jan 2023 02:15:47 +1000 Subject: [PATCH 4/4] Update cli/root.go --- cli/root.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cli/root.go b/cli/root.go index 6bbc469dbbcc9..e7fdab6b91264 100644 --- a/cli/root.go +++ b/cli/root.go @@ -713,7 +713,7 @@ func dumpHandler(ctx context.Context) { _, err = f.Write(buf[:stacklen]) _ = f.Close() if err != nil { - _, _ = fmt.Fprintf(os.Stderr, "failed to open dump file: %v\n", err.Error()) + _, _ = fmt.Fprintf(os.Stderr, "failed to write dump file: %v\n", err.Error()) goto done }