Skip to content

feat: add SIGQUIT/SIGTRAP handler for the CLI #5665

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Jan 11, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions cli/agent.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,8 @@ func workspaceAgent() *cobra.Command {
ctx, cancel := context.WithCancel(cmd.Context())
defer cancel()

go dumpHandler(ctx)

rawURL, err := cmd.Flags().GetString(varAgentURL)
if err != nil {
return xerrors.Errorf("CODER_AGENT_URL must be set: %w", err)
Expand Down
93 changes: 93 additions & 0 deletions cli/root.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,11 @@ import (
"net/http"
"net/url"
"os"
"os/signal"
"path/filepath"
"runtime"
"strings"
"syscall"
"text/template"
"time"

Expand Down Expand Up @@ -631,3 +634,93 @@ func (h *headerTransport) RoundTrip(req *http.Request) (*http.Response, error) {
}
return h.transport.RoundTrip(req)
}

// dumpHandler provides a custom SIGQUIT and SIGTRAP handler that dumps the
// stacktrace of all goroutines to stderr and a well-known file in the home
// directory. This is useful for debugging deadlock issues that may occur in
// production in workspaces, since the default Go runtime will only dump to
// stderr (which is often difficult/impossible to read in a workspace).
//
// SIGQUITs will still cause the program to exit (similarly to the default Go
// runtime behavior).
//
// A SIGQUIT handler will not be registered if GOTRACEBACK=crash.
//
// On Windows this immediately returns.
func dumpHandler(ctx context.Context) {
if runtime.GOOS == "windows" {
// free up the goroutine since it'll be permanently blocked anyways
return
}

listenSignals := []os.Signal{syscall.SIGTRAP}
if os.Getenv("GOTRACEBACK") != "crash" {
listenSignals = append(listenSignals, syscall.SIGQUIT)
}

sigs := make(chan os.Signal, 1)
signal.Notify(sigs, listenSignals...)
defer signal.Stop(sigs)

for {
sigStr := ""
select {
case <-ctx.Done():
return
case sig := <-sigs:
switch sig {
case syscall.SIGQUIT:
sigStr = "SIGQUIT"
case syscall.SIGTRAP:
sigStr = "SIGTRAP"
}
}

// Start with a 1MB buffer and keep doubling it until we can fit the
// entire stacktrace, stopping early once we reach 64MB.
buf := make([]byte, 1_000_000)
stacklen := 0
for {
stacklen = runtime.Stack(buf, true)
if stacklen < len(buf) {
break
}
if 2*len(buf) > 64_000_000 {
// Write a message to the end of the buffer saying that it was
// truncated.
const truncatedMsg = "\n\n\nstack trace truncated due to size\n"
copy(buf[len(buf)-len(truncatedMsg):], truncatedMsg)
break
}
buf = make([]byte, 2*len(buf))
}

_, _ = fmt.Fprintf(os.Stderr, "%s:\n%s\n", sigStr, buf[:stacklen])

// Write to a well-known file.
dir, err := os.UserHomeDir()
if err != nil {
dir = os.TempDir()
}
fpath := filepath.Join(dir, fmt.Sprintf("coder-agent-%s.dump", time.Now().Format("2006-01-02T15:04:05.000Z")))
_, _ = fmt.Fprintf(os.Stderr, "writing dump to %q\n", fpath)

f, err := os.Create(fpath)
if err != nil {
_, _ = fmt.Fprintf(os.Stderr, "failed to open dump file: %v\n", err.Error())
goto done
}
_, err = f.Write(buf[:stacklen])
_ = f.Close()
if err != nil {
_, _ = fmt.Fprintf(os.Stderr, "failed to write dump file: %v\n", err.Error())
goto done
}

done:
if sigStr == "SIGQUIT" {
//nolint:revive
os.Exit(1)
}
}
}
12 changes: 7 additions & 5 deletions cli/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,13 @@ func Server(vip *viper.Viper, newAPI func(context.Context, *coderd.Options) (*co
Use: "server",
Short: "Start a Coder server",
RunE: func(cmd *cobra.Command, args []string) error {
// Main command context for managing cancellation of running
// services.
ctx, cancel := context.WithCancel(cmd.Context())
defer cancel()

go dumpHandler(ctx)

cfg, err := deployment.Config(cmd.Flags(), vip)
if err != nil {
return xerrors.Errorf("getting deployment config: %w", err)
Expand Down Expand Up @@ -123,11 +130,6 @@ func Server(vip *viper.Viper, newAPI func(context.Context, *coderd.Options) (*co
logger = logger.AppendSinks(tracing.SlogSink{})
}

// Main command context for managing cancellation
// of running services.
ctx, cancel := context.WithCancel(cmd.Context())
defer cancel()

// Register signals early on so that graceful shutdown can't
// be interrupted by additional signals. Note that we avoid
// shadowing cancel() (from above) here because notifyStop()
Expand Down