Skip to content

Commit e72a2ad

Browse files
authored
feat: add SIGQUIT/SIGTRAP handler for the CLI (#5665)
1 parent 69241d0 commit e72a2ad

File tree

3 files changed

+102
-5
lines changed

3 files changed

+102
-5
lines changed

cli/agent.go

+2
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,8 @@ func workspaceAgent() *cobra.Command {
3939
ctx, cancel := context.WithCancel(cmd.Context())
4040
defer cancel()
4141

42+
go dumpHandler(ctx)
43+
4244
rawURL, err := cmd.Flags().GetString(varAgentURL)
4345
if err != nil {
4446
return xerrors.Errorf("CODER_AGENT_URL must be set: %w", err)

cli/root.go

+93
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,11 @@ import (
88
"net/http"
99
"net/url"
1010
"os"
11+
"os/signal"
12+
"path/filepath"
1113
"runtime"
1214
"strings"
15+
"syscall"
1316
"text/template"
1417
"time"
1518

@@ -631,3 +634,93 @@ func (h *headerTransport) RoundTrip(req *http.Request) (*http.Response, error) {
631634
}
632635
return h.transport.RoundTrip(req)
633636
}
637+
638+
// dumpHandler provides a custom SIGQUIT and SIGTRAP handler that dumps the
// stacktrace of all goroutines to stderr and to a well-known file in the
// user's home directory (or the OS temp directory when the home directory
// cannot be determined). This is useful for debugging deadlock issues that may
// occur in production in workspaces, since the default Go runtime will only
// dump to stderr (which is often difficult/impossible to read in a workspace).
//
// The dump file is named "coder-agent-<timestamp>.dump", where the timestamp
// is the current time in UTC.
//
// SIGQUITs will still cause the program to exit (similarly to the default Go
// runtime behavior); the process exits with status 1 once the dump has been
// attempted. SIGTRAPs only dump and keep the handler running.
//
// A SIGQUIT handler will not be registered if GOTRACEBACK=crash.
//
// On Windows this immediately returns. On other platforms it blocks until ctx
// is done, so it is meant to be run in its own goroutine.
func dumpHandler(ctx context.Context) {
	if runtime.GOOS == "windows" {
		// free up the goroutine since it'll be permanently blocked anyways
		return
	}

	listenSignals := []os.Signal{syscall.SIGTRAP}
	if os.Getenv("GOTRACEBACK") != "crash" {
		listenSignals = append(listenSignals, syscall.SIGQUIT)
	}

	sigs := make(chan os.Signal, 1)
	signal.Notify(sigs, listenSignals...)
	defer signal.Stop(sigs)

	for {
		var sig os.Signal
		select {
		case <-ctx.Done():
			return
		case sig = <-sigs:
		}

		sigStr := ""
		switch sig {
		case syscall.SIGQUIT:
			sigStr = "SIGQUIT"
		case syscall.SIGTRAP:
			sigStr = "SIGTRAP"
		}

		stack := captureGoroutineStacks()
		_, _ = fmt.Fprintf(os.Stderr, "%s:\n%s\n", sigStr, stack)

		// Writing the file is best-effort: failures are reported on stderr
		// and must never prevent the SIGQUIT exit below.
		writeStackDumpFile(stack)

		if sigStr == "SIGQUIT" {
			//nolint:revive
			os.Exit(1)
		}
	}
}

// captureGoroutineStacks returns the stack traces of all goroutines. It starts
// with a 1MB buffer and keeps doubling it until the entire trace fits, giving
// up at 64MB, in which case the tail of the trace is overwritten with a
// message saying that it was truncated.
func captureGoroutineStacks() []byte {
	const (
		initialSize  = 1_000_000
		maxSize      = 64_000_000
		truncatedMsg = "\n\n\nstack trace truncated due to size\n"
	)

	buf := make([]byte, initialSize)
	for {
		n := runtime.Stack(buf, true)
		if n < len(buf) {
			return buf[:n]
		}
		// runtime.Stack filled the whole buffer, so the trace may be cut off.
		if 2*len(buf) > maxSize {
			copy(buf[len(buf)-len(truncatedMsg):], truncatedMsg)
			return buf[:n]
		}
		buf = make([]byte, 2*len(buf))
	}
}

// writeStackDumpFile writes stack to a timestamped "coder-agent-*.dump" file
// in the user's home directory, falling back to the OS temp directory. The
// target path and any failure are reported on stderr; nothing is returned
// because the caller treats the file as best-effort.
func writeStackDumpFile(stack []byte) {
	dir, err := os.UserHomeDir()
	if err != nil {
		dir = os.TempDir()
	}

	// The layout ends in a literal "Z" (it is not a zone placeholder), so the
	// time has to be converted to UTC for the file name to be truthful.
	stamp := time.Now().UTC().Format("2006-01-02T15:04:05.000Z")
	fpath := filepath.Join(dir, fmt.Sprintf("coder-agent-%s.dump", stamp))
	_, _ = fmt.Fprintf(os.Stderr, "writing dump to %q\n", fpath)

	f, err := os.Create(fpath)
	if err != nil {
		_, _ = fmt.Fprintf(os.Stderr, "failed to open dump file: %v\n", err.Error())
		return
	}

	_, err = f.Write(stack)
	// Always close the file, and surface a close failure when the write
	// itself succeeded: buffered write errors can show up only on close.
	if closeErr := f.Close(); err == nil {
		err = closeErr
	}
	if err != nil {
		_, _ = fmt.Fprintf(os.Stderr, "failed to write dump file: %v\n", err.Error())
	}
}

cli/server.go

+7-5
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,13 @@ func Server(vip *viper.Viper, newAPI func(context.Context, *coderd.Options) (*co
8181
Use: "server",
8282
Short: "Start a Coder server",
8383
RunE: func(cmd *cobra.Command, args []string) error {
84+
// Main command context for managing cancellation of running
85+
// services.
86+
ctx, cancel := context.WithCancel(cmd.Context())
87+
defer cancel()
88+
89+
go dumpHandler(ctx)
90+
8491
cfg, err := deployment.Config(cmd.Flags(), vip)
8592
if err != nil {
8693
return xerrors.Errorf("getting deployment config: %w", err)
@@ -123,11 +130,6 @@ func Server(vip *viper.Viper, newAPI func(context.Context, *coderd.Options) (*co
123130
logger = logger.AppendSinks(tracing.SlogSink{})
124131
}
125132

126-
// Main command context for managing cancellation
127-
// of running services.
128-
ctx, cancel := context.WithCancel(cmd.Context())
129-
defer cancel()
130-
131133
// Register signals early on so that graceful shutdown can't
132134
// be interrupted by additional signals. Note that we avoid
133135
// shadowing cancel() (from above) here because notifyStop()

0 commit comments

Comments
 (0)