Skip to content

Commit f20cc66

Browse files
authored
fix: give SSH stdio sessions a chance to close before closing netstack (#10815)
Man, graceful shutdown is hard. Even after my changes, we were still hitting a graceful shutdown race: https://github.com/coder/coder/runs/18886842123 The problem was that while we attempt a graceful shutdown at the SSH layer by closing the session for writing, we were not giving it a chance to complete before continuing to tear down the stack of closers, including one that closes the netstack, and thus drop the TCP connection before it closes.
1 parent b25e5dc commit f20cc66

File tree

1 file changed

+20
-2
lines changed

1 file changed

+20
-2
lines changed

cli/ssh.go

+20-2
Original file line numberDiff line numberDiff line change
@@ -228,7 +228,7 @@ func (r *RootCmd) ssh() *clibase.Cmd {
228228
if err != nil {
229229
return xerrors.Errorf("connect SSH: %w", err)
230230
}
231-
copier := &rawSSHCopier{conn: rawSSH, r: inv.Stdin, w: inv.Stdout}
231+
copier := newRawSSHCopier(logger, rawSSH, inv.Stdin, inv.Stdout)
232232
if err = stack.push("rawSSHCopier", copier); err != nil {
233233
return err
234234
}
@@ -853,9 +853,16 @@ type rawSSHCopier struct {
853853
logger slog.Logger
854854
r io.Reader
855855
w io.Writer
856+
857+
done chan struct{}
858+
}
859+
860+
func newRawSSHCopier(logger slog.Logger, conn *gonet.TCPConn, r io.Reader, w io.Writer) *rawSSHCopier {
861+
return &rawSSHCopier{conn: conn, logger: logger, r: r, w: w, done: make(chan struct{})}
856862
}
857863

858864
func (c *rawSSHCopier) copy(wg *sync.WaitGroup) {
865+
defer close(c.done)
859866
logCtx := context.Background()
860867
wg.Add(1)
861868
go func() {
@@ -890,5 +897,16 @@ func (c *rawSSHCopier) copy(wg *sync.WaitGroup) {
890897
}
891898

892899
func (c *rawSSHCopier) Close() error {
893-
return c.conn.CloseWrite()
900+
err := c.conn.CloseWrite()
901+
902+
// give the copy() call a chance to return on a timeout, so that we don't
903+
// continue tearing down and close the underlying netstack before the SSH
904+
// session has a chance to gracefully shut down.
905+
t := time.NewTimer(5 * time.Second)
906+
defer t.Stop()
907+
select {
908+
case <-c.done:
909+
case <-t.C:
910+
}
911+
return err
894912
}

0 commit comments

Comments
 (0)