Skip to content

Commit b96f6b4

Browse files
authored
fix: ensure ssh cleanup happens on cmd error
I noticed in my logs that sometimes `coder ssh` doesn't gracefully disconnect from the coordinator. The cause is the `closerStack` construct we use in that function. It has two paths to start closing things down: 1. explicit `close()` which we do in `defer` 2. context cancellation, which happens if the cli function returns an error sometimes the ssh remote command returns an error, and this triggers context cancellation of the `closerStack`. That is fine in and of itself, but we still want the explicit `close()` to wait until everything is closed before returning, since that's where we do cleanup, including the graceful disconnect. Prior to this fix the `close()` just immediately exits if another goroutine is closing the stack. Here we add a wait until everything is done.
1 parent c8aa99a commit b96f6b4

File tree

2 files changed

+60
-0
lines changed

2 files changed

+60
-0
lines changed

cli/ssh.go

+4
Original file line numberDiff line numberDiff line change
@@ -876,6 +876,7 @@ type closerStack struct {
876876
closed bool
877877
logger slog.Logger
878878
err error
879+
wg sync.WaitGroup
879880
}
880881

881882
func newCloserStack(ctx context.Context, logger slog.Logger) *closerStack {
@@ -893,10 +894,13 @@ func (c *closerStack) close(err error) {
893894
c.Lock()
894895
if c.closed {
895896
c.Unlock()
897+
c.wg.Wait()
896898
return
897899
}
898900
c.closed = true
899901
c.err = err
902+
c.wg.Add(1)
903+
defer c.wg.Done()
900904
c.Unlock()
901905

902906
for i := len(c.closers) - 1; i >= 0; i-- {

cli/ssh_internal_test.go

+56
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ import (
44
"context"
55
"net/url"
66
"testing"
7+
"time"
78

89
"github.com/stretchr/testify/assert"
910
"github.com/stretchr/testify/require"
@@ -127,6 +128,43 @@ func TestCloserStack_PushAfterClose(t *testing.T) {
127128
require.Equal(t, []*fakeCloser{fc1, fc0}, *closes, "should close fc1")
128129
}
129130

131+
func TestCloserStack_CloseAfterContext(t *testing.T) {
132+
t.Parallel()
133+
testCtx := testutil.Context(t, testutil.WaitShort)
134+
ctx, cancel := context.WithCancel(testCtx)
135+
defer cancel()
136+
logger := slogtest.Make(t, &slogtest.Options{IgnoreErrors: true}).Leveled(slog.LevelDebug)
137+
uut := newCloserStack(ctx, logger)
138+
ac := &asyncCloser{
139+
t: t,
140+
ctx: testCtx,
141+
complete: make(chan struct{}),
142+
started: make(chan struct{}),
143+
}
144+
err := uut.push("async", ac)
145+
require.NoError(t, err)
146+
cancel()
147+
testutil.RequireRecvCtx(testCtx, t, ac.started)
148+
149+
closed := make(chan struct{})
150+
go func() {
151+
defer close(closed)
152+
uut.close(nil)
153+
}()
154+
155+
// since the asyncCloser is still waiting, we shouldn't complete uut.close()
156+
select {
157+
case <-time.After(testutil.IntervalFast):
158+
// OK!
159+
case <-closed:
160+
t.Fatal("closed before stack was finished")
161+
}
162+
163+
// complete the asyncCloser
164+
close(ac.complete)
165+
testutil.RequireRecvCtx(testCtx, t, closed)
166+
}
167+
130168
type fakeCloser struct {
131169
closes *[]*fakeCloser
132170
err error
@@ -136,3 +174,21 @@ func (c *fakeCloser) Close() error {
136174
*c.closes = append(*c.closes, c)
137175
return c.err
138176
}
177+
178+
type asyncCloser struct {
179+
t *testing.T
180+
ctx context.Context
181+
started chan struct{}
182+
complete chan struct{}
183+
}
184+
185+
func (c *asyncCloser) Close() error {
186+
close(c.started)
187+
select {
188+
case <-c.ctx.Done():
189+
c.t.Error("timed out")
190+
return c.ctx.Err()
191+
case <-c.complete:
192+
return nil
193+
}
194+
}

0 commit comments

Comments
 (0)