Skip to content

Commit 32d5875

Browse files
authored
fix: wait for server tailnet background routines to exit on Close (coder#15183)
Fixes coder/internal#114. We need to wait for the ServerTailnet goroutines to finish when closing down; otherwise we can race with the shutdown of coderd and the coordinator, which causes errors.
1 parent 343f8ec commit 32d5875

File tree

2 files changed

+24
-9
lines changed

2 files changed

+24
-9
lines changed

coderd/tailnet.go

+22-9
Original file line numberDiff line numberDiff line change
@@ -91,13 +91,15 @@ func NewServerTailnet(
9191
})
9292
}
9393

94-
derpMapUpdaterClosed := make(chan struct{})
94+
bgRoutines := &sync.WaitGroup{}
9595
originalDerpMap := derpMapFn()
9696
// it's important to set the DERPRegionDialer above _before_ we set the DERP map so that if
9797
// there is an embedded relay, we use the local in-memory dialer.
9898
conn.SetDERPMap(originalDerpMap)
99+
bgRoutines.Add(1)
99100
go func() {
100-
defer close(derpMapUpdaterClosed)
101+
defer bgRoutines.Done()
102+
defer logger.Debug(ctx, "polling DERPMap exited")
101103

102104
ticker := time.NewTicker(5 * time.Second)
103105
defer ticker.Stop()
@@ -120,7 +122,7 @@ func NewServerTailnet(
120122
tn := &ServerTailnet{
121123
ctx: serverCtx,
122124
cancel: cancel,
123-
derpMapUpdaterClosed: derpMapUpdaterClosed,
125+
bgRoutines: bgRoutines,
124126
logger: logger,
125127
tracer: traceProvider.Tracer(tracing.TracerName),
126128
conn: conn,
@@ -170,8 +172,15 @@ func NewServerTailnet(
170172
// registering the callback also triggers send of the initial node
171173
tn.coordinatee.SetNodeCallback(tn.nodeCallback)
172174

173-
go tn.watchAgentUpdates()
174-
go tn.expireOldAgents()
175+
tn.bgRoutines.Add(2)
176+
go func() {
177+
defer tn.bgRoutines.Done()
178+
tn.watchAgentUpdates()
179+
}()
180+
go func() {
181+
defer tn.bgRoutines.Done()
182+
tn.expireOldAgents()
183+
}()
175184
return tn, nil
176185
}
177186

@@ -204,6 +213,7 @@ func (s *ServerTailnet) Collect(metrics chan<- prometheus.Metric) {
204213
}
205214

206215
func (s *ServerTailnet) expireOldAgents() {
216+
defer s.logger.Debug(s.ctx, "stopped expiring old agents")
207217
const (
208218
tick = 5 * time.Minute
209219
cutoff = 30 * time.Minute
@@ -255,6 +265,7 @@ func (s *ServerTailnet) doExpireOldAgents(cutoff time.Duration) {
255265
}
256266

257267
func (s *ServerTailnet) watchAgentUpdates() {
268+
defer s.logger.Debug(s.ctx, "stopped watching agent updates")
258269
for {
259270
conn := s.getAgentConn()
260271
resp, ok := conn.NextUpdate(s.ctx)
@@ -317,9 +328,9 @@ func (s *ServerTailnet) reinitCoordinator() {
317328
}
318329

319330
type ServerTailnet struct {
320-
ctx context.Context
321-
cancel func()
322-
derpMapUpdaterClosed chan struct{}
331+
ctx context.Context
332+
cancel func()
333+
bgRoutines *sync.WaitGroup
323334

324335
logger slog.Logger
325336
tracer trace.Tracer
@@ -532,10 +543,12 @@ func (c *netConnCloser) Close() error {
532543
}
533544

534545
func (s *ServerTailnet) Close() error {
546+
s.logger.Info(s.ctx, "closing server tailnet")
547+
defer s.logger.Debug(s.ctx, "server tailnet close complete")
535548
s.cancel()
536549
_ = s.conn.Close()
537550
s.transport.CloseIdleConnections()
538-
<-s.derpMapUpdaterClosed
551+
s.bgRoutines.Wait()
539552
return nil
540553
}
541554

enterprise/wsproxy/wsproxy.go

+2
Original file line numberDiff line numberDiff line change
@@ -419,6 +419,8 @@ func (s *Server) RegisterNow() error {
419419
}
420420

421421
func (s *Server) Close() error {
422+
s.Logger.Info(s.ctx, "closing workspace proxy server")
423+
defer s.Logger.Debug(s.ctx, "finished closing workspace proxy server")
422424
s.cancel()
423425

424426
var err error

0 commit comments

Comments
 (0)