Skip to content

Commit f016901

Browse files
coadler authored and pull[bot] committed
fix: routinely ping agent websocket to ensure liveness (#5824)
1 parent e41a389 commit f016901

File tree

4 files changed

+42
-4
lines changed

4 files changed

+42
-4
lines changed

agent/agent.go

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -430,6 +430,9 @@ func (a *agent) createTailnet(ctx context.Context, derpMap *tailcfg.DERPMap) (_
430430
// runCoordinator runs a coordinator and returns whether a reconnect
431431
// should occur.
432432
func (a *agent) runCoordinator(ctx context.Context, network *tailnet.Conn) error {
433+
ctx, cancel := context.WithCancel(ctx)
434+
defer cancel()
435+
433436
coordinator, err := a.client.ListenWorkspaceAgent(ctx)
434437
if err != nil {
435438
return err

cli/agent.go

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -83,6 +83,7 @@ func workspaceAgent() *cobra.Command {
8383
slog.F("version", version),
8484
)
8585
client := codersdk.New(coderURL)
86+
client.Logger = logger
8687
// Set a reasonable timeout so requests can't hang forever!
8788
client.HTTPClient.Timeout = 10 * time.Second
8889

codersdk/workspaceagents.go

Lines changed: 36 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -340,6 +340,42 @@ func (c *Client) ListenWorkspaceAgent(ctx context.Context) (net.Conn, error) {
340340
return nil, readBodyAsError(res)
341341
}
342342

343+
// Ping once every 30 seconds to ensure that the websocket is alive. If we
344+
// don't get a response within 30s we kill the websocket and reconnect.
345+
// See: https://github.com/coder/coder/pull/5824
346+
go func() {
347+
tick := 30 * time.Second
348+
ticker := time.NewTicker(tick)
349+
defer ticker.Stop()
350+
defer func() {
351+
c.Logger.Debug(ctx, "coordinate pinger exited")
352+
}()
353+
for {
354+
select {
355+
case <-ctx.Done():
356+
return
357+
case start := <-ticker.C:
358+
ctx, cancel := context.WithTimeout(ctx, tick)
359+
360+
err := conn.Ping(ctx)
361+
if err != nil {
362+
c.Logger.Error(ctx, "workspace agent coordinate ping", slog.Error(err))
363+
364+
err := conn.Close(websocket.StatusGoingAway, "Ping failed")
365+
if err != nil {
366+
c.Logger.Error(ctx, "close workspace agent coordinate websocket", slog.Error(err))
367+
}
368+
369+
cancel()
370+
return
371+
}
372+
373+
c.Logger.Debug(ctx, "got coordinate pong", slog.F("took", time.Since(start)))
374+
cancel()
375+
}
376+
}
377+
}()
378+
343379
return websocket.NetConn(ctx, conn, websocket.MessageBinary), nil
344380
}
345381

provisionerd/provisionerd_test.go

Lines changed: 2 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -12,9 +12,6 @@ import (
1212
"testing"
1313
"time"
1414

15-
"github.com/coder/coder/provisionerd/runner"
16-
"github.com/coder/coder/testutil"
17-
1815
"github.com/hashicorp/yamux"
1916
"github.com/stretchr/testify/assert"
2017
"github.com/stretchr/testify/require"
@@ -26,11 +23,12 @@ import (
2623

2724
"cdr.dev/slog"
2825
"cdr.dev/slog/sloggers/slogtest"
29-
3026
"github.com/coder/coder/provisionerd"
3127
"github.com/coder/coder/provisionerd/proto"
28+
"github.com/coder/coder/provisionerd/runner"
3229
"github.com/coder/coder/provisionersdk"
3330
sdkproto "github.com/coder/coder/provisionersdk/proto"
31+
"github.com/coder/coder/testutil"
3432
)
3533

3634
func TestMain(m *testing.M) {

0 commit comments

Comments
 (0)