From 03f0745902f9a224bb190b77acf3a9821586da40 Mon Sep 17 00:00:00 2001 From: Colin Adler Date: Mon, 23 Jan 2023 12:36:28 -0600 Subject: [PATCH 1/4] fix: routinely ping agent websocket to ensure liveness --- agent/agent.go | 3 +++ cli/agent.go | 1 + codersdk/workspaceagents.go | 33 +++++++++++++++++++++++++++++++ provisionerd/provisionerd_test.go | 6 ++---- 4 files changed, 39 insertions(+), 4 deletions(-) diff --git a/agent/agent.go b/agent/agent.go index 47d9c394a86b9..95ef8d713e3fd 100644 --- a/agent/agent.go +++ b/agent/agent.go @@ -430,6 +430,9 @@ func (a *agent) createTailnet(ctx context.Context, derpMap *tailcfg.DERPMap) (_ // runCoordinator runs a coordinator and returns whether a reconnect // should occur. func (a *agent) runCoordinator(ctx context.Context, network *tailnet.Conn) error { + ctx, cancel := context.WithCancel(ctx) + defer cancel() + coordinator, err := a.client.ListenWorkspaceAgent(ctx) if err != nil { return err diff --git a/cli/agent.go b/cli/agent.go index 95744aa340b84..2edc472bd6dc3 100644 --- a/cli/agent.go +++ b/cli/agent.go @@ -83,6 +83,7 @@ func workspaceAgent() *cobra.Command { slog.F("version", version), ) client := codersdk.New(coderURL) + client.Logger = logger // Set a reasonable timeout so requests can't hang forever! client.HTTPClient.Timeout = 10 * time.Second diff --git a/codersdk/workspaceagents.go b/codersdk/workspaceagents.go index 93ac907ab445e..5c50b58ae082a 100644 --- a/codersdk/workspaceagents.go +++ b/codersdk/workspaceagents.go @@ -340,6 +340,39 @@ func (c *Client) ListenWorkspaceAgent(ctx context.Context) (net.Conn, error) { return nil, readBodyAsError(res) } + go func() { + tick := 30 * time.Second + ticker := time.NewTicker(tick) + defer ticker.Stop() + defer func() { + c.Logger.Debug(ctx, "coordinate pinger exited") + }() + for { + select { + case <-ctx.Done(): + return + case start := <-ticker.C: + ctx, cancel := context.WithTimeout(ctx, tick) + + err := conn.Ping(ctx) + if err != nil { + c.Logger.Error(ctx, "workspace agent coordinate ping", slog.Error(err)) + + err := conn.Close(websocket.StatusAbnormalClosure, "Ping failed") + if err != nil { + c.Logger.Error(ctx, "close workspace agent coordinate websocket", slog.Error(err)) + } + + cancel() + return + } + + c.Logger.Debug(ctx, "got coordinate pong", slog.F("took", time.Since(start))) + cancel() + } + } + }() + return websocket.NetConn(ctx, conn, websocket.MessageBinary), nil } diff --git a/provisionerd/provisionerd_test.go b/provisionerd/provisionerd_test.go index d65dceb581713..44884800fa47b 100644 --- a/provisionerd/provisionerd_test.go +++ b/provisionerd/provisionerd_test.go @@ -12,9 +12,6 @@ import ( "testing" "time" - "github.com/coder/coder/provisionerd/runner" - "github.com/coder/coder/testutil" - "github.com/hashicorp/yamux" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" @@ -26,11 +23,12 @@ import ( "cdr.dev/slog" "cdr.dev/slog/sloggers/slogtest" - "github.com/coder/coder/provisionerd" "github.com/coder/coder/provisionerd/proto" + "github.com/coder/coder/provisionerd/runner" "github.com/coder/coder/provisionersdk" sdkproto "github.com/coder/coder/provisionersdk/proto" + "github.com/coder/coder/testutil" ) func TestMain(m *testing.M) { From 4f01023d9f9a9719a955abffb77008c3ca4e7b11 Mon Sep 17 00:00:00 2001 From: Colin Adler Date: Mon, 23 Jan 2023 13:06:28 -0600 Subject: [PATCH 2/4] add comment --- codersdk/workspaceagents.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/codersdk/workspaceagents.go b/codersdk/workspaceagents.go index 5c50b58ae082a..ac0d01f617933 100644 --- a/codersdk/workspaceagents.go +++ b/codersdk/workspaceagents.go @@ -340,6 +340,8 @@ func (c *Client) ListenWorkspaceAgent(ctx context.Context) (net.Conn, error) { return nil, readBodyAsError(res) } + // Ping once every 30 seconds to ensure that the websocket is alive. If we + // don't get a response within 30s we kill the websocket and reconnect. go func() { tick := 30 * time.Second ticker := time.NewTicker(tick) From a56c9f1104905ef6de7e077ebd7be2fdd200980c Mon Sep 17 00:00:00 2001 From: Colin Adler Date: Mon, 23 Jan 2023 13:10:11 -0600 Subject: [PATCH 3/4] fixup! add comment --- codersdk/workspaceagents.go | 1 + 1 file changed, 1 insertion(+) diff --git a/codersdk/workspaceagents.go b/codersdk/workspaceagents.go index ac0d01f617933..6352ad74f11a5 100644 --- a/codersdk/workspaceagents.go +++ b/codersdk/workspaceagents.go @@ -342,6 +342,7 @@ func (c *Client) ListenWorkspaceAgent(ctx context.Context) (net.Conn, error) { // Ping once every 30 seconds to ensure that the websocket is alive. If we // don't get a response within 30s we kill the websocket and reconnect. + // See: https://github.com/coder/coder/pull/5824 go func() { tick := 30 * time.Second ticker := time.NewTicker(tick) From a72fc72ca36394de406fbae8107d509ba40ff9a9 Mon Sep 17 00:00:00 2001 From: Colin Adler Date: Mon, 23 Jan 2023 13:52:40 -0600 Subject: [PATCH 4/4] use `websocket.StatusGoingAway` --- codersdk/workspaceagents.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/codersdk/workspaceagents.go b/codersdk/workspaceagents.go index 6352ad74f11a5..5f033bddb367c 100644 --- a/codersdk/workspaceagents.go +++ b/codersdk/workspaceagents.go @@ -361,7 +361,7 @@ func (c *Client) ListenWorkspaceAgent(ctx context.Context) (net.Conn, error) { if err != nil { c.Logger.Error(ctx, "workspace agent coordinate ping", slog.Error(err)) - err := conn.Close(websocket.StatusAbnormalClosure, "Ping failed") + err := conn.Close(websocket.StatusGoingAway, "Ping failed") if err != nil { c.Logger.Error(ctx, "close workspace agent coordinate websocket", slog.Error(err)) }