From a1bc769ab9ec270cd697a142550ac1a2ac723799 Mon Sep 17 00:00:00 2001 From: Spike Curtis Date: Fri, 5 May 2023 07:56:33 +0000 Subject: [PATCH 1/4] work around websocket deadline bug Signed-off-by: Spike Curtis --- tailnet/coordinator.go | 23 +++++++++++++++- tailnet/coordinator_test.go | 55 +++++++++++++++++++++++++++++++++---- 2 files changed, 71 insertions(+), 7 deletions(-) diff --git a/tailnet/coordinator.go b/tailnet/coordinator.go index 2f11566ded9a1..345a8ef1c03ec 100644 --- a/tailnet/coordinator.go +++ b/tailnet/coordinator.go @@ -206,6 +206,10 @@ func (t *TrackedConn) Close() error { return t.conn.Close() } +// WriteTimeout is the amount of time we wait to write a node update to a connection before we declare it hung. +// It is exported so that tests can use it. +const WriteTimeout = time.Second * 5 + // SendUpdates reads node updates and writes them to the connection. Ends when writes hit an error or context is // canceled. func (t *TrackedConn) SendUpdates() { @@ -223,7 +227,7 @@ func (t *TrackedConn) SendUpdates() { // Set a deadline so that hung connections don't put back pressure on the system. // Node updates are tiny, so even the dinkiest connection can handle them if it's not hung. - err = t.conn.SetWriteDeadline(time.Now().Add(5 * time.Second)) + err = t.conn.SetWriteDeadline(time.Now().Add(WriteTimeout)) if err != nil { // often, this is just because the connection is closed/broken, so only log at debug. t.logger.Debug(t.ctx, "unable to set write deadline", slog.Error(err)) @@ -238,6 +242,23 @@ func (t *TrackedConn) SendUpdates() { return } t.logger.Debug(t.ctx, "wrote nodes", slog.F("nodes", nodes)) + + // nhooyr.io/websocket has a bugged implementation of deadlines on a websocket net.Conn. What they are + // *supposed* to do is set a deadline for any subsequent writes to complete, otherwise the call to Write() + // fails. What nhooyr.io/websocket does is set a timer, after which it expires the websocket write context. + // If this timer fires, then the next write will fail *even if we set a new write deadline*. So, after + // our successful write, it is important that we reset the deadline before it fires. + // + // SetWriteDeadline(0) is *supposed* to cancel the deadline, but again, nhooyr.io/websocket is bugged, and + // interprets 0 as the Golang time epoch, midnight on Jan 1, year 1, some two thousand years in the past. + // So, we set the write deadline an absurd amount of time in the future: 100 years. + err = t.conn.SetWriteDeadline(time.Now().Add(time.Hour * 24 * 365 * 100)) + if err != nil { + // often, this is just because the connection is closed/broken, so only log at debug. + t.logger.Debug(t.ctx, "unable to extend write deadline", slog.Error(err)) + _ = t.Close() + return + } } } } diff --git a/tailnet/coordinator_test.go b/tailnet/coordinator_test.go index 407f5bb2cf767..100d2ed426499 100644 --- a/tailnet/coordinator_test.go +++ b/tailnet/coordinator_test.go @@ -1,11 +1,17 @@ package tailnet_test import ( + "context" "encoding/json" "net" + "net/http" + "net/http/httptest" + "sync/atomic" "testing" "time" + "nhooyr.io/websocket" + "cdr.dev/slog" "cdr.dev/slog/sloggers/slogtest" @@ -74,7 +80,10 @@ func TestCoordinator(t *testing.T) { logger := slogtest.Make(t, nil).Leveled(slog.LevelDebug) coordinator := tailnet.NewCoordinator(logger) - agentWS, agentServerWS := net.Pipe() + // in this test we use real websockets to test use of deadlines + ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitSuperLong) + defer cancel() + agentWS, agentServerWS := websocketConn(ctx, t) defer agentWS.Close() agentNodeChan := make(chan []*tailnet.Node) sendAgentNode, agentErrChan := tailnet.ServeCoordinator(agentWS, func(nodes []*tailnet.Node) error { @@ -93,7 +102,7 @@ func TestCoordinator(t *testing.T) { return coordinator.Node(agentID) != nil }, testutil.WaitShort, testutil.IntervalFast) - clientWS, clientServerWS := net.Pipe() + clientWS, clientServerWS := websocketConn(ctx, t) defer clientWS.Close() defer clientServerWS.Close() clientNodeChan := make(chan []*tailnet.Node) @@ -108,16 +117,28 @@ func TestCoordinator(t *testing.T) { assert.NoError(t, err) close(closeClientChan) }() - agentNodes := <-clientNodeChan - require.Len(t, agentNodes, 1) + select { + case agentNodes := <-clientNodeChan: + require.Len(t, agentNodes, 1) + case <-ctx.Done(): + t.Fatal("timed out") + } sendClientNode(&tailnet.Node{}) clientNodes := <-agentNodeChan require.Len(t, clientNodes, 1) + // wait longer than the internal wait timeout. + // this tests for regression of https://github.com/coder/coder/issues/7428 + time.Sleep(tailnet.WriteTimeout * 3 / 2) + // Ensure an update to the agent node reaches the client! sendAgentNode(&tailnet.Node{}) - agentNodes = <-clientNodeChan - require.Len(t, agentNodes, 1) + select { + case agentNodes := <-clientNodeChan: + require.Len(t, agentNodes, 1) + case <-ctx.Done(): + t.Fatal("timed out") + } // Close the agent WebSocket so a new one can connect. err := agentWS.Close() @@ -334,3 +355,25 @@ func TestCoordinator_AgentUpdateWhileClientConnects(t *testing.T) { require.Len(t, cNodes, 1) require.Equal(t, 1, cNodes[0].PreferredDERP) } + +func websocketConn(ctx context.Context, t *testing.T) (client net.Conn, server net.Conn) { + t.Helper() + accepted := atomic.Bool{} + s := httptest.NewServer(http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) { + // we should only call this handler once + require.False(t, accepted.Load()) + accepted.Store(true) + wss, err := websocket.Accept(rw, r, nil) + require.NoError(t, err) + server = websocket.NetConn(r.Context(), wss, websocket.MessageBinary) + + // hold open until request context canceled + <-r.Context().Done() + })) + t.Cleanup(s.Close) + // nolint: bodyclose + wsc, _, err := websocket.Dial(ctx, s.URL, nil) + require.NoError(t, err) + client = websocket.NetConn(ctx, wsc, websocket.MessageBinary) + return client, server +} From f145b3de1a8e0266b1f2558d32f5c090bbd2695e Mon Sep 17 00:00:00 2001 From: Spike Curtis Date: Fri, 5 May 2023 08:59:38 +0000 Subject: [PATCH 2/4] Use test context to hold websocket open Signed-off-by: Spike Curtis --- tailnet/coordinator_test.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tailnet/coordinator_test.go b/tailnet/coordinator_test.go index 100d2ed426499..a389233ed9bbe 100644 --- a/tailnet/coordinator_test.go +++ b/tailnet/coordinator_test.go @@ -367,8 +367,8 @@ func websocketConn(ctx context.Context, t *testing.T) (client net.Conn, server n require.NoError(t, err) server = websocket.NetConn(r.Context(), wss, websocket.MessageBinary) - // hold open until request context canceled - <-r.Context().Done() + // hold open until context canceled + <-ctx.Done() })) t.Cleanup(s.Close) // nolint: bodyclose From 411b5c9ae3ef39b7ab41cbe8283e7ef2d771d809 Mon Sep 17 00:00:00 2001 From: Spike Curtis Date: Fri, 5 May 2023 09:19:07 +0000 Subject: [PATCH 3/4] Fix race creating test websocket Signed-off-by: Spike Curtis --- tailnet/coordinator_test.go | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tailnet/coordinator_test.go b/tailnet/coordinator_test.go index a389233ed9bbe..94c6f6da58341 100644 --- a/tailnet/coordinator_test.go +++ b/tailnet/coordinator_test.go @@ -6,7 +6,6 @@ import ( "net" "net/http" "net/http/httptest" - "sync/atomic" "testing" "time" @@ -358,14 +357,13 @@ func TestCoordinator_AgentUpdateWhileClientConnects(t *testing.T) { func websocketConn(ctx context.Context, t *testing.T) (client net.Conn, server net.Conn) { t.Helper() - accepted := atomic.Bool{} + sc := make(chan net.Conn, 1) s := httptest.NewServer(http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) { - // we should only call this handler once - require.False(t, accepted.Load()) - accepted.Store(true) wss, err := websocket.Accept(rw, r, nil) require.NoError(t, err) - server = websocket.NetConn(r.Context(), wss, websocket.MessageBinary) + conn := websocket.NetConn(r.Context(), wss, websocket.MessageBinary) + sc <- conn + close(sc) // there can be only one // hold open until context canceled <-ctx.Done() @@ -375,5 +373,7 @@ func websocketConn(ctx context.Context, t *testing.T) (client net.Conn, server n wsc, _, err := websocket.Dial(ctx, s.URL, nil) require.NoError(t, err) client = websocket.NetConn(ctx, wsc, websocket.MessageBinary) + server, ok := <-sc + require.True(t, ok) return client, server } From fb8a81f72fb15defa9cc51af175596d9fb943db1 Mon Sep 17 00:00:00 2001 From: Spike Curtis Date: Fri, 5 May 2023 11:36:12 +0000 Subject: [PATCH 4/4] set write deadline to time.Time zero Signed-off-by: Spike Curtis --- tailnet/coordinator.go | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/tailnet/coordinator.go b/tailnet/coordinator.go index 345a8ef1c03ec..5ee49cd194f16 100644 --- a/tailnet/coordinator.go +++ b/tailnet/coordinator.go @@ -248,11 +248,7 @@ func (t *TrackedConn) SendUpdates() { // fails. What nhooyr.io/websocket does is set a timer, after which it expires the websocket write context. // If this timer fires, then the next write will fail *even if we set a new write deadline*. So, after // our successful write, it is important that we reset the deadline before it fires. - // - // SetWriteDeadline(0) is *supposed* to cancel the deadline, but again, nhooyr.io/websocket is bugged, and - // interprets 0 as the Golang time epoch, midnight on Jan 1, year 1, some two thousand years in the past. - // So, we set the write deadline an absurd amount of time in the future: 100 years. - err = t.conn.SetWriteDeadline(time.Now().Add(time.Hour * 24 * 365 * 100)) + err = t.conn.SetWriteDeadline(time.Time{}) if err != nil { // often, this is just because the connection is closed/broken, so only log at debug. t.logger.Debug(t.ctx, "unable to extend write deadline", slog.Error(err))