From df575a000490ee338a95670f68d8be4bef458750 Mon Sep 17 00:00:00 2001 From: Mathias Fredriksson Date: Thu, 4 May 2023 15:31:30 +0000 Subject: [PATCH 1/7] test(tailnet): Add TestTransmitHang test with packet capture and logging This commit adds a test that occasionally triggers the issue in #7262. When triggered, the test takes e.g. ~30s to complete instead of 2s. The test also logs both from both `tailnet.Conn`s into a log file and produces pcap files from each side. --- tailnet/conn.go | 30 +++++++++++++ tailnet/conn_test.go | 105 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 135 insertions(+) diff --git a/tailnet/conn.go b/tailnet/conn.go index 20a00d94d6804..5de3f0d6bfa15 100644 --- a/tailnet/conn.go +++ b/tailnet/conn.go @@ -33,6 +33,7 @@ import ( "tailscale.com/types/netlogtype" "tailscale.com/types/netmap" "tailscale.com/wgengine" + "tailscale.com/wgengine/capture" "tailscale.com/wgengine/filter" "tailscale.com/wgengine/magicsock" "tailscale.com/wgengine/monitor" @@ -308,6 +309,8 @@ type Conn struct { closed chan struct{} logger slog.Logger blockEndpoints bool + capture *capture.Sink + captureCount int dialer *tsdial.Dialer tunDevice *tstun.Wrapper @@ -582,6 +585,10 @@ func (c *Conn) Close() error { default: } close(c.closed) + if c.capture != nil { + c.capture.Close() + c.capture = nil + } c.mutex.Unlock() var wg sync.WaitGroup @@ -840,6 +847,29 @@ func (c *Conn) MagicsockServeHTTPDebug(w http.ResponseWriter, r *http.Request) { c.magicConn.ServeHTTPDebug(w, r) } +func (c *Conn) Capture(w io.Writer) (unregister func()) { + c.mutex.Lock() + defer c.mutex.Unlock() + + if c.capture == nil { + c.capture = capture.New() + c.wireguardEngine.InstallCaptureHook(c.capture.LogPacket) + } + unreg := c.capture.RegisterOutput(w) + c.captureCount++ + return func() { + c.mutex.Lock() + defer c.mutex.Unlock() + unreg() + c.captureCount-- + if c.captureCount == 0 { + c.wireguardEngine.InstallCaptureHook(nil) + _ = c.capture.Close() + c.capture = nil + } + } +} + type listenKey struct { network string host string diff --git a/tailnet/conn_test.go b/tailnet/conn_test.go index 2e19379e6df03..fa490c448f038 100644 --- a/tailnet/conn_test.go +++ b/tailnet/conn_test.go @@ -2,7 +2,11 @@ package tailnet_test import ( "context" + "io" "net/netip" + "os" + "path/filepath" + "strings" "testing" "github.com/stretchr/testify/assert" @@ -10,6 +14,7 @@ import ( "go.uber.org/goleak" "cdr.dev/slog" + "cdr.dev/slog/sloggers/sloghuman" "cdr.dev/slog/sloggers/slogtest" "github.com/coder/coder/tailnet" "github.com/coder/coder/tailnet/tailnettest" @@ -195,3 +200,103 @@ func TestConn_PreferredDERP(t *testing.T) { t.Fatal("timed out waiting for node") } } + +func TestTransmitHang(t *testing.T) { + t.Parallel() + + // Not using t.TempDir() here so that we keep logs afterwards. + captureDir, err := os.MkdirTemp("", "tailnet-test-") + require.NoError(t, err) + + testLog, err := os.Create(filepath.Join(captureDir, "test.log")) + require.NoError(t, err) + defer testLog.Close() + recvCapture, err := os.Create(filepath.Join(captureDir, "recv.pcap")) + require.NoError(t, err) + defer recvCapture.Close() + sendCapture, err := os.Create(filepath.Join(captureDir, "send.pcap")) + require.NoError(t, err) + defer sendCapture.Close() + + logger := slogtest.Make(t, nil). + Leveled(slog.LevelDebug). + AppendSinks(sloghuman.Sink(testLog)) + + t.Logf("test log file: %v", testLog.Name()) + t.Logf("recv capture file: %v", recvCapture.Name()) + t.Logf("send capture file: %v", sendCapture.Name()) + + derpMap := tailnettest.RunDERPAndSTUN(t) + updateNodes := func(c *tailnet.Conn) func(*tailnet.Node) { + return func(node *tailnet.Node) { + err := c.UpdateNodes([]*tailnet.Node{node}, false) + assert.NoError(t, err) + } + } + + recvIP := tailnet.IP() + recv, err := tailnet.NewConn(&tailnet.Options{ + Addresses: []netip.Prefix{netip.PrefixFrom(recvIP, 128)}, + Logger: logger.Named("recv"), + DERPMap: derpMap, + }) + require.NoError(t, err) + defer recv.Close() + recvCaptureStop := recv.Capture(recvCapture) + defer recvCaptureStop() + + send, err := tailnet.NewConn(&tailnet.Options{ + Addresses: []netip.Prefix{netip.PrefixFrom(tailnet.IP(), 128)}, + Logger: logger.Named("send"), + DERPMap: derpMap, + }) + require.NoError(t, err) + defer send.Close() + sendCaptureStop := send.Capture(sendCapture) + defer sendCaptureStop() + + recv.SetNodeCallback(updateNodes(send)) + send.SetNodeCallback(updateNodes(recv)) + + ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitLong) + defer cancel() + + require.True(t, send.AwaitReachable(ctx, recvIP)) + + copyDone := make(chan struct{}) + go func() { + defer close(copyDone) + + ln, err := recv.Listen("tcp", ":35565") + if !assert.NoError(t, err) { + return + } + defer ln.Close() + + r, err := ln.Accept() + if !assert.NoError(t, err) { + return + } + defer r.Close() + + _, err = io.Copy(io.Discard, r) + assert.NoError(t, err) + }() + + w, err := send.DialContextTCP(ctx, netip.AddrPortFrom(recvIP, 35565)) + require.NoError(t, err) + + payload := []byte(strings.Repeat("hello world\n", 65536/12)) + size := 0 + for i := 0; i < 1024*2; i++ { + logger.Debug(ctx, "write payload", slog.F("num", i), slog.F("transmitted_kb", size/1024)) + n, err := w.Write(payload) + require.NoError(t, err) + size += n + } + + err = w.Close() + require.NoError(t, err) + + <-copyDone +} From 2573049603838b9b12538b3009c5c3f366312cc2 Mon Sep 17 00:00:00 2001 From: Mathias Fredriksson Date: Wed, 24 May 2023 17:34:09 +0000 Subject: [PATCH 2/7] Add timeout --- tailnet/conn_test.go | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tailnet/conn_test.go b/tailnet/conn_test.go index fa490c448f038..ac41562946811 100644 --- a/tailnet/conn_test.go +++ b/tailnet/conn_test.go @@ -8,6 +8,7 @@ import ( "path/filepath" "strings" "testing" + "time" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" @@ -286,6 +287,8 @@ func TestTransmitHang(t *testing.T) { w, err := send.DialContextTCP(ctx, netip.AddrPortFrom(recvIP, 35565)) require.NoError(t, err) + now := time.Now() + payload := []byte(strings.Repeat("hello world\n", 65536/12)) size := 0 for i := 0; i < 1024*2; i++ { @@ -299,4 +302,8 @@ func TestTransmitHang(t *testing.T) { require.NoError(t, err) <-copyDone + + if time.Since(now) > 10*time.Second { + t.Fatal("took too long to transmit") + } } From c03611cf9dfcc119b46dc1ba7814d026ef512939 Mon Sep 17 00:00:00 2001 From: Mathias Fredriksson Date: Thu, 25 May 2023 11:51:27 +0000 Subject: [PATCH 3/7] Update gvisor and tailscale --- go.mod | 4 ++-- go.sum | 12 ++++++------ 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/go.mod b/go.mod index f1b1e58f75831..de450a77622f4 100644 --- a/go.mod +++ b/go.mod @@ -36,7 +36,7 @@ replace github.com/dlclark/regexp2 => github.com/dlclark/regexp2 v1.7.0 // There are a few minor changes we make to Tailscale that we're slowly upstreaming. Compare here: // https://github.com/tailscale/tailscale/compare/main...coder:tailscale:main -replace tailscale.com => github.com/coder/tailscale v0.0.0-20230522123520-74712221d00f +replace tailscale.com => github.com/coder/tailscale v1.1.1-0.20230525114851-c9cd3cef997c // Switch to our fork that imports fixes from http://github.com/tailscale/ssh. // See: https://github.com/coder/coder/issues/3371 @@ -173,7 +173,7 @@ require ( google.golang.org/protobuf v1.30.0 gopkg.in/natefinch/lumberjack.v2 v2.2.1 gopkg.in/yaml.v3 v3.0.1 - gvisor.dev/gvisor v0.0.0-20221203005347-703fd9b7fbc0 + gvisor.dev/gvisor v0.0.0-20230504175454-7b0a1988a28f nhooyr.io/websocket v1.8.7 storj.io/drpc v0.0.33-0.20230420154621-9716137f6037 tailscale.com v1.32.3 diff --git a/go.sum b/go.sum index b39045d6fd8a7..1054edf804858 100644 --- a/go.sum +++ b/go.sum @@ -298,8 +298,8 @@ github.com/cilium/ebpf v0.2.0/go.mod h1:To2CFviqOWL/M0gIMsvSMlqe7em/l1ALkX1PyjrX github.com/cilium/ebpf v0.4.0/go.mod h1:4tRaxcgiL706VnOzHOdBlY8IEAIdxINsQBcU4xJJXRs= github.com/cilium/ebpf v0.6.2/go.mod h1:4tRaxcgiL706VnOzHOdBlY8IEAIdxINsQBcU4xJJXRs= github.com/cilium/ebpf v0.7.0/go.mod h1:/oI2+1shJiTGAMgl6/RgJr36Eo1jzrRcAWbcXO2usCA= -github.com/cilium/ebpf v0.8.1 h1:bLSSEbBLqGPXxls55pGr5qWZaTqcmfDJHhou7t254ao= github.com/cilium/ebpf v0.8.1/go.mod h1:f5zLIM0FSNuAkSyLAN7X+Hy6yznlF1mNiWUMfxMtrgk= +github.com/cilium/ebpf v0.9.3 h1:5KtxXZU+scyERvkJMEm16TbScVvuuMrlhPly78ZMbSc= github.com/clbanning/mxj/v2 v2.5.7 h1:7q5lvUpaPF/WOkqgIDiwjBJaznaLCCBd78pi8ZyAnE0= github.com/clbanning/mxj/v2 v2.5.7/go.mod h1:hNiWqW14h+kc+MdF9C6/YoRfjEJoR3ou6tn/Qo+ve2s= github.com/cli/safeexec v1.0.0 h1:0VngyaIyqACHdcMNWfo6+KdUYnqEr2Sg+bSP1pdF+dI= @@ -334,8 +334,8 @@ github.com/coder/retry v1.3.1-0.20230210155434-e90a2e1e091d h1:09JG37IgTB6n3ouX9 github.com/coder/retry v1.3.1-0.20230210155434-e90a2e1e091d/go.mod h1:r+1J5i/989wt6CUeNSuvFKKA9hHuKKPMxdzDbTuvwwk= github.com/coder/ssh v0.0.0-20230421140225-04bb837133e1 h1:LBw76rEDuhNJyohve11mbvYv5CmCLmcuUQGiz7Guk50= github.com/coder/ssh v0.0.0-20230421140225-04bb837133e1/go.mod h1:ZSS+CUoKHDrqVakTfTWUlKSr9MtMFkC4UvtQKD7O914= -github.com/coder/tailscale v0.0.0-20230522123520-74712221d00f h1:F0Xr1d8h8dAHn7tab1HXuzYFkcjmCydnEfdMbkOhlVk= -github.com/coder/tailscale v0.0.0-20230522123520-74712221d00f/go.mod h1:jpg+77g19FpXL43U1VoIqoSg1K/Vh5CVxycGldQ8KhA= +github.com/coder/tailscale v1.1.1-0.20230525114851-c9cd3cef997c h1:0mo9brqvL9RP+qaqGpw+QDOnV+rQ3jwfaZrDOlOWVdk= +github.com/coder/tailscale v1.1.1-0.20230525114851-c9cd3cef997c/go.mod h1:Yv5VkPOCBociNBjE6CjPpld+t0P+Xu4dE1wOcfVYebg= github.com/coder/terraform-provider-coder v0.6.23 h1:O2Rcj0umez4DfVdGnKZi63z1Xzxd0IQOn9VQDB8YU8g= github.com/coder/terraform-provider-coder v0.6.23/go.mod h1:UIfU3bYNeSzJJvHyJ30tEKjD6Z9utloI+HUM/7n94CY= github.com/coder/wgtunnel v0.1.5 h1:WP3sCj/3iJ34eKvpMQEp1oJHvm24RYh0NHbj1kfUKfs= @@ -2475,8 +2475,8 @@ gotest.tools/v3 v3.0.2/go.mod h1:3SzNCllyD9/Y+b5r9JIKQ474KzkZyqLqEfYqMsX94Bk= gotest.tools/v3 v3.0.3/go.mod h1:Z7Lb0S5l+klDB31fvDQX8ss/FlKDxtlFlw3Oa8Ymbl8= gotest.tools/v3 v3.1.0/go.mod h1:fHy7eyTmJFO5bQbUsEGQ1v4m2J3Jz9eWL54TP2/ZuYQ= gotest.tools/v3 v3.4.0 h1:ZazjZUfuVeZGLAmlKKuyv3IKP5orXcwtOwDQH6YVr6o= -gvisor.dev/gvisor v0.0.0-20221203005347-703fd9b7fbc0 h1:Wobr37noukisGxpKo5jAsLREcpj61RxrWYzD8uwveOY= -gvisor.dev/gvisor v0.0.0-20221203005347-703fd9b7fbc0/go.mod h1:Dn5idtptoW1dIos9U6A2rpebLs/MtTwFacjKb8jLdQA= +gvisor.dev/gvisor v0.0.0-20230504175454-7b0a1988a28f h1:8GE2MRjGiFmfpon8dekPI08jEuNMQzSffVHgdupcO4E= +gvisor.dev/gvisor v0.0.0-20230504175454-7b0a1988a28f/go.mod h1:pzr6sy8gDLfVmDAg8OYrlKvGEHw5C3PGTiBXBTCx76Q= honnef.co/go/tools v0.0.0-20190102054323-c2f93a96b099/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= honnef.co/go/tools v0.0.0-20190106161140-3f1c8253044a/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= honnef.co/go/tools v0.0.0-20190418001031-e561f6794a2a/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= @@ -2484,7 +2484,7 @@ honnef.co/go/tools v0.0.0-20190523083050-ea95bdfd59fc/go.mod h1:rf3lG4BRIbNafJWh honnef.co/go/tools v0.0.1-2019.2.3/go.mod h1:a3bituU0lyd329TUQxRnasdCoJDkEUEAqEt0JzvZhAg= honnef.co/go/tools v0.0.1-2020.1.3/go.mod h1:X/FiERA/W4tHapMX5mGpAtMSVEeEUOyHaw9vFzvIQ3k= honnef.co/go/tools v0.0.1-2020.1.4/go.mod h1:X/FiERA/W4tHapMX5mGpAtMSVEeEUOyHaw9vFzvIQ3k= -honnef.co/go/tools v0.4.0-0.dev.0.20230130122044-c30b15588105 h1:2OzOQ+1scFmv2dt7x+wNxgikA/Rn2qKrvc/CJCVuAJM= +honnef.co/go/tools v0.4.2 h1:6qXr+R5w+ktL5UkwEbPp+fEvfyoMPche6GkOpGHZcLc= howett.net/plist v1.0.0 h1:7CrbWYbPPO/PyNy38b2EB/+gYbjCe2DXBxgtOOZbSQM= howett.net/plist v1.0.0/go.mod h1:lqaXoTrLY4hg8tnEzNru53gicrbv7rrk+2xJA/7hw9g= inet.af/peercred v0.0.0-20210906144145-0893ea02156a h1:qdkS8Q5/i10xU2ArJMKYhVa1DORzBfYS/qA2UK2jheg= From 57025f5318c2a68b66a7e536fc014737da7056ee Mon Sep 17 00:00:00 2001 From: Mathias Fredriksson Date: Thu, 25 May 2023 12:08:43 +0000 Subject: [PATCH 4/7] Refactor test-cases to run until the issue is triggered --- tailnet/conn_test.go | 85 +++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 81 insertions(+), 4 deletions(-) diff --git a/tailnet/conn_test.go b/tailnet/conn_test.go index ac41562946811..96b1bc3fb4c7d 100644 --- a/tailnet/conn_test.go +++ b/tailnet/conn_test.go @@ -2,17 +2,21 @@ package tailnet_test import ( "context" + "fmt" "io" "net/netip" "os" "path/filepath" + "runtime" "strings" + "sync/atomic" "testing" "time" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" "go.uber.org/goleak" + "tailscale.com/envknob" "cdr.dev/slog" "cdr.dev/slog/sloggers/sloghuman" @@ -202,13 +206,66 @@ func TestConn_PreferredDERP(t *testing.T) { } } -func TestTransmitHang(t *testing.T) { - t.Parallel() +//nolint:paralleltest +func TestTransmitHang_TCPSACK_Disabled(t *testing.T) { + runTestTransmitHangMain(t) +} + +//nolint:paralleltest +func TestTransmitHang_TCPSACK_Enabled(t *testing.T) { + t.Setenv("TS_DEBUG_NETSTACK_ENABLE_TCPSACK", "true") // Restore after test. + envknob.Setenv("TS_DEBUG_NETSTACK_ENABLE_TCPSACK", "true") + runTestTransmitHangMain(t) +} + +func runTestTransmitHangMain(t *testing.T) { + curMaxProcs := runtime.GOMAXPROCS(0) + t.Cleanup(func() { + runtime.GOMAXPROCS(curMaxProcs) + }) + for _, env := range []string{"TS_DEBUG_DISCO", "TS_DEBUG_DERP", "TS_DEBUG_NETSTACK"} { + t.Setenv(env, "true") // Restore after test. + envknob.Setenv(env, "true") + } + + var failed atomic.Bool + for !failed.Load() { + for _, maxProcs := range []int{2, 3, curMaxProcs} { + if failed.Load() { + return + } + t.Run(fmt.Sprintf("GOMAXPROCS=%d", maxProcs), func(t *testing.T) { + t.Logf("GOMAXPROCS=%d", maxProcs) + runtime.GOMAXPROCS(maxProcs) + + for i := 0; i < 5; i++ { + t.Run(fmt.Sprintf("n=%d", i), func(t *testing.T) { + t.Parallel() + t.Cleanup(func() { + if t.Failed() { + failed.Store(true) + } + }) + runTestTransmitHang(t, 20*time.Second) + }) + } + }) + } + } +} +func runTestTransmitHang(t *testing.T, timeout time.Duration) { // Not using t.TempDir() here so that we keep logs afterwards. captureDir, err := os.MkdirTemp("", "tailnet-test-") require.NoError(t, err) + t.Cleanup(func() { + // Only keep failed runs. + if !t.Failed() { + _ = os.RemoveAll(captureDir) + } + }) + testLog, err := os.Create(filepath.Join(captureDir, "test.log")) require.NoError(t, err) defer testLog.Close() @@ -227,6 +284,8 @@ func TestTransmitHang(t *testing.T) { t.Logf("recv capture file: %v", recvCapture.Name()) t.Logf("send capture file: %v", sendCapture.Name()) + logger.Info(context.Background(), "starting test", slog.F("GOMAXPROCS", runtime.GOMAXPROCS(0)), slog.F("timeout", timeout)) + derpMap := tailnettest.RunDERPAndSTUN(t) updateNodes := func(c *tailnet.Conn) func(*tailnet.Node) { return func(node *tailnet.Node) { @@ -262,7 +321,9 @@ func TestTransmitHang(t *testing.T) { ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitLong) defer cancel() + logger.Info(ctx, "waiting for receiver to be reachable (by sender)") require.True(t, send.AwaitReachable(ctx, recvIP)) + logger.Info(ctx, "wait complete") copyDone := make(chan struct{}) go func() { @@ -284,17 +345,33 @@ func TestTransmitHang(t *testing.T) { assert.NoError(t, err) }() + logger.Info(ctx, "dialing receiver") w, err := send.DialContextTCP(ctx, netip.AddrPortFrom(recvIP, 35565)) require.NoError(t, err) + defer w.Close() + logger.Info(ctx, "dial complete") now := time.Now() payload := []byte(strings.Repeat("hello world\n", 65536/12)) size := 0 + retries := 0 + writeTimeout := 2 * time.Second for i := 0; i < 1024*2; i++ { logger.Debug(ctx, "write payload", slog.F("num", i), slog.F("transmitted_kb", size/1024)) + Retry: + w.SetWriteDeadline(time.Now().Add(writeTimeout)) n, err := w.Write(payload) - require.NoError(t, err) + if err != nil { + if time.Duration(retries)*writeTimeout < timeout { + t.Errorf("write failed: %v", err) + t.Logf("retrying (%v)", retries) + retries++ + goto Retry + } else { + require.NoError(t, err) + } + } size += n } @@ -303,7 +380,7 @@ func TestTransmitHang(t *testing.T) { <-copyDone - if time.Since(now) > 10*time.Second { + if time.Since(now) > timeout { t.Fatal("took too long to transmit") } } From 15681dfc802c9ea028623462d3146df81f60fc77 Mon Sep 17 00:00:00 2001 From: Mathias Fredriksson Date: Thu, 25 May 2023 15:05:04 +0000 Subject: [PATCH 5/7] Add pprof labels and dump goroutines to log --- tailnet/conn_test.go | 56 ++++++++++++++++++++++++++++++-------------- 1 file changed, 39 insertions(+), 17 deletions(-) diff --git a/tailnet/conn_test.go b/tailnet/conn_test.go index 96b1bc3fb4c7d..0691eecf4a06a 100644 --- a/tailnet/conn_test.go +++ b/tailnet/conn_test.go @@ -4,10 +4,13 @@ import ( "context" "fmt" "io" + "net" "net/netip" "os" "path/filepath" "runtime" + "runtime/pprof" + "strconv" "strings" "sync/atomic" "testing" @@ -17,6 +20,7 @@ import ( "github.com/stretchr/testify/require" "go.uber.org/goleak" "tailscale.com/envknob" + "tailscale.com/tailcfg" "cdr.dev/slog" "cdr.dev/slog/sloggers/sloghuman" @@ -286,7 +290,12 @@ func runTestTransmitHang(t *testing.T, timeout time.Duration) { logger.Info(context.Background(), "starting test", slog.F("GOMAXPROCS", runtime.GOMAXPROCS(0)), slog.F("timeout", timeout)) - derpMap := tailnettest.RunDERPAndSTUN(t) + ctx := context.Background() + + var derpMap *tailcfg.DERPMap + pprof.Do(ctx, pprof.Labels("id", "tailnettest.derp-and-stun"), func(_ context.Context) { + derpMap = tailnettest.RunDERPAndSTUN(t) + }) updateNodes := func(c *tailnet.Conn) func(*tailnet.Node) { return func(node *tailnet.Node) { err := c.UpdateNodes([]*tailnet.Node{node}, false) @@ -295,20 +304,26 @@ func runTestTransmitHang(t *testing.T, timeout time.Duration) { } recvIP := tailnet.IP() - recv, err := tailnet.NewConn(&tailnet.Options{ - Addresses: []netip.Prefix{netip.PrefixFrom(recvIP, 128)}, - Logger: logger.Named("recv"), - DERPMap: derpMap, + var recv *tailnet.Conn + pprof.Do(ctx, pprof.Labels("id", "tailnet.recv"), func(_ context.Context) { + recv, err = tailnet.NewConn(&tailnet.Options{ + Addresses: []netip.Prefix{netip.PrefixFrom(recvIP, 128)}, + Logger: logger.Named("recv"), + DERPMap: derpMap, + }) }) require.NoError(t, err) defer recv.Close() recvCaptureStop := recv.Capture(recvCapture) defer recvCaptureStop() - send, err := tailnet.NewConn(&tailnet.Options{ - Addresses: []netip.Prefix{netip.PrefixFrom(tailnet.IP(), 128)}, - Logger: logger.Named("send"), - DERPMap: derpMap, + var send *tailnet.Conn + pprof.Do(ctx, pprof.Labels("id", "tailnet.send"), func(_ context.Context) { + send, err = tailnet.NewConn(&tailnet.Options{ + Addresses: []netip.Prefix{netip.PrefixFrom(tailnet.IP(), 128)}, + Logger: logger.Named("send"), + DERPMap: derpMap, + }) }) require.NoError(t, err) defer send.Close() @@ -318,7 +333,7 @@ func runTestTransmitHang(t *testing.T, timeout time.Duration) { recv.SetNodeCallback(updateNodes(send)) send.SetNodeCallback(updateNodes(recv)) - ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitLong) + ctx, cancel := context.WithTimeout(ctx, testutil.WaitLong) defer cancel() logger.Info(ctx, "waiting for receiver to be reachable (by sender)") @@ -326,7 +341,7 @@ func runTestTransmitHang(t *testing.T, timeout time.Duration) { logger.Info(ctx, "wait complete") copyDone := make(chan struct{}) - go func() { + go pprof.Do(ctx, pprof.Labels("id", "tailnet.recv.listener"), func(_ context.Context) { defer close(copyDone) ln, err := recv.Listen("tcp", ":35565") @@ -343,10 +358,13 @@ func runTestTransmitHang(t *testing.T, timeout time.Duration) { _, err = io.Copy(io.Discard, r) assert.NoError(t, err) - }() + }) logger.Info(ctx, "dialing receiver") - w, err := send.DialContextTCP(ctx, netip.AddrPortFrom(recvIP, 35565)) + var w net.Conn + pprof.Do(ctx, pprof.Labels("id", "tailnet.send.dial-recv"), func(ctx context.Context) { + w, err = send.DialContextTCP(ctx, netip.AddrPortFrom(recvIP, 35565)) + }) require.NoError(t, err) defer w.Close() logger.Info(ctx, "dial complete") @@ -360,13 +378,17 @@ func runTestTransmitHang(t *testing.T, timeout time.Duration) { for i := 0; i < 1024*2; i++ { logger.Debug(ctx, "write payload", slog.F("num", i), slog.F("transmitted_kb", size/1024)) Retry: - w.SetWriteDeadline(time.Now().Add(writeTimeout)) - n, err := w.Write(payload) + n := 0 + pprof.Do(ctx, pprof.Labels("id", "tailnet.send.write", "iter", strconv.Itoa(i)), func(_ context.Context) { + _ = w.SetWriteDeadline(time.Now().Add(writeTimeout)) + n, err = w.Write(payload) + }) if err != nil { if time.Duration(retries)*writeTimeout < timeout { - t.Errorf("write failed: %v", err) - t.Logf("retrying (%v)", retries) + _ = pprof.Lookup("goroutine").WriteTo(testLog, 1) + logger.Error(ctx, "write failed", slog.Error(err)) retries++ + logger.Info(ctx, "retrying", slog.F("try", retries)) goto Retry } else { require.NoError(t, err) From f26d7fdb9277853c86f19889b859c2176ad8fce7 Mon Sep 17 00:00:00 2001 From: Mathias Fredriksson Date: Thu, 25 May 2023 15:16:24 +0000 Subject: [PATCH 6/7] Simplify loop, remove parallel tests --- tailnet/conn_test.go | 24 ++++++------------------ 1 file changed, 6 insertions(+), 18 deletions(-) diff --git a/tailnet/conn_test.go b/tailnet/conn_test.go index 0691eecf4a06a..fc458296f54b1 100644 --- a/tailnet/conn_test.go +++ b/tailnet/conn_test.go @@ -12,7 +12,6 @@ import ( "runtime/pprof" "strconv" "strings" - "sync/atomic" "testing" "time" @@ -232,28 +231,17 @@ func runTestTransmitHangMain(t *testing.T) { envknob.Setenv(env, "true") } - var failed atomic.Bool - for !failed.Load() { + for ok := true; ok; { for _, maxProcs := range []int{2, 3, curMaxProcs} { - if failed.Load() { - return - } - t.Run(fmt.Sprintf("GOMAXPROCS=%d", maxProcs), func(t *testing.T) { + ok = t.Run(fmt.Sprintf("GOMAXPROCS=%d", maxProcs), func(t *testing.T) { t.Logf("GOMAXPROCS=%d", maxProcs) runtime.GOMAXPROCS(maxProcs) - for i := 0; i < 5; i++ { - t.Run(fmt.Sprintf("n=%d", i), func(t *testing.T) { - t.Parallel() - t.Cleanup(func() { - if t.Failed() { - failed.Store(true) - } - }) - runTestTransmitHang(t, 20*time.Second) - }) - } + runTestTransmitHang(t, 20*time.Second) }) + if !ok { + break + } } } } From 2ce53910fce25ebb2460d1c696eb4351831ab8dc Mon Sep 17 00:00:00 2001 From: Mathias Fredriksson Date: Thu, 25 May 2023 17:13:57 +0000 Subject: [PATCH 7/7] Write goroutine dumps to file good/bad --- tailnet/conn_test.go | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/tailnet/conn_test.go b/tailnet/conn_test.go index fc458296f54b1..b21e214deab40 100644 --- a/tailnet/conn_test.go +++ b/tailnet/conn_test.go @@ -1,6 +1,7 @@ package tailnet_test import ( + "bytes" "context" "fmt" "io" @@ -363,6 +364,7 @@ func runTestTransmitHang(t *testing.T, timeout time.Duration) { size := 0 retries := 0 writeTimeout := 2 * time.Second + goroutineGoodDumpWritten := false for i := 0; i < 1024*2; i++ { logger.Debug(ctx, "write payload", slog.F("num", i), slog.F("transmitted_kb", size/1024)) Retry: @@ -373,14 +375,28 @@ func runTestTransmitHang(t *testing.T, timeout time.Duration) { }) if err != nil { if time.Duration(retries)*writeTimeout < timeout { - _ = pprof.Lookup("goroutine").WriteTo(testLog, 1) + var b bytes.Buffer + _ = pprof.Lookup("goroutine").WriteTo(&b, 1) logger.Error(ctx, "write failed", slog.Error(err)) + _, _ = testLog.Write(b.Bytes()) + f, err := os.Create(filepath.Join(captureDir, fmt.Sprintf("goroutine-bad-%d.txt", i))) + if err == nil { + _, _ = f.Write(b.Bytes()) + _ = f.Close() + } retries++ logger.Info(ctx, "retrying", slog.F("try", retries)) goto Retry } else { require.NoError(t, err) } + } else if !goroutineGoodDumpWritten { + f, err := os.Create(filepath.Join(captureDir, fmt.Sprintf("goroutine-good-%d.txt", i))) + if err == nil { + _ = pprof.Lookup("goroutine").WriteTo(f, 1) + _ = f.Close() + goroutineGoodDumpWritten = true + } } size += n }