diff --git a/go.mod b/go.mod index f1b1e58f75831..de450a77622f4 100644 --- a/go.mod +++ b/go.mod @@ -36,7 +36,7 @@ replace github.com/dlclark/regexp2 => github.com/dlclark/regexp2 v1.7.0 // There are a few minor changes we make to Tailscale that we're slowly upstreaming. Compare here: // https://github.com/tailscale/tailscale/compare/main...coder:tailscale:main -replace tailscale.com => github.com/coder/tailscale v0.0.0-20230522123520-74712221d00f +replace tailscale.com => github.com/coder/tailscale v1.1.1-0.20230525114851-c9cd3cef997c // Switch to our fork that imports fixes from http://github.com/tailscale/ssh. // See: https://github.com/coder/coder/issues/3371 @@ -173,7 +173,7 @@ require ( google.golang.org/protobuf v1.30.0 gopkg.in/natefinch/lumberjack.v2 v2.2.1 gopkg.in/yaml.v3 v3.0.1 - gvisor.dev/gvisor v0.0.0-20221203005347-703fd9b7fbc0 + gvisor.dev/gvisor v0.0.0-20230504175454-7b0a1988a28f nhooyr.io/websocket v1.8.7 storj.io/drpc v0.0.33-0.20230420154621-9716137f6037 tailscale.com v1.32.3 diff --git a/go.sum b/go.sum index b39045d6fd8a7..1054edf804858 100644 --- a/go.sum +++ b/go.sum @@ -298,8 +298,8 @@ github.com/cilium/ebpf v0.2.0/go.mod h1:To2CFviqOWL/M0gIMsvSMlqe7em/l1ALkX1PyjrX github.com/cilium/ebpf v0.4.0/go.mod h1:4tRaxcgiL706VnOzHOdBlY8IEAIdxINsQBcU4xJJXRs= github.com/cilium/ebpf v0.6.2/go.mod h1:4tRaxcgiL706VnOzHOdBlY8IEAIdxINsQBcU4xJJXRs= github.com/cilium/ebpf v0.7.0/go.mod h1:/oI2+1shJiTGAMgl6/RgJr36Eo1jzrRcAWbcXO2usCA= -github.com/cilium/ebpf v0.8.1 h1:bLSSEbBLqGPXxls55pGr5qWZaTqcmfDJHhou7t254ao= github.com/cilium/ebpf v0.8.1/go.mod h1:f5zLIM0FSNuAkSyLAN7X+Hy6yznlF1mNiWUMfxMtrgk= +github.com/cilium/ebpf v0.9.3 h1:5KtxXZU+scyERvkJMEm16TbScVvuuMrlhPly78ZMbSc= github.com/clbanning/mxj/v2 v2.5.7 h1:7q5lvUpaPF/WOkqgIDiwjBJaznaLCCBd78pi8ZyAnE0= github.com/clbanning/mxj/v2 v2.5.7/go.mod h1:hNiWqW14h+kc+MdF9C6/YoRfjEJoR3ou6tn/Qo+ve2s= github.com/cli/safeexec v1.0.0 h1:0VngyaIyqACHdcMNWfo6+KdUYnqEr2Sg+bSP1pdF+dI= @@ -334,8 +334,8 @@ github.com/coder/retry v1.3.1-0.20230210155434-e90a2e1e091d h1:09JG37IgTB6n3ouX9 github.com/coder/retry v1.3.1-0.20230210155434-e90a2e1e091d/go.mod h1:r+1J5i/989wt6CUeNSuvFKKA9hHuKKPMxdzDbTuvwwk= github.com/coder/ssh v0.0.0-20230421140225-04bb837133e1 h1:LBw76rEDuhNJyohve11mbvYv5CmCLmcuUQGiz7Guk50= github.com/coder/ssh v0.0.0-20230421140225-04bb837133e1/go.mod h1:ZSS+CUoKHDrqVakTfTWUlKSr9MtMFkC4UvtQKD7O914= -github.com/coder/tailscale v0.0.0-20230522123520-74712221d00f h1:F0Xr1d8h8dAHn7tab1HXuzYFkcjmCydnEfdMbkOhlVk= -github.com/coder/tailscale v0.0.0-20230522123520-74712221d00f/go.mod h1:jpg+77g19FpXL43U1VoIqoSg1K/Vh5CVxycGldQ8KhA= +github.com/coder/tailscale v1.1.1-0.20230525114851-c9cd3cef997c h1:0mo9brqvL9RP+qaqGpw+QDOnV+rQ3jwfaZrDOlOWVdk= +github.com/coder/tailscale v1.1.1-0.20230525114851-c9cd3cef997c/go.mod h1:Yv5VkPOCBociNBjE6CjPpld+t0P+Xu4dE1wOcfVYebg= github.com/coder/terraform-provider-coder v0.6.23 h1:O2Rcj0umez4DfVdGnKZi63z1Xzxd0IQOn9VQDB8YU8g= github.com/coder/terraform-provider-coder v0.6.23/go.mod h1:UIfU3bYNeSzJJvHyJ30tEKjD6Z9utloI+HUM/7n94CY= github.com/coder/wgtunnel v0.1.5 h1:WP3sCj/3iJ34eKvpMQEp1oJHvm24RYh0NHbj1kfUKfs= @@ -2475,8 +2475,8 @@ gotest.tools/v3 v3.0.2/go.mod h1:3SzNCllyD9/Y+b5r9JIKQ474KzkZyqLqEfYqMsX94Bk= gotest.tools/v3 v3.0.3/go.mod h1:Z7Lb0S5l+klDB31fvDQX8ss/FlKDxtlFlw3Oa8Ymbl8= gotest.tools/v3 v3.1.0/go.mod h1:fHy7eyTmJFO5bQbUsEGQ1v4m2J3Jz9eWL54TP2/ZuYQ= gotest.tools/v3 v3.4.0 h1:ZazjZUfuVeZGLAmlKKuyv3IKP5orXcwtOwDQH6YVr6o= -gvisor.dev/gvisor v0.0.0-20221203005347-703fd9b7fbc0 h1:Wobr37noukisGxpKo5jAsLREcpj61RxrWYzD8uwveOY= -gvisor.dev/gvisor v0.0.0-20221203005347-703fd9b7fbc0/go.mod h1:Dn5idtptoW1dIos9U6A2rpebLs/MtTwFacjKb8jLdQA= +gvisor.dev/gvisor v0.0.0-20230504175454-7b0a1988a28f h1:8GE2MRjGiFmfpon8dekPI08jEuNMQzSffVHgdupcO4E= +gvisor.dev/gvisor v0.0.0-20230504175454-7b0a1988a28f/go.mod h1:pzr6sy8gDLfVmDAg8OYrlKvGEHw5C3PGTiBXBTCx76Q= honnef.co/go/tools v0.0.0-20190102054323-c2f93a96b099/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= honnef.co/go/tools v0.0.0-20190106161140-3f1c8253044a/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= honnef.co/go/tools v0.0.0-20190418001031-e561f6794a2a/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= @@ -2484,7 +2484,7 @@ honnef.co/go/tools v0.0.0-20190523083050-ea95bdfd59fc/go.mod h1:rf3lG4BRIbNafJWh honnef.co/go/tools v0.0.1-2019.2.3/go.mod h1:a3bituU0lyd329TUQxRnasdCoJDkEUEAqEt0JzvZhAg= honnef.co/go/tools v0.0.1-2020.1.3/go.mod h1:X/FiERA/W4tHapMX5mGpAtMSVEeEUOyHaw9vFzvIQ3k= honnef.co/go/tools v0.0.1-2020.1.4/go.mod h1:X/FiERA/W4tHapMX5mGpAtMSVEeEUOyHaw9vFzvIQ3k= -honnef.co/go/tools v0.4.0-0.dev.0.20230130122044-c30b15588105 h1:2OzOQ+1scFmv2dt7x+wNxgikA/Rn2qKrvc/CJCVuAJM= +honnef.co/go/tools v0.4.2 h1:6qXr+R5w+ktL5UkwEbPp+fEvfyoMPche6GkOpGHZcLc= howett.net/plist v1.0.0 h1:7CrbWYbPPO/PyNy38b2EB/+gYbjCe2DXBxgtOOZbSQM= howett.net/plist v1.0.0/go.mod h1:lqaXoTrLY4hg8tnEzNru53gicrbv7rrk+2xJA/7hw9g= inet.af/peercred v0.0.0-20210906144145-0893ea02156a h1:qdkS8Q5/i10xU2ArJMKYhVa1DORzBfYS/qA2UK2jheg= diff --git a/tailnet/conn.go b/tailnet/conn.go index 20a00d94d6804..5de3f0d6bfa15 100644 --- a/tailnet/conn.go +++ b/tailnet/conn.go @@ -33,6 +33,7 @@ import ( "tailscale.com/types/netlogtype" "tailscale.com/types/netmap" "tailscale.com/wgengine" + "tailscale.com/wgengine/capture" "tailscale.com/wgengine/filter" "tailscale.com/wgengine/magicsock" "tailscale.com/wgengine/monitor" @@ -308,6 +309,8 @@ type Conn struct { closed chan struct{} logger slog.Logger blockEndpoints bool + capture *capture.Sink + captureCount int dialer *tsdial.Dialer tunDevice *tstun.Wrapper @@ -582,6 +585,10 @@ func (c *Conn) Close() error { default: } close(c.closed) + if c.capture != nil { + c.capture.Close() + c.capture = nil + } c.mutex.Unlock() var wg sync.WaitGroup @@ -840,6 +847,29 @@ func (c *Conn) MagicsockServeHTTPDebug(w http.ResponseWriter, r *http.Request) { c.magicConn.ServeHTTPDebug(w, r) } +func (c *Conn) Capture(w io.Writer) (unregister func()) { + c.mutex.Lock() + defer c.mutex.Unlock() + + if c.capture == nil { + c.capture = capture.New() + c.wireguardEngine.InstallCaptureHook(c.capture.LogPacket) + } + unreg := c.capture.RegisterOutput(w) + c.captureCount++ + return func() { + c.mutex.Lock() + defer c.mutex.Unlock() + unreg() + c.captureCount-- + if c.captureCount == 0 { + c.wireguardEngine.InstallCaptureHook(nil) + _ = c.capture.Close() + c.capture = nil + } + } +} + type listenKey struct { network string host string diff --git a/tailnet/conn_test.go b/tailnet/conn_test.go index 2e19379e6df03..b21e214deab40 100644 --- a/tailnet/conn_test.go +++ b/tailnet/conn_test.go @@ -1,15 +1,29 @@ package tailnet_test import ( + "bytes" "context" + "fmt" + "io" + "net" "net/netip" + "os" + "path/filepath" + "runtime" + "runtime/pprof" + "strconv" + "strings" "testing" + "time" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" "go.uber.org/goleak" + "tailscale.com/envknob" + "tailscale.com/tailcfg" "cdr.dev/slog" + "cdr.dev/slog/sloggers/sloghuman" "cdr.dev/slog/sloggers/slogtest" "github.com/coder/coder/tailnet" "github.com/coder/coder/tailnet/tailnettest" @@ -195,3 +209,204 @@ func TestConn_PreferredDERP(t *testing.T) { t.Fatal("timed out waiting for node") } } + +//nolint:paralleltest +func TestTransmitHang_TCPSACK_Disabled(t *testing.T) { + runTestTransmitHangMain(t) +} + +//nolint:paralleltest +func TestTransmitHang_TCPSACK_Enabled(t *testing.T) { + t.Setenv("TS_DEBUG_NETSTACK_ENABLE_TCPSACK", "true") // Restore after test. + envknob.Setenv("TS_DEBUG_NETSTACK_ENABLE_TCPSACK", "true") + runTestTransmitHangMain(t) +} + +func runTestTransmitHangMain(t *testing.T) { + curMaxProcs := runtime.GOMAXPROCS(0) + t.Cleanup(func() { + runtime.GOMAXPROCS(curMaxProcs) + }) + for _, env := range []string{"TS_DEBUG_DISCO", "TS_DEBUG_DERP", "TS_DEBUG_NETSTACK"} { + t.Setenv(env, "true") // Restore after test. + envknob.Setenv(env, "true") + } + + for ok := true; ok; { + for _, maxProcs := range []int{2, 3, curMaxProcs} { + ok = t.Run(fmt.Sprintf("GOMAXPROCS=%d", maxProcs), func(t *testing.T) { + t.Logf("GOMAXPROCS=%d", maxProcs) + runtime.GOMAXPROCS(maxProcs) + + runTestTransmitHang(t, 20*time.Second) + }) + if !ok { + break + } + } + } +} + +func runTestTransmitHang(t *testing.T, timeout time.Duration) { + // Not using t.TempDir() here so that we keep logs afterwards. + captureDir, err := os.MkdirTemp("", "tailnet-test-") + require.NoError(t, err) + + t.Cleanup(func() { + // Only keep failed runs. + if !t.Failed() { + _ = os.RemoveAll(captureDir) + } + }) + + testLog, err := os.Create(filepath.Join(captureDir, "test.log")) + require.NoError(t, err) + defer testLog.Close() + recvCapture, err := os.Create(filepath.Join(captureDir, "recv.pcap")) + require.NoError(t, err) + defer recvCapture.Close() + sendCapture, err := os.Create(filepath.Join(captureDir, "send.pcap")) + require.NoError(t, err) + defer sendCapture.Close() + + logger := slogtest.Make(t, nil). + Leveled(slog.LevelDebug). + AppendSinks(sloghuman.Sink(testLog)) + + t.Logf("test log file: %v", testLog.Name()) + t.Logf("recv capture file: %v", recvCapture.Name()) + t.Logf("send capture file: %v", sendCapture.Name()) + + logger.Info(context.Background(), "starting test", slog.F("GOMAXPROCS", runtime.GOMAXPROCS(0)), slog.F("timeout", timeout)) + + ctx := context.Background() + + var derpMap *tailcfg.DERPMap + pprof.Do(ctx, pprof.Labels("id", "tailnettest.derp-and-stun"), func(_ context.Context) { + derpMap = tailnettest.RunDERPAndSTUN(t) + }) + updateNodes := func(c *tailnet.Conn) func(*tailnet.Node) { + return func(node *tailnet.Node) { + err := c.UpdateNodes([]*tailnet.Node{node}, false) + assert.NoError(t, err) + } + } + + recvIP := tailnet.IP() + var recv *tailnet.Conn + pprof.Do(ctx, pprof.Labels("id", "tailnet.recv"), func(_ context.Context) { + recv, err = tailnet.NewConn(&tailnet.Options{ + Addresses: []netip.Prefix{netip.PrefixFrom(recvIP, 128)}, + Logger: logger.Named("recv"), + DERPMap: derpMap, + }) + }) + require.NoError(t, err) + defer recv.Close() + recvCaptureStop := recv.Capture(recvCapture) + defer recvCaptureStop() + + var send *tailnet.Conn + pprof.Do(ctx, pprof.Labels("id", "tailnet.send"), func(_ context.Context) { + send, err = tailnet.NewConn(&tailnet.Options{ + Addresses: []netip.Prefix{netip.PrefixFrom(tailnet.IP(), 128)}, + Logger: logger.Named("send"), + DERPMap: derpMap, + }) + }) + require.NoError(t, err) + defer send.Close() + sendCaptureStop := send.Capture(sendCapture) + defer sendCaptureStop() + + recv.SetNodeCallback(updateNodes(send)) + send.SetNodeCallback(updateNodes(recv)) + + ctx, cancel := context.WithTimeout(ctx, testutil.WaitLong) + defer cancel() + + logger.Info(ctx, "waiting for receiver to be reachable (by sender)") + require.True(t, send.AwaitReachable(ctx, recvIP)) + logger.Info(ctx, "wait complete") + + copyDone := make(chan struct{}) + go pprof.Do(ctx, pprof.Labels("id", "tailnet.recv.listener"), func(_ context.Context) { + defer close(copyDone) + + ln, err := recv.Listen("tcp", ":35565") + if !assert.NoError(t, err) { + return + } + defer ln.Close() + + r, err := ln.Accept() + if !assert.NoError(t, err) { + return + } + defer r.Close() + + _, err = io.Copy(io.Discard, r) + assert.NoError(t, err) + }) + + logger.Info(ctx, "dialing receiver") + var w net.Conn + pprof.Do(ctx, pprof.Labels("id", "tailnet.send.dial-recv"), func(ctx context.Context) { + w, err = send.DialContextTCP(ctx, netip.AddrPortFrom(recvIP, 35565)) + }) + require.NoError(t, err) + defer w.Close() + logger.Info(ctx, "dial complete") + + now := time.Now() + + payload := []byte(strings.Repeat("hello world\n", 65536/12)) + size := 0 + retries := 0 + writeTimeout := 2 * time.Second + goroutineGoodDumpWritten := false + for i := 0; i < 1024*2; i++ { + logger.Debug(ctx, "write payload", slog.F("num", i), slog.F("transmitted_kb", size/1024)) + Retry: + n := 0 + pprof.Do(ctx, pprof.Labels("id", "tailnet.send.write", "iter", strconv.Itoa(i)), func(_ context.Context) { + _ = w.SetWriteDeadline(time.Now().Add(writeTimeout)) + n, err = w.Write(payload) + }) + if err != nil { + if time.Duration(retries)*writeTimeout < timeout { + var b bytes.Buffer + _ = pprof.Lookup("goroutine").WriteTo(&b, 1) + logger.Error(ctx, "write failed", slog.Error(err)) + _, _ = testLog.Write(b.Bytes()) + f, err := os.Create(filepath.Join(captureDir, fmt.Sprintf("goroutine-bad-%d.txt", i))) + if err == nil { + _, _ = f.Write(b.Bytes()) + _ = f.Close() + } + retries++ + logger.Info(ctx, "retrying", slog.F("try", retries)) + goto Retry + } else { + require.NoError(t, err) + } + } else if !goroutineGoodDumpWritten { + f, err := os.Create(filepath.Join(captureDir, fmt.Sprintf("goroutine-good-%d.txt", i))) + if err == nil { + _ = pprof.Lookup("goroutine").WriteTo(f, 1) + _ = f.Close() + goroutineGoodDumpWritten = true + } + } + size += n + } + + err = w.Close() + require.NoError(t, err) + + <-copyDone + + if time.Since(now) > timeout { + t.Fatal("took too long to transmit") + } +}