Skip to content

test(tailnet): Add TestTransmitHang test with packet capture and logging #7414

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 7 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ replace github.com/dlclark/regexp2 => github.com/dlclark/regexp2 v1.7.0

// There are a few minor changes we make to Tailscale that we're slowly upstreaming. Compare here:
// https://github.com/tailscale/tailscale/compare/main...coder:tailscale:main
replace tailscale.com => github.com/coder/tailscale v0.0.0-20230522123520-74712221d00f
replace tailscale.com => github.com/coder/tailscale v1.1.1-0.20230525114851-c9cd3cef997c

// Switch to our fork that imports fixes from http://github.com/tailscale/ssh.
// See: https://github.com/coder/coder/issues/3371
Expand Down Expand Up @@ -173,7 +173,7 @@ require (
google.golang.org/protobuf v1.30.0
gopkg.in/natefinch/lumberjack.v2 v2.2.1
gopkg.in/yaml.v3 v3.0.1
gvisor.dev/gvisor v0.0.0-20221203005347-703fd9b7fbc0
gvisor.dev/gvisor v0.0.0-20230504175454-7b0a1988a28f
nhooyr.io/websocket v1.8.7
storj.io/drpc v0.0.33-0.20230420154621-9716137f6037
tailscale.com v1.32.3
Expand Down
12 changes: 6 additions & 6 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -298,8 +298,8 @@ github.com/cilium/ebpf v0.2.0/go.mod h1:To2CFviqOWL/M0gIMsvSMlqe7em/l1ALkX1PyjrX
github.com/cilium/ebpf v0.4.0/go.mod h1:4tRaxcgiL706VnOzHOdBlY8IEAIdxINsQBcU4xJJXRs=
github.com/cilium/ebpf v0.6.2/go.mod h1:4tRaxcgiL706VnOzHOdBlY8IEAIdxINsQBcU4xJJXRs=
github.com/cilium/ebpf v0.7.0/go.mod h1:/oI2+1shJiTGAMgl6/RgJr36Eo1jzrRcAWbcXO2usCA=
github.com/cilium/ebpf v0.8.1 h1:bLSSEbBLqGPXxls55pGr5qWZaTqcmfDJHhou7t254ao=
github.com/cilium/ebpf v0.8.1/go.mod h1:f5zLIM0FSNuAkSyLAN7X+Hy6yznlF1mNiWUMfxMtrgk=
github.com/cilium/ebpf v0.9.3 h1:5KtxXZU+scyERvkJMEm16TbScVvuuMrlhPly78ZMbSc=
github.com/clbanning/mxj/v2 v2.5.7 h1:7q5lvUpaPF/WOkqgIDiwjBJaznaLCCBd78pi8ZyAnE0=
github.com/clbanning/mxj/v2 v2.5.7/go.mod h1:hNiWqW14h+kc+MdF9C6/YoRfjEJoR3ou6tn/Qo+ve2s=
github.com/cli/safeexec v1.0.0 h1:0VngyaIyqACHdcMNWfo6+KdUYnqEr2Sg+bSP1pdF+dI=
Expand Down Expand Up @@ -334,8 +334,8 @@ github.com/coder/retry v1.3.1-0.20230210155434-e90a2e1e091d h1:09JG37IgTB6n3ouX9
github.com/coder/retry v1.3.1-0.20230210155434-e90a2e1e091d/go.mod h1:r+1J5i/989wt6CUeNSuvFKKA9hHuKKPMxdzDbTuvwwk=
github.com/coder/ssh v0.0.0-20230421140225-04bb837133e1 h1:LBw76rEDuhNJyohve11mbvYv5CmCLmcuUQGiz7Guk50=
github.com/coder/ssh v0.0.0-20230421140225-04bb837133e1/go.mod h1:ZSS+CUoKHDrqVakTfTWUlKSr9MtMFkC4UvtQKD7O914=
github.com/coder/tailscale v0.0.0-20230522123520-74712221d00f h1:F0Xr1d8h8dAHn7tab1HXuzYFkcjmCydnEfdMbkOhlVk=
github.com/coder/tailscale v0.0.0-20230522123520-74712221d00f/go.mod h1:jpg+77g19FpXL43U1VoIqoSg1K/Vh5CVxycGldQ8KhA=
github.com/coder/tailscale v1.1.1-0.20230525114851-c9cd3cef997c h1:0mo9brqvL9RP+qaqGpw+QDOnV+rQ3jwfaZrDOlOWVdk=
github.com/coder/tailscale v1.1.1-0.20230525114851-c9cd3cef997c/go.mod h1:Yv5VkPOCBociNBjE6CjPpld+t0P+Xu4dE1wOcfVYebg=
github.com/coder/terraform-provider-coder v0.6.23 h1:O2Rcj0umez4DfVdGnKZi63z1Xzxd0IQOn9VQDB8YU8g=
github.com/coder/terraform-provider-coder v0.6.23/go.mod h1:UIfU3bYNeSzJJvHyJ30tEKjD6Z9utloI+HUM/7n94CY=
github.com/coder/wgtunnel v0.1.5 h1:WP3sCj/3iJ34eKvpMQEp1oJHvm24RYh0NHbj1kfUKfs=
Expand Down Expand Up @@ -2475,16 +2475,16 @@ gotest.tools/v3 v3.0.2/go.mod h1:3SzNCllyD9/Y+b5r9JIKQ474KzkZyqLqEfYqMsX94Bk=
gotest.tools/v3 v3.0.3/go.mod h1:Z7Lb0S5l+klDB31fvDQX8ss/FlKDxtlFlw3Oa8Ymbl8=
gotest.tools/v3 v3.1.0/go.mod h1:fHy7eyTmJFO5bQbUsEGQ1v4m2J3Jz9eWL54TP2/ZuYQ=
gotest.tools/v3 v3.4.0 h1:ZazjZUfuVeZGLAmlKKuyv3IKP5orXcwtOwDQH6YVr6o=
gvisor.dev/gvisor v0.0.0-20221203005347-703fd9b7fbc0 h1:Wobr37noukisGxpKo5jAsLREcpj61RxrWYzD8uwveOY=
gvisor.dev/gvisor v0.0.0-20221203005347-703fd9b7fbc0/go.mod h1:Dn5idtptoW1dIos9U6A2rpebLs/MtTwFacjKb8jLdQA=
gvisor.dev/gvisor v0.0.0-20230504175454-7b0a1988a28f h1:8GE2MRjGiFmfpon8dekPI08jEuNMQzSffVHgdupcO4E=
gvisor.dev/gvisor v0.0.0-20230504175454-7b0a1988a28f/go.mod h1:pzr6sy8gDLfVmDAg8OYrlKvGEHw5C3PGTiBXBTCx76Q=
honnef.co/go/tools v0.0.0-20190102054323-c2f93a96b099/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4=
honnef.co/go/tools v0.0.0-20190106161140-3f1c8253044a/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4=
honnef.co/go/tools v0.0.0-20190418001031-e561f6794a2a/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4=
honnef.co/go/tools v0.0.0-20190523083050-ea95bdfd59fc/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4=
honnef.co/go/tools v0.0.1-2019.2.3/go.mod h1:a3bituU0lyd329TUQxRnasdCoJDkEUEAqEt0JzvZhAg=
honnef.co/go/tools v0.0.1-2020.1.3/go.mod h1:X/FiERA/W4tHapMX5mGpAtMSVEeEUOyHaw9vFzvIQ3k=
honnef.co/go/tools v0.0.1-2020.1.4/go.mod h1:X/FiERA/W4tHapMX5mGpAtMSVEeEUOyHaw9vFzvIQ3k=
honnef.co/go/tools v0.4.0-0.dev.0.20230130122044-c30b15588105 h1:2OzOQ+1scFmv2dt7x+wNxgikA/Rn2qKrvc/CJCVuAJM=
honnef.co/go/tools v0.4.2 h1:6qXr+R5w+ktL5UkwEbPp+fEvfyoMPche6GkOpGHZcLc=
howett.net/plist v1.0.0 h1:7CrbWYbPPO/PyNy38b2EB/+gYbjCe2DXBxgtOOZbSQM=
howett.net/plist v1.0.0/go.mod h1:lqaXoTrLY4hg8tnEzNru53gicrbv7rrk+2xJA/7hw9g=
inet.af/peercred v0.0.0-20210906144145-0893ea02156a h1:qdkS8Q5/i10xU2ArJMKYhVa1DORzBfYS/qA2UK2jheg=
Expand Down
30 changes: 30 additions & 0 deletions tailnet/conn.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ import (
"tailscale.com/types/netlogtype"
"tailscale.com/types/netmap"
"tailscale.com/wgengine"
"tailscale.com/wgengine/capture"
"tailscale.com/wgengine/filter"
"tailscale.com/wgengine/magicsock"
"tailscale.com/wgengine/monitor"
Expand Down Expand Up @@ -308,6 +309,8 @@ type Conn struct {
closed chan struct{}
logger slog.Logger
blockEndpoints bool
capture *capture.Sink
captureCount int

dialer *tsdial.Dialer
tunDevice *tstun.Wrapper
Expand Down Expand Up @@ -582,6 +585,10 @@ func (c *Conn) Close() error {
default:
}
close(c.closed)
if c.capture != nil {
c.capture.Close()
c.capture = nil
}
c.mutex.Unlock()

var wg sync.WaitGroup
Expand Down Expand Up @@ -840,6 +847,29 @@ func (c *Conn) MagicsockServeHTTPDebug(w http.ResponseWriter, r *http.Request) {
c.magicConn.ServeHTTPDebug(w, r)
}

func (c *Conn) Capture(w io.Writer) (unregister func()) {
c.mutex.Lock()
defer c.mutex.Unlock()

if c.capture == nil {
c.capture = capture.New()
c.wireguardEngine.InstallCaptureHook(c.capture.LogPacket)
}
unreg := c.capture.RegisterOutput(w)
c.captureCount++
return func() {
c.mutex.Lock()
defer c.mutex.Unlock()
unreg()
c.captureCount--
if c.captureCount == 0 {
c.wireguardEngine.InstallCaptureHook(nil)
_ = c.capture.Close()
c.capture = nil
}
}
}

type listenKey struct {
network string
host string
Expand Down
215 changes: 215 additions & 0 deletions tailnet/conn_test.go
Original file line number Diff line number Diff line change
@@ -1,15 +1,29 @@
package tailnet_test

import (
"bytes"
"context"
"fmt"
"io"
"net"
"net/netip"
"os"
"path/filepath"
"runtime"
"runtime/pprof"
"strconv"
"strings"
"testing"
"time"

"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"go.uber.org/goleak"
"tailscale.com/envknob"
"tailscale.com/tailcfg"

"cdr.dev/slog"
"cdr.dev/slog/sloggers/sloghuman"
"cdr.dev/slog/sloggers/slogtest"
"github.com/coder/coder/tailnet"
"github.com/coder/coder/tailnet/tailnettest"
Expand Down Expand Up @@ -195,3 +209,204 @@ func TestConn_PreferredDERP(t *testing.T) {
t.Fatal("timed out waiting for node")
}
}

//nolint:paralleltest
func TestTransmitHang_TCPSACK_Disabled(t *testing.T) {
runTestTransmitHangMain(t)
}

//nolint:paralleltest
func TestTransmitHang_TCPSACK_Enabled(t *testing.T) {
t.Setenv("TS_DEBUG_NETSTACK_ENABLE_TCPSACK", "true") // Restore after test.
envknob.Setenv("TS_DEBUG_NETSTACK_ENABLE_TCPSACK", "true")
runTestTransmitHangMain(t)
}

func runTestTransmitHangMain(t *testing.T) {
curMaxProcs := runtime.GOMAXPROCS(0)
t.Cleanup(func() {
runtime.GOMAXPROCS(curMaxProcs)
})
for _, env := range []string{"TS_DEBUG_DISCO", "TS_DEBUG_DERP", "TS_DEBUG_NETSTACK"} {
t.Setenv(env, "true") // Restore after test.
envknob.Setenv(env, "true")
}

for ok := true; ok; {
for _, maxProcs := range []int{2, 3, curMaxProcs} {
ok = t.Run(fmt.Sprintf("GOMAXPROCS=%d", maxProcs), func(t *testing.T) {
t.Logf("GOMAXPROCS=%d", maxProcs)
runtime.GOMAXPROCS(maxProcs)

runTestTransmitHang(t, 20*time.Second)
})
if !ok {
break
}
}
}
}

func runTestTransmitHang(t *testing.T, timeout time.Duration) {
// Not using t.TempDir() here so that we keep logs afterwards.
captureDir, err := os.MkdirTemp("", "tailnet-test-")
require.NoError(t, err)

t.Cleanup(func() {
// Only keep failed runs.
if !t.Failed() {
_ = os.RemoveAll(captureDir)
}
})

testLog, err := os.Create(filepath.Join(captureDir, "test.log"))
require.NoError(t, err)
defer testLog.Close()
recvCapture, err := os.Create(filepath.Join(captureDir, "recv.pcap"))
require.NoError(t, err)
defer recvCapture.Close()
sendCapture, err := os.Create(filepath.Join(captureDir, "send.pcap"))
require.NoError(t, err)
defer sendCapture.Close()

logger := slogtest.Make(t, nil).
Leveled(slog.LevelDebug).
AppendSinks(sloghuman.Sink(testLog))

t.Logf("test log file: %v", testLog.Name())
t.Logf("recv capture file: %v", recvCapture.Name())
t.Logf("send capture file: %v", sendCapture.Name())

logger.Info(context.Background(), "starting test", slog.F("GOMAXPROCS", runtime.GOMAXPROCS(0)), slog.F("timeout", timeout))

ctx := context.Background()

var derpMap *tailcfg.DERPMap
pprof.Do(ctx, pprof.Labels("id", "tailnettest.derp-and-stun"), func(_ context.Context) {
derpMap = tailnettest.RunDERPAndSTUN(t)
})
updateNodes := func(c *tailnet.Conn) func(*tailnet.Node) {
return func(node *tailnet.Node) {
err := c.UpdateNodes([]*tailnet.Node{node}, false)
assert.NoError(t, err)
}
}

recvIP := tailnet.IP()
var recv *tailnet.Conn
pprof.Do(ctx, pprof.Labels("id", "tailnet.recv"), func(_ context.Context) {
recv, err = tailnet.NewConn(&tailnet.Options{
Addresses: []netip.Prefix{netip.PrefixFrom(recvIP, 128)},
Logger: logger.Named("recv"),
DERPMap: derpMap,
})
})
require.NoError(t, err)
defer recv.Close()
recvCaptureStop := recv.Capture(recvCapture)
defer recvCaptureStop()

var send *tailnet.Conn
pprof.Do(ctx, pprof.Labels("id", "tailnet.send"), func(_ context.Context) {
send, err = tailnet.NewConn(&tailnet.Options{
Addresses: []netip.Prefix{netip.PrefixFrom(tailnet.IP(), 128)},
Logger: logger.Named("send"),
DERPMap: derpMap,
})
})
require.NoError(t, err)
defer send.Close()
sendCaptureStop := send.Capture(sendCapture)
defer sendCaptureStop()

recv.SetNodeCallback(updateNodes(send))
send.SetNodeCallback(updateNodes(recv))

ctx, cancel := context.WithTimeout(ctx, testutil.WaitLong)
defer cancel()

logger.Info(ctx, "waiting for receiver to be reachable (by sender)")
require.True(t, send.AwaitReachable(ctx, recvIP))
logger.Info(ctx, "wait complete")

copyDone := make(chan struct{})
go pprof.Do(ctx, pprof.Labels("id", "tailnet.recv.listener"), func(_ context.Context) {
defer close(copyDone)

ln, err := recv.Listen("tcp", ":35565")
if !assert.NoError(t, err) {
return
}
defer ln.Close()

r, err := ln.Accept()
if !assert.NoError(t, err) {
return
}
defer r.Close()

_, err = io.Copy(io.Discard, r)
assert.NoError(t, err)
})

logger.Info(ctx, "dialing receiver")
var w net.Conn
pprof.Do(ctx, pprof.Labels("id", "tailnet.send.dial-recv"), func(ctx context.Context) {
w, err = send.DialContextTCP(ctx, netip.AddrPortFrom(recvIP, 35565))
})
require.NoError(t, err)
defer w.Close()
logger.Info(ctx, "dial complete")

now := time.Now()

payload := []byte(strings.Repeat("hello world\n", 65536/12))
size := 0
retries := 0
writeTimeout := 2 * time.Second
goroutineGoodDumpWritten := false
for i := 0; i < 1024*2; i++ {
logger.Debug(ctx, "write payload", slog.F("num", i), slog.F("transmitted_kb", size/1024))
Retry:
n := 0
pprof.Do(ctx, pprof.Labels("id", "tailnet.send.write", "iter", strconv.Itoa(i)), func(_ context.Context) {
_ = w.SetWriteDeadline(time.Now().Add(writeTimeout))
n, err = w.Write(payload)
})
if err != nil {
if time.Duration(retries)*writeTimeout < timeout {
var b bytes.Buffer
_ = pprof.Lookup("goroutine").WriteTo(&b, 1)
logger.Error(ctx, "write failed", slog.Error(err))
_, _ = testLog.Write(b.Bytes())
f, err := os.Create(filepath.Join(captureDir, fmt.Sprintf("goroutine-bad-%d.txt", i)))
if err == nil {
_, _ = f.Write(b.Bytes())
_ = f.Close()
}
retries++
logger.Info(ctx, "retrying", slog.F("try", retries))
goto Retry
} else {
require.NoError(t, err)
}
} else if !goroutineGoodDumpWritten {
f, err := os.Create(filepath.Join(captureDir, fmt.Sprintf("goroutine-good-%d.txt", i)))
if err == nil {
_ = pprof.Lookup("goroutine").WriteTo(f, 1)
_ = f.Close()
goroutineGoodDumpWritten = true
}
}
size += n
}

err = w.Close()
require.NoError(t, err)

<-copyDone

if time.Since(now) > timeout {
t.Fatal("took too long to transmit")
}
}