Skip to content

Commit e2ba9e7

Browse files
authored
chore: retry TestAgent_Dial subtests (coder#19387)
Closes coder/internal#595
1 parent a8c89a1 commit e2ba9e7

File tree

4 files changed

+308
-59
lines changed

4 files changed

+308
-59
lines changed

agent/agent_test.go

Lines changed: 68 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -2668,19 +2668,19 @@ func TestAgent_Dial(t *testing.T) {
26682668

26692669
cases := []struct {
26702670
name string
2671-
setup func(t *testing.T) net.Listener
2671+
setup func(t testing.TB) net.Listener
26722672
}{
26732673
{
26742674
name: "TCP",
2675-
setup: func(t *testing.T) net.Listener {
2675+
setup: func(t testing.TB) net.Listener {
26762676
l, err := net.Listen("tcp", "127.0.0.1:0")
26772677
require.NoError(t, err, "create TCP listener")
26782678
return l
26792679
},
26802680
},
26812681
{
26822682
name: "UDP",
2683-
setup: func(t *testing.T) net.Listener {
2683+
setup: func(t testing.TB) net.Listener {
26842684
addr := net.UDPAddr{
26852685
IP: net.ParseIP("127.0.0.1"),
26862686
Port: 0,
@@ -2698,57 +2698,68 @@ func TestAgent_Dial(t *testing.T) {
26982698

26992699
// The purpose of this test is to ensure that a client can dial a
27002700
// listener in the workspace over tailnet.
2701-
l := c.setup(t)
2702-
done := make(chan struct{})
2703-
defer func() {
2704-
l.Close()
2705-
<-done
2706-
}()
2707-
2708-
ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitLong)
2709-
defer cancel()
2710-
2711-
go func() {
2712-
defer close(done)
2713-
for range 2 {
2714-
c, err := l.Accept()
2715-
if assert.NoError(t, err, "accept connection") {
2716-
testAccept(ctx, t, c)
2717-
_ = c.Close()
2701+
//
2702+
// The OS sometimes drops packets if the system can't keep up with
2703+
// them. For TCP packets, it's typically fine due to
2704+
// retransmissions, but for UDP packets, it can fail this test.
2705+
//
2706+
// The OS gets involved for the WireGuard traffic (either via DERP
2707+
// or direct UDP), and also for the traffic between the agent and
2708+
// the listener in the "workspace".
2709+
//
2710+
// To avoid this, we'll retry this test up to 3 times.
2711+
testutil.RunRetry(t, 3, func(t testing.TB) {
2712+
ctx := testutil.Context(t, testutil.WaitLong)
2713+
2714+
l := c.setup(t)
2715+
done := make(chan struct{})
2716+
defer func() {
2717+
l.Close()
2718+
<-done
2719+
}()
2720+
2721+
go func() {
2722+
defer close(done)
2723+
for range 2 {
2724+
c, err := l.Accept()
2725+
if assert.NoError(t, err, "accept connection") {
2726+
testAccept(ctx, t, c)
2727+
_ = c.Close()
2728+
}
27182729
}
2719-
}
2720-
}()
2730+
}()
27212731

2722-
agentID := uuid.UUID{0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8}
2723-
//nolint:dogsled
2724-
agentConn, _, _, _, _ := setupAgent(t, agentsdk.Manifest{
2725-
AgentID: agentID,
2726-
}, 0)
2727-
require.True(t, agentConn.AwaitReachable(ctx))
2728-
conn, err := agentConn.DialContext(ctx, l.Addr().Network(), l.Addr().String())
2729-
require.NoError(t, err)
2730-
testDial(ctx, t, conn)
2731-
err = conn.Close()
2732-
require.NoError(t, err)
2732+
agentID := uuid.UUID{0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8}
2733+
//nolint:dogsled
2734+
agentConn, _, _, _, _ := setupAgent(t, agentsdk.Manifest{
2735+
AgentID: agentID,
2736+
}, 0)
2737+
require.True(t, agentConn.AwaitReachable(ctx))
2738+
conn, err := agentConn.DialContext(ctx, l.Addr().Network(), l.Addr().String())
2739+
require.NoError(t, err)
2740+
testDial(ctx, t, conn)
2741+
err = conn.Close()
2742+
require.NoError(t, err)
27332743

2734-
// also connect via the CoderServicePrefix, to test that we can reach the agent on this
2735-
// IP. This will be required for CoderVPN.
2736-
_, rawPort, _ := net.SplitHostPort(l.Addr().String())
2737-
port, _ := strconv.ParseUint(rawPort, 10, 16)
2738-
ipp := netip.AddrPortFrom(tailnet.CoderServicePrefix.AddrFromUUID(agentID), uint16(port))
2739-
2740-
switch l.Addr().Network() {
2741-
case "tcp":
2742-
conn, err = agentConn.Conn.DialContextTCP(ctx, ipp)
2743-
case "udp":
2744-
conn, err = agentConn.Conn.DialContextUDP(ctx, ipp)
2745-
default:
2746-
t.Fatalf("unknown network: %s", l.Addr().Network())
2747-
}
2748-
require.NoError(t, err)
2749-
testDial(ctx, t, conn)
2750-
err = conn.Close()
2751-
require.NoError(t, err)
2744+
// also connect via the CoderServicePrefix, to test that we can reach the agent on this
2745+
// IP. This will be required for CoderVPN.
2746+
_, rawPort, _ := net.SplitHostPort(l.Addr().String())
2747+
port, _ := strconv.ParseUint(rawPort, 10, 16)
2748+
ipp := netip.AddrPortFrom(tailnet.CoderServicePrefix.AddrFromUUID(agentID), uint16(port))
2749+
2750+
switch l.Addr().Network() {
2751+
case "tcp":
2752+
conn, err = agentConn.Conn.DialContextTCP(ctx, ipp)
2753+
case "udp":
2754+
conn, err = agentConn.Conn.DialContextUDP(ctx, ipp)
2755+
default:
2756+
t.Fatalf("unknown network: %s", l.Addr().Network())
2757+
}
2758+
require.NoError(t, err)
2759+
testDial(ctx, t, conn)
2760+
err = conn.Close()
2761+
require.NoError(t, err)
2762+
})
27522763
})
27532764
}
27542765
}
@@ -3251,7 +3262,7 @@ func setupSSHSessionOnPort(
32513262
return session
32523263
}
32533264

3254-
func setupAgent(t *testing.T, metadata agentsdk.Manifest, ptyTimeout time.Duration, opts ...func(*agenttest.Client, *agent.Options)) (
3265+
func setupAgent(t testing.TB, metadata agentsdk.Manifest, ptyTimeout time.Duration, opts ...func(*agenttest.Client, *agent.Options)) (
32553266
*workspacesdk.AgentConn,
32563267
*agenttest.Client,
32573268
<-chan *proto.Stats,
@@ -3349,7 +3360,7 @@ func setupAgent(t *testing.T, metadata agentsdk.Manifest, ptyTimeout time.Durati
33493360

33503361
var dialTestPayload = []byte("dean-was-here123")
33513362

3352-
func testDial(ctx context.Context, t *testing.T, c net.Conn) {
3363+
func testDial(ctx context.Context, t testing.TB, c net.Conn) {
33533364
t.Helper()
33543365

33553366
if deadline, ok := ctx.Deadline(); ok {
@@ -3365,7 +3376,7 @@ func testDial(ctx context.Context, t *testing.T, c net.Conn) {
33653376
assertReadPayload(t, c, dialTestPayload)
33663377
}
33673378

3368-
func testAccept(ctx context.Context, t *testing.T, c net.Conn) {
3379+
func testAccept(ctx context.Context, t testing.TB, c net.Conn) {
33693380
t.Helper()
33703381
defer c.Close()
33713382

@@ -3382,7 +3393,7 @@ func testAccept(ctx context.Context, t *testing.T, c net.Conn) {
33823393
assertWritePayload(t, c, dialTestPayload)
33833394
}
33843395

3385-
func assertReadPayload(t *testing.T, r io.Reader, payload []byte) {
3396+
func assertReadPayload(t testing.TB, r io.Reader, payload []byte) {
33863397
t.Helper()
33873398
b := make([]byte, len(payload)+16)
33883399
n, err := r.Read(b)
@@ -3391,11 +3402,11 @@ func assertReadPayload(t *testing.T, r io.Reader, payload []byte) {
33913402
assert.Equal(t, payload, b[:n])
33923403
}
33933404

3394-
func assertWritePayload(t *testing.T, w io.Writer, payload []byte) {
3405+
func assertWritePayload(t testing.TB, w io.Writer, payload []byte) {
33953406
t.Helper()
33963407
n, err := w.Write(payload)
33973408
assert.NoError(t, err, "write payload")
3398-
assert.Equal(t, len(payload), n, "payload length does not match")
3409+
assert.Equal(t, len(payload), n, "written payload length does not match")
33993410
}
34003411

34013412
func testSessionOutput(t *testing.T, session *ssh.Session, expected, unexpected []string, expectedRe *regexp.Regexp) {

tailnet/tailnettest/tailnettest.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ func DERPIsEmbedded(cfg *derpAndSTUNCfg) {
4545
}
4646

4747
// RunDERPAndSTUN creates a DERP mapping for tests.
48-
func RunDERPAndSTUN(t *testing.T, opts ...DERPAndStunOption) (*tailcfg.DERPMap, *derp.Server) {
48+
func RunDERPAndSTUN(t testing.TB, opts ...DERPAndStunOption) (*tailcfg.DERPMap, *derp.Server) {
4949
cfg := new(derpAndSTUNCfg)
5050
for _, o := range opts {
5151
o(cfg)

testutil/ctx.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ import (
66
"time"
77
)
88

9-
func Context(t *testing.T, dur time.Duration) context.Context {
9+
func Context(t testing.TB, dur time.Duration) context.Context {
1010
ctx, cancel := context.WithTimeout(context.Background(), dur)
1111
t.Cleanup(cancel)
1212
return ctx

0 commit comments

Comments
 (0)