Skip to content

fix(codersdk): keep workspace agent connection open after dial context #10863

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Nov 27, 2023
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 15 additions & 3 deletions coderd/workspaceagents_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -445,13 +445,19 @@ func TestWorkspaceAgentTailnet(t *testing.T) {
_ = agenttest.New(t, client.URL, authToken)
resources := coderdtest.AwaitWorkspaceAgents(t, client, ws.ID)

ctx, cancelFunc := context.WithCancel(context.Background())
defer cancelFunc()
ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitLong)
defer cancel()
conn, err := client.DialWorkspaceAgent(ctx, resources[0].Agents[0].ID, &codersdk.DialWorkspaceAgentOptions{
Logger: slogtest.Make(t, nil).Named("client").Leveled(slog.LevelDebug),
})
require.NoError(t, err)
defer conn.Close()

// Connection should remain open even if the dial context is canceled.
cancel()
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: I'm wondering if you can extract this to a separate function to make sure that cancel() is called only once in defer, instead of adding an extra cancel() call.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It doesn't really matter if it's called multiple times (it's no-op for context cancel funcs). I typically always defer as a precaution to ensure it's cleaned up, then call it manually when it's time accepting that there will be a double cancel (this is especially important when using require that can short-circuit a test).

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It doesn't really matter if it's called multiple times

I know, I'm looking at this cancel() juggling more from the clean code perspective.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Alright, refactored 👍🏻

ctx, cancel = context.WithTimeout(context.Background(), testutil.WaitLong)
defer cancel()

sshClient, err := conn.SSHClient(ctx)
require.NoError(t, err)
session, err := sshClient.NewSession()
Expand Down Expand Up @@ -1416,7 +1422,8 @@ func TestWorkspaceAgent_UpdatedDERP(t *testing.T) {
agentID := resources[0].Agents[0].ID

// Connect from a client.
ctx := testutil.Context(t, testutil.WaitLong)
ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitLong)
defer cancel()
conn1, err := client.DialWorkspaceAgent(ctx, agentID, &codersdk.DialWorkspaceAgentOptions{
Logger: logger.Named("client1"),
})
Expand All @@ -1425,6 +1432,11 @@ func TestWorkspaceAgent_UpdatedDERP(t *testing.T) {
ok := conn1.AwaitReachable(ctx)
require.True(t, ok)

// Connection should remain open even if the dial context is canceled.
cancel()
ctx, cancel = context.WithTimeout(context.Background(), testutil.WaitLong)
defer cancel()

// Change the DERP map and change the region ID.
newDerpMap, _ := tailnettest.RunDERPAndSTUN(t)
require.NotNil(t, newDerpMap)
Expand Down
35 changes: 24 additions & 11 deletions codersdk/workspaceagents.go
Original file line number Diff line number Diff line change
Expand Up @@ -258,12 +258,12 @@ type DialWorkspaceAgentOptions struct {
BlockEndpoints bool
}

func (c *Client) DialWorkspaceAgent(ctx context.Context, agentID uuid.UUID, options *DialWorkspaceAgentOptions) (agentConn *WorkspaceAgentConn, err error) {
func (c *Client) DialWorkspaceAgent(dialCtx context.Context, agentID uuid.UUID, options *DialWorkspaceAgentOptions) (agentConn *WorkspaceAgentConn, err error) {
if options == nil {
options = &DialWorkspaceAgentOptions{}
}

connInfo, err := c.WorkspaceAgentConnectionInfo(ctx, agentID)
connInfo, err := c.WorkspaceAgentConnectionInfo(dialCtx, agentID)
if err != nil {
return nil, xerrors.Errorf("get connection info: %w", err)
}
Expand Down Expand Up @@ -302,7 +302,10 @@ func (c *Client) DialWorkspaceAgent(ctx context.Context, agentID uuid.UUID, opti
tokenHeader = c.SessionTokenHeader
}
headers.Set(tokenHeader, c.SessionToken())
ctx, cancel := context.WithCancel(ctx)

// New context, separate from dialCtx. We don't want to cancel the
// connection if dialCtx is canceled.
ctx, cancel := context.WithCancel(context.Background())
defer func() {
if err != nil {
cancel()
Expand All @@ -317,6 +320,7 @@ func (c *Client) DialWorkspaceAgent(ctx context.Context, agentID uuid.UUID, opti
firstCoordinator := make(chan error)
go func() {
defer close(closedCoordinator)
firstCoordinator := firstCoordinator // Shadowed so it can be reassigned outside goroutine.
isFirst := true
for retrier := retry.New(50*time.Millisecond, 10*time.Second); retrier.Wait(ctx); {
options.Logger.Debug(ctx, "connecting")
Expand Down Expand Up @@ -369,6 +373,7 @@ func (c *Client) DialWorkspaceAgent(ctx context.Context, agentID uuid.UUID, opti
firstDerpMap := make(chan error)
go func() {
defer close(closedDerpMap)
firstDerpMap := firstDerpMap // Shadowed so it can be reassigned outside goroutine.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why do we want to modify this from a separate goroutine? Is this not a potential data race?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I put this here to safe-guard against data-race since we assign firstDerpMap = nil in the parent goroutine after we receive the first value (error or closed chan).

But I just realized, this is not needed. We only ever send one value (error or close, not both), so there is no race.

*Iff we had not shadowed and done both: send error and close the chan, then the close(firstDerpMap) would've been racy because we're reassigning in the other routine.

isFirst := true
for retrier := retry.New(50*time.Millisecond, 10*time.Second); retrier.Wait(ctx); {
options.Logger.Debug(ctx, "connecting to server for derp map updates")
Expand Down Expand Up @@ -420,13 +425,21 @@ func (c *Client) DialWorkspaceAgent(ctx context.Context, agentID uuid.UUID, opti
}
}()

err = <-firstCoordinator
if err != nil {
return nil, err
}
err = <-firstDerpMap
if err != nil {
return nil, err
for firstCoordinator != nil || firstDerpMap != nil {
select {
case <-dialCtx.Done():
return nil, dialCtx.Err()
case err = <-firstCoordinator:
if err != nil {
return nil, err
}
firstCoordinator = nil
case err = <-firstDerpMap:
if err != nil {
return nil, err
}
firstDerpMap = nil
}
}

agentConn = NewWorkspaceAgentConn(conn, WorkspaceAgentConnOptions{
Expand All @@ -444,7 +457,7 @@ func (c *Client) DialWorkspaceAgent(ctx context.Context, agentID uuid.UUID, opti
},
})

if !agentConn.AwaitReachable(ctx) {
if !agentConn.AwaitReachable(dialCtx) {
_ = agentConn.Close()
return nil, xerrors.Errorf("timed out waiting for agent to become reachable: %w", ctx.Err())
}
Expand Down