Skip to content

fix: Refactor agent to consume API client #4715

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Oct 24, 2022
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Fix client reconnections
  • Loading branch information
kylecarbs committed Oct 24, 2022
commit 13a060e3288a6d5ceeae712f7d0e339c80ac0c58
39 changes: 21 additions & 18 deletions agent/agent.go
Original file line number Diff line number Diff line change
Expand Up @@ -115,21 +115,23 @@ type agent struct {
// failure, you'll want the agent to reconnect.
func (a *agent) runLoop(ctx context.Context) {
for retrier := retry.New(100*time.Millisecond, 10*time.Second); retrier.Wait(ctx); {
ctx, cancelFunc := context.WithCancel(ctx)
a.logger.Info(ctx, "running loop")
err := a.run(ctx)
// Cancel after the run is complete to clean up any leaked resources!
cancelFunc()
if err != nil {
if errors.Is(err, context.Canceled) {
return
}
if a.isClosed() {
return
}
a.logger.Warn(context.Background(), "failed to run loop", slog.Error(err))
if err == nil {
continue
}
a.logger.Info(ctx, "running loop")
if errors.Is(err, context.Canceled) {
return
}
if a.isClosed() {
return
}
if errors.Is(err, io.EOF) {
a.logger.Info(ctx, "likely disconnected from coder", slog.Error(err))
continue
}
a.logger.Warn(ctx, "run exited with error", slog.Error(err))
}
}

Expand Down Expand Up @@ -170,15 +172,18 @@ func (a *agent) run(ctx context.Context) error {
}

// This automatically closes when the context ends!
appReporterCtx, appReporterCtxCancel := context.WithCancel(ctx)
defer appReporterCtxCancel()
go NewWorkspaceAppHealthReporter(
a.logger, metadata.Apps, a.client.PostWorkspaceAgentAppHealth)(ctx)
a.logger, metadata.Apps, a.client.PostWorkspaceAgentAppHealth)(appReporterCtx)

a.logger.Debug(ctx, "running tailnet with derpmap", slog.F("derpmap", metadata.DERPMap))

a.closeMutex.Lock()
network := a.network
a.closeMutex.Unlock()
if a.network == nil {
a.logger.Debug(ctx, "creating tailnet")
network, err = a.createTailnet(ctx, metadata.DERPMap)
if err != nil {
return xerrors.Errorf("create tailnet: %w", err)
Expand All @@ -191,8 +196,10 @@ func (a *agent) run(ctx context.Context) error {
network.SetDERPMap(metadata.DERPMap)
}

a.logger.Debug(ctx, "running coordinator")
err = a.runCoordinator(ctx, network)
if err != nil {
a.logger.Debug(ctx, "coordinator exited", slog.Error(err))
return xerrors.Errorf("run coordinator: %w", err)
}
return nil
Expand Down Expand Up @@ -315,11 +322,7 @@ func (a *agent) createTailnet(ctx context.Context, derpMap *tailcfg.DERPMap) (*t
// runCoordinator runs a coordinator and returns an error describing why it
// stopped; the caller uses that error to decide whether a reconnect should occur.
func (a *agent) runCoordinator(ctx context.Context, network *tailnet.Conn) error {
var coordinator net.Conn
var err error
// An exponential back-off occurs when the connection is failing to dial.
// This is to prevent server spam in case of a coderd outage.
coordinator, err = a.client.ListenWorkspaceAgent(ctx)
coordinator, err := a.client.ListenWorkspaceAgent(ctx)
if err != nil {
return err
}
Expand Down Expand Up @@ -683,7 +686,7 @@ func (a *agent) handleReconnectingPTY(ctx context.Context, msg codersdk.Reconnec

ptty, process, err := pty.Start(cmd)
if err != nil {
a.logger.Error(ctx, "start reconnecting pty command", slog.F("id", msg.ID))
a.logger.Error(ctx, "start reconnecting pty command", slog.F("id", msg.ID), slog.Error(err))
return
}

Expand Down
56 changes: 49 additions & 7 deletions agent/agent_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ import (
"strconv"
"strings"
"sync"
"sync/atomic"
"testing"
"time"

Expand Down Expand Up @@ -503,6 +504,45 @@ func TestAgent(t *testing.T) {
require.NoError(t, err)
t.Logf("%.2f MBits/s", res[len(res)-1].MBitsPerSecond())
})

// Reconnect verifies that the agent runs its startup path a second time
// after its coordinator connection is torn down.
t.Run("Reconnect", func(t *testing.T) {
t.Parallel()
// After the agent is disconnected from a coordinator, it's supposed
// to reconnect!
coordinator := tailnet.NewCoordinator()
agentID := uuid.New()
statsCh := make(chan *codersdk.AgentStats)
derpMap := tailnettest.RunDERPAndSTUN(t)
// Fake API client handed to the agent; it shares the coordinator created
// above so the test can observe the agent's registration.
client := &client{
t: t,
agentID: agentID,
metadata: codersdk.WorkspaceAgentMetadata{
DERPMap: derpMap,
},
statsChan: statsCh,
coordinator: coordinator,
}
// Counts ExchangeToken invocations.
// NOTE(review): presumably the agent calls ExchangeToken once per run
// attempt, making this a proxy for "number of (re)connects" — confirm
// against agent.run.
initialized := atomic.Int32{}
closer := agent.New(agent.Options{
ExchangeToken: func(ctx context.Context) error {
initialized.Add(1)
return nil
},
Client: client,
Logger: slogtest.Make(t, nil).Leveled(slog.LevelInfo),
})
t.Cleanup(func() {
_ = closer.Close()
})

// Wait until the coordinator reports a node for this agent, i.e. the first
// connection has been established.
require.Eventually(t, func() bool {
return coordinator.Node(agentID) != nil
}, testutil.WaitShort, testutil.IntervalFast)
// Invoke the hook recorded by the fake client's most recent
// ListenWorkspaceAgent call to sever the current connection.
// NOTE(review): this field is written from the agent's goroutine and read
// here without explicit synchronization — confirm this is race-free.
client.lastWorkspaceAgent()
// The agent should run again, producing a second ExchangeToken call.
// NOTE(review): the check is for exactly 2; an additional reconnect
// between polls would overshoot — verify this cannot happen in practice.
require.Eventually(t, func() bool {
return initialized.Load() == 2
}, testutil.WaitShort, testutil.IntervalFast)
})
}

func setupSSHCommand(t *testing.T, beforeArgs []string, afterArgs []string) *exec.Cmd {
Expand Down Expand Up @@ -639,11 +679,12 @@ func assertWritePayload(t *testing.T, w io.Writer, payload []byte) {
}

type client struct {
t *testing.T
agentID uuid.UUID
metadata codersdk.WorkspaceAgentMetadata
statsChan chan *codersdk.AgentStats
coordinator tailnet.Coordinator
t *testing.T
agentID uuid.UUID
metadata codersdk.WorkspaceAgentMetadata
statsChan chan *codersdk.AgentStats
coordinator tailnet.Coordinator
lastWorkspaceAgent func()
}

func (c *client) WorkspaceAgentMetadata(_ context.Context) (codersdk.WorkspaceAgentMetadata, error) {
Expand All @@ -653,11 +694,12 @@ func (c *client) WorkspaceAgentMetadata(_ context.Context) (codersdk.WorkspaceAg
func (c *client) ListenWorkspaceAgent(_ context.Context) (net.Conn, error) {
clientConn, serverConn := net.Pipe()
closed := make(chan struct{})
c.t.Cleanup(func() {
c.lastWorkspaceAgent = func() {
_ = serverConn.Close()
_ = clientConn.Close()
<-closed
})
}
c.t.Cleanup(c.lastWorkspaceAgent)
go func() {
_ = c.coordinator.ServeAgent(serverConn, c.agentID)
close(closed)
Expand Down
3 changes: 3 additions & 0 deletions cli/agent.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import (
"os"
"path/filepath"
"runtime"
"time"

"cloud.google.com/go/compute/metadata"
"github.com/spf13/cobra"
Expand Down Expand Up @@ -78,6 +79,8 @@ func workspaceAgent() *cobra.Command {
slog.F("version", version),
)
client := codersdk.New(coderURL)
// Set a reasonable timeout so requests can't hang forever!
client.HTTPClient.Timeout = 10 * time.Second

if pprofEnabled {
srvClose := serveHandler(cmd.Context(), logger, nil, pprofAddress, "pprof")
Expand Down
3 changes: 3 additions & 0 deletions codersdk/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,9 @@ func (c *Client) Request(ctx context.Context, method, path string, body interfac
// readBodyAsError reads the response body as an API error message, and
// wraps it in a codersdk.Error type for easy marshaling.
func readBodyAsError(res *http.Response) error {
if res == nil {
return xerrors.Errorf("no body returned")
}
defer res.Body.Close()
contentType := res.Header.Get("Content-Type")

Expand Down