Skip to content

feat: reinitialize agents when a prebuilt workspace is claimed #17475

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 28 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
c09c9b9
WIP: agent reinitialization
SasSwart Apr 21, 2025
476fe71
fix assignment to nil map
SasSwart Apr 21, 2025
8c8bca6
fix: ensure prebuilt workspace agent tokens are reused when a prebuil…
SasSwart Apr 23, 2025
7ce4eea
test agent reinitialization
SasSwart Apr 24, 2025
52ac64e
remove defunct metric
SasSwart Apr 24, 2025
362db7c
Remove todo
SasSwart Apr 25, 2025
dcc7379
test that we trigger workspace agent reinitialization under the right…
SasSwart Apr 28, 2025
ff66b3f
slight improvements to a test
SasSwart Apr 28, 2025
efff5d9
review notes to improve legibility
SasSwart Apr 28, 2025
cebd5db
add an integration test for prebuilt workspace agent reinitialization
SasSwart Apr 29, 2025
2679138
Merge remote-tracking branch 'origin/main' into jjs/prebuilds-agent-r…
SasSwart Apr 29, 2025
9feebef
enable the premium license in a prebuilds integration test
SasSwart Apr 29, 2025
b117b5c
encapsulate WaitForReinitLoop for easier testing
SasSwart Apr 30, 2025
a22b414
introduce unit testable abstraction layers
SasSwart Apr 30, 2025
9bbd2c7
test workspace claim pubsub
SasSwart May 1, 2025
5804201
add tests for agent reinitialization
SasSwart May 1, 2025
7e8dcee
review notes
SasSwart May 1, 2025
725f97b
Merge remote-tracking branch 'origin/main' into jjs/prebuilds-agent-r…
SasSwart May 1, 2025
a9b1567
make fmt lint
SasSwart May 1, 2025
21ee970
remove go mod replace
SasSwart May 1, 2025
e54d7e7
remove defunct logging
SasSwart May 1, 2025
2799858
update dependency on terraform-provider-coder
SasSwart May 2, 2025
1d93003
update dependency on terraform-provider-coder
SasSwart May 2, 2025
763fc12
go mod tidy
SasSwart May 2, 2025
0f879c7
make -B gen
SasSwart May 2, 2025
61784c9
dont require ids to InsertPresetParameters
SasSwart May 2, 2025
604eb27
dont require ids to InsertPresetParameters
SasSwart May 2, 2025
bf4d2cf
fix: set the running agent token
dannykopping May 2, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 10 additions & 2 deletions agent/agent.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,9 @@ import (
"tailscale.com/util/clientmetric"

"cdr.dev/slog"

"github.com/coder/retry"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

inconsistent formatting of imports


"github.com/coder/clistat"
"github.com/coder/coder/v2/agent/agentcontainers"
"github.com/coder/coder/v2/agent/agentexec"
Expand All @@ -53,7 +56,6 @@ import (
"github.com/coder/coder/v2/tailnet"
tailnetproto "github.com/coder/coder/v2/tailnet/proto"
"github.com/coder/quartz"
"github.com/coder/retry"
)

const (
Expand Down Expand Up @@ -363,9 +365,11 @@ func (a *agent) runLoop() {
if ctx.Err() != nil {
// Context canceled errors may come from websocket pings, so we
// don't want to use `errors.Is(err, context.Canceled)` here.
a.logger.Warn(ctx, "runLoop exited with error", slog.Error(ctx.Err()))
return
}
if a.isClosed() {
a.logger.Warn(ctx, "runLoop exited because agent is closed")
return
}
if errors.Is(err, io.EOF) {
Expand Down Expand Up @@ -1046,7 +1050,11 @@ func (a *agent) run() (retErr error) {
return a.statsReporter.reportLoop(ctx, aAPI)
})

return connMan.wait()
err = connMan.wait()
if err != nil {
a.logger.Warn(context.Background(), "connection manager errored", slog.Error(err))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should be logged at info or debug level, since it can return benign errors such as the agent being closed or the context being canceled.

}
return err
}

// handleManifest returns a function that fetches and processes the manifest
Expand Down
112 changes: 69 additions & 43 deletions cli/agent.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@ import (
"cdr.dev/slog/sloggers/sloghuman"
"cdr.dev/slog/sloggers/slogjson"
"cdr.dev/slog/sloggers/slogstackdriver"
"github.com/coder/serpent"

"github.com/coder/coder/v2/agent"
"github.com/coder/coder/v2/agent/agentexec"
"github.com/coder/coder/v2/agent/agentssh"
Expand All @@ -33,7 +35,6 @@ import (
"github.com/coder/coder/v2/cli/clilog"
"github.com/coder/coder/v2/codersdk"
"github.com/coder/coder/v2/codersdk/agentsdk"
"github.com/coder/serpent"
)

func (r *RootCmd) workspaceAgent() *serpent.Command {
Expand Down Expand Up @@ -62,8 +63,10 @@ func (r *RootCmd) workspaceAgent() *serpent.Command {
// This command isn't useful to manually execute.
Hidden: true,
Handler: func(inv *serpent.Invocation) error {
ctx, cancel := context.WithCancel(inv.Context())
defer cancel()
ctx, cancel := context.WithCancelCause(inv.Context())
defer func() {
cancel(xerrors.New("agent exited"))
}()

var (
ignorePorts = map[int]string{}
Expand Down Expand Up @@ -280,7 +283,6 @@ func (r *RootCmd) workspaceAgent() *serpent.Command {
return xerrors.Errorf("add executable to $PATH: %w", err)
}

prometheusRegistry := prometheus.NewRegistry()
subsystemsRaw := inv.Environ.Get(agent.EnvAgentSubsystem)
subsystems := []codersdk.AgentSubsystem{}
for _, s := range strings.Split(subsystemsRaw, ",") {
Expand Down Expand Up @@ -324,45 +326,69 @@ func (r *RootCmd) workspaceAgent() *serpent.Command {
logger.Info(ctx, "agent devcontainer detection not enabled")
}

agnt := agent.New(agent.Options{
Client: client,
Logger: logger,
LogDir: logDir,
ScriptDataDir: scriptDataDir,
// #nosec G115 - Safe conversion as tailnet listen port is within uint16 range (0-65535)
TailnetListenPort: uint16(tailnetListenPort),
ExchangeToken: func(ctx context.Context) (string, error) {
if exchangeToken == nil {
return client.SDK.SessionToken(), nil
}
resp, err := exchangeToken(ctx)
if err != nil {
return "", err
}
client.SetSessionToken(resp.SessionToken)
return resp.SessionToken, nil
},
EnvironmentVariables: environmentVariables,
IgnorePorts: ignorePorts,
SSHMaxTimeout: sshMaxTimeout,
Subsystems: subsystems,

PrometheusRegistry: prometheusRegistry,
BlockFileTransfer: blockFileTransfer,
Execer: execer,

ExperimentalDevcontainersEnabled: experimentalDevcontainersEnabled,
})

promHandler := agent.PrometheusMetricsHandler(prometheusRegistry, logger)
prometheusSrvClose := ServeHandler(ctx, logger, promHandler, prometheusAddress, "prometheus")
defer prometheusSrvClose()

debugSrvClose := ServeHandler(ctx, logger, agnt.HTTPDebug(), debugAddress, "debug")
defer debugSrvClose()

<-ctx.Done()
return agnt.Close()
reinitEvents := agentsdk.WaitForReinitLoop(ctx, logger, client)

var (
lastErr error
mustExit bool
)
for {
prometheusRegistry := prometheus.NewRegistry()

agnt := agent.New(agent.Options{
Client: client,
Logger: logger,
LogDir: logDir,
ScriptDataDir: scriptDataDir,
// #nosec G115 - Safe conversion as tailnet listen port is within uint16 range (0-65535)
TailnetListenPort: uint16(tailnetListenPort),
ExchangeToken: func(ctx context.Context) (string, error) {
if exchangeToken == nil {
return client.SDK.SessionToken(), nil
}
resp, err := exchangeToken(ctx)
if err != nil {
return "", err
}
client.SetSessionToken(resp.SessionToken)
return resp.SessionToken, nil
},
EnvironmentVariables: environmentVariables,
IgnorePorts: ignorePorts,
SSHMaxTimeout: sshMaxTimeout,
Subsystems: subsystems,

PrometheusRegistry: prometheusRegistry,
BlockFileTransfer: blockFileTransfer,
Execer: execer,
ExperimentalDevcontainersEnabled: experimentalDevcontainersEnabled,
})

promHandler := agent.PrometheusMetricsHandler(prometheusRegistry, logger)
prometheusSrvClose := ServeHandler(ctx, logger, promHandler, prometheusAddress, "prometheus")

debugSrvClose := ServeHandler(ctx, logger, agnt.HTTPDebug(), debugAddress, "debug")

select {
case <-ctx.Done():
logger.Warn(ctx, "agent shutting down", slog.Error(ctx.Err()), slog.Error(context.Cause(ctx)))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Warn seems alarmist; there are lots of legitimate reasons to shut down. Also, it's unexpected to include more than one call to slog.Error in a single log statement.

mustExit = true
case event := <-reinitEvents:
logger.Warn(ctx, "agent received instruction to reinitialize",
slog.F("user_id", event.UserID), slog.F("workspace_id", event.WorkspaceID), slog.F("reason", event.Reason))
}

lastErr = agnt.Close()
debugSrvClose()
prometheusSrvClose()

if mustExit {
break
}

logger.Info(ctx, "agent reinitializing")
}
return lastErr
},
}

Expand Down
60 changes: 60 additions & 0 deletions cli/agent_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,10 @@ import (
"github.com/coder/coder/v2/coderd/coderdtest"
"github.com/coder/coder/v2/coderd/database"
"github.com/coder/coder/v2/coderd/database/dbfake"
"github.com/coder/coder/v2/coderd/database/dbtestutil"
"github.com/coder/coder/v2/coderd/prebuilds"
"github.com/coder/coder/v2/codersdk"
"github.com/coder/coder/v2/codersdk/agentsdk"
"github.com/coder/coder/v2/codersdk/workspacesdk"
"github.com/coder/coder/v2/provisionersdk/proto"
"github.com/coder/coder/v2/testutil"
Expand Down Expand Up @@ -321,6 +324,63 @@ func TestWorkspaceAgent(t *testing.T) {
})
}

// TestAgent_Prebuild starts a CLI agent for a workspace owned by the prebuilds
// system user, publishes a message on that workspace's prebuild-claimed
// channel, and then expects the agent to leave the ready state and become
// ready again.
func TestAgent_Prebuild(t *testing.T) {
	t.Parallel()

	// The database and pubsub are created up front so the test can both seed
	// rows and publish the claim message itself.
	db, ps := dbtestutil.NewDB(t)
	client := coderdtest.New(t, &coderdtest.Options{
		Database: db,
		Pubsub:   ps,
	})
	firstUser := coderdtest.CreateFirstUser(t, client)

	// Seed a template version that carries a preset.
	presetUUID := uuid.New()
	version := dbfake.TemplateVersion(t, db).
		Seed(database.TemplateVersion{
			OrganizationID: firstUser.OrganizationID,
			CreatedBy:      firstUser.UserID,
		}).
		Preset(database.TemplateVersionPreset{
			ID: presetUUID,
		}).
		Do()

	// Attach a startup script to the first agent. The sleep stretches out
	// (re)initialization so that the not-ready phase below lasts long enough
	// to be asserted on.
	withStartupScript := func(agents []*proto.Agent) []*proto.Agent {
		agents[0].Scripts = []*proto.Script{
			{
				DisplayName: "Prebuild Test Script",
				Script:      "sleep 5",
				RunOnStart:  true,
			},
		}
		return agents
	}

	// The workspace is owned by the prebuilds system user.
	build := dbfake.WorkspaceBuild(t, db, database.WorkspaceTable{
		OwnerID:    prebuilds.SystemUserID,
		TemplateID: version.Template.ID,
	}).WithAgent(withStartupScript).Do()

	// Launch the agent through the CLI using the seeded agent token.
	invocation, _ := clitest.New(t,
		"agent",
		"--auth", "token",
		"--agent-token", build.AgentToken,
		"--agent-url", client.URL.String(),
		"--log-dir", t.TempDir(),
	)
	clitest.Start(t, invocation)

	// The agent must first settle into a ready state.
	agentWaiter := coderdtest.NewWorkspaceAgentWaiter(t, client, build.Workspace.ID)
	agentWaiter.WaitFor(coderdtest.AgentsReady)

	// Publish the claim; the payload is the claiming user's ID as a string.
	err := ps.Publish(
		agentsdk.PrebuildClaimedChannel(build.Workspace.ID),
		[]byte(firstUser.UserID.String()),
	)
	require.NoError(t, err)

	// The agent should drop out of the ready state...
	agentWaiter.WaitFor(coderdtest.AgentsNotReady)

	// ...and then become ready again.
	agentWaiter.WaitFor(coderdtest.AgentsReady)
}

func matchAgentWithVersion(rs []codersdk.WorkspaceResource) bool {
if len(rs) < 1 {
return false
Expand Down
48 changes: 48 additions & 0 deletions coderd/apidoc/docs.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

40 changes: 40 additions & 0 deletions coderd/apidoc/swagger.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading
Loading