
feat: reinitialize agents when a prebuilt workspace is claimed #17475


Merged (49 commits) on May 14, 2025
Changes from 1 commit (20df538, "review notes")
Commits (49):
c09c9b9  WIP: agent reinitialization (SasSwart, Apr 21, 2025)
476fe71  fix assignment to nil map (SasSwart, Apr 21, 2025)
8c8bca6  fix: ensure prebuilt workspace agent tokens are reused when a prebuil… (SasSwart, Apr 23, 2025)
7ce4eea  test agent reinitialization (SasSwart, Apr 24, 2025)
52ac64e  remove defunct metric (SasSwart, Apr 24, 2025)
362db7c  Remove todo (SasSwart, Apr 25, 2025)
dcc7379  test that we trigger workspace agent reinitialization under the right… (SasSwart, Apr 28, 2025)
ff66b3f  slight improvements to a test (SasSwart, Apr 28, 2025)
efff5d9  review notes to improve legibility (SasSwart, Apr 28, 2025)
cebd5db  add an integration test for prebuilt workspace agent reinitialization (SasSwart, Apr 29, 2025)
2679138  Merge remote-tracking branch 'origin/main' into jjs/prebuilds-agent-r… (SasSwart, Apr 29, 2025)
9feebef  enable the premium license in a prebuilds integration test (SasSwart, Apr 29, 2025)
b117b5c  encapsulate WaitForReinitLoop for easier testing (SasSwart, Apr 30, 2025)
a22b414  introduce unit testable abstraction layers (SasSwart, Apr 30, 2025)
9bbd2c7  test workspace claim pubsub (SasSwart, May 1, 2025)
5804201  add tests for agent reinitialization (SasSwart, May 1, 2025)
7e8dcee  review notes (SasSwart, May 1, 2025)
725f97b  Merge remote-tracking branch 'origin/main' into jjs/prebuilds-agent-r… (SasSwart, May 1, 2025)
a9b1567  make fmt lint (SasSwart, May 1, 2025)
21ee970  remove go mod replace (SasSwart, May 1, 2025)
e54d7e7  remove defunct logging (SasSwart, May 1, 2025)
2799858  update dependency on terraform-provider-coder (SasSwart, May 2, 2025)
1d93003  update dependency on terraform-provider-coder (SasSwart, May 2, 2025)
763fc12  go mod tidy (SasSwart, May 2, 2025)
0f879c7  make -B gen (SasSwart, May 2, 2025)
61784c9  dont require ids to InsertPresetParameters (SasSwart, May 2, 2025)
604eb27  dont require ids to InsertPresetParameters (SasSwart, May 2, 2025)
bf4d2cf  fix: set the running agent token (dannykopping, May 2, 2025)
38b4f0d  fix: use http client without timeout like we do in connectRPCVersion (dannykopping, May 5, 2025)
20df538  review notes (SasSwart, May 6, 2025)
4bb3b68  Merge remote-tracking branch 'origin/main' into jjs/prebuilds-agent-r… (SasSwart, May 7, 2025)
83972db  bump provisionerd proto version (SasSwart, May 7, 2025)
146b158  fix: fetch the previous agent when we need its token for prebuilt wor… (SasSwart, May 12, 2025)
5eb16cd  Merge remote-tracking branch 'origin/main' into jjs/prebuilds-agent-r… (SasSwart, May 12, 2025)
730d803  make -B lint (SasSwart, May 12, 2025)
150adc0  Test GetWorkspaceAgentsByBuildID (SasSwart, May 12, 2025)
b4ecf10  Rename GetWorkspaceAgentsByWorkspaceAndBuildNumber (SasSwart, May 12, 2025)
3fa3edf  make gen (SasSwart, May 12, 2025)
7e45919  fix a race condition (SasSwart, May 12, 2025)
a632508  Merge remote-tracking branch 'origin/main' into jjs/prebuilds-agent-r… (SasSwart, May 12, 2025)
72125ec  Merge remote-tracking branch 'origin/main' into jjs/prebuilds-agent-r… (SasSwart, May 13, 2025)
b65eea7  fix provisionerdserver test for prebuild claims (SasSwart, May 13, 2025)
e1339f3  fix race conditions (SasSwart, May 13, 2025)
c1a8ba6  Merge remote-tracking branch 'origin/main' into jjs/prebuilds-agent-r… (SasSwart, May 13, 2025)
5363dcc  Make TestReinitializeAgent more robust (SasSwart, May 13, 2025)
7ad9b6d  fix tests (SasSwart, May 14, 2025)
394571d  make -B gen (SasSwart, May 14, 2025)
890747b  remove a potential race in reinitialization testing in TestCompleteJob (SasSwart, May 14, 2025)
b3870db  fix a potential race in TestReinit (SasSwart, May 14, 2025)
review notes
SasSwart committed May 6, 2025
commit 20df5388676a3c353b2517f6a8f37c10744aa3db
6 changes: 2 additions & 4 deletions agent/agent.go
@@ -36,9 +36,6 @@ import (
"tailscale.com/util/clientmetric"

"cdr.dev/slog"

"github.com/coder/retry"

"github.com/coder/clistat"
"github.com/coder/coder/v2/agent/agentcontainers"
"github.com/coder/coder/v2/agent/agentexec"
@@ -56,6 +53,7 @@ import (
"github.com/coder/coder/v2/tailnet"
tailnetproto "github.com/coder/coder/v2/tailnet/proto"
"github.com/coder/quartz"
"github.com/coder/retry"
)

const (
@@ -1052,7 +1050,7 @@ func (a *agent) run() (retErr error) {

err = connMan.wait()
if err != nil {
a.logger.Warn(context.Background(), "connection manager errored", slog.Error(err))
a.logger.Info(context.Background(), "connection manager errored", slog.Error(err))
}
return err
}
4 changes: 2 additions & 2 deletions cli/agent.go
@@ -371,10 +371,10 @@ func (r *RootCmd) workspaceAgent() *serpent.Command {

select {
case <-ctx.Done():
logger.Warn(ctx, "agent shutting down", slog.Error(ctx.Err()), slog.Error(context.Cause(ctx)))
logger.Info(ctx, "agent shutting down", slog.Error(context.Cause(ctx)))
mustExit = true
case event := <-reinitEvents:
logger.Warn(ctx, "agent received instruction to reinitialize",
logger.Info(ctx, "agent received instruction to reinitialize",
slog.F("user_id", event.UserID), slog.F("workspace_id", event.WorkspaceID), slog.F("reason", event.Reason))
}

60 changes: 0 additions & 60 deletions cli/agent_test.go
@@ -21,10 +21,7 @@ import (
"github.com/coder/coder/v2/coderd/coderdtest"
"github.com/coder/coder/v2/coderd/database"
"github.com/coder/coder/v2/coderd/database/dbfake"
"github.com/coder/coder/v2/coderd/database/dbtestutil"
"github.com/coder/coder/v2/coderd/prebuilds"
"github.com/coder/coder/v2/codersdk"
"github.com/coder/coder/v2/codersdk/agentsdk"
"github.com/coder/coder/v2/codersdk/workspacesdk"
"github.com/coder/coder/v2/provisionersdk/proto"
"github.com/coder/coder/v2/testutil"
@@ -324,63 +321,6 @@ func TestWorkspaceAgent(t *testing.T) {
})
}

func TestAgent_Prebuild(t *testing.T) {
t.Parallel()

db, pubsub := dbtestutil.NewDB(t)
client := coderdtest.New(t, &coderdtest.Options{
Database: db,
Pubsub: pubsub,
})
user := coderdtest.CreateFirstUser(t, client)
presetID := uuid.New()
tv := dbfake.TemplateVersion(t, db).Seed(database.TemplateVersion{
OrganizationID: user.OrganizationID,
CreatedBy: user.UserID,
}).Preset(database.TemplateVersionPreset{
ID: presetID,
}).Do()
r := dbfake.WorkspaceBuild(t, db, database.WorkspaceTable{
OwnerID: prebuilds.SystemUserID,
TemplateID: tv.Template.ID,
}).WithAgent(func(a []*proto.Agent) []*proto.Agent {
a[0].Scripts = []*proto.Script{
{
DisplayName: "Prebuild Test Script",
Script: "sleep 5", // Make reinitialization take long enough to assert that it happened
RunOnStart: true,
},
}
return a
}).Do()

// Spin up an agent
logDir := t.TempDir()
inv, _ := clitest.New(t,
"agent",
"--auth", "token",
"--agent-token", r.AgentToken,
"--agent-url", client.URL.String(),
"--log-dir", logDir,
)
clitest.Start(t, inv)

// Check that the agent is in a happy steady state
waiter := coderdtest.NewWorkspaceAgentWaiter(t, client, r.Workspace.ID)
waiter.WaitFor(coderdtest.AgentsReady)

// Trigger reinitialization
channel := agentsdk.PrebuildClaimedChannel(r.Workspace.ID)
err := pubsub.Publish(channel, []byte(user.UserID.String()))
require.NoError(t, err)

// Check that the agent reinitializes
waiter.WaitFor(coderdtest.AgentsNotReady)

// Check that reinitialization completed
waiter.WaitFor(coderdtest.AgentsReady)
}

func matchAgentWithVersion(rs []codersdk.WorkspaceResource) bool {
if len(rs) < 1 {
return false
10 changes: 5 additions & 5 deletions coderd/coderdtest/coderdtest.go
@@ -1105,27 +1105,27 @@ func (w WorkspaceAgentWaiter) MatchResources(m func([]codersdk.WorkspaceResource
return w
}

// WaitForCriterium represents a boolean assertion to be made against each agent
// that a given WorkspaceAgentWaited knows about. Each WaitForCriterium should apply
// WaitForAgentFn represents a boolean assertion to be made against each agent
// that a given WorkspaceAgentWaited knows about. Each WaitForAgentFn should apply
// the check to a single agent, but it should be named for plural, because `func (w WorkspaceAgentWaiter) WaitFor`
// applies the check to all agents that it is aware of. This ensures that the public API of the waiter
// reads correctly. For example:
//
// waiter := coderdtest.NewWorkspaceAgentWaiter(t, client, r.Workspace.ID)
// waiter.WaitFor(coderdtest.AgentsReady)
type WaitForCriterium func(agent codersdk.WorkspaceAgent) bool
type WaitForAgentFn func(agent codersdk.WorkspaceAgent) bool

// AgentsReady checks that the latest lifecycle state of an agent is "Ready".
func AgentsReady(agent codersdk.WorkspaceAgent) bool {
return agent.LifecycleState == codersdk.WorkspaceAgentLifecycleReady
}

// AgentsReady checks that the latest lifecycle state of an agent is anything except "Ready".
// AgentsNotReady checks that the latest lifecycle state of an agent is anything except "Ready".
func AgentsNotReady(agent codersdk.WorkspaceAgent) bool {
return !AgentsReady(agent)
}

func (w WorkspaceAgentWaiter) WaitFor(criteria ...WaitForCriterium) {
func (w WorkspaceAgentWaiter) WaitFor(criteria ...WaitForAgentFn) {
w.t.Helper()

agentNamesMap := make(map[string]struct{}, len(w.agentNames))
19 changes: 11 additions & 8 deletions coderd/prebuilds/claim.go
@@ -37,14 +37,17 @@ type PubsubWorkspaceClaimListener struct {
ps pubsub.Pubsub
}

func (p PubsubWorkspaceClaimListener) ListenForWorkspaceClaims(ctx context.Context, workspaceID uuid.UUID) (func(), <-chan agentsdk.ReinitializationEvent, error) {
// ListenForWorkspaceClaims subscribes to a pubsub channel and sends any received events on the chan that it returns.
// pubsub.Pubsub does not communicate when its last callback has been called after it has been closed. As such the chan
// returned by this method is never closed. Call the returned cancel() function to close the subscription when it is no longer needed.
// cancel() will be called if ctx expires or is canceled.
func (p PubsubWorkspaceClaimListener) ListenForWorkspaceClaims(ctx context.Context, workspaceID uuid.UUID, reinitEvents chan<- agentsdk.ReinitializationEvent) (func(), error) {
select {
case <-ctx.Done():
return func() {}, nil, ctx.Err()
return func() {}, ctx.Err()
default:
}

workspaceClaims := make(chan agentsdk.ReinitializationEvent, 1)
cancelSub, err := p.ps.Subscribe(agentsdk.PrebuildClaimedChannel(workspaceID), func(inner context.Context, id []byte) {
Contributor (reviewer):

As we talked about on a call --- the pubsub is not considered reliable transport, and we can miss events.

This can lead to a situation where the agent misses the reinit signal and never reinitializes, even though it has been claimed.

The deep problem here is that we are using the PubSub to send material information (who the new owner is), rather than just a kick that there is new information available (the workspace has a new owner). In the latter case, when there is an error, we can recover by re-querying the database to find the owner, and then decide whether we need to signal the agent with new information. This requires the handler to keep track of the last owner it sent, but that's a trivial amount of memory to keep.

I don't necessarily think this needs to be fixed in this PR, since the plan is to move to a new "stream of manifest" architecture, but as we implement that, we need to keep error handling in mind on both sides: coderd recovers from a pubsub error by querying the database (or closing the connection if the database query fails), and then deciding whether there is something new to send. The agent recovers from a dropped connection by redialing, and then checking the new manifest against its existing one to see if it needs to take any action.

Contributor (PR author):

Thanks for identifying this. Let's defer it beyond this PR to the next release if there are no objections. I'd like to get to this as part of the manifest streaming work.
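
For illustration only, here is a minimal sketch of the "kick, then re-query" pattern the reviewer describes: the pubsub payload carries no material information, and the handler re-reads the authoritative owner from the database, only signaling the agent when the owner differs from the last one it sent. The type and both hooks (`ownerKickHandler`, `fetchOwner`, `notifyAgent`) are hypothetical names, not code from this PR.

```go
package prebuilds

import (
	"context"
	"sync"

	"github.com/google/uuid"
)

// ownerKickHandler is a hypothetical handler: the pubsub message is a bare
// "kick", and the owner is re-read from the database on every kick (and after
// reconnects), so a dropped message delays the update rather than losing it.
type ownerKickHandler struct {
	mu        sync.Mutex
	lastOwner uuid.UUID

	// fetchOwner and notifyAgent are illustrative hooks, not APIs from this PR.
	fetchOwner  func(ctx context.Context, workspaceID uuid.UUID) (uuid.UUID, error)
	notifyAgent func(ctx context.Context, ownerID uuid.UUID) error
}

func (h *ownerKickHandler) handleKick(ctx context.Context, workspaceID uuid.UUID) error {
	owner, err := h.fetchOwner(ctx, workspaceID) // recover state from the database
	if err != nil {
		return err
	}

	h.mu.Lock()
	changed := owner != h.lastOwner
	if changed {
		h.lastOwner = owner
	}
	h.mu.Unlock()

	if !changed {
		return nil // nothing new to send
	}
	return h.notifyAgent(ctx, owner)
}
```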

claimantID, err := uuid.ParseBytes(id)
if err != nil {
@@ -56,25 +59,25 @@ func (p PubsubWorkspaceClaimListener) ListenForWorkspaceClaims(ctx context.Conte
WorkspaceID: workspaceID,
Reason: agentsdk.ReinitializeReasonPrebuildClaimed,
}

select {
case <-ctx.Done():
return
case <-inner.Done():
return
case workspaceClaims <- claim:
case reinitEvents <- claim:
default:
return
}
})
if err != nil {
close(workspaceClaims)
return func() {}, nil, xerrors.Errorf("failed to subscribe to prebuild claimed channel: %w", err)
return func() {}, xerrors.Errorf("failed to subscribe to prebuild claimed channel: %w", err)
}

var once sync.Once
cancel := func() {
once.Do(func() {
cancelSub()
close(workspaceClaims)
})
}

@@ -83,5 +86,5 @@ func (p PubsubWorkspaceClaimListener) ListenForWorkspaceClaims(ctx context.Conte
cancel()
}()

return cancel, workspaceClaims, nil
return cancel, nil
}
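
To show how the new signature is meant to be consumed, here is a rough usage sketch under the assumptions stated in the doc comment: the caller allocates the channel, the listener never closes it, and the returned cancel() ends the subscription. The surrounding function and its error handling are illustrative only, not code from this PR.

```go
package prebuilds_test

import (
	"context"

	"github.com/google/uuid"
	"golang.org/x/xerrors"

	"cdr.dev/slog"
	"github.com/coder/coder/v2/coderd/database/pubsub"
	"github.com/coder/coder/v2/coderd/prebuilds"
	"github.com/coder/coder/v2/codersdk/agentsdk"
)

// waitForClaim is an illustrative caller: it owns the (never-closed) channel
// and tears down the subscription via cancel() when it returns.
func waitForClaim(ctx context.Context, logger slog.Logger, ps pubsub.Pubsub, workspaceID uuid.UUID) error {
	reinitEvents := make(chan agentsdk.ReinitializationEvent)
	listener := prebuilds.NewPubsubWorkspaceClaimListener(ps, logger)

	cancel, err := listener.ListenForWorkspaceClaims(ctx, workspaceID, reinitEvents)
	if err != nil {
		return xerrors.Errorf("listen for workspace claims: %w", err)
	}
	defer cancel()

	select {
	case <-ctx.Done():
		return ctx.Err()
	case claim := <-reinitEvents:
		logger.Info(ctx, "workspace claimed",
			slog.F("user_id", claim.UserID),
			slog.F("reason", claim.Reason))
		return nil
	}
}
```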
55 changes: 10 additions & 45 deletions coderd/prebuilds/claim_test.go
@@ -66,55 +66,17 @@ func TestPubsubWorkspaceClaimPublisher(t *testing.T) {

func TestPubsubWorkspaceClaimListener(t *testing.T) {
t.Parallel()
t.Run("stops listening if context canceled", func(t *testing.T) {
t.Parallel()

ps := pubsub.NewInMemory()
listener := prebuilds.NewPubsubWorkspaceClaimListener(ps, slogtest.Make(t, nil))

ctx, cancel := context.WithCancel(context.Background())

cancelFunc, claims, err := listener.ListenForWorkspaceClaims(ctx, uuid.New())
require.NoError(t, err)
defer cancelFunc()

cancel()
// Channel should be closed immediately due to context cancellation
select {
case _, ok := <-claims:
require.False(t, ok)
case <-time.After(testutil.WaitShort):
t.Fatal("timeout waiting for closed channel")
}
})

t.Run("stops listening if cancel func is called", func(t *testing.T) {
t.Parallel()

ps := pubsub.NewInMemory()
listener := prebuilds.NewPubsubWorkspaceClaimListener(ps, slogtest.Make(t, nil))

cancelFunc, claims, err := listener.ListenForWorkspaceClaims(context.Background(), uuid.New())
require.NoError(t, err)

cancelFunc()
select {
case _, ok := <-claims:
require.False(t, ok)
case <-time.After(testutil.WaitShort):
t.Fatal("timeout waiting for closed channel")
}
})

t.Run("finds claim events for its workspace", func(t *testing.T) {
t.Parallel()

ps := pubsub.NewInMemory()
listener := prebuilds.NewPubsubWorkspaceClaimListener(ps, slogtest.Make(t, nil))

claims := make(chan agentsdk.ReinitializationEvent, 1) // Buffer to avoid messing with goroutines in the rest of the test

workspaceID := uuid.New()
userID := uuid.New()
cancelFunc, claims, err := listener.ListenForWorkspaceClaims(context.Background(), workspaceID)
cancelFunc, err := listener.ListenForWorkspaceClaims(context.Background(), workspaceID, claims)
require.NoError(t, err)
defer cancelFunc()

@@ -125,11 +87,12 @@ func TestPubsubWorkspaceClaimListener(t *testing.T) {

// Verify we receive the claim
select {
case claim := <-claims:
case claim, ok := <-claims:
require.True(t, ok, "received on a closed channel")
require.Equal(t, userID, claim.UserID)
require.Equal(t, workspaceID, claim.WorkspaceID)
require.Equal(t, agentsdk.ReinitializeReasonPrebuildClaimed, claim.Reason)
case <-time.After(time.Second):
case <-time.After(testutil.WaitSuperLong): // TODO: revert to waitshort
t.Fatal("timeout waiting for claim")
}
})
@@ -140,9 +103,10 @@ func TestPubsubWorkspaceClaimListener(t *testing.T) {
ps := pubsub.NewInMemory()
listener := prebuilds.NewPubsubWorkspaceClaimListener(ps, slogtest.Make(t, nil))

claims := make(chan agentsdk.ReinitializationEvent)
workspaceID := uuid.New()
otherWorkspaceID := uuid.New()
cancelFunc, claims, err := listener.ListenForWorkspaceClaims(context.Background(), workspaceID)
cancelFunc, err := listener.ListenForWorkspaceClaims(context.Background(), workspaceID, claims)
require.NoError(t, err)
defer cancelFunc()

@@ -163,10 +127,11 @@ func TestPubsubWorkspaceClaimListener(t *testing.T) {
t.Run("communicates the error if it can't subscribe", func(t *testing.T) {
t.Parallel()

claims := make(chan agentsdk.ReinitializationEvent)
ps := &brokenPubsub{}
listener := prebuilds.NewPubsubWorkspaceClaimListener(ps, slogtest.Make(t, nil))

_, _, err := listener.ListenForWorkspaceClaims(context.Background(), uuid.New())
_, err := listener.ListenForWorkspaceClaims(context.Background(), uuid.New(), claims)
require.ErrorContains(t, err, "failed to subscribe to prebuild claimed channel")
})
}
30 changes: 18 additions & 12 deletions coderd/provisionerdserver/provisionerdserver.go
@@ -620,11 +620,24 @@ func (s *server) acquireProtoJob(ctx context.Context, job database.ProvisionerJo
}

runningAgentAuthTokens := []*sdkproto.RunningAgentAuthToken{}
for agentID, token := range input.RunningAgentAuthTokens {
runningAgentAuthTokens = append(runningAgentAuthTokens, &sdkproto.RunningAgentAuthToken{
AgentId: agentID.String(),
Token: token,
})
if input.PrebuildClaimedByUser != uuid.Nil {
// runningAgentAuthTokens are *only* used for prebuilds. We fetch them when we want to rebuild a prebuilt workspace
// but not generate new agent tokens. The provisionerdserver will push them down to
// the provisioner (and ultimately to the `coder_agent` resource in the Terraform provider) where they will be
// reused. Context: the agent token is often used in immutable attributes of workspace resource (e.g. VM/container)
// to initialize the agent, so if that value changes it will necessitate a replacement of that resource, thus
// obviating the whole point of the prebuild.
agents, err := s.Database.GetWorkspaceAgentsInLatestBuildByWorkspaceID(ctx, workspace.ID)
if err != nil {
s.Logger.Error(ctx, "failed to retrieve running agents of claimed prebuilt workspace",
slog.F("workspace_id", workspace.ID), slog.Error(err))
}
for _, agent := range agents {
runningAgentAuthTokens = append(runningAgentAuthTokens, &sdkproto.RunningAgentAuthToken{
AgentId: agent.ID.String(),
Token: agent.AuthToken.String(),
})
}
}

protoJob.Type = &proto.AcquiredJob_WorkspaceBuild_{
@@ -2503,13 +2516,6 @@ type WorkspaceProvisionJob struct {
IsPrebuild bool `json:"is_prebuild,omitempty"`
PrebuildClaimedByUser uuid.UUID `json:"prebuild_claimed_by,omitempty"`
LogLevel string `json:"log_level,omitempty"`
// RunningAgentAuthTokens is *only* used for prebuilds. We pass it down when we want to rebuild a prebuilt workspace
// but not generate new agent tokens. The provisionerdserver will retrieve these tokens and push them down to
// the provisioner (and ultimately to the `coder_agent` resource in the Terraform provider) where they will be
// reused. Context: the agent token is often used in immutable attributes of workspace resource (e.g. VM/container)
// to initialize the agent, so if that value changes it will necessitate a replacement of that resource, thus
// obviating the whole point of the prebuild.
RunningAgentAuthTokens map[uuid.UUID]string `json:"running_agent_auth_tokens"`
}

// TemplateVersionDryRunJob is the payload for the "template_version_dry_run" job type.
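
To make the comment about token reuse concrete, here is a hypothetical sketch of the selection logic on the provisioner side: if a running agent token was handed down for the agent being provisioned, it is reused verbatim so the immutable attribute (and the VM/container embedding it) stays unchanged; otherwise a fresh token is minted as in a normal build. The helper name and its placement are illustrative, not part of this PR.

```go
package provisionerdserver

import (
	"github.com/google/uuid"

	sdkproto "github.com/coder/coder/v2/provisionersdk/proto"
)

// agentTokenFor is a hypothetical helper: prefer a token supplied via
// RunningAgentAuthToken so a claimed prebuild keeps its existing agent token
// (and the resource embedding it is not replaced); mint a new one otherwise.
func agentTokenFor(agentID string, running []*sdkproto.RunningAgentAuthToken) string {
	for _, t := range running {
		if t.AgentId == agentID {
			return t.Token // reuse keeps the immutable attribute stable
		}
	}
	return uuid.NewString() // normal (non-prebuild-claim) path: new token
}
```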
12 changes: 9 additions & 3 deletions coderd/workspaceagents.go
@@ -1210,9 +1210,10 @@ func (api *API) workspaceAgentReinit(rw http.ResponseWriter, r *http.Request) {

log.Info(ctx, "agent waiting for reinit instruction")

cancel, reinitEvents, err := prebuilds.NewPubsubWorkspaceClaimListener(api.Pubsub, log).ListenForWorkspaceClaims(ctx, workspace.ID)
reinitEvents := make(chan agentsdk.ReinitializationEvent)
cancel, err = prebuilds.NewPubsubWorkspaceClaimListener(api.Pubsub, log).ListenForWorkspaceClaims(ctx, workspace.ID, reinitEvents)
if err != nil {
log.Error(ctx, "failed to subscribe to prebuild claimed channel", slog.Error(err))
log.Error(ctx, "subscribe to prebuild claimed channel", slog.Error(err))
httpapi.InternalServerError(rw, xerrors.New("failed to subscribe to prebuild claimed channel"))
return
}
@@ -1221,7 +1222,12 @@ func (api *API) workspaceAgentReinit(rw http.ResponseWriter, r *http.Request) {
transmitter := agentsdk.NewSSEAgentReinitTransmitter(log, rw, r)

err = transmitter.Transmit(ctx, reinitEvents)
if err != nil {
switch {
case errors.Is(err, agentsdk.ErrTransmissionSourceClosed):
log.Info(ctx, "agent reinitialization subscription closed", slog.F("workspace_agent_id", workspaceAgent.ID))
case errors.Is(err, agentsdk.ErrTransmissionTargetClosed):
log.Info(ctx, "agent connection closed", slog.F("workspace_agent_id", workspaceAgent.ID))
case err != nil:
log.Error(ctx, "failed to stream agent reinit events", slog.Error(err))
httpapi.Write(ctx, rw, http.StatusInternalServerError, codersdk.Response{
Message: "Internal error streaming agent reinitialization events.",