Skip to content

Commit 2b6c229

Browse files
authored
fix: Trigger workspace event after agent timeout seconds (coder#5141)
Fixes coder#5116
1 parent e94b27b commit 2b6c229

File tree

3 files changed

+75
-27
lines changed

3 files changed

+75
-27
lines changed

.vscode/settings.json

Lines changed: 3 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333
"enablements",
3434
"errgroup",
3535
"eventsourcemock",
36+
"Failf",
3637
"fatih",
3738
"Formik",
3839
"gitauth",
@@ -160,10 +161,7 @@
160161
"xstate",
161162
"yamux"
162163
],
163-
"cSpell.ignorePaths": [
164-
"site/package.json",
165-
".vscode/settings.json"
166-
],
164+
"cSpell.ignorePaths": ["site/package.json", ".vscode/settings.json"],
167165
"emeraldwalk.runonsave": {
168166
"commands": [
169167
{
@@ -195,10 +193,7 @@
195193
// To reduce redundancy in tests, it's covered by other packages.
196194
// Since package coverage pairing can't be defined, all packages cover
197195
// all other packages.
198-
"go.testFlags": [
199-
"-short",
200-
"-coverpkg=./..."
201-
],
196+
"go.testFlags": ["-short", "-coverpkg=./..."],
202197
// We often use a version of TypeScript that's ahead of the version shipped
203198
// with VS Code.
204199
"typescript.tsdk": "./site/node_modules/typescript/lib"

coderd/provisionerdserver/provisionerdserver.go

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,8 @@ import (
1414

1515
"github.com/google/uuid"
1616
"github.com/tabbed/pqtype"
17+
"golang.org/x/exp/maps"
18+
"golang.org/x/exp/slices"
1719
"golang.org/x/xerrors"
1820
protobuf "google.golang.org/protobuf/proto"
1921

@@ -631,14 +633,58 @@ func (server *Server) CompleteJob(ctx context.Context, completed *proto.Complete
631633
if err != nil {
632634
return xerrors.Errorf("update workspace build: %w", err)
633635
}
636+
637+
agentTimeouts := make(map[time.Duration]bool) // A set of agent timeouts.
634638
// This could be a bulk insert to improve performance.
635639
for _, protoResource := range jobType.WorkspaceBuild.Resources {
640+
for _, protoAgent := range protoResource.Agents {
641+
dur := time.Duration(protoAgent.GetConnectionTimeoutSeconds()) * time.Second
642+
agentTimeouts[dur] = true
643+
}
636644
err = InsertWorkspaceResource(ctx, db, job.ID, workspaceBuild.Transition, protoResource, telemetrySnapshot)
637645
if err != nil {
638646
return xerrors.Errorf("insert provisioner job: %w", err)
639647
}
640648
}
641649

650+
// On start, we want to ensure that workspace agents timeout statuses
651+
// are propagated. This method is simple and does not protect against
652+
// notifying in edge cases like when a workspace is stopped soon
653+
// after being started.
654+
//
655+
// Agent timeouts could be minutes apart, resulting in an unresponsive
656+
// experience, so we'll notify after every unique timeout seconds.
657+
if !input.DryRun && workspaceBuild.Transition == database.WorkspaceTransitionStart && len(agentTimeouts) > 0 {
658+
timeouts := maps.Keys(agentTimeouts)
659+
slices.Sort(timeouts)
660+
661+
var updates []<-chan time.Time
662+
for _, d := range timeouts {
663+
server.Logger.Debug(ctx, "triggering workspace notification after agent timeout",
664+
slog.F("workspace_build_id", workspaceBuild.ID),
665+
slog.F("timeout", d),
666+
)
667+
// Agents are inserted with `database.Now()`, this triggers a
668+
// workspace event approximately after created + timeout seconds.
669+
updates = append(updates, time.After(d))
670+
}
671+
go func() {
672+
for _, wait := range updates {
673+
// Wait for the next potential timeout to occur. Note that we
674+
// can't listen on the context here because we will hang around
675+
// after this function has returned. The server also doesn't
676+
// have a shutdown signal we can listen to.
677+
<-wait
678+
if err := server.Pubsub.Publish(codersdk.WorkspaceNotifyChannel(workspaceBuild.WorkspaceID), []byte{}); err != nil {
679+
server.Logger.Error(ctx, "workspace notification after agent timeout failed",
680+
slog.F("workspace_build_id", workspaceBuild.ID),
681+
slog.Error(err),
682+
)
683+
}
684+
}
685+
}()
686+
}
687+
642688
if workspaceBuild.Transition != database.WorkspaceTransitionDelete {
643689
// This is for deleting a workspace!
644690
return nil

coderd/workspaces_test.go

Lines changed: 26 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1357,6 +1357,7 @@ func TestWorkspaceWatcher(t *testing.T) {
13571357
Auth: &proto.Agent_Token{
13581358
Token: authToken,
13591359
},
1360+
ConnectionTimeoutSeconds: 1,
13601361
}},
13611362
}},
13621363
},
@@ -1372,58 +1373,64 @@ func TestWorkspaceWatcher(t *testing.T) {
13721373

13731374
wc, err := client.WatchWorkspace(ctx, workspace.ID)
13741375
require.NoError(t, err)
1375-
wait := func() {
1376+
1377+
// Wait events are easier to debug with timestamped logs.
1378+
logger := slogtest.Make(t, nil).Named(t.Name()).Leveled(slog.LevelDebug)
1379+
wait := func(event string) {
13761380
select {
13771381
case <-ctx.Done():
1378-
t.Fail()
1382+
require.FailNow(t, "timed out waiting for event", event)
13791383
case <-wc:
1384+
logger.Info(ctx, "done waiting for event", slog.F("event", event))
13801385
}
13811386
}
13821387

13831388
coderdtest.CreateWorkspaceBuild(t, client, workspace, database.WorkspaceTransitionStart)
1384-
// the workspace build being created
1385-
wait()
1386-
// the workspace build being acquired
1387-
wait()
1388-
// the workspace build completing
1389-
wait()
1389+
wait("workspace build being created")
1390+
wait("workspace build being acquired")
1391+
wait("workspace build completing")
1392+
1393+
// Unfortunately, this will add ~1s to the test due to the granularity
1394+
// of agent timeout seconds. However, if we don't do this we won't know
1395+
// which trigger we received when waiting for connection.
1396+
//
1397+
// Note that the first timeout is from `coderdtest.CreateWorkspace` and
1398+
// the latter is from `coderdtest.CreateWorkspaceBuild`.
1399+
wait("agent timeout after create")
1400+
wait("agent timeout after start")
13901401

13911402
agentClient := codersdk.New(client.URL)
13921403
agentClient.SetSessionToken(authToken)
13931404
agentCloser := agent.New(agent.Options{
13941405
Client: agentClient,
1395-
Logger: slogtest.Make(t, nil).Named("agent").Leveled(slog.LevelDebug),
1406+
Logger: logger.Named("agent"),
13961407
})
13971408
defer func() {
13981409
_ = agentCloser.Close()
13991410
}()
14001411

1401-
// the agent connected
1402-
wait()
1412+
wait("agent connected")
14031413
agentCloser.Close()
1404-
// the agent disconnected
1405-
wait()
1414+
wait("agent disconnected")
14061415

14071416
closeFunc.Close()
14081417
build := coderdtest.CreateWorkspaceBuild(t, client, workspace, database.WorkspaceTransitionStart)
1409-
// First is for the workspace build itself
1410-
wait()
1418+
wait("first is for the workspace build itself")
14111419
err = client.CancelWorkspaceBuild(ctx, build.ID)
14121420
require.NoError(t, err)
1413-
// Second is for the build cancel
1414-
wait()
1421+
wait("second is for the build cancel")
14151422

14161423
err = client.UpdateWorkspace(ctx, workspace.ID, codersdk.UpdateWorkspaceRequest{
14171424
Name: "another",
14181425
})
14191426
require.NoError(t, err)
1420-
wait()
1427+
wait("update workspace name")
14211428

14221429
err = client.UpdateActiveTemplateVersion(ctx, template.ID, codersdk.UpdateActiveTemplateVersion{
14231430
ID: template.ActiveVersionID,
14241431
})
14251432
require.NoError(t, err)
1426-
wait()
1433+
wait("update active template version")
14271434

14281435
cancel()
14291436
}

0 commit comments

Comments
 (0)