Skip to content

Commit 534f954

Browse files
committed
Add tests, improve lifecycle reporting
1 parent 1e53635 commit 534f954

File tree

6 files changed

+277
-44
lines changed

6 files changed

+277
-44
lines changed

agent/agent.go

Lines changed: 51 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,7 @@ func New(options Options) io.Closer {
102102
exchangeToken: options.ExchangeToken,
103103
filesystem: options.Filesystem,
104104
tempDir: options.TempDir,
105+
lifecycleUpdate: make(chan struct{}, 1),
105106
}
106107
a.init(ctx)
107108
return a
@@ -128,8 +129,9 @@ type agent struct {
128129
sessionToken atomic.Pointer[string]
129130
sshServer *ssh.Server
130131

131-
lifecycleMu sync.Mutex // Protects following.
132-
lifecycleState codersdk.WorkspaceAgentLifecycle
132+
lifecycleUpdate chan struct{}
133+
lifecycleMu sync.Mutex // Protects following.
134+
lifecycleState codersdk.WorkspaceAgentLifecycle
133135

134136
network *tailnet.Conn
135137
}
@@ -139,6 +141,8 @@ type agent struct {
139141
// may be happening, but regardless after the intermittent
140142
// failure, you'll want the agent to reconnect.
141143
func (a *agent) runLoop(ctx context.Context) {
144+
go a.reportLifecycleLoop(ctx)
145+
142146
for retrier := retry.New(100*time.Millisecond, 10*time.Second); retrier.Wait(ctx); {
143147
a.logger.Info(ctx, "running loop")
144148
err := a.run(ctx)
@@ -160,27 +164,51 @@ func (a *agent) runLoop(ctx context.Context) {
160164
}
161165
}
162166

163-
func (a *agent) setLifecycle(ctx context.Context, state codersdk.WorkspaceAgentLifecycle) {
164-
a.lifecycleMu.Lock()
165-
defer a.lifecycleMu.Unlock()
167+
// reportLifecycleLoop reports the current lifecycle state once.
168+
// Only the latest state is reported, intermediate states may be
169+
// lost if the agent can't communicate with the API.
170+
func (a *agent) reportLifecycleLoop(ctx context.Context) {
171+
var lastReported codersdk.WorkspaceAgentLifecycle
172+
for {
173+
select {
174+
case <-a.lifecycleUpdate:
175+
case <-ctx.Done():
176+
return
177+
}
166178

167-
a.lifecycleState = state
179+
for r := retry.New(time.Second, 15*time.Second); r.Wait(ctx); {
180+
a.lifecycleMu.Lock()
181+
state := a.lifecycleState
182+
a.lifecycleMu.Unlock()
168183

169-
var err error
170-
for r := retry.New(time.Second, 30*time.Second); r.Wait(ctx); {
171-
err = a.client.PostWorkspaceAgentLifecycle(ctx, codersdk.PostWorkspaceAgentLifecycleRequest{
172-
State: state,
173-
})
174-
if err == nil {
175-
return
184+
if state == lastReported {
185+
continue
186+
}
187+
188+
err := a.client.PostWorkspaceAgentLifecycle(ctx, codersdk.PostWorkspaceAgentLifecycleRequest{
189+
State: state,
190+
})
191+
if err == nil {
192+
lastReported = state
193+
break
194+
}
195+
if xerrors.Is(err, context.Canceled) || xerrors.Is(err, context.DeadlineExceeded) {
196+
return
197+
}
198+
// If we fail to report the state we probably shouldn't exit, log only.
199+
a.logger.Error(ctx, "post state", slog.Error(err))
176200
}
177201
}
178-
if xerrors.Is(err, context.Canceled) || xerrors.Is(err, context.DeadlineExceeded) || a.isClosed() {
179-
return
180-
}
181-
if err != nil {
182-
// If we fail to report the state we probably shouldn't exit, log only.
183-
a.logger.Error(ctx, "post state", slog.Error(err))
202+
}
203+
204+
func (a *agent) setLifecycle(state codersdk.WorkspaceAgentLifecycle) {
205+
a.lifecycleMu.Lock()
206+
defer a.lifecycleMu.Unlock()
207+
208+
a.lifecycleState = state
209+
select {
210+
case a.lifecycleUpdate <- struct{}{}:
211+
default:
184212
}
185213
}
186214

@@ -224,14 +252,14 @@ func (a *agent) run(ctx context.Context) error {
224252
timeout = t.C
225253
}
226254

227-
a.setLifecycle(ctx, codersdk.WorkspaceAgentLifecycleStarting)
255+
a.setLifecycle(codersdk.WorkspaceAgentLifecycleStarting)
228256

229257
var err error
230258
select {
231259
case err = <-scriptDone:
232260
case <-timeout:
233261
a.logger.Warn(ctx, "startup script timed out")
234-
a.setLifecycle(ctx, codersdk.WorkspaceAgentLifecycleStartTimeout)
262+
a.setLifecycle(codersdk.WorkspaceAgentLifecycleStartTimeout)
235263
err = <-scriptDone // The script can still complete after a timeout.
236264
}
237265
if errors.Is(err, context.Canceled) {
@@ -240,7 +268,7 @@ func (a *agent) run(ctx context.Context) error {
240268
execTime := time.Since(scriptStart)
241269
if err != nil {
242270
a.logger.Warn(ctx, "startup script failed", slog.F("execution_time", execTime), slog.Error(err))
243-
a.setLifecycle(ctx, codersdk.WorkspaceAgentLifecycleStartError)
271+
a.setLifecycle(codersdk.WorkspaceAgentLifecycleStartError)
244272
return
245273
}
246274
a.logger.Info(ctx, "startup script completed", slog.F("execution_time", execTime))
@@ -255,7 +283,7 @@ func (a *agent) run(ctx context.Context) error {
255283
}
256284
}
257285

258-
a.setLifecycle(ctx, codersdk.WorkspaceAgentLifecycleReady)
286+
a.setLifecycle(codersdk.WorkspaceAgentLifecycleReady)
259287
}()
260288
}
261289

agent/agent_test.go

Lines changed: 124 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@ func TestAgent_Stats_SSH(t *testing.T) {
5757
ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitLong)
5858
defer cancel()
5959

60-
conn, stats, _ := setupAgent(t, codersdk.WorkspaceAgentMetadata{}, 0)
60+
conn, _, stats, _ := setupAgent(t, codersdk.WorkspaceAgentMetadata{}, 0)
6161

6262
sshClient, err := conn.SSHClient(ctx)
6363
require.NoError(t, err)
@@ -83,7 +83,7 @@ func TestAgent_Stats_ReconnectingPTY(t *testing.T) {
8383
ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitLong)
8484
defer cancel()
8585

86-
conn, stats, _ := setupAgent(t, codersdk.WorkspaceAgentMetadata{}, 0)
86+
conn, _, stats, _ := setupAgent(t, codersdk.WorkspaceAgentMetadata{}, 0)
8787

8888
ptyConn, err := conn.ReconnectingPTY(ctx, uuid.New(), 128, 128, "/bin/bash")
8989
require.NoError(t, err)
@@ -531,7 +531,7 @@ func TestAgent_SFTP(t *testing.T) {
531531
if runtime.GOOS == "windows" {
532532
home = "/" + strings.ReplaceAll(home, "\\", "/")
533533
}
534-
conn, _, _ := setupAgent(t, codersdk.WorkspaceAgentMetadata{}, 0)
534+
conn, _, _, _ := setupAgent(t, codersdk.WorkspaceAgentMetadata{}, 0)
535535
sshClient, err := conn.SSHClient(ctx)
536536
require.NoError(t, err)
537537
defer sshClient.Close()
@@ -562,7 +562,7 @@ func TestAgent_SCP(t *testing.T) {
562562
ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitLong)
563563
defer cancel()
564564

565-
conn, _, _ := setupAgent(t, codersdk.WorkspaceAgentMetadata{}, 0)
565+
conn, _, _, _ := setupAgent(t, codersdk.WorkspaceAgentMetadata{}, 0)
566566
sshClient, err := conn.SSHClient(ctx)
567567
require.NoError(t, err)
568568
defer sshClient.Close()
@@ -666,7 +666,7 @@ func TestAgent_StartupScript(t *testing.T) {
666666
t.Skip("This test doesn't work on Windows for some reason...")
667667
}
668668
content := "output"
669-
_, _, fs := setupAgent(t, codersdk.WorkspaceAgentMetadata{
669+
_, _, _, fs := setupAgent(t, codersdk.WorkspaceAgentMetadata{
670670
StartupScript: "echo " + content,
671671
}, 0)
672672
var gotContent string
@@ -694,6 +694,97 @@ func TestAgent_StartupScript(t *testing.T) {
694694
require.Equal(t, content, strings.TrimSpace(gotContent))
695695
}
696696

697+
func TestAgent_Lifecycle(t *testing.T) {
698+
t.Parallel()
699+
700+
t.Run("Timeout", func(t *testing.T) {
701+
t.Parallel()
702+
703+
_, client, _, _ := setupAgent(t, codersdk.WorkspaceAgentMetadata{
704+
StartupScript: "sleep 10",
705+
StartupScriptTimeout: time.Nanosecond,
706+
}, 0)
707+
708+
want := []codersdk.WorkspaceAgentLifecycle{
709+
codersdk.WorkspaceAgentLifecycleStarting,
710+
codersdk.WorkspaceAgentLifecycleStartTimeout,
711+
}
712+
713+
var got []codersdk.WorkspaceAgentLifecycle
714+
assert.Eventually(t, func() bool {
715+
got = client.getLifecycleStates()
716+
return len(got) > 0 && got[len(got)-1] == want[len(want)-1]
717+
}, testutil.WaitShort, testutil.IntervalMedium)
718+
switch len(got) {
719+
case 1:
720+
// This can happen if lifecycle states are too
721+
// fast, only the latest on is reported.
722+
require.Equal(t, want[1:], got)
723+
default:
724+
// This is the expected case.
725+
require.Equal(t, want, got)
726+
}
727+
})
728+
729+
t.Run("Error", func(t *testing.T) {
730+
t.Parallel()
731+
732+
_, client, _, _ := setupAgent(t, codersdk.WorkspaceAgentMetadata{
733+
StartupScript: "false",
734+
StartupScriptTimeout: 30 * time.Second,
735+
}, 0)
736+
737+
want := []codersdk.WorkspaceAgentLifecycle{
738+
codersdk.WorkspaceAgentLifecycleStarting,
739+
codersdk.WorkspaceAgentLifecycleStartError,
740+
}
741+
742+
var got []codersdk.WorkspaceAgentLifecycle
743+
assert.Eventually(t, func() bool {
744+
got = client.getLifecycleStates()
745+
return len(got) > 0 && got[len(got)-1] == want[len(want)-1]
746+
}, testutil.WaitShort, testutil.IntervalMedium)
747+
switch len(got) {
748+
case 1:
749+
// This can happen if lifecycle states are too
750+
// fast, only the latest on is reported.
751+
require.Equal(t, want[1:], got)
752+
default:
753+
// This is the expected case.
754+
require.Equal(t, want, got)
755+
}
756+
})
757+
758+
t.Run("Ready", func(t *testing.T) {
759+
t.Parallel()
760+
761+
_, client, _, _ := setupAgent(t, codersdk.WorkspaceAgentMetadata{
762+
StartupScript: "true",
763+
StartupScriptTimeout: 30 * time.Second,
764+
}, 0)
765+
766+
want := []codersdk.WorkspaceAgentLifecycle{
767+
codersdk.WorkspaceAgentLifecycleStarting,
768+
codersdk.WorkspaceAgentLifecycleReady,
769+
}
770+
771+
var got []codersdk.WorkspaceAgentLifecycle
772+
assert.Eventually(t, func() bool {
773+
got = client.getLifecycleStates()
774+
return len(got) > 0 && got[len(got)-1] == want[len(want)-1]
775+
}, testutil.WaitShort, testutil.IntervalMedium)
776+
switch len(got) {
777+
case 1:
778+
// This can happen if lifecycle states are too
779+
// fast, only the latest on is reported.
780+
require.Equal(t, want[1:], got)
781+
default:
782+
// This is the expected case.
783+
require.Equal(t, want, got)
784+
}
785+
})
786+
}
787+
697788
func TestAgent_ReconnectingPTY(t *testing.T) {
698789
t.Parallel()
699790
if runtime.GOOS == "windows" {
@@ -706,7 +797,7 @@ func TestAgent_ReconnectingPTY(t *testing.T) {
706797
ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitLong)
707798
defer cancel()
708799

709-
conn, _, _ := setupAgent(t, codersdk.WorkspaceAgentMetadata{}, 0)
800+
conn, _, _, _ := setupAgent(t, codersdk.WorkspaceAgentMetadata{}, 0)
710801
id := uuid.New()
711802
netConn, err := conn.ReconnectingPTY(ctx, id, 100, 100, "/bin/bash")
712803
require.NoError(t, err)
@@ -807,7 +898,7 @@ func TestAgent_Dial(t *testing.T) {
807898
}
808899
}()
809900

810-
conn, _, _ := setupAgent(t, codersdk.WorkspaceAgentMetadata{}, 0)
901+
conn, _, _, _ := setupAgent(t, codersdk.WorkspaceAgentMetadata{}, 0)
811902
require.True(t, conn.AwaitReachable(context.Background()))
812903
conn1, err := conn.DialContext(context.Background(), l.Addr().Network(), l.Addr().String())
813904
require.NoError(t, err)
@@ -828,7 +919,7 @@ func TestAgent_Speedtest(t *testing.T) {
828919
ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitLong)
829920
defer cancel()
830921
derpMap := tailnettest.RunDERPAndSTUN(t)
831-
conn, _, _ := setupAgent(t, codersdk.WorkspaceAgentMetadata{
922+
conn, _, _, _ := setupAgent(t, codersdk.WorkspaceAgentMetadata{
832923
DERPMap: derpMap,
833924
}, 0)
834925
defer conn.Close()
@@ -913,7 +1004,7 @@ func TestAgent_WriteVSCodeConfigs(t *testing.T) {
9131004
}
9141005

9151006
func setupSSHCommand(t *testing.T, beforeArgs []string, afterArgs []string) *exec.Cmd {
916-
agentConn, _, _ := setupAgent(t, codersdk.WorkspaceAgentMetadata{}, 0)
1007+
agentConn, _, _, _ := setupAgent(t, codersdk.WorkspaceAgentMetadata{}, 0)
9171008
listener, err := net.Listen("tcp", "127.0.0.1:0")
9181009
require.NoError(t, err)
9191010
waitGroup := sync.WaitGroup{}
@@ -959,7 +1050,7 @@ func setupSSHCommand(t *testing.T, beforeArgs []string, afterArgs []string) *exe
9591050
func setupSSHSession(t *testing.T, options codersdk.WorkspaceAgentMetadata) *ssh.Session {
9601051
ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitLong)
9611052
defer cancel()
962-
conn, _, _ := setupAgent(t, options, 0)
1053+
conn, _, _, _ := setupAgent(t, options, 0)
9631054
sshClient, err := conn.SSHClient(ctx)
9641055
require.NoError(t, err)
9651056
t.Cleanup(func() {
@@ -981,6 +1072,7 @@ func (c closeFunc) Close() error {
9811072

9821073
func setupAgent(t *testing.T, metadata codersdk.WorkspaceAgentMetadata, ptyTimeout time.Duration) (
9831074
*codersdk.AgentConn,
1075+
*client,
9841076
<-chan *codersdk.AgentStats,
9851077
afero.Fs,
9861078
) {
@@ -994,14 +1086,15 @@ func setupAgent(t *testing.T, metadata codersdk.WorkspaceAgentMetadata, ptyTimeo
9941086
agentID := uuid.New()
9951087
statsCh := make(chan *codersdk.AgentStats, 50)
9961088
fs := afero.NewMemMapFs()
1089+
c := &client{
1090+
t: t,
1091+
agentID: agentID,
1092+
metadata: metadata,
1093+
statsChan: statsCh,
1094+
coordinator: coordinator,
1095+
}
9971096
closer := agent.New(agent.Options{
998-
Client: &client{
999-
t: t,
1000-
agentID: agentID,
1001-
metadata: metadata,
1002-
statsChan: statsCh,
1003-
coordinator: coordinator,
1004-
},
1097+
Client: c,
10051098
Filesystem: fs,
10061099
Logger: slogtest.Make(t, nil).Leveled(slog.LevelDebug),
10071100
ReconnectingPTYTimeout: ptyTimeout,
@@ -1034,7 +1127,7 @@ func setupAgent(t *testing.T, metadata codersdk.WorkspaceAgentMetadata, ptyTimeo
10341127
conn.SetNodeCallback(sendNode)
10351128
return &codersdk.AgentConn{
10361129
Conn: conn,
1037-
}, statsCh, fs
1130+
}, c, statsCh, fs
10381131
}
10391132

10401133
var dialTestPayload = []byte("dean-was-here123")
@@ -1075,6 +1168,9 @@ type client struct {
10751168
statsChan chan *codersdk.AgentStats
10761169
coordinator tailnet.Coordinator
10771170
lastWorkspaceAgent func()
1171+
1172+
mu sync.Mutex // Protects following.
1173+
lifecycleStates []codersdk.WorkspaceAgentLifecycle
10781174
}
10791175

10801176
func (c *client) WorkspaceAgentMetadata(_ context.Context) (codersdk.WorkspaceAgentMetadata, error) {
@@ -1130,7 +1226,16 @@ func (c *client) AgentReportStats(ctx context.Context, _ slog.Logger, stats func
11301226
}), nil
11311227
}
11321228

1133-
func (*client) PostWorkspaceAgentLifecycle(_ context.Context, _ codersdk.PostWorkspaceAgentLifecycleRequest) error {
1229+
func (c *client) getLifecycleStates() []codersdk.WorkspaceAgentLifecycle {
1230+
c.mu.Lock()
1231+
defer c.mu.Unlock()
1232+
return c.lifecycleStates
1233+
}
1234+
1235+
func (c *client) PostWorkspaceAgentLifecycle(_ context.Context, req codersdk.PostWorkspaceAgentLifecycleRequest) error {
1236+
c.mu.Lock()
1237+
defer c.mu.Unlock()
1238+
c.lifecycleStates = append(c.lifecycleStates, req.State)
11341239
return nil
11351240
}
11361241

0 commit comments

Comments
 (0)