@@ -121,7 +121,10 @@ func New(options Options) io.Closer {
121
121
logDir : options .LogDir ,
122
122
tempDir : options .TempDir ,
123
123
lifecycleUpdate : make (chan struct {}, 1 ),
124
- connStatsChan : make (chan * agentsdk.Stats , 1 ),
124
+ lifecycleReported : make (chan codersdk.WorkspaceAgentLifecycle , 1 ),
125
+ // TODO: This is a temporary hack to make tests not flake.
126
+ // @kylecarbs has a better solution in here: https://github.com/coder/coder/pull/6469
127
+ connStatsChan : make (chan * agentsdk.Stats , 8 ),
125
128
}
126
129
a .init (ctx )
127
130
return a
@@ -149,9 +152,10 @@ type agent struct {
149
152
sessionToken atomic.Pointer [string ]
150
153
sshServer * ssh.Server
151
154
152
- lifecycleUpdate chan struct {}
153
- lifecycleMu sync.Mutex // Protects following.
154
- lifecycleState codersdk.WorkspaceAgentLifecycle
155
+ lifecycleUpdate chan struct {}
156
+ lifecycleReported chan codersdk.WorkspaceAgentLifecycle
157
+ lifecycleMu sync.RWMutex // Protects following.
158
+ lifecycleState codersdk.WorkspaceAgentLifecycle
155
159
156
160
network * tailnet.Conn
157
161
connStatsChan chan * agentsdk.Stats
@@ -207,9 +211,9 @@ func (a *agent) reportLifecycleLoop(ctx context.Context) {
207
211
}
208
212
209
213
for r := retry .New (time .Second , 15 * time .Second ); r .Wait (ctx ); {
210
- a .lifecycleMu .Lock ()
214
+ a .lifecycleMu .RLock ()
211
215
state := a .lifecycleState
212
- a .lifecycleMu .Unlock ()
216
+ a .lifecycleMu .RUnlock ()
213
217
214
218
if state == lastReported {
215
219
break
@@ -222,6 +226,11 @@ func (a *agent) reportLifecycleLoop(ctx context.Context) {
222
226
})
223
227
if err == nil {
224
228
lastReported = state
229
+ select {
230
+ case a .lifecycleReported <- state :
231
+ case <- a .lifecycleReported :
232
+ a .lifecycleReported <- state
233
+ }
225
234
break
226
235
}
227
236
if xerrors .Is (err , context .Canceled ) || xerrors .Is (err , context .DeadlineExceeded ) {
@@ -233,13 +242,20 @@ func (a *agent) reportLifecycleLoop(ctx context.Context) {
233
242
}
234
243
}
235
244
245
+ // setLifecycle sets the lifecycle state and notifies the lifecycle loop.
246
+ // The state is only updated if it's a valid state transition.
236
247
func (a * agent ) setLifecycle (ctx context.Context , state codersdk.WorkspaceAgentLifecycle ) {
237
248
a .lifecycleMu .Lock ()
238
- defer a .lifecycleMu .Unlock ()
239
-
240
- a .logger .Debug (ctx , "set lifecycle state" , slog .F ("state" , state ), slog .F ("previous" , a .lifecycleState ))
241
-
249
+ lastState := a .lifecycleState
250
+ if slices .Index (codersdk .WorkspaceAgentLifecycleOrder , lastState ) > slices .Index (codersdk .WorkspaceAgentLifecycleOrder , state ) {
251
+ a .logger .Warn (ctx , "attempted to set lifecycle state to a previous state" , slog .F ("last" , lastState ), slog .F ("state" , state ))
252
+ a .lifecycleMu .Unlock ()
253
+ return
254
+ }
242
255
a .lifecycleState = state
256
+ a .logger .Debug (ctx , "set lifecycle state" , slog .F ("state" , state ), slog .F ("last" , lastState ))
257
+ a .lifecycleMu .Unlock ()
258
+
243
259
select {
244
260
case a .lifecycleUpdate <- struct {}{}:
245
261
default :
@@ -299,9 +315,10 @@ func (a *agent) run(ctx context.Context) error {
299
315
}
300
316
}
301
317
318
+ lifecycleState := codersdk .WorkspaceAgentLifecycleReady
302
319
scriptDone := make (chan error , 1 )
303
320
scriptStart := time .Now ()
304
- err : = a .trackConnGoroutine (func () {
321
+ err = a .trackConnGoroutine (func () {
305
322
defer close (scriptDone )
306
323
scriptDone <- a .runStartupScript (ctx , metadata .StartupScript )
307
324
})
@@ -329,16 +346,17 @@ func (a *agent) run(ctx context.Context) error {
329
346
if errors .Is (err , context .Canceled ) {
330
347
return
331
348
}
332
- execTime := time .Since (scriptStart )
333
- lifecycleStatus := codersdk .WorkspaceAgentLifecycleReady
334
- if err != nil {
335
- a .logger .Warn (ctx , "startup script failed" , slog .F ("execution_time" , execTime ), slog .Error (err ))
336
- lifecycleStatus = codersdk .WorkspaceAgentLifecycleStartError
337
- } else {
338
- a .logger .Info (ctx , "startup script completed" , slog .F ("execution_time" , execTime ))
349
+ // Only log if there was a startup script.
350
+ if metadata .StartupScript != "" {
351
+ execTime := time .Since (scriptStart )
352
+ if err != nil {
353
+ a .logger .Warn (ctx , "startup script failed" , slog .F ("execution_time" , execTime ), slog .Error (err ))
354
+ lifecycleState = codersdk .WorkspaceAgentLifecycleStartError
355
+ } else {
356
+ a .logger .Info (ctx , "startup script completed" , slog .F ("execution_time" , execTime ))
357
+ }
339
358
}
340
-
341
- a .setLifecycle (ctx , lifecycleStatus )
359
+ a .setLifecycle (ctx , lifecycleState )
342
360
}()
343
361
}
344
362
@@ -606,14 +624,22 @@ func (a *agent) runCoordinator(ctx context.Context, network *tailnet.Conn) error
606
624
}
607
625
608
626
func (a * agent ) runStartupScript (ctx context.Context , script string ) error {
627
+ return a .runScript (ctx , "startup" , script )
628
+ }
629
+
630
+ func (a * agent ) runShutdownScript (ctx context.Context , script string ) error {
631
+ return a .runScript (ctx , "shutdown" , script )
632
+ }
633
+
634
+ func (a * agent ) runScript (ctx context.Context , lifecycle , script string ) error {
609
635
if script == "" {
610
636
return nil
611
637
}
612
638
613
- a .logger .Info (ctx , "running startup script" , slog .F ("script" , script ))
614
- writer , err := a .filesystem .OpenFile (filepath .Join (a .logDir , "coder-startup -script.log" ), os .O_CREATE | os .O_RDWR , 0o600 )
639
+ a .logger .Info (ctx , "running script" , slog . F ( "lifecycle" , lifecycle ) , slog .F ("script" , script ))
640
+ writer , err := a .filesystem .OpenFile (filepath .Join (a .logDir , fmt . Sprintf ( "coder-%s -script.log" , lifecycle ) ), os .O_CREATE | os .O_RDWR , 0o600 )
615
641
if err != nil {
616
- return xerrors .Errorf ("open startup script log file: %w" , err )
642
+ return xerrors .Errorf ("open %s script log file: %w" , lifecycle , err )
617
643
}
618
644
defer func () {
619
645
_ = writer .Close ()
@@ -774,7 +800,7 @@ func (a *agent) createCommand(ctx context.Context, rawCommand string, env []stri
774
800
775
801
rawMetadata := a .metadata .Load ()
776
802
if rawMetadata == nil {
777
- return nil , xerrors .Errorf ("no metadata was provided: %w" , err )
803
+ return nil , xerrors .Errorf ("no metadata was provided" )
778
804
}
779
805
metadata , valid := rawMetadata .(agentsdk.Metadata )
780
806
if ! valid {
@@ -1290,13 +1316,73 @@ func (a *agent) Close() error {
1290
1316
if a .isClosed () {
1291
1317
return nil
1292
1318
}
1319
+
1320
+ ctx := context .Background ()
1321
+ a .setLifecycle (ctx , codersdk .WorkspaceAgentLifecycleShuttingDown )
1322
+
1323
+ lifecycleState := codersdk .WorkspaceAgentLifecycleOff
1324
+ if metadata , ok := a .metadata .Load ().(agentsdk.Metadata ); ok && metadata .ShutdownScript != "" {
1325
+ scriptDone := make (chan error , 1 )
1326
+ scriptStart := time .Now ()
1327
+ go func () {
1328
+ defer close (scriptDone )
1329
+ scriptDone <- a .runShutdownScript (ctx , metadata .ShutdownScript )
1330
+ }()
1331
+
1332
+ var timeout <- chan time.Time
1333
+ // If timeout is zero, an older version of the coder
1334
+ // provider was used. Otherwise a timeout is always > 0.
1335
+ if metadata .ShutdownScriptTimeout > 0 {
1336
+ t := time .NewTimer (metadata .ShutdownScriptTimeout )
1337
+ defer t .Stop ()
1338
+ timeout = t .C
1339
+ }
1340
+
1341
+ var err error
1342
+ select {
1343
+ case err = <- scriptDone :
1344
+ case <- timeout :
1345
+ a .logger .Warn (ctx , "shutdown script timed out" )
1346
+ a .setLifecycle (ctx , codersdk .WorkspaceAgentLifecycleShutdownTimeout )
1347
+ err = <- scriptDone // The script can still complete after a timeout.
1348
+ }
1349
+ execTime := time .Since (scriptStart )
1350
+ if err != nil {
1351
+ a .logger .Warn (ctx , "shutdown script failed" , slog .F ("execution_time" , execTime ), slog .Error (err ))
1352
+ lifecycleState = codersdk .WorkspaceAgentLifecycleShutdownError
1353
+ } else {
1354
+ a .logger .Info (ctx , "shutdown script completed" , slog .F ("execution_time" , execTime ))
1355
+ }
1356
+ }
1357
+
1358
+ // Set final state and wait for it to be reported because context
1359
+ // cancellation will stop the report loop.
1360
+ a .setLifecycle (ctx , lifecycleState )
1361
+
1362
+ // Wait for the lifecycle to be reported, but don't wait forever so
1363
+ // that we don't break user expectations.
1364
+ ctx , cancel := context .WithTimeout (ctx , 5 * time .Second )
1365
+ defer cancel ()
1366
+ lifecycleWaitLoop:
1367
+ for {
1368
+ select {
1369
+ case <- ctx .Done ():
1370
+ break lifecycleWaitLoop
1371
+ case s := <- a .lifecycleReported :
1372
+ if s == lifecycleState {
1373
+ break lifecycleWaitLoop
1374
+ }
1375
+ }
1376
+ }
1377
+
1293
1378
close (a .closed )
1294
1379
a .closeCancel ()
1380
+ _ = a .sshServer .Close ()
1295
1381
if a .network != nil {
1296
1382
_ = a .network .Close ()
1297
1383
}
1298
- _ = a .sshServer .Close ()
1299
1384
a .connCloseWait .Wait ()
1385
+
1300
1386
return nil
1301
1387
}
1302
1388
0 commit comments