Skip to content

feat(agent): Add shutdown lifecycle states and shutdown_script support #6139

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 21 commits into from
Mar 6, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
132 changes: 108 additions & 24 deletions agent/agent.go
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,7 @@ func New(options Options) io.Closer {
logDir: options.LogDir,
tempDir: options.TempDir,
lifecycleUpdate: make(chan struct{}, 1),
lifecycleReported: make(chan codersdk.WorkspaceAgentLifecycle, 1),
connStatsChan: make(chan *agentsdk.Stats, 1),
}
a.init(ctx)
Expand Down Expand Up @@ -149,9 +150,10 @@ type agent struct {
sessionToken atomic.Pointer[string]
sshServer *ssh.Server

lifecycleUpdate chan struct{}
lifecycleMu sync.Mutex // Protects following.
lifecycleState codersdk.WorkspaceAgentLifecycle
lifecycleUpdate chan struct{}
lifecycleReported chan codersdk.WorkspaceAgentLifecycle
lifecycleMu sync.RWMutex // Protects following.
lifecycleState codersdk.WorkspaceAgentLifecycle

network *tailnet.Conn
connStatsChan chan *agentsdk.Stats
Expand Down Expand Up @@ -207,9 +209,9 @@ func (a *agent) reportLifecycleLoop(ctx context.Context) {
}

for r := retry.New(time.Second, 15*time.Second); r.Wait(ctx); {
a.lifecycleMu.Lock()
a.lifecycleMu.RLock()
state := a.lifecycleState
a.lifecycleMu.Unlock()
a.lifecycleMu.RUnlock()

if state == lastReported {
break
Expand All @@ -222,6 +224,11 @@ func (a *agent) reportLifecycleLoop(ctx context.Context) {
})
if err == nil {
lastReported = state
select {
case a.lifecycleReported <- state:
case <-a.lifecycleReported:
a.lifecycleReported <- state
}
break
}
if xerrors.Is(err, context.Canceled) || xerrors.Is(err, context.DeadlineExceeded) {
Expand All @@ -233,13 +240,20 @@ func (a *agent) reportLifecycleLoop(ctx context.Context) {
}
}

// setLifecycle sets the lifecycle state and notifies the lifecycle loop.
// The state is only updated if it's a valid state transition.
func (a *agent) setLifecycle(ctx context.Context, state codersdk.WorkspaceAgentLifecycle) {
a.lifecycleMu.Lock()
defer a.lifecycleMu.Unlock()

a.logger.Debug(ctx, "set lifecycle state", slog.F("state", state), slog.F("previous", a.lifecycleState))

lastState := a.lifecycleState
if slices.Index(codersdk.WorkspaceAgentLifecycleOrder, lastState) > slices.Index(codersdk.WorkspaceAgentLifecycleOrder, state) {
a.logger.Warn(ctx, "attempted to set lifecycle state to a previous state", slog.F("last", lastState), slog.F("state", state))
a.lifecycleMu.Unlock()
return
}
a.lifecycleState = state
a.logger.Debug(ctx, "set lifecycle state", slog.F("state", state), slog.F("last", lastState))
a.lifecycleMu.Unlock()

select {
case a.lifecycleUpdate <- struct{}{}:
default:
Expand Down Expand Up @@ -299,9 +313,10 @@ func (a *agent) run(ctx context.Context) error {
}
}

lifecycleState := codersdk.WorkspaceAgentLifecycleReady
scriptDone := make(chan error, 1)
scriptStart := time.Now()
err := a.trackConnGoroutine(func() {
err = a.trackConnGoroutine(func() {
defer close(scriptDone)
scriptDone <- a.runStartupScript(ctx, metadata.StartupScript)
})
Expand Down Expand Up @@ -329,16 +344,17 @@ func (a *agent) run(ctx context.Context) error {
if errors.Is(err, context.Canceled) {
return
}
execTime := time.Since(scriptStart)
lifecycleStatus := codersdk.WorkspaceAgentLifecycleReady
if err != nil {
a.logger.Warn(ctx, "startup script failed", slog.F("execution_time", execTime), slog.Error(err))
lifecycleStatus = codersdk.WorkspaceAgentLifecycleStartError
} else {
a.logger.Info(ctx, "startup script completed", slog.F("execution_time", execTime))
// Only log if there was a startup script.
if metadata.StartupScript != "" {
execTime := time.Since(scriptStart)
if err != nil {
a.logger.Warn(ctx, "startup script failed", slog.F("execution_time", execTime), slog.Error(err))
lifecycleState = codersdk.WorkspaceAgentLifecycleStartError
} else {
a.logger.Info(ctx, "startup script completed", slog.F("execution_time", execTime))
}
}

a.setLifecycle(ctx, lifecycleStatus)
a.setLifecycle(ctx, lifecycleState)
}()
}

Expand Down Expand Up @@ -606,14 +622,22 @@ func (a *agent) runCoordinator(ctx context.Context, network *tailnet.Conn) error
}

func (a *agent) runStartupScript(ctx context.Context, script string) error {
return a.runScript(ctx, "startup", script)
}

func (a *agent) runShutdownScript(ctx context.Context, script string) error {
return a.runScript(ctx, "shutdown", script)
}

func (a *agent) runScript(ctx context.Context, lifecycle, script string) error {
if script == "" {
return nil
}

a.logger.Info(ctx, "running startup script", slog.F("script", script))
writer, err := a.filesystem.OpenFile(filepath.Join(a.logDir, "coder-startup-script.log"), os.O_CREATE|os.O_RDWR, 0o600)
a.logger.Info(ctx, "running script", slog.F("lifecycle", lifecycle), slog.F("script", script))
writer, err := a.filesystem.OpenFile(filepath.Join(a.logDir, fmt.Sprintf("coder-%s-script.log", lifecycle)), os.O_CREATE|os.O_RDWR, 0o600)
if err != nil {
return xerrors.Errorf("open startup script log file: %w", err)
return xerrors.Errorf("open %s script log file: %w", lifecycle, err)
}
defer func() {
_ = writer.Close()
Expand Down Expand Up @@ -774,7 +798,7 @@ func (a *agent) createCommand(ctx context.Context, rawCommand string, env []stri

rawMetadata := a.metadata.Load()
if rawMetadata == nil {
return nil, xerrors.Errorf("no metadata was provided: %w", err)
return nil, xerrors.Errorf("no metadata was provided")
}
metadata, valid := rawMetadata.(agentsdk.Metadata)
if !valid {
Expand Down Expand Up @@ -1290,13 +1314,73 @@ func (a *agent) Close() error {
if a.isClosed() {
return nil
}

ctx := context.Background()
a.setLifecycle(ctx, codersdk.WorkspaceAgentLifecycleShuttingDown)

lifecycleState := codersdk.WorkspaceAgentLifecycleOff
if metadata, ok := a.metadata.Load().(agentsdk.Metadata); ok && metadata.ShutdownScript != "" {
scriptDone := make(chan error, 1)
scriptStart := time.Now()
go func() {
defer close(scriptDone)
scriptDone <- a.runShutdownScript(ctx, metadata.ShutdownScript)
}()

var timeout <-chan time.Time
// If timeout is zero, an older version of the coder
// provider was used. Otherwise a timeout is always > 0.
if metadata.ShutdownScriptTimeout > 0 {
t := time.NewTimer(metadata.ShutdownScriptTimeout)
defer t.Stop()
timeout = t.C
}

var err error
select {
case err = <-scriptDone:
case <-timeout:
a.logger.Warn(ctx, "shutdown script timed out")
a.setLifecycle(ctx, codersdk.WorkspaceAgentLifecycleShutdownTimeout)
err = <-scriptDone // The script can still complete after a timeout.
}
execTime := time.Since(scriptStart)
if err != nil {
a.logger.Warn(ctx, "shutdown script failed", slog.F("execution_time", execTime), slog.Error(err))
lifecycleState = codersdk.WorkspaceAgentLifecycleShutdownError
} else {
a.logger.Info(ctx, "shutdown script completed", slog.F("execution_time", execTime))
}
}

// Set final state and wait for it to be reported because context
// cancellation will stop the report loop.
a.setLifecycle(ctx, lifecycleState)

// Wait for the lifecycle to be reported, but don't wait forever so
// that we don't break user expectations.
ctx, cancel := context.WithTimeout(ctx, 5*time.Second)
defer cancel()
lifecycleWaitLoop:
for {
select {
case <-ctx.Done():
break lifecycleWaitLoop
case s := <-a.lifecycleReported:
if s == lifecycleState {
break lifecycleWaitLoop
}
}
}

close(a.closed)
a.closeCancel()
_ = a.sshServer.Close()
if a.network != nil {
_ = a.network.Close()
}
_ = a.sshServer.Close()
a.connCloseWait.Wait()

return nil
}

Expand Down
Loading