Skip to content

Commit 1262eef

Browse files
kylecarbs and matifali authored
feat: add support for coder_script (#9584)
* Add basic migrations * Improve schema * Refactor agent scripts into its own package * Support legacy start and stop script format * Pipe the scripts! * Finish the piping * Fix context usage * It works! * Fix sql query * Fix SQL query * Rename `LogSourceID` -> `SourceID` * Fix the FE * fmt * Rename migrations * Fix log tests * Fix lint err * Fix gen * Fix story type * Rename source to script * Fix schema jank * Uncomment test * Rename proto to TimeoutSeconds * Fix comments * Fix comments * Fix legacy endpoint without specified log_source * Fix non-blocking by default in agent * Fix resources tests * Fix dbfake * Fix resources * Fix linting I think * Add fixtures * fmt * Fix startup script behavior * Fix comments * Fix context * Fix cancel * Fix SQL tests * Fix e2e tests * Interrupt on Windows * Fix agent leaking script process * Fix migrations * Fix stories * Fix duplicate logs appearing * Gen * Fix log location * Fix tests * Fix tests * Fix log output * Show display name in output * Fix print * Return timeout on start context * Gen * Fix fixture * Fix the agent status * Fix startup timeout msg * Fix command using shared context * Fix timeout draining * Change signal type * Add deterministic colors to startup script logs --------- Co-authored-by: Muhammad Atif Ali <atif@coder.com>
1 parent dac1375 commit 1262eef

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

61 files changed

+3817
-2114
lines changed

agent/agent.go

Lines changed: 39 additions & 147 deletions
Original file line number | Diff line number | Diff line change
@@ -12,7 +12,6 @@ import (
1212
"net/http"
1313
"net/netip"
1414
"os"
15-
"os/exec"
1615
"os/user"
1716
"path/filepath"
1817
"runtime"
@@ -37,6 +36,7 @@ import (
3736

3837
"cdr.dev/slog"
3938
"github.com/coder/coder/v2/agent/agentproc"
39+
"github.com/coder/coder/v2/agent/agentscripts"
4040
"github.com/coder/coder/v2/agent/agentssh"
4141
"github.com/coder/coder/v2/agent/reconnectingpty"
4242
"github.com/coder/coder/v2/buildinfo"
@@ -196,6 +196,7 @@ type agent struct {
196196

197197
manifest atomic.Pointer[agentsdk.Manifest] // manifest is atomic because values can change after reconnection.
198198
reportMetadataInterval time.Duration
199+
scriptRunner *agentscripts.Runner
199200
serviceBanner atomic.Pointer[codersdk.ServiceBannerConfig] // serviceBanner is atomic because it is periodically updated.
200201
serviceBannerRefreshInterval time.Duration
201202
sessionToken atomic.Pointer[string]
@@ -238,7 +239,13 @@ func (a *agent) init(ctx context.Context) {
238239
sshSrv.Manifest = &a.manifest
239240
sshSrv.ServiceBanner = &a.serviceBanner
240241
a.sshServer = sshSrv
241-
242+
a.scriptRunner = agentscripts.New(agentscripts.Options{
243+
LogDir: a.logDir,
244+
Logger: a.logger,
245+
SSHServer: sshSrv,
246+
Filesystem: a.filesystem,
247+
PatchLogs: a.client.PatchLogs,
248+
})
242249
go a.runLoop(ctx)
243250
}
244251

@@ -657,41 +664,29 @@ func (a *agent) run(ctx context.Context) error {
657664
}
658665
}
659666

660-
lifecycleState := codersdk.WorkspaceAgentLifecycleReady
661-
scriptDone := make(chan error, 1)
662-
err = a.trackConnGoroutine(func() {
663-
defer close(scriptDone)
664-
scriptDone <- a.runStartupScript(ctx, manifest.StartupScript)
665-
})
667+
err = a.scriptRunner.Init(manifest.Scripts)
666668
if err != nil {
667-
return xerrors.Errorf("track startup script: %w", err)
669+
return xerrors.Errorf("init script runner: %w", err)
668670
}
669-
go func() {
670-
var timeout <-chan time.Time
671-
// If timeout is zero, an older version of the coder
672-
// provider was used. Otherwise a timeout is always > 0.
673-
if manifest.StartupScriptTimeout > 0 {
674-
t := time.NewTimer(manifest.StartupScriptTimeout)
675-
defer t.Stop()
676-
timeout = t.C
677-
}
678-
679-
var err error
680-
select {
681-
case err = <-scriptDone:
682-
case <-timeout:
683-
a.logger.Warn(ctx, "script timed out", slog.F("lifecycle", "startup"), slog.F("timeout", manifest.StartupScriptTimeout))
684-
a.setLifecycle(ctx, codersdk.WorkspaceAgentLifecycleStartTimeout)
685-
err = <-scriptDone // The script can still complete after a timeout.
686-
}
671+
err = a.trackConnGoroutine(func() {
672+
err := a.scriptRunner.Execute(ctx, func(script codersdk.WorkspaceAgentScript) bool {
673+
return script.RunOnStart
674+
})
687675
if err != nil {
688-
if errors.Is(err, context.Canceled) {
689-
return
676+
a.logger.Warn(ctx, "startup script failed", slog.Error(err))
677+
if errors.Is(err, agentscripts.ErrTimeout) {
678+
a.setLifecycle(ctx, codersdk.WorkspaceAgentLifecycleStartTimeout)
679+
} else {
680+
a.setLifecycle(ctx, codersdk.WorkspaceAgentLifecycleStartError)
690681
}
691-
lifecycleState = codersdk.WorkspaceAgentLifecycleStartError
682+
} else {
683+
a.setLifecycle(ctx, codersdk.WorkspaceAgentLifecycleReady)
692684
}
693-
a.setLifecycle(ctx, lifecycleState)
694-
}()
685+
a.scriptRunner.StartCron()
686+
})
687+
if err != nil {
688+
return xerrors.Errorf("track conn goroutine: %w", err)
689+
}
695690
}
696691

697692
// This automatically closes when the context ends!
@@ -1006,93 +1001,6 @@ func (a *agent) runDERPMapSubscriber(ctx context.Context, network *tailnet.Conn)
10061001
}
10071002
}
10081003

1009-
func (a *agent) runStartupScript(ctx context.Context, script string) error {
1010-
return a.runScript(ctx, "startup", script)
1011-
}
1012-
1013-
func (a *agent) runShutdownScript(ctx context.Context, script string) error {
1014-
return a.runScript(ctx, "shutdown", script)
1015-
}
1016-
1017-
func (a *agent) runScript(ctx context.Context, lifecycle, script string) (err error) {
1018-
if script == "" {
1019-
return nil
1020-
}
1021-
1022-
logger := a.logger.With(slog.F("lifecycle", lifecycle))
1023-
1024-
logger.Info(ctx, fmt.Sprintf("running %s script", lifecycle), slog.F("script", script))
1025-
fileWriter, err := a.filesystem.OpenFile(filepath.Join(a.logDir, fmt.Sprintf("coder-%s-script.log", lifecycle)), os.O_CREATE|os.O_RDWR, 0o600)
1026-
if err != nil {
1027-
return xerrors.Errorf("open %s script log file: %w", lifecycle, err)
1028-
}
1029-
defer func() {
1030-
err := fileWriter.Close()
1031-
if err != nil {
1032-
logger.Warn(ctx, fmt.Sprintf("close %s script log file", lifecycle), slog.Error(err))
1033-
}
1034-
}()
1035-
1036-
cmdPty, err := a.sshServer.CreateCommand(ctx, script, nil)
1037-
if err != nil {
1038-
return xerrors.Errorf("%s script: create command: %w", lifecycle, err)
1039-
}
1040-
cmd := cmdPty.AsExec()
1041-
1042-
var stdout, stderr io.Writer = fileWriter, fileWriter
1043-
if lifecycle == "startup" {
1044-
send, flushAndClose := agentsdk.LogsSender(a.client.PatchLogs, logger)
1045-
// If ctx is canceled here (or in a writer below), we may be
1046-
// discarding logs, but that's okay because we're shutting down
1047-
// anyway. We could consider creating a new context here if we
1048-
// want better control over flush during shutdown.
1049-
defer func() {
1050-
if err := flushAndClose(ctx); err != nil {
1051-
logger.Warn(ctx, "flush startup logs failed", slog.Error(err))
1052-
}
1053-
}()
1054-
1055-
infoW := agentsdk.StartupLogsWriter(ctx, send, codersdk.WorkspaceAgentLogSourceStartupScript, codersdk.LogLevelInfo)
1056-
defer infoW.Close()
1057-
errW := agentsdk.StartupLogsWriter(ctx, send, codersdk.WorkspaceAgentLogSourceStartupScript, codersdk.LogLevelError)
1058-
defer errW.Close()
1059-
1060-
stdout = io.MultiWriter(fileWriter, infoW)
1061-
stderr = io.MultiWriter(fileWriter, errW)
1062-
}
1063-
1064-
cmd.Stdout = stdout
1065-
cmd.Stderr = stderr
1066-
1067-
start := time.Now()
1068-
defer func() {
1069-
end := time.Now()
1070-
execTime := end.Sub(start)
1071-
exitCode := 0
1072-
if err != nil {
1073-
exitCode = 255 // Unknown status.
1074-
var exitError *exec.ExitError
1075-
if xerrors.As(err, &exitError) {
1076-
exitCode = exitError.ExitCode()
1077-
}
1078-
logger.Warn(ctx, fmt.Sprintf("%s script failed", lifecycle), slog.F("execution_time", execTime), slog.F("exit_code", exitCode), slog.Error(err))
1079-
} else {
1080-
logger.Info(ctx, fmt.Sprintf("%s script completed", lifecycle), slog.F("execution_time", execTime), slog.F("exit_code", exitCode))
1081-
}
1082-
}()
1083-
1084-
err = cmd.Run()
1085-
if err != nil {
1086-
// cmd.Run does not return a context canceled error, it returns "signal: killed".
1087-
if ctx.Err() != nil {
1088-
return ctx.Err()
1089-
}
1090-
1091-
return xerrors.Errorf("%s script: run: %w", lifecycle, err)
1092-
}
1093-
return nil
1094-
}
1095-
10961004
func (a *agent) handleReconnectingPTY(ctx context.Context, logger slog.Logger, msg codersdk.WorkspaceAgentReconnectingPTYInit, conn net.Conn) (retErr error) {
10971005
defer conn.Close()
10981006
a.metrics.connectionsTotal.Add(1)
@@ -1475,39 +1383,23 @@ func (a *agent) Close() error {
14751383
}
14761384

14771385
lifecycleState := codersdk.WorkspaceAgentLifecycleOff
1478-
if manifest := a.manifest.Load(); manifest != nil && manifest.ShutdownScript != "" {
1479-
scriptDone := make(chan error, 1)
1480-
go func() {
1481-
defer close(scriptDone)
1482-
scriptDone <- a.runShutdownScript(ctx, manifest.ShutdownScript)
1483-
}()
1484-
1485-
var timeout <-chan time.Time
1486-
// If timeout is zero, an older version of the coder
1487-
// provider was used. Otherwise a timeout is always > 0.
1488-
if manifest.ShutdownScriptTimeout > 0 {
1489-
t := time.NewTimer(manifest.ShutdownScriptTimeout)
1490-
defer t.Stop()
1491-
timeout = t.C
1492-
}
1493-
1494-
var err error
1495-
select {
1496-
case err = <-scriptDone:
1497-
case <-timeout:
1498-
a.logger.Warn(ctx, "script timed out", slog.F("lifecycle", "shutdown"), slog.F("timeout", manifest.ShutdownScriptTimeout))
1499-
a.setLifecycle(ctx, codersdk.WorkspaceAgentLifecycleShutdownTimeout)
1500-
err = <-scriptDone // The script can still complete after a timeout.
1501-
}
1502-
if err != nil {
1386+
err = a.scriptRunner.Execute(ctx, func(script codersdk.WorkspaceAgentScript) bool {
1387+
return script.RunOnStop
1388+
})
1389+
if err != nil {
1390+
if errors.Is(err, agentscripts.ErrTimeout) {
1391+
lifecycleState = codersdk.WorkspaceAgentLifecycleShutdownTimeout
1392+
} else {
15031393
lifecycleState = codersdk.WorkspaceAgentLifecycleShutdownError
15041394
}
15051395
}
1506-
1507-
// Set final state and wait for it to be reported because context
1508-
// cancellation will stop the report loop.
15091396
a.setLifecycle(ctx, lifecycleState)
15101397

1398+
err = a.scriptRunner.Close()
1399+
if err != nil {
1400+
a.logger.Error(ctx, "script runner close", slog.Error(err))
1401+
}
1402+
15111403
// Wait for the lifecycle to be reported, but don't wait forever so
15121404
// that we don't break user expectations.
15131405
ctx, cancel := context.WithTimeout(ctx, 5*time.Second)

0 commit comments

Comments
 (0)