coder · DanielleMaywood · Sep 24, 2024 · Sep 17, 2024 · Sep 17, 2024 · Sep 17, 2024
diff --git a/agent/agent.go b/agent/agent.go
@@ -941,17 +941,15 @@ func (a *agent) handleManifest(manifestOK *checkpoint) func(ctx context.Context,
 				}
 			}
 
-			err = a.scriptRunner.Init(manifest.Scripts)
+			err = a.scriptRunner.Init(manifest.Scripts, aAPI.ScriptCompleted)
 			if err != nil {
 				return xerrors.Errorf("init script runner: %w", err)
 			}
 			err = a.trackGoroutine(func() {
 				start := time.Now()
 				// here we use the graceful context because the script runner is not directly tied
 				// to the agent API.
-				err := a.scriptRunner.Execute(a.gracefulCtx, func(script codersdk.WorkspaceAgentScript) bool {
-					return script.RunOnStart
-				})
+				err := a.scriptRunner.Execute(a.gracefulCtx, agentscripts.ExecuteStartScripts)
 				// Measure the time immediately after the script has finished
 				dur := time.Since(start).Seconds()
 				if err != nil {
@@ -1844,9 +1842,7 @@ func (a *agent) Close() error {
 	a.gracefulCancel()
 
 	lifecycleState := codersdk.WorkspaceAgentLifecycleOff
-	err = a.scriptRunner.Execute(a.hardCtx, func(script codersdk.WorkspaceAgentScript) bool {
-		return script.RunOnStop
-	})
+	err = a.scriptRunner.Execute(a.hardCtx, agentscripts.ExecuteStopScripts)
 	if err != nil {
 		a.logger.Warn(a.hardCtx, "shutdown script(s) failed", slog.Error(err))
 		if errors.Is(err, agentscripts.ErrTimeout) {

diff --git a/agent/agent_test.go b/agent/agent_test.go
@@ -1517,10 +1517,12 @@ func TestAgent_Lifecycle(t *testing.T) {
 			agentsdk.Manifest{
 				DERPMap: derpMap,
 				Scripts: []codersdk.WorkspaceAgentScript{{
+					ID:         uuid.New(),
 					LogPath:    "coder-startup-script.log",
 					Script:     "echo 1",
 					RunOnStart: true,
 				}, {
+					ID:        uuid.New(),
 					LogPath:   "coder-shutdown-script.log",
 					Script:    "echo " + expected,
 					RunOnStop: true,

diff --git a/agent/agentscripts/agentscripts.go b/agent/agentscripts/agentscripts.go
@@ -19,10 +19,13 @@ import (
 	"github.com/spf13/afero"
 	"golang.org/x/sync/errgroup"
 	"golang.org/x/xerrors"
+	"google.golang.org/protobuf/types/known/timestamppb"
 
 	"cdr.dev/slog"
 
 	"github.com/coder/coder/v2/agent/agentssh"
+	"github.com/coder/coder/v2/agent/proto"
+	"github.com/coder/coder/v2/coderd/database/dbtime"
 	"github.com/coder/coder/v2/codersdk"
 	"github.com/coder/coder/v2/codersdk/agentsdk"
 )
@@ -75,18 +78,21 @@ func New(opts Options) *Runner {
 	}
 }
 
+// ScriptCompletedFunc reports the outcome of a single script run. The Runner
+// invokes it when a run ends, passing a request whose Timing carries the
+// script ID, start and end times, exit code, stage and status.
+type ScriptCompletedFunc func(ctx context.Context, req *proto.WorkspaceAgentScriptCompletedRequest) (*proto.WorkspaceAgentScriptCompletedResponse, error)
+
 type Runner struct {
 	Options
 
-	cronCtx       context.Context
-	cronCtxCancel context.CancelFunc
-	cmdCloseWait  sync.WaitGroup
-	closed        chan struct{}
-	closeMutex    sync.Mutex
-	cron          *cron.Cron
-	initialized   atomic.Bool
-	scripts       []codersdk.WorkspaceAgentScript
-	dataDir       string
+	cronCtx         context.Context
+	cronCtxCancel   context.CancelFunc
+	cmdCloseWait    sync.WaitGroup
+	closed          chan struct{}
+	closeMutex      sync.Mutex
+	cron            *cron.Cron
+	initialized     atomic.Bool
+	scripts         []codersdk.WorkspaceAgentScript
+	dataDir         string
+	scriptCompleted ScriptCompletedFunc
 
 	// scriptsExecuted includes all scripts executed by the workspace agent. Agents
 	// execute startup scripts, and scripts on a cron schedule. Both will increment
@@ -116,12 +122,13 @@ func (r *Runner) RegisterMetrics(reg prometheus.Registerer) {
 // Init initializes the runner with the provided scripts.
 // It also schedules any scripts that have a schedule.
 // This function must be called before Execute.
-func (r *Runner) Init(scripts []codersdk.WorkspaceAgentScript) error {
+func (r *Runner) Init(scripts []codersdk.WorkspaceAgentScript, scriptCompleted ScriptCompletedFunc) error {
 	if r.initialized.Load() {
 		return xerrors.New("init: already initialized")
 	}
 	r.initialized.Store(true)
 	r.scripts = scripts
+	r.scriptCompleted = scriptCompleted
 	r.Logger.Info(r.cronCtx, "initializing agent scripts", slog.F("script_count", len(scripts)), slog.F("log_dir", r.LogDir))
 
 	err := r.Filesystem.MkdirAll(r.ScriptBinDir(), 0o700)
@@ -135,7 +142,7 @@ func (r *Runner) Init(scripts []codersdk.WorkspaceAgentScript) error {
 		}
 		script := script
 		_, err := r.cron.AddFunc(script.Cron, func() {
-			err := r.trackRun(r.cronCtx, script)
+			err := r.trackRun(r.cronCtx, script, ExecuteCronScripts)
 			if err != nil {
 				r.Logger.Warn(context.Background(), "run agent script on schedule", slog.Error(err))
 			}
@@ -172,22 +179,33 @@ func (r *Runner) StartCron() {
 	}
 }
 
+// ExecuteOption describes what scripts we want to execute. It is passed to
+// Execute to select a subset of the Runner's scripts, and is forwarded to the
+// completion report so the run can be attributed to a stage.
+type ExecuteOption int
+
+// ExecuteOption enums.
+const (
+	// ExecuteAllScripts selects every script. It is the zero value of
+	// ExecuteOption. No timing stage is mapped to it, so runs started with
+	// this option are reported with the zero-value stage.
+	ExecuteAllScripts ExecuteOption = iota
+	// ExecuteStartScripts selects scripts that have RunOnStart set.
+	ExecuteStartScripts
+	// ExecuteStopScripts selects scripts that have RunOnStop set.
+	ExecuteStopScripts
+	// ExecuteCronScripts selects scripts that have a non-empty Cron schedule.
+	ExecuteCronScripts
+)
+
-// Execute runs a set of scripts according to a filter.
+// Execute runs the scripts selected by option, each in its own goroutine.
-func (r *Runner) Execute(ctx context.Context, filter func(script codersdk.WorkspaceAgentScript) bool) error {
-	if filter == nil {
-		// Execute em' all!
-		filter = func(script codersdk.WorkspaceAgentScript) bool {
-			return true
-		}
-	}
+func (r *Runner) Execute(ctx context.Context, option ExecuteOption) error {
 	var eg errgroup.Group
 	for _, script := range r.scripts {
-		if !filter(script) {
+		runScript := (option == ExecuteStartScripts && script.RunOnStart) ||
+			(option == ExecuteStopScripts && script.RunOnStop) ||
+			(option == ExecuteCronScripts && script.Cron != "") ||
+			option == ExecuteAllScripts
+
+		if !runScript {
 			continue
 		}
+
 		script := script
 		eg.Go(func() error {
-			err := r.trackRun(ctx, script)
+			err := r.trackRun(ctx, script, option)
 			if err != nil {
 				return xerrors.Errorf("run agent script %q: %w", script.LogSourceID, err)
 			}
@@ -198,8 +216,8 @@ func (r *Runner) Execute(ctx context.Context, filter func(script codersdk.Worksp
 }
 
 // trackRun wraps "run" with metrics.
-func (r *Runner) trackRun(ctx context.Context, script codersdk.WorkspaceAgentScript) error {
-	err := r.run(ctx, script)
+func (r *Runner) trackRun(ctx context.Context, script codersdk.WorkspaceAgentScript, option ExecuteOption) error {
+	err := r.run(ctx, script, option)
 	if err != nil {
 		r.scriptsExecuted.WithLabelValues("false").Add(1)
 	} else {
@@ -212,7 +230,7 @@ func (r *Runner) trackRun(ctx context.Context, script codersdk.WorkspaceAgentScr
 // If the timeout is exceeded, the process is sent an interrupt signal.
 // If the process does not exit after a few seconds, it is forcefully killed.
 // This function immediately returns after a timeout, and does not wait for the process to exit.
-func (r *Runner) run(ctx context.Context, script codersdk.WorkspaceAgentScript) error {
+func (r *Runner) run(ctx context.Context, script codersdk.WorkspaceAgentScript, option ExecuteOption) error {
 	logPath := script.LogPath
 	if logPath == "" {
 		logPath = fmt.Sprintf("coder-script-%s.log", script.LogSourceID)
@@ -299,9 +317,9 @@ func (r *Runner) run(ctx context.Context, script codersdk.WorkspaceAgentScript)
 	cmd.Stdout = io.MultiWriter(fileWriter, infoW)
 	cmd.Stderr = io.MultiWriter(fileWriter, errW)
 
-	start := time.Now()
+	start := dbtime.Now()
 	defer func() {
-		end := time.Now()
+		end := dbtime.Now()
 		execTime := end.Sub(start)
 		exitCode := 0
 		if err != nil {
@@ -314,6 +332,60 @@ func (r *Runner) run(ctx context.Context, script codersdk.WorkspaceAgentScript)
 		} else {
 			logger.Info(ctx, fmt.Sprintf("%s script completed", logPath), slog.F("execution_time", execTime), slog.F("exit_code", exitCode))
 		}
+
+		if r.scriptCompleted == nil {
+			logger.Debug(ctx, "r.scriptCompleted unexpectedly nil")
+			return
+		}
+
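+		// Inspect err before handing work to the goroutine: err is reassigned
+		// below with the result of trackCommandGoroutine, so reading it from
+		// inside the goroutine would race with that write.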
+		timedOut := errors.Is(err, ErrTimeout)
+		pipesLeftOpen := errors.Is(err, ErrOutputPipesOpen)
+
+		err = r.trackCommandGoroutine(func() {
+			var stage proto.Timing_Stage
+			switch option {
+			case ExecuteStartScripts:
+				stage = proto.Timing_START
+			case ExecuteStopScripts:
+				stage = proto.Timing_STOP
+			case ExecuteCronScripts:
+				stage = proto.Timing_CRON
+			}
+
+			var status proto.Timing_Status
+			switch {
+			case timedOut:
+				status = proto.Timing_TIMED_OUT
+			case pipesLeftOpen:
+				status = proto.Timing_PIPES_LEFT_OPEN
+			case exitCode != 0:
+				status = proto.Timing_EXIT_FAILURE
+			default:
+				status = proto.Timing_OK
+			}
+
+			reportTimeout := 30 * time.Second
+			reportCtx, cancel := context.WithTimeout(context.Background(), reportTimeout)
+			defer cancel()
+
+			_, err := r.scriptCompleted(reportCtx, &proto.WorkspaceAgentScriptCompletedRequest{
+				Timing: &proto.Timing{
+					ScriptId: script.ID[:],
+					Start:    timestamppb.New(start),
+					End:      timestamppb.New(end),
+					ExitCode: int32(exitCode),
+					Stage:    stage,
+					Status:   status,
+				},
+			})
+			if err != nil {
+				logger.Error(ctx, fmt.Sprintf("reporting script completed: %s", err.Error()))
+			}
+		})
+		if err != nil {
+			logger.Error(ctx, fmt.Sprintf("reporting script completed: track command goroutine: %s", err.Error()))
+		}
 	}()
 
 	err = cmd.Start()

diff --git a/agent/agentscripts/agentscripts_test.go b/agent/agentscripts/agentscripts_test.go
@@ -17,6 +17,7 @@ import (
 	"cdr.dev/slog/sloggers/slogtest"
 	"github.com/coder/coder/v2/agent/agentscripts"
 	"github.com/coder/coder/v2/agent/agentssh"
+	"github.com/coder/coder/v2/agent/agenttest"
 	"github.com/coder/coder/v2/codersdk"
 	"github.com/coder/coder/v2/codersdk/agentsdk"
 	"github.com/coder/coder/v2/testutil"
@@ -34,14 +35,13 @@ func TestExecuteBasic(t *testing.T) {
 		return fLogger
 	})
 	defer runner.Close()
+	aAPI := agenttest.NewFakeAgentAPI(t, slogtest.Make(t, nil), nil, nil)
 	err := runner.Init([]codersdk.WorkspaceAgentScript{{
 		LogSourceID: uuid.New(),
 		Script:      "echo hello",
-	}})
+	}}, aAPI.ScriptCompleted)
 	require.NoError(t, err)
-	require.NoError(t, runner.Execute(context.Background(), func(script codersdk.WorkspaceAgentScript) bool {
-		return true
-	}))
+	require.NoError(t, runner.Execute(context.Background(), agentscripts.ExecuteAllScripts))
 	log := testutil.RequireRecvCtx(ctx, t, fLogger.logs)
 	require.Equal(t, "hello", log.Output)
 }
@@ -61,18 +61,17 @@ func TestEnv(t *testing.T) {
 			cmd.exe /c echo %CODER_SCRIPT_BIN_DIR%
 		`
 	}
+	aAPI := agenttest.NewFakeAgentAPI(t, slogtest.Make(t, nil), nil, nil)
 	err := runner.Init([]codersdk.WorkspaceAgentScript{{
 		LogSourceID: id,
 		Script:      script,
-	}})
+	}}, aAPI.ScriptCompleted)
 	require.NoError(t, err)
 
 	ctx := testutil.Context(t, testutil.WaitLong)
 
 	done := testutil.Go(t, func() {
-		err := runner.Execute(ctx, func(script codersdk.WorkspaceAgentScript) bool {
-			return true
-		})
+		err := runner.Execute(ctx, agentscripts.ExecuteAllScripts)
 		assert.NoError(t, err)
 	})
 	defer func() {
@@ -103,13 +102,44 @@ func TestTimeout(t *testing.T) {
 	t.Parallel()
 	runner := setup(t, nil)
 	defer runner.Close()
+	aAPI := agenttest.NewFakeAgentAPI(t, slogtest.Make(t, nil), nil, nil)
 	err := runner.Init([]codersdk.WorkspaceAgentScript{{
 		LogSourceID: uuid.New(),
 		Script:      "sleep infinity",
 		Timeout:     time.Millisecond,
-	}})
+	}}, aAPI.ScriptCompleted)
 	require.NoError(t, err)
-	require.ErrorIs(t, runner.Execute(context.Background(), nil), agentscripts.ErrTimeout)
+	require.ErrorIs(t, runner.Execute(context.Background(), agentscripts.ExecuteAllScripts), agentscripts.ErrTimeout)
+}
+
+// TestScriptReportsTiming verifies that executing a script reports exactly
+// one timing through the ScriptCompleted callback, with a zero exit code and
+// an end time that is not before its start time.
+func TestScriptReportsTiming(t *testing.T) {
+	t.Parallel()
+
+	ctx := testutil.Context(t, testutil.WaitShort)
+	fLogger := newFakeScriptLogger()
+	// The log source ID is irrelevant here; every script shares one logger.
+	runner := setup(t, func(_ uuid.UUID) agentscripts.ScriptLogger {
+		return fLogger
+	})
+
+	aAPI := agenttest.NewFakeAgentAPI(t, slogtest.Make(t, nil), nil, nil)
+	err := runner.Init([]codersdk.WorkspaceAgentScript{{
+		DisplayName: "say-hello",
+		LogSourceID: uuid.New(),
+		Script:      "echo hello",
+	}}, aAPI.ScriptCompleted)
+	require.NoError(t, err)
+	require.NoError(t, runner.Execute(ctx, agentscripts.ExecuteAllScripts))
+	// NOTE(review): Close is called before reading the timings, presumably so
+	// the asynchronous completion report has finished by then — confirm that
+	// Close waits for it.
+	runner.Close()
+
+	log := testutil.RequireRecvCtx(ctx, t, fLogger.logs)
+	require.Equal(t, "hello", log.Output)
+
+	// require.Len prints the slice contents on failure, unlike comparing len().
+	timings := aAPI.GetTimings()
+	require.Len(t, timings, 1)
+
+	timing := timings[0]
+	require.Equal(t, int32(0), timing.ExitCode)
+	require.GreaterOrEqual(t, timing.End.AsTime(), timing.Start.AsTime())
 }
 
 // TestCronClose exists because cron.Run() can happen after cron.Close().

diff --git a/agent/agenttest/client.go b/agent/agenttest/client.go
@@ -170,6 +170,7 @@ type FakeAgentAPI struct {
 	logsCh          chan<- *agentproto.BatchCreateLogsRequest
 	lifecycleStates []codersdk.WorkspaceAgentLifecycle
 	metadata        map[string]agentsdk.Metadata
+	timings         []*agentproto.Timing
 
 	getAnnouncementBannersFunc func() ([]codersdk.BannerConfig, error)
 }
@@ -182,6 +183,12 @@ func (*FakeAgentAPI) GetServiceBanner(context.Context, *agentproto.GetServiceBan
 	return &agentproto.ServiceBanner{}, nil
 }
 
+// GetTimings returns a shallow copy of the timings recorded so far. The copy
+// is taken under the lock, so callers can read it without further locking.
+func (f *FakeAgentAPI) GetTimings() []*agentproto.Timing {
+	f.Lock()
+	defer f.Unlock()
+
+	recorded := slices.Clone(f.timings)
+	return recorded
+}
+
 func (f *FakeAgentAPI) SetAnnouncementBannersFunc(fn func() ([]codersdk.BannerConfig, error)) {
 	f.Lock()
 	defer f.Unlock()
@@ -301,6 +308,14 @@ func (f *FakeAgentAPI) BatchCreateLogs(ctx context.Context, req *agentproto.Batc
 	return &agentproto.BatchCreateLogsResponse{}, nil
 }
 
+// ScriptCompleted records the timing carried by req so that tests can read it
+// back with GetTimings. It never fails and always returns an empty response.
+func (f *FakeAgentAPI) ScriptCompleted(_ context.Context, req *agentproto.WorkspaceAgentScriptCompletedRequest) (*agentproto.WorkspaceAgentScriptCompletedResponse, error) {
+	f.Lock()
+	defer f.Unlock()
+
+	f.timings = append(f.timings, req.Timing)
+	return &agentproto.WorkspaceAgentScriptCompletedResponse{}, nil
+}
+
 func NewFakeAgentAPI(t testing.TB, logger slog.Logger, manifest *agentproto.Manifest, statsCh chan *agentproto.Stats) *FakeAgentAPI {
 	return &FakeAgentAPI{
 		t:           t,