Skip to content

Commit ae522c5

Browse files
feat: add agent timings (coder#14713)
* feat: begin impl of agent script timings * feat: add job_id and display_name to script timings * fix: increment migration number * fix: rename migrations from 251 to 254 * test: get tests compiling * fix: appease the linter * fix: get tests passing again * fix: drop column from correct table * test: add fixture for agent script timings * fix: typo * fix: use job id used in provisioner job timings * fix: increment migration number * test: behaviour of script runner * test: rewrite test * test: does exit 1 script break things? * test: rewrite test again * fix: revert change Not sure how this came to be, I do not recall manually changing these files. * fix: let code breathe * fix: wrap errors * fix: justify nolint * fix: swap require.Equal argument order * fix: add mutex operations * feat: add 'ran_on_start' and 'blocked_login' fields * fix: update testdata fixture * fix: refer to agent_id instead of job_id in timings * fix: JobID -> AgentID in dbauthz_test * fix: add 'id' to scripts, make timing refer to script id * fix: fix broken tests and convert bug * fix: update testdata fixtures * fix: update testdata fixtures again * feat: capture stage and if script timed out * fix: update migration number * test: add test for script api * fix: fake db query * fix: use UTC time * fix: ensure r.scriptComplete is not nil * fix: move err check to right after call * fix: uppercase sql * fix: use dbtime.Now() * fix: debug log on r.scriptCompleted being nil * fix: ensure correct rbac permissions * chore: remove DisplayName * fix: get tests passing * fix: remove space in sql up * docs: document ExecuteOption * fix: drop 'RETURNING' from sql * chore: remove 'display_name' from timing table * fix: testdata fixture * fix: put r.scriptCompleted call in goroutine * fix: track goroutine for test + use separate context for reporting * fix: appease linter, handle trackCommandGoroutine error * fix: resolve race condition * feat: replace timed_out column with status column * test: update testdata fixture * fix: apply suggestions from review * revert: linter changes
1 parent b894407 commit ae522c5

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

43 files changed

+1367
-232
lines changed

agent/agent.go

+3-7
Original file line numberDiff line numberDiff line change
@@ -941,17 +941,15 @@ func (a *agent) handleManifest(manifestOK *checkpoint) func(ctx context.Context,
941941
}
942942
}
943943

944-
err = a.scriptRunner.Init(manifest.Scripts)
944+
err = a.scriptRunner.Init(manifest.Scripts, aAPI.ScriptCompleted)
945945
if err != nil {
946946
return xerrors.Errorf("init script runner: %w", err)
947947
}
948948
err = a.trackGoroutine(func() {
949949
start := time.Now()
950950
// here we use the graceful context because the script runner is not directly tied
951951
// to the agent API.
952-
err := a.scriptRunner.Execute(a.gracefulCtx, func(script codersdk.WorkspaceAgentScript) bool {
953-
return script.RunOnStart
954-
})
952+
err := a.scriptRunner.Execute(a.gracefulCtx, agentscripts.ExecuteStartScripts)
955953
// Measure the time immediately after the script has finished
956954
dur := time.Since(start).Seconds()
957955
if err != nil {
@@ -1844,9 +1842,7 @@ func (a *agent) Close() error {
18441842
a.gracefulCancel()
18451843

18461844
lifecycleState := codersdk.WorkspaceAgentLifecycleOff
1847-
err = a.scriptRunner.Execute(a.hardCtx, func(script codersdk.WorkspaceAgentScript) bool {
1848-
return script.RunOnStop
1849-
})
1845+
err = a.scriptRunner.Execute(a.hardCtx, agentscripts.ExecuteStopScripts)
18501846
if err != nil {
18511847
a.logger.Warn(a.hardCtx, "shutdown script(s) failed", slog.Error(err))
18521848
if errors.Is(err, agentscripts.ErrTimeout) {

agent/agent_test.go

+2
Original file line numberDiff line numberDiff line change
@@ -1517,10 +1517,12 @@ func TestAgent_Lifecycle(t *testing.T) {
15171517
agentsdk.Manifest{
15181518
DERPMap: derpMap,
15191519
Scripts: []codersdk.WorkspaceAgentScript{{
1520+
ID: uuid.New(),
15201521
LogPath: "coder-startup-script.log",
15211522
Script: "echo 1",
15221523
RunOnStart: true,
15231524
}, {
1525+
ID: uuid.New(),
15241526
LogPath: "coder-shutdown-script.log",
15251527
Script: "echo " + expected,
15261528
RunOnStop: true,

agent/agentscripts/agentscripts.go

+97-25
Original file line numberDiff line numberDiff line change
@@ -19,10 +19,13 @@ import (
1919
"github.com/spf13/afero"
2020
"golang.org/x/sync/errgroup"
2121
"golang.org/x/xerrors"
22+
"google.golang.org/protobuf/types/known/timestamppb"
2223

2324
"cdr.dev/slog"
2425

2526
"github.com/coder/coder/v2/agent/agentssh"
27+
"github.com/coder/coder/v2/agent/proto"
28+
"github.com/coder/coder/v2/coderd/database/dbtime"
2629
"github.com/coder/coder/v2/codersdk"
2730
"github.com/coder/coder/v2/codersdk/agentsdk"
2831
)
@@ -75,18 +78,21 @@ func New(opts Options) *Runner {
7578
}
7679
}
7780

81+
type ScriptCompletedFunc func(context.Context, *proto.WorkspaceAgentScriptCompletedRequest) (*proto.WorkspaceAgentScriptCompletedResponse, error)
82+
7883
type Runner struct {
7984
Options
8085

81-
cronCtx context.Context
82-
cronCtxCancel context.CancelFunc
83-
cmdCloseWait sync.WaitGroup
84-
closed chan struct{}
85-
closeMutex sync.Mutex
86-
cron *cron.Cron
87-
initialized atomic.Bool
88-
scripts []codersdk.WorkspaceAgentScript
89-
dataDir string
86+
cronCtx context.Context
87+
cronCtxCancel context.CancelFunc
88+
cmdCloseWait sync.WaitGroup
89+
closed chan struct{}
90+
closeMutex sync.Mutex
91+
cron *cron.Cron
92+
initialized atomic.Bool
93+
scripts []codersdk.WorkspaceAgentScript
94+
dataDir string
95+
scriptCompleted ScriptCompletedFunc
9096

9197
// scriptsExecuted includes all scripts executed by the workspace agent. Agents
9298
// execute startup scripts, and scripts on a cron schedule. Both will increment
@@ -116,12 +122,13 @@ func (r *Runner) RegisterMetrics(reg prometheus.Registerer) {
116122
// Init initializes the runner with the provided scripts.
117123
// It also schedules any scripts that have a schedule.
118124
// This function must be called before Execute.
119-
func (r *Runner) Init(scripts []codersdk.WorkspaceAgentScript) error {
125+
func (r *Runner) Init(scripts []codersdk.WorkspaceAgentScript, scriptCompleted ScriptCompletedFunc) error {
120126
if r.initialized.Load() {
121127
return xerrors.New("init: already initialized")
122128
}
123129
r.initialized.Store(true)
124130
r.scripts = scripts
131+
r.scriptCompleted = scriptCompleted
125132
r.Logger.Info(r.cronCtx, "initializing agent scripts", slog.F("script_count", len(scripts)), slog.F("log_dir", r.LogDir))
126133

127134
err := r.Filesystem.MkdirAll(r.ScriptBinDir(), 0o700)
@@ -135,7 +142,7 @@ func (r *Runner) Init(scripts []codersdk.WorkspaceAgentScript) error {
135142
}
136143
script := script
137144
_, err := r.cron.AddFunc(script.Cron, func() {
138-
err := r.trackRun(r.cronCtx, script)
145+
err := r.trackRun(r.cronCtx, script, ExecuteCronScripts)
139146
if err != nil {
140147
r.Logger.Warn(context.Background(), "run agent script on schedule", slog.Error(err))
141148
}
@@ -172,22 +179,33 @@ func (r *Runner) StartCron() {
172179
}
173180
}
174181

182+
// ExecuteOption describes what scripts we want to execute.
183+
type ExecuteOption int
184+
185+
// ExecuteOption enums.
186+
const (
187+
ExecuteAllScripts ExecuteOption = iota
188+
ExecuteStartScripts
189+
ExecuteStopScripts
190+
ExecuteCronScripts
191+
)
192+
175193
// Execute runs a set of scripts according to a filter.
176-
func (r *Runner) Execute(ctx context.Context, filter func(script codersdk.WorkspaceAgentScript) bool) error {
177-
if filter == nil {
178-
// Execute em' all!
179-
filter = func(script codersdk.WorkspaceAgentScript) bool {
180-
return true
181-
}
182-
}
194+
func (r *Runner) Execute(ctx context.Context, option ExecuteOption) error {
183195
var eg errgroup.Group
184196
for _, script := range r.scripts {
185-
if !filter(script) {
197+
runScript := (option == ExecuteStartScripts && script.RunOnStart) ||
198+
(option == ExecuteStopScripts && script.RunOnStop) ||
199+
(option == ExecuteCronScripts && script.Cron != "") ||
200+
option == ExecuteAllScripts
201+
202+
if !runScript {
186203
continue
187204
}
205+
188206
script := script
189207
eg.Go(func() error {
190-
err := r.trackRun(ctx, script)
208+
err := r.trackRun(ctx, script, option)
191209
if err != nil {
192210
return xerrors.Errorf("run agent script %q: %w", script.LogSourceID, err)
193211
}
@@ -198,8 +216,8 @@ func (r *Runner) Execute(ctx context.Context, filter func(script codersdk.Worksp
198216
}
199217

200218
// trackRun wraps "run" with metrics.
201-
func (r *Runner) trackRun(ctx context.Context, script codersdk.WorkspaceAgentScript) error {
202-
err := r.run(ctx, script)
219+
func (r *Runner) trackRun(ctx context.Context, script codersdk.WorkspaceAgentScript, option ExecuteOption) error {
220+
err := r.run(ctx, script, option)
203221
if err != nil {
204222
r.scriptsExecuted.WithLabelValues("false").Add(1)
205223
} else {
@@ -212,7 +230,7 @@ func (r *Runner) trackRun(ctx context.Context, script codersdk.WorkspaceAgentScr
212230
// If the timeout is exceeded, the process is sent an interrupt signal.
213231
// If the process does not exit after a few seconds, it is forcefully killed.
214232
// This function immediately returns after a timeout, and does not wait for the process to exit.
215-
func (r *Runner) run(ctx context.Context, script codersdk.WorkspaceAgentScript) error {
233+
func (r *Runner) run(ctx context.Context, script codersdk.WorkspaceAgentScript, option ExecuteOption) error {
216234
logPath := script.LogPath
217235
if logPath == "" {
218236
logPath = fmt.Sprintf("coder-script-%s.log", script.LogSourceID)
@@ -299,9 +317,9 @@ func (r *Runner) run(ctx context.Context, script codersdk.WorkspaceAgentScript)
299317
cmd.Stdout = io.MultiWriter(fileWriter, infoW)
300318
cmd.Stderr = io.MultiWriter(fileWriter, errW)
301319

302-
start := time.Now()
320+
start := dbtime.Now()
303321
defer func() {
304-
end := time.Now()
322+
end := dbtime.Now()
305323
execTime := end.Sub(start)
306324
exitCode := 0
307325
if err != nil {
@@ -314,6 +332,60 @@ func (r *Runner) run(ctx context.Context, script codersdk.WorkspaceAgentScript)
314332
} else {
315333
logger.Info(ctx, fmt.Sprintf("%s script completed", logPath), slog.F("execution_time", execTime), slog.F("exit_code", exitCode))
316334
}
335+
336+
if r.scriptCompleted == nil {
337+
logger.Debug(ctx, "r.scriptCompleted unexpectedly nil")
338+
return
339+
}
340+
341+
// We want to check this outside of the goroutine to avoid a race condition
342+
timedOut := errors.Is(err, ErrTimeout)
343+
pipesLeftOpen := errors.Is(err, ErrOutputPipesOpen)
344+
345+
err = r.trackCommandGoroutine(func() {
346+
var stage proto.Timing_Stage
347+
switch option {
348+
case ExecuteStartScripts:
349+
stage = proto.Timing_START
350+
case ExecuteStopScripts:
351+
stage = proto.Timing_STOP
352+
case ExecuteCronScripts:
353+
stage = proto.Timing_CRON
354+
}
355+
356+
var status proto.Timing_Status
357+
switch {
358+
case timedOut:
359+
status = proto.Timing_TIMED_OUT
360+
case pipesLeftOpen:
361+
status = proto.Timing_PIPES_LEFT_OPEN
362+
case exitCode != 0:
363+
status = proto.Timing_EXIT_FAILURE
364+
default:
365+
status = proto.Timing_OK
366+
}
367+
368+
reportTimeout := 30 * time.Second
369+
reportCtx, cancel := context.WithTimeout(context.Background(), reportTimeout)
370+
defer cancel()
371+
372+
_, err := r.scriptCompleted(reportCtx, &proto.WorkspaceAgentScriptCompletedRequest{
373+
Timing: &proto.Timing{
374+
ScriptId: script.ID[:],
375+
Start: timestamppb.New(start),
376+
End: timestamppb.New(end),
377+
ExitCode: int32(exitCode),
378+
Stage: stage,
379+
Status: status,
380+
},
381+
})
382+
if err != nil {
383+
logger.Error(ctx, fmt.Sprintf("reporting script completed: %s", err.Error()))
384+
}
385+
})
386+
if err != nil {
387+
logger.Error(ctx, fmt.Sprintf("reporting script completed: track command goroutine: %s", err.Error()))
388+
}
317389
}()
318390

319391
err = cmd.Start()

agent/agentscripts/agentscripts_test.go

+40-10
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ import (
1717
"cdr.dev/slog/sloggers/slogtest"
1818
"github.com/coder/coder/v2/agent/agentscripts"
1919
"github.com/coder/coder/v2/agent/agentssh"
20+
"github.com/coder/coder/v2/agent/agenttest"
2021
"github.com/coder/coder/v2/codersdk"
2122
"github.com/coder/coder/v2/codersdk/agentsdk"
2223
"github.com/coder/coder/v2/testutil"
@@ -34,14 +35,13 @@ func TestExecuteBasic(t *testing.T) {
3435
return fLogger
3536
})
3637
defer runner.Close()
38+
aAPI := agenttest.NewFakeAgentAPI(t, slogtest.Make(t, nil), nil, nil)
3739
err := runner.Init([]codersdk.WorkspaceAgentScript{{
3840
LogSourceID: uuid.New(),
3941
Script: "echo hello",
40-
}})
42+
}}, aAPI.ScriptCompleted)
4143
require.NoError(t, err)
42-
require.NoError(t, runner.Execute(context.Background(), func(script codersdk.WorkspaceAgentScript) bool {
43-
return true
44-
}))
44+
require.NoError(t, runner.Execute(context.Background(), agentscripts.ExecuteAllScripts))
4545
log := testutil.RequireRecvCtx(ctx, t, fLogger.logs)
4646
require.Equal(t, "hello", log.Output)
4747
}
@@ -61,18 +61,17 @@ func TestEnv(t *testing.T) {
6161
cmd.exe /c echo %CODER_SCRIPT_BIN_DIR%
6262
`
6363
}
64+
aAPI := agenttest.NewFakeAgentAPI(t, slogtest.Make(t, nil), nil, nil)
6465
err := runner.Init([]codersdk.WorkspaceAgentScript{{
6566
LogSourceID: id,
6667
Script: script,
67-
}})
68+
}}, aAPI.ScriptCompleted)
6869
require.NoError(t, err)
6970

7071
ctx := testutil.Context(t, testutil.WaitLong)
7172

7273
done := testutil.Go(t, func() {
73-
err := runner.Execute(ctx, func(script codersdk.WorkspaceAgentScript) bool {
74-
return true
75-
})
74+
err := runner.Execute(ctx, agentscripts.ExecuteAllScripts)
7675
assert.NoError(t, err)
7776
})
7877
defer func() {
@@ -103,13 +102,44 @@ func TestTimeout(t *testing.T) {
103102
t.Parallel()
104103
runner := setup(t, nil)
105104
defer runner.Close()
105+
aAPI := agenttest.NewFakeAgentAPI(t, slogtest.Make(t, nil), nil, nil)
106106
err := runner.Init([]codersdk.WorkspaceAgentScript{{
107107
LogSourceID: uuid.New(),
108108
Script: "sleep infinity",
109109
Timeout: time.Millisecond,
110-
}})
110+
}}, aAPI.ScriptCompleted)
111111
require.NoError(t, err)
112-
require.ErrorIs(t, runner.Execute(context.Background(), nil), agentscripts.ErrTimeout)
112+
require.ErrorIs(t, runner.Execute(context.Background(), agentscripts.ExecuteAllScripts), agentscripts.ErrTimeout)
113+
}
114+
115+
func TestScriptReportsTiming(t *testing.T) {
116+
t.Parallel()
117+
118+
ctx := testutil.Context(t, testutil.WaitShort)
119+
fLogger := newFakeScriptLogger()
120+
runner := setup(t, func(uuid2 uuid.UUID) agentscripts.ScriptLogger {
121+
return fLogger
122+
})
123+
124+
aAPI := agenttest.NewFakeAgentAPI(t, slogtest.Make(t, nil), nil, nil)
125+
err := runner.Init([]codersdk.WorkspaceAgentScript{{
126+
DisplayName: "say-hello",
127+
LogSourceID: uuid.New(),
128+
Script: "echo hello",
129+
}}, aAPI.ScriptCompleted)
130+
require.NoError(t, err)
131+
require.NoError(t, runner.Execute(ctx, agentscripts.ExecuteAllScripts))
132+
runner.Close()
133+
134+
log := testutil.RequireRecvCtx(ctx, t, fLogger.logs)
135+
require.Equal(t, "hello", log.Output)
136+
137+
timings := aAPI.GetTimings()
138+
require.Equal(t, 1, len(timings))
139+
140+
timing := timings[0]
141+
require.Equal(t, int32(0), timing.ExitCode)
142+
require.GreaterOrEqual(t, timing.End.AsTime(), timing.Start.AsTime())
113143
}
114144

115145
// TestCronClose exists because cron.Run() can happen after cron.Close().

agent/agenttest/client.go

+15
Original file line numberDiff line numberDiff line change
@@ -170,6 +170,7 @@ type FakeAgentAPI struct {
170170
logsCh chan<- *agentproto.BatchCreateLogsRequest
171171
lifecycleStates []codersdk.WorkspaceAgentLifecycle
172172
metadata map[string]agentsdk.Metadata
173+
timings []*agentproto.Timing
173174

174175
getAnnouncementBannersFunc func() ([]codersdk.BannerConfig, error)
175176
}
@@ -182,6 +183,12 @@ func (*FakeAgentAPI) GetServiceBanner(context.Context, *agentproto.GetServiceBan
182183
return &agentproto.ServiceBanner{}, nil
183184
}
184185

186+
func (f *FakeAgentAPI) GetTimings() []*agentproto.Timing {
187+
f.Lock()
188+
defer f.Unlock()
189+
return slices.Clone(f.timings)
190+
}
191+
185192
func (f *FakeAgentAPI) SetAnnouncementBannersFunc(fn func() ([]codersdk.BannerConfig, error)) {
186193
f.Lock()
187194
defer f.Unlock()
@@ -301,6 +308,14 @@ func (f *FakeAgentAPI) BatchCreateLogs(ctx context.Context, req *agentproto.Batc
301308
return &agentproto.BatchCreateLogsResponse{}, nil
302309
}
303310

311+
func (f *FakeAgentAPI) ScriptCompleted(_ context.Context, req *agentproto.WorkspaceAgentScriptCompletedRequest) (*agentproto.WorkspaceAgentScriptCompletedResponse, error) {
312+
f.Lock()
313+
f.timings = append(f.timings, req.Timing)
314+
f.Unlock()
315+
316+
return &agentproto.WorkspaceAgentScriptCompletedResponse{}, nil
317+
}
318+
304319
func NewFakeAgentAPI(t testing.TB, logger slog.Logger, manifest *agentproto.Manifest, statsCh chan *agentproto.Stats) *FakeAgentAPI {
305320
return &FakeAgentAPI{
306321
t: t,

0 commit comments

Comments
 (0)