Skip to content

feat: add agent timings #14713

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 57 commits into from
Sep 24, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
57 commits
Select commit Hold shift + click to select a range
bcfacf7
feat: begin impl of agent script timings
DanielleMaywood Sep 17, 2024
d55b2e6
feat: add job_id and display_name to script timings
DanielleMaywood Sep 17, 2024
05d8a8b
fix: increment migration number
DanielleMaywood Sep 17, 2024
d54f6c3
fix: rename migrations from 251 to 254
DanielleMaywood Sep 18, 2024
d573c67
test: get tests compiling
DanielleMaywood Sep 18, 2024
09a1a42
fix: appease the linter
DanielleMaywood Sep 18, 2024
6d829f3
fix: get tests passing again
DanielleMaywood Sep 18, 2024
4df7831
fix: drop column from correct table
DanielleMaywood Sep 18, 2024
2565f0a
test: add fixture for agent script timings
DanielleMaywood Sep 18, 2024
669b837
fix: typo
DanielleMaywood Sep 18, 2024
df51673
fix: use job id used in provisioner job timings
DanielleMaywood Sep 18, 2024
c6d71d1
fix: increment migration number
DanielleMaywood Sep 18, 2024
a306e48
test: behaviour of script runner
DanielleMaywood Sep 18, 2024
4b0056c
test: rewrite test
DanielleMaywood Sep 18, 2024
14b68a1
test: does exit 1 script break things?
DanielleMaywood Sep 18, 2024
b22cb57
test: rewrite test again
DanielleMaywood Sep 18, 2024
ef5e2fe
fix: revert change
DanielleMaywood Sep 19, 2024
a5b412b
fix: let code breathe
DanielleMaywood Sep 19, 2024
57ebf30
fix: wrap errors
DanielleMaywood Sep 19, 2024
2a49f67
fix: justify nolint
DanielleMaywood Sep 19, 2024
0698584
fix: swap require.Equal argument order
DanielleMaywood Sep 19, 2024
3580069
fix: add mutex operations
DanielleMaywood Sep 19, 2024
9cb1252
feat: add 'ran_on_start' and 'blocked_login' fields
DanielleMaywood Sep 19, 2024
0b0d1ef
fix: update testdata fixture
DanielleMaywood Sep 19, 2024
eb857c1
fix: refer to agent_id instead of job_id in timings
DanielleMaywood Sep 19, 2024
586d88f
fix: JobID -> AgentID in dbauthz_test
DanielleMaywood Sep 19, 2024
b6289bb
fix: add 'id' to scripts, make timing refer to script id
DanielleMaywood Sep 19, 2024
863c3dc
fix: fix broken tests and convert bug
DanielleMaywood Sep 19, 2024
b8d5d1d
fix: update testdata fixtures
DanielleMaywood Sep 19, 2024
61c26ea
fix: update testdata fixtures again
DanielleMaywood Sep 19, 2024
7f8b6f9
feat: capture stage and if script timed out
DanielleMaywood Sep 20, 2024
c20ac32
fix: update migration number
DanielleMaywood Sep 20, 2024
c652133
test: add test for script api
DanielleMaywood Sep 20, 2024
1199b64
fix: fake db query
DanielleMaywood Sep 20, 2024
afa61eb
fix: use UTC time
DanielleMaywood Sep 20, 2024
8d325e2
fix: ensure r.scriptComplete is not nil
DanielleMaywood Sep 20, 2024
aeee582
fix: move err check to right after call
DanielleMaywood Sep 20, 2024
424069c
fix: uppercase sql
DanielleMaywood Sep 20, 2024
9b43a94
fix: use dbtime.Now()
DanielleMaywood Sep 20, 2024
ed31199
fix: debug log on r.scriptCompleted being nil
DanielleMaywood Sep 20, 2024
8814711
fix: ensure correct rbac permissions
DanielleMaywood Sep 20, 2024
0414623
chore: remove DisplayName
DanielleMaywood Sep 20, 2024
a8192a5
Merge branch 'main' into dm-add-agent-timings
DanielleMaywood Sep 20, 2024
08a466a
fix: get tests passing
DanielleMaywood Sep 20, 2024
8e7c757
fix: remove space in sql up
DanielleMaywood Sep 20, 2024
621071e
docs: document ExecuteOption
DanielleMaywood Sep 20, 2024
3b4df92
fix: drop 'RETURNING' from sql
DanielleMaywood Sep 20, 2024
44127b8
chore: remove 'display_name' from timing table
DanielleMaywood Sep 20, 2024
5846216
fix: testdata fixture
DanielleMaywood Sep 20, 2024
c43a143
fix: put r.scriptCompleted call in goroutine
DanielleMaywood Sep 23, 2024
13889bf
fix: track goroutine for test + use separate context for reporting
DanielleMaywood Sep 23, 2024
bf32b89
fix: appease linter, handle trackCommandGoroutine error
DanielleMaywood Sep 23, 2024
7fe6d8c
fix: resolve race condition
DanielleMaywood Sep 23, 2024
d7e86c6
feat: replace timed_out column with status column
DanielleMaywood Sep 23, 2024
6e338f2
test: update testdata fixture
DanielleMaywood Sep 23, 2024
79a620b
fix: apply suggestions from review
DanielleMaywood Sep 24, 2024
180307f
revert: linter changes
DanielleMaywood Sep 24, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 3 additions & 7 deletions agent/agent.go
Original file line number Diff line number Diff line change
Expand Up @@ -941,17 +941,15 @@ func (a *agent) handleManifest(manifestOK *checkpoint) func(ctx context.Context,
}
}

err = a.scriptRunner.Init(manifest.Scripts)
err = a.scriptRunner.Init(manifest.Scripts, aAPI.ScriptCompleted)
if err != nil {
return xerrors.Errorf("init script runner: %w", err)
}
err = a.trackGoroutine(func() {
start := time.Now()
// here we use the graceful context because the script runner is not directly tied
// to the agent API.
err := a.scriptRunner.Execute(a.gracefulCtx, func(script codersdk.WorkspaceAgentScript) bool {
return script.RunOnStart
})
err := a.scriptRunner.Execute(a.gracefulCtx, agentscripts.ExecuteStartScripts)
// Measure the time immediately after the script has finished
dur := time.Since(start).Seconds()
if err != nil {
Expand Down Expand Up @@ -1844,9 +1842,7 @@ func (a *agent) Close() error {
a.gracefulCancel()

lifecycleState := codersdk.WorkspaceAgentLifecycleOff
err = a.scriptRunner.Execute(a.hardCtx, func(script codersdk.WorkspaceAgentScript) bool {
return script.RunOnStop
})
err = a.scriptRunner.Execute(a.hardCtx, agentscripts.ExecuteStopScripts)
if err != nil {
a.logger.Warn(a.hardCtx, "shutdown script(s) failed", slog.Error(err))
if errors.Is(err, agentscripts.ErrTimeout) {
Expand Down
2 changes: 2 additions & 0 deletions agent/agent_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1517,10 +1517,12 @@ func TestAgent_Lifecycle(t *testing.T) {
agentsdk.Manifest{
DERPMap: derpMap,
Scripts: []codersdk.WorkspaceAgentScript{{
ID: uuid.New(),
LogPath: "coder-startup-script.log",
Script: "echo 1",
RunOnStart: true,
}, {
ID: uuid.New(),
LogPath: "coder-shutdown-script.log",
Script: "echo " + expected,
RunOnStop: true,
Expand Down
122 changes: 97 additions & 25 deletions agent/agentscripts/agentscripts.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,13 @@ import (
"github.com/spf13/afero"
"golang.org/x/sync/errgroup"
"golang.org/x/xerrors"
"google.golang.org/protobuf/types/known/timestamppb"

"cdr.dev/slog"

"github.com/coder/coder/v2/agent/agentssh"
"github.com/coder/coder/v2/agent/proto"
"github.com/coder/coder/v2/coderd/database/dbtime"
"github.com/coder/coder/v2/codersdk"
"github.com/coder/coder/v2/codersdk/agentsdk"
)
Expand Down Expand Up @@ -75,18 +78,21 @@ func New(opts Options) *Runner {
}
}

type ScriptCompletedFunc func(context.Context, *proto.WorkspaceAgentScriptCompletedRequest) (*proto.WorkspaceAgentScriptCompletedResponse, error)

type Runner struct {
Options

cronCtx context.Context
cronCtxCancel context.CancelFunc
cmdCloseWait sync.WaitGroup
closed chan struct{}
closeMutex sync.Mutex
cron *cron.Cron
initialized atomic.Bool
scripts []codersdk.WorkspaceAgentScript
dataDir string
cronCtx context.Context
cronCtxCancel context.CancelFunc
cmdCloseWait sync.WaitGroup
closed chan struct{}
closeMutex sync.Mutex
cron *cron.Cron
initialized atomic.Bool
scripts []codersdk.WorkspaceAgentScript
dataDir string
scriptCompleted ScriptCompletedFunc

// scriptsExecuted includes all scripts executed by the workspace agent. Agents
// execute startup scripts, and scripts on a cron schedule. Both will increment
Expand Down Expand Up @@ -116,12 +122,13 @@ func (r *Runner) RegisterMetrics(reg prometheus.Registerer) {
// Init initializes the runner with the provided scripts.
// It also schedules any scripts that have a schedule.
// This function must be called before Execute.
func (r *Runner) Init(scripts []codersdk.WorkspaceAgentScript) error {
func (r *Runner) Init(scripts []codersdk.WorkspaceAgentScript, scriptCompleted ScriptCompletedFunc) error {
if r.initialized.Load() {
return xerrors.New("init: already initialized")
}
r.initialized.Store(true)
r.scripts = scripts
r.scriptCompleted = scriptCompleted
r.Logger.Info(r.cronCtx, "initializing agent scripts", slog.F("script_count", len(scripts)), slog.F("log_dir", r.LogDir))

err := r.Filesystem.MkdirAll(r.ScriptBinDir(), 0o700)
Expand All @@ -135,7 +142,7 @@ func (r *Runner) Init(scripts []codersdk.WorkspaceAgentScript) error {
}
script := script
_, err := r.cron.AddFunc(script.Cron, func() {
err := r.trackRun(r.cronCtx, script)
err := r.trackRun(r.cronCtx, script, ExecuteCronScripts)
if err != nil {
r.Logger.Warn(context.Background(), "run agent script on schedule", slog.Error(err))
}
Expand Down Expand Up @@ -172,22 +179,33 @@ func (r *Runner) StartCron() {
}
}

// ExecuteOption describes what scripts we want to execute.
type ExecuteOption int

// ExecuteOption enums.
const (
ExecuteAllScripts ExecuteOption = iota
ExecuteStartScripts
ExecuteStopScripts
ExecuteCronScripts
)

// Execute runs a set of scripts according to a filter.
func (r *Runner) Execute(ctx context.Context, filter func(script codersdk.WorkspaceAgentScript) bool) error {
if filter == nil {
// Execute em' all!
filter = func(script codersdk.WorkspaceAgentScript) bool {
return true
}
}
func (r *Runner) Execute(ctx context.Context, option ExecuteOption) error {
var eg errgroup.Group
for _, script := range r.scripts {
if !filter(script) {
runScript := (option == ExecuteStartScripts && script.RunOnStart) ||
(option == ExecuteStopScripts && script.RunOnStop) ||
(option == ExecuteCronScripts && script.Cron != "") ||
option == ExecuteAllScripts

if !runScript {
continue
}

script := script
eg.Go(func() error {
err := r.trackRun(ctx, script)
err := r.trackRun(ctx, script, option)
if err != nil {
return xerrors.Errorf("run agent script %q: %w", script.LogSourceID, err)
}
Expand All @@ -198,8 +216,8 @@ func (r *Runner) Execute(ctx context.Context, filter func(script codersdk.Worksp
}

// trackRun wraps "run" with metrics.
func (r *Runner) trackRun(ctx context.Context, script codersdk.WorkspaceAgentScript) error {
err := r.run(ctx, script)
func (r *Runner) trackRun(ctx context.Context, script codersdk.WorkspaceAgentScript, option ExecuteOption) error {
err := r.run(ctx, script, option)
if err != nil {
r.scriptsExecuted.WithLabelValues("false").Add(1)
} else {
Expand All @@ -212,7 +230,7 @@ func (r *Runner) trackRun(ctx context.Context, script codersdk.WorkspaceAgentScr
// If the timeout is exceeded, the process is sent an interrupt signal.
// If the process does not exit after a few seconds, it is forcefully killed.
// This function immediately returns after a timeout, and does not wait for the process to exit.
func (r *Runner) run(ctx context.Context, script codersdk.WorkspaceAgentScript) error {
func (r *Runner) run(ctx context.Context, script codersdk.WorkspaceAgentScript, option ExecuteOption) error {
logPath := script.LogPath
if logPath == "" {
logPath = fmt.Sprintf("coder-script-%s.log", script.LogSourceID)
Expand Down Expand Up @@ -299,9 +317,9 @@ func (r *Runner) run(ctx context.Context, script codersdk.WorkspaceAgentScript)
cmd.Stdout = io.MultiWriter(fileWriter, infoW)
cmd.Stderr = io.MultiWriter(fileWriter, errW)

start := time.Now()
start := dbtime.Now()
defer func() {
end := time.Now()
end := dbtime.Now()
execTime := end.Sub(start)
exitCode := 0
if err != nil {
Expand All @@ -314,6 +332,60 @@ func (r *Runner) run(ctx context.Context, script codersdk.WorkspaceAgentScript)
} else {
logger.Info(ctx, fmt.Sprintf("%s script completed", logPath), slog.F("execution_time", execTime), slog.F("exit_code", exitCode))
}

if r.scriptCompleted == nil {
logger.Debug(ctx, "r.scriptCompleted unexpectedly nil")
return
}

// We want to check this outside of the goroutine to avoid a race condition
timedOut := errors.Is(err, ErrTimeout)
pipesLeftOpen := errors.Is(err, ErrOutputPipesOpen)

err = r.trackCommandGoroutine(func() {
var stage proto.Timing_Stage
switch option {
case ExecuteStartScripts:
stage = proto.Timing_START
case ExecuteStopScripts:
stage = proto.Timing_STOP
case ExecuteCronScripts:
stage = proto.Timing_CRON
}

var status proto.Timing_Status
switch {
case timedOut:
status = proto.Timing_TIMED_OUT
case pipesLeftOpen:
status = proto.Timing_PIPES_LEFT_OPEN
case exitCode != 0:
status = proto.Timing_EXIT_FAILURE
default:
status = proto.Timing_OK
}

reportTimeout := 30 * time.Second
reportCtx, cancel := context.WithTimeout(context.Background(), reportTimeout)
defer cancel()

_, err := r.scriptCompleted(reportCtx, &proto.WorkspaceAgentScriptCompletedRequest{
Timing: &proto.Timing{
ScriptId: script.ID[:],
Start: timestamppb.New(start),
End: timestamppb.New(end),
ExitCode: int32(exitCode),
Stage: stage,
Status: status,
},
})
if err != nil {
logger.Error(ctx, fmt.Sprintf("reporting script completed: %s", err.Error()))
}
})
if err != nil {
logger.Error(ctx, fmt.Sprintf("reporting script completed: track command goroutine: %s", err.Error()))
}
}()

err = cmd.Start()
Expand Down
50 changes: 40 additions & 10 deletions agent/agentscripts/agentscripts_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ import (
"cdr.dev/slog/sloggers/slogtest"
"github.com/coder/coder/v2/agent/agentscripts"
"github.com/coder/coder/v2/agent/agentssh"
"github.com/coder/coder/v2/agent/agenttest"
"github.com/coder/coder/v2/codersdk"
"github.com/coder/coder/v2/codersdk/agentsdk"
"github.com/coder/coder/v2/testutil"
Expand All @@ -34,14 +35,13 @@ func TestExecuteBasic(t *testing.T) {
return fLogger
})
defer runner.Close()
aAPI := agenttest.NewFakeAgentAPI(t, slogtest.Make(t, nil), nil, nil)
err := runner.Init([]codersdk.WorkspaceAgentScript{{
LogSourceID: uuid.New(),
Script: "echo hello",
}})
}}, aAPI.ScriptCompleted)
require.NoError(t, err)
require.NoError(t, runner.Execute(context.Background(), func(script codersdk.WorkspaceAgentScript) bool {
return true
}))
require.NoError(t, runner.Execute(context.Background(), agentscripts.ExecuteAllScripts))
log := testutil.RequireRecvCtx(ctx, t, fLogger.logs)
require.Equal(t, "hello", log.Output)
}
Expand All @@ -61,18 +61,17 @@ func TestEnv(t *testing.T) {
cmd.exe /c echo %CODER_SCRIPT_BIN_DIR%
`
}
aAPI := agenttest.NewFakeAgentAPI(t, slogtest.Make(t, nil), nil, nil)
err := runner.Init([]codersdk.WorkspaceAgentScript{{
LogSourceID: id,
Script: script,
}})
}}, aAPI.ScriptCompleted)
require.NoError(t, err)

ctx := testutil.Context(t, testutil.WaitLong)

done := testutil.Go(t, func() {
err := runner.Execute(ctx, func(script codersdk.WorkspaceAgentScript) bool {
return true
})
err := runner.Execute(ctx, agentscripts.ExecuteAllScripts)
assert.NoError(t, err)
})
defer func() {
Expand Down Expand Up @@ -103,13 +102,44 @@ func TestTimeout(t *testing.T) {
t.Parallel()
runner := setup(t, nil)
defer runner.Close()
aAPI := agenttest.NewFakeAgentAPI(t, slogtest.Make(t, nil), nil, nil)
err := runner.Init([]codersdk.WorkspaceAgentScript{{
LogSourceID: uuid.New(),
Script: "sleep infinity",
Timeout: time.Millisecond,
}})
}}, aAPI.ScriptCompleted)
require.NoError(t, err)
require.ErrorIs(t, runner.Execute(context.Background(), nil), agentscripts.ErrTimeout)
require.ErrorIs(t, runner.Execute(context.Background(), agentscripts.ExecuteAllScripts), agentscripts.ErrTimeout)
}

func TestScriptReportsTiming(t *testing.T) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we need to expand this a bit to cover the timeout and non-zero exit status use-cases.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, definitely a good idea.

I had tried a non-zero exit code but it appeared to cause the test to fail so I'll investigate that again to make sure it wasn't me doing something wrong.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Potentially a follow-up PR?

t.Parallel()

ctx := testutil.Context(t, testutil.WaitShort)
fLogger := newFakeScriptLogger()
runner := setup(t, func(uuid2 uuid.UUID) agentscripts.ScriptLogger {
return fLogger
})

aAPI := agenttest.NewFakeAgentAPI(t, slogtest.Make(t, nil), nil, nil)
err := runner.Init([]codersdk.WorkspaceAgentScript{{
DisplayName: "say-hello",
LogSourceID: uuid.New(),
Script: "echo hello",
}}, aAPI.ScriptCompleted)
require.NoError(t, err)
require.NoError(t, runner.Execute(ctx, agentscripts.ExecuteAllScripts))
runner.Close()

log := testutil.RequireRecvCtx(ctx, t, fLogger.logs)
require.Equal(t, "hello", log.Output)

timings := aAPI.GetTimings()
require.Equal(t, 1, len(timings))

timing := timings[0]
require.Equal(t, int32(0), timing.ExitCode)
require.GreaterOrEqual(t, timing.End.AsTime(), timing.Start.AsTime())
}

// TestCronClose exists because cron.Run() can happen after cron.Close().
Expand Down
15 changes: 15 additions & 0 deletions agent/agenttest/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,7 @@ type FakeAgentAPI struct {
logsCh chan<- *agentproto.BatchCreateLogsRequest
lifecycleStates []codersdk.WorkspaceAgentLifecycle
metadata map[string]agentsdk.Metadata
timings []*agentproto.Timing

getAnnouncementBannersFunc func() ([]codersdk.BannerConfig, error)
}
Expand All @@ -182,6 +183,12 @@ func (*FakeAgentAPI) GetServiceBanner(context.Context, *agentproto.GetServiceBan
return &agentproto.ServiceBanner{}, nil
}

func (f *FakeAgentAPI) GetTimings() []*agentproto.Timing {
f.Lock()
defer f.Unlock()
return slices.Clone(f.timings)
}

func (f *FakeAgentAPI) SetAnnouncementBannersFunc(fn func() ([]codersdk.BannerConfig, error)) {
f.Lock()
defer f.Unlock()
Expand Down Expand Up @@ -301,6 +308,14 @@ func (f *FakeAgentAPI) BatchCreateLogs(ctx context.Context, req *agentproto.Batc
return &agentproto.BatchCreateLogsResponse{}, nil
}

func (f *FakeAgentAPI) ScriptCompleted(_ context.Context, req *agentproto.WorkspaceAgentScriptCompletedRequest) (*agentproto.WorkspaceAgentScriptCompletedResponse, error) {
f.Lock()
f.timings = append(f.timings, req.Timing)
f.Unlock()

return &agentproto.WorkspaceAgentScriptCompletedResponse{}, nil
}

func NewFakeAgentAPI(t testing.TB, logger slog.Logger, manifest *agentproto.Manifest, statsCh chan *agentproto.Stats) *FakeAgentAPI {
return &FakeAgentAPI{
t: t,
Expand Down
Loading
Loading