Skip to content

Commit 64a1f64

Browse files
committed
Merge branch 'main' into mergegen
2 parents 96cee2b + adf14f1 commit 64a1f64

File tree

140 files changed

+4158
-1072
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

140 files changed

+4158
-1072
lines changed

Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -486,7 +486,7 @@ coderd/database/dump.sql: coderd/database/gen/dump/main.go $(wildcard coderd/dat
486486
go run ./coderd/database/gen/dump/main.go
487487

488488
# Generates Go code for querying the database.
489-
coderd/database/querier.go: coderd/database/sqlc.yaml coderd/database/dump.sql $(wildcard coderd/database/queries/*.sql) coderd/database/gen/enum/main.go
489+
coderd/database/querier.go: coderd/database/sqlc.yaml coderd/database/dump.sql $(wildcard coderd/database/queries/*.sql)
490490
./coderd/database/generate.sh
491491

492492

agent/agent.go

Lines changed: 85 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ import (
1414
"net/http"
1515
"net/netip"
1616
"os"
17+
"os/exec"
1718
"os/user"
1819
"path/filepath"
1920
"sort"
@@ -126,6 +127,7 @@ func New(options Options) Agent {
126127
tempDir: options.TempDir,
127128
lifecycleUpdate: make(chan struct{}, 1),
128129
lifecycleReported: make(chan codersdk.WorkspaceAgentLifecycle, 1),
130+
lifecycleStates: []agentsdk.PostLifecycleRequest{{State: codersdk.WorkspaceAgentLifecycleCreated}},
129131
ignorePorts: options.IgnorePorts,
130132
connStatsChan: make(chan *agentsdk.Stats, 1),
131133
sshMaxTimeout: options.SSHMaxTimeout,
@@ -170,7 +172,7 @@ type agent struct {
170172
lifecycleUpdate chan struct{}
171173
lifecycleReported chan codersdk.WorkspaceAgentLifecycle
172174
lifecycleMu sync.RWMutex // Protects following.
173-
lifecycleState codersdk.WorkspaceAgentLifecycle
175+
lifecycleStates []agentsdk.PostLifecycleRequest
174176

175177
network *tailnet.Conn
176178
connStatsChan chan *agentsdk.Stats
@@ -414,11 +416,10 @@ func (a *agent) reportMetadataLoop(ctx context.Context) {
414416
}
415417
}
416418

417-
// reportLifecycleLoop reports the current lifecycle state once.
418-
// Only the latest state is reported, intermediate states may be
419-
// lost if the agent can't communicate with the API.
419+
// reportLifecycleLoop reports the current lifecycle state once. All state
420+
// changes are reported in order.
420421
func (a *agent) reportLifecycleLoop(ctx context.Context) {
421-
var lastReported codersdk.WorkspaceAgentLifecycle
422+
lastReportedIndex := 0 // Start off with the created state without reporting it.
422423
for {
423424
select {
424425
case <-a.lifecycleUpdate:
@@ -428,24 +429,32 @@ func (a *agent) reportLifecycleLoop(ctx context.Context) {
428429

429430
for r := retry.New(time.Second, 15*time.Second); r.Wait(ctx); {
430431
a.lifecycleMu.RLock()
431-
state := a.lifecycleState
432+
lastIndex := len(a.lifecycleStates) - 1
433+
report := a.lifecycleStates[lastReportedIndex]
434+
if len(a.lifecycleStates) > lastReportedIndex+1 {
435+
report = a.lifecycleStates[lastReportedIndex+1]
436+
}
432437
a.lifecycleMu.RUnlock()
433438

434-
if state == lastReported {
439+
if lastIndex == lastReportedIndex {
435440
break
436441
}
437442

438-
a.logger.Debug(ctx, "reporting lifecycle state", slog.F("state", state))
443+
a.logger.Debug(ctx, "reporting lifecycle state", slog.F("payload", report))
439444

440-
err := a.client.PostLifecycle(ctx, agentsdk.PostLifecycleRequest{
441-
State: state,
442-
})
445+
err := a.client.PostLifecycle(ctx, report)
443446
if err == nil {
444-
lastReported = state
447+
lastReportedIndex++
445448
select {
446-
case a.lifecycleReported <- state:
449+
case a.lifecycleReported <- report.State:
447450
case <-a.lifecycleReported:
448-
a.lifecycleReported <- state
451+
a.lifecycleReported <- report.State
452+
}
453+
if lastReportedIndex < lastIndex {
454+
// Keep reporting until we've sent all messages, we can't
455+
// rely on the channel triggering us before the backlog is
456+
// consumed.
457+
continue
449458
}
450459
break
451460
}
@@ -461,15 +470,20 @@ func (a *agent) reportLifecycleLoop(ctx context.Context) {
461470
// setLifecycle sets the lifecycle state and notifies the lifecycle loop.
462471
// The state is only updated if it's a valid state transition.
463472
func (a *agent) setLifecycle(ctx context.Context, state codersdk.WorkspaceAgentLifecycle) {
473+
report := agentsdk.PostLifecycleRequest{
474+
State: state,
475+
ChangedAt: database.Now(),
476+
}
477+
464478
a.lifecycleMu.Lock()
465-
lastState := a.lifecycleState
466-
if slices.Index(codersdk.WorkspaceAgentLifecycleOrder, lastState) > slices.Index(codersdk.WorkspaceAgentLifecycleOrder, state) {
467-
a.logger.Warn(ctx, "attempted to set lifecycle state to a previous state", slog.F("last", lastState), slog.F("state", state))
479+
lastReport := a.lifecycleStates[len(a.lifecycleStates)-1]
480+
if slices.Index(codersdk.WorkspaceAgentLifecycleOrder, lastReport.State) >= slices.Index(codersdk.WorkspaceAgentLifecycleOrder, report.State) {
481+
a.logger.Warn(ctx, "attempted to set lifecycle state to a previous state", slog.F("last", lastReport), slog.F("current", report))
468482
a.lifecycleMu.Unlock()
469483
return
470484
}
471-
a.lifecycleState = state
472-
a.logger.Debug(ctx, "set lifecycle state", slog.F("state", state), slog.F("last", lastState))
485+
a.lifecycleStates = append(a.lifecycleStates, report)
486+
a.logger.Debug(ctx, "set lifecycle state", slog.F("current", report), slog.F("last", lastReport))
473487
a.lifecycleMu.Unlock()
474488

475489
select {
@@ -534,7 +548,6 @@ func (a *agent) run(ctx context.Context) error {
534548

535549
lifecycleState := codersdk.WorkspaceAgentLifecycleReady
536550
scriptDone := make(chan error, 1)
537-
scriptStart := time.Now()
538551
err = a.trackConnGoroutine(func() {
539552
defer close(scriptDone)
540553
scriptDone <- a.runStartupScript(ctx, manifest.StartupScript)
@@ -556,22 +569,15 @@ func (a *agent) run(ctx context.Context) error {
556569
select {
557570
case err = <-scriptDone:
558571
case <-timeout:
559-
a.logger.Warn(ctx, "startup script timed out")
572+
a.logger.Warn(ctx, "script timed out", slog.F("lifecycle", "startup"), slog.F("timeout", manifest.ShutdownScriptTimeout))
560573
a.setLifecycle(ctx, codersdk.WorkspaceAgentLifecycleStartTimeout)
561574
err = <-scriptDone // The script can still complete after a timeout.
562575
}
563-
if errors.Is(err, context.Canceled) {
564-
return
565-
}
566-
// Only log if there was a startup script.
567-
if manifest.StartupScript != "" {
568-
execTime := time.Since(scriptStart)
569-
if err != nil {
570-
a.logger.Warn(ctx, "startup script failed", slog.F("execution_time", execTime), slog.Error(err))
571-
lifecycleState = codersdk.WorkspaceAgentLifecycleStartError
572-
} else {
573-
a.logger.Info(ctx, "startup script completed", slog.F("execution_time", execTime))
576+
if err != nil {
577+
if errors.Is(err, context.Canceled) {
578+
return
574579
}
580+
lifecycleState = codersdk.WorkspaceAgentLifecycleStartError
575581
}
576582
a.setLifecycle(ctx, lifecycleState)
577583
}()
@@ -830,20 +836,31 @@ func (a *agent) runShutdownScript(ctx context.Context, script string) error {
830836
return a.runScript(ctx, "shutdown", script)
831837
}
832838

833-
func (a *agent) runScript(ctx context.Context, lifecycle, script string) error {
839+
func (a *agent) runScript(ctx context.Context, lifecycle, script string) (err error) {
834840
if script == "" {
835841
return nil
836842
}
837843

838-
a.logger.Info(ctx, "running script", slog.F("lifecycle", lifecycle), slog.F("script", script))
844+
logger := a.logger.With(slog.F("lifecycle", lifecycle))
845+
846+
logger.Info(ctx, fmt.Sprintf("running %s script", lifecycle), slog.F("script", script))
839847
fileWriter, err := a.filesystem.OpenFile(filepath.Join(a.logDir, fmt.Sprintf("coder-%s-script.log", lifecycle)), os.O_CREATE|os.O_RDWR, 0o600)
840848
if err != nil {
841849
return xerrors.Errorf("open %s script log file: %w", lifecycle, err)
842850
}
843851
defer func() {
844-
_ = fileWriter.Close()
852+
err := fileWriter.Close()
853+
if err != nil {
854+
logger.Warn(ctx, fmt.Sprintf("close %s script log file", lifecycle), slog.Error(err))
855+
}
845856
}()
846857

858+
cmdPty, err := a.sshServer.CreateCommand(ctx, script, nil)
859+
if err != nil {
860+
return xerrors.Errorf("%s script: create command: %w", lifecycle, err)
861+
}
862+
cmd := cmdPty.AsExec()
863+
847864
var writer io.Writer = fileWriter
848865
if lifecycle == "startup" {
849866
// Create pipes for startup logs reader and writer
@@ -854,29 +871,42 @@ func (a *agent) runScript(ctx context.Context, lifecycle, script string) error {
854871
writer = io.MultiWriter(fileWriter, logsWriter)
855872
flushedLogs, err := a.trackScriptLogs(ctx, logsReader)
856873
if err != nil {
857-
return xerrors.Errorf("track script logs: %w", err)
874+
return xerrors.Errorf("track %s script logs: %w", lifecycle, err)
858875
}
859876
defer func() {
860877
_ = logsWriter.Close()
861878
<-flushedLogs
862879
}()
863880
}
864881

865-
cmdPty, err := a.sshServer.CreateCommand(ctx, script, nil)
866-
if err != nil {
867-
return xerrors.Errorf("create command: %w", err)
868-
}
869-
cmd := cmdPty.AsExec()
870882
cmd.Stdout = writer
871883
cmd.Stderr = writer
884+
885+
start := time.Now()
886+
defer func() {
887+
end := time.Now()
888+
execTime := end.Sub(start)
889+
exitCode := 0
890+
if err != nil {
891+
exitCode = 255 // Unknown status.
892+
var exitError *exec.ExitError
893+
if xerrors.As(err, &exitError) {
894+
exitCode = exitError.ExitCode()
895+
}
896+
logger.Warn(ctx, fmt.Sprintf("%s script failed", lifecycle), slog.F("execution_time", execTime), slog.F("exit_code", exitCode), slog.Error(err))
897+
} else {
898+
logger.Info(ctx, fmt.Sprintf("%s script completed", lifecycle), slog.F("execution_time", execTime), slog.F("exit_code", exitCode))
899+
}
900+
}()
901+
872902
err = cmd.Run()
873903
if err != nil {
874904
// cmd.Run does not return a context canceled error, it returns "signal: killed".
875905
if ctx.Err() != nil {
876906
return ctx.Err()
877907
}
878908

879-
return xerrors.Errorf("run: %w", err)
909+
return xerrors.Errorf("%s script: run: %w", lifecycle, err)
880910
}
881911
return nil
882912
}
@@ -978,6 +1008,9 @@ func (a *agent) trackScriptLogs(ctx context.Context, reader io.Reader) (chan str
9781008
Output: scanner.Text(),
9791009
})
9801010
}
1011+
if err := scanner.Err(); err != nil {
1012+
a.logger.Error(ctx, "scan startup logs", slog.Error(err))
1013+
}
9811014
defer close(logsFinished)
9821015
logsFlushed.L.Lock()
9831016
for {
@@ -1004,7 +1037,7 @@ func (a *agent) handleReconnectingPTY(ctx context.Context, logger slog.Logger, m
10041037
defer a.connCountReconnectingPTY.Add(-1)
10051038

10061039
connectionID := uuid.NewString()
1007-
logger = logger.With(slog.F("id", msg.ID), slog.F("connection_id", connectionID))
1040+
logger = logger.With(slog.F("message_id", msg.ID), slog.F("connection_id", connectionID))
10081041
logger.Debug(ctx, "starting handler")
10091042

10101043
defer func() {
@@ -1016,9 +1049,9 @@ func (a *agent) handleReconnectingPTY(ctx context.Context, logger slog.Logger, m
10161049
// If the agent is closed, we don't want to
10171050
// log this as an error since it's expected.
10181051
if closed {
1019-
logger.Debug(ctx, "session error after agent close", slog.Error(err))
1052+
logger.Debug(ctx, "reconnecting PTY failed with session error (agent closed)", slog.Error(err))
10201053
} else {
1021-
logger.Error(ctx, "session error", slog.Error(err))
1054+
logger.Error(ctx, "reconnecting PTY failed with session error", slog.Error(err))
10221055
}
10231056
}
10241057
logger.Debug(ctx, "session closed")
@@ -1094,9 +1127,9 @@ func (a *agent) handleReconnectingPTY(ctx context.Context, logger slog.Logger, m
10941127
// When the PTY is closed, this is triggered.
10951128
// Error is typically a benign EOF, so only log for debugging.
10961129
if errors.Is(err, io.EOF) {
1097-
logger.Debug(ctx, "unable to read pty output, command exited?", slog.Error(err))
1130+
logger.Debug(ctx, "unable to read pty output, command might have exited", slog.Error(err))
10981131
} else {
1099-
logger.Warn(ctx, "unable to read pty output, command exited?", slog.Error(err))
1132+
logger.Warn(ctx, "unable to read pty output, command might have exited", slog.Error(err))
11001133
a.metrics.reconnectingPTYErrors.WithLabelValues("output_reader").Add(1)
11011134
}
11021135
break
@@ -1141,7 +1174,7 @@ func (a *agent) handleReconnectingPTY(ctx context.Context, logger slog.Logger, m
11411174
err := rpty.ptty.Resize(msg.Height, msg.Width)
11421175
if err != nil {
11431176
// We can continue after this, it's not fatal!
1144-
logger.Error(ctx, "resize", slog.Error(err))
1177+
logger.Error(ctx, "reconnecting PTY initial resize failed, but will continue", slog.Error(err))
11451178
a.metrics.reconnectingPTYErrors.WithLabelValues("resize").Add(1)
11461179
}
11471180
// Write any previously stored data for the TTY.
@@ -1200,12 +1233,12 @@ func (a *agent) handleReconnectingPTY(ctx context.Context, logger slog.Logger, m
12001233
return nil
12011234
}
12021235
if err != nil {
1203-
logger.Warn(ctx, "read conn", slog.Error(err))
1236+
logger.Warn(ctx, "reconnecting PTY failed with read error", slog.Error(err))
12041237
return nil
12051238
}
12061239
_, err = rpty.ptty.InputWriter().Write([]byte(req.Data))
12071240
if err != nil {
1208-
logger.Warn(ctx, "write to pty", slog.Error(err))
1241+
logger.Warn(ctx, "reconnecting PTY failed with write error", slog.Error(err))
12091242
a.metrics.reconnectingPTYErrors.WithLabelValues("input_writer").Add(1)
12101243
return nil
12111244
}
@@ -1216,7 +1249,7 @@ func (a *agent) handleReconnectingPTY(ctx context.Context, logger slog.Logger, m
12161249
err = rpty.ptty.Resize(req.Height, req.Width)
12171250
if err != nil {
12181251
// We can continue after this, it's not fatal!
1219-
logger.Error(ctx, "resize", slog.Error(err))
1252+
logger.Error(ctx, "reconnecting PTY resize failed, but will continue", slog.Error(err))
12201253
a.metrics.reconnectingPTYErrors.WithLabelValues("resize").Add(1)
12211254
}
12221255
}
@@ -1379,7 +1412,6 @@ func (a *agent) Close() error {
13791412
lifecycleState := codersdk.WorkspaceAgentLifecycleOff
13801413
if manifest := a.manifest.Load(); manifest != nil && manifest.ShutdownScript != "" {
13811414
scriptDone := make(chan error, 1)
1382-
scriptStart := time.Now()
13831415
go func() {
13841416
defer close(scriptDone)
13851417
scriptDone <- a.runShutdownScript(ctx, manifest.ShutdownScript)
@@ -1398,16 +1430,12 @@ func (a *agent) Close() error {
13981430
select {
13991431
case err = <-scriptDone:
14001432
case <-timeout:
1401-
a.logger.Warn(ctx, "shutdown script timed out")
1433+
a.logger.Warn(ctx, "script timed out", slog.F("lifecycle", "shutdown"), slog.F("timeout", manifest.ShutdownScriptTimeout))
14021434
a.setLifecycle(ctx, codersdk.WorkspaceAgentLifecycleShutdownTimeout)
14031435
err = <-scriptDone // The script can still complete after a timeout.
14041436
}
1405-
execTime := time.Since(scriptStart)
14061437
if err != nil {
1407-
a.logger.Warn(ctx, "shutdown script failed", slog.F("execution_time", execTime), slog.Error(err))
14081438
lifecycleState = codersdk.WorkspaceAgentLifecycleShutdownError
1409-
} else {
1410-
a.logger.Info(ctx, "shutdown script completed", slog.F("execution_time", execTime))
14111439
}
14121440
}
14131441

0 commit comments

Comments
 (0)