-
Notifications
You must be signed in to change notification settings - Fork 928
feat: Add workspace agent lifecycle state reporting #5785
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
40e9c87
5abf555
a84e467
8119e71
515e342
1e53635
534f954
90192e5
aaf19bc
bd1a87f
df9944e
60f414f
087070e
c6928e9
c132d4e
903e850
034a850
f76247f
2ccdcac
830f9f1
2b1569f
71706e0
21a9d28
da67c73
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -71,6 +71,7 @@ type Client interface { | |
WorkspaceAgentMetadata(ctx context.Context) (codersdk.WorkspaceAgentMetadata, error) | ||
ListenWorkspaceAgent(ctx context.Context) (net.Conn, error) | ||
AgentReportStats(ctx context.Context, log slog.Logger, stats func() *codersdk.AgentStats) (io.Closer, error) | ||
PostWorkspaceAgentLifecycle(ctx context.Context, state codersdk.PostWorkspaceAgentLifecycleRequest) error | ||
PostWorkspaceAgentAppHealth(ctx context.Context, req codersdk.PostWorkspaceAppHealthsRequest) error | ||
PostWorkspaceAgentVersion(ctx context.Context, version string) error | ||
} | ||
|
@@ -101,6 +102,7 @@ func New(options Options) io.Closer { | |
exchangeToken: options.ExchangeToken, | ||
filesystem: options.Filesystem, | ||
tempDir: options.TempDir, | ||
lifecycleUpdate: make(chan struct{}, 1), | ||
} | ||
a.init(ctx) | ||
return a | ||
|
@@ -127,6 +129,10 @@ type agent struct { | |
sessionToken atomic.Pointer[string] | ||
sshServer *ssh.Server | ||
|
||
lifecycleUpdate chan struct{} | ||
lifecycleMu sync.Mutex // Protects following. | ||
lifecycleState codersdk.WorkspaceAgentLifecycle | ||
|
||
network *tailnet.Conn | ||
} | ||
|
||
|
@@ -135,6 +141,8 @@ type agent struct { | |
// may be happening, but regardless after the intermittent | ||
// failure, you'll want the agent to reconnect. | ||
func (a *agent) runLoop(ctx context.Context) { | ||
go a.reportLifecycleLoop(ctx) | ||
|
||
for retrier := retry.New(100*time.Millisecond, 10*time.Second); retrier.Wait(ctx); { | ||
a.logger.Info(ctx, "running loop") | ||
err := a.run(ctx) | ||
|
@@ -156,6 +164,58 @@ func (a *agent) runLoop(ctx context.Context) { | |
} | ||
} | ||
|
||
// reportLifecycleLoop reports the current lifecycle state once. | ||
// Only the latest state is reported, intermediate states may be | ||
// lost if the agent can't communicate with the API. | ||
func (a *agent) reportLifecycleLoop(ctx context.Context) { | ||
var lastReported codersdk.WorkspaceAgentLifecycle | ||
for { | ||
select { | ||
case <-a.lifecycleUpdate: | ||
case <-ctx.Done(): | ||
return | ||
} | ||
|
||
for r := retry.New(time.Second, 15*time.Second); r.Wait(ctx); { | ||
a.lifecycleMu.Lock() | ||
state := a.lifecycleState | ||
a.lifecycleMu.Unlock() | ||
|
||
if state == lastReported { | ||
break | ||
} | ||
|
||
a.logger.Debug(ctx, "post lifecycle state", slog.F("state", state)) | ||
|
||
err := a.client.PostWorkspaceAgentLifecycle(ctx, codersdk.PostWorkspaceAgentLifecycleRequest{ | ||
State: state, | ||
}) | ||
if err == nil { | ||
lastReported = state | ||
break | ||
} | ||
if xerrors.Is(err, context.Canceled) || xerrors.Is(err, context.DeadlineExceeded) { | ||
return | ||
} | ||
// If we fail to report the state we probably shouldn't exit, log only. | ||
a.logger.Error(ctx, "post state", slog.Error(err)) | ||
} | ||
} | ||
} | ||
|
||
func (a *agent) setLifecycle(ctx context.Context, state codersdk.WorkspaceAgentLifecycle) { | ||
a.lifecycleMu.Lock() | ||
defer a.lifecycleMu.Unlock() | ||
|
||
a.logger.Debug(ctx, "set lifecycle state", slog.F("state", state), slog.F("previous", a.lifecycleState)) | ||
|
||
a.lifecycleState = state | ||
select { | ||
case a.lifecycleUpdate <- struct{}{}: | ||
default: | ||
mafredri marked this conversation as resolved.
Show resolved
Hide resolved
|
||
} | ||
} | ||
|
||
func (a *agent) run(ctx context.Context) error { | ||
// This allows the agent to refresh it's token if necessary. | ||
// For instance identity this is required, since the instance | ||
|
@@ -180,22 +240,60 @@ func (a *agent) run(ctx context.Context) error { | |
|
||
// The startup script should only execute on the first run! | ||
if oldMetadata == nil { | ||
a.setLifecycle(ctx, codersdk.WorkspaceAgentLifecycleStarting) | ||
|
||
// Perform overrides early so that Git auth can work even if users | ||
// connect to a workspace that is not yet ready. We don't run this | ||
// concurrently with the startup script to avoid conflicts between | ||
// them. | ||
if metadata.GitAuthConfigs > 0 { | ||
// If this fails, we should consider surfacing the error in the | ||
// startup log and setting the lifecycle state to be "start_error" | ||
// (after startup script completion), but for now we'll just log it. | ||
err := gitauth.OverrideVSCodeConfigs(a.filesystem) | ||
if err != nil { | ||
a.logger.Warn(ctx, "failed to override vscode git auth configs", slog.Error(err)) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: should the agent quit here? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We probably shouldn't, this could fail for a multitude of reasons, like a user setting
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Yes, that's the situation I was considering. What can user do in this case? Restart the workspace until it works? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If the change was done before, failure here doesn't really matter either (btw). Restarting will most likely not help, unless it's a problem mounting FS or similar. They'll most likely need to resolve the issue in the workspace, then restart, or create a new one. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. For now, I've only added a note. I'll revisit this behavior when I add startup log streaming 👍🏻. |
||
} | ||
} | ||
|
||
scriptDone := make(chan error, 1) | ||
scriptStart := time.Now() | ||
go func() { | ||
defer close(scriptDone) | ||
scriptDone <- a.runStartupScript(ctx, metadata.StartupScript) | ||
}() | ||
go func() { | ||
err := a.runStartupScript(ctx, metadata.StartupScript) | ||
var timeout <-chan time.Time | ||
// If timeout is zero, an older version of the coder | ||
// provider was used. Otherwise a timeout is always > 0. | ||
if metadata.StartupScriptTimeout > 0 { | ||
t := time.NewTimer(metadata.StartupScriptTimeout) | ||
defer t.Stop() | ||
timeout = t.C | ||
} | ||
|
||
var err error | ||
select { | ||
case err = <-scriptDone: | ||
case <-timeout: | ||
a.logger.Warn(ctx, "startup script timed out") | ||
a.setLifecycle(ctx, codersdk.WorkspaceAgentLifecycleStartTimeout) | ||
err = <-scriptDone // The script can still complete after a timeout. | ||
} | ||
if errors.Is(err, context.Canceled) { | ||
return | ||
} | ||
execTime := time.Since(scriptStart) | ||
lifecycleStatus := codersdk.WorkspaceAgentLifecycleReady | ||
if err != nil { | ||
a.logger.Warn(ctx, "agent script failed", slog.Error(err)) | ||
a.logger.Warn(ctx, "startup script failed", slog.F("execution_time", execTime), slog.Error(err)) | ||
lifecycleStatus = codersdk.WorkspaceAgentLifecycleStartError | ||
} else { | ||
a.logger.Info(ctx, "startup script completed", slog.F("execution_time", execTime)) | ||
} | ||
}() | ||
} | ||
|
||
if metadata.GitAuthConfigs > 0 { | ||
err = gitauth.OverrideVSCodeConfigs(a.filesystem) | ||
if err != nil { | ||
return xerrors.Errorf("override vscode configuration for git auth: %w", err) | ||
} | ||
a.setLifecycle(ctx, lifecycleStatus) | ||
}() | ||
} | ||
|
||
// This automatically closes when the context ends! | ||
|
Uh oh!
There was an error while loading. Please reload this page.