-
Notifications
You must be signed in to change notification settings - Fork 875
feat: reinitialize agents when a prebuilt workspace is claimed #17475
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 1 commit
c09c9b9
476fe71
8c8bca6
7ce4eea
52ac64e
362db7c
dcc7379
ff66b3f
efff5d9
cebd5db
2679138
9feebef
b117b5c
a22b414
9bbd2c7
5804201
7e8dcee
725f97b
a9b1567
21ee970
e54d7e7
2799858
1d93003
763fc12
0f879c7
61784c9
604eb27
bf4d2cf
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
- Loading branch information
There are no files selected for viewing
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -19,6 +19,7 @@ import ( | |
"tailscale.com/tailcfg" | ||
|
||
"cdr.dev/slog" | ||
"github.com/coder/retry" | ||
"github.com/coder/websocket" | ||
|
||
"github.com/coder/coder/v2/agent/proto" | ||
|
@@ -707,49 +708,84 @@ func PrebuildClaimedChannel(id uuid.UUID) string { | |
// - ping: ignored, keepalive | ||
// - prebuild claimed: a prebuilt workspace is claimed, so the agent must reinitialize. | ||
// NOTE: the caller is responsible for closing the events chan. | ||
func (c *Client) WaitForReinit(ctx context.Context, events chan<- ReinitializationEvent) error { | ||
func (c *Client) WaitForReinit(ctx context.Context) (*ReinitializationEvent, error) { | ||
// TODO: allow configuring httpclient | ||
c.SDK.HTTPClient.Timeout = time.Hour * 24 | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why is a 24 hour timeout desirable? A prebuilt workspace could remain up without a reinit for much longer than this, no? Also, this is a quiet side effect to the client: leaving it in a different state than it started in. |
||
|
||
// TODO (sasswart): tried the following to fix the above, it won't work. The shorter timeout wins. | ||
// I also considered cloning c.SDK.HTTPClient and setting the timeout on the cloned client. | ||
// That won't work because we can't pass the cloned HTTPClient into s.SDK.Request. | ||
// Looks like we're going to need a separate client to be able to have a longer timeout. | ||
// | ||
// timeoutCtx, cancelTimeoutCtx := context.WithTimeout(ctx, 24*time.Hour) | ||
// defer cancelTimeoutCtx() | ||
|
||
res, err := c.SDK.Request(ctx, http.MethodGet, "/api/v2/workspaceagents/me/reinit", nil) | ||
if err != nil { | ||
return xerrors.Errorf("execute request: %w", err) | ||
return nil, xerrors.Errorf("execute request: %w", err) | ||
} | ||
defer res.Body.Close() | ||
|
||
if res.StatusCode != http.StatusOK { | ||
return codersdk.ReadBodyAsError(res) | ||
return nil, codersdk.ReadBodyAsError(res) | ||
} | ||
|
||
nextEvent := codersdk.ServerSentEventReader(ctx, res.Body) | ||
|
||
for { | ||
// TODO (Sasswart): I don't like that we do this select at the start and at the end. | ||
// nextEvent should return an error if the context is canceled, but that feels like a larger refactor. | ||
// if it did, we'd only have the select at the end of the loop. | ||
select { | ||
case <-ctx.Done(): | ||
return ctx.Err() | ||
return nil, ctx.Err() | ||
default: | ||
} | ||
|
||
sse, err := nextEvent() | ||
if err != nil { | ||
return xerrors.Errorf("failed to read server-sent event: %w", err) | ||
return nil, xerrors.Errorf("failed to read server-sent event: %w", err) | ||
} | ||
if sse.Type != codersdk.ServerSentEventTypeData { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ignoring Ping seems OK, but probably shouldn't ignore errors. |
||
continue | ||
} | ||
var reinitEvent ReinitializationEvent | ||
b, ok := sse.Data.([]byte) | ||
if !ok { | ||
return xerrors.Errorf("expected data as []byte, got %T", sse.Data) | ||
return nil, xerrors.Errorf("expected data as []byte, got %T", sse.Data) | ||
} | ||
err = json.Unmarshal(b, &reinitEvent) | ||
if err != nil { | ||
return xerrors.Errorf("unmarshal reinit response: %w", err) | ||
return nil, xerrors.Errorf("unmarshal reinit response: %w", err) | ||
} | ||
select { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. At this point you've got the event; no reason to check the context for cancelation again, just return it. |
||
case <-ctx.Done(): | ||
return ctx.Err() | ||
case events <- reinitEvent: | ||
return nil, ctx.Err() | ||
default: | ||
return &reinitEvent, nil | ||
} | ||
} | ||
} | ||
|
||
func WaitForReinitLoop(ctx context.Context, logger slog.Logger, client *Client) <-chan ReinitializationEvent { | ||
reinitEvents := make(chan ReinitializationEvent) | ||
|
||
go func() { | ||
for retrier := retry.New(100*time.Millisecond, 10*time.Second); retrier.Wait(ctx); { | ||
logger.Debug(ctx, "waiting for agent reinitialization instructions") | ||
reinitEvent, err := client.WaitForReinit(ctx) | ||
if err != nil { | ||
logger.Error(ctx, "failed to wait for agent reinitialization instructions", slog.Error(err)) | ||
} | ||
reinitEvents <- *reinitEvent | ||
select { | ||
case <-ctx.Done(): | ||
close(reinitEvents) | ||
return | ||
case reinitEvents <- *reinitEvent: | ||
} | ||
} | ||
}() | ||
|
||
return reinitEvents | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Does this essentially place an upper limit of 24 hours on the maximum lifetime of a prebuild?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is being worked on currently; disregard for the moment.