Skip to content

Commit 3027bcf

Browse files
committed
feat: track resource replacements when claiming a prebuilt workspace (#17571)
Closes coder/internal#369 We can't know whether a replacement (i.e. drift of terraform state leading to a resource needing to be deleted/recreated) will take place apriori; we can only detect it at `plan` time, because the provider decides whether a resource must be replaced and it cannot be inferred through static analysis of the template. **This is likely to be the most common gotcha with using prebuilds, since it requires a slight template modification to use prebuilds effectively**, so let's head this off before it's an issue for customers. Drift details will now be logged in the workspace build logs: ![image](https://github.com/user-attachments/assets/da1988b6-2cbe-4a79-a3c5-ea29891f3d6f) Plus a notification will be sent to template admins when this situation arises: ![image](https://github.com/user-attachments/assets/39d555b1-a262-4a3e-b529-03b9f23bf66a) A new metric - `coderd_prebuilt_workspaces_resource_replacements_total` - will also increment each time a workspace encounters replacements. We only track _that_ a resource replacement occurred, not how many. Just one is enough to ruin a prebuild, but we can't know apriori which replacement would cause this. For example, say we have 2 replacements: a `docker_container` and a `null_resource`; we don't know which one might cause an issue (or indeed if either would), so we just track the replacement. --------- Signed-off-by: Danny Kopping <dannykopping@gmail.com>
1 parent 93ce739 commit 3027bcf

33 files changed

+2034
-957
lines changed

cli/server.go

Lines changed: 31 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -910,6 +910,37 @@ func (r *RootCmd) Server(newAPI func(context.Context, *coderd.Options) (*coderd.
910910
options.StatsBatcher = batcher
911911
defer closeBatcher()
912912

913+
// Manage notifications.
914+
var (
915+
notificationsCfg = options.DeploymentValues.Notifications
916+
notificationsManager *notifications.Manager
917+
)
918+
919+
metrics := notifications.NewMetrics(options.PrometheusRegistry)
920+
helpers := templateHelpers(options)
921+
922+
// The enqueuer is responsible for enqueueing notifications to the given store.
923+
enqueuer, err := notifications.NewStoreEnqueuer(notificationsCfg, options.Database, helpers, logger.Named("notifications.enqueuer"), quartz.NewReal())
924+
if err != nil {
925+
return xerrors.Errorf("failed to instantiate notification store enqueuer: %w", err)
926+
}
927+
options.NotificationsEnqueuer = enqueuer
928+
929+
// The notification manager is responsible for:
930+
// - creating notifiers and managing their lifecycles (notifiers are responsible for dequeueing/sending notifications)
931+
// - keeping the store updated with status updates
932+
notificationsManager, err = notifications.NewManager(notificationsCfg, options.Database, options.Pubsub, helpers, metrics, logger.Named("notifications.manager"))
933+
if err != nil {
934+
return xerrors.Errorf("failed to instantiate notification manager: %w", err)
935+
}
936+
937+
// nolint:gocritic // We need to run the manager in a notifier context.
938+
notificationsManager.Run(dbauthz.AsNotifier(ctx))
939+
940+
// Run report generator to distribute periodic reports.
941+
notificationReportGenerator := reports.NewReportGenerator(ctx, logger.Named("notifications.report_generator"), options.Database, options.NotificationsEnqueuer, quartz.NewReal())
942+
defer notificationReportGenerator.Close()
943+
913944
// We use a separate coderAPICloser so the Enterprise API
914945
// can have its own close functions. This is cleaner
915946
// than abstracting the Coder API itself.
@@ -957,37 +988,6 @@ func (r *RootCmd) Server(newAPI func(context.Context, *coderd.Options) (*coderd.
957988
return xerrors.Errorf("write config url: %w", err)
958989
}
959990

960-
// Manage notifications.
961-
var (
962-
notificationsCfg = options.DeploymentValues.Notifications
963-
notificationsManager *notifications.Manager
964-
)
965-
966-
metrics := notifications.NewMetrics(options.PrometheusRegistry)
967-
helpers := templateHelpers(options)
968-
969-
// The enqueuer is responsible for enqueueing notifications to the given store.
970-
enqueuer, err := notifications.NewStoreEnqueuer(notificationsCfg, options.Database, helpers, logger.Named("notifications.enqueuer"), quartz.NewReal())
971-
if err != nil {
972-
return xerrors.Errorf("failed to instantiate notification store enqueuer: %w", err)
973-
}
974-
options.NotificationsEnqueuer = enqueuer
975-
976-
// The notification manager is responsible for:
977-
// - creating notifiers and managing their lifecycles (notifiers are responsible for dequeueing/sending notifications)
978-
// - keeping the store updated with status updates
979-
notificationsManager, err = notifications.NewManager(notificationsCfg, options.Database, options.Pubsub, helpers, metrics, logger.Named("notifications.manager"))
980-
if err != nil {
981-
return xerrors.Errorf("failed to instantiate notification manager: %w", err)
982-
}
983-
984-
// nolint:gocritic // We need to run the manager in a notifier context.
985-
notificationsManager.Run(dbauthz.AsNotifier(ctx))
986-
987-
// Run report generator to distribute periodic reports.
988-
notificationReportGenerator := reports.NewReportGenerator(ctx, logger.Named("notifications.report_generator"), options.Database, options.NotificationsEnqueuer, quartz.NewReal())
989-
defer notificationReportGenerator.Close()
990-
991991
// Since errCh only has one buffered slot, all routines
992992
// sending on it must be wrapped in a select/default to
993993
// avoid leaving dangling goroutines waiting for the

coderd/coderd.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1774,6 +1774,7 @@ func (api *API) CreateInMemoryTaggedProvisionerDaemon(dialCtx context.Context, n
17741774
Clock: api.Clock,
17751775
},
17761776
api.NotificationsEnqueuer,
1777+
&api.PrebuildsReconciler,
17771778
)
17781779
if err != nil {
17791780
return nil, err
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
DELETE FROM notification_templates WHERE id = '89d9745a-816e-4695-a17f-3d0a229e2b8d';
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
INSERT INTO notification_templates
2+
(id, name, title_template, body_template, "group", actions)
3+
VALUES ('89d9745a-816e-4695-a17f-3d0a229e2b8d',
4+
'Prebuilt Workspace Resource Replaced',
5+
E'There might be a problem with a recently claimed prebuilt workspace',
6+
$$
7+
Workspace **{{.Labels.workspace}}** was claimed from a prebuilt workspace by **{{.Labels.claimant}}**.
8+
9+
During the claim, Terraform destroyed and recreated the following resources
10+
because one or more immutable attributes changed:
11+
12+
{{range $resource, $paths := .Data.replacements -}}
13+
- _{{ $resource }}_ was replaced due to changes to _{{ $paths }}_
14+
{{end}}
15+
16+
When Terraform must change an immutable attribute, it replaces the entire resource.
17+
If you’re using prebuilds to speed up provisioning, unexpected replacements will slow down
18+
workspace startup—even when claiming a prebuilt environment.
19+
20+
For tips on preventing replacements and improving claim performance, see [this guide](https://coder.com/docs/admin/templates/extending-templates/prebuilt-workspaces#preventing-resource-replacement).
21+
22+
NOTE: this prebuilt workspace used the **{{.Labels.preset}}** preset.
23+
$$,
24+
'Template Events',
25+
'[
26+
{
27+
"label": "View workspace build",
28+
"url": "{{base_url}}/@{{.Labels.claimant}}/{{.Labels.workspace}}/builds/{{.Labels.workspace_build_num}}"
29+
},
30+
{
31+
"label": "View template version",
32+
"url": "{{base_url}}/templates/{{.Labels.org}}/{{.Labels.template}}/versions/{{.Labels.template_version}}"
33+
}
34+
]'::jsonb);

coderd/notifications/events.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ var (
3939
TemplateTemplateDeprecated = uuid.MustParse("f40fae84-55a2-42cd-99fa-b41c1ca64894")
4040

4141
TemplateWorkspaceBuildsFailedReport = uuid.MustParse("34a20db2-e9cc-4a93-b0e4-8569699d7a00")
42+
TemplateWorkspaceResourceReplaced = uuid.MustParse("89d9745a-816e-4695-a17f-3d0a229e2b8d")
4243
)
4344

4445
// Notification-related events.

coderd/notifications/notifications_test.go

Lines changed: 26 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,9 @@ import (
3535
"golang.org/x/xerrors"
3636

3737
"cdr.dev/slog"
38+
"github.com/coder/quartz"
39+
"github.com/coder/serpent"
40+
3841
"github.com/coder/coder/v2/coderd/coderdtest"
3942
"github.com/coder/coder/v2/coderd/database"
4043
"github.com/coder/coder/v2/coderd/database/dbauthz"
@@ -48,8 +51,6 @@ import (
4851
"github.com/coder/coder/v2/coderd/util/syncmap"
4952
"github.com/coder/coder/v2/codersdk"
5053
"github.com/coder/coder/v2/testutil"
51-
"github.com/coder/quartz"
52-
"github.com/coder/serpent"
5354
)
5455

5556
// updateGoldenFiles is a flag that can be set to update golden files.
@@ -1226,6 +1227,29 @@ func TestNotificationTemplates_Golden(t *testing.T) {
12261227
Labels: map[string]string{},
12271228
},
12281229
},
1230+
{
1231+
name: "TemplateWorkspaceResourceReplaced",
1232+
id: notifications.TemplateWorkspaceResourceReplaced,
1233+
payload: types.MessagePayload{
1234+
UserName: "Bobby",
1235+
UserEmail: "bobby@coder.com",
1236+
UserUsername: "bobby",
1237+
Labels: map[string]string{
1238+
"org": "cern",
1239+
"workspace": "my-workspace",
1240+
"workspace_build_num": "2",
1241+
"template": "docker",
1242+
"template_version": "angry_torvalds",
1243+
"preset": "particle-accelerator",
1244+
"claimant": "prebuilds-claimer",
1245+
},
1246+
Data: map[string]any{
1247+
"replacements": map[string]string{
1248+
"docker_container[0]": "env, hostname",
1249+
},
1250+
},
1251+
},
1252+
},
12291253
}
12301254

12311255
// We must have a test case for every notification_template. This is enforced below:

coderd/notifications/notificationstest/fake_enqueuer.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ import (
99
"github.com/prometheus/client_golang/prometheus"
1010

1111
"github.com/coder/coder/v2/coderd/database/dbauthz"
12+
"github.com/coder/coder/v2/coderd/notifications"
1213
"github.com/coder/coder/v2/coderd/rbac"
1314
"github.com/coder/coder/v2/coderd/rbac/policy"
1415
)
@@ -19,6 +20,12 @@ type FakeEnqueuer struct {
1920
sent []*FakeNotification
2021
}
2122

23+
var _ notifications.Enqueuer = &FakeEnqueuer{}
24+
25+
func NewFakeEnqueuer() *FakeEnqueuer {
26+
return &FakeEnqueuer{}
27+
}
28+
2229
type FakeNotification struct {
2330
UserID, TemplateID uuid.UUID
2431
Labels map[string]string
Lines changed: 131 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,131 @@
1+
From: system@coder.com
2+
To: bobby@coder.com
3+
Subject: There might be a problem with a recently claimed prebuilt workspace
4+
Message-Id: 02ee4935-73be-4fa1-a290-ff9999026b13@blush-whale-48
5+
Date: Fri, 11 Oct 2024 09:03:06 +0000
6+
Content-Type: multipart/alternative; boundary=bbe61b741255b6098bb6b3c1f41b885773df633cb18d2a3002b68e4bc9c4
7+
MIME-Version: 1.0
8+
9+
--bbe61b741255b6098bb6b3c1f41b885773df633cb18d2a3002b68e4bc9c4
10+
Content-Transfer-Encoding: quoted-printable
11+
Content-Type: text/plain; charset=UTF-8
12+
13+
Hi Bobby,
14+
15+
Workspace my-workspace was claimed from a prebuilt workspace by prebuilds-c=
16+
laimer.
17+
18+
During the claim, Terraform destroyed and recreated the following resources
19+
because one or more immutable attributes changed:
20+
21+
docker_container[0] was replaced due to changes to env, hostname
22+
23+
When Terraform must change an immutable attribute, it replaces the entire r=
24+
esource.
25+
If you=E2=80=99re using prebuilds to speed up provisioning, unexpected repl=
26+
acements will slow down
27+
workspace startup=E2=80=94even when claiming a prebuilt environment.
28+
29+
For tips on preventing replacements and improving claim performance, see th=
30+
is guide (https://coder.com/docs/admin/templates/extending-templates/prebui=
31+
lt-workspaces#preventing-resource-replacement).
32+
33+
NOTE: this prebuilt workspace used the particle-accelerator preset.
34+
35+
36+
View workspace build: http://test.com/@prebuilds-claimer/my-workspace/build=
37+
s/2
38+
39+
View template version: http://test.com/templates/cern/docker/versions/angry=
40+
_torvalds
41+
42+
--bbe61b741255b6098bb6b3c1f41b885773df633cb18d2a3002b68e4bc9c4
43+
Content-Transfer-Encoding: quoted-printable
44+
Content-Type: text/html; charset=UTF-8
45+
46+
<!doctype html>
47+
<html lang=3D"en">
48+
<head>
49+
<meta charset=3D"UTF-8" />
50+
<meta name=3D"viewport" content=3D"width=3Ddevice-width, initial-scale=
51+
=3D1.0" />
52+
<title>There might be a problem with a recently claimed prebuilt worksp=
53+
ace</title>
54+
</head>
55+
<body style=3D"margin: 0; padding: 0; font-family: -apple-system, system-=
56+
ui, BlinkMacSystemFont, 'Segoe UI', 'Roboto', 'Oxygen', 'Ubuntu', 'Cantarel=
57+
l', 'Fira Sans', 'Droid Sans', 'Helvetica Neue', sans-serif; color: #020617=
58+
; background: #f8fafc;">
59+
<div style=3D"max-width: 600px; margin: 20px auto; padding: 60px; borde=
60+
r: 1px solid #e2e8f0; border-radius: 8px; background-color: #fff; text-alig=
61+
n: left; font-size: 14px; line-height: 1.5;">
62+
<div style=3D"text-align: center;">
63+
<img src=3D"https://coder.com/coder-logo-horizontal.png" alt=3D"Cod=
64+
er Logo" style=3D"height: 40px;" />
65+
</div>
66+
<h1 style=3D"text-align: center; font-size: 24px; font-weight: 400; m=
67+
argin: 8px 0 32px; line-height: 1.5;">
68+
There might be a problem with a recently claimed prebuilt workspace
69+
</h1>
70+
<div style=3D"line-height: 1.5;">
71+
<p>Hi Bobby,</p>
72+
<p>Workspace <strong>my-workspace</strong> was claimed from a prebu=
73+
ilt workspace by <strong>prebuilds-claimer</strong>.</p>
74+
75+
<p>During the claim, Terraform destroyed and recreated the following resour=
76+
ces<br>
77+
because one or more immutable attributes changed:</p>
78+
79+
<ul>
80+
<li>_docker<em>container[0]</em> was replaced due to changes to <em>env, h=
81+
ostname</em><br>
82+
</li>
83+
</ul>
84+
85+
<p>When Terraform must change an immutable attribute, it replaces the entir=
86+
e resource.<br>
87+
If you=E2=80=99re using prebuilds to speed up provisioning, unexpected repl=
88+
acements will slow down<br>
89+
workspace startup=E2=80=94even when claiming a prebuilt environment.</p>
90+
91+
<p>For tips on preventing replacements and improving claim performance, see=
92+
<a href=3D"https://coder.com/docs/admin/templates/extending-templates/preb=
93+
uilt-workspaces#preventing-resource-replacement">this guide</a>.</p>
94+
95+
<p>NOTE: this prebuilt workspace used the <strong>particle-accelerator</str=
96+
ong> preset.</p>
97+
</div>
98+
<div style=3D"text-align: center; margin-top: 32px;">
99+
=20
100+
<a href=3D"http://test.com/@prebuilds-claimer/my-workspace/builds/2=
101+
" style=3D"display: inline-block; padding: 13px 24px; background-color: #02=
102+
0617; color: #f8fafc; text-decoration: none; border-radius: 8px; margin: 0 =
103+
4px;">
104+
View workspace build
105+
</a>
106+
=20
107+
<a href=3D"http://test.com/templates/cern/docker/versions/angry_tor=
108+
valds" style=3D"display: inline-block; padding: 13px 24px; background-color=
109+
: #020617; color: #f8fafc; text-decoration: none; border-radius: 8px; margi=
110+
n: 0 4px;">
111+
View template version
112+
</a>
113+
=20
114+
</div>
115+
<div style=3D"border-top: 1px solid #e2e8f0; color: #475569; font-siz=
116+
e: 12px; margin-top: 64px; padding-top: 24px; line-height: 1.6;">
117+
<p>&copy;&nbsp;2024&nbsp;Coder. All rights reserved&nbsp;-&nbsp;<a =
118+
href=3D"http://test.com" style=3D"color: #2563eb; text-decoration: none;">h=
119+
ttp://test.com</a></p>
120+
<p><a href=3D"http://test.com/settings/notifications" style=3D"colo=
121+
r: #2563eb; text-decoration: none;">Click here to manage your notification =
122+
settings</a></p>
123+
<p><a href=3D"http://test.com/settings/notifications?disabled=3D89d=
124+
9745a-816e-4695-a17f-3d0a229e2b8d" style=3D"color: #2563eb; text-decoration=
125+
: none;">Stop receiving emails like this</a></p>
126+
</div>
127+
</div>
128+
</body>
129+
</html>
130+
131+
--bbe61b741255b6098bb6b3c1f41b885773df633cb18d2a3002b68e4bc9c4--
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
{
2+
"_version": "1.1",
3+
"msg_id": "00000000-0000-0000-0000-000000000000",
4+
"payload": {
5+
"_version": "1.2",
6+
"notification_name": "Prebuilt Workspace Resource Replaced",
7+
"notification_template_id": "00000000-0000-0000-0000-000000000000",
8+
"user_id": "00000000-0000-0000-0000-000000000000",
9+
"user_email": "bobby@coder.com",
10+
"user_name": "Bobby",
11+
"user_username": "bobby",
12+
"actions": [
13+
{
14+
"label": "View workspace build",
15+
"url": "http://test.com/@prebuilds-claimer/my-workspace/builds/2"
16+
},
17+
{
18+
"label": "View template version",
19+
"url": "http://test.com/templates/cern/docker/versions/angry_torvalds"
20+
}
21+
],
22+
"labels": {
23+
"claimant": "prebuilds-claimer",
24+
"org": "cern",
25+
"preset": "particle-accelerator",
26+
"template": "docker",
27+
"template_version": "angry_torvalds",
28+
"workspace": "my-workspace",
29+
"workspace_build_num": "2"
30+
},
31+
"data": {
32+
"replacements": {
33+
"docker_container[0]": "env, hostname"
34+
}
35+
},
36+
"targets": null
37+
},
38+
"title": "There might be a problem with a recently claimed prebuilt workspace",
39+
"title_markdown": "There might be a problem with a recently claimed prebuilt workspace",
40+
"body": "Workspace my-workspace was claimed from a prebuilt workspace by prebuilds-claimer.\n\nDuring the claim, Terraform destroyed and recreated the following resources\nbecause one or more immutable attributes changed:\n\ndocker_container[0] was replaced due to changes to env, hostname\n\nWhen Terraform must change an immutable attribute, it replaces the entire resource.\nIf you’re using prebuilds to speed up provisioning, unexpected replacements will slow down\nworkspace startup—even when claiming a prebuilt environment.\n\nFor tips on preventing replacements and improving claim performance, see this guide (https://coder.com/docs/admin/templates/extending-templates/prebuilt-workspaces#preventing-resource-replacement).\n\nNOTE: this prebuilt workspace used the particle-accelerator preset.",
41+
"body_markdown": "\nWorkspace **my-workspace** was claimed from a prebuilt workspace by **prebuilds-claimer**.\n\nDuring the claim, Terraform destroyed and recreated the following resources\nbecause one or more immutable attributes changed:\n\n- _docker_container[0]_ was replaced due to changes to _env, hostname_\n\n\nWhen Terraform must change an immutable attribute, it replaces the entire resource.\nIf you’re using prebuilds to speed up provisioning, unexpected replacements will slow down\nworkspace startup—even when claiming a prebuilt environment.\n\nFor tips on preventing replacements and improving claim performance, see [this guide](https://coder.com/docs/admin/templates/extending-templates/prebuilt-workspaces#preventing-resource-replacement).\n\nNOTE: this prebuilt workspace used the **particle-accelerator** preset.\n"
42+
}

coderd/prebuilds/api.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ import (
77
"golang.org/x/xerrors"
88

99
"github.com/coder/coder/v2/coderd/database"
10+
sdkproto "github.com/coder/coder/v2/provisionersdk/proto"
1011
)
1112

1213
var (
@@ -27,6 +28,11 @@ type ReconciliationOrchestrator interface {
2728
// Stop gracefully shuts down the orchestrator with the given cause.
2829
// The cause is used for logging and error reporting.
2930
Stop(ctx context.Context, cause error)
31+
32+
// TrackResourceReplacement handles a pathological situation whereby a terraform resource is replaced due to drift,
33+
// which can obviate the whole point of pre-provisioning a prebuilt workspace.
34+
// See more detail at https://coder.com/docs/admin/templates/extending-templates/prebuilt-workspaces#preventing-resource-replacement.
35+
TrackResourceReplacement(ctx context.Context, workspaceID, buildID uuid.UUID, replacements []*sdkproto.ResourceReplacement)
3036
}
3137

3238
type Reconciler interface {

0 commit comments

Comments
 (0)