Skip to content

fix: reduce cost of prebuild failure #17697

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 51 commits into from
May 21, 2025
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
51 commits
Select commit Hold shift + click to select a range
fcb3e5d
fix: limit prebuild failure cost
evgeniy-scherbina May 6, 2025
701e2a8
refactor: CR's fixes
evgeniy-scherbina May 12, 2025
0fdd096
refactor: CR's fixes
evgeniy-scherbina May 12, 2025
13c0e28
refactor: CR's fixes
evgeniy-scherbina May 12, 2025
29e9cff
refactor: CR's fixes
evgeniy-scherbina May 12, 2025
1d51dfc
Merge remote-tracking branch 'origin/main' into 17432-limit-prebuild-…
evgeniy-scherbina May 14, 2025
5c60065
feat: add notification for prebuilds failure
evgeniy-scherbina May 14, 2025
b0248c3
test: fix tests
evgeniy-scherbina May 15, 2025
08b18b0
test: add a notification test
evgeniy-scherbina May 15, 2025
0fd347a
fix: add prebuild_status migration
evgeniy-scherbina May 16, 2025
08981a4
generate.sh
evgeniy-scherbina May 16, 2025
0e4600c
fix: using prebuild status
evgeniy-scherbina May 16, 2025
107879b
fix: migrations numbers
evgeniy-scherbina May 16, 2025
2dea5ab
make gen
evgeniy-scherbina May 16, 2025
e252093
make gen/golden-files
evgeniy-scherbina May 16, 2025
906ceb9
fix: fix dbauthz
evgeniy-scherbina May 16, 2025
3a0284f
Merge remote-tracking branch 'origin/main' into 17432-limit-prebuild-…
evgeniy-scherbina May 16, 2025
13fb1af
fix: minor fix
evgeniy-scherbina May 16, 2025
108ea0a
test: fix tests
evgeniy-scherbina May 19, 2025
7cd2799
test: fix dbmem
evgeniy-scherbina May 19, 2025
f9148e2
make gen/golden-files
evgeniy-scherbina May 19, 2025
315acfa
Remove outdated TODOs
evgeniy-scherbina May 19, 2025
1679ac9
fix: enhance notification template
evgeniy-scherbina May 19, 2025
d9c9d17
Merge remote-tracking branch 'origin/main' into 17432-limit-prebuild-…
evgeniy-scherbina May 19, 2025
c47ddaa
make gen/golden-files
evgeniy-scherbina May 19, 2025
c19f28a
refactor: minor improvements
evgeniy-scherbina May 19, 2025
b18730f
fix: minor fix in template
evgeniy-scherbina May 19, 2025
37173b0
fix: minor improvement to template
evgeniy-scherbina May 19, 2025
0e3cc40
fix: minor fixes
evgeniy-scherbina May 19, 2025
784adba
fix: fix TODO
evgeniy-scherbina May 19, 2025
50bd9b4
refactor: CR's fixes
evgeniy-scherbina May 20, 2025
8a33ac8
refactor: CR's fixes
evgeniy-scherbina May 20, 2025
9439fa1
refactor: use not null for prebuild_status
evgeniy-scherbina May 20, 2025
e667dad
refactor: fix dbauthz test
evgeniy-scherbina May 20, 2025
8a51702
refactor: use healthy enum option instead of normal
evgeniy-scherbina May 20, 2025
a793e18
refactor: minor refactoring in tests
evgeniy-scherbina May 20, 2025
a0fb69c
refactor: rename DB method
evgeniy-scherbina May 20, 2025
c6f209c
refactor: make fmt
evgeniy-scherbina May 20, 2025
9dccf3e
refactor: make gen + reorder methods
evgeniy-scherbina May 20, 2025
c19ae04
refactor: make fmt
evgeniy-scherbina May 20, 2025
4b145cc
refactor: CR's fixes
evgeniy-scherbina May 20, 2025
80f3677
refactor: make gen
evgeniy-scherbina May 20, 2025
2144d13
refactor: CR's fixes
evgeniy-scherbina May 20, 2025
79725b3
refactor: CR's fixes
evgeniy-scherbina May 20, 2025
7e8b4b6
refactor: improve comments for test
evgeniy-scherbina May 20, 2025
ab5acfb
refactor: CR's fixes
evgeniy-scherbina May 20, 2025
7c09465
refactor: CR's fixes
evgeniy-scherbina May 21, 2025
b7a34c5
refactor: improve notification template
evgeniy-scherbina May 21, 2025
e1e141d
refactor: CR's fixes
evgeniy-scherbina May 21, 2025
354aeb4
refactor: CR's fixes
evgeniy-scherbina May 21, 2025
7564178
Merge remote-tracking branch 'origin/main' into 17432-limit-prebuild-…
evgeniy-scherbina May 21, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
fix: using prebuild status
  • Loading branch information
evgeniy-scherbina committed May 16, 2025
commit 0e4600cc84a3d9afb465ebc3625807ee1f7f1bad
4 changes: 4 additions & 0 deletions coderd/database/dbauthz/dbauthz.go
Original file line number Diff line number Diff line change
Expand Up @@ -4178,6 +4178,10 @@ func (q *querier) UpdateOrganizationDeletedByID(ctx context.Context, arg databas
return deleteQ(q.log, q.auth, q.db.GetOrganizationByID, deleteF)(ctx, arg.ID)
}

func (q *querier) UpdatePrebuildStatus(ctx context.Context, arg database.UpdatePrebuildStatusParams) error {
panic("not implemented")
}

func (q *querier) UpdateProvisionerDaemonLastSeenAt(ctx context.Context, arg database.UpdateProvisionerDaemonLastSeenAtParams) error {
if err := q.authorizeContext(ctx, policy.ActionUpdate, rbac.ResourceProvisionerDaemon); err != nil {
return err
Expand Down
9 changes: 9 additions & 0 deletions coderd/database/dbmem/dbmem.go
Original file line number Diff line number Diff line change
Expand Up @@ -10867,6 +10867,15 @@ func (q *FakeQuerier) UpdateOrganizationDeletedByID(_ context.Context, arg datab
return sql.ErrNoRows
}

func (q *FakeQuerier) UpdatePrebuildStatus(ctx context.Context, arg database.UpdatePrebuildStatusParams) error {
err := validateDatabaseType(arg)
if err != nil {
return err
}

panic("not implemented")
}

func (q *FakeQuerier) UpdateProvisionerDaemonLastSeenAt(_ context.Context, arg database.UpdateProvisionerDaemonLastSeenAtParams) error {
err := validateDatabaseType(arg)
if err != nil {
Expand Down
7 changes: 7 additions & 0 deletions coderd/database/dbmetrics/querymetrics.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions coderd/database/querier.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

41 changes: 30 additions & 11 deletions coderd/database/queries.sql.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions coderd/database/queries/prebuilds.sql
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ SELECT
tvp.id,
tvp.name,
tvp.desired_instances AS desired_instances,
tvp.prebuild_status,
t.deleted,
t.deprecated != '' AS deprecated
FROM templates t
Expand Down
5 changes: 5 additions & 0 deletions coderd/database/queries/presets.sql
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,11 @@ SELECT
unnest(@values :: TEXT[])
RETURNING *;

-- name: UpdatePrebuildStatus :exec
UPDATE template_version_presets
SET prebuild_status = @status
WHERE id = @preset_id;

-- name: GetPresetsByTemplateVersionID :many
SELECT
*
Expand Down
101 changes: 65 additions & 36 deletions enterprise/coderd/prebuilds/reconcile.go
Original file line number Diff line number Diff line change
Expand Up @@ -361,49 +361,32 @@ func (c *StoreReconciler) ReconcilePreset(ctx context.Context, ps prebuilds.Pres
slog.F("preset_name", ps.Preset.Name),
)

if ps.IsHardLimited {
// If the preset was previously hard-limited, log it and exit early.
if ps.Preset.PrebuildStatus.PrebuildStatus == database.PrebuildStatusHardLimited {
logger.Warn(ctx, "skipping hard limited preset", slog.F("preset_id", ps.Preset.ID), slog.F("name", ps.Preset.Name))
return nil
}

// TODO: rename ctx?
// nolint:gocritic // Necessary to query all the required data.
ctx := dbauthz.AsSystemRestricted(ctx)

// TODO(yevhenii): move into separate function
// Send notification to template admins.
if c.notifEnq == nil {
c.logger.Warn(ctx, "notification enqueuer not set, cannot send resource replacement notification(s)")
return nil
}
// If the preset reached the hard failure limit for the first time during this iteration:
// - Mark it as hard-limited in the database
// - Send notifications to template admins
if ps.IsHardLimited {
logger.Warn(ctx, "skipping hard limited preset", slog.F("preset_id", ps.Preset.ID), slog.F("name", ps.Preset.Name))

// TODO(yevhenii): remove owner from the list
templateAdmins, err := c.store.GetUsers(ctx, database.GetUsersParams{
RbacRole: []string{codersdk.RoleTemplateAdmin, codersdk.RoleOwner},
err := c.store.UpdatePrebuildStatus(ctx, database.UpdatePrebuildStatusParams{
Status: database.NullPrebuildStatus{
PrebuildStatus: database.PrebuildStatusHardLimited,
Valid: true,
},
PresetID: ps.Preset.ID,
})
if err != nil {
return xerrors.Errorf("fetch template admins: %w", err)
return err
}

for _, templateAdmin := range templateAdmins {
if _, err := c.notifEnq.EnqueueWithData(ctx, templateAdmin.ID, notifications.PrebuildFailureLimitReached,
map[string]string{
"org": ps.Preset.OrganizationName,
"template": ps.Preset.TemplateName,
"template_version": ps.Preset.TemplateVersionName,
"preset": ps.Preset.Name,
},
map[string]any{},
"prebuilds_reconciler",
// Associate this notification with all the related entities.
ps.Preset.TemplateID, ps.Preset.TemplateVersionID, ps.Preset.ID,
); err != nil {
c.logger.Error(ctx,
"failed to send notification",
slog.Error(err),
slog.F("template_admin_id", templateAdmin.ID.String()),
)

continue
}
err = c.notifyPrebuildFailureLimitReached(ctx, ps)
if err != nil {
return err
}

return nil
Expand Down Expand Up @@ -502,6 +485,52 @@ func (c *StoreReconciler) ReconcilePreset(ctx context.Context, ps prebuilds.Pres
}
}

func (c *StoreReconciler) notifyPrebuildFailureLimitReached(ctx context.Context, ps prebuilds.PresetSnapshot) error {
// TODO: rename ctx?
// nolint:gocritic // Necessary to query all the required data.
ctx = dbauthz.AsSystemRestricted(ctx)

// TODO(yevhenii): move into separate function
// Send notification to template admins.
if c.notifEnq == nil {
c.logger.Warn(ctx, "notification enqueuer not set, cannot send resource replacement notification(s)")
return nil
}

// TODO(yevhenii): remove owner from the list
templateAdmins, err := c.store.GetUsers(ctx, database.GetUsersParams{
RbacRole: []string{codersdk.RoleTemplateAdmin, codersdk.RoleOwner},
})
if err != nil {
return xerrors.Errorf("fetch template admins: %w", err)
}

for _, templateAdmin := range templateAdmins {
if _, err := c.notifEnq.EnqueueWithData(ctx, templateAdmin.ID, notifications.PrebuildFailureLimitReached,
map[string]string{
"org": ps.Preset.OrganizationName,
"template": ps.Preset.TemplateName,
"template_version": ps.Preset.TemplateVersionName,
"preset": ps.Preset.Name,
},
map[string]any{},
"prebuilds_reconciler",
// Associate this notification with all the related entities.
ps.Preset.TemplateID, ps.Preset.TemplateVersionID, ps.Preset.ID,
); err != nil {
c.logger.Error(ctx,
"failed to send notification",
slog.Error(err),
slog.F("template_admin_id", templateAdmin.ID.String()),
)

continue
}
}

return nil
}

func (c *StoreReconciler) CalculateActions(ctx context.Context, snapshot prebuilds.PresetSnapshot) (*prebuilds.ReconciliationActions, error) {
if ctx.Err() != nil {
return nil, ctx.Err()
Expand Down
5 changes: 5 additions & 0 deletions enterprise/coderd/prebuilds/reconcile_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -733,6 +733,11 @@ func TestSkippingHardLimitedPresets(t *testing.T) {
// The outcome depends on whether the hard limit has been reached
require.NoError(t, controller.ReconcileAll(ctx))

// These two additional calls to ReconcileAll should not trigger any notifications.
// A notification is only sent once.
require.NoError(t, controller.ReconcileAll(ctx))
require.NoError(t, controller.ReconcileAll(ctx))

// Verify the final state after reconciliation
workspaces, err = db.GetWorkspacesByTemplateID(ctx, template.ID)
require.NoError(t, err)
Expand Down
Loading