diff --git a/cli/testdata/server-config.yaml.golden b/cli/testdata/server-config.yaml.golden index 9995a7f389130..7403819a2d10b 100644 --- a/cli/testdata/server-config.yaml.golden +++ b/cli/testdata/server-config.yaml.golden @@ -704,3 +704,7 @@ workspace_prebuilds: # backoff. # (default: 1h0m0s, type: duration) reconciliation_backoff_lookback_period: 1h0m0s + # Maximum number of consecutive failed prebuilds before a preset hits the hard + # limit; disabled when set to zero. + # (default: 3, type: int) + failure_hard_limit: 3 diff --git a/coderd/apidoc/docs.go b/coderd/apidoc/docs.go index e98197d3b5bb2..7cee63e183e7e 100644 --- a/coderd/apidoc/docs.go +++ b/coderd/apidoc/docs.go @@ -14326,6 +14326,10 @@ const docTemplate = `{ "codersdk.PrebuildsConfig": { "type": "object", "properties": { + "failure_hard_limit": { + "description": "FailureHardLimit defines the maximum number of consecutive failed prebuild attempts allowed\nbefore a preset is considered to be in a hard limit state. When a preset hits this limit,\nno new prebuilds will be created until the limit is reset.\nFailureHardLimit is disabled when set to zero.", + "type": "integer" + }, "reconciliation_backoff_interval": { "description": "ReconciliationBackoffInterval specifies the amount of time to increase the backoff interval\nwhen errors occur during reconciliation.", "type": "integer" diff --git a/coderd/apidoc/swagger.json b/coderd/apidoc/swagger.json index fa103f55fbe9f..89a582091496f 100644 --- a/coderd/apidoc/swagger.json +++ b/coderd/apidoc/swagger.json @@ -12968,6 +12968,10 @@ "codersdk.PrebuildsConfig": { "type": "object", "properties": { + "failure_hard_limit": { + "description": "FailureHardLimit defines the maximum number of consecutive failed prebuild attempts allowed\nbefore a preset is considered to be in a hard limit state. 
When a preset hits this limit,\nno new prebuilds will be created until the limit is reset.\nFailureHardLimit is disabled when set to zero.", + "type": "integer" + }, "reconciliation_backoff_interval": { "description": "ReconciliationBackoffInterval specifies the amount of time to increase the backoff interval\nwhen errors occur during reconciliation.", "type": "integer" diff --git a/coderd/database/dbauthz/dbauthz.go b/coderd/database/dbauthz/dbauthz.go index ab3781452dd2d..a210599d17cc4 100644 --- a/coderd/database/dbauthz/dbauthz.go +++ b/coderd/database/dbauthz/dbauthz.go @@ -2226,6 +2226,15 @@ func (q *querier) GetPresetParametersByTemplateVersionID(ctx context.Context, ar return q.db.GetPresetParametersByTemplateVersionID(ctx, args) } +func (q *querier) GetPresetsAtFailureLimit(ctx context.Context, hardLimit int64) ([]database.GetPresetsAtFailureLimitRow, error) { + // GetPresetsAtFailureLimit returns a list of template version presets that have reached the hard failure limit. + // Request the same authorization permissions as GetPresetsBackoff, since the methods are similar. + if err := q.authorizeContext(ctx, policy.ActionViewInsights, rbac.ResourceTemplate.All()); err != nil { + return nil, err + } + return q.db.GetPresetsAtFailureLimit(ctx, hardLimit) +} + func (q *querier) GetPresetsBackoff(ctx context.Context, lookback time.Time) ([]database.GetPresetsBackoffRow, error) { // GetPresetsBackoff returns a list of template version presets along with metadata such as the number of failed prebuilds. 
if err := q.authorizeContext(ctx, policy.ActionViewInsights, rbac.ResourceTemplate.All()); err != nil { @@ -4201,6 +4210,24 @@ func (q *querier) UpdateOrganizationDeletedByID(ctx context.Context, arg databas return deleteQ(q.log, q.auth, q.db.GetOrganizationByID, deleteF)(ctx, arg.ID) } +func (q *querier) UpdatePresetPrebuildStatus(ctx context.Context, arg database.UpdatePresetPrebuildStatusParams) error { + preset, err := q.db.GetPresetByID(ctx, arg.PresetID) + if err != nil { + return err + } + + object := rbac.ResourceTemplate. + WithID(preset.TemplateID.UUID). + InOrg(preset.OrganizationID) + + err = q.authorizeContext(ctx, policy.ActionUpdate, object) + if err != nil { + return err + } + + return q.db.UpdatePresetPrebuildStatus(ctx, arg) +} + func (q *querier) UpdateProvisionerDaemonLastSeenAt(ctx context.Context, arg database.UpdateProvisionerDaemonLastSeenAtParams) error { if err := q.authorizeContext(ctx, policy.ActionUpdate, rbac.ResourceProvisionerDaemon); err != nil { return err diff --git a/coderd/database/dbauthz/dbauthz_test.go b/coderd/database/dbauthz/dbauthz_test.go index e8b90afbc396d..703e51d739c47 100644 --- a/coderd/database/dbauthz/dbauthz_test.go +++ b/coderd/database/dbauthz/dbauthz_test.go @@ -4924,6 +4924,11 @@ func (s *MethodTestSuite) TestPrebuilds() { Asserts(rbac.ResourceWorkspace.All(), policy.ActionRead). ErrorsWithInMemDB(dbmem.ErrUnimplemented) })) + s.Run("GetPresetsAtFailureLimit", s.Subtest(func(_ database.Store, check *expects) { + check.Args(int64(0)). + Asserts(rbac.ResourceTemplate.All(), policy.ActionViewInsights). + ErrorsWithInMemDB(dbmem.ErrUnimplemented) + })) s.Run("GetPresetsBackoff", s.Subtest(func(_ database.Store, check *expects) { check.Args(time.Time{}). Asserts(rbac.ResourceTemplate.All(), policy.ActionViewInsights). 
@@ -4971,8 +4976,34 @@ func (s *MethodTestSuite) TestPrebuilds() { }, InvalidateAfterSecs: preset.InvalidateAfterSecs, OrganizationID: org.ID, + PrebuildStatus: database.PrebuildStatusHealthy, }) })) + s.Run("UpdatePresetPrebuildStatus", s.Subtest(func(db database.Store, check *expects) { + org := dbgen.Organization(s.T(), db, database.Organization{}) + user := dbgen.User(s.T(), db, database.User{}) + template := dbgen.Template(s.T(), db, database.Template{ + OrganizationID: org.ID, + CreatedBy: user.ID, + }) + templateVersion := dbgen.TemplateVersion(s.T(), db, database.TemplateVersion{ + TemplateID: uuid.NullUUID{ + UUID: template.ID, + Valid: true, + }, + OrganizationID: org.ID, + CreatedBy: user.ID, + }) + preset := dbgen.Preset(s.T(), db, database.InsertPresetParams{ + TemplateVersionID: templateVersion.ID, + }) + req := database.UpdatePresetPrebuildStatusParams{ + PresetID: preset.ID, + Status: database.PrebuildStatusHealthy, + } + check.Args(req). + Asserts(rbac.ResourceTemplate.WithID(template.ID).InOrg(org.ID), policy.ActionUpdate) + })) } func (s *MethodTestSuite) TestOAuth2ProviderApps() { diff --git a/coderd/database/dbmem/dbmem.go b/coderd/database/dbmem/dbmem.go index 75c56b9c2324d..1a1455d83045b 100644 --- a/coderd/database/dbmem/dbmem.go +++ b/coderd/database/dbmem/dbmem.go @@ -4287,6 +4287,7 @@ func (q *FakeQuerier) GetPresetByID(ctx context.Context, presetID uuid.UUID) (da CreatedAt: preset.CreatedAt, DesiredInstances: preset.DesiredInstances, InvalidateAfterSecs: preset.InvalidateAfterSecs, + PrebuildStatus: preset.PrebuildStatus, TemplateID: tv.TemplateID, OrganizationID: tv.OrganizationID, }, nil @@ -4352,6 +4353,10 @@ func (q *FakeQuerier) GetPresetParametersByTemplateVersionID(_ context.Context, return parameters, nil } +func (q *FakeQuerier) GetPresetsAtFailureLimit(ctx context.Context, hardLimit int64) ([]database.GetPresetsAtFailureLimitRow, error) { + return nil, ErrUnimplemented +} + func (*FakeQuerier) GetPresetsBackoff(_ 
context.Context, _ time.Time) ([]database.GetPresetsBackoffRow, error) { return nil, ErrUnimplemented } @@ -9089,6 +9094,7 @@ func (q *FakeQuerier) InsertPreset(_ context.Context, arg database.InsertPresetP Int32: 0, Valid: true, }, + PrebuildStatus: database.PrebuildStatusHealthy, } q.presets = append(q.presets, preset) return preset, nil @@ -10917,6 +10923,29 @@ func (q *FakeQuerier) UpdateOrganizationDeletedByID(_ context.Context, arg datab return sql.ErrNoRows } +// UpdatePresetPrebuildStatus sets the prebuild status of the preset identified by +// arg.PresetID and returns an error if no such preset exists. +func (q *FakeQuerier) UpdatePresetPrebuildStatus(_ context.Context, arg database.UpdatePresetPrebuildStatusParams) error { + err := validateDatabaseType(arg) + if err != nil { + return err + } + + // This method mutates q.presets, so it needs the write lock. + q.mutex.Lock() + defer q.mutex.Unlock() + + // Index into the slice: ranging by value would only update a copy. + for i := range q.presets { + if q.presets[i].ID == arg.PresetID { + q.presets[i].PrebuildStatus = arg.Status + return nil + } + } + + return xerrors.Errorf("preset %v does not exist", arg.PresetID) +} + func (q *FakeQuerier) UpdateProvisionerDaemonLastSeenAt(_ context.Context, arg database.UpdateProvisionerDaemonLastSeenAtParams) error { err := validateDatabaseType(arg) if err != nil { diff --git a/coderd/database/dbmetrics/querymetrics.go b/coderd/database/dbmetrics/querymetrics.go index 47ec185915660..e35ec11b02453 100644 --- a/coderd/database/dbmetrics/querymetrics.go +++ b/coderd/database/dbmetrics/querymetrics.go @@ -1138,6 +1138,13 @@ func (m queryMetricsStore) GetPresetParametersByTemplateVersionID(ctx context.Co return r0, r1 } +func (m queryMetricsStore) GetPresetsAtFailureLimit(ctx context.Context, hardLimit int64) ([]database.GetPresetsAtFailureLimitRow, error) { + start := time.Now() + r0, r1 := m.s.GetPresetsAtFailureLimit(ctx, hardLimit) + m.queryLatencies.WithLabelValues("GetPresetsAtFailureLimit").Observe(time.Since(start).Seconds()) + return r0, r1 +} + func (m queryMetricsStore) GetPresetsBackoff(ctx context.Context, lookback time.Time) ([]database.GetPresetsBackoffRow, error) { start := time.Now() r0, r1 := m.s.GetPresetsBackoff(ctx, lookback) @@ 
-2692,6 +2699,13 @@ func (m queryMetricsStore) UpdateOrganizationDeletedByID(ctx context.Context, ar return r0 } +func (m queryMetricsStore) UpdatePresetPrebuildStatus(ctx context.Context, arg database.UpdatePresetPrebuildStatusParams) error { + start := time.Now() + r0 := m.s.UpdatePresetPrebuildStatus(ctx, arg) + m.queryLatencies.WithLabelValues("UpdatePresetPrebuildStatus").Observe(time.Since(start).Seconds()) + return r0 +} + func (m queryMetricsStore) UpdateProvisionerDaemonLastSeenAt(ctx context.Context, arg database.UpdateProvisionerDaemonLastSeenAtParams) error { start := time.Now() r0 := m.s.UpdateProvisionerDaemonLastSeenAt(ctx, arg) diff --git a/coderd/database/dbmock/dbmock.go b/coderd/database/dbmock/dbmock.go index e3a9a14698e42..7a1fc0c4b2a6f 100644 --- a/coderd/database/dbmock/dbmock.go +++ b/coderd/database/dbmock/dbmock.go @@ -2328,6 +2328,21 @@ func (mr *MockStoreMockRecorder) GetPresetParametersByTemplateVersionID(ctx, tem return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetPresetParametersByTemplateVersionID", reflect.TypeOf((*MockStore)(nil).GetPresetParametersByTemplateVersionID), ctx, templateVersionID) } +// GetPresetsAtFailureLimit mocks base method. +func (m *MockStore) GetPresetsAtFailureLimit(ctx context.Context, hardLimit int64) ([]database.GetPresetsAtFailureLimitRow, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "GetPresetsAtFailureLimit", ctx, hardLimit) + ret0, _ := ret[0].([]database.GetPresetsAtFailureLimitRow) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// GetPresetsAtFailureLimit indicates an expected call of GetPresetsAtFailureLimit. +func (mr *MockStoreMockRecorder) GetPresetsAtFailureLimit(ctx, hardLimit any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetPresetsAtFailureLimit", reflect.TypeOf((*MockStore)(nil).GetPresetsAtFailureLimit), ctx, hardLimit) +} + // GetPresetsBackoff mocks base method. 
func (m *MockStore) GetPresetsBackoff(ctx context.Context, lookback time.Time) ([]database.GetPresetsBackoffRow, error) { m.ctrl.T.Helper() @@ -5706,6 +5721,20 @@ func (mr *MockStoreMockRecorder) UpdateOrganizationDeletedByID(ctx, arg any) *go return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "UpdateOrganizationDeletedByID", reflect.TypeOf((*MockStore)(nil).UpdateOrganizationDeletedByID), ctx, arg) } +// UpdatePresetPrebuildStatus mocks base method. +func (m *MockStore) UpdatePresetPrebuildStatus(ctx context.Context, arg database.UpdatePresetPrebuildStatusParams) error { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "UpdatePresetPrebuildStatus", ctx, arg) + ret0, _ := ret[0].(error) + return ret0 +} + +// UpdatePresetPrebuildStatus indicates an expected call of UpdatePresetPrebuildStatus. +func (mr *MockStoreMockRecorder) UpdatePresetPrebuildStatus(ctx, arg any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "UpdatePresetPrebuildStatus", reflect.TypeOf((*MockStore)(nil).UpdatePresetPrebuildStatus), ctx, arg) +} + // UpdateProvisionerDaemonLastSeenAt mocks base method. 
func (m *MockStore) UpdateProvisionerDaemonLastSeenAt(ctx context.Context, arg database.UpdateProvisionerDaemonLastSeenAtParams) error { m.ctrl.T.Helper() diff --git a/coderd/database/dump.sql b/coderd/database/dump.sql index 2f23b3ad4ce78..ec196405df2d3 100644 --- a/coderd/database/dump.sql +++ b/coderd/database/dump.sql @@ -153,6 +153,12 @@ CREATE TYPE port_share_protocol AS ENUM ( 'https' ); +CREATE TYPE prebuild_status AS ENUM ( + 'healthy', + 'hard_limited', + 'validation_failed' +); + CREATE TYPE provisioner_daemon_status AS ENUM ( 'offline', 'idle', @@ -1439,7 +1445,8 @@ CREATE TABLE template_version_presets ( name text NOT NULL, created_at timestamp with time zone DEFAULT CURRENT_TIMESTAMP NOT NULL, desired_instances integer, - invalidate_after_secs integer DEFAULT 0 + invalidate_after_secs integer DEFAULT 0, + prebuild_status prebuild_status DEFAULT 'healthy'::prebuild_status NOT NULL ); CREATE TABLE template_version_terraform_values ( diff --git a/coderd/database/migrations/000328_prebuild_failure_limit_notification.down.sql b/coderd/database/migrations/000328_prebuild_failure_limit_notification.down.sql new file mode 100644 index 0000000000000..40697c7bbc3d2 --- /dev/null +++ b/coderd/database/migrations/000328_prebuild_failure_limit_notification.down.sql @@ -0,0 +1 @@ +DELETE FROM notification_templates WHERE id = '414d9331-c1fc-4761-b40c-d1f4702279eb'; diff --git a/coderd/database/migrations/000328_prebuild_failure_limit_notification.up.sql b/coderd/database/migrations/000328_prebuild_failure_limit_notification.up.sql new file mode 100644 index 0000000000000..403bd667abd28 --- /dev/null +++ b/coderd/database/migrations/000328_prebuild_failure_limit_notification.up.sql @@ -0,0 +1,25 @@ +INSERT INTO notification_templates +(id, name, title_template, body_template, "group", actions) +VALUES ('414d9331-c1fc-4761-b40c-d1f4702279eb', + 'Prebuild Failure Limit Reached', + E'There is a problem creating prebuilt workspaces', + $$ +The number of failed prebuild 
attempts has reached the hard limit for template **{{ .Labels.template }}** and preset **{{ .Labels.preset }}**. + +To resume prebuilds, fix the underlying issue and upload a new template version. + +Refer to the documentation for more details: +- [Troubleshooting templates](https://coder.com/docs/admin/templates/troubleshooting) +- [Troubleshooting of prebuilt workspaces](https://coder.com/docs/admin/templates/extending-templates/prebuilt-workspaces#administration-and-troubleshooting) +$$, + 'Template Events', + '[ + { + "label": "View failed prebuilt workspaces", + "url": "{{base_url}}/workspaces?filter=owner:prebuilds+status:failed+template:{{.Labels.template}}" + }, + { + "label": "View template version", + "url": "{{base_url}}/templates/{{.Labels.org}}/{{.Labels.template}}/versions/{{.Labels.template_version}}" + } + ]'::jsonb); diff --git a/coderd/database/migrations/000329_add_status_to_template_presets.down.sql b/coderd/database/migrations/000329_add_status_to_template_presets.down.sql new file mode 100644 index 0000000000000..8fe04f99cae33 --- /dev/null +++ b/coderd/database/migrations/000329_add_status_to_template_presets.down.sql @@ -0,0 +1,5 @@ +-- Remove the column from the table first (must happen before dropping the enum type) +ALTER TABLE template_version_presets DROP COLUMN prebuild_status; + +-- Then drop the enum type +DROP TYPE prebuild_status; diff --git a/coderd/database/migrations/000329_add_status_to_template_presets.up.sql b/coderd/database/migrations/000329_add_status_to_template_presets.up.sql new file mode 100644 index 0000000000000..019a246f73a87 --- /dev/null +++ b/coderd/database/migrations/000329_add_status_to_template_presets.up.sql @@ -0,0 +1,7 @@ +CREATE TYPE prebuild_status AS ENUM ( + 'healthy', -- Prebuilds are working as expected; this is the default, healthy state. + 'hard_limited', -- Prebuilds have failed repeatedly and hit the configured hard failure limit; won't be retried anymore. 
+ 'validation_failed' -- Prebuilds failed due to a non-retryable validation error (e.g. template misconfiguration); won't be retried. +); + +ALTER TABLE template_version_presets ADD COLUMN prebuild_status prebuild_status NOT NULL DEFAULT 'healthy'::prebuild_status; diff --git a/coderd/database/models.go b/coderd/database/models.go index ff49b8f471be0..d5047f6bbe65f 100644 --- a/coderd/database/models.go +++ b/coderd/database/models.go @@ -1343,6 +1343,67 @@ func AllPortShareProtocolValues() []PortShareProtocol { } } +type PrebuildStatus string + +const ( + PrebuildStatusHealthy PrebuildStatus = "healthy" + PrebuildStatusHardLimited PrebuildStatus = "hard_limited" + PrebuildStatusValidationFailed PrebuildStatus = "validation_failed" +) + +func (e *PrebuildStatus) Scan(src interface{}) error { + switch s := src.(type) { + case []byte: + *e = PrebuildStatus(s) + case string: + *e = PrebuildStatus(s) + default: + return fmt.Errorf("unsupported scan type for PrebuildStatus: %T", src) + } + return nil +} + +type NullPrebuildStatus struct { + PrebuildStatus PrebuildStatus `json:"prebuild_status"` + Valid bool `json:"valid"` // Valid is true if PrebuildStatus is not NULL +} + +// Scan implements the Scanner interface. +func (ns *NullPrebuildStatus) Scan(value interface{}) error { + if value == nil { + ns.PrebuildStatus, ns.Valid = "", false + return nil + } + ns.Valid = true + return ns.PrebuildStatus.Scan(value) +} + +// Value implements the driver Valuer interface. 
+func (ns NullPrebuildStatus) Value() (driver.Value, error) { + if !ns.Valid { + return nil, nil + } + return string(ns.PrebuildStatus), nil +} + +func (e PrebuildStatus) Valid() bool { + switch e { + case PrebuildStatusHealthy, + PrebuildStatusHardLimited, + PrebuildStatusValidationFailed: + return true + } + return false +} + +func AllPrebuildStatusValues() []PrebuildStatus { + return []PrebuildStatus{ + PrebuildStatusHealthy, + PrebuildStatusHardLimited, + PrebuildStatusValidationFailed, + } +} + // The status of a provisioner daemon. type ProvisionerDaemonStatus string @@ -3248,12 +3309,13 @@ type TemplateVersionParameter struct { } type TemplateVersionPreset struct { - ID uuid.UUID `db:"id" json:"id"` - TemplateVersionID uuid.UUID `db:"template_version_id" json:"template_version_id"` - Name string `db:"name" json:"name"` - CreatedAt time.Time `db:"created_at" json:"created_at"` - DesiredInstances sql.NullInt32 `db:"desired_instances" json:"desired_instances"` - InvalidateAfterSecs sql.NullInt32 `db:"invalidate_after_secs" json:"invalidate_after_secs"` + ID uuid.UUID `db:"id" json:"id"` + TemplateVersionID uuid.UUID `db:"template_version_id" json:"template_version_id"` + Name string `db:"name" json:"name"` + CreatedAt time.Time `db:"created_at" json:"created_at"` + DesiredInstances sql.NullInt32 `db:"desired_instances" json:"desired_instances"` + InvalidateAfterSecs sql.NullInt32 `db:"invalidate_after_secs" json:"invalidate_after_secs"` + PrebuildStatus PrebuildStatus `db:"prebuild_status" json:"prebuild_status"` } type TemplateVersionPresetParameter struct { diff --git a/coderd/database/querier.go b/coderd/database/querier.go index d248780397ead..ac7497b641a05 100644 --- a/coderd/database/querier.go +++ b/coderd/database/querier.go @@ -241,6 +241,15 @@ type sqlcQuerier interface { GetPresetByWorkspaceBuildID(ctx context.Context, workspaceBuildID uuid.UUID) (TemplateVersionPreset, error) GetPresetParametersByPresetID(ctx context.Context, presetID uuid.UUID) 
([]TemplateVersionPresetParameter, error) GetPresetParametersByTemplateVersionID(ctx context.Context, templateVersionID uuid.UUID) ([]TemplateVersionPresetParameter, error) + // GetPresetsAtFailureLimit groups workspace builds by preset ID. + // Each preset is associated with exactly one template version ID. + // For each preset, the query checks the last hard_limit builds. + // If all of them failed, the preset is considered to have hit the hard failure limit. + // The query returns a list of preset IDs that have reached this failure threshold. + // Only active template versions with configured presets are considered. + // For each preset, check the last hard_limit builds. + // If all of them failed, the preset is considered to have hit the hard failure limit. + GetPresetsAtFailureLimit(ctx context.Context, hardLimit int64) ([]GetPresetsAtFailureLimitRow, error) // GetPresetsBackoff groups workspace builds by preset ID. // Each preset is associated with exactly one template version ID. 
// For each group, the query checks up to N of the most recent jobs that occurred within the @@ -568,6 +577,7 @@ type sqlcQuerier interface { UpdateOAuth2ProviderAppSecretByID(ctx context.Context, arg UpdateOAuth2ProviderAppSecretByIDParams) (OAuth2ProviderAppSecret, error) UpdateOrganization(ctx context.Context, arg UpdateOrganizationParams) (Organization, error) UpdateOrganizationDeletedByID(ctx context.Context, arg UpdateOrganizationDeletedByIDParams) error + UpdatePresetPrebuildStatus(ctx context.Context, arg UpdatePresetPrebuildStatusParams) error UpdateProvisionerDaemonLastSeenAt(ctx context.Context, arg UpdateProvisionerDaemonLastSeenAtParams) error UpdateProvisionerJobByID(ctx context.Context, arg UpdateProvisionerJobByIDParams) error UpdateProvisionerJobWithCancelByID(ctx context.Context, arg UpdateProvisionerJobWithCancelByIDParams) error diff --git a/coderd/database/querier_test.go b/coderd/database/querier_test.go index b2cc20c4894d5..5bafa58796b7a 100644 --- a/coderd/database/querier_test.go +++ b/coderd/database/querier_test.go @@ -4123,8 +4123,7 @@ func TestGetPresetsBackoff(t *testing.T) { }) tmpl1 := createTemplate(t, db, orgID, userID) - tmpl1V1 := createTmplVersionAndPreset(t, db, tmpl1, tmpl1.ActiveVersionID, now, nil) - _ = tmpl1V1 + createTmplVersionAndPreset(t, db, tmpl1, tmpl1.ActiveVersionID, now, nil) backoffs, err := db.GetPresetsBackoff(ctx, now.Add(-time.Hour)) require.NoError(t, err) @@ -4401,6 +4400,311 @@ func TestGetPresetsBackoff(t *testing.T) { }) } +func TestGetPresetsAtFailureLimit(t *testing.T) { + t.Parallel() + if !dbtestutil.WillUsePostgres() { + t.SkipNow() + } + + now := dbtime.Now() + hourBefore := now.Add(-time.Hour) + orgID := uuid.New() + userID := uuid.New() + + findPresetByTmplVersionID := func(hardLimitedPresets []database.GetPresetsAtFailureLimitRow, tmplVersionID uuid.UUID) *database.GetPresetsAtFailureLimitRow { + for _, preset := range hardLimitedPresets { + if preset.TemplateVersionID == tmplVersionID { + 
return &preset + } + } + + return nil + } + + testCases := []struct { + name string + // true - build is successful + // false - build is unsuccessful + buildSuccesses []bool + hardLimit int64 + expHitHardLimit bool + }{ + { + name: "failed build", + buildSuccesses: []bool{false}, + hardLimit: 1, + expHitHardLimit: true, + }, + { + name: "2 failed builds", + buildSuccesses: []bool{false, false}, + hardLimit: 1, + expHitHardLimit: true, + }, + { + name: "successful build", + buildSuccesses: []bool{true}, + hardLimit: 1, + expHitHardLimit: false, + }, + { + name: "last build is failed", + buildSuccesses: []bool{true, true, false}, + hardLimit: 1, + expHitHardLimit: true, + }, + { + name: "last build is successful", + buildSuccesses: []bool{false, false, true}, + hardLimit: 1, + expHitHardLimit: false, + }, + { + name: "last 3 builds are failed - hard limit is reached", + buildSuccesses: []bool{true, true, false, false, false}, + hardLimit: 3, + expHitHardLimit: true, + }, + { + name: "1 out of 3 last build is successful - hard limit is NOT reached", + buildSuccesses: []bool{false, false, true, false, false}, + hardLimit: 3, + expHitHardLimit: false, + }, + // hardLimit set to zero, implicitly disables the hard limit. 
+ { + name: "despite 5 failed builds, the hard limit is not reached because it's disabled.", + buildSuccesses: []bool{false, false, false, false, false}, + hardLimit: 0, + expHitHardLimit: false, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + + db, _ := dbtestutil.NewDB(t) + ctx := testutil.Context(t, testutil.WaitShort) + dbgen.Organization(t, db, database.Organization{ + ID: orgID, + }) + dbgen.User(t, db, database.User{ + ID: userID, + }) + + tmpl := createTemplate(t, db, orgID, userID) + tmplV1 := createTmplVersionAndPreset(t, db, tmpl, tmpl.ActiveVersionID, now, nil) + for idx, buildSuccess := range tc.buildSuccesses { + createPrebuiltWorkspace(ctx, t, db, tmpl, tmplV1, orgID, now, &createPrebuiltWorkspaceOpts{ + failedJob: !buildSuccess, + createdAt: hourBefore.Add(time.Duration(idx) * time.Second), + }) + } + + hardLimitedPresets, err := db.GetPresetsAtFailureLimit(ctx, tc.hardLimit) + require.NoError(t, err) + + if !tc.expHitHardLimit { + require.Len(t, hardLimitedPresets, 0) + return + } + + require.Len(t, hardLimitedPresets, 1) + hardLimitedPreset := hardLimitedPresets[0] + require.Equal(t, hardLimitedPreset.TemplateVersionID, tmpl.ActiveVersionID) + require.Equal(t, hardLimitedPreset.PresetID, tmplV1.preset.ID) + }) + } + + t.Run("Ignore Inactive Version", func(t *testing.T) { + t.Parallel() + + db, _ := dbtestutil.NewDB(t) + ctx := testutil.Context(t, testutil.WaitShort) + dbgen.Organization(t, db, database.Organization{ + ID: orgID, + }) + dbgen.User(t, db, database.User{ + ID: userID, + }) + + tmpl := createTemplate(t, db, orgID, userID) + tmplV1 := createTmplVersionAndPreset(t, db, tmpl, uuid.New(), now, nil) + createPrebuiltWorkspace(ctx, t, db, tmpl, tmplV1, orgID, now, &createPrebuiltWorkspaceOpts{ + failedJob: true, + }) + + // Active Version + tmplV2 := createTmplVersionAndPreset(t, db, tmpl, tmpl.ActiveVersionID, now, nil) + createPrebuiltWorkspace(ctx, t, db, tmpl, tmplV2, orgID, now, 
&createPrebuiltWorkspaceOpts{ + failedJob: true, + }) + createPrebuiltWorkspace(ctx, t, db, tmpl, tmplV2, orgID, now, &createPrebuiltWorkspaceOpts{ + failedJob: true, + }) + + hardLimitedPresets, err := db.GetPresetsAtFailureLimit(ctx, 1) + require.NoError(t, err) + + require.Len(t, hardLimitedPresets, 1) + hardLimitedPreset := hardLimitedPresets[0] + require.Equal(t, hardLimitedPreset.TemplateVersionID, tmpl.ActiveVersionID) + require.Equal(t, hardLimitedPreset.PresetID, tmplV2.preset.ID) + }) + + t.Run("Multiple Templates", func(t *testing.T) { + t.Parallel() + + db, _ := dbtestutil.NewDB(t) + ctx := testutil.Context(t, testutil.WaitShort) + dbgen.Organization(t, db, database.Organization{ + ID: orgID, + }) + dbgen.User(t, db, database.User{ + ID: userID, + }) + + tmpl1 := createTemplate(t, db, orgID, userID) + tmpl1V1 := createTmplVersionAndPreset(t, db, tmpl1, tmpl1.ActiveVersionID, now, nil) + createPrebuiltWorkspace(ctx, t, db, tmpl1, tmpl1V1, orgID, now, &createPrebuiltWorkspaceOpts{ + failedJob: true, + }) + + tmpl2 := createTemplate(t, db, orgID, userID) + tmpl2V1 := createTmplVersionAndPreset(t, db, tmpl2, tmpl2.ActiveVersionID, now, nil) + createPrebuiltWorkspace(ctx, t, db, tmpl2, tmpl2V1, orgID, now, &createPrebuiltWorkspaceOpts{ + failedJob: true, + }) + + hardLimitedPresets, err := db.GetPresetsAtFailureLimit(ctx, 1) + + require.NoError(t, err) + + require.Len(t, hardLimitedPresets, 2) + { + hardLimitedPreset := findPresetByTmplVersionID(hardLimitedPresets, tmpl1.ActiveVersionID) + require.Equal(t, hardLimitedPreset.TemplateVersionID, tmpl1.ActiveVersionID) + require.Equal(t, hardLimitedPreset.PresetID, tmpl1V1.preset.ID) + } + { + hardLimitedPreset := findPresetByTmplVersionID(hardLimitedPresets, tmpl2.ActiveVersionID) + require.Equal(t, hardLimitedPreset.TemplateVersionID, tmpl2.ActiveVersionID) + require.Equal(t, hardLimitedPreset.PresetID, tmpl2V1.preset.ID) + } + }) + + t.Run("Multiple Templates, Versions and Workspace Builds", func(t 
*testing.T) { + t.Parallel() + + db, _ := dbtestutil.NewDB(t) + ctx := testutil.Context(t, testutil.WaitShort) + dbgen.Organization(t, db, database.Organization{ + ID: orgID, + }) + dbgen.User(t, db, database.User{ + ID: userID, + }) + + tmpl1 := createTemplate(t, db, orgID, userID) + tmpl1V1 := createTmplVersionAndPreset(t, db, tmpl1, tmpl1.ActiveVersionID, now, nil) + createPrebuiltWorkspace(ctx, t, db, tmpl1, tmpl1V1, orgID, now, &createPrebuiltWorkspaceOpts{ + failedJob: true, + }) + createPrebuiltWorkspace(ctx, t, db, tmpl1, tmpl1V1, orgID, now, &createPrebuiltWorkspaceOpts{ + failedJob: true, + }) + + tmpl2 := createTemplate(t, db, orgID, userID) + tmpl2V1 := createTmplVersionAndPreset(t, db, tmpl2, tmpl2.ActiveVersionID, now, nil) + createPrebuiltWorkspace(ctx, t, db, tmpl2, tmpl2V1, orgID, now, &createPrebuiltWorkspaceOpts{ + failedJob: true, + }) + createPrebuiltWorkspace(ctx, t, db, tmpl2, tmpl2V1, orgID, now, &createPrebuiltWorkspaceOpts{ + failedJob: true, + }) + + tmpl3 := createTemplate(t, db, orgID, userID) + tmpl3V1 := createTmplVersionAndPreset(t, db, tmpl3, uuid.New(), now, nil) + createPrebuiltWorkspace(ctx, t, db, tmpl3, tmpl3V1, orgID, now, &createPrebuiltWorkspaceOpts{ + failedJob: true, + }) + + tmpl3V2 := createTmplVersionAndPreset(t, db, tmpl3, tmpl3.ActiveVersionID, now, nil) + createPrebuiltWorkspace(ctx, t, db, tmpl3, tmpl3V2, orgID, now, &createPrebuiltWorkspaceOpts{ + failedJob: true, + }) + createPrebuiltWorkspace(ctx, t, db, tmpl3, tmpl3V2, orgID, now, &createPrebuiltWorkspaceOpts{ + failedJob: true, + }) + + hardLimit := int64(2) + hardLimitedPresets, err := db.GetPresetsAtFailureLimit(ctx, hardLimit) + require.NoError(t, err) + + require.Len(t, hardLimitedPresets, 3) + { + hardLimitedPreset := findPresetByTmplVersionID(hardLimitedPresets, tmpl1.ActiveVersionID) + require.Equal(t, hardLimitedPreset.TemplateVersionID, tmpl1.ActiveVersionID) + require.Equal(t, hardLimitedPreset.PresetID, tmpl1V1.preset.ID) + } + { + hardLimitedPreset 
:= findPresetByTmplVersionID(hardLimitedPresets, tmpl2.ActiveVersionID) + require.Equal(t, hardLimitedPreset.TemplateVersionID, tmpl2.ActiveVersionID) + require.Equal(t, hardLimitedPreset.PresetID, tmpl2V1.preset.ID) + } + { + hardLimitedPreset := findPresetByTmplVersionID(hardLimitedPresets, tmpl3.ActiveVersionID) + require.Equal(t, hardLimitedPreset.TemplateVersionID, tmpl3.ActiveVersionID) + require.Equal(t, hardLimitedPreset.PresetID, tmpl3V2.preset.ID) + } + }) + + t.Run("No Workspace Builds", func(t *testing.T) { + t.Parallel() + + db, _ := dbtestutil.NewDB(t) + ctx := testutil.Context(t, testutil.WaitShort) + dbgen.Organization(t, db, database.Organization{ + ID: orgID, + }) + dbgen.User(t, db, database.User{ + ID: userID, + }) + + tmpl1 := createTemplate(t, db, orgID, userID) + createTmplVersionAndPreset(t, db, tmpl1, tmpl1.ActiveVersionID, now, nil) + + hardLimitedPresets, err := db.GetPresetsAtFailureLimit(ctx, 1) + require.NoError(t, err) + require.Nil(t, hardLimitedPresets) + }) + + t.Run("No Failed Workspace Builds", func(t *testing.T) { + t.Parallel() + + db, _ := dbtestutil.NewDB(t) + ctx := testutil.Context(t, testutil.WaitShort) + dbgen.Organization(t, db, database.Organization{ + ID: orgID, + }) + dbgen.User(t, db, database.User{ + ID: userID, + }) + + tmpl1 := createTemplate(t, db, orgID, userID) + tmpl1V1 := createTmplVersionAndPreset(t, db, tmpl1, tmpl1.ActiveVersionID, now, nil) + successfulJobOpts := createPrebuiltWorkspaceOpts{} + createPrebuiltWorkspace(ctx, t, db, tmpl1, tmpl1V1, orgID, now, &successfulJobOpts) + createPrebuiltWorkspace(ctx, t, db, tmpl1, tmpl1V1, orgID, now, &successfulJobOpts) + createPrebuiltWorkspace(ctx, t, db, tmpl1, tmpl1V1, orgID, now, &successfulJobOpts) + + hardLimitedPresets, err := db.GetPresetsAtFailureLimit(ctx, 1) + require.NoError(t, err) + require.Nil(t, hardLimitedPresets) + }) +} + func requireUsersMatch(t testing.TB, expected []database.User, found []database.GetUsersRow, msg string) { t.Helper() 
require.ElementsMatch(t, expected, database.ConvertUserRows(found), msg) diff --git a/coderd/database/queries.sql.go b/coderd/database/queries.sql.go index 99a8bf4603b57..ffd8ccb035206 100644 --- a/coderd/database/queries.sql.go +++ b/coderd/database/queries.sql.go @@ -6288,6 +6288,71 @@ func (q *sqlQuerier) GetPrebuildMetrics(ctx context.Context) ([]GetPrebuildMetri return items, nil } +const getPresetsAtFailureLimit = `-- name: GetPresetsAtFailureLimit :many +WITH filtered_builds AS ( + -- Only select builds which are for prebuild creations + SELECT wlb.template_version_id, wlb.created_at, tvp.id AS preset_id, wlb.job_status, tvp.desired_instances + FROM template_version_presets tvp + INNER JOIN workspace_latest_builds wlb ON wlb.template_version_preset_id = tvp.id + INNER JOIN workspaces w ON wlb.workspace_id = w.id + INNER JOIN template_versions tv ON wlb.template_version_id = tv.id + INNER JOIN templates t ON tv.template_id = t.id AND t.active_version_id = tv.id + WHERE tvp.desired_instances IS NOT NULL -- Consider only presets that have a prebuild configuration. + AND wlb.transition = 'start'::workspace_transition + AND w.owner_id = 'c42fdf75-3097-471c-8c33-fb52454d81c0' +), +time_sorted_builds AS ( + -- Group builds by preset, then sort each group by created_at. 
+ SELECT fb.template_version_id, fb.created_at, fb.preset_id, fb.job_status, fb.desired_instances, + ROW_NUMBER() OVER (PARTITION BY fb.preset_id ORDER BY fb.created_at DESC) as rn + FROM filtered_builds fb +) +SELECT + tsb.template_version_id, + tsb.preset_id +FROM time_sorted_builds tsb +WHERE tsb.rn <= $1::bigint + AND tsb.job_status = 'failed'::provisioner_job_status +GROUP BY tsb.template_version_id, tsb.preset_id +HAVING COUNT(*) = $1::bigint +` + +type GetPresetsAtFailureLimitRow struct { + TemplateVersionID uuid.UUID `db:"template_version_id" json:"template_version_id"` + PresetID uuid.UUID `db:"preset_id" json:"preset_id"` +} + +// GetPresetsAtFailureLimit groups workspace builds by preset ID. +// Each preset is associated with exactly one template version ID. +// For each preset, the query checks the last hard_limit builds. +// If all of them failed, the preset is considered to have hit the hard failure limit. +// The query returns a list of preset IDs that have reached this failure threshold. +// Only active template versions with configured presets are considered. +// For each preset, check the last hard_limit builds. +// If all of them failed, the preset is considered to have hit the hard failure limit. 
+func (q *sqlQuerier) GetPresetsAtFailureLimit(ctx context.Context, hardLimit int64) ([]GetPresetsAtFailureLimitRow, error) { + rows, err := q.db.QueryContext(ctx, getPresetsAtFailureLimit, hardLimit) + if err != nil { + return nil, err + } + defer rows.Close() + var items []GetPresetsAtFailureLimitRow + for rows.Next() { + var i GetPresetsAtFailureLimitRow + if err := rows.Scan(&i.TemplateVersionID, &i.PresetID); err != nil { + return nil, err + } + items = append(items, i) + } + if err := rows.Close(); err != nil { + return nil, err + } + if err := rows.Err(); err != nil { + return nil, err + } + return items, nil +} + const getPresetsBackoff = `-- name: GetPresetsBackoff :many WITH filtered_builds AS ( -- Only select builds which are for prebuild creations @@ -6438,6 +6503,7 @@ const getTemplatePresetsWithPrebuilds = `-- name: GetTemplatePresetsWithPrebuild SELECT t.id AS template_id, t.name AS template_name, + o.id AS organization_id, o.name AS organization_name, tv.id AS template_version_id, tv.name AS template_version_name, @@ -6445,6 +6511,7 @@ SELECT tvp.id, tvp.name, tvp.desired_instances AS desired_instances, + tvp.prebuild_status, t.deleted, t.deprecated != '' AS deprecated FROM templates t @@ -6457,17 +6524,19 @@ WHERE tvp.desired_instances IS NOT NULL -- Consider only presets that have a pre ` type GetTemplatePresetsWithPrebuildsRow struct { - TemplateID uuid.UUID `db:"template_id" json:"template_id"` - TemplateName string `db:"template_name" json:"template_name"` - OrganizationName string `db:"organization_name" json:"organization_name"` - TemplateVersionID uuid.UUID `db:"template_version_id" json:"template_version_id"` - TemplateVersionName string `db:"template_version_name" json:"template_version_name"` - UsingActiveVersion bool `db:"using_active_version" json:"using_active_version"` - ID uuid.UUID `db:"id" json:"id"` - Name string `db:"name" json:"name"` - DesiredInstances sql.NullInt32 `db:"desired_instances" json:"desired_instances"` - Deleted 
bool `db:"deleted" json:"deleted"` - Deprecated bool `db:"deprecated" json:"deprecated"` + TemplateID uuid.UUID `db:"template_id" json:"template_id"` + TemplateName string `db:"template_name" json:"template_name"` + OrganizationID uuid.UUID `db:"organization_id" json:"organization_id"` + OrganizationName string `db:"organization_name" json:"organization_name"` + TemplateVersionID uuid.UUID `db:"template_version_id" json:"template_version_id"` + TemplateVersionName string `db:"template_version_name" json:"template_version_name"` + UsingActiveVersion bool `db:"using_active_version" json:"using_active_version"` + ID uuid.UUID `db:"id" json:"id"` + Name string `db:"name" json:"name"` + DesiredInstances sql.NullInt32 `db:"desired_instances" json:"desired_instances"` + PrebuildStatus PrebuildStatus `db:"prebuild_status" json:"prebuild_status"` + Deleted bool `db:"deleted" json:"deleted"` + Deprecated bool `db:"deprecated" json:"deprecated"` } // GetTemplatePresetsWithPrebuilds retrieves template versions with configured presets and prebuilds. 
@@ -6485,6 +6554,7 @@ func (q *sqlQuerier) GetTemplatePresetsWithPrebuilds(ctx context.Context, templa if err := rows.Scan( &i.TemplateID, &i.TemplateName, + &i.OrganizationID, &i.OrganizationName, &i.TemplateVersionID, &i.TemplateVersionName, @@ -6492,6 +6562,7 @@ func (q *sqlQuerier) GetTemplatePresetsWithPrebuilds(ctx context.Context, templa &i.ID, &i.Name, &i.DesiredInstances, + &i.PrebuildStatus, &i.Deleted, &i.Deprecated, ); err != nil { @@ -6509,21 +6580,22 @@ func (q *sqlQuerier) GetTemplatePresetsWithPrebuilds(ctx context.Context, templa } const getPresetByID = `-- name: GetPresetByID :one -SELECT tvp.id, tvp.template_version_id, tvp.name, tvp.created_at, tvp.desired_instances, tvp.invalidate_after_secs, tv.template_id, tv.organization_id FROM +SELECT tvp.id, tvp.template_version_id, tvp.name, tvp.created_at, tvp.desired_instances, tvp.invalidate_after_secs, tvp.prebuild_status, tv.template_id, tv.organization_id FROM template_version_presets tvp INNER JOIN template_versions tv ON tvp.template_version_id = tv.id WHERE tvp.id = $1 ` type GetPresetByIDRow struct { - ID uuid.UUID `db:"id" json:"id"` - TemplateVersionID uuid.UUID `db:"template_version_id" json:"template_version_id"` - Name string `db:"name" json:"name"` - CreatedAt time.Time `db:"created_at" json:"created_at"` - DesiredInstances sql.NullInt32 `db:"desired_instances" json:"desired_instances"` - InvalidateAfterSecs sql.NullInt32 `db:"invalidate_after_secs" json:"invalidate_after_secs"` - TemplateID uuid.NullUUID `db:"template_id" json:"template_id"` - OrganizationID uuid.UUID `db:"organization_id" json:"organization_id"` + ID uuid.UUID `db:"id" json:"id"` + TemplateVersionID uuid.UUID `db:"template_version_id" json:"template_version_id"` + Name string `db:"name" json:"name"` + CreatedAt time.Time `db:"created_at" json:"created_at"` + DesiredInstances sql.NullInt32 `db:"desired_instances" json:"desired_instances"` + InvalidateAfterSecs sql.NullInt32 `db:"invalidate_after_secs" 
json:"invalidate_after_secs"` + PrebuildStatus PrebuildStatus `db:"prebuild_status" json:"prebuild_status"` + TemplateID uuid.NullUUID `db:"template_id" json:"template_id"` + OrganizationID uuid.UUID `db:"organization_id" json:"organization_id"` } func (q *sqlQuerier) GetPresetByID(ctx context.Context, presetID uuid.UUID) (GetPresetByIDRow, error) { @@ -6536,6 +6608,7 @@ func (q *sqlQuerier) GetPresetByID(ctx context.Context, presetID uuid.UUID) (Get &i.CreatedAt, &i.DesiredInstances, &i.InvalidateAfterSecs, + &i.PrebuildStatus, &i.TemplateID, &i.OrganizationID, ) @@ -6544,7 +6617,7 @@ func (q *sqlQuerier) GetPresetByID(ctx context.Context, presetID uuid.UUID) (Get const getPresetByWorkspaceBuildID = `-- name: GetPresetByWorkspaceBuildID :one SELECT - template_version_presets.id, template_version_presets.template_version_id, template_version_presets.name, template_version_presets.created_at, template_version_presets.desired_instances, template_version_presets.invalidate_after_secs + template_version_presets.id, template_version_presets.template_version_id, template_version_presets.name, template_version_presets.created_at, template_version_presets.desired_instances, template_version_presets.invalidate_after_secs, template_version_presets.prebuild_status FROM template_version_presets INNER JOIN workspace_builds ON workspace_builds.template_version_preset_id = template_version_presets.id @@ -6562,6 +6635,7 @@ func (q *sqlQuerier) GetPresetByWorkspaceBuildID(ctx context.Context, workspaceB &i.CreatedAt, &i.DesiredInstances, &i.InvalidateAfterSecs, + &i.PrebuildStatus, ) return i, err } @@ -6643,7 +6717,7 @@ func (q *sqlQuerier) GetPresetParametersByTemplateVersionID(ctx context.Context, const getPresetsByTemplateVersionID = `-- name: GetPresetsByTemplateVersionID :many SELECT - id, template_version_id, name, created_at, desired_instances, invalidate_after_secs + id, template_version_id, name, created_at, desired_instances, invalidate_after_secs, prebuild_status FROM 
template_version_presets WHERE @@ -6666,6 +6740,7 @@ func (q *sqlQuerier) GetPresetsByTemplateVersionID(ctx context.Context, template &i.CreatedAt, &i.DesiredInstances, &i.InvalidateAfterSecs, + &i.PrebuildStatus, ); err != nil { return nil, err } @@ -6696,7 +6771,7 @@ VALUES ( $4, $5, $6 -) RETURNING id, template_version_id, name, created_at, desired_instances, invalidate_after_secs +) RETURNING id, template_version_id, name, created_at, desired_instances, invalidate_after_secs, prebuild_status ` type InsertPresetParams struct { @@ -6725,6 +6800,7 @@ func (q *sqlQuerier) InsertPreset(ctx context.Context, arg InsertPresetParams) ( &i.CreatedAt, &i.DesiredInstances, &i.InvalidateAfterSecs, + &i.PrebuildStatus, ) return i, err } @@ -6773,6 +6849,22 @@ func (q *sqlQuerier) InsertPresetParameters(ctx context.Context, arg InsertPrese return items, nil } +const updatePresetPrebuildStatus = `-- name: UpdatePresetPrebuildStatus :exec +UPDATE template_version_presets +SET prebuild_status = $1 +WHERE id = $2 +` + +type UpdatePresetPrebuildStatusParams struct { + Status PrebuildStatus `db:"status" json:"status"` + PresetID uuid.UUID `db:"preset_id" json:"preset_id"` +} + +func (q *sqlQuerier) UpdatePresetPrebuildStatus(ctx context.Context, arg UpdatePresetPrebuildStatusParams) error { + _, err := q.db.ExecContext(ctx, updatePresetPrebuildStatus, arg.Status, arg.PresetID) + return err +} + const deleteOldProvisionerDaemons = `-- name: DeleteOldProvisionerDaemons :exec DELETE FROM provisioner_daemons WHERE ( (created_at < (NOW() - INTERVAL '7 days') AND last_seen_at IS NULL) OR diff --git a/coderd/database/queries/prebuilds.sql b/coderd/database/queries/prebuilds.sql index 8c27ddf62b7c3..9cd4321afec23 100644 --- a/coderd/database/queries/prebuilds.sql +++ b/coderd/database/queries/prebuilds.sql @@ -27,6 +27,7 @@ RETURNING w.id, w.name; SELECT t.id AS template_id, t.name AS template_name, + o.id AS organization_id, o.name AS organization_name, tv.id AS template_version_id, 
tv.name AS template_version_name, @@ -34,6 +35,7 @@ SELECT tvp.id, tvp.name, tvp.desired_instances AS desired_instances, + tvp.prebuild_status, t.deleted, t.deprecated != '' AS deprecated FROM templates t @@ -129,6 +131,42 @@ WHERE tsb.rn <= tsb.desired_instances -- Fetch the last N builds, where N is the AND created_at >= @lookback::timestamptz GROUP BY tsb.template_version_id, tsb.preset_id, fc.num_failed; +-- GetPresetsAtFailureLimit groups workspace builds by preset ID. +-- Each preset is associated with exactly one template version ID. +-- For each preset, the query checks the last hard_limit builds. +-- If all of them failed, the preset is considered to have hit the hard failure limit. +-- The query returns a list of preset IDs that have reached this failure threshold. +-- Only active template versions with configured presets are considered. +-- name: GetPresetsAtFailureLimit :many +WITH filtered_builds AS ( + -- Only select builds which are for prebuild creations + SELECT wlb.template_version_id, wlb.created_at, tvp.id AS preset_id, wlb.job_status, tvp.desired_instances + FROM template_version_presets tvp + INNER JOIN workspace_latest_builds wlb ON wlb.template_version_preset_id = tvp.id + INNER JOIN workspaces w ON wlb.workspace_id = w.id + INNER JOIN template_versions tv ON wlb.template_version_id = tv.id + INNER JOIN templates t ON tv.template_id = t.id AND t.active_version_id = tv.id + WHERE tvp.desired_instances IS NOT NULL -- Consider only presets that have a prebuild configuration. + AND wlb.transition = 'start'::workspace_transition + AND w.owner_id = 'c42fdf75-3097-471c-8c33-fb52454d81c0' +), +time_sorted_builds AS ( + -- Group builds by preset, then sort each group by created_at. 
+ SELECT fb.template_version_id, fb.created_at, fb.preset_id, fb.job_status, fb.desired_instances, + ROW_NUMBER() OVER (PARTITION BY fb.preset_id ORDER BY fb.created_at DESC) as rn + FROM filtered_builds fb +) +SELECT + tsb.template_version_id, + tsb.preset_id +FROM time_sorted_builds tsb +-- For each preset, check the last hard_limit builds. +-- If all of them failed, the preset is considered to have hit the hard failure limit. +WHERE tsb.rn <= @hard_limit::bigint + AND tsb.job_status = 'failed'::provisioner_job_status +GROUP BY tsb.template_version_id, tsb.preset_id +HAVING COUNT(*) = @hard_limit::bigint; + -- name: GetPrebuildMetrics :many SELECT t.name as template_name, diff --git a/coderd/database/queries/presets.sql b/coderd/database/queries/presets.sql index 6d5646a285b4a..2fb6722bc2c33 100644 --- a/coderd/database/queries/presets.sql +++ b/coderd/database/queries/presets.sql @@ -25,6 +25,11 @@ SELECT unnest(@values :: TEXT[]) RETURNING *; +-- name: UpdatePresetPrebuildStatus :exec +UPDATE template_version_presets +SET prebuild_status = @status +WHERE id = @preset_id; + -- name: GetPresetsByTemplateVersionID :many SELECT * diff --git a/coderd/notifications/events.go b/coderd/notifications/events.go index 35d9925055da5..0e88361b56f68 100644 --- a/coderd/notifications/events.go +++ b/coderd/notifications/events.go @@ -42,6 +42,11 @@ var ( TemplateWorkspaceResourceReplaced = uuid.MustParse("89d9745a-816e-4695-a17f-3d0a229e2b8d") ) +// Prebuilds-related events +var ( + PrebuildFailureLimitReached = uuid.MustParse("414d9331-c1fc-4761-b40c-d1f4702279eb") +) + // Notification-related events. 
var ( TemplateTestNotification = uuid.MustParse("c425f63e-716a-4bf4-ae24-78348f706c3f") diff --git a/coderd/notifications/notifications_test.go b/coderd/notifications/notifications_test.go index 8f8a3c82441e0..fab87af41deb9 100644 --- a/coderd/notifications/notifications_test.go +++ b/coderd/notifications/notifications_test.go @@ -1250,6 +1250,22 @@ func TestNotificationTemplates_Golden(t *testing.T) { }, }, }, + { + name: "PrebuildFailureLimitReached", + id: notifications.PrebuildFailureLimitReached, + payload: types.MessagePayload{ + UserName: "Bobby", + UserEmail: "bobby@coder.com", + UserUsername: "bobby", + Labels: map[string]string{ + "org": "cern", + "template": "docker", + "template_version": "angry_torvalds", + "preset": "particle-accelerator", + }, + Data: map[string]any{}, + }, + }, } // We must have a test case for every notification_template. This is enforced below: diff --git a/coderd/notifications/testdata/rendered-templates/smtp/PrebuildFailureLimitReached.html.golden b/coderd/notifications/testdata/rendered-templates/smtp/PrebuildFailureLimitReached.html.golden new file mode 100644 index 0000000000000..69f13b86ca71c --- /dev/null +++ b/coderd/notifications/testdata/rendered-templates/smtp/PrebuildFailureLimitReached.html.golden @@ -0,0 +1,112 @@ +From: system@coder.com +To: bobby@coder.com +Subject: There is a problem creating prebuilt workspaces +Message-Id: 02ee4935-73be-4fa1-a290-ff9999026b13@blush-whale-48 +Date: Fri, 11 Oct 2024 09:03:06 +0000 +Content-Type: multipart/alternative; boundary=bbe61b741255b6098bb6b3c1f41b885773df633cb18d2a3002b68e4bc9c4 +MIME-Version: 1.0 + +--bbe61b741255b6098bb6b3c1f41b885773df633cb18d2a3002b68e4bc9c4 +Content-Transfer-Encoding: quoted-printable +Content-Type: text/plain; charset=UTF-8 + +Hi Bobby, + +The number of failed prebuild attempts has reached the hard limit for templ= +ate docker and preset particle-accelerator. + +To resume prebuilds, fix the underlying issue and upload a new template ver= +sion. 
+ +Refer to the documentation for more details: + +Troubleshooting templates (https://coder.com/docs/admin/templates/troublesh= +ooting) +Troubleshooting of prebuilt workspaces (https://coder.com/docs/admin/templa= +tes/extending-templates/prebuilt-workspaces#administration-and-troubleshoot= +ing) + + +View failed prebuilt workspaces: http://test.com/workspaces?filter=3Downer:= +prebuilds+status:failed+template:docker + +View template version: http://test.com/templates/cern/docker/versions/angry= +_torvalds + +--bbe61b741255b6098bb6b3c1f41b885773df633cb18d2a3002b68e4bc9c4 +Content-Transfer-Encoding: quoted-printable +Content-Type: text/html; charset=UTF-8 + + + + + + + There is a problem creating prebuilt workspaces + + +
+
+ 3D"Cod= +
+

+ There is a problem creating prebuilt workspaces +

+
+

Hi Bobby,

+

The number of failed prebuild attempts has reached the hard limi= +t for template docker and preset particle-accelera= +tor.

+ +

To resume prebuilds, fix the underlying issue and upload a new template = +version.

+ +

Refer to the documentation for more details:
+- Troubl= +eshooting templates
+- Troubleshooting of pre= +built workspaces

+
+
+ =20 + + View failed prebuilt workspaces + + =20 + + View template version + + =20 +
+
+

© 2024 Coder. All rights reserved - h= +ttp://test.com

+

Click here to manage your notification = +settings

+

Stop receiving emails like this

+
+
+ + + +--bbe61b741255b6098bb6b3c1f41b885773df633cb18d2a3002b68e4bc9c4-- diff --git a/coderd/notifications/testdata/rendered-templates/webhook/PrebuildFailureLimitReached.json.golden b/coderd/notifications/testdata/rendered-templates/webhook/PrebuildFailureLimitReached.json.golden new file mode 100644 index 0000000000000..0a6e262ff7512 --- /dev/null +++ b/coderd/notifications/testdata/rendered-templates/webhook/PrebuildFailureLimitReached.json.golden @@ -0,0 +1,35 @@ +{ + "_version": "1.1", + "msg_id": "00000000-0000-0000-0000-000000000000", + "payload": { + "_version": "1.2", + "notification_name": "Prebuild Failure Limit Reached", + "notification_template_id": "00000000-0000-0000-0000-000000000000", + "user_id": "00000000-0000-0000-0000-000000000000", + "user_email": "bobby@coder.com", + "user_name": "Bobby", + "user_username": "bobby", + "actions": [ + { + "label": "View failed prebuilt workspaces", + "url": "http://test.com/workspaces?filter=owner:prebuilds+status:failed+template:docker" + }, + { + "label": "View template version", + "url": "http://test.com/templates/cern/docker/versions/angry_torvalds" + } + ], + "labels": { + "org": "cern", + "preset": "particle-accelerator", + "template": "docker", + "template_version": "angry_torvalds" + }, + "data": {}, + "targets": null + }, + "title": "There is a problem creating prebuilt workspaces", + "title_markdown": "There is a problem creating prebuilt workspaces", + "body": "The number of failed prebuild attempts has reached the hard limit for template docker and preset particle-accelerator.\n\nTo resume prebuilds, fix the underlying issue and upload a new template version.\n\nRefer to the documentation for more details:\n\nTroubleshooting templates (https://coder.com/docs/admin/templates/troubleshooting)\nTroubleshooting of prebuilt workspaces (https://coder.com/docs/admin/templates/extending-templates/prebuilt-workspaces#administration-and-troubleshooting)", + "body_markdown": "\nThe number of failed prebuild 
attempts has reached the hard limit for template **docker** and preset **particle-accelerator**.\n\nTo resume prebuilds, fix the underlying issue and upload a new template version.\n\nRefer to the documentation for more details:\n- [Troubleshooting templates](https://coder.com/docs/admin/templates/troubleshooting)\n- [Troubleshooting of prebuilt workspaces](https://coder.com/docs/admin/templates/extending-templates/prebuilt-workspaces#administration-and-troubleshooting)\n" +} \ No newline at end of file diff --git a/coderd/prebuilds/global_snapshot.go b/coderd/prebuilds/global_snapshot.go index 0cf3fa3facc3a..9110f57574e7b 100644 --- a/coderd/prebuilds/global_snapshot.go +++ b/coderd/prebuilds/global_snapshot.go @@ -14,6 +14,7 @@ type GlobalSnapshot struct { RunningPrebuilds []database.GetRunningPrebuiltWorkspacesRow PrebuildsInProgress []database.CountInProgressPrebuildsRow Backoffs []database.GetPresetsBackoffRow + HardLimitedPresets []database.GetPresetsAtFailureLimitRow } func NewGlobalSnapshot( @@ -21,12 +22,14 @@ func NewGlobalSnapshot( runningPrebuilds []database.GetRunningPrebuiltWorkspacesRow, prebuildsInProgress []database.CountInProgressPrebuildsRow, backoffs []database.GetPresetsBackoffRow, + hardLimitedPresets []database.GetPresetsAtFailureLimitRow, ) GlobalSnapshot { return GlobalSnapshot{ Presets: presets, RunningPrebuilds: runningPrebuilds, PrebuildsInProgress: prebuildsInProgress, Backoffs: backoffs, + HardLimitedPresets: hardLimitedPresets, } } @@ -57,10 +60,15 @@ func (s GlobalSnapshot) FilterByPreset(presetID uuid.UUID) (*PresetSnapshot, err backoffPtr = &backoff } + _, isHardLimited := slice.Find(s.HardLimitedPresets, func(row database.GetPresetsAtFailureLimitRow) bool { + return row.PresetID == preset.ID + }) + return &PresetSnapshot{ - Preset: preset, - Running: running, - InProgress: inProgress, - Backoff: backoffPtr, + Preset: preset, + Running: running, + InProgress: inProgress, + Backoff: backoffPtr, + IsHardLimited: isHardLimited, }, nil 
} diff --git a/coderd/prebuilds/preset_snapshot.go b/coderd/prebuilds/preset_snapshot.go index 8441a350187d2..40e77de5ab3e3 100644 --- a/coderd/prebuilds/preset_snapshot.go +++ b/coderd/prebuilds/preset_snapshot.go @@ -32,10 +32,11 @@ const ( // It contains the raw data needed to calculate the current state of a preset's prebuilds, // including running prebuilds, in-progress builds, and backoff information. type PresetSnapshot struct { - Preset database.GetTemplatePresetsWithPrebuildsRow - Running []database.GetRunningPrebuiltWorkspacesRow - InProgress []database.CountInProgressPrebuildsRow - Backoff *database.GetPresetsBackoffRow + Preset database.GetTemplatePresetsWithPrebuildsRow + Running []database.GetRunningPrebuiltWorkspacesRow + InProgress []database.CountInProgressPrebuildsRow + Backoff *database.GetPresetsBackoffRow + IsHardLimited bool } // ReconciliationState represents the processed state of a preset's prebuilds, diff --git a/coderd/prebuilds/preset_snapshot_test.go b/coderd/prebuilds/preset_snapshot_test.go index a5acb40e5311f..2febf1d13ec91 100644 --- a/coderd/prebuilds/preset_snapshot_test.go +++ b/coderd/prebuilds/preset_snapshot_test.go @@ -73,7 +73,7 @@ func TestNoPrebuilds(t *testing.T) { preset(true, 0, current), } - snapshot := prebuilds.NewGlobalSnapshot(presets, nil, nil, nil) + snapshot := prebuilds.NewGlobalSnapshot(presets, nil, nil, nil, nil) ps, err := snapshot.FilterByPreset(current.presetID) require.NoError(t, err) @@ -98,7 +98,7 @@ func TestNetNew(t *testing.T) { preset(true, 1, current), } - snapshot := prebuilds.NewGlobalSnapshot(presets, nil, nil, nil) + snapshot := prebuilds.NewGlobalSnapshot(presets, nil, nil, nil, nil) ps, err := snapshot.FilterByPreset(current.presetID) require.NoError(t, err) @@ -138,7 +138,7 @@ func TestOutdatedPrebuilds(t *testing.T) { var inProgress []database.CountInProgressPrebuildsRow // WHEN: calculating the outdated preset's state. 
- snapshot := prebuilds.NewGlobalSnapshot(presets, running, inProgress, nil) + snapshot := prebuilds.NewGlobalSnapshot(presets, running, inProgress, nil, nil) ps, err := snapshot.FilterByPreset(outdated.presetID) require.NoError(t, err) @@ -200,7 +200,7 @@ func TestDeleteOutdatedPrebuilds(t *testing.T) { } // WHEN: calculating the outdated preset's state. - snapshot := prebuilds.NewGlobalSnapshot(presets, running, inProgress, nil) + snapshot := prebuilds.NewGlobalSnapshot(presets, running, inProgress, nil, nil) ps, err := snapshot.FilterByPreset(outdated.presetID) require.NoError(t, err) @@ -442,7 +442,7 @@ func TestInProgressActions(t *testing.T) { } // WHEN: calculating the current preset's state. - snapshot := prebuilds.NewGlobalSnapshot(presets, running, inProgress, nil) + snapshot := prebuilds.NewGlobalSnapshot(presets, running, inProgress, nil, nil) ps, err := snapshot.FilterByPreset(current.presetID) require.NoError(t, err) @@ -485,7 +485,7 @@ func TestExtraneous(t *testing.T) { var inProgress []database.CountInProgressPrebuildsRow // WHEN: calculating the current preset's state. - snapshot := prebuilds.NewGlobalSnapshot(presets, running, inProgress, nil) + snapshot := prebuilds.NewGlobalSnapshot(presets, running, inProgress, nil, nil) ps, err := snapshot.FilterByPreset(current.presetID) require.NoError(t, err) @@ -525,7 +525,7 @@ func TestDeprecated(t *testing.T) { var inProgress []database.CountInProgressPrebuildsRow // WHEN: calculating the current preset's state. - snapshot := prebuilds.NewGlobalSnapshot(presets, running, inProgress, nil) + snapshot := prebuilds.NewGlobalSnapshot(presets, running, inProgress, nil, nil) ps, err := snapshot.FilterByPreset(current.presetID) require.NoError(t, err) @@ -576,7 +576,7 @@ func TestLatestBuildFailed(t *testing.T) { } // WHEN: calculating the current preset's state. 
- snapshot := prebuilds.NewGlobalSnapshot(presets, running, inProgress, backoffs) + snapshot := prebuilds.NewGlobalSnapshot(presets, running, inProgress, backoffs, nil) psCurrent, err := snapshot.FilterByPreset(current.presetID) require.NoError(t, err) @@ -669,7 +669,7 @@ func TestMultiplePresetsPerTemplateVersion(t *testing.T) { }, } - snapshot := prebuilds.NewGlobalSnapshot(presets, nil, inProgress, nil) + snapshot := prebuilds.NewGlobalSnapshot(presets, nil, inProgress, nil, nil) // Nothing has to be created for preset 1. { diff --git a/codersdk/deployment.go b/codersdk/deployment.go index 39b67feb2c73a..89834f163affd 100644 --- a/codersdk/deployment.go +++ b/codersdk/deployment.go @@ -807,6 +807,12 @@ type PrebuildsConfig struct { // ReconciliationBackoffLookback determines the time window to look back when calculating // the number of failed prebuilds, which influences the backoff strategy. ReconciliationBackoffLookback serpent.Duration `json:"reconciliation_backoff_lookback" typescript:",notnull"` + + // FailureHardLimit defines the maximum number of consecutive failed prebuild attempts allowed + // before a preset is considered to be in a hard limit state. When a preset hits this limit, + // no new prebuilds will be created until the limit is reset. + // FailureHardLimit is disabled when set to zero. 
+ FailureHardLimit serpent.Int64 `json:"failure_hard_limit" typescript:"failure_hard_limit"` } const ( @@ -3086,6 +3092,17 @@ Write out the current server config as YAML to stdout.`, Annotations: serpent.Annotations{}.Mark(annotationFormatDuration, "true"), Hidden: true, }, + { + Name: "Failure Hard Limit", + Description: "Maximum number of consecutive failed prebuilds before a preset hits the hard limit; disabled when set to zero.", + Flag: "workspace-prebuilds-failure-hard-limit", + Env: "CODER_WORKSPACE_PREBUILDS_FAILURE_HARD_LIMIT", + Value: &c.Prebuilds.FailureHardLimit, + Default: "3", + Group: &deploymentGroupPrebuilds, + YAML: "failure_hard_limit", + Hidden: true, + }, } return opts diff --git a/docs/reference/api/general.md b/docs/reference/api/general.md index c14c317066a39..12454145569bb 100644 --- a/docs/reference/api/general.md +++ b/docs/reference/api/general.md @@ -533,6 +533,7 @@ curl -X GET http://coder-server:8080/api/v2/deployment/config \ "wildcard_access_url": "string", "workspace_hostname_suffix": "string", "workspace_prebuilds": { + "failure_hard_limit": 0, "reconciliation_backoff_interval": 0, "reconciliation_backoff_lookback": 0, "reconciliation_interval": 0 diff --git a/docs/reference/api/schemas.md b/docs/reference/api/schemas.md index 86cc4644c2685..2374c6af8800f 100644 --- a/docs/reference/api/schemas.md +++ b/docs/reference/api/schemas.md @@ -2704,6 +2704,7 @@ CreateWorkspaceRequest provides options for creating a new workspace. Only one o "wildcard_access_url": "string", "workspace_hostname_suffix": "string", "workspace_prebuilds": { + "failure_hard_limit": 0, "reconciliation_backoff_interval": 0, "reconciliation_backoff_lookback": 0, "reconciliation_interval": 0 @@ -3202,6 +3203,7 @@ CreateWorkspaceRequest provides options for creating a new workspace. 
Only one o "wildcard_access_url": "string", "workspace_hostname_suffix": "string", "workspace_prebuilds": { + "failure_hard_limit": 0, "reconciliation_backoff_interval": 0, "reconciliation_backoff_lookback": 0, "reconciliation_interval": 0 @@ -5261,6 +5263,7 @@ Git clone makes use of this by parsing the URL from: 'Username for "https://gith ```json { + "failure_hard_limit": 0, "reconciliation_backoff_interval": 0, "reconciliation_backoff_lookback": 0, "reconciliation_interval": 0 @@ -5269,11 +5272,12 @@ Git clone makes use of this by parsing the URL from: 'Username for "https://gith ### Properties -| Name | Type | Required | Restrictions | Description | -|-----------------------------------|---------|----------|--------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------| -| `reconciliation_backoff_interval` | integer | false | | Reconciliation backoff interval specifies the amount of time to increase the backoff interval when errors occur during reconciliation. | -| `reconciliation_backoff_lookback` | integer | false | | Reconciliation backoff lookback determines the time window to look back when calculating the number of failed prebuilds, which influences the backoff strategy. | -| `reconciliation_interval` | integer | false | | Reconciliation interval defines how often the workspace prebuilds state should be reconciled. 
| +| Name | Type | Required | Restrictions | Description | +|-----------------------------------|---------|----------|--------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `failure_hard_limit` | integer | false | | Failure hard limit defines the maximum number of consecutive failed prebuild attempts allowed before a preset is considered to be in a hard limit state. When a preset hits this limit, no new prebuilds will be created until the limit is reset. FailureHardLimit is disabled when set to zero. | +| `reconciliation_backoff_interval` | integer | false | | Reconciliation backoff interval specifies the amount of time to increase the backoff interval when errors occur during reconciliation. | +| `reconciliation_backoff_lookback` | integer | false | | Reconciliation backoff lookback determines the time window to look back when calculating the number of failed prebuilds, which influences the backoff strategy. | +| `reconciliation_interval` | integer | false | | Reconciliation interval defines how often the workspace prebuilds state should be reconciled. 
| ## codersdk.Preset diff --git a/enterprise/coderd/prebuilds/reconcile.go b/enterprise/coderd/prebuilds/reconcile.go index f9588a5d7cacb..7796e43777951 100644 --- a/enterprise/coderd/prebuilds/reconcile.go +++ b/enterprise/coderd/prebuilds/reconcile.go @@ -313,6 +313,7 @@ func (c *StoreReconciler) SnapshotState(ctx context.Context, store database.Stor if len(presetsWithPrebuilds) == 0 { return nil } + allRunningPrebuilds, err := db.GetRunningPrebuiltWorkspaces(ctx) if err != nil { return xerrors.Errorf("failed to get running prebuilds: %w", err) @@ -328,7 +329,18 @@ func (c *StoreReconciler) SnapshotState(ctx context.Context, store database.Stor return xerrors.Errorf("failed to get backoffs for presets: %w", err) } - state = prebuilds.NewGlobalSnapshot(presetsWithPrebuilds, allRunningPrebuilds, allPrebuildsInProgress, presetsBackoff) + hardLimitedPresets, err := db.GetPresetsAtFailureLimit(ctx, c.cfg.FailureHardLimit.Value()) + if err != nil { + return xerrors.Errorf("failed to get hard limited presets: %w", err) + } + + state = prebuilds.NewGlobalSnapshot( + presetsWithPrebuilds, + allRunningPrebuilds, + allPrebuildsInProgress, + presetsBackoff, + hardLimitedPresets, + ) return nil }, &database.TxOptions{ Isolation: sql.LevelRepeatableRead, // This mirrors the MVCC snapshotting Postgres does when using CTEs @@ -349,19 +361,45 @@ func (c *StoreReconciler) ReconcilePreset(ctx context.Context, ps prebuilds.Pres slog.F("preset_name", ps.Preset.Name), ) + // If the preset was previously hard-limited, log it and exit early. 
+ if ps.Preset.PrebuildStatus == database.PrebuildStatusHardLimited { + logger.Warn(ctx, "skipping hard limited preset") + return nil + } + + // If the preset reached the hard failure limit for the first time during this iteration: + // - Mark it as hard-limited in the database + // - Send notifications to template admins + if ps.IsHardLimited { + logger.Warn(ctx, "skipping hard limited preset") + + err := c.store.UpdatePresetPrebuildStatus(ctx, database.UpdatePresetPrebuildStatusParams{ + Status: database.PrebuildStatusHardLimited, + PresetID: ps.Preset.ID, + }) + if err != nil { + return xerrors.Errorf("failed to update preset prebuild status: %w", err) + } + + err = c.notifyPrebuildFailureLimitReached(ctx, ps) + if err != nil { + logger.Error(ctx, "failed to notify that number of prebuild failures reached the limit", slog.Error(err)) + return nil + } + + return nil + } + state := ps.CalculateState() actions, err := c.CalculateActions(ctx, ps) if err != nil { - logger.Error(ctx, "failed to calculate actions for preset", slog.Error(err), slog.F("preset_id", ps.Preset.ID)) + logger.Error(ctx, "failed to calculate actions for preset", slog.Error(err)) return nil } // Nothing has to be done. 
if !ps.Preset.UsingActiveVersion && actions.IsNoop() { - logger.Debug(ctx, "skipping reconciliation for preset - nothing has to be done", - slog.F("template_id", ps.Preset.TemplateID.String()), slog.F("template_name", ps.Preset.TemplateName), - slog.F("template_version_id", ps.Preset.TemplateVersionID.String()), slog.F("template_version_name", ps.Preset.TemplateVersionName), - slog.F("preset_id", ps.Preset.ID.String()), slog.F("preset_name", ps.Preset.Name)) + logger.Debug(ctx, "skipping reconciliation for preset - nothing has to be done") return nil } @@ -442,6 +480,49 @@ func (c *StoreReconciler) ReconcilePreset(ctx context.Context, ps prebuilds.Pres } } +func (c *StoreReconciler) notifyPrebuildFailureLimitReached(ctx context.Context, ps prebuilds.PresetSnapshot) error { + // nolint:gocritic // Necessary to query all the required data. + ctx = dbauthz.AsSystemRestricted(ctx) + + // Send notification to template admins. + if c.notifEnq == nil { + c.logger.Warn(ctx, "notification enqueuer not set, cannot send prebuild is hard limited notification(s)") + return nil + } + + templateAdmins, err := c.store.GetUsers(ctx, database.GetUsersParams{ + RbacRole: []string{codersdk.RoleTemplateAdmin}, + }) + if err != nil { + return xerrors.Errorf("fetch template admins: %w", err) + } + + for _, templateAdmin := range templateAdmins { + if _, err := c.notifEnq.EnqueueWithData(ctx, templateAdmin.ID, notifications.PrebuildFailureLimitReached, + map[string]string{ + "org": ps.Preset.OrganizationName, + "template": ps.Preset.TemplateName, + "template_version": ps.Preset.TemplateVersionName, + "preset": ps.Preset.Name, + }, + map[string]any{}, + "prebuilds_reconciler", + // Associate this notification with all the related entities. 
+ ps.Preset.TemplateID, ps.Preset.TemplateVersionID, ps.Preset.ID, ps.Preset.OrganizationID, + ); err != nil { + c.logger.Error(ctx, + "failed to send notification", + slog.Error(err), + slog.F("template_admin_id", templateAdmin.ID.String()), + ) + + continue + } + } + + return nil +} + func (c *StoreReconciler) CalculateActions(ctx context.Context, snapshot prebuilds.PresetSnapshot) (*prebuilds.ReconciliationActions, error) { if ctx.Err() != nil { return nil, ctx.Err() diff --git a/enterprise/coderd/prebuilds/reconcile_test.go b/enterprise/coderd/prebuilds/reconcile_test.go index 660b1733e6cc9..f52a77ca500b9 100644 --- a/enterprise/coderd/prebuilds/reconcile_test.go +++ b/enterprise/coderd/prebuilds/reconcile_test.go @@ -654,6 +654,131 @@ func TestDeletionOfPrebuiltWorkspaceWithInvalidPreset(t *testing.T) { require.Equal(t, database.WorkspaceTransitionDelete, builds[0].Transition) } +func TestSkippingHardLimitedPresets(t *testing.T) { + t.Parallel() + + if !dbtestutil.WillUsePostgres() { + t.Skip("This test requires postgres") + } + + // Test cases verify the behavior of prebuild creation depending on configured failure limits. 
+ testCases := []struct { + name string + hardLimit int64 + isHardLimitHit bool + }{ + { + name: "hard limit is hit - skip creation of prebuilt workspace", + hardLimit: 1, + isHardLimitHit: true, + }, + { + name: "hard limit is not hit - try to create prebuilt workspace again", + hardLimit: 2, + isHardLimitHit: false, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + + templateDeleted := false + + clock := quartz.NewMock(t) + ctx := testutil.Context(t, testutil.WaitShort) + cfg := codersdk.PrebuildsConfig{ + FailureHardLimit: serpent.Int64(tc.hardLimit), + ReconciliationBackoffInterval: 0, + } + logger := slogtest.Make( + t, &slogtest.Options{IgnoreErrors: true}, + ).Leveled(slog.LevelDebug) + db, pubSub := dbtestutil.NewDB(t) + fakeEnqueuer := newFakeEnqueuer() + controller := prebuilds.NewStoreReconciler(db, pubSub, cfg, logger, clock, prometheus.NewRegistry(), fakeEnqueuer) + + // Template admin to receive a notification. + templateAdmin := dbgen.User(t, db, database.User{ + RBACRoles: []string{codersdk.RoleTemplateAdmin}, + }) + + // Set up test environment with a template, version, and preset. + ownerID := uuid.New() + dbgen.User(t, db, database.User{ + ID: ownerID, + }) + org, template := setupTestDBTemplate(t, db, ownerID, templateDeleted) + templateVersionID := setupTestDBTemplateVersion(ctx, t, clock, db, pubSub, org.ID, ownerID, template.ID) + preset := setupTestDBPreset(t, db, templateVersionID, 1, uuid.New().String()) + + // Create a failed prebuild workspace that counts toward the hard failure limit. + setupTestDBPrebuild( + t, + clock, + db, + pubSub, + database.WorkspaceTransitionStart, + database.ProvisionerJobStatusFailed, + org.ID, + preset, + template.ID, + templateVersionID, + ) + + // Verify initial state: one failed workspace exists. 
+ workspaces, err := db.GetWorkspacesByTemplateID(ctx, template.ID) + require.NoError(t, err) + workspaceCount := len(workspaces) + require.Equal(t, 1, workspaceCount) + + // We simulate a failed prebuild in the test; consequently, the backoff mechanism is triggered when ReconcileAll is called. + // Even though ReconciliationBackoffInterval is set to zero, we still need to advance the clock by at least one nanosecond. + clock.Advance(time.Nanosecond).MustWait(ctx) + + // Trigger reconciliation to attempt creating a new prebuild. + // The outcome depends on whether the hard limit has been reached. + require.NoError(t, controller.ReconcileAll(ctx)) + + // These two additional calls to ReconcileAll should not trigger any notifications. + // A notification is only sent once. + require.NoError(t, controller.ReconcileAll(ctx)) + require.NoError(t, controller.ReconcileAll(ctx)) + + // Verify the final state after reconciliation. + workspaces, err = db.GetWorkspacesByTemplateID(ctx, template.ID) + require.NoError(t, err) + updatedPreset, err := db.GetPresetByID(ctx, preset.ID) + require.NoError(t, err) + + if !tc.isHardLimitHit { + // When hard limit is not reached, a new workspace should be created. + require.Equal(t, 2, len(workspaces)) + require.Equal(t, database.PrebuildStatusHealthy, updatedPreset.PrebuildStatus) + return + } + + // When hard limit is reached, no new workspace should be created. + require.Equal(t, 1, len(workspaces)) + require.Equal(t, database.PrebuildStatusHardLimited, updatedPreset.PrebuildStatus) + + // When hard limit is reached, a notification should be sent. 
+ matching := fakeEnqueuer.Sent(func(notification *notificationstest.FakeNotification) bool { + if !assert.Equal(t, notifications.PrebuildFailureLimitReached, notification.TemplateID, "unexpected template") { + return false + } + + if !assert.Equal(t, templateAdmin.ID, notification.UserID, "unexpected receiver") { + return false + } + + return true + }) + require.Len(t, matching, 1) + }) + } +} + func TestRunLoop(t *testing.T) { t.Parallel() diff --git a/site/src/api/typesGenerated.ts b/site/src/api/typesGenerated.ts index 35cd006ec6c55..95546529feac9 100644 --- a/site/src/api/typesGenerated.ts +++ b/site/src/api/typesGenerated.ts @@ -1814,6 +1814,7 @@ export interface PrebuildsConfig { readonly reconciliation_interval: number; readonly reconciliation_backoff_interval: number; readonly reconciliation_backoff_lookback: number; + readonly failure_hard_limit: number; } // From codersdk/presets.go