diff --git a/coderd/database/lock.go b/coderd/database/lock.go index 7ccb3b8f56fec..e5091cdfd29cc 100644 --- a/coderd/database/lock.go +++ b/coderd/database/lock.go @@ -12,8 +12,7 @@ const ( LockIDDBPurge LockIDNotificationsReportGenerator LockIDCryptoKeyRotation - LockIDReconcileTemplatePrebuilds - LockIDDeterminePrebuildsState + LockIDReconcilePrebuilds ) // GenLockID generates a unique and consistent lock ID from a given string. diff --git a/coderd/database/querier.go b/coderd/database/querier.go index 1cef5ada197f5..9fbfbde410d40 100644 --- a/coderd/database/querier.go +++ b/coderd/database/querier.go @@ -64,7 +64,7 @@ type sqlcQuerier interface { CleanTailnetCoordinators(ctx context.Context) error CleanTailnetLostPeers(ctx context.Context) error CleanTailnetTunnels(ctx context.Context) error - // CountInProgressPrebuilds returns the number of in-progress prebuilds, grouped by template version ID and transition. + // CountInProgressPrebuilds returns the number of in-progress prebuilds, grouped by preset ID and transition. // Prebuild considered in-progress if it's in the "starting", "stopping", or "deleting" state. CountInProgressPrebuilds(ctx context.Context) ([]CountInProgressPrebuildsRow, error) CountUnreadInboxNotificationsByUserID(ctx context.Context, userID uuid.UUID) (int64, error) diff --git a/coderd/database/queries.sql.go b/coderd/database/queries.sql.go index 72f2c4a8fcb8e..60416b1a35730 100644 --- a/coderd/database/queries.sql.go +++ b/coderd/database/queries.sql.go @@ -5938,7 +5938,7 @@ func (q *sqlQuerier) ClaimPrebuiltWorkspace(ctx context.Context, arg ClaimPrebui } const countInProgressPrebuilds = `-- name: CountInProgressPrebuilds :many -SELECT t.id AS template_id, wpb.template_version_id, wpb.transition, COUNT(wpb.transition)::int AS count +SELECT t.id AS template_id, wpb.template_version_id, wpb.transition, COUNT(wpb.transition)::int AS count, wlb.template_version_preset_id as preset_id FROM workspace_latest_builds wlb INNER JOIN workspace_prebuild_builds wpb ON wpb.id = wlb.id -- We only need these counts for active template versions. @@ -5949,7 +5949,7 @@ FROM workspace_latest_builds wlb -- prebuilds that are still building. INNER JOIN templates t ON t.active_version_id = wlb.template_version_id WHERE wlb.job_status IN ('pending'::provisioner_job_status, 'running'::provisioner_job_status) -GROUP BY t.id, wpb.template_version_id, wpb.transition +GROUP BY t.id, wpb.template_version_id, wpb.transition, wlb.template_version_preset_id ` type CountInProgressPrebuildsRow struct { @@ -5957,9 +5957,10 @@ type CountInProgressPrebuildsRow struct { TemplateVersionID uuid.UUID `db:"template_version_id" json:"template_version_id"` Transition WorkspaceTransition `db:"transition" json:"transition"` Count int32 `db:"count" json:"count"` + PresetID uuid.NullUUID `db:"preset_id" json:"preset_id"` } -// CountInProgressPrebuilds returns the number of in-progress prebuilds, grouped by template version ID and transition. +// CountInProgressPrebuilds returns the number of in-progress prebuilds, grouped by preset ID and transition. // Prebuild considered in-progress if it's in the "starting", "stopping", or "deleting" state. func (q *sqlQuerier) CountInProgressPrebuilds(ctx context.Context) ([]CountInProgressPrebuildsRow, error) { rows, err := q.db.QueryContext(ctx, countInProgressPrebuilds) @@ -5975,6 +5976,7 @@ func (q *sqlQuerier) CountInProgressPrebuilds(ctx context.Context) ([]CountInPro &i.TemplateVersionID, &i.Transition, &i.Count, + &i.PresetID, ); err != nil { return nil, err } diff --git a/coderd/database/queries/prebuilds.sql b/coderd/database/queries/prebuilds.sql index 53f5020f3607e..1d3a827c98586 100644 --- a/coderd/database/queries/prebuilds.sql +++ b/coderd/database/queries/prebuilds.sql @@ -57,9 +57,9 @@ WHERE (b.transition = 'start'::workspace_transition AND b.job_status = 'succeeded'::provisioner_job_status); -- name: CountInProgressPrebuilds :many --- CountInProgressPrebuilds returns the number of in-progress prebuilds, grouped by template version ID and transition. +-- CountInProgressPrebuilds returns the number of in-progress prebuilds, grouped by preset ID and transition. -- Prebuild considered in-progress if it's in the "starting", "stopping", or "deleting" state. -SELECT t.id AS template_id, wpb.template_version_id, wpb.transition, COUNT(wpb.transition)::int AS count +SELECT t.id AS template_id, wpb.template_version_id, wpb.transition, COUNT(wpb.transition)::int AS count, wlb.template_version_preset_id as preset_id FROM workspace_latest_builds wlb INNER JOIN workspace_prebuild_builds wpb ON wpb.id = wlb.id -- We only need these counts for active template versions. @@ -70,7 +70,7 @@ FROM workspace_latest_builds wlb -- prebuilds that are still building. INNER JOIN templates t ON t.active_version_id = wlb.template_version_id WHERE wlb.job_status IN ('pending'::provisioner_job_status, 'running'::provisioner_job_status) -GROUP BY t.id, wpb.template_version_id, wpb.transition; +GROUP BY t.id, wpb.template_version_id, wpb.transition, wlb.template_version_preset_id; -- GetPresetsBackoff groups workspace builds by preset ID. -- Each preset is associated with exactly one template version ID. diff --git a/coderd/prebuilds/api.go b/coderd/prebuilds/api.go new file mode 100644 index 0000000000000..6ebfb8acced44 --- /dev/null +++ b/coderd/prebuilds/api.go @@ -0,0 +1,27 @@ +package prebuilds + +import ( + "context" +) + +// ReconciliationOrchestrator manages the lifecycle of prebuild reconciliation. +// It runs a continuous loop to check and reconcile prebuild states, and can be stopped gracefully. +type ReconciliationOrchestrator interface { + Reconciler + + // RunLoop starts a continuous reconciliation loop that periodically calls ReconcileAll + // to ensure all prebuilds are in their desired states. The loop runs until the context + // is canceled or Stop is called. + RunLoop(ctx context.Context) + + // Stop gracefully shuts down the orchestrator with the given cause. + // The cause is used for logging and error reporting. + Stop(ctx context.Context, cause error) +} + +type Reconciler interface { + // ReconcileAll orchestrates the reconciliation of all prebuilds across all templates. + // It takes a global snapshot of the system state and then reconciles each preset + // in parallel, creating or deleting prebuilds as needed to reach their desired states. + ReconcileAll(ctx context.Context) error +} diff --git a/coderd/prebuilds/global_snapshot.go b/coderd/prebuilds/global_snapshot.go new file mode 100644 index 0000000000000..0cf3fa3facc3a --- /dev/null +++ b/coderd/prebuilds/global_snapshot.go @@ -0,0 +1,66 @@ +package prebuilds + +import ( + "github.com/google/uuid" + "golang.org/x/xerrors" + + "github.com/coder/coder/v2/coderd/database" + "github.com/coder/coder/v2/coderd/util/slice" +) + +// GlobalSnapshot represents a full point-in-time snapshot of state relating to prebuilds across all templates. +type GlobalSnapshot struct { + Presets []database.GetTemplatePresetsWithPrebuildsRow + RunningPrebuilds []database.GetRunningPrebuiltWorkspacesRow + PrebuildsInProgress []database.CountInProgressPrebuildsRow + Backoffs []database.GetPresetsBackoffRow +} + +func NewGlobalSnapshot( + presets []database.GetTemplatePresetsWithPrebuildsRow, + runningPrebuilds []database.GetRunningPrebuiltWorkspacesRow, + prebuildsInProgress []database.CountInProgressPrebuildsRow, + backoffs []database.GetPresetsBackoffRow, +) GlobalSnapshot { + return GlobalSnapshot{ + Presets: presets, + RunningPrebuilds: runningPrebuilds, + PrebuildsInProgress: prebuildsInProgress, + Backoffs: backoffs, + } +} + +func (s GlobalSnapshot) FilterByPreset(presetID uuid.UUID) (*PresetSnapshot, error) { + preset, found := slice.Find(s.Presets, func(preset database.GetTemplatePresetsWithPrebuildsRow) bool { + return preset.ID == presetID + }) + if !found { + return nil, xerrors.Errorf("no preset found with ID %q", presetID) + } + + running := slice.Filter(s.RunningPrebuilds, func(prebuild database.GetRunningPrebuiltWorkspacesRow) bool { + if !prebuild.CurrentPresetID.Valid { + return false + } + return prebuild.CurrentPresetID.UUID == preset.ID + }) + + inProgress := slice.Filter(s.PrebuildsInProgress, func(prebuild database.CountInProgressPrebuildsRow) bool { + return prebuild.PresetID.UUID == preset.ID + }) + + var backoffPtr *database.GetPresetsBackoffRow + backoff, found := slice.Find(s.Backoffs, func(row database.GetPresetsBackoffRow) bool { + return row.PresetID == preset.ID + }) + if found { + backoffPtr = &backoff + } + + return &PresetSnapshot{ + Preset: preset, + Running: running, + InProgress: inProgress, + Backoff: backoffPtr, + }, nil +} diff --git a/coderd/prebuilds/noop.go b/coderd/prebuilds/noop.go new file mode 100644 index 0000000000000..ffe4e7b442af9 --- /dev/null +++ b/coderd/prebuilds/noop.go @@ -0,0 +1,35 @@ +package prebuilds + +import ( + "context" + + "github.com/coder/coder/v2/coderd/database" +) + +type NoopReconciler struct{} + +func NewNoopReconciler() *NoopReconciler { + return &NoopReconciler{} +} + +func (NoopReconciler) RunLoop(context.Context) {} + +func (NoopReconciler) Stop(context.Context, error) {} + +func (NoopReconciler) ReconcileAll(context.Context) error { + return nil +} + +func (NoopReconciler) SnapshotState(context.Context, database.Store) (*GlobalSnapshot, error) { + return &GlobalSnapshot{}, nil +} + +func (NoopReconciler) ReconcilePreset(context.Context, PresetSnapshot) error { + return nil +} + +func (NoopReconciler) CalculateActions(context.Context, PresetSnapshot) (*ReconciliationActions, error) { + return &ReconciliationActions{}, nil +} + +var _ ReconciliationOrchestrator = NoopReconciler{} diff --git a/coderd/prebuilds/preset_snapshot.go b/coderd/prebuilds/preset_snapshot.go new file mode 100644 index 0000000000000..b6f05e588a6c0 --- /dev/null +++ b/coderd/prebuilds/preset_snapshot.go @@ -0,0 +1,254 @@ +package prebuilds + +import ( + "slices" + "time" + + "github.com/google/uuid" + + "github.com/coder/quartz" + + "github.com/coder/coder/v2/coderd/database" +) + +// ActionType represents the type of action needed to reconcile prebuilds. +type ActionType int + +const ( + // ActionTypeUndefined represents an uninitialized or invalid action type. + ActionTypeUndefined ActionType = iota + + // ActionTypeCreate indicates that new prebuilds should be created. + ActionTypeCreate + + // ActionTypeDelete indicates that existing prebuilds should be deleted. + ActionTypeDelete + + // ActionTypeBackoff indicates that prebuild creation should be delayed. + ActionTypeBackoff +) + +// PresetSnapshot is a filtered view of GlobalSnapshot focused on a single preset. +// It contains the raw data needed to calculate the current state of a preset's prebuilds, +// including running prebuilds, in-progress builds, and backoff information. +type PresetSnapshot struct { + Preset database.GetTemplatePresetsWithPrebuildsRow + Running []database.GetRunningPrebuiltWorkspacesRow + InProgress []database.CountInProgressPrebuildsRow + Backoff *database.GetPresetsBackoffRow +} + +// ReconciliationState represents the processed state of a preset's prebuilds, +// calculated from a PresetSnapshot. While PresetSnapshot contains raw data, +// ReconciliationState contains derived metrics that are directly used to +// determine what actions are needed (create, delete, or backoff). +// For example, it calculates how many prebuilds are eligible, how many are +// extraneous, and how many are in various transition states. +type ReconciliationState struct { + Actual int32 // Number of currently running prebuilds + Desired int32 // Number of prebuilds desired as defined in the preset + Eligible int32 // Number of prebuilds that are ready to be claimed + Extraneous int32 // Number of extra running prebuilds beyond the desired count + + // Counts of prebuilds in various transition states + Starting int32 + Stopping int32 + Deleting int32 +} + +// ReconciliationActions represents actions needed to reconcile the current state with the desired state. +// Based on ActionType, exactly one of Create, DeleteIDs, or BackoffUntil will be set. +type ReconciliationActions struct { + // ActionType determines which field is set and what action should be taken + ActionType ActionType + + // Create is set when ActionType is ActionTypeCreate and indicates the number of prebuilds to create + Create int32 + + // DeleteIDs is set when ActionType is ActionTypeDelete and contains the IDs of prebuilds to delete + DeleteIDs []uuid.UUID + + // BackoffUntil is set when ActionType is ActionTypeBackoff and indicates when to retry creating prebuilds + BackoffUntil time.Time +} + +// CalculateState computes the current state of prebuilds for a preset, including: +// - Actual: Number of currently running prebuilds +// - Desired: Number of prebuilds desired as defined in the preset +// - Eligible: Number of prebuilds that are ready to be claimed +// - Extraneous: Number of extra running prebuilds beyond the desired count +// - Starting/Stopping/Deleting: Counts of prebuilds in various transition states +// +// The function takes into account whether the preset is active (using the active template version) +// and calculates appropriate counts based on the current state of running prebuilds and +// in-progress transitions. This state information is used to determine what reconciliation +// actions are needed to reach the desired state. +func (p PresetSnapshot) CalculateState() *ReconciliationState { + var ( + actual int32 + desired int32 + eligible int32 + extraneous int32 + ) + + if p.isActive() { + // #nosec G115 - Safe conversion as p.Running slice length is expected to be within int32 range + actual = int32(len(p.Running)) + desired = p.Preset.DesiredInstances.Int32 + eligible = p.countEligible() + extraneous = max(actual-desired, 0) + } + + starting, stopping, deleting := p.countInProgress() + + return &ReconciliationState{ + Actual: actual, + Desired: desired, + Eligible: eligible, + Extraneous: extraneous, + + Starting: starting, + Stopping: stopping, + Deleting: deleting, + } +} + +// CalculateActions determines what actions are needed to reconcile the current state with the desired state. +// The function: +// 1. First checks if a backoff period is needed (if previous builds failed) +// 2. If the preset is inactive (template version is not active), it will delete all running prebuilds +// 3. For active presets, it calculates the number of prebuilds to create or delete based on: +// - The desired number of instances +// - Currently running prebuilds +// - Prebuilds in transition states (starting/stopping/deleting) +// - Any extraneous prebuilds that need to be removed +// +// The function returns a ReconciliationActions struct that will have exactly one action type set: +// - ActionTypeBackoff: Only BackoffUntil is set, indicating when to retry +// - ActionTypeCreate: Only Create is set, indicating how many prebuilds to create +// - ActionTypeDelete: Only DeleteIDs is set, containing IDs of prebuilds to delete +func (p PresetSnapshot) CalculateActions(clock quartz.Clock, backoffInterval time.Duration) (*ReconciliationActions, error) { + // TODO: align workspace states with how we represent them on the FE and the CLI + // right now there's some slight differences which can lead to additional prebuilds being created + + // TODO: add mechanism to prevent prebuilds being reconciled from being claimable by users; i.e. if a prebuild is + // about to be deleted, it should not be deleted if it has been claimed - beware of TOCTOU races! + + actions, needsBackoff := p.needsBackoffPeriod(clock, backoffInterval) + if needsBackoff { + return actions, nil + } + + if !p.isActive() { + return p.handleInactiveTemplateVersion() + } + + return p.handleActiveTemplateVersion() +} + +// isActive returns true if the preset's template version is the active version, and it is neither deleted nor deprecated. +// This determines whether we should maintain prebuilds for this preset or delete them. +func (p PresetSnapshot) isActive() bool { + return p.Preset.UsingActiveVersion && !p.Preset.Deleted && !p.Preset.Deprecated +} + +// handleActiveTemplateVersion deletes excess prebuilds if there are too many, +// otherwise creates new ones to reach the desired count. +func (p PresetSnapshot) handleActiveTemplateVersion() (*ReconciliationActions, error) { + state := p.CalculateState() + + // If we have more prebuilds than desired, delete the oldest ones + if state.Extraneous > 0 { + return &ReconciliationActions{ + ActionType: ActionTypeDelete, + DeleteIDs: p.getOldestPrebuildIDs(int(state.Extraneous)), + }, nil + } + + // Calculate how many new prebuilds we need to create + // We subtract starting prebuilds since they're already being created + prebuildsToCreate := max(state.Desired-state.Actual-state.Starting, 0) + + return &ReconciliationActions{ + ActionType: ActionTypeCreate, + Create: prebuildsToCreate, + }, nil +} + +// handleInactiveTemplateVersion deletes all running prebuilds except those already being deleted +// to avoid duplicate deletion attempts. +func (p PresetSnapshot) handleInactiveTemplateVersion() (*ReconciliationActions, error) { + prebuildsToDelete := len(p.Running) + deleteIDs := p.getOldestPrebuildIDs(prebuildsToDelete) + + return &ReconciliationActions{ + ActionType: ActionTypeDelete, + DeleteIDs: deleteIDs, + }, nil +} + +// needsBackoffPeriod checks if we should delay prebuild creation due to recent failures. +// If there were failures, it calculates a backoff period based on the number of failures +// and returns true if we're still within that period. +func (p PresetSnapshot) needsBackoffPeriod(clock quartz.Clock, backoffInterval time.Duration) (*ReconciliationActions, bool) { + if p.Backoff == nil || p.Backoff.NumFailed == 0 { + return nil, false + } + backoffUntil := p.Backoff.LastBuildAt.Add(time.Duration(p.Backoff.NumFailed) * backoffInterval) + if clock.Now().After(backoffUntil) { + return nil, false + } + + return &ReconciliationActions{ + ActionType: ActionTypeBackoff, + BackoffUntil: backoffUntil, + }, true +} + +// countEligible returns the number of prebuilds that are ready to be claimed. +// A prebuild is eligible if it's running and its agents are in ready state. +func (p PresetSnapshot) countEligible() int32 { + var count int32 + for _, prebuild := range p.Running { + if prebuild.Ready { + count++ + } + } + return count +} + +// countInProgress returns counts of prebuilds in transition states (starting, stopping, deleting). +// These counts are tracked at the template level, so all presets sharing the same template see the same values. +func (p PresetSnapshot) countInProgress() (starting int32, stopping int32, deleting int32) { + for _, progress := range p.InProgress { + num := progress.Count + switch progress.Transition { + case database.WorkspaceTransitionStart: + starting += num + case database.WorkspaceTransitionStop: + stopping += num + case database.WorkspaceTransitionDelete: + deleting += num + } + } + + return starting, stopping, deleting +} + +// getOldestPrebuildIDs returns the IDs of the N oldest prebuilds, sorted by creation time. +// This is used when we need to delete prebuilds, ensuring we remove the oldest ones first. +func (p PresetSnapshot) getOldestPrebuildIDs(n int) []uuid.UUID { + // Sort by creation time, oldest first + slices.SortFunc(p.Running, func(a, b database.GetRunningPrebuiltWorkspacesRow) int { + return a.CreatedAt.Compare(b.CreatedAt) + }) + + // Take the first N IDs + n = min(n, len(p.Running)) + ids := make([]uuid.UUID, n) + for i := 0; i < n; i++ { + ids[i] = p.Running[i].ID + } + + return ids +} diff --git a/coderd/prebuilds/preset_snapshot_test.go b/coderd/prebuilds/preset_snapshot_test.go new file mode 100644 index 0000000000000..cce8ea67cb05c --- /dev/null +++ b/coderd/prebuilds/preset_snapshot_test.go @@ -0,0 +1,758 @@ +package prebuilds_test + +import ( + "database/sql" + "fmt" + "testing" + "time" + + "github.com/google/uuid" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/coder/quartz" + + "github.com/coder/coder/v2/coderd/database" + "github.com/coder/coder/v2/coderd/prebuilds" +) + +type options struct { + templateID uuid.UUID + templateVersionID uuid.UUID + presetID uuid.UUID + presetName string + prebuiltWorkspaceID uuid.UUID + workspaceName string +} + +// templateID is common across all option sets. +var templateID = uuid.UUID{1} + +const ( + backoffInterval = time.Second * 5 + + optionSet0 = iota + optionSet1 + optionSet2 +) + +var opts = map[uint]options{ + optionSet0: { + templateID: templateID, + templateVersionID: uuid.UUID{11}, + presetID: uuid.UUID{12}, + presetName: "my-preset", + prebuiltWorkspaceID: uuid.UUID{13}, + workspaceName: "prebuilds0", + }, + optionSet1: { + templateID: templateID, + templateVersionID: uuid.UUID{21}, + presetID: uuid.UUID{22}, + presetName: "my-preset", + prebuiltWorkspaceID: uuid.UUID{23}, + workspaceName: "prebuilds1", + }, + optionSet2: { + templateID: templateID, + templateVersionID: uuid.UUID{31}, + presetID: uuid.UUID{32}, + presetName: "my-preset", + prebuiltWorkspaceID: uuid.UUID{33}, + workspaceName: "prebuilds2", + }, +} + +// A new template version with a preset without prebuilds configured should result in no prebuilds being created. +func TestNoPrebuilds(t *testing.T) { + t.Parallel() + current := opts[optionSet0] + clock := quartz.NewMock(t) + + presets := []database.GetTemplatePresetsWithPrebuildsRow{ + preset(true, 0, current), + } + + snapshot := prebuilds.NewGlobalSnapshot(presets, nil, nil, nil) + ps, err := snapshot.FilterByPreset(current.presetID) + require.NoError(t, err) + + state := ps.CalculateState() + actions, err := ps.CalculateActions(clock, backoffInterval) + require.NoError(t, err) + + validateState(t, prebuilds.ReconciliationState{ /*all zero values*/ }, *state) + validateActions(t, prebuilds.ReconciliationActions{ + ActionType: prebuilds.ActionTypeCreate, + Create: 0, + }, *actions) +} + +// A new template version with a preset with prebuilds configured should result in a new prebuild being created. +func TestNetNew(t *testing.T) { + t.Parallel() + current := opts[optionSet0] + clock := quartz.NewMock(t) + + presets := []database.GetTemplatePresetsWithPrebuildsRow{ + preset(true, 1, current), + } + + snapshot := prebuilds.NewGlobalSnapshot(presets, nil, nil, nil) + ps, err := snapshot.FilterByPreset(current.presetID) + require.NoError(t, err) + + state := ps.CalculateState() + actions, err := ps.CalculateActions(clock, backoffInterval) + require.NoError(t, err) + + validateState(t, prebuilds.ReconciliationState{ + Desired: 1, + }, *state) + validateActions(t, prebuilds.ReconciliationActions{ + ActionType: prebuilds.ActionTypeCreate, + Create: 1, + }, *actions) +} + +// A new template version is created with a preset with prebuilds configured; this outdates the older version and +// requires the old prebuilds to be destroyed and new prebuilds to be created. +func TestOutdatedPrebuilds(t *testing.T) { + t.Parallel() + outdated := opts[optionSet0] + current := opts[optionSet1] + clock := quartz.NewMock(t) + + // GIVEN: 2 presets, one outdated and one new. + presets := []database.GetTemplatePresetsWithPrebuildsRow{ + preset(false, 1, outdated), + preset(true, 1, current), + } + + // GIVEN: a running prebuild for the outdated preset. + running := []database.GetRunningPrebuiltWorkspacesRow{ + prebuiltWorkspace(outdated, clock), + } + + // GIVEN: no in-progress builds. + var inProgress []database.CountInProgressPrebuildsRow + + // WHEN: calculating the outdated preset's state. + snapshot := prebuilds.NewGlobalSnapshot(presets, running, inProgress, nil) + ps, err := snapshot.FilterByPreset(outdated.presetID) + require.NoError(t, err) + + // THEN: we should identify that this prebuild is outdated and needs to be deleted. + state := ps.CalculateState() + actions, err := ps.CalculateActions(clock, backoffInterval) + require.NoError(t, err) + validateState(t, prebuilds.ReconciliationState{}, *state) + validateActions(t, prebuilds.ReconciliationActions{ + ActionType: prebuilds.ActionTypeDelete, + DeleteIDs: []uuid.UUID{outdated.prebuiltWorkspaceID}, + }, *actions) + + // WHEN: calculating the current preset's state. + ps, err = snapshot.FilterByPreset(current.presetID) + require.NoError(t, err) + + // THEN: we should not be blocked from creating a new prebuild while the outdate one deletes. + state = ps.CalculateState() + actions, err = ps.CalculateActions(clock, backoffInterval) + require.NoError(t, err) + validateState(t, prebuilds.ReconciliationState{Desired: 1}, *state) + validateActions(t, prebuilds.ReconciliationActions{ + ActionType: prebuilds.ActionTypeCreate, + Create: 1, + }, *actions) +} + +// Make sure that outdated prebuild will be deleted, even if deletion of another outdated prebuild is already in progress. +func TestDeleteOutdatedPrebuilds(t *testing.T) { + t.Parallel() + outdated := opts[optionSet0] + clock := quartz.NewMock(t) + + // GIVEN: 1 outdated preset. + presets := []database.GetTemplatePresetsWithPrebuildsRow{ + preset(false, 1, outdated), + } + + // GIVEN: one running prebuild for the outdated preset. + running := []database.GetRunningPrebuiltWorkspacesRow{ + prebuiltWorkspace(outdated, clock), + } + + // GIVEN: one deleting prebuild for the outdated preset. + inProgress := []database.CountInProgressPrebuildsRow{ + { + TemplateID: outdated.templateID, + TemplateVersionID: outdated.templateVersionID, + Transition: database.WorkspaceTransitionDelete, + Count: 1, + PresetID: uuid.NullUUID{ + UUID: outdated.presetID, + Valid: true, + }, + }, + } + + // WHEN: calculating the outdated preset's state. + snapshot := prebuilds.NewGlobalSnapshot(presets, running, inProgress, nil) + ps, err := snapshot.FilterByPreset(outdated.presetID) + require.NoError(t, err) + + // THEN: we should identify that this prebuild is outdated and needs to be deleted. + // Despite the fact that deletion of another outdated prebuild is already in progress. + state := ps.CalculateState() + actions, err := ps.CalculateActions(clock, backoffInterval) + require.NoError(t, err) + validateState(t, prebuilds.ReconciliationState{ + Deleting: 1, + }, *state) + + validateActions(t, prebuilds.ReconciliationActions{ + ActionType: prebuilds.ActionTypeDelete, + DeleteIDs: []uuid.UUID{outdated.prebuiltWorkspaceID}, + }, *actions) +} + +// A new template version is created with a preset with prebuilds configured; while a prebuild is provisioning up or down, +// the calculated actions should indicate the state correctly. +func TestInProgressActions(t *testing.T) { + t.Parallel() + current := opts[optionSet0] + clock := quartz.NewMock(t) + + cases := []struct { + name string + transition database.WorkspaceTransition + desired int32 + running int32 + inProgress int32 + checkFn func(state prebuilds.ReconciliationState, actions prebuilds.ReconciliationActions) + }{ + // With no running prebuilds and one starting, no creations/deletions should take place. + { + name: fmt.Sprintf("%s-short", database.WorkspaceTransitionStart), + transition: database.WorkspaceTransitionStart, + desired: 1, + running: 0, + inProgress: 1, + checkFn: func(state prebuilds.ReconciliationState, actions prebuilds.ReconciliationActions) { + validateState(t, prebuilds.ReconciliationState{Desired: 1, Starting: 1}, state) + validateActions(t, prebuilds.ReconciliationActions{ + ActionType: prebuilds.ActionTypeCreate, + }, actions) + }, + }, + // With one running prebuild and one starting, no creations/deletions should occur since we're approaching the correct state. + { + name: fmt.Sprintf("%s-balanced", database.WorkspaceTransitionStart), + transition: database.WorkspaceTransitionStart, + desired: 2, + running: 1, + inProgress: 1, + checkFn: func(state prebuilds.ReconciliationState, actions prebuilds.ReconciliationActions) { + validateState(t, prebuilds.ReconciliationState{Actual: 1, Desired: 2, Starting: 1}, state) + validateActions(t, prebuilds.ReconciliationActions{ + ActionType: prebuilds.ActionTypeCreate, + }, actions) + }, + }, + // With one running prebuild and one starting, no creations/deletions should occur + // SIDE-NOTE: once the starting prebuild completes, the older of the two will be considered extraneous since we only desire 2. + { + name: fmt.Sprintf("%s-extraneous", database.WorkspaceTransitionStart), + transition: database.WorkspaceTransitionStart, + desired: 2, + running: 2, + inProgress: 1, + checkFn: func(state prebuilds.ReconciliationState, actions prebuilds.ReconciliationActions) { + validateState(t, prebuilds.ReconciliationState{Actual: 2, Desired: 2, Starting: 1}, state) + validateActions(t, prebuilds.ReconciliationActions{ + ActionType: prebuilds.ActionTypeCreate, + }, actions) + }, + }, + // With one prebuild desired and one stopping, a new prebuild will be created. + { + name: fmt.Sprintf("%s-short", database.WorkspaceTransitionStop), + transition: database.WorkspaceTransitionStop, + desired: 1, + running: 0, + inProgress: 1, + checkFn: func(state prebuilds.ReconciliationState, actions prebuilds.ReconciliationActions) { + validateState(t, prebuilds.ReconciliationState{Desired: 1, Stopping: 1}, state) + validateActions(t, prebuilds.ReconciliationActions{ + ActionType: prebuilds.ActionTypeCreate, + Create: 1, + }, actions) + }, + }, + // With 3 prebuilds desired, 2 running, and 1 stopping, a new prebuild will be created. + { + name: fmt.Sprintf("%s-balanced", database.WorkspaceTransitionStop), + transition: database.WorkspaceTransitionStop, + desired: 3, + running: 2, + inProgress: 1, + checkFn: func(state prebuilds.ReconciliationState, actions prebuilds.ReconciliationActions) { + validateState(t, prebuilds.ReconciliationState{Actual: 2, Desired: 3, Stopping: 1}, state) + validateActions(t, prebuilds.ReconciliationActions{ + ActionType: prebuilds.ActionTypeCreate, + Create: 1, + }, actions) + }, + }, + // With 3 prebuilds desired, 3 running, and 1 stopping, no creations/deletions should occur since the desired state is already achieved. + { + name: fmt.Sprintf("%s-extraneous", database.WorkspaceTransitionStop), + transition: database.WorkspaceTransitionStop, + desired: 3, + running: 3, + inProgress: 1, + checkFn: func(state prebuilds.ReconciliationState, actions prebuilds.ReconciliationActions) { + validateState(t, prebuilds.ReconciliationState{Actual: 3, Desired: 3, Stopping: 1}, state) + validateActions(t, prebuilds.ReconciliationActions{ + ActionType: prebuilds.ActionTypeCreate, + }, actions) + }, + }, + // With one prebuild desired and one deleting, a new prebuild will be created. + { + name: fmt.Sprintf("%s-short", database.WorkspaceTransitionDelete), + transition: database.WorkspaceTransitionDelete, + desired: 1, + running: 0, + inProgress: 1, + checkFn: func(state prebuilds.ReconciliationState, actions prebuilds.ReconciliationActions) { + validateState(t, prebuilds.ReconciliationState{Desired: 1, Deleting: 1}, state) + validateActions(t, prebuilds.ReconciliationActions{ + ActionType: prebuilds.ActionTypeCreate, + Create: 1, + }, actions) + }, + }, + // With 2 prebuilds desired, 1 running, and 1 deleting, a new prebuild will be created. + { + name: fmt.Sprintf("%s-balanced", database.WorkspaceTransitionDelete), + transition: database.WorkspaceTransitionDelete, + desired: 2, + running: 1, + inProgress: 1, + checkFn: func(state prebuilds.ReconciliationState, actions prebuilds.ReconciliationActions) { + validateState(t, prebuilds.ReconciliationState{Actual: 1, Desired: 2, Deleting: 1}, state) + validateActions(t, prebuilds.ReconciliationActions{ + ActionType: prebuilds.ActionTypeCreate, + Create: 1, + }, actions) + }, + }, + // With 2 prebuilds desired, 2 running, and 1 deleting, no creations/deletions should occur since the desired state is already achieved. + { + name: fmt.Sprintf("%s-extraneous", database.WorkspaceTransitionDelete), + transition: database.WorkspaceTransitionDelete, + desired: 2, + running: 2, + inProgress: 1, + checkFn: func(state prebuilds.ReconciliationState, actions prebuilds.ReconciliationActions) { + validateState(t, prebuilds.ReconciliationState{Actual: 2, Desired: 2, Deleting: 1}, state) + validateActions(t, prebuilds.ReconciliationActions{ + ActionType: prebuilds.ActionTypeCreate, + }, actions) + }, + }, + // With 3 prebuilds desired, 1 running, and 2 starting, no creations should occur since the builds are in progress. + { + name: fmt.Sprintf("%s-inhibit", database.WorkspaceTransitionStart), + transition: database.WorkspaceTransitionStart, + desired: 3, + running: 1, + inProgress: 2, + checkFn: func(state prebuilds.ReconciliationState, actions prebuilds.ReconciliationActions) { + validateState(t, prebuilds.ReconciliationState{Actual: 1, Desired: 3, Starting: 2}, state) + validateActions(t, prebuilds.ReconciliationActions{ActionType: prebuilds.ActionTypeCreate, Create: 0}, actions) + }, + }, + // With 3 prebuilds desired, 5 running, and 2 deleting, no deletions should occur since the builds are in progress. + { + name: fmt.Sprintf("%s-inhibit", database.WorkspaceTransitionDelete), + transition: database.WorkspaceTransitionDelete, + desired: 3, + running: 5, + inProgress: 2, + checkFn: func(state prebuilds.ReconciliationState, actions prebuilds.ReconciliationActions) { + expectedState := prebuilds.ReconciliationState{Actual: 5, Desired: 3, Deleting: 2, Extraneous: 2} + expectedActions := prebuilds.ReconciliationActions{ + ActionType: prebuilds.ActionTypeDelete, + } + + validateState(t, expectedState, state) + assert.EqualValuesf(t, expectedActions.ActionType, actions.ActionType, "'ActionType' did not match expectation") + assert.Len(t, actions.DeleteIDs, 2, "'deleteIDs' did not match expectation") + assert.EqualValuesf(t, expectedActions.Create, actions.Create, "'create' did not match expectation") + assert.EqualValuesf(t, expectedActions.BackoffUntil, actions.BackoffUntil, "'BackoffUntil' did not match expectation") + }, + }, + } + + for _, tc := range cases { + tc := tc + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + + // GIVEN: a preset. + defaultPreset := preset(true, tc.desired, current) + presets := []database.GetTemplatePresetsWithPrebuildsRow{ + defaultPreset, + } + + // GIVEN: running prebuilt workspaces for the preset. + running := make([]database.GetRunningPrebuiltWorkspacesRow, 0, tc.running) + for range tc.running { + name, err := prebuilds.GenerateName() + require.NoError(t, err) + running = append(running, database.GetRunningPrebuiltWorkspacesRow{ + ID: uuid.New(), + Name: name, + TemplateID: current.templateID, + TemplateVersionID: current.templateVersionID, + CurrentPresetID: uuid.NullUUID{UUID: current.presetID, Valid: true}, + Ready: false, + CreatedAt: clock.Now(), + }) + } + + // GIVEN: some prebuilds for the preset which are currently transitioning. + inProgress := []database.CountInProgressPrebuildsRow{ + { + TemplateID: current.templateID, + TemplateVersionID: current.templateVersionID, + Transition: tc.transition, + Count: tc.inProgress, + PresetID: uuid.NullUUID{ + UUID: defaultPreset.ID, + Valid: true, + }, + }, + } + + // WHEN: calculating the current preset's state. + snapshot := prebuilds.NewGlobalSnapshot(presets, running, inProgress, nil) + ps, err := snapshot.FilterByPreset(current.presetID) + require.NoError(t, err) + + // THEN: we should identify that this prebuild is in progress. + state := ps.CalculateState() + actions, err := ps.CalculateActions(clock, backoffInterval) + require.NoError(t, err) + tc.checkFn(*state, *actions) + }) + } +} + +// Additional prebuilds exist for a given preset configuration; these must be deleted. +func TestExtraneous(t *testing.T) { + t.Parallel() + current := opts[optionSet0] + clock := quartz.NewMock(t) + + // GIVEN: a preset with 1 desired prebuild. + presets := []database.GetTemplatePresetsWithPrebuildsRow{ + preset(true, 1, current), + } + + var older uuid.UUID + // GIVEN: 2 running prebuilds for the preset. + running := []database.GetRunningPrebuiltWorkspacesRow{ + prebuiltWorkspace(current, clock, func(row database.GetRunningPrebuiltWorkspacesRow) database.GetRunningPrebuiltWorkspacesRow { + // The older of the running prebuilds will be deleted in order to maintain freshness. + row.CreatedAt = clock.Now().Add(-time.Hour) + older = row.ID + return row + }), + prebuiltWorkspace(current, clock, func(row database.GetRunningPrebuiltWorkspacesRow) database.GetRunningPrebuiltWorkspacesRow { + row.CreatedAt = clock.Now() + return row + }), + } + + // GIVEN: NO prebuilds in progress. + var inProgress []database.CountInProgressPrebuildsRow + + // WHEN: calculating the current preset's state. + snapshot := prebuilds.NewGlobalSnapshot(presets, running, inProgress, nil) + ps, err := snapshot.FilterByPreset(current.presetID) + require.NoError(t, err) + + // THEN: an extraneous prebuild is detected and marked for deletion. + state := ps.CalculateState() + actions, err := ps.CalculateActions(clock, backoffInterval) + require.NoError(t, err) + validateState(t, prebuilds.ReconciliationState{ + Actual: 2, Desired: 1, Extraneous: 1, Eligible: 2, + }, *state) + validateActions(t, prebuilds.ReconciliationActions{ + ActionType: prebuilds.ActionTypeDelete, + DeleteIDs: []uuid.UUID{older}, + }, *actions) +} + +// A template marked as deprecated will not have prebuilds running. +func TestDeprecated(t *testing.T) { + t.Parallel() + current := opts[optionSet0] + clock := quartz.NewMock(t) + + // GIVEN: a preset with 1 desired prebuild. + presets := []database.GetTemplatePresetsWithPrebuildsRow{ + preset(true, 1, current, func(row database.GetTemplatePresetsWithPrebuildsRow) database.GetTemplatePresetsWithPrebuildsRow { + row.Deprecated = true + return row + }), + } + + // GIVEN: 1 running prebuilds for the preset. + running := []database.GetRunningPrebuiltWorkspacesRow{ + prebuiltWorkspace(current, clock), + } + + // GIVEN: NO prebuilds in progress. + var inProgress []database.CountInProgressPrebuildsRow + + // WHEN: calculating the current preset's state. + snapshot := prebuilds.NewGlobalSnapshot(presets, running, inProgress, nil) + ps, err := snapshot.FilterByPreset(current.presetID) + require.NoError(t, err) + + // THEN: all running prebuilds should be deleted because the template is deprecated. + state := ps.CalculateState() + actions, err := ps.CalculateActions(clock, backoffInterval) + require.NoError(t, err) + validateState(t, prebuilds.ReconciliationState{}, *state) + validateActions(t, prebuilds.ReconciliationActions{ + ActionType: prebuilds.ActionTypeDelete, + DeleteIDs: []uuid.UUID{current.prebuiltWorkspaceID}, + }, *actions) +} + +// If the latest build failed, backoff exponentially with the given interval. +func TestLatestBuildFailed(t *testing.T) { + t.Parallel() + current := opts[optionSet0] + other := opts[optionSet1] + clock := quartz.NewMock(t) + + // GIVEN: two presets. + presets := []database.GetTemplatePresetsWithPrebuildsRow{ + preset(true, 1, current), + preset(true, 1, other), + } + + // GIVEN: running prebuilds only for one preset (the other will be failing, as evidenced by the backoffs below). + running := []database.GetRunningPrebuiltWorkspacesRow{ + prebuiltWorkspace(other, clock), + } + + // GIVEN: NO prebuilds in progress. + var inProgress []database.CountInProgressPrebuildsRow + + // GIVEN: a backoff entry. + lastBuildTime := clock.Now() + numFailed := 1 + backoffs := []database.GetPresetsBackoffRow{ + { + TemplateVersionID: current.templateVersionID, + PresetID: current.presetID, + NumFailed: int32(numFailed), + LastBuildAt: lastBuildTime, + }, + } + + // WHEN: calculating the current preset's state. + snapshot := prebuilds.NewGlobalSnapshot(presets, running, inProgress, backoffs) + psCurrent, err := snapshot.FilterByPreset(current.presetID) + require.NoError(t, err) + + // THEN: reconciliation should backoff. + state := psCurrent.CalculateState() + actions, err := psCurrent.CalculateActions(clock, backoffInterval) + require.NoError(t, err) + validateState(t, prebuilds.ReconciliationState{ + Actual: 0, Desired: 1, + }, *state) + validateActions(t, prebuilds.ReconciliationActions{ + ActionType: prebuilds.ActionTypeBackoff, + BackoffUntil: lastBuildTime.Add(time.Duration(numFailed) * backoffInterval), + }, *actions) + + // WHEN: calculating the other preset's state. + psOther, err := snapshot.FilterByPreset(other.presetID) + require.NoError(t, err) + + // THEN: it should NOT be in backoff because all is OK. + state = psOther.CalculateState() + actions, err = psOther.CalculateActions(clock, backoffInterval) + require.NoError(t, err) + validateState(t, prebuilds.ReconciliationState{ + Actual: 1, Desired: 1, Eligible: 1, + }, *state) + validateActions(t, prebuilds.ReconciliationActions{ + ActionType: prebuilds.ActionTypeCreate, + BackoffUntil: time.Time{}, + }, *actions) + + // WHEN: the clock is advanced a backoff interval. + clock.Advance(backoffInterval + time.Microsecond) + + // THEN: a new prebuild should be created. + psCurrent, err = snapshot.FilterByPreset(current.presetID) + require.NoError(t, err) + state = psCurrent.CalculateState() + actions, err = psCurrent.CalculateActions(clock, backoffInterval) + require.NoError(t, err) + validateState(t, prebuilds.ReconciliationState{ + Actual: 0, Desired: 1, + }, *state) + validateActions(t, prebuilds.ReconciliationActions{ + ActionType: prebuilds.ActionTypeCreate, + Create: 1, // <--- NOTE: we're now able to create a new prebuild because the interval has elapsed. + + }, *actions) +} + +func TestMultiplePresetsPerTemplateVersion(t *testing.T) { + t.Parallel() + + templateID := uuid.New() + templateVersionID := uuid.New() + presetOpts1 := options{ + templateID: templateID, + templateVersionID: templateVersionID, + presetID: uuid.New(), + presetName: "my-preset-1", + prebuiltWorkspaceID: uuid.New(), + workspaceName: "prebuilds1", + } + presetOpts2 := options{ + templateID: templateID, + templateVersionID: templateVersionID, + presetID: uuid.New(), + presetName: "my-preset-2", + prebuiltWorkspaceID: uuid.New(), + workspaceName: "prebuilds2", + } + + clock := quartz.NewMock(t) + + presets := []database.GetTemplatePresetsWithPrebuildsRow{ + preset(true, 1, presetOpts1), + preset(true, 1, presetOpts2), + } + + inProgress := []database.CountInProgressPrebuildsRow{ + { + TemplateID: templateID, + TemplateVersionID: templateVersionID, + Transition: database.WorkspaceTransitionStart, + Count: 1, + PresetID: uuid.NullUUID{ + UUID: presetOpts1.presetID, + Valid: true, + }, + }, + } + + snapshot := prebuilds.NewGlobalSnapshot(presets, nil, inProgress, nil) + + // Nothing has to be created for preset 1. + { + ps, err := snapshot.FilterByPreset(presetOpts1.presetID) + require.NoError(t, err) + + state := ps.CalculateState() + actions, err := ps.CalculateActions(clock, backoffInterval) + require.NoError(t, err) + + validateState(t, prebuilds.ReconciliationState{ + Starting: 1, + Desired: 1, + }, *state) + validateActions(t, prebuilds.ReconciliationActions{ + ActionType: prebuilds.ActionTypeCreate, + Create: 0, + }, *actions) + } + + // One prebuild has to be created for preset 2. Make sure preset 1 doesn't block preset 2. + { + ps, err := snapshot.FilterByPreset(presetOpts2.presetID) + require.NoError(t, err) + + state := ps.CalculateState() + actions, err := ps.CalculateActions(clock, backoffInterval) + require.NoError(t, err) + + validateState(t, prebuilds.ReconciliationState{ + Starting: 0, + Desired: 1, + }, *state) + validateActions(t, prebuilds.ReconciliationActions{ + ActionType: prebuilds.ActionTypeCreate, + Create: 1, + }, *actions) + } +} + +func preset(active bool, instances int32, opts options, muts ...func(row database.GetTemplatePresetsWithPrebuildsRow) database.GetTemplatePresetsWithPrebuildsRow) database.GetTemplatePresetsWithPrebuildsRow { + entry := database.GetTemplatePresetsWithPrebuildsRow{ + TemplateID: opts.templateID, + TemplateVersionID: opts.templateVersionID, + ID: opts.presetID, + UsingActiveVersion: active, + Name: opts.presetName, + DesiredInstances: sql.NullInt32{ + Valid: true, + Int32: instances, + }, + Deleted: false, + Deprecated: false, + } + + for _, mut := range muts { + entry = mut(entry) + } + return entry +} + +func prebuiltWorkspace( + opts options, + clock quartz.Clock, + muts ...func(row database.GetRunningPrebuiltWorkspacesRow) database.GetRunningPrebuiltWorkspacesRow, +) database.GetRunningPrebuiltWorkspacesRow { + entry := database.GetRunningPrebuiltWorkspacesRow{ + ID: opts.prebuiltWorkspaceID, + Name: opts.workspaceName, + TemplateID: opts.templateID, + TemplateVersionID: opts.templateVersionID, + CurrentPresetID: uuid.NullUUID{UUID: opts.presetID, Valid: true}, + Ready: true, + CreatedAt: clock.Now(), + } + + for _, mut := range muts { + entry = mut(entry) + } + return entry +} + +func validateState(t *testing.T, expected, actual prebuilds.ReconciliationState) { + require.Equal(t, expected, actual) +} + +// validateActions is a convenience func to make tests more readable; it exploits the fact that the default states for +// prebuilds align with zero values. +func validateActions(t *testing.T, expected, actual prebuilds.ReconciliationActions) { + require.Equal(t, expected, actual) +} diff --git a/coderd/prebuilds/util.go b/coderd/prebuilds/util.go new file mode 100644 index 0000000000000..2cc5311d5ed99 --- /dev/null +++ b/coderd/prebuilds/util.go @@ -0,0 +1,26 @@ +package prebuilds + +import ( + "crypto/rand" + "encoding/base32" + "fmt" + "strings" +) + +// GenerateName generates a 20-byte prebuild name which should safe to use without truncation in most situations. +// UUIDs may be too long for a resource name in cloud providers (since this ID will be used in the prebuild's name). +// +// We're generating a 9-byte suffix (72 bits of entropy): +// 1 - e^(-1e9^2 / (2 * 2^72)) = ~0.01% likelihood of collision in 1 billion IDs. +// See https://en.wikipedia.org/wiki/Birthday_attack. +func GenerateName() (string, error) { + b := make([]byte, 9) + + _, err := rand.Read(b) + if err != nil { + return "", err + } + + // Encode the bytes to Base32 (A-Z2-7), strip any '=' padding + return fmt.Sprintf("prebuild-%s", strings.ToLower(base32.StdEncoding.WithPadding(base32.NoPadding).EncodeToString(b))), nil +} diff --git a/coderd/util/slice/slice.go b/coderd/util/slice/slice.go index b4ee79291d73f..f3811650786b7 100644 --- a/coderd/util/slice/slice.go +++ b/coderd/util/slice/slice.go @@ -90,6 +90,17 @@ func Find[T any](haystack []T, cond func(T) bool) (T, bool) { return empty, false } +// Filter returns all elements that satisfy the condition. +func Filter[T any](haystack []T, cond func(T) bool) []T { + out := make([]T, 0, len(haystack)) + for _, hay := range haystack { + if cond(hay) { + out = append(out, hay) + } + } + return out +} + // Overlap returns if the 2 sets have any overlap (element(s) in common) func Overlap[T comparable](a []T, b []T) bool { return OverlapCompare(a, b, func(a, b T) bool { diff --git a/coderd/util/slice/slice_test.go b/coderd/util/slice/slice_test.go index df8d119273652..006337794faee 100644 --- a/coderd/util/slice/slice_test.go +++ b/coderd/util/slice/slice_test.go @@ -2,6 +2,7 @@ package slice_test import ( "math/rand" + "strings" "testing" "github.com/google/uuid" @@ -82,6 +83,64 @@ func TestContains(t *testing.T) { ) } +func TestFilter(t *testing.T) { + t.Parallel() + + type testCase[T any] struct { + haystack []T + cond func(T) bool + expected []T + } + + { + testCases := []*testCase[int]{ + { + haystack: []int{1, 2, 3, 4, 5}, + cond: func(num int) bool { + return num%2 == 1 + }, + expected: []int{1, 3, 5}, + }, + { + haystack: []int{1, 2, 3, 4, 5}, + cond: func(num int) bool { + return num%2 == 0 + }, + expected: []int{2, 4}, + }, + } + + for _, tc := range testCases { + actual := slice.Filter(tc.haystack, tc.cond) + require.Equal(t, tc.expected, actual) + } + } + + { + testCases := []*testCase[string]{ + { + haystack: []string{"hello", "hi", "bye"}, + cond: func(str string) bool { + return strings.HasPrefix(str, "h") + }, + expected: []string{"hello", "hi"}, + }, + { + haystack: []string{"hello", "hi", "bye"}, + cond: func(str string) bool { + return strings.HasPrefix(str, "b") + }, + expected: []string{"bye"}, + }, + } + + for _, tc := range testCases { + actual := slice.Filter(tc.haystack, tc.cond) + require.Equal(t, tc.expected, actual) + } + } +} + func TestOverlap(t *testing.T) { t.Parallel() diff --git a/codersdk/deployment.go b/codersdk/deployment.go index 9db5a030ebc18..8b447e2c96e06 100644 --- a/codersdk/deployment.go +++ b/codersdk/deployment.go @@ -791,6 +791,19 @@ type NotificationsWebhookConfig struct { Endpoint serpent.URL `json:"endpoint" typescript:",notnull"` } +type PrebuildsConfig struct { + // ReconciliationInterval defines how often the workspace prebuilds state should be reconciled. + ReconciliationInterval serpent.Duration `json:"reconciliation_interval" typescript:",notnull"` + + // ReconciliationBackoffInterval specifies the amount of time to increase the backoff interval + // when errors occur during reconciliation. + ReconciliationBackoffInterval serpent.Duration `json:"reconciliation_backoff_interval" typescript:",notnull"` + + // ReconciliationBackoffLookback determines the time window to look back when calculating + // the number of failed prebuilds, which influences the backoff strategy. + ReconciliationBackoffLookback serpent.Duration `json:"reconciliation_backoff_lookback" typescript:",notnull"` +} + const ( annotationFormatDuration = "format_duration" annotationEnterpriseKey = "enterprise" diff --git a/enterprise/coderd/prebuilds/reconcile.go b/enterprise/coderd/prebuilds/reconcile.go new file mode 100644 index 0000000000000..f74e019207c18 --- /dev/null +++ b/enterprise/coderd/prebuilds/reconcile.go @@ -0,0 +1,541 @@ +package prebuilds + +import ( + "context" + "database/sql" + "fmt" + "math" + "sync/atomic" + "time" + + "github.com/hashicorp/go-multierror" + + "github.com/coder/quartz" + + "github.com/coder/coder/v2/coderd/audit" + "github.com/coder/coder/v2/coderd/database" + "github.com/coder/coder/v2/coderd/database/dbauthz" + "github.com/coder/coder/v2/coderd/database/provisionerjobs" + "github.com/coder/coder/v2/coderd/database/pubsub" + "github.com/coder/coder/v2/coderd/prebuilds" + "github.com/coder/coder/v2/coderd/rbac" + "github.com/coder/coder/v2/coderd/rbac/policy" + "github.com/coder/coder/v2/coderd/wsbuilder" + "github.com/coder/coder/v2/codersdk" + + "cdr.dev/slog" + + "github.com/google/uuid" + "golang.org/x/sync/errgroup" + "golang.org/x/xerrors" +) + +type StoreReconciler struct { + store database.Store + cfg codersdk.PrebuildsConfig + pubsub pubsub.Pubsub + logger slog.Logger + clock quartz.Clock + + cancelFn context.CancelCauseFunc + stopped atomic.Bool + done chan struct{} +} + +var _ prebuilds.ReconciliationOrchestrator = &StoreReconciler{} + +func NewStoreReconciler( + store database.Store, + ps pubsub.Pubsub, + cfg codersdk.PrebuildsConfig, + logger slog.Logger, + clock quartz.Clock, +) *StoreReconciler { + return &StoreReconciler{ + store: store, + pubsub: ps, + logger: logger, + cfg: cfg, + clock: clock, + done: make(chan struct{}, 1), + } +} + +func (c *StoreReconciler) RunLoop(ctx context.Context) { + reconciliationInterval := c.cfg.ReconciliationInterval.Value() + if reconciliationInterval <= 0 { // avoids a panic + reconciliationInterval = 5 * time.Minute + } + + c.logger.Info(ctx, "starting reconciler", + slog.F("interval", reconciliationInterval), + slog.F("backoff_interval", c.cfg.ReconciliationBackoffInterval.String()), + slog.F("backoff_lookback", c.cfg.ReconciliationBackoffLookback.String())) + + ticker := c.clock.NewTicker(reconciliationInterval) + defer ticker.Stop() + defer func() { + c.done <- struct{}{} + }() + + // nolint:gocritic // Reconciliation Loop needs Prebuilds Orchestrator permissions. + ctx, cancel := context.WithCancelCause(dbauthz.AsPrebuildsOrchestrator(ctx)) + c.cancelFn = cancel + + for { + select { + // TODO: implement pubsub listener to allow reconciling a specific template imperatively once it has been changed, + // instead of waiting for the next reconciliation interval + case <-ticker.C: + // Trigger a new iteration on each tick. + err := c.ReconcileAll(ctx) + if err != nil { + c.logger.Error(context.Background(), "reconciliation failed", slog.Error(err)) + } + case <-ctx.Done(): + // nolint:gocritic // it's okay to use slog.F() for an error in this case + // because we want to differentiate two different types of errors: ctx.Err() and context.Cause() + c.logger.Warn( + context.Background(), + "reconciliation loop exited", + slog.Error(ctx.Err()), + slog.F("cause", context.Cause(ctx)), + ) + return + } + } +} + +func (c *StoreReconciler) Stop(ctx context.Context, cause error) { + if cause != nil { + c.logger.Error(context.Background(), "stopping reconciler due to an error", slog.Error(cause)) + } else { + c.logger.Info(context.Background(), "gracefully stopping reconciler") + } + + if c.isStopped() { + return + } + c.stopped.Store(true) + if c.cancelFn != nil { + c.cancelFn(cause) + } + + select { + // Give up waiting for control loop to exit. + case <-ctx.Done(): + // nolint:gocritic // it's okay to use slog.F() for an error in this case + // because we want to differentiate two different types of errors: ctx.Err() and context.Cause() + c.logger.Error( + context.Background(), + "reconciler stop exited prematurely", + slog.Error(ctx.Err()), + slog.F("cause", context.Cause(ctx)), + ) + // Wait for the control loop to exit. + case <-c.done: + c.logger.Info(context.Background(), "reconciler stopped") + } +} + +func (c *StoreReconciler) isStopped() bool { + return c.stopped.Load() +} + +// ReconcileAll will attempt to resolve the desired vs actual state of all templates which have presets with prebuilds configured. +// +// NOTE: +// +// This function will kick of n provisioner jobs, based on the calculated state modifications. +// +// These provisioning jobs are fire-and-forget. We DO NOT wait for the prebuilt workspaces to complete their +// provisioning. As a consequence, it's possible that another reconciliation run will occur, which will mean that +// multiple preset versions could be reconciling at once. This may mean some temporary over-provisioning, but the +// reconciliation loop will bring these resources back into their desired numbers in an EVENTUALLY-consistent way. +// +// For example: we could decide to provision 1 new instance in this reconciliation. +// While that workspace is being provisioned, another template version is created which means this same preset will +// be reconciled again, leading to another workspace being provisioned. Two workspace builds will be occurring +// simultaneously for the same preset, but once both jobs have completed the reconciliation loop will notice the +// extraneous instance and delete it. +func (c *StoreReconciler) ReconcileAll(ctx context.Context) error { + logger := c.logger.With(slog.F("reconcile_context", "all")) + + select { + case <-ctx.Done(): + logger.Warn(context.Background(), "reconcile exiting prematurely; context done", slog.Error(ctx.Err())) + return nil + default: + } + + logger.Debug(ctx, "starting reconciliation") + + err := c.WithReconciliationLock(ctx, logger, func(ctx context.Context, db database.Store) error { + snapshot, err := c.SnapshotState(ctx, db) + if err != nil { + return xerrors.Errorf("determine current snapshot: %w", err) + } + if len(snapshot.Presets) == 0 { + logger.Debug(ctx, "no templates found with prebuilds configured") + return nil + } + + var eg errgroup.Group + // Reconcile presets in parallel. Each preset in its own goroutine. + for _, preset := range snapshot.Presets { + ps, err := snapshot.FilterByPreset(preset.ID) + if err != nil { + logger.Warn(ctx, "failed to find preset snapshot", slog.Error(err), slog.F("preset_id", preset.ID.String())) + continue + } + + eg.Go(func() error { + // Pass outer context. + err = c.ReconcilePreset(ctx, *ps) + if err != nil { + logger.Error( + ctx, + "failed to reconcile prebuilds for preset", + slog.Error(err), + slog.F("preset_id", preset.ID), + ) + } + // DO NOT return error otherwise the tx will end. + return nil + }) + } + + // Release lock only when all preset reconciliation goroutines are finished. + return eg.Wait() + }) + if err != nil { + logger.Error(ctx, "failed to reconcile", slog.Error(err)) + } + + return err +} + +// SnapshotState captures the current state of all prebuilds across templates. +func (c *StoreReconciler) SnapshotState(ctx context.Context, store database.Store) (*prebuilds.GlobalSnapshot, error) { + if err := ctx.Err(); err != nil { + return nil, err + } + + var state prebuilds.GlobalSnapshot + + err := store.InTx(func(db database.Store) error { + // TODO: implement template-specific reconciliations later + presetsWithPrebuilds, err := db.GetTemplatePresetsWithPrebuilds(ctx, uuid.NullUUID{}) + if err != nil { + return xerrors.Errorf("failed to get template presets with prebuilds: %w", err) + } + if len(presetsWithPrebuilds) == 0 { + return nil + } + allRunningPrebuilds, err := db.GetRunningPrebuiltWorkspaces(ctx) + if err != nil { + return xerrors.Errorf("failed to get running prebuilds: %w", err) + } + + allPrebuildsInProgress, err := db.CountInProgressPrebuilds(ctx) + if err != nil { + return xerrors.Errorf("failed to get prebuilds in progress: %w", err) + } + + presetsBackoff, err := db.GetPresetsBackoff(ctx, c.clock.Now().Add(-c.cfg.ReconciliationBackoffLookback.Value())) + if err != nil { + return xerrors.Errorf("failed to get backoffs for presets: %w", err) + } + + state = prebuilds.NewGlobalSnapshot(presetsWithPrebuilds, allRunningPrebuilds, allPrebuildsInProgress, presetsBackoff) + return nil + }, &database.TxOptions{ + Isolation: sql.LevelRepeatableRead, // This mirrors the MVCC snapshotting Postgres does when using CTEs + ReadOnly: true, + TxIdentifier: "prebuilds_state_determination", + }) + + return &state, err +} + +func (c *StoreReconciler) ReconcilePreset(ctx context.Context, ps prebuilds.PresetSnapshot) error { + logger := c.logger.With( + slog.F("template_id", ps.Preset.TemplateID.String()), + slog.F("template_name", ps.Preset.TemplateName), + slog.F("template_version_id", ps.Preset.TemplateVersionID), + slog.F("template_version_name", ps.Preset.TemplateVersionName), + slog.F("preset_id", ps.Preset.ID), + slog.F("preset_name", ps.Preset.Name), + ) + + state := ps.CalculateState() + actions, err := c.CalculateActions(ctx, ps) + if err != nil { + logger.Error(ctx, "failed to calculate actions for preset", slog.Error(err), slog.F("preset_id", ps.Preset.ID)) + return nil + } + + // nolint:gocritic // ReconcilePreset needs Prebuilds Orchestrator permissions. + prebuildsCtx := dbauthz.AsPrebuildsOrchestrator(ctx) + + levelFn := logger.Debug + switch { + case actions.ActionType == prebuilds.ActionTypeBackoff: + levelFn = logger.Warn + // Log at info level when there's a change to be effected. + case actions.ActionType == prebuilds.ActionTypeCreate && actions.Create > 0: + levelFn = logger.Info + case actions.ActionType == prebuilds.ActionTypeDelete && len(actions.DeleteIDs) > 0: + levelFn = logger.Info + } + + fields := []any{ + slog.F("action_type", actions.ActionType), + slog.F("create_count", actions.Create), slog.F("delete_count", len(actions.DeleteIDs)), + slog.F("to_delete", actions.DeleteIDs), + slog.F("desired", state.Desired), slog.F("actual", state.Actual), + slog.F("extraneous", state.Extraneous), slog.F("starting", state.Starting), + slog.F("stopping", state.Stopping), slog.F("deleting", state.Deleting), + slog.F("eligible", state.Eligible), + } + + levelFn(ctx, "calculated reconciliation actions for preset", fields...) + + switch actions.ActionType { + case prebuilds.ActionTypeBackoff: + // If there is anything to backoff for (usually a cycle of failed prebuilds), then log and bail out. + levelFn(ctx, "template prebuild state retrieved, backing off", + append(fields, + slog.F("backoff_until", actions.BackoffUntil.Format(time.RFC3339)), + slog.F("backoff_secs", math.Round(actions.BackoffUntil.Sub(c.clock.Now()).Seconds())), + )...) + + return nil + + case prebuilds.ActionTypeCreate: + // Unexpected things happen (i.e. bugs or bitflips); let's defend against disastrous outcomes. + // See https://blog.robertelder.org/causes-of-bit-flips-in-computer-memory/. + // This is obviously not comprehensive protection against this sort of problem, but this is one essential check. + desired := ps.Preset.DesiredInstances.Int32 + if actions.Create > desired { + logger.Critical(ctx, "determined excessive count of prebuilds to create; clamping to desired count", + slog.F("create_count", actions.Create), slog.F("desired_count", desired)) + + actions.Create = desired + } + + var multiErr multierror.Error + + for range actions.Create { + if err := c.createPrebuiltWorkspace(prebuildsCtx, uuid.New(), ps.Preset.TemplateID, ps.Preset.ID); err != nil { + logger.Error(ctx, "failed to create prebuild", slog.Error(err)) + multiErr.Errors = append(multiErr.Errors, err) + } + } + + return multiErr.ErrorOrNil() + + case prebuilds.ActionTypeDelete: + var multiErr multierror.Error + + for _, id := range actions.DeleteIDs { + if err := c.deletePrebuiltWorkspace(prebuildsCtx, id, ps.Preset.TemplateID, ps.Preset.ID); err != nil { + logger.Error(ctx, "failed to delete prebuild", slog.Error(err)) + multiErr.Errors = append(multiErr.Errors, err) + } + } + + return multiErr.ErrorOrNil() + + default: + return xerrors.Errorf("unknown action type: %v", actions.ActionType) + } +} + +func (c *StoreReconciler) CalculateActions(ctx context.Context, snapshot prebuilds.PresetSnapshot) (*prebuilds.ReconciliationActions, error) { + if ctx.Err() != nil { + return nil, ctx.Err() + } + + return snapshot.CalculateActions(c.clock, c.cfg.ReconciliationBackoffInterval.Value()) +} + +func (c *StoreReconciler) WithReconciliationLock( + ctx context.Context, + logger slog.Logger, + fn func(ctx context.Context, db database.Store) error, +) error { + // This tx holds a global lock, which prevents any other coderd replica from starting a reconciliation and + // possibly getting an inconsistent view of the state. + // + // The lock MUST be held until ALL modifications have been effected. + // + // It is run with RepeatableRead isolation, so it's effectively snapshotting the data at the start of the tx. + // + // This is a read-only tx, so returning an error (i.e. causing a rollback) has no impact. + return c.store.InTx(func(db database.Store) error { + start := c.clock.Now() + + // Try to acquire the lock. If we can't get it, another replica is handling reconciliation. + acquired, err := db.TryAcquireLock(ctx, database.LockIDReconcilePrebuilds) + if err != nil { + // This is a real database error, not just lock contention + logger.Error(ctx, "failed to acquire reconciliation lock due to database error", slog.Error(err)) + return err + } + if !acquired { + // Normal case: another replica has the lock + return nil + } + + logger.Debug(ctx, + "acquired top-level reconciliation lock", + slog.F("acquire_wait_secs", fmt.Sprintf("%.4f", c.clock.Since(start).Seconds())), + ) + + return fn(ctx, db) + }, &database.TxOptions{ + Isolation: sql.LevelRepeatableRead, + ReadOnly: true, + TxIdentifier: "prebuilds", + }) +} + +func (c *StoreReconciler) createPrebuiltWorkspace(ctx context.Context, prebuiltWorkspaceID uuid.UUID, templateID uuid.UUID, presetID uuid.UUID) error { + name, err := prebuilds.GenerateName() + if err != nil { + return xerrors.Errorf("failed to generate unique prebuild ID: %w", err) + } + + return c.store.InTx(func(db database.Store) error { + template, err := db.GetTemplateByID(ctx, templateID) + if err != nil { + return xerrors.Errorf("failed to get template: %w", err) + } + + now := c.clock.Now() + + minimumWorkspace, err := db.InsertWorkspace(ctx, database.InsertWorkspaceParams{ + ID: prebuiltWorkspaceID, + CreatedAt: now, + UpdatedAt: now, + OwnerID: prebuilds.SystemUserID, + OrganizationID: template.OrganizationID, + TemplateID: template.ID, + Name: name, + LastUsedAt: c.clock.Now(), + AutomaticUpdates: database.AutomaticUpdatesNever, + AutostartSchedule: sql.NullString{}, + Ttl: sql.NullInt64{}, + NextStartAt: sql.NullTime{}, + }) + if err != nil { + return xerrors.Errorf("insert workspace: %w", err) + } + + // We have to refetch the workspace for the joined in fields. + workspace, err := db.GetWorkspaceByID(ctx, minimumWorkspace.ID) + if err != nil { + return xerrors.Errorf("get workspace by ID: %w", err) + } + + c.logger.Info(ctx, "attempting to create prebuild", slog.F("name", name), + slog.F("workspace_id", prebuiltWorkspaceID.String()), slog.F("preset_id", presetID.String())) + + return c.provision(ctx, db, prebuiltWorkspaceID, template, presetID, database.WorkspaceTransitionStart, workspace) + }, &database.TxOptions{ + Isolation: sql.LevelRepeatableRead, + ReadOnly: false, + }) +} + +func (c *StoreReconciler) deletePrebuiltWorkspace(ctx context.Context, prebuiltWorkspaceID uuid.UUID, templateID uuid.UUID, presetID uuid.UUID) error { + return c.store.InTx(func(db database.Store) error { + workspace, err := db.GetWorkspaceByID(ctx, prebuiltWorkspaceID) + if err != nil { + return xerrors.Errorf("get workspace by ID: %w", err) + } + + template, err := db.GetTemplateByID(ctx, templateID) + if err != nil { + return xerrors.Errorf("failed to get template: %w", err) + } + + if workspace.OwnerID != prebuilds.SystemUserID { + return xerrors.Errorf("prebuilt workspace is not owned by prebuild user anymore, probably it was claimed") + } + + c.logger.Info(ctx, "attempting to delete prebuild", + slog.F("workspace_id", prebuiltWorkspaceID.String()), slog.F("preset_id", presetID.String())) + + return c.provision(ctx, db, prebuiltWorkspaceID, template, presetID, database.WorkspaceTransitionDelete, workspace) + }, &database.TxOptions{ + Isolation: sql.LevelRepeatableRead, + ReadOnly: false, + }) +} + +func (c *StoreReconciler) provision( + ctx context.Context, + db database.Store, + prebuildID uuid.UUID, + template database.Template, + presetID uuid.UUID, + transition database.WorkspaceTransition, + workspace database.Workspace, +) error { + tvp, err := db.GetPresetParametersByTemplateVersionID(ctx, template.ActiveVersionID) + if err != nil { + return xerrors.Errorf("fetch preset details: %w", err) + } + + var params []codersdk.WorkspaceBuildParameter + for _, param := range tvp { + // TODO: don't fetch in the first place. + if param.TemplateVersionPresetID != presetID { + continue + } + + params = append(params, codersdk.WorkspaceBuildParameter{ + Name: param.Name, + Value: param.Value, + }) + } + + builder := wsbuilder.New(workspace, transition). + Reason(database.BuildReasonInitiator). + Initiator(prebuilds.SystemUserID). + VersionID(template.ActiveVersionID). + MarkPrebuild(). + TemplateVersionPresetID(presetID) + + // We only inject the required params when the prebuild is being created. + // This mirrors the behavior of regular workspace deletion (see cli/delete.go). + if transition != database.WorkspaceTransitionDelete { + builder = builder.RichParameterValues(params) + } + + _, provisionerJob, _, err := builder.Build( + ctx, + db, + func(_ policy.Action, _ rbac.Objecter) bool { + return true // TODO: harden? + }, + audit.WorkspaceBuildBaggage{}, + ) + if err != nil { + return xerrors.Errorf("provision workspace: %w", err) + } + + err = provisionerjobs.PostJob(c.pubsub, *provisionerJob) + if err != nil { + // Client probably doesn't care about this error, so just log it. + c.logger.Error(ctx, "failed to post provisioner job to pubsub", slog.Error(err)) + } + + c.logger.Info(ctx, "prebuild job scheduled", slog.F("transition", transition), + slog.F("prebuild_id", prebuildID.String()), slog.F("preset_id", presetID.String()), + slog.F("job_id", provisionerJob.ID)) + + return nil +} diff --git a/enterprise/coderd/prebuilds/reconcile_test.go b/enterprise/coderd/prebuilds/reconcile_test.go new file mode 100644 index 0000000000000..a5bd4a728a4ea --- /dev/null +++ b/enterprise/coderd/prebuilds/reconcile_test.go @@ -0,0 +1,1027 @@ +package prebuilds_test + +import ( + "context" + "database/sql" + "fmt" + "sync" + "testing" + "time" + + "github.com/coder/coder/v2/coderd/util/slice" + + "github.com/google/uuid" + "github.com/stretchr/testify/require" + "tailscale.com/types/ptr" + + "cdr.dev/slog" + "cdr.dev/slog/sloggers/slogtest" + "github.com/coder/quartz" + + "github.com/coder/serpent" + + "github.com/coder/coder/v2/coderd/database" + "github.com/coder/coder/v2/coderd/database/dbgen" + "github.com/coder/coder/v2/coderd/database/dbtestutil" + "github.com/coder/coder/v2/coderd/database/pubsub" + agplprebuilds "github.com/coder/coder/v2/coderd/prebuilds" + "github.com/coder/coder/v2/codersdk" + "github.com/coder/coder/v2/enterprise/coderd/prebuilds" + "github.com/coder/coder/v2/testutil" +) + +func TestNoReconciliationActionsIfNoPresets(t *testing.T) { + // Scenario: No reconciliation actions are taken if there are no presets + t.Parallel() + + if !dbtestutil.WillUsePostgres() { + t.Skip("This test requires postgres") + } + + clock := quartz.NewMock(t) + ctx := testutil.Context(t, testutil.WaitLong) + db, ps := dbtestutil.NewDB(t) + cfg := codersdk.PrebuildsConfig{ + ReconciliationInterval: serpent.Duration(testutil.WaitLong), + } + logger := testutil.Logger(t) + controller := prebuilds.NewStoreReconciler(db, ps, cfg, logger, quartz.NewMock(t)) + + // given a template version with no presets + org := dbgen.Organization(t, db, database.Organization{}) + user := dbgen.User(t, db, database.User{}) + template := dbgen.Template(t, db, database.Template{ + CreatedBy: user.ID, + OrganizationID: org.ID, + }) + templateVersion := dbgen.TemplateVersion(t, db, database.TemplateVersion{ + TemplateID: uuid.NullUUID{UUID: template.ID, Valid: true}, + OrganizationID: org.ID, + CreatedBy: user.ID, + }) + // verify that the db state is correct + gotTemplateVersion, err := db.GetTemplateVersionByID(ctx, templateVersion.ID) + require.NoError(t, err) + require.Equal(t, templateVersion, gotTemplateVersion) + + // when we trigger the reconciliation loop for all templates + require.NoError(t, controller.ReconcileAll(ctx)) + + // then no reconciliation actions are taken + // because without presets, there are no prebuilds + // and without prebuilds, there is nothing to reconcile + jobs, err := db.GetProvisionerJobsCreatedAfter(ctx, clock.Now().Add(earlier)) + require.NoError(t, err) + require.Empty(t, jobs) +} + +func TestNoReconciliationActionsIfNoPrebuilds(t *testing.T) { + // Scenario: No reconciliation actions are taken if there are no prebuilds + t.Parallel() + + if !dbtestutil.WillUsePostgres() { + t.Skip("This test requires postgres") + } + + clock := quartz.NewMock(t) + ctx := testutil.Context(t, testutil.WaitLong) + db, ps := dbtestutil.NewDB(t) + cfg := codersdk.PrebuildsConfig{ + ReconciliationInterval: serpent.Duration(testutil.WaitLong), + } + logger := testutil.Logger(t) + controller := prebuilds.NewStoreReconciler(db, ps, cfg, logger, quartz.NewMock(t)) + + // given there are presets, but no prebuilds + org := dbgen.Organization(t, db, database.Organization{}) + user := dbgen.User(t, db, database.User{}) + template := dbgen.Template(t, db, database.Template{ + CreatedBy: user.ID, + OrganizationID: org.ID, + }) + templateVersion := dbgen.TemplateVersion(t, db, database.TemplateVersion{ + TemplateID: uuid.NullUUID{UUID: template.ID, Valid: true}, + OrganizationID: org.ID, + CreatedBy: user.ID, + }) + preset, err := db.InsertPreset(ctx, database.InsertPresetParams{ + TemplateVersionID: templateVersion.ID, + Name: "test", + }) + require.NoError(t, err) + _, err = db.InsertPresetParameters(ctx, database.InsertPresetParametersParams{ + TemplateVersionPresetID: preset.ID, + Names: []string{"test"}, + Values: []string{"test"}, + }) + require.NoError(t, err) + + // verify that the db state is correct + presetParameters, err := db.GetPresetParametersByTemplateVersionID(ctx, templateVersion.ID) + require.NoError(t, err) + require.NotEmpty(t, presetParameters) + + // when we trigger the reconciliation loop for all templates + require.NoError(t, controller.ReconcileAll(ctx)) + + // then no reconciliation actions are taken + // because without prebuilds, there is nothing to reconcile + // even if there are presets + jobs, err := db.GetProvisionerJobsCreatedAfter(ctx, clock.Now().Add(earlier)) + require.NoError(t, err) + require.Empty(t, jobs) +} + +func TestPrebuildReconciliation(t *testing.T) { + t.Parallel() + + if !dbtestutil.WillUsePostgres() { + t.Skip("This test requires postgres") + } + + type testCase struct { + name string + prebuildLatestTransitions []database.WorkspaceTransition + prebuildJobStatuses []database.ProvisionerJobStatus + templateVersionActive []bool + templateDeleted []bool + shouldCreateNewPrebuild *bool + shouldDeleteOldPrebuild *bool + } + + testCases := []testCase{ + { + name: "never create prebuilds for inactive template versions", + prebuildLatestTransitions: allTransitions, + prebuildJobStatuses: allJobStatuses, + templateVersionActive: []bool{false}, + shouldCreateNewPrebuild: ptr.To(false), + templateDeleted: []bool{false}, + }, + { + name: "no need to create a new prebuild if one is already running", + prebuildLatestTransitions: []database.WorkspaceTransition{ + database.WorkspaceTransitionStart, + }, + prebuildJobStatuses: []database.ProvisionerJobStatus{ + database.ProvisionerJobStatusSucceeded, + }, + templateVersionActive: []bool{true}, + shouldCreateNewPrebuild: ptr.To(false), + templateDeleted: []bool{false}, + }, + { + name: "don't create a new prebuild if one is queued to build or already building", + prebuildLatestTransitions: []database.WorkspaceTransition{ + database.WorkspaceTransitionStart, + }, + prebuildJobStatuses: []database.ProvisionerJobStatus{ + database.ProvisionerJobStatusPending, + database.ProvisionerJobStatusRunning, + }, + templateVersionActive: []bool{true}, + shouldCreateNewPrebuild: ptr.To(false), + templateDeleted: []bool{false}, + }, + { + name: "create a new prebuild if one is in a state that disqualifies it from ever being claimed", + prebuildLatestTransitions: []database.WorkspaceTransition{ + database.WorkspaceTransitionStop, + database.WorkspaceTransitionDelete, + }, + prebuildJobStatuses: []database.ProvisionerJobStatus{ + database.ProvisionerJobStatusPending, + database.ProvisionerJobStatusRunning, + database.ProvisionerJobStatusCanceling, + database.ProvisionerJobStatusSucceeded, + }, + templateVersionActive: []bool{true}, + shouldCreateNewPrebuild: ptr.To(true), + templateDeleted: []bool{false}, + }, + { + // See TestFailedBuildBackoff for the start/failed case. + name: "create a new prebuild if one is in any kind of exceptional state", + prebuildLatestTransitions: []database.WorkspaceTransition{ + database.WorkspaceTransitionStop, + database.WorkspaceTransitionDelete, + }, + prebuildJobStatuses: []database.ProvisionerJobStatus{ + database.ProvisionerJobStatusCanceled, + }, + templateVersionActive: []bool{true}, + shouldCreateNewPrebuild: ptr.To(true), + templateDeleted: []bool{false}, + }, + { + name: "never attempt to interfere with active builds", + // The workspace builder does not allow scheduling a new build if there is already a build + // pending, running, or canceling. As such, we should never attempt to start, stop or delete + // such prebuilds. Rather, we should wait for the existing build to complete and reconcile + // again in the next cycle. + prebuildLatestTransitions: allTransitions, + prebuildJobStatuses: []database.ProvisionerJobStatus{ + database.ProvisionerJobStatusPending, + database.ProvisionerJobStatusRunning, + database.ProvisionerJobStatusCanceling, + }, + templateVersionActive: []bool{true, false}, + shouldDeleteOldPrebuild: ptr.To(false), + templateDeleted: []bool{false}, + }, + { + name: "never delete prebuilds in an exceptional state", + // We don't want to destroy evidence that might be useful to operators + // when troubleshooting issues. So we leave these prebuilds in place. + // Operators are expected to manually delete these prebuilds. + prebuildLatestTransitions: allTransitions, + prebuildJobStatuses: []database.ProvisionerJobStatus{ + database.ProvisionerJobStatusCanceled, + database.ProvisionerJobStatusFailed, + }, + templateVersionActive: []bool{true, false}, + shouldDeleteOldPrebuild: ptr.To(false), + templateDeleted: []bool{false}, + }, + { + name: "delete running prebuilds for inactive template versions", + // We only support prebuilds for active template versions. + // If a template version is inactive, we should delete any prebuilds + // that are running. + prebuildLatestTransitions: []database.WorkspaceTransition{ + database.WorkspaceTransitionStart, + }, + prebuildJobStatuses: []database.ProvisionerJobStatus{ + database.ProvisionerJobStatusSucceeded, + }, + templateVersionActive: []bool{false}, + shouldDeleteOldPrebuild: ptr.To(true), + templateDeleted: []bool{false}, + }, + { + name: "don't delete running prebuilds for active template versions", + prebuildLatestTransitions: []database.WorkspaceTransition{ + database.WorkspaceTransitionStart, + }, + prebuildJobStatuses: []database.ProvisionerJobStatus{ + database.ProvisionerJobStatusSucceeded, + }, + templateVersionActive: []bool{true}, + shouldDeleteOldPrebuild: ptr.To(false), + templateDeleted: []bool{false}, + }, + { + name: "don't delete stopped or already deleted prebuilds", + // We don't ever stop prebuilds. A stopped prebuild is an exceptional state. + // As such we keep it, to allow operators to investigate the cause. + prebuildLatestTransitions: []database.WorkspaceTransition{ + database.WorkspaceTransitionStop, + database.WorkspaceTransitionDelete, + }, + prebuildJobStatuses: []database.ProvisionerJobStatus{ + database.ProvisionerJobStatusSucceeded, + }, + templateVersionActive: []bool{true, false}, + shouldDeleteOldPrebuild: ptr.To(false), + templateDeleted: []bool{false}, + }, + { + name: "delete prebuilds for deleted templates", + prebuildLatestTransitions: []database.WorkspaceTransition{database.WorkspaceTransitionStart}, + prebuildJobStatuses: []database.ProvisionerJobStatus{database.ProvisionerJobStatusSucceeded}, + templateVersionActive: []bool{true, false}, + shouldDeleteOldPrebuild: ptr.To(true), + templateDeleted: []bool{true}, + }, + } + for _, tc := range testCases { + tc := tc // capture for parallel + for _, templateVersionActive := range tc.templateVersionActive { + for _, prebuildLatestTransition := range tc.prebuildLatestTransitions { + for _, prebuildJobStatus := range tc.prebuildJobStatuses { + for _, templateDeleted := range tc.templateDeleted { + t.Run(fmt.Sprintf("%s - %s - %s", tc.name, prebuildLatestTransition, prebuildJobStatus), func(t *testing.T) { + t.Parallel() + t.Cleanup(func() { + if t.Failed() { + t.Logf("failed to run test: %s", tc.name) + t.Logf("templateVersionActive: %t", templateVersionActive) + t.Logf("prebuildLatestTransition: %s", prebuildLatestTransition) + t.Logf("prebuildJobStatus: %s", prebuildJobStatus) + } + }) + clock := quartz.NewMock(t) + ctx := testutil.Context(t, testutil.WaitShort) + cfg := codersdk.PrebuildsConfig{} + logger := slogtest.Make( + t, &slogtest.Options{IgnoreErrors: true}, + ).Leveled(slog.LevelDebug) + db, pubSub := dbtestutil.NewDB(t) + controller := prebuilds.NewStoreReconciler(db, pubSub, cfg, logger, quartz.NewMock(t)) + + ownerID := uuid.New() + dbgen.User(t, db, database.User{ + ID: ownerID, + }) + org, template := setupTestDBTemplate(t, db, ownerID, templateDeleted) + templateVersionID := setupTestDBTemplateVersion( + ctx, + t, + clock, + db, + pubSub, + org.ID, + ownerID, + template.ID, + ) + preset := setupTestDBPreset( + t, + db, + templateVersionID, + 1, + uuid.New().String(), + ) + prebuild := setupTestDBPrebuild( + t, + clock, + db, + pubSub, + prebuildLatestTransition, + prebuildJobStatus, + org.ID, + preset, + template.ID, + templateVersionID, + ) + + if !templateVersionActive { + // Create a new template version and mark it as active + // This marks the template version that we care about as inactive + setupTestDBTemplateVersion(ctx, t, clock, db, pubSub, org.ID, ownerID, template.ID) + } + + // Run the reconciliation multiple times to ensure idempotency + // 8 was arbitrary, but large enough to reasonably trust the result + for i := 1; i <= 8; i++ { + require.NoErrorf(t, controller.ReconcileAll(ctx), "failed on iteration %d", i) + + if tc.shouldCreateNewPrebuild != nil { + newPrebuildCount := 0 + workspaces, err := db.GetWorkspacesByTemplateID(ctx, template.ID) + require.NoError(t, err) + for _, workspace := range workspaces { + if workspace.ID != prebuild.ID { + newPrebuildCount++ + } + } + // This test configures a preset that desires one prebuild. + // In cases where new prebuilds should be created, there should be exactly one. + require.Equal(t, *tc.shouldCreateNewPrebuild, newPrebuildCount == 1) + } + + if tc.shouldDeleteOldPrebuild != nil { + builds, err := db.GetWorkspaceBuildsByWorkspaceID(ctx, database.GetWorkspaceBuildsByWorkspaceIDParams{ + WorkspaceID: prebuild.ID, + }) + require.NoError(t, err) + if *tc.shouldDeleteOldPrebuild { + require.Equal(t, 2, len(builds)) + require.Equal(t, database.WorkspaceTransitionDelete, builds[0].Transition) + } else { + require.Equal(t, 1, len(builds)) + require.Equal(t, prebuildLatestTransition, builds[0].Transition) + } + } + } + }) + } + } + } + } + } +} + +func TestMultiplePresetsPerTemplateVersion(t *testing.T) { + t.Parallel() + + if !dbtestutil.WillUsePostgres() { + t.Skip("This test requires postgres") + } + + prebuildLatestTransition := database.WorkspaceTransitionStart + prebuildJobStatus := database.ProvisionerJobStatusRunning + templateDeleted := false + + clock := quartz.NewMock(t) + ctx := testutil.Context(t, testutil.WaitShort) + cfg := codersdk.PrebuildsConfig{} + logger := slogtest.Make( + t, &slogtest.Options{IgnoreErrors: true}, + ).Leveled(slog.LevelDebug) + db, pubSub := dbtestutil.NewDB(t) + controller := prebuilds.NewStoreReconciler(db, pubSub, cfg, logger, quartz.NewMock(t)) + + ownerID := uuid.New() + dbgen.User(t, db, database.User{ + ID: ownerID, + }) + org, template := setupTestDBTemplate(t, db, ownerID, templateDeleted) + templateVersionID := setupTestDBTemplateVersion( + ctx, + t, + clock, + db, + pubSub, + org.ID, + ownerID, + template.ID, + ) + preset := setupTestDBPreset( + t, + db, + templateVersionID, + 4, + uuid.New().String(), + ) + preset2 := setupTestDBPreset( + t, + db, + templateVersionID, + 10, + uuid.New().String(), + ) + prebuildIDs := make([]uuid.UUID, 0) + for i := 0; i < int(preset.DesiredInstances.Int32); i++ { + prebuild := setupTestDBPrebuild( + t, + clock, + db, + pubSub, + prebuildLatestTransition, + prebuildJobStatus, + org.ID, + preset, + template.ID, + templateVersionID, + ) + prebuildIDs = append(prebuildIDs, prebuild.ID) + } + + // Run the reconciliation multiple times to ensure idempotency + // 8 was arbitrary, but large enough to reasonably trust the result + for i := 1; i <= 8; i++ { + require.NoErrorf(t, controller.ReconcileAll(ctx), "failed on iteration %d", i) + + newPrebuildCount := 0 + workspaces, err := db.GetWorkspacesByTemplateID(ctx, template.ID) + require.NoError(t, err) + for _, workspace := range workspaces { + if slice.Contains(prebuildIDs, workspace.ID) { + continue + } + newPrebuildCount++ + } + + // NOTE: preset1 doesn't block creation of instances in preset2 + require.Equal(t, preset2.DesiredInstances.Int32, int32(newPrebuildCount)) // nolint:gosec + } +} + +func TestInvalidPreset(t *testing.T) { + t.Parallel() + + if !dbtestutil.WillUsePostgres() { + t.Skip("This test requires postgres") + } + + templateDeleted := false + + clock := quartz.NewMock(t) + ctx := testutil.Context(t, testutil.WaitShort) + cfg := codersdk.PrebuildsConfig{} + logger := slogtest.Make( + t, &slogtest.Options{IgnoreErrors: true}, + ).Leveled(slog.LevelDebug) + db, pubSub := dbtestutil.NewDB(t) + controller := prebuilds.NewStoreReconciler(db, pubSub, cfg, logger, quartz.NewMock(t)) + + ownerID := uuid.New() + dbgen.User(t, db, database.User{ + ID: ownerID, + }) + org, template := setupTestDBTemplate(t, db, ownerID, templateDeleted) + templateVersionID := setupTestDBTemplateVersion( + ctx, + t, + clock, + db, + pubSub, + org.ID, + ownerID, + template.ID, + ) + // Add required param, which is not set in preset. It means that creating of prebuild will constantly fail. + dbgen.TemplateVersionParameter(t, db, database.TemplateVersionParameter{ + TemplateVersionID: templateVersionID, + Name: "required-param", + Description: "required param to make sure creating prebuild will fail", + Type: "bool", + DefaultValue: "", + Required: true, + }) + setupTestDBPreset( + t, + db, + templateVersionID, + 1, + uuid.New().String(), + ) + + // Run the reconciliation multiple times to ensure idempotency + // 8 was arbitrary, but large enough to reasonably trust the result + for i := 1; i <= 8; i++ { + require.NoErrorf(t, controller.ReconcileAll(ctx), "failed on iteration %d", i) + + workspaces, err := db.GetWorkspacesByTemplateID(ctx, template.ID) + require.NoError(t, err) + newPrebuildCount := len(workspaces) + + // NOTE: we don't have any new prebuilds, because their creation constantly fails. + require.Equal(t, int32(0), int32(newPrebuildCount)) // nolint:gosec + } +} + +func TestRunLoop(t *testing.T) { + t.Parallel() + + if !dbtestutil.WillUsePostgres() { + t.Skip("This test requires postgres") + } + + prebuildLatestTransition := database.WorkspaceTransitionStart + prebuildJobStatus := database.ProvisionerJobStatusRunning + templateDeleted := false + + clock := quartz.NewMock(t) + ctx := testutil.Context(t, testutil.WaitShort) + backoffInterval := time.Minute + cfg := codersdk.PrebuildsConfig{ + // Given: explicitly defined backoff configuration to validate timings. + ReconciliationBackoffLookback: serpent.Duration(muchEarlier * -10), // Has to be positive. + ReconciliationBackoffInterval: serpent.Duration(backoffInterval), + ReconciliationInterval: serpent.Duration(time.Second), + } + logger := slogtest.Make( + t, &slogtest.Options{IgnoreErrors: true}, + ).Leveled(slog.LevelDebug) + db, pubSub := dbtestutil.NewDB(t) + controller := prebuilds.NewStoreReconciler(db, pubSub, cfg, logger, clock) + + ownerID := uuid.New() + dbgen.User(t, db, database.User{ + ID: ownerID, + }) + org, template := setupTestDBTemplate(t, db, ownerID, templateDeleted) + templateVersionID := setupTestDBTemplateVersion( + ctx, + t, + clock, + db, + pubSub, + org.ID, + ownerID, + template.ID, + ) + preset := setupTestDBPreset( + t, + db, + templateVersionID, + 4, + uuid.New().String(), + ) + preset2 := setupTestDBPreset( + t, + db, + templateVersionID, + 10, + uuid.New().String(), + ) + prebuildIDs := make([]uuid.UUID, 0) + for i := 0; i < int(preset.DesiredInstances.Int32); i++ { + prebuild := setupTestDBPrebuild( + t, + clock, + db, + pubSub, + prebuildLatestTransition, + prebuildJobStatus, + org.ID, + preset, + template.ID, + templateVersionID, + ) + prebuildIDs = append(prebuildIDs, prebuild.ID) + } + getNewPrebuildCount := func() int32 { + newPrebuildCount := 0 + workspaces, err := db.GetWorkspacesByTemplateID(ctx, template.ID) + require.NoError(t, err) + for _, workspace := range workspaces { + if slice.Contains(prebuildIDs, workspace.ID) { + continue + } + newPrebuildCount++ + } + + return int32(newPrebuildCount) // nolint:gosec + } + + // we need to wait until ticker is initialized, and only then use clock.Advance() + // otherwise clock.Advance() will be ignored + trap := clock.Trap().NewTicker() + go controller.RunLoop(ctx) + // wait until ticker is initialized + trap.MustWait(ctx).Release() + // start 1st iteration of ReconciliationLoop + // NOTE: at this point MustWait waits that iteration is started (ReconcileAll is called), but it doesn't wait until it completes + clock.Advance(cfg.ReconciliationInterval.Value()).MustWait(ctx) + + // wait until ReconcileAll is completed + // TODO: is it possible to avoid Eventually and replace it with quartz? + // Ideally to have all control on test-level, and be able to advance loop iterations from the test. + require.Eventually(t, func() bool { + newPrebuildCount := getNewPrebuildCount() + + // NOTE: preset1 doesn't block creation of instances in preset2 + return preset2.DesiredInstances.Int32 == newPrebuildCount + }, testutil.WaitShort, testutil.IntervalFast) + + // setup one more preset with 5 prebuilds + preset3 := setupTestDBPreset( + t, + db, + templateVersionID, + 5, + uuid.New().String(), + ) + newPrebuildCount := getNewPrebuildCount() + // nothing changed, because we didn't trigger a new iteration of a loop + require.Equal(t, preset2.DesiredInstances.Int32, newPrebuildCount) + + // start 2nd iteration of ReconciliationLoop + // NOTE: at this point MustWait waits that iteration is started (ReconcileAll is called), but it doesn't wait until it completes + clock.Advance(cfg.ReconciliationInterval.Value()).MustWait(ctx) + + // wait until ReconcileAll is completed + require.Eventually(t, func() bool { + newPrebuildCount := getNewPrebuildCount() + + // both prebuilds for preset2 and preset3 were created + return preset2.DesiredInstances.Int32+preset3.DesiredInstances.Int32 == newPrebuildCount + }, testutil.WaitShort, testutil.IntervalFast) + + // gracefully stop the reconciliation loop + controller.Stop(ctx, nil) +} + +func TestFailedBuildBackoff(t *testing.T) { + t.Parallel() + + if !dbtestutil.WillUsePostgres() { + t.Skip("This test requires postgres") + } + ctx := testutil.Context(t, testutil.WaitSuperLong) + + // Setup. + clock := quartz.NewMock(t) + backoffInterval := time.Minute + cfg := codersdk.PrebuildsConfig{ + // Given: explicitly defined backoff configuration to validate timings. + ReconciliationBackoffLookback: serpent.Duration(muchEarlier * -10), // Has to be positive. + ReconciliationBackoffInterval: serpent.Duration(backoffInterval), + ReconciliationInterval: serpent.Duration(time.Second), + } + logger := slogtest.Make( + t, &slogtest.Options{IgnoreErrors: true}, + ).Leveled(slog.LevelDebug) + db, ps := dbtestutil.NewDB(t) + reconciler := prebuilds.NewStoreReconciler(db, ps, cfg, logger, clock) + + // Given: an active template version with presets and prebuilds configured. + const desiredInstances = 2 + userID := uuid.New() + dbgen.User(t, db, database.User{ + ID: userID, + }) + org, template := setupTestDBTemplate(t, db, userID, false) + templateVersionID := setupTestDBTemplateVersion(ctx, t, clock, db, ps, org.ID, userID, template.ID) + + preset := setupTestDBPreset(t, db, templateVersionID, desiredInstances, "test") + for range desiredInstances { + _ = setupTestDBPrebuild(t, clock, db, ps, database.WorkspaceTransitionStart, database.ProvisionerJobStatusFailed, org.ID, preset, template.ID, templateVersionID) + } + + // When: determining what actions to take next, backoff is calculated because the prebuild is in a failed state. + snapshot, err := reconciler.SnapshotState(ctx, db) + require.NoError(t, err) + require.Len(t, snapshot.Presets, 1) + presetState, err := snapshot.FilterByPreset(preset.ID) + require.NoError(t, err) + state := presetState.CalculateState() + actions, err := reconciler.CalculateActions(ctx, *presetState) + require.NoError(t, err) + + // Then: the backoff time is in the future, no prebuilds are running, and we won't create any new prebuilds. + require.EqualValues(t, 0, state.Actual) + require.EqualValues(t, 0, actions.Create) + require.EqualValues(t, desiredInstances, state.Desired) + require.True(t, clock.Now().Before(actions.BackoffUntil)) + + // Then: the backoff time is as expected based on the number of failed builds. + require.NotNil(t, presetState.Backoff) + require.EqualValues(t, desiredInstances, presetState.Backoff.NumFailed) + require.EqualValues(t, backoffInterval*time.Duration(presetState.Backoff.NumFailed), clock.Until(actions.BackoffUntil).Truncate(backoffInterval)) + + // When: advancing to the next tick which is still within the backoff time. + clock.Advance(cfg.ReconciliationInterval.Value()) + + // Then: the backoff interval will not have changed. + snapshot, err = reconciler.SnapshotState(ctx, db) + require.NoError(t, err) + presetState, err = snapshot.FilterByPreset(preset.ID) + require.NoError(t, err) + newState := presetState.CalculateState() + newActions, err := reconciler.CalculateActions(ctx, *presetState) + require.NoError(t, err) + require.EqualValues(t, 0, newState.Actual) + require.EqualValues(t, 0, newActions.Create) + require.EqualValues(t, desiredInstances, newState.Desired) + require.EqualValues(t, actions.BackoffUntil, newActions.BackoffUntil) + + // When: advancing beyond the backoff time. + clock.Advance(clock.Until(actions.BackoffUntil.Add(time.Second))) + + // Then: we will attempt to create a new prebuild. + snapshot, err = reconciler.SnapshotState(ctx, db) + require.NoError(t, err) + presetState, err = snapshot.FilterByPreset(preset.ID) + require.NoError(t, err) + state = presetState.CalculateState() + actions, err = reconciler.CalculateActions(ctx, *presetState) + require.NoError(t, err) + require.EqualValues(t, 0, state.Actual) + require.EqualValues(t, desiredInstances, state.Desired) + require.EqualValues(t, desiredInstances, actions.Create) + + // When: the desired number of new prebuild are provisioned, but one fails again. + for i := 0; i < desiredInstances; i++ { + status := database.ProvisionerJobStatusFailed + if i == 1 { + status = database.ProvisionerJobStatusSucceeded + } + _ = setupTestDBPrebuild(t, clock, db, ps, database.WorkspaceTransitionStart, status, org.ID, preset, template.ID, templateVersionID) + } + + // Then: the backoff time is roughly equal to two backoff intervals, since another build has failed. + snapshot, err = reconciler.SnapshotState(ctx, db) + require.NoError(t, err) + presetState, err = snapshot.FilterByPreset(preset.ID) + require.NoError(t, err) + state = presetState.CalculateState() + actions, err = reconciler.CalculateActions(ctx, *presetState) + require.NoError(t, err) + require.EqualValues(t, 1, state.Actual) + require.EqualValues(t, desiredInstances, state.Desired) + require.EqualValues(t, 0, actions.Create) + require.EqualValues(t, 3, presetState.Backoff.NumFailed) + require.EqualValues(t, backoffInterval*time.Duration(presetState.Backoff.NumFailed), clock.Until(actions.BackoffUntil).Truncate(backoffInterval)) +} + +func TestReconciliationLock(t *testing.T) { + t.Parallel() + + if !dbtestutil.WillUsePostgres() { + t.Skip("This test requires postgres") + } + + ctx := testutil.Context(t, testutil.WaitSuperLong) + logger := slogtest.Make(t, &slogtest.Options{IgnoreErrors: true}).Leveled(slog.LevelDebug) + db, ps := dbtestutil.NewDB(t) + + wg := sync.WaitGroup{} + mutex := sync.Mutex{} + for i := 0; i < 5; i++ { + wg.Add(1) + go func() { + defer wg.Done() + reconciler := prebuilds.NewStoreReconciler( + db, + ps, + codersdk.PrebuildsConfig{}, + slogtest.Make(t, &slogtest.Options{IgnoreErrors: true}).Leveled(slog.LevelDebug), + quartz.NewMock(t), + ) + reconciler.WithReconciliationLock(ctx, logger, func(_ context.Context, _ database.Store) error { + lockObtained := mutex.TryLock() + // As long as the postgres lock is held, this mutex should always be unlocked when we get here. + // If this mutex is ever locked at this point, then that means that the postgres lock is not being held while we're + // inside WithReconciliationLock, which is meant to hold the lock. + require.True(t, lockObtained) + // Sleep a bit to give reconcilers more time to contend for the lock + time.Sleep(time.Second) + defer mutex.Unlock() + return nil + }) + }() + } + wg.Wait() +} + +// nolint:revive // It's a control flag, but this is a test. +func setupTestDBTemplate( + t *testing.T, + db database.Store, + userID uuid.UUID, + templateDeleted bool, +) ( + database.Organization, + database.Template, +) { + t.Helper() + org := dbgen.Organization(t, db, database.Organization{}) + + template := dbgen.Template(t, db, database.Template{ + CreatedBy: userID, + OrganizationID: org.ID, + CreatedAt: time.Now().Add(muchEarlier), + }) + if templateDeleted { + ctx := testutil.Context(t, testutil.WaitShort) + require.NoError(t, db.UpdateTemplateDeletedByID(ctx, database.UpdateTemplateDeletedByIDParams{ + ID: template.ID, + Deleted: true, + })) + } + return org, template +} + +const ( + earlier = -time.Hour + muchEarlier = -time.Hour * 2 +) + +func setupTestDBTemplateVersion( + ctx context.Context, + t *testing.T, + clock quartz.Clock, + db database.Store, + ps pubsub.Pubsub, + orgID uuid.UUID, + userID uuid.UUID, + templateID uuid.UUID, +) uuid.UUID { + t.Helper() + templateVersionJob := dbgen.ProvisionerJob(t, db, ps, database.ProvisionerJob{ + CreatedAt: clock.Now().Add(muchEarlier), + CompletedAt: sql.NullTime{Time: clock.Now().Add(earlier), Valid: true}, + OrganizationID: orgID, + InitiatorID: userID, + }) + templateVersion := dbgen.TemplateVersion(t, db, database.TemplateVersion{ + TemplateID: uuid.NullUUID{UUID: templateID, Valid: true}, + OrganizationID: orgID, + CreatedBy: userID, + JobID: templateVersionJob.ID, + CreatedAt: time.Now().Add(muchEarlier), + }) + require.NoError(t, db.UpdateTemplateActiveVersionByID(ctx, database.UpdateTemplateActiveVersionByIDParams{ + ID: templateID, + ActiveVersionID: templateVersion.ID, + })) + return templateVersion.ID +} + +func setupTestDBPreset( + t *testing.T, + db database.Store, + templateVersionID uuid.UUID, + desiredInstances int32, + presetName string, +) database.TemplateVersionPreset { + t.Helper() + preset := dbgen.Preset(t, db, database.InsertPresetParams{ + TemplateVersionID: templateVersionID, + Name: presetName, + DesiredInstances: sql.NullInt32{ + Valid: true, + Int32: desiredInstances, + }, + }) + dbgen.PresetParameter(t, db, database.InsertPresetParametersParams{ + TemplateVersionPresetID: preset.ID, + Names: []string{"test"}, + Values: []string{"test"}, + }) + return preset +} + +func setupTestDBPrebuild( + t *testing.T, + clock quartz.Clock, + db database.Store, + ps pubsub.Pubsub, + transition database.WorkspaceTransition, + prebuildStatus database.ProvisionerJobStatus, + orgID uuid.UUID, + preset database.TemplateVersionPreset, + templateID uuid.UUID, + templateVersionID uuid.UUID, +) database.WorkspaceTable { + t.Helper() + return setupTestDBWorkspace(t, clock, db, ps, transition, prebuildStatus, orgID, preset, templateID, templateVersionID, agplprebuilds.SystemUserID, agplprebuilds.SystemUserID) +} + +func setupTestDBWorkspace( + t *testing.T, + clock quartz.Clock, + db database.Store, + ps pubsub.Pubsub, + transition database.WorkspaceTransition, + prebuildStatus database.ProvisionerJobStatus, + orgID uuid.UUID, + preset database.TemplateVersionPreset, + templateID uuid.UUID, + templateVersionID uuid.UUID, + initiatorID uuid.UUID, + ownerID uuid.UUID, +) database.WorkspaceTable { + t.Helper() + cancelledAt := sql.NullTime{} + completedAt := sql.NullTime{} + + startedAt := sql.NullTime{} + if prebuildStatus != database.ProvisionerJobStatusPending { + startedAt = sql.NullTime{Time: clock.Now().Add(muchEarlier), Valid: true} + } + + buildError := sql.NullString{} + if prebuildStatus == database.ProvisionerJobStatusFailed { + completedAt = sql.NullTime{Time: clock.Now().Add(earlier), Valid: true} + buildError = sql.NullString{String: "build failed", Valid: true} + } + + switch prebuildStatus { + case database.ProvisionerJobStatusCanceling: + cancelledAt = sql.NullTime{Time: clock.Now().Add(earlier), Valid: true} + case database.ProvisionerJobStatusCanceled: + completedAt = sql.NullTime{Time: clock.Now().Add(earlier), Valid: true} + cancelledAt = sql.NullTime{Time: clock.Now().Add(earlier), Valid: true} + case database.ProvisionerJobStatusSucceeded: + completedAt = sql.NullTime{Time: clock.Now().Add(earlier), Valid: true} + default: + } + + workspace := dbgen.Workspace(t, db, database.WorkspaceTable{ + TemplateID: templateID, + OrganizationID: orgID, + OwnerID: ownerID, + Deleted: false, + }) + job := dbgen.ProvisionerJob(t, db, ps, database.ProvisionerJob{ + InitiatorID: initiatorID, + CreatedAt: clock.Now().Add(muchEarlier), + StartedAt: startedAt, + CompletedAt: completedAt, + CanceledAt: cancelledAt, + OrganizationID: orgID, + Error: buildError, + }) + dbgen.WorkspaceBuild(t, db, database.WorkspaceBuild{ + WorkspaceID: workspace.ID, + InitiatorID: initiatorID, + TemplateVersionID: templateVersionID, + JobID: job.ID, + TemplateVersionPresetID: uuid.NullUUID{UUID: preset.ID, Valid: true}, + Transition: transition, + CreatedAt: clock.Now(), + }) + + return workspace +} + +var allTransitions = []database.WorkspaceTransition{ + database.WorkspaceTransitionStart, + database.WorkspaceTransitionStop, + database.WorkspaceTransitionDelete, +} + +var allJobStatuses = []database.ProvisionerJobStatus{ + database.ProvisionerJobStatusPending, + database.ProvisionerJobStatusRunning, + database.ProvisionerJobStatusSucceeded, + database.ProvisionerJobStatusFailed, + database.ProvisionerJobStatusCanceled, + database.ProvisionerJobStatusCanceling, +} + +// TODO (sasswart): test mutual exclusion diff --git a/site/src/api/typesGenerated.ts b/site/src/api/typesGenerated.ts index 38e8e91ac8c1a..f01cb9c98dc64 100644 --- a/site/src/api/typesGenerated.ts +++ b/site/src/api/typesGenerated.ts @@ -1695,6 +1695,13 @@ export interface PprofConfig { readonly address: string; } +// From codersdk/deployment.go +export interface PrebuildsConfig { + readonly reconciliation_interval: number; + readonly reconciliation_backoff_interval: number; + readonly reconciliation_backoff_lookback: number; +} + // From codersdk/presets.go export interface Preset { readonly ID: string;