Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
38 commits
Select commit Hold shift + click to select a range
b32923a
feat: log resource replacements
dannykopping Apr 25, 2025
0b0830f
feat: show terraform state drift diff in build logs
dannykopping Apr 25, 2025
256395a
feat: only highlight lines which mention replacement
dannykopping Apr 25, 2025
61ef61a
feat: notify template admins when prebuild claim results in resource …
dannykopping Apr 25, 2025
a66559f
chore: appease linter
dannykopping Apr 25, 2025
222892b
chore: fix notifications test
dannykopping Apr 25, 2025
f34e011
fix: don't panic
dannykopping Apr 28, 2025
5168c01
fix: renaming type
dannykopping Apr 28, 2025
41e5e0c
chore: updating migration numbers
dannykopping May 6, 2025
b29e8fa
chore: minor touch-ups
dannykopping May 6, 2025
b31ed5e
feat: add resource replacements metric
dannykopping May 7, 2025
adf98d2
feat: add resource replacement notification
dannykopping May 7, 2025
f24aef0
make lint; make fmt
dannykopping May 7, 2025
70f9a53
chore: adding tests
dannykopping May 8, 2025
1e8385d
feat: pass flag to terraform provider when prebuilt workspace claimed
dannykopping May 9, 2025
d0f00ce
chore: update provider, add test for is_prebuild_claim
dannykopping May 12, 2025
11a2c5a
Merge branch 'main' of github.com:/coder/coder into dk/logreplacements
dannykopping May 12, 2025
ce63b24
Merge branch 'dk/is-prebuild-claim' of github.com:/coder/coder into d…
dannykopping May 12, 2025
d2c5d43
chore: replace GetTemplatePresetsByID with GetPresetByID
dannykopping May 12, 2025
22d82a4
chore: correcting docs link
dannykopping May 12, 2025
5209aae
Merge branch 'main' of github.com:/coder/coder into dk/logreplacement
dannykopping May 12, 2025
39ce658
Merge branch 'main' of github.com:/coder/coder into dk/logreplacements
dannykopping May 12, 2025
ac5655f
Merge branch 'main' of github.com:/coder/coder into dk/logreplacements
dannykopping May 12, 2025
82c3f58
chore: note provisioner API change
dannykopping May 12, 2025
7577a90
chore: fixups
dannykopping May 13, 2025
a893b79
chore: adding note about immutable resources
dannykopping May 13, 2025
d9c906a
chore: review feedback
dannykopping May 13, 2025
471198a
Merge branch 'main' of github.com:/coder/coder into dk/logreplacements
dannykopping May 13, 2025
7d694e6
chore: merge conflicts
dannykopping May 13, 2025
6b7a8b7
chore: fix 'is not iterable' bullshit
dannykopping May 13, 2025
5df2cb3
Merge branch 'main' of github.com:/coder/coder into dk/logreplacements
dannykopping May 14, 2025
6d1c3ea
chore: rename migrations
dannykopping May 14, 2025
5f62702
chore: set notifications manager before enterprise server initializes…
dannykopping May 14, 2025
f74d799
chore: completing refactor since https://github.com/coder/coder/pull/…
dannykopping May 14, 2025
971f65c
chore: remove unnecessary atomicity since map is protected by mutex a…
dannykopping May 14, 2025
bc362b0
chore: appeasing linter's Very Important Suggestion
dannykopping May 14, 2025
4fbd356
Merge branch 'main' of github.com:/coder/coder into dk/logreplacements
dannykopping May 14, 2025
b9eb8be
chore: remove old replacement logging
dannykopping May 14, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
chore: adding tests
Signed-off-by: Danny Kopping <dannykopping@gmail.com>
  • Loading branch information
dannykopping committed May 8, 2025
commit 70f9a53d6d7be5c98b7ecd21f078d4ec140dc547
7 changes: 1 addition & 6 deletions coderd/coderd.go
Original file line number Diff line number Diff line change
Expand Up @@ -1763,11 +1763,6 @@ func (api *API) CreateInMemoryTaggedProvisionerDaemon(dialCtx context.Context, n
return nil, xerrors.Errorf("failed to create in-memory provisioner daemon: %w", err)
}

var prebuildsOrchestrator prebuilds.ReconciliationOrchestrator
if val := api.PrebuildsReconciler.Load(); val != nil {
prebuildsOrchestrator = *val
}

mux := drpcmux.New()
api.Logger.Debug(dialCtx, "starting in-memory provisioner daemon", slog.F("name", name))
logger := api.Logger.Named(fmt.Sprintf("inmem-provisionerd-%s", name))
Expand Down Expand Up @@ -1795,7 +1790,7 @@ func (api *API) CreateInMemoryTaggedProvisionerDaemon(dialCtx context.Context, n
Clock: api.Clock,
},
api.NotificationsEnqueuer,
prebuildsOrchestrator,
&api.PrebuildsReconciler,
)
if err != nil {
return nil, err
Expand Down
7 changes: 7 additions & 0 deletions coderd/notifications/notificationstest/fake_enqueuer.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import (
"github.com/prometheus/client_golang/prometheus"

"github.com/coder/coder/v2/coderd/database/dbauthz"
"github.com/coder/coder/v2/coderd/notifications"
"github.com/coder/coder/v2/coderd/rbac"
"github.com/coder/coder/v2/coderd/rbac/policy"
)
Expand All @@ -19,6 +20,12 @@ type FakeEnqueuer struct {
sent []*FakeNotification
}

var _ notifications.Enqueuer = &FakeEnqueuer{}

// NewFakeEnqueuer returns a newly allocated, zero-valued FakeEnqueuer exposed
// through the notifications.Enqueuer interface.
func NewFakeEnqueuer() notifications.Enqueuer {
	var enqueuer notifications.Enqueuer = new(FakeEnqueuer)
	return enqueuer
}

type FakeNotification struct {
UserID, TemplateID uuid.UUID
Labels map[string]string
Expand Down
1 change: 1 addition & 0 deletions coderd/prebuilds/api.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ type ReconciliationOrchestrator interface {

// TrackResourceReplacement handles a pathological situation whereby a terraform resource is replaced due to drift,
// which can obviate the whole point of pre-provisioning a prebuilt workspace.
// See more detail at https://coder.com/docs/admin/templates/extending-templates/prebuilt-workspaces#preventing-resource-replacement.
TrackResourceReplacement(ctx context.Context, workspaceID, buildID, claimantID uuid.UUID, replacements []*sdkproto.ResourceReplacement)
}

Expand Down
15 changes: 9 additions & 6 deletions coderd/provisionerdserver/provisionerdserver.go
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,7 @@ type server struct {
UserQuietHoursScheduleStore *atomic.Pointer[schedule.UserQuietHoursScheduleStore]
DeploymentValues *codersdk.DeploymentValues
NotificationsEnqueuer notifications.Enqueuer
PrebuildsOrchestrator prebuilds.ReconciliationOrchestrator
PrebuildsOrchestrator *atomic.Pointer[prebuilds.ReconciliationOrchestrator]

OIDCConfig promoauth.OAuth2Config

Expand Down Expand Up @@ -164,7 +164,7 @@ func NewServer(lifecycleCtx context.Context,
deploymentValues *codersdk.DeploymentValues,
options Options,
enqueuer notifications.Enqueuer,
prebuildsOrchestrator prebuilds.ReconciliationOrchestrator,
prebuildsOrchestrator *atomic.Pointer[prebuilds.ReconciliationOrchestrator],
) (proto.DRPCProvisionerDaemonServer, error) {
// Fail-fast if pointers are nil
if lifecycleCtx == nil {
Expand Down Expand Up @@ -1731,10 +1731,13 @@ func (s *server) CompleteJob(ctx context.Context, completed *proto.CompletedJob)
})
}

// Track resource replacements, if there are any.
if resourceReplacements := completed.GetWorkspaceBuild().GetResourceReplacements(); len(resourceReplacements) > 0 {
// Fire and forget.
go s.PrebuildsOrchestrator.TrackResourceReplacement(context.Background(), workspace.ID, workspaceBuild.ID, input.PrebuildClaimedByUser, resourceReplacements)
if s.PrebuildsOrchestrator != nil {
// Track resource replacements, if there are any.
orchestrator := s.PrebuildsOrchestrator.Load()
if resourceReplacements := completed.GetWorkspaceBuild().GetResourceReplacements(); orchestrator != nil && len(resourceReplacements) > 0 {
// Fire and forget.
go (*orchestrator).TrackResourceReplacement(context.Background(), workspace.ID, workspaceBuild.ID, input.PrebuildClaimedByUser, resourceReplacements)
}
}

msg, err := json.Marshal(wspubsub.WorkspaceEvent{
Expand Down
113 changes: 112 additions & 1 deletion coderd/provisionerdserver/provisionerdserver_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1747,6 +1747,109 @@ func TestCompleteJob(t *testing.T) {
})
}
})

t.Run("PrebuiltWorkspaceClaimWithResourceReplacements", func(t *testing.T) {
t.Parallel()

ctx := testutil.Context(t, testutil.WaitLong)

// Given: a mock prebuild orchestrator which stores calls to TrackResourceReplacement.
done := make(chan struct{})
orchestrator := &mockPrebuildsOrchestrator{
ReconciliationOrchestrator: agplprebuilds.DefaultReconciler,
done: done,
}
srv, db, ps, pd := setup(t, false, &overrides{
prebuildsOrchestrator: orchestrator,
})

// Given: a workspace build which simulates claiming a prebuild.
user := dbgen.User(t, db, database.User{})
template := dbgen.Template(t, db, database.Template{
Name: "template",
Provisioner: database.ProvisionerTypeEcho,
OrganizationID: pd.OrganizationID,
})
file := dbgen.File(t, db, database.File{CreatedBy: user.ID})
workspaceTable := dbgen.Workspace(t, db, database.WorkspaceTable{
TemplateID: template.ID,
OwnerID: user.ID,
OrganizationID: pd.OrganizationID,
})
version := dbgen.TemplateVersion(t, db, database.TemplateVersion{
OrganizationID: pd.OrganizationID,
TemplateID: uuid.NullUUID{
UUID: template.ID,
Valid: true,
},
JobID: uuid.New(),
})
build := dbgen.WorkspaceBuild(t, db, database.WorkspaceBuild{
WorkspaceID: workspaceTable.ID,
InitiatorID: user.ID,
TemplateVersionID: version.ID,
Transition: database.WorkspaceTransitionStart,
Reason: database.BuildReasonInitiator,
})
job := dbgen.ProvisionerJob(t, db, ps, database.ProvisionerJob{
FileID: file.ID,
InitiatorID: user.ID,
Type: database.ProvisionerJobTypeWorkspaceBuild,
Input: must(json.Marshal(provisionerdserver.WorkspaceProvisionJob{
WorkspaceBuildID: build.ID,

// Mark the job as a prebuilt workspace claim.
PrebuildClaimedByUser: uuid.New(),
IsPrebuild: false,
})),
OrganizationID: pd.OrganizationID,
})
_, err := db.AcquireProvisionerJob(ctx, database.AcquireProvisionerJobParams{
OrganizationID: pd.OrganizationID,
WorkerID: uuid.NullUUID{
UUID: pd.ID,
Valid: true,
},
Types: []database.ProvisionerType{database.ProvisionerTypeEcho},
})
require.NoError(t, err)

// When: a replacement is encountered.
replacements := []*sdkproto.ResourceReplacement{
{
Resource: "docker_container[0]",
Paths: []string{"env"},
},
}

// Then: CompleteJob makes a call to TrackResourceReplacement.
_, err = srv.CompleteJob(ctx, &proto.CompletedJob{
JobId: job.ID.String(),
Type: &proto.CompletedJob_WorkspaceBuild_{
WorkspaceBuild: &proto.CompletedJob_WorkspaceBuild{
State: []byte{},
ResourceReplacements: replacements,
},
},
})
require.NoError(t, err)

// Then: the replacements are as we expected.
testutil.RequireReceive(ctx, t, done)
require.Equal(t, replacements, orchestrator.replacements)
})
}

// mockPrebuildsOrchestrator is a test double that wraps an
// agplprebuilds.ReconciliationOrchestrator and records calls to
// TrackResourceReplacement. Methods the mock does not define itself are
// served by the embedded value.
type mockPrebuildsOrchestrator struct {
// Embedded orchestrator that the mock delegates to for everything it does not override.
agplprebuilds.ReconciliationOrchestrator

// replacements is the slice passed to the most recent TrackResourceReplacement call.
replacements []*sdkproto.ResourceReplacement
// done is sent one value per TrackResourceReplacement call so a test can wait for it.
done chan struct{}
}

// TrackResourceReplacement stores the replacements it was given and then
// signals on m.done. The context and the three UUID arguments are ignored.
//
// The store happens before the send, so a test that receives from m.done and
// then reads m.replacements sees the recorded value. The send blocks until a
// receiver is ready if the channel is unbuffered.
func (m *mockPrebuildsOrchestrator) TrackResourceReplacement(_ context.Context, _, _, _ uuid.UUID, replacements []*sdkproto.ResourceReplacement) {
	// Record first; the channel send below publishes this write to the receiver.
	m.replacements = replacements

	var signal struct{}
	m.done <- signal
}

func TestInsertWorkspacePresetsAndParameters(t *testing.T) {
Expand Down Expand Up @@ -2632,6 +2735,7 @@ type overrides struct {
heartbeatInterval time.Duration
auditor audit.Auditor
notificationEnqueuer notifications.Enqueuer
prebuildsOrchestrator agplprebuilds.ReconciliationOrchestrator
}

func setup(t *testing.T, ignoreLogErrors bool, ov *overrides) (proto.DRPCProvisionerDaemonServer, database.Store, pubsub.Pubsub, database.ProvisionerDaemon) {
Expand Down Expand Up @@ -2713,6 +2817,13 @@ func setup(t *testing.T, ignoreLogErrors bool, ov *overrides) (proto.DRPCProvisi
})
require.NoError(t, err)

prebuildsOrchestrator := ov.prebuildsOrchestrator
if prebuildsOrchestrator == nil {
prebuildsOrchestrator = agplprebuilds.DefaultReconciler
}
var op atomic.Pointer[agplprebuilds.ReconciliationOrchestrator]
op.Store(&prebuildsOrchestrator)

srv, err := provisionerdserver.NewServer(
ov.ctx,
&url.URL{},
Expand Down Expand Up @@ -2740,7 +2851,7 @@ func setup(t *testing.T, ignoreLogErrors bool, ov *overrides) (proto.DRPCProvisi
HeartbeatFn: ov.heartbeatFn,
},
notifEnq,
agplprebuilds.DefaultReconciler,
&op,
)
require.NoError(t, err)
return srv, db, ps, daemon
Expand Down
31 changes: 24 additions & 7 deletions enterprise/coderd/prebuilds/metricscollector.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,30 +16,42 @@ import (
"github.com/coder/coder/v2/coderd/prebuilds"
)

const (
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Muddies the purpose of the PR a bit, but it was a worthwhile driveby refactoring given that we're adding a new metric (MetricResourceReplacementsCount) and we need to check for its value in a test.

namespace = "coderd_prebuilt_workspaces_"

MetricCreatedCount = namespace + "created_total"
MetricFailedCount = namespace + "failed_total"
MetricClaimedCount = namespace + "claimed_total"
MetricResourceReplacementsCount = namespace + "resource_replacements_total"
MetricDesiredGauge = namespace + "desired"
MetricRunningGauge = namespace + "running"
MetricEligibleGauge = namespace + "eligible"
)

var (
labels = []string{"template_name", "preset_name", "organization_name"}
createdPrebuildsDesc = prometheus.NewDesc(
"coderd_prebuilt_workspaces_created_total",
MetricCreatedCount,
"Total number of prebuilt workspaces that have been created to meet the desired instance count of each "+
"template preset.",
labels,
nil,
)
failedPrebuildsDesc = prometheus.NewDesc(
"coderd_prebuilt_workspaces_failed_total",
MetricFailedCount,
"Total number of prebuilt workspaces that failed to build.",
labels,
nil,
)
claimedPrebuildsDesc = prometheus.NewDesc(
"coderd_prebuilt_workspaces_claimed_total",
MetricClaimedCount,
"Total number of prebuilt workspaces which were claimed by users. Claiming refers to creating a workspace "+
"with a preset selected for which eligible prebuilt workspaces are available and one is reassigned to a user.",
labels,
nil,
)
resourceReplacementsDesc = prometheus.NewDesc(
"coderd_prebuilt_workspaces_resource_replacements_total",
MetricResourceReplacementsCount,
"Total number of prebuilt workspaces whose resource(s) got replaced upon being claimed. "+
"In Terraform, drift on immutable attributes results in resource replacement. "+
"This represents a worst-case scenario for prebuilt workspaces because the pre-provisioned resource "+
Expand All @@ -49,20 +61,20 @@ var (
nil,
)
desiredPrebuildsDesc = prometheus.NewDesc(
"coderd_prebuilt_workspaces_desired",
MetricDesiredGauge,
"Target number of prebuilt workspaces that should be available for each template preset.",
labels,
nil,
)
runningPrebuildsDesc = prometheus.NewDesc(
"coderd_prebuilt_workspaces_running",
MetricRunningGauge,
"Current number of prebuilt workspaces that are in a running state. These workspaces have started "+
"successfully but may not yet be claimable by users (see coderd_prebuilt_workspaces_eligible).",
labels,
nil,
)
eligiblePrebuildsDesc = prometheus.NewDesc(
"coderd_prebuilt_workspaces_eligible",
MetricEligibleGauge,
"Current number of prebuilt workspaces that are eligible to be claimed by users. These are workspaces that "+
"have completed their build process with their agent reporting 'ready' status.",
labels,
Expand Down Expand Up @@ -162,5 +174,10 @@ func (mc *MetricsCollector) trackResourceReplacement(orgName, templateName, pres
if _, ok := mc.replacementsCounter[key]; !ok {
mc.replacementsCounter[key] = &atomic.Int64{}
}

// We only track _that_ a resource replacement occurred, not how many.
// Just one is enough to ruin a prebuild, but we can't know a priori which replacement would cause this.
// For example, say we have 2 replacements: a docker_container and a null_resource; we don't know which one might
// cause an issue (or indeed if either would), so we just track the replacement.
mc.replacementsCounter[key].Add(1)
}
Loading
Loading