Skip to content

Commit 511fd09

Browse files
authored
fix(coderd): mark sub agent deletion via boolean instead of delete (#18411)
Deletion of data is uncommon in our database, so the introduction of sub agents and the deletion of them introduced issues with foreign key assumptions, as can be seen in coder/internal#685. We could have only addressed the specific case by allowing cascade deletion of stats as well as handling in the stats collector, but it's unclear how many more such edge-cases we could run into. In this change, we mark the rows as deleted via boolean instead, and filter them out in all relevant queries. Fixes coder/internal#685
1 parent 68f21fa commit 511fd09

File tree

13 files changed

+385
-38
lines changed

13 files changed

+385
-38
lines changed

coderd/agentapi/subagent_test.go

Lines changed: 4 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -875,14 +875,9 @@ func TestSubAgentAPI(t *testing.T) {
875875
require.NoError(t, err)
876876
})
877877

878-
t.Run("DeletesWorkspaceApps", func(t *testing.T) {
878+
t.Run("DeleteRetainsWorkspaceApps", func(t *testing.T) {
879879
t.Parallel()
880880

881-
// Skip test on in-memory database since CASCADE DELETE is not implemented
882-
if !dbtestutil.WillUsePostgres() {
883-
t.Skip("CASCADE DELETE behavior requires PostgreSQL")
884-
}
885-
886881
log := testutil.Logger(t)
887882
ctx := testutil.Context(t, testutil.WaitShort)
888883
clock := quartz.NewMock(t)
@@ -931,11 +926,11 @@ func TestSubAgentAPI(t *testing.T) {
931926
_, err = api.Database.GetWorkspaceAgentByID(dbauthz.AsSystemRestricted(ctx), subAgentID) //nolint:gocritic // this is a test.
932927
require.ErrorIs(t, err, sql.ErrNoRows)
933928

934-
// And: The apps are also deleted (due to CASCADE DELETE)
935-
// Use raw database since authorization layer requires agent to exist
929+
// And: The apps are *retained* to avoid causing issues
930+
// where the resources are expected to be present.
936931
appsAfterDeletion, err := db.GetWorkspaceAppsByAgentID(ctx, subAgentID)
937932
require.NoError(t, err)
938-
require.Empty(t, appsAfterDeletion)
933+
require.NotEmpty(t, appsAfterDeletion)
939934
})
940935
})
941936

coderd/database/dbfake/dbfake.go

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ import (
44
"context"
55
"database/sql"
66
"encoding/json"
7+
"errors"
78
"testing"
89
"time"
910

@@ -243,6 +244,25 @@ func (b WorkspaceBuildBuilder) Do() WorkspaceResponse {
243244
require.NoError(b.t, err)
244245
}
245246

247+
agents, err := b.db.GetWorkspaceAgentsByWorkspaceAndBuildNumber(ownerCtx, database.GetWorkspaceAgentsByWorkspaceAndBuildNumberParams{
248+
WorkspaceID: resp.Workspace.ID,
249+
BuildNumber: resp.Build.BuildNumber,
250+
})
251+
if !errors.Is(err, sql.ErrNoRows) {
252+
require.NoError(b.t, err, "get workspace agents")
253+
// Insert deleted subagent test antagonists for the workspace build.
254+
// See also `dbgen.WorkspaceAgent()`.
255+
for _, agent := range agents {
256+
subAgent := dbgen.WorkspaceSubAgent(b.t, b.db, agent, database.WorkspaceAgent{
257+
TroubleshootingURL: "I AM A TEST ANTAGONIST AND I AM HERE TO MESS UP YOUR TESTS. IF YOU SEE ME, SOMETHING IS WRONG AND SUB AGENT DELETION MAY NOT BE HANDLED CORRECTLY IN A QUERY.",
258+
})
259+
err = b.db.DeleteWorkspaceSubAgentByID(ownerCtx, subAgent.ID)
260+
require.NoError(b.t, err, "delete workspace agent subagent antagonist")
261+
262+
b.t.Logf("inserted deleted subagent antagonist %s (%v) for workspace agent %s (%v)", subAgent.Name, subAgent.ID, agent.Name, agent.ID)
263+
}
264+
}
265+
246266
return resp
247267
}
248268

coderd/database/dbgen/dbgen.go

Lines changed: 42 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -209,7 +209,7 @@ func WorkspaceAgent(t testing.TB, db database.Store, orig database.WorkspaceAgen
209209
},
210210
ConnectionTimeoutSeconds: takeFirst(orig.ConnectionTimeoutSeconds, 3600),
211211
TroubleshootingURL: takeFirst(orig.TroubleshootingURL, "https://example.com"),
212-
MOTDFile: takeFirst(orig.TroubleshootingURL, ""),
212+
MOTDFile: takeFirst(orig.MOTDFile, ""),
213213
DisplayApps: append([]database.DisplayApp{}, orig.DisplayApps...),
214214
DisplayOrder: takeFirst(orig.DisplayOrder, 1),
215215
APIKeyScope: takeFirst(orig.APIKeyScope, database.AgentKeyScopeEnumAll),
@@ -226,9 +226,50 @@ func WorkspaceAgent(t testing.TB, db database.Store, orig database.WorkspaceAgen
226226
})
227227
require.NoError(t, err, "update workspace agent first connected at")
228228
}
229+
230+
if orig.ParentID.UUID == uuid.Nil {
231+
// Add a test antagonist. For every agent we add a deleted sub agent
232+
// to discover cases where deletion should be handled.
233+
// See also `(dbfake.WorkspaceBuildBuilder).Do()`.
234+
subAgt, err := db.InsertWorkspaceAgent(genCtx, database.InsertWorkspaceAgentParams{
235+
ID: uuid.New(),
236+
ParentID: uuid.NullUUID{UUID: agt.ID, Valid: true},
237+
CreatedAt: dbtime.Now(),
238+
UpdatedAt: dbtime.Now(),
239+
Name: testutil.GetRandomName(t),
240+
ResourceID: agt.ResourceID,
241+
AuthToken: uuid.New(),
242+
AuthInstanceID: sql.NullString{},
243+
Architecture: agt.Architecture,
244+
EnvironmentVariables: pqtype.NullRawMessage{},
245+
OperatingSystem: agt.OperatingSystem,
246+
Directory: agt.Directory,
247+
InstanceMetadata: pqtype.NullRawMessage{},
248+
ResourceMetadata: pqtype.NullRawMessage{},
249+
ConnectionTimeoutSeconds: agt.ConnectionTimeoutSeconds,
250+
TroubleshootingURL: "I AM A TEST ANTAGONIST AND I AM HERE TO MESS UP YOUR TESTS. IF YOU SEE ME, SOMETHING IS WRONG AND SUB AGENT DELETION MAY NOT BE HANDLED CORRECTLY IN A QUERY.",
251+
MOTDFile: "",
252+
DisplayApps: nil,
253+
DisplayOrder: agt.DisplayOrder,
254+
APIKeyScope: agt.APIKeyScope,
255+
})
256+
require.NoError(t, err, "insert workspace agent subagent antagonist")
257+
err = db.DeleteWorkspaceSubAgentByID(genCtx, subAgt.ID)
258+
require.NoError(t, err, "delete workspace agent subagent antagonist")
259+
260+
t.Logf("inserted deleted subagent antagonist %s (%v) for workspace agent %s (%v)", subAgt.Name, subAgt.ID, agt.Name, agt.ID)
261+
}
262+
229263
return agt
230264
}
231265

266+
func WorkspaceSubAgent(t testing.TB, db database.Store, parentAgent database.WorkspaceAgent, orig database.WorkspaceAgent) database.WorkspaceAgent {
267+
orig.ParentID = uuid.NullUUID{UUID: parentAgent.ID, Valid: true}
268+
orig.ResourceID = parentAgent.ResourceID
269+
subAgt := WorkspaceAgent(t, db, orig)
270+
return subAgt
271+
}
272+
232273
func WorkspaceAgentScript(t testing.TB, db database.Store, orig database.WorkspaceAgentScript) database.WorkspaceAgentScript {
233274
scripts, err := db.InsertWorkspaceAgentScripts(genCtx, database.InsertWorkspaceAgentScriptsParams{
234275
WorkspaceAgentID: takeFirst(orig.WorkspaceAgentID, uuid.New()),

coderd/database/dbmem/dbmem.go

Lines changed: 16 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -792,7 +792,7 @@ func (q *FakeQuerier) getWorkspaceAgentByIDNoLock(_ context.Context, id uuid.UUI
792792
// The schema sorts this by created at, so we iterate the array backwards.
793793
for i := len(q.workspaceAgents) - 1; i >= 0; i-- {
794794
agent := q.workspaceAgents[i]
795-
if agent.ID == id {
795+
if !agent.Deleted && agent.ID == id {
796796
return agent, nil
797797
}
798798
}
@@ -802,6 +802,9 @@ func (q *FakeQuerier) getWorkspaceAgentByIDNoLock(_ context.Context, id uuid.UUI
802802
func (q *FakeQuerier) getWorkspaceAgentsByResourceIDsNoLock(_ context.Context, resourceIDs []uuid.UUID) ([]database.WorkspaceAgent, error) {
803803
workspaceAgents := make([]database.WorkspaceAgent, 0)
804804
for _, agent := range q.workspaceAgents {
805+
if agent.Deleted {
806+
continue
807+
}
805808
for _, resourceID := range resourceIDs {
806809
if agent.ResourceID != resourceID {
807810
continue
@@ -2554,13 +2557,13 @@ func (q *FakeQuerier) DeleteWorkspaceAgentPortSharesByTemplate(_ context.Context
25542557
return nil
25552558
}
25562559

2557-
func (q *FakeQuerier) DeleteWorkspaceSubAgentByID(ctx context.Context, id uuid.UUID) error {
2560+
func (q *FakeQuerier) DeleteWorkspaceSubAgentByID(_ context.Context, id uuid.UUID) error {
25582561
q.mutex.Lock()
25592562
defer q.mutex.Unlock()
25602563

25612564
for i, agent := range q.workspaceAgents {
25622565
if agent.ID == id && agent.ParentID.Valid {
2563-
q.workspaceAgents = slices.Delete(q.workspaceAgents, i, i+1)
2566+
q.workspaceAgents[i].Deleted = true
25642567
return nil
25652568
}
25662569
}
@@ -7077,6 +7080,10 @@ func (q *FakeQuerier) GetWorkspaceAgentAndLatestBuildByAuthToken(_ context.Conte
70777080
latestBuildNumber := make(map[uuid.UUID]int32)
70787081

70797082
for _, agt := range q.workspaceAgents {
7083+
if agt.Deleted {
7084+
continue
7085+
}
7086+
70807087
// get the related workspace and user
70817088
for _, res := range q.workspaceResources {
70827089
if agt.ResourceID != res.ID {
@@ -7146,7 +7153,7 @@ func (q *FakeQuerier) GetWorkspaceAgentByInstanceID(_ context.Context, instanceI
71467153
// The schema sorts this by created at, so we iterate the array backwards.
71477154
for i := len(q.workspaceAgents) - 1; i >= 0; i-- {
71487155
agent := q.workspaceAgents[i]
7149-
if agent.AuthInstanceID.Valid && agent.AuthInstanceID.String == instanceID {
7156+
if !agent.Deleted && agent.AuthInstanceID.Valid && agent.AuthInstanceID.String == instanceID {
71507157
return agent, nil
71517158
}
71527159
}
@@ -7706,13 +7713,13 @@ func (q *FakeQuerier) GetWorkspaceAgentUsageStatsAndLabels(_ context.Context, cr
77067713
return stats, nil
77077714
}
77087715

7709-
func (q *FakeQuerier) GetWorkspaceAgentsByParentID(ctx context.Context, parentID uuid.UUID) ([]database.WorkspaceAgent, error) {
7716+
func (q *FakeQuerier) GetWorkspaceAgentsByParentID(_ context.Context, parentID uuid.UUID) ([]database.WorkspaceAgent, error) {
77107717
q.mutex.RLock()
77117718
defer q.mutex.RUnlock()
77127719

77137720
workspaceAgents := make([]database.WorkspaceAgent, 0)
77147721
for _, agent := range q.workspaceAgents {
7715-
if !agent.ParentID.Valid || agent.ParentID.UUID != parentID {
7722+
if !agent.ParentID.Valid || agent.ParentID.UUID != parentID || agent.Deleted {
77167723
continue
77177724
}
77187725

@@ -7759,6 +7766,9 @@ func (q *FakeQuerier) GetWorkspaceAgentsCreatedAfter(_ context.Context, after ti
77597766

77607767
workspaceAgents := make([]database.WorkspaceAgent, 0)
77617768
for _, agent := range q.workspaceAgents {
7769+
if agent.Deleted {
7770+
continue
7771+
}
77627772
if agent.CreatedAt.After(after) {
77637773
workspaceAgents = append(workspaceAgents, agent)
77647774
}

coderd/database/dump.sql

Lines changed: 6 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.
Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
-- Restore prebuilds, previously modified in 000323_workspace_latest_builds_optimization.up.sql.
2+
DROP VIEW workspace_prebuilds;
3+
4+
CREATE VIEW workspace_prebuilds AS
5+
WITH all_prebuilds AS (
6+
SELECT w.id,
7+
w.name,
8+
w.template_id,
9+
w.created_at
10+
FROM workspaces w
11+
WHERE (w.owner_id = 'c42fdf75-3097-471c-8c33-fb52454d81c0'::uuid)
12+
), workspaces_with_latest_presets AS (
13+
SELECT DISTINCT ON (workspace_builds.workspace_id) workspace_builds.workspace_id,
14+
workspace_builds.template_version_preset_id
15+
FROM workspace_builds
16+
WHERE (workspace_builds.template_version_preset_id IS NOT NULL)
17+
ORDER BY workspace_builds.workspace_id, workspace_builds.build_number DESC
18+
), workspaces_with_agents_status AS (
19+
SELECT w.id AS workspace_id,
20+
bool_and((wa.lifecycle_state = 'ready'::workspace_agent_lifecycle_state)) AS ready
21+
FROM (((workspaces w
22+
JOIN workspace_latest_builds wlb ON ((wlb.workspace_id = w.id)))
23+
JOIN workspace_resources wr ON ((wr.job_id = wlb.job_id)))
24+
JOIN workspace_agents wa ON ((wa.resource_id = wr.id)))
25+
WHERE (w.owner_id = 'c42fdf75-3097-471c-8c33-fb52454d81c0'::uuid)
26+
GROUP BY w.id
27+
), current_presets AS (
28+
SELECT w.id AS prebuild_id,
29+
wlp.template_version_preset_id
30+
FROM (workspaces w
31+
JOIN workspaces_with_latest_presets wlp ON ((wlp.workspace_id = w.id)))
32+
WHERE (w.owner_id = 'c42fdf75-3097-471c-8c33-fb52454d81c0'::uuid)
33+
)
34+
SELECT p.id,
35+
p.name,
36+
p.template_id,
37+
p.created_at,
38+
COALESCE(a.ready, false) AS ready,
39+
cp.template_version_preset_id AS current_preset_id
40+
FROM ((all_prebuilds p
41+
LEFT JOIN workspaces_with_agents_status a ON ((a.workspace_id = p.id)))
42+
JOIN current_presets cp ON ((cp.prebuild_id = p.id)));
43+
44+
-- Restore trigger without deleted check.
45+
DROP TRIGGER IF EXISTS workspace_agent_name_unique_trigger ON workspace_agents;
46+
DROP FUNCTION IF EXISTS check_workspace_agent_name_unique();
47+
48+
CREATE OR REPLACE FUNCTION check_workspace_agent_name_unique()
49+
RETURNS TRIGGER AS $$
50+
DECLARE
51+
workspace_build_id uuid;
52+
agents_with_name int;
53+
BEGIN
54+
-- Find the workspace build the workspace agent is being inserted into.
55+
SELECT workspace_builds.id INTO workspace_build_id
56+
FROM workspace_resources
57+
JOIN workspace_builds ON workspace_builds.job_id = workspace_resources.job_id
58+
WHERE workspace_resources.id = NEW.resource_id;
59+
60+
-- If the agent doesn't have a workspace build, we'll allow the insert.
61+
IF workspace_build_id IS NULL THEN
62+
RETURN NEW;
63+
END IF;
64+
65+
-- Count how many agents in this workspace build already have the given agent name.
66+
SELECT COUNT(*) INTO agents_with_name
67+
FROM workspace_agents
68+
JOIN workspace_resources ON workspace_resources.id = workspace_agents.resource_id
69+
JOIN workspace_builds ON workspace_builds.job_id = workspace_resources.job_id
70+
WHERE workspace_builds.id = workspace_build_id
71+
AND workspace_agents.name = NEW.name
72+
AND workspace_agents.id != NEW.id;
73+
74+
-- If there's already an agent with this name, raise an error
75+
IF agents_with_name > 0 THEN
76+
RAISE EXCEPTION 'workspace agent name "%" already exists in this workspace build', NEW.name
77+
USING ERRCODE = 'unique_violation';
78+
END IF;
79+
80+
RETURN NEW;
81+
END;
82+
$$ LANGUAGE plpgsql;
83+
84+
CREATE TRIGGER workspace_agent_name_unique_trigger
85+
BEFORE INSERT OR UPDATE OF name, resource_id ON workspace_agents
86+
FOR EACH ROW
87+
EXECUTE FUNCTION check_workspace_agent_name_unique();
88+
89+
COMMENT ON TRIGGER workspace_agent_name_unique_trigger ON workspace_agents IS
90+
'Use a trigger instead of a unique constraint because existing data may violate
91+
the uniqueness requirement. A trigger allows us to enforce uniqueness going
92+
forward without requiring a migration to clean up historical data.';
93+
94+
95+
ALTER TABLE workspace_agents
96+
DROP COLUMN deleted;

0 commit comments

Comments
 (0)