Skip to content

fix(coderd): mark sub agent deletion via boolean instead of delete #18411

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 6 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 4 additions & 9 deletions coderd/agentapi/subagent_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -875,14 +875,9 @@ func TestSubAgentAPI(t *testing.T) {
require.NoError(t, err)
})

t.Run("DeletesWorkspaceApps", func(t *testing.T) {
t.Run("DeleteRetainsWorkspaceApps", func(t *testing.T) {
t.Parallel()

// Skip test on in-memory database since CASCADE DELETE is not implemented
if !dbtestutil.WillUsePostgres() {
t.Skip("CASCADE DELETE behavior requires PostgreSQL")
}

log := testutil.Logger(t)
ctx := testutil.Context(t, testutil.WaitShort)
clock := quartz.NewMock(t)
Expand Down Expand Up @@ -931,11 +926,11 @@ func TestSubAgentAPI(t *testing.T) {
_, err = api.Database.GetWorkspaceAgentByID(dbauthz.AsSystemRestricted(ctx), subAgentID) //nolint:gocritic // this is a test.
require.ErrorIs(t, err, sql.ErrNoRows)

// And: The apps are also deleted (due to CASCADE DELETE)
// Use raw database since authorization layer requires agent to exist
// And: The apps are *retained* to avoid causing issues
// where the resources are expected to be present.
appsAfterDeletion, err := db.GetWorkspaceAppsByAgentID(ctx, subAgentID)
require.NoError(t, err)
require.Empty(t, appsAfterDeletion)
require.NotEmpty(t, appsAfterDeletion)
})
})

Expand Down
33 changes: 32 additions & 1 deletion coderd/database/dbgen/dbgen.go
Original file line number Diff line number Diff line change
Expand Up @@ -209,7 +209,7 @@ func WorkspaceAgent(t testing.TB, db database.Store, orig database.WorkspaceAgen
},
ConnectionTimeoutSeconds: takeFirst(orig.ConnectionTimeoutSeconds, 3600),
TroubleshootingURL: takeFirst(orig.TroubleshootingURL, "https://example.com"),
MOTDFile: takeFirst(orig.TroubleshootingURL, ""),
MOTDFile: takeFirst(orig.MOTDFile, ""),
DisplayApps: append([]database.DisplayApp{}, orig.DisplayApps...),
DisplayOrder: takeFirst(orig.DisplayOrder, 1),
APIKeyScope: takeFirst(orig.APIKeyScope, database.AgentKeyScopeEnumAll),
Expand All @@ -226,6 +226,37 @@ func WorkspaceAgent(t testing.TB, db database.Store, orig database.WorkspaceAgen
})
require.NoError(t, err, "update workspace agent first connected at")
}

// Add a test antagonist. For every agent we add a deleted sub agent
// to discover cases where deletion should be handled.
subAgt, err := db.InsertWorkspaceAgent(genCtx, database.InsertWorkspaceAgentParams{
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Review: This may be a bit unusual, but it already helped me catch an issue with the workspace_prebuilds VIEW.

Ideally this potential cause would somehow be propagated to the user in case of test failure, but not sure how that could be done. I'll add a log entry to help with discovery.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I like this a lot! I do agree that it could get confusing to troubleshoot, but the log message in combination with dbtestutil.DumpOnFailure should help. You could also potentially set some relevant fields of the sub-agent so that its purpose is clear.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How about this? 😆 bc370a8 (#18411)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

SATISFACTORY

ID: uuid.New(),
ParentID: uuid.NullUUID{UUID: agt.ID, Valid: true},
CreatedAt: dbtime.Now(),
UpdatedAt: dbtime.Now(),
Name: testutil.GetRandomName(t),
ResourceID: agt.ResourceID,
AuthToken: uuid.New(),
AuthInstanceID: sql.NullString{},
Architecture: agt.Architecture,
EnvironmentVariables: pqtype.NullRawMessage{},
OperatingSystem: agt.OperatingSystem,
Directory: agt.Directory,
InstanceMetadata: pqtype.NullRawMessage{},
ResourceMetadata: pqtype.NullRawMessage{},
ConnectionTimeoutSeconds: agt.ConnectionTimeoutSeconds,
TroubleshootingURL: "I AM A TEST ANTAGONIST AND I AM HERE TO MESS UP YOUR TESTS. IF YOU SEE ME, SOMETHING IS WRONG AND SUB AGENT DELETION MAY NOT BE HANDLED CORRECTLY IN A QUERY.",
MOTDFile: "",
DisplayApps: nil,
DisplayOrder: agt.DisplayOrder,
APIKeyScope: agt.APIKeyScope,
})
require.NoError(t, err, "insert workspace agent subagent antagonist")
err = db.DeleteWorkspaceSubAgentByID(genCtx, subAgt.ID)
require.NoError(t, err, "delete workspace agent subagent antagonist")

t.Logf("inserted workspace agent %s (%v) with deleted subagent antagonist %s (%v)", agt.Name, agt.ID, subAgt.Name, subAgt.ID)

return agt
}

Expand Down
22 changes: 16 additions & 6 deletions coderd/database/dbmem/dbmem.go
Original file line number Diff line number Diff line change
Expand Up @@ -792,7 +792,7 @@ func (q *FakeQuerier) getWorkspaceAgentByIDNoLock(_ context.Context, id uuid.UUI
// The schema sorts this by created at, so we iterate the array backwards.
for i := len(q.workspaceAgents) - 1; i >= 0; i-- {
agent := q.workspaceAgents[i]
if agent.ID == id {
if !agent.Deleted && agent.ID == id {
return agent, nil
}
}
Expand All @@ -802,6 +802,9 @@ func (q *FakeQuerier) getWorkspaceAgentByIDNoLock(_ context.Context, id uuid.UUI
func (q *FakeQuerier) getWorkspaceAgentsByResourceIDsNoLock(_ context.Context, resourceIDs []uuid.UUID) ([]database.WorkspaceAgent, error) {
workspaceAgents := make([]database.WorkspaceAgent, 0)
for _, agent := range q.workspaceAgents {
if agent.Deleted {
continue
}
for _, resourceID := range resourceIDs {
if agent.ResourceID != resourceID {
continue
Expand Down Expand Up @@ -2543,13 +2546,13 @@ func (q *FakeQuerier) DeleteWorkspaceAgentPortSharesByTemplate(_ context.Context
return nil
}

func (q *FakeQuerier) DeleteWorkspaceSubAgentByID(ctx context.Context, id uuid.UUID) error {
func (q *FakeQuerier) DeleteWorkspaceSubAgentByID(_ context.Context, id uuid.UUID) error {
q.mutex.Lock()
defer q.mutex.Unlock()

for i, agent := range q.workspaceAgents {
if agent.ID == id && agent.ParentID.Valid {
q.workspaceAgents = slices.Delete(q.workspaceAgents, i, i+1)
q.workspaceAgents[i].Deleted = true
return nil
}
}
Expand Down Expand Up @@ -7066,6 +7069,10 @@ func (q *FakeQuerier) GetWorkspaceAgentAndLatestBuildByAuthToken(_ context.Conte
latestBuildNumber := make(map[uuid.UUID]int32)

for _, agt := range q.workspaceAgents {
if agt.Deleted {
continue
}

// get the related workspace and user
for _, res := range q.workspaceResources {
if agt.ResourceID != res.ID {
Expand Down Expand Up @@ -7135,7 +7142,7 @@ func (q *FakeQuerier) GetWorkspaceAgentByInstanceID(_ context.Context, instanceI
// The schema sorts this by created at, so we iterate the array backwards.
for i := len(q.workspaceAgents) - 1; i >= 0; i-- {
agent := q.workspaceAgents[i]
if agent.AuthInstanceID.Valid && agent.AuthInstanceID.String == instanceID {
if !agent.Deleted && agent.AuthInstanceID.Valid && agent.AuthInstanceID.String == instanceID {
return agent, nil
}
}
Expand Down Expand Up @@ -7695,13 +7702,13 @@ func (q *FakeQuerier) GetWorkspaceAgentUsageStatsAndLabels(_ context.Context, cr
return stats, nil
}

func (q *FakeQuerier) GetWorkspaceAgentsByParentID(ctx context.Context, parentID uuid.UUID) ([]database.WorkspaceAgent, error) {
func (q *FakeQuerier) GetWorkspaceAgentsByParentID(_ context.Context, parentID uuid.UUID) ([]database.WorkspaceAgent, error) {
q.mutex.RLock()
defer q.mutex.RUnlock()

workspaceAgents := make([]database.WorkspaceAgent, 0)
for _, agent := range q.workspaceAgents {
if !agent.ParentID.Valid || agent.ParentID.UUID != parentID {
if !agent.ParentID.Valid || agent.ParentID.UUID != parentID || agent.Deleted {
continue
}

Expand Down Expand Up @@ -7748,6 +7755,9 @@ func (q *FakeQuerier) GetWorkspaceAgentsCreatedAfter(_ context.Context, after ti

workspaceAgents := make([]database.WorkspaceAgent, 0)
for _, agent := range q.workspaceAgents {
if agent.Deleted {
continue
}
if agent.CreatedAt.After(after) {
workspaceAgents = append(workspaceAgents, agent)
}
Expand Down
8 changes: 6 additions & 2 deletions coderd/database/dump.sql

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
-- Restore prebuilds, previously modified in 000323_workspace_latest_builds_optimization.up.sql.
DROP VIEW workspace_prebuilds;

CREATE VIEW workspace_prebuilds AS
WITH all_prebuilds AS (
SELECT w.id,
w.name,
w.template_id,
w.created_at
FROM workspaces w
WHERE (w.owner_id = 'c42fdf75-3097-471c-8c33-fb52454d81c0'::uuid)
), workspaces_with_latest_presets AS (
SELECT DISTINCT ON (workspace_builds.workspace_id) workspace_builds.workspace_id,
workspace_builds.template_version_preset_id
FROM workspace_builds
WHERE (workspace_builds.template_version_preset_id IS NOT NULL)
ORDER BY workspace_builds.workspace_id, workspace_builds.build_number DESC
), workspaces_with_agents_status AS (
SELECT w.id AS workspace_id,
bool_and((wa.lifecycle_state = 'ready'::workspace_agent_lifecycle_state)) AS ready
FROM (((workspaces w
JOIN workspace_latest_builds wlb ON ((wlb.workspace_id = w.id)))
JOIN workspace_resources wr ON ((wr.job_id = wlb.job_id)))
JOIN workspace_agents wa ON ((wa.resource_id = wr.id)))
WHERE (w.owner_id = 'c42fdf75-3097-471c-8c33-fb52454d81c0'::uuid)
GROUP BY w.id
), current_presets AS (
SELECT w.id AS prebuild_id,
wlp.template_version_preset_id
FROM (workspaces w
JOIN workspaces_with_latest_presets wlp ON ((wlp.workspace_id = w.id)))
WHERE (w.owner_id = 'c42fdf75-3097-471c-8c33-fb52454d81c0'::uuid)
)
SELECT p.id,
p.name,
p.template_id,
p.created_at,
COALESCE(a.ready, false) AS ready,
cp.template_version_preset_id AS current_preset_id
FROM ((all_prebuilds p
LEFT JOIN workspaces_with_agents_status a ON ((a.workspace_id = p.id)))
JOIN current_presets cp ON ((cp.prebuild_id = p.id)));

-- Restore trigger without deleted check.
DROP TRIGGER IF EXISTS workspace_agent_name_unique_trigger ON workspace_agents;
DROP FUNCTION IF EXISTS check_workspace_agent_name_unique();

CREATE OR REPLACE FUNCTION check_workspace_agent_name_unique()
RETURNS TRIGGER AS $$
DECLARE
workspace_build_id uuid;
agents_with_name int;
BEGIN
-- Find the workspace build the workspace agent is being inserted into.
SELECT workspace_builds.id INTO workspace_build_id
FROM workspace_resources
JOIN workspace_builds ON workspace_builds.job_id = workspace_resources.job_id
WHERE workspace_resources.id = NEW.resource_id;

-- If the agent doesn't have a workspace build, we'll allow the insert.
IF workspace_build_id IS NULL THEN
RETURN NEW;
END IF;

-- Count how many agents in this workspace build already have the given agent name.
SELECT COUNT(*) INTO agents_with_name
FROM workspace_agents
JOIN workspace_resources ON workspace_resources.id = workspace_agents.resource_id
JOIN workspace_builds ON workspace_builds.job_id = workspace_resources.job_id
WHERE workspace_builds.id = workspace_build_id
AND workspace_agents.name = NEW.name
AND workspace_agents.id != NEW.id;

-- If there's already an agent with this name, raise an error
IF agents_with_name > 0 THEN
RAISE EXCEPTION 'workspace agent name "%" already exists in this workspace build', NEW.name
USING ERRCODE = 'unique_violation';
END IF;

RETURN NEW;
END;
$$ LANGUAGE plpgsql;

CREATE TRIGGER workspace_agent_name_unique_trigger
BEFORE INSERT OR UPDATE OF name, resource_id ON workspace_agents
FOR EACH ROW
EXECUTE FUNCTION check_workspace_agent_name_unique();

COMMENT ON TRIGGER workspace_agent_name_unique_trigger ON workspace_agents IS
'Use a trigger instead of a unique constraint because existing data may violate
the uniqueness requirement. A trigger allows us to enforce uniqueness going
forward without requiring a migration to clean up historical data.';


ALTER TABLE workspace_agents
DROP COLUMN deleted;
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
ALTER TABLE workspace_agents
ADD COLUMN deleted BOOLEAN NOT NULL DEFAULT FALSE;

COMMENT ON COLUMN workspace_agents.deleted IS 'Indicates whether or not the agent has been deleted. This is currently only applicable to sub agents.';

-- Recreate the trigger with deleted check.
DROP TRIGGER IF EXISTS workspace_agent_name_unique_trigger ON workspace_agents;
DROP FUNCTION IF EXISTS check_workspace_agent_name_unique();

CREATE OR REPLACE FUNCTION check_workspace_agent_name_unique()
RETURNS TRIGGER AS $$
DECLARE
workspace_build_id uuid;
agents_with_name int;
BEGIN
-- Find the workspace build the workspace agent is being inserted into.
SELECT workspace_builds.id INTO workspace_build_id
FROM workspace_resources
JOIN workspace_builds ON workspace_builds.job_id = workspace_resources.job_id
WHERE workspace_resources.id = NEW.resource_id;

-- If the agent doesn't have a workspace build, we'll allow the insert.
IF workspace_build_id IS NULL THEN
RETURN NEW;
END IF;

-- Count how many agents in this workspace build already have the given agent name.
SELECT COUNT(*) INTO agents_with_name
FROM workspace_agents
JOIN workspace_resources ON workspace_resources.id = workspace_agents.resource_id
JOIN workspace_builds ON workspace_builds.job_id = workspace_resources.job_id
WHERE workspace_builds.id = workspace_build_id
AND workspace_agents.name = NEW.name
AND workspace_agents.id != NEW.id
AND workspace_agents.deleted = FALSE; -- Ensure we only count non-deleted agents.

-- If there's already an agent with this name, raise an error
IF agents_with_name > 0 THEN
RAISE EXCEPTION 'workspace agent name "%" already exists in this workspace build', NEW.name
USING ERRCODE = 'unique_violation';
END IF;

RETURN NEW;
END;
$$ LANGUAGE plpgsql;

CREATE TRIGGER workspace_agent_name_unique_trigger
BEFORE INSERT OR UPDATE OF name, resource_id ON workspace_agents
FOR EACH ROW
EXECUTE FUNCTION check_workspace_agent_name_unique();

COMMENT ON TRIGGER workspace_agent_name_unique_trigger ON workspace_agents IS
'Use a trigger instead of a unique constraint because existing data may violate
the uniqueness requirement. A trigger allows us to enforce uniqueness going
forward without requiring a migration to clean up historical data.';

-- Handle agent deletion in prebuilds, previously modified in 000323_workspace_latest_builds_optimization.up.sql.
DROP VIEW workspace_prebuilds;

CREATE VIEW workspace_prebuilds AS
WITH all_prebuilds AS (
SELECT w.id,
w.name,
w.template_id,
w.created_at
FROM workspaces w
WHERE (w.owner_id = 'c42fdf75-3097-471c-8c33-fb52454d81c0'::uuid)
), workspaces_with_latest_presets AS (
SELECT DISTINCT ON (workspace_builds.workspace_id) workspace_builds.workspace_id,
workspace_builds.template_version_preset_id
FROM workspace_builds
WHERE (workspace_builds.template_version_preset_id IS NOT NULL)
ORDER BY workspace_builds.workspace_id, workspace_builds.build_number DESC
), workspaces_with_agents_status AS (
SELECT w.id AS workspace_id,
bool_and((wa.lifecycle_state = 'ready'::workspace_agent_lifecycle_state)) AS ready
FROM (((workspaces w
JOIN workspace_latest_builds wlb ON ((wlb.workspace_id = w.id)))
JOIN workspace_resources wr ON ((wr.job_id = wlb.job_id)))
-- ADD: deleted check for sub agents.
JOIN workspace_agents wa ON ((wa.resource_id = wr.id AND wa.deleted = FALSE)))
WHERE (w.owner_id = 'c42fdf75-3097-471c-8c33-fb52454d81c0'::uuid)
GROUP BY w.id
), current_presets AS (
SELECT w.id AS prebuild_id,
wlp.template_version_preset_id
FROM (workspaces w
JOIN workspaces_with_latest_presets wlp ON ((wlp.workspace_id = w.id)))
WHERE (w.owner_id = 'c42fdf75-3097-471c-8c33-fb52454d81c0'::uuid)
)
SELECT p.id,
p.name,
p.template_id,
p.created_at,
COALESCE(a.ready, false) AS ready,
cp.template_version_preset_id AS current_preset_id
FROM ((all_prebuilds p
LEFT JOIN workspaces_with_agents_status a ON ((a.workspace_id = p.id)))
JOIN current_presets cp ON ((cp.prebuild_id = p.id)));
Loading
Loading