Skip to content

feat(coderd/database/dbpurge): retain most recent agent build logs #14460

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Aug 30, 2024
91 changes: 81 additions & 10 deletions coderd/database/dbmem/dbmem.go
Original file line number Diff line number Diff line change
Expand Up @@ -1710,19 +1710,90 @@ func (q *FakeQuerier) DeleteOldWorkspaceAgentLogs(_ context.Context, threshold t
q.mutex.Lock()
defer q.mutex.Unlock()

var validLogs []database.WorkspaceAgentLog
for _, log := range q.workspaceAgentLogs {
var toBeDeleted bool
for _, agent := range q.workspaceAgents {
if agent.ID == log.AgentID && agent.LastConnectedAt.Valid && agent.LastConnectedAt.Time.Before(threshold) {
toBeDeleted = true
break
}
/*
WITH
latest_builds AS (
SELECT
workspace_id, max(build_number) AS max_build_number
FROM
workspace_builds
GROUP BY
workspace_id
),
*/
latestBuilds := make(map[uuid.UUID]int32)
for _, wb := range q.workspaceBuilds {
if lastBuildNumber, found := latestBuilds[wb.WorkspaceID]; found && lastBuildNumber > wb.BuildNumber {
continue
}
// not found or newer build number
latestBuilds[wb.WorkspaceID] = wb.BuildNumber
}

if !toBeDeleted {
validLogs = append(validLogs, log)
/*
old_agents AS (
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is really nice!

I'm concerned it'll go stale...
Although it could be argued all the business logic duplicated in this file is stale the second it's written.
We really need to get rid of dbmem.

SELECT
wa.id
FROM
workspace_agents AS wa
JOIN
workspace_resources AS wr
ON
wa.resource_id = wr.id
JOIN
workspace_builds AS wb
ON
wb.job_id = wr.job_id
LEFT JOIN
latest_builds
ON
latest_builds.workspace_id = wb.workspace_id
AND
latest_builds.max_build_number = wb.build_number
WHERE
-- Filter out the latest builds for each workspace.
latest_builds.workspace_id IS NULL
AND CASE
-- If the last time the agent connected was before @threshold
WHEN wa.last_connected_at IS NOT NULL THEN
wa.last_connected_at < @threshold :: timestamptz
-- The agent never connected, and was created before @threshold
ELSE wa.created_at < @threshold :: timestamptz
END
)
*/
oldAgents := make(map[uuid.UUID]struct{})
for _, wa := range q.workspaceAgents {
for _, wr := range q.workspaceResources {
if wr.ID != wa.ResourceID {
continue
}
for _, wb := range q.workspaceBuilds {
if wb.JobID != wr.JobID {
continue
}
latestBuildNumber, found := latestBuilds[wb.WorkspaceID]
if !found {
panic("workspaceBuilds got modified somehow while q was locked! This is a bug in dbmem!")
}
if latestBuildNumber == wb.BuildNumber {
continue
}
if wa.LastConnectedAt.Valid && wa.LastConnectedAt.Time.Before(threshold) || wa.CreatedAt.Before(threshold) {
oldAgents[wa.ID] = struct{}{}
}
}
}
}
/*
DELETE FROM workspace_agent_logs WHERE agent_id IN (SELECT id FROM old_agents);
*/
var validLogs []database.WorkspaceAgentLog
for _, log := range q.workspaceAgentLogs {
if _, found := oldAgents[log.AgentID]; found {
continue
}
validLogs = append(validLogs, log)
}
q.workspaceAgentLogs = validLogs
return nil
Expand Down
11 changes: 8 additions & 3 deletions coderd/database/dbpurge/dbpurge.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ import (

"github.com/coder/coder/v2/coderd/database"
"github.com/coder/coder/v2/coderd/database/dbauthz"
"github.com/coder/coder/v2/coderd/database/dbtime"
"github.com/coder/quartz"
)

Expand All @@ -30,7 +31,8 @@ func New(ctx context.Context, logger slog.Logger, db database.Store, clk quartz.
//nolint:gocritic // The system purges old db records without user input.
ctx = dbauthz.AsSystemRestricted(ctx)

ticker := clk.NewTicker(time.Nanosecond)
// Start the ticker with the initial delay.
ticker := clk.NewTicker(delay)
Comment on lines +34 to +35
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Review: I found this to be the source of a race condition while testing.
I moved the initial 'tick' to instead be inside the goroutine below.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What was the race condition?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It may actually be the same race condition you mentioned below, just moved around slightly.

doTick := func(start time.Time) {
defer ticker.Reset(delay)
// Start a transaction to grab advisory lock, we don't want to run
Expand All @@ -47,7 +49,8 @@ func New(ctx context.Context, logger slog.Logger, db database.Store, clk quartz.
return nil
}

if err := tx.DeleteOldWorkspaceAgentLogs(ctx, start.Add(-maxAgentLogAge)); err != nil {
deleteOldWorkspaceAgentLogsBefore := start.Add(-maxAgentLogAge)
if err := tx.DeleteOldWorkspaceAgentLogs(ctx, deleteOldWorkspaceAgentLogsBefore); err != nil {
return xerrors.Errorf("failed to delete old workspace agent logs: %w", err)
}
if err := tx.DeleteOldWorkspaceAgentStats(ctx); err != nil {
Expand All @@ -72,13 +75,15 @@ func New(ctx context.Context, logger slog.Logger, db database.Store, clk quartz.
go func() {
defer close(closed)
defer ticker.Stop()
// Force an initial tick.
doTick(dbtime.Time(clk.Now()).UTC())
for {
select {
case <-ctx.Done():
return
case tick := <-ticker.C:
ticker.Stop()
doTick(tick)
doTick(dbtime.Time(tick).UTC())
}
}
}()
Expand Down
Loading
Loading