feat(coderd/database): keep only 1 day of workspace_agent_stats after rollup #12674

Merged 2 commits on Apr 22, 2024

2 changes: 1 addition & 1 deletion cli/server.go
@@ -965,7 +965,7 @@ func (r *RootCmd) Server(newAPI func(context.Context, *coderd.Options) (*coderd.
defer shutdownConns()

// Ensures that old database entries are cleaned up over time!
purger := dbpurge.New(ctx, logger, options.Database)
purger := dbpurge.New(ctx, logger.Named("dbpurge"), options.Database)
defer purger.Close()

// Updates workspace usage
58 changes: 55 additions & 3 deletions coderd/database/dbmem/dbmem.go
@@ -1506,13 +1506,65 @@ func (q *FakeQuerier) DeleteOldWorkspaceAgentStats(_ context.Context) error {
q.mutex.Lock()
defer q.mutex.Unlock()

/*
DELETE FROM
workspace_agent_stats
WHERE
created_at < (
SELECT
COALESCE(
-- When generating initial template usage stats, all the
-- raw agent stats are needed, after that only ~30 mins
-- from last rollup is needed. Deployment stats seem to
-- use between 15 mins and 1 hour of data. We keep a
-- little bit more (1 day) just in case.
MAX(start_time) - '1 days'::interval,
-- Fall back to 6 months ago if there are no template
-- usage stats so that we don't delete the data before
-- it's rolled up.
NOW() - '6 months'::interval
)
FROM
template_usage_stats
)
AND created_at < (
-- Delete at most in batches of 3 days (with a batch size of 3 days, we
-- can clear out the previous 6 months of data in ~60 iterations) whilst
-- keeping the DB load relatively low.
SELECT
COALESCE(MIN(created_at) + '3 days'::interval, NOW())
FROM
workspace_agent_stats
);
*/

now := dbtime.Now()
sixMonthInterval := 6 * 30 * 24 * time.Hour
sixMonthsAgo := now.Add(-sixMonthInterval)
var limit time.Time
// MAX
for _, stat := range q.templateUsageStats {
if stat.StartTime.After(limit) {
limit = stat.StartTime.AddDate(0, 0, -1)
}
}
// COALESCE
if limit.IsZero() {
limit = now.AddDate(0, -6, 0)
}

var validStats []database.WorkspaceAgentStat
var batchLimit time.Time
for _, stat := range q.workspaceAgentStats {
if batchLimit.IsZero() || stat.CreatedAt.Before(batchLimit) {
batchLimit = stat.CreatedAt
}
}
if batchLimit.IsZero() {
batchLimit = time.Now()
} else {
batchLimit = batchLimit.AddDate(0, 0, 3)
}
for _, stat := range q.workspaceAgentStats {
if stat.CreatedAt.Before(sixMonthsAgo) {
if stat.CreatedAt.Before(limit) && stat.CreatedAt.Before(batchLimit) {
continue
}
validStats = append(validStats, stat)
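To make the retention logic above easier to follow outside the diff, here is a small, self-contained sketch of the same cutoff arithmetic. It is illustrative only and not part of the PR: the function name, signature, and use of pointers for "no rows" are assumptions, but the date math mirrors the SQL comment block and the fake querier above.

```go
package main

import (
	"fmt"
	"time"
)

// cutoffs mirrors the two WHERE clauses of DeleteOldWorkspaceAgentStats:
// a row is deleted only if it is older than BOTH returned times.
// (Illustrative helper, not part of the PR; nil means "no rows".)
func cutoffs(now time.Time, latestRollupStart, oldestStat *time.Time) (retention, batch time.Time) {
	// COALESCE(MAX(start_time) - '1 day', NOW() - '6 months'):
	// keep one day of raw stats past the newest template_usage_stats
	// rollup, or fall back to a 6-month window if nothing has been
	// rolled up yet.
	if latestRollupStart != nil {
		retention = latestRollupStart.AddDate(0, 0, -1)
	} else {
		retention = now.AddDate(0, -6, 0)
	}

	// COALESCE(MIN(created_at) + '3 days', NOW()):
	// delete at most a 3-day slice per run so a 6-month backlog clears
	// in roughly 60 iterations instead of one huge DELETE.
	if oldestStat != nil {
		batch = oldestStat.AddDate(0, 0, 3)
	} else {
		batch = now
	}
	return retention, batch
}

func main() {
	now := time.Now()
	rollup := now.Add(-30 * time.Minute) // last rollup happened recently
	oldest := now.AddDate(0, -6, 0)      // oldest remaining raw stat
	retention, batch := cutoffs(now, &rollup, &oldest)
	fmt.Println("delete rows older than", retention, "but only up to", batch)
}
```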
1 change: 0 additions & 1 deletion coderd/database/dbpurge/dbpurge.go
@@ -24,7 +24,6 @@ const (
// This is for cleaning up old, unused resources from the database that take up space.
func New(ctx context.Context, logger slog.Logger, db database.Store) io.Closer {
closed := make(chan struct{})
logger = logger.Named("dbpurge")

ctx, cancelFunc := context.WithCancel(ctx)
//nolint:gocritic // The system purges old db records without user input.
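For context on what this file does (the purge loop body is outside the diff), here is a minimal sketch of the background-purge pattern dbpurge follows: a constructor that spawns a ticker goroutine and returns an io.Closer. The tick interval, the exact set of delete queries, and the closeFunc helper are assumptions; only the constructor shape and the DeleteOldWorkspaceAgentStats call are taken from this PR.

```go
package dbpurgesketch

import (
	"context"
	"io"
	"time"

	"cdr.dev/slog"

	"github.com/coder/coder/v2/coderd/database"
)

// newPurger sketches the pattern: purge on a timer until the context is
// cancelled or Close is called. The 10-minute interval is an assumption.
func newPurger(ctx context.Context, logger slog.Logger, db database.Store) io.Closer {
	closed := make(chan struct{})
	ctx, cancel := context.WithCancel(ctx)
	go func() {
		defer close(closed)
		ticker := time.NewTicker(10 * time.Minute)
		defer ticker.Stop()
		for {
			select {
			case <-ctx.Done():
				return
			case <-ticker.C:
				if err := db.DeleteOldWorkspaceAgentStats(ctx); err != nil {
					logger.Error(ctx, "delete old workspace agent stats", slog.Error(err))
				}
			}
		}
	}()
	return closeFunc(func() {
		cancel()
		<-closed
	})
}

type closeFunc func()

func (f closeFunc) Close() error { f(); return nil }
```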
79 changes: 70 additions & 9 deletions coderd/database/dbpurge/dbpurge_test.go
@@ -11,12 +11,14 @@ import (
"go.uber.org/goleak"
"golang.org/x/exp/slices"

"cdr.dev/slog"
"cdr.dev/slog/sloggers/slogtest"

"github.com/coder/coder/v2/coderd/database"
"github.com/coder/coder/v2/coderd/database/dbgen"
"github.com/coder/coder/v2/coderd/database/dbmem"
"github.com/coder/coder/v2/coderd/database/dbpurge"
"github.com/coder/coder/v2/coderd/database/dbrollup"
"github.com/coder/coder/v2/coderd/database/dbtestutil"
"github.com/coder/coder/v2/coderd/database/dbtime"
"github.com/coder/coder/v2/provisionerd/proto"
@@ -40,27 +42,62 @@ func TestDeleteOldWorkspaceAgentStats(t *testing.T) {
t.Parallel()

db, _ := dbtestutil.NewDB(t)
logger := slogtest.Make(t, &slogtest.Options{IgnoreErrors: true})
logger := slogtest.Make(t, &slogtest.Options{IgnoreErrors: true}).Leveled(slog.LevelDebug)

now := dbtime.Now()

defer func() {
if t.Failed() {
t.Logf("Test failed, printing rows...")
ctx := testutil.Context(t, testutil.WaitShort)
wasRows, err := db.GetWorkspaceAgentStats(ctx, now.AddDate(0, -7, 0))
if err == nil {
for _, row := range wasRows {
t.Logf("workspace agent stat: %v", row)
}
}
tusRows, err := db.GetTemplateUsageStats(context.Background(), database.GetTemplateUsageStatsParams{
StartTime: now.AddDate(0, -7, 0),
EndTime: now,
})
if err == nil {
for _, row := range tusRows {
t.Logf("template usage stat: %v", row)
}
}
}
}()

ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitShort)
defer cancel()

now := dbtime.Now()

// given
// Let's use RxBytes to identify stat entries.
// Stat inserted 6 months + 1 hour ago, should be deleted.
first := dbgen.WorkspaceAgentStat(t, db, database.WorkspaceAgentStat{
CreatedAt: now.Add(-6*30*24*time.Hour - time.Hour),
CreatedAt: now.AddDate(0, -6, 0).Add(-time.Hour),
ConnectionCount: 1,
ConnectionMedianLatencyMS: 1,
RxBytes: 1111,
SessionCountSSH: 1,
})

// Stat inserted 6 months - 1 hour ago, should not be deleted.
// Stat inserted 6 months - 1 hour ago, should not be deleted before rollup.
second := dbgen.WorkspaceAgentStat(t, db, database.WorkspaceAgentStat{
CreatedAt: now.Add(-5*30*24*time.Hour + time.Hour),
CreatedAt: now.AddDate(0, -6, 0).Add(time.Hour),
ConnectionCount: 1,
ConnectionMedianLatencyMS: 1,
RxBytes: 2222,
SessionCountSSH: 1,
})

// Stat inserted 6 months - 1 day - 2 hour ago, should not be deleted at all.
third := dbgen.WorkspaceAgentStat(t, db, database.WorkspaceAgentStat{
CreatedAt: now.AddDate(0, -6, 0).AddDate(0, 0, 1).Add(2 * time.Hour),
ConnectionCount: 1,
ConnectionMedianLatencyMS: 1,
RxBytes: 3333,
SessionCountSSH: 1,
})

// when
Expand All @@ -70,15 +107,39 @@ func TestDeleteOldWorkspaceAgentStats(t *testing.T) {
// then
var stats []database.GetWorkspaceAgentStatsRow
var err error
require.Eventually(t, func() bool {
require.Eventuallyf(t, func() bool {
// Query all stats created not earlier than 7 months ago
stats, err = db.GetWorkspaceAgentStats(ctx, now.Add(-7*30*24*time.Hour))
stats, err = db.GetWorkspaceAgentStats(ctx, now.AddDate(0, -7, 0))
if err != nil {
return false
}
return !containsWorkspaceAgentStat(stats, first) &&
containsWorkspaceAgentStat(stats, second)
}, testutil.WaitShort, testutil.IntervalFast, stats)
}, testutil.WaitShort, testutil.IntervalFast, "it should delete old stats: %v", stats)

// when
events := make(chan dbrollup.Event)
rolluper := dbrollup.New(logger, db, dbrollup.WithEventChannel(events))
defer rolluper.Close()

_, _ = <-events, <-events

// Start a new purger to immediately trigger delete after rollup.
_ = closer.Close()
closer = dbpurge.New(ctx, logger, db)
defer closer.Close()

// then
require.Eventuallyf(t, func() bool {
// Query all stats created not earlier than 7 months ago
stats, err = db.GetWorkspaceAgentStats(ctx, now.AddDate(0, -7, 0))
if err != nil {
return false
}
return !containsWorkspaceAgentStat(stats, first) &&
!containsWorkspaceAgentStat(stats, second) &&
containsWorkspaceAgentStat(stats, third)
}, testutil.WaitShort, testutil.IntervalFast, "it should delete old stats after rollup: %v", stats)
}

func containsWorkspaceAgentStat(stats []database.GetWorkspaceAgentStatsRow, needle database.WorkspaceAgentStat) bool {
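The helper's body is collapsed in this diff. Based on the test comment above ("Let's use RxBytes to identify stat entries"), a plausible implementation looks like the sketch below; the WorkspaceRxBytes field name is an assumption about the generated row type, and slices is the golang.org/x/exp/slices package already imported by this test file.

```go
// Plausible body for the collapsed helper (sketch, not the verified source):
// match rows to the inserted stat by the RxBytes value used to tag it.
func containsWorkspaceAgentStat(stats []database.GetWorkspaceAgentStatsRow, needle database.WorkspaceAgentStat) bool {
	return slices.ContainsFunc(stats, func(s database.GetWorkspaceAgentStatsRow) bool {
		return s.WorkspaceRxBytes == needle.RxBytes
	})
}
```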
30 changes: 29 additions & 1 deletion coderd/database/queries.sql.go

Some generated files are not rendered by default.

30 changes: 29 additions & 1 deletion coderd/database/queries/workspaceagentstats.sql
Member:

Is there a maximum number of rows that could be deleted here?
Would it make sense to add a limit on the number of rows deleted?
My thinking is that we could choose a reasonably large upper limit and hopefully converge after a few iterations.

Member Author:

Not a bad idea. Limiting would only be relevant for the first time(s) this query runs; after that there's no point. But if we want to avoid the initial performance penalty of doing a huge delete, that would be sensible. There's no LIMIT for DELETEs, though, so we could rely on selecting the min(created_at) and only delete 1 (or N) days of data.

Member:

I think deleting the oldest day of data every interval will probably work fine here. The magnitude of that data should scale with the size of the deployment. We'll have to assume the database is correctly sized to handle that level of deletes, but if not, that is likely a sizing issue with the database itself in relation to the rest of the deployment.
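The convergence argument in this thread is simple arithmetic: roughly 180 days of backlog divided by a 3-day deletion window is about 60 purge runs. A trivial sketch of that calculation (the purge interval itself is not specified in this thread):

```go
package main

import (
	"fmt"
	"math"
	"time"
)

func main() {
	backlog := 180 * 24 * time.Hour // ~6 months of pre-existing stats
	batch := 3 * 24 * time.Hour     // window deleted per purge run
	runs := int(math.Ceil(float64(backlog) / float64(batch)))
	fmt.Println("purge runs to clear the backlog:", runs) // 60
}
```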

@@ -90,7 +90,35 @@ ORDER BY
date ASC;

-- name: DeleteOldWorkspaceAgentStats :exec
DELETE FROM workspace_agent_stats WHERE created_at < NOW() - INTERVAL '180 days';
DELETE FROM
workspace_agent_stats
WHERE
created_at < (
SELECT
COALESCE(
-- When generating initial template usage stats, all the
Member:

This change may be a little too soon to drop. If customers identify problems with the release, they may decide to roll back to the previous version. Boom! No workspace_agent_stats anymore.

Member Author:

I agree, I made this a separate PR so we can defer it for later.

Member:

Rollbacks require DB backups at present; we simply do not support downgrades.

And "not supported" as in you can't even really do it: you have to run the migration downgrade with the latest version, then roll back. We have zero documentation for this, so no customer should do it.

-- raw agent stats are needed, after that only ~30 mins
-- from last rollup is needed. Deployment stats seem to
-- use between 15 mins and 1 hour of data. We keep a
-- little bit more (1 day) just in case.
MAX(start_time) - '1 days'::interval,
-- Fall back to 6 months ago if there are no template
-- usage stats so that we don't delete the data before
-- it's rolled up.
NOW() - '6 months'::interval
)
FROM
template_usage_stats
)
AND created_at < (
-- Delete at most in batches of 3 days (with a batch size of 3 days, we
-- can clear out the previous 6 months of data in ~60 iterations) whilst
-- keeping the DB load relatively low.
SELECT
COALESCE(MIN(created_at) + '3 days'::interval, NOW())
FROM
workspace_agent_stats
);

-- name: GetDeploymentWorkspaceAgentStats :one
WITH agent_stats AS (