From 1df93befa64a59c4a73f5592a6d0bb33f7bc9278 Mon Sep 17 00:00:00 2001 From: Mathias Fredriksson Date: Mon, 18 Mar 2024 20:27:21 +0200 Subject: [PATCH 1/2] feat(coderd/database): keep only 1 day of `workspace_agent_stats` after rollup --- cli/server.go | 2 +- coderd/database/dbmem/dbmem.go | 40 +++++++++- coderd/database/dbpurge/dbpurge.go | 1 - coderd/database/dbpurge/dbpurge_test.go | 79 ++++++++++++++++--- coderd/database/queries.sql.go | 21 ++++- .../database/queries/workspaceagentstats.sql | 21 ++++- 6 files changed, 148 insertions(+), 16 deletions(-) diff --git a/cli/server.go b/cli/server.go index d278dbea3542a..5b804f927869f 100644 --- a/cli/server.go +++ b/cli/server.go @@ -965,7 +965,7 @@ func (r *RootCmd) Server(newAPI func(context.Context, *coderd.Options) (*coderd. defer shutdownConns() // Ensures that old database entries are cleaned up over time! - purger := dbpurge.New(ctx, logger, options.Database) + purger := dbpurge.New(ctx, logger.Named("dbpurge"), options.Database) defer purger.Close() // Updates workspace usage diff --git a/coderd/database/dbmem/dbmem.go b/coderd/database/dbmem/dbmem.go index b30184773bb1b..896e749c0e3e8 100644 --- a/coderd/database/dbmem/dbmem.go +++ b/coderd/database/dbmem/dbmem.go @@ -1506,13 +1506,47 @@ func (q *FakeQuerier) DeleteOldWorkspaceAgentStats(_ context.Context) error { q.mutex.Lock() defer q.mutex.Unlock() + /* + DELETE FROM + workspace_agent_stats + WHERE + created_at < ( + SELECT + COALESCE( + -- When generating initial template usage stats, all the + -- raw agent stats are needed, after that only ~30 mins + -- from last rollup is needed. Deployment stats seem to + -- use between 15 mins and 1 hour of data. We keep a + -- little bit more (1 day) just in case. + MAX(start_time) - '1 days'::interval, + -- Fall back to 6 months ago if there are no template + -- usage stats so that we don't delete the data before + -- it's rolled up. + NOW() - '6 months'::interval + ) + FROM + template_usage_stats + ); + */ + now := dbtime.Now() - sixMonthInterval := 6 * 30 * 24 * time.Hour - sixMonthsAgo := now.Add(-sixMonthInterval) + var limit time.Time + // MAX + for _, stat := range q.templateUsageStats { + if stat.StartTime.After(limit) { + limit = stat.StartTime.AddDate(0, 0, -1) + } + } + // COALESCE + if limit.IsZero() { + limit = now.AddDate(0, -6, 0) + } var validStats []database.WorkspaceAgentStat for _, stat := range q.workspaceAgentStats { - if stat.CreatedAt.Before(sixMonthsAgo) { + fmt.Println(stat.CreatedAt, limit) + if stat.CreatedAt.Before(limit) { + fmt.Println("delete") continue } validStats = append(validStats, stat) diff --git a/coderd/database/dbpurge/dbpurge.go b/coderd/database/dbpurge/dbpurge.go index c4b5a609a3179..d3fc56a8c5f21 100644 --- a/coderd/database/dbpurge/dbpurge.go +++ b/coderd/database/dbpurge/dbpurge.go @@ -24,7 +24,6 @@ const ( // This is for cleaning up old, unused resources from the database that take up space. func New(ctx context.Context, logger slog.Logger, db database.Store) io.Closer { closed := make(chan struct{}) - logger = logger.Named("dbpurge") ctx, cancelFunc := context.WithCancel(ctx) //nolint:gocritic // The system purges old db records without user input. diff --git a/coderd/database/dbpurge/dbpurge_test.go b/coderd/database/dbpurge/dbpurge_test.go index 9e949f270d2ca..4255409ba68fd 100644 --- a/coderd/database/dbpurge/dbpurge_test.go +++ b/coderd/database/dbpurge/dbpurge_test.go @@ -11,12 +11,14 @@ import ( "go.uber.org/goleak" "golang.org/x/exp/slices" + "cdr.dev/slog" "cdr.dev/slog/sloggers/slogtest" "github.com/coder/coder/v2/coderd/database" "github.com/coder/coder/v2/coderd/database/dbgen" "github.com/coder/coder/v2/coderd/database/dbmem" "github.com/coder/coder/v2/coderd/database/dbpurge" + "github.com/coder/coder/v2/coderd/database/dbrollup" "github.com/coder/coder/v2/coderd/database/dbtestutil" "github.com/coder/coder/v2/coderd/database/dbtime" "github.com/coder/coder/v2/provisionerd/proto" @@ -40,27 +42,62 @@ func TestDeleteOldWorkspaceAgentStats(t *testing.T) { t.Parallel() db, _ := dbtestutil.NewDB(t) - logger := slogtest.Make(t, &slogtest.Options{IgnoreErrors: true}) + logger := slogtest.Make(t, &slogtest.Options{IgnoreErrors: true}).Leveled(slog.LevelDebug) + + now := dbtime.Now() + + defer func() { + if t.Failed() { + t.Logf("Test failed, printing rows...") + ctx := testutil.Context(t, testutil.WaitShort) + wasRows, err := db.GetWorkspaceAgentStats(ctx, now.AddDate(0, -7, 0)) + if err == nil { + for _, row := range wasRows { + t.Logf("workspace agent stat: %v", row) + } + } + tusRows, err := db.GetTemplateUsageStats(context.Background(), database.GetTemplateUsageStatsParams{ + StartTime: now.AddDate(0, -7, 0), + EndTime: now, + }) + if err == nil { + for _, row := range tusRows { + t.Logf("template usage stat: %v", row) + } + } + } + }() ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitShort) defer cancel() - now := dbtime.Now() - // given // Let's use RxBytes to identify stat entries. // Stat inserted 6 months + 1 hour ago, should be deleted. first := dbgen.WorkspaceAgentStat(t, db, database.WorkspaceAgentStat{ - CreatedAt: now.Add(-6*30*24*time.Hour - time.Hour), + CreatedAt: now.AddDate(0, -6, 0).Add(-time.Hour), + ConnectionCount: 1, ConnectionMedianLatencyMS: 1, RxBytes: 1111, + SessionCountSSH: 1, }) - // Stat inserted 6 months - 1 hour ago, should not be deleted. + // Stat inserted 6 months - 1 hour ago, should not be deleted before rollup. second := dbgen.WorkspaceAgentStat(t, db, database.WorkspaceAgentStat{ - CreatedAt: now.Add(-5*30*24*time.Hour + time.Hour), + CreatedAt: now.AddDate(0, -6, 0).Add(time.Hour), + ConnectionCount: 1, ConnectionMedianLatencyMS: 1, RxBytes: 2222, + SessionCountSSH: 1, + }) + + // Stat inserted 6 months - 1 day - 2 hour ago, should not be deleted at all. + third := dbgen.WorkspaceAgentStat(t, db, database.WorkspaceAgentStat{ + CreatedAt: now.AddDate(0, -6, 0).AddDate(0, 0, 1).Add(2 * time.Hour), + ConnectionCount: 1, + ConnectionMedianLatencyMS: 1, + RxBytes: 3333, + SessionCountSSH: 1, }) // when @@ -70,15 +107,39 @@ func TestDeleteOldWorkspaceAgentStats(t *testing.T) { // then var stats []database.GetWorkspaceAgentStatsRow var err error - require.Eventually(t, func() bool { + require.Eventuallyf(t, func() bool { // Query all stats created not earlier than 7 months ago - stats, err = db.GetWorkspaceAgentStats(ctx, now.Add(-7*30*24*time.Hour)) + stats, err = db.GetWorkspaceAgentStats(ctx, now.AddDate(0, -7, 0)) if err != nil { return false } return !containsWorkspaceAgentStat(stats, first) && containsWorkspaceAgentStat(stats, second) - }, testutil.WaitShort, testutil.IntervalFast, stats) + }, testutil.WaitShort, testutil.IntervalFast, "it should delete old stats: %v", stats) + + // when + events := make(chan dbrollup.Event) + rolluper := dbrollup.New(logger, db, dbrollup.WithEventChannel(events)) + defer rolluper.Close() + + _, _ = <-events, <-events + + // Start a new purger to immediately trigger delete after rollup. + _ = closer.Close() + closer = dbpurge.New(ctx, logger, db) + defer closer.Close() + + // then + require.Eventuallyf(t, func() bool { + // Query all stats created not earlier than 7 months ago + stats, err = db.GetWorkspaceAgentStats(ctx, now.AddDate(0, -7, 0)) + if err != nil { + return false + } + return !containsWorkspaceAgentStat(stats, first) && + !containsWorkspaceAgentStat(stats, second) && + containsWorkspaceAgentStat(stats, third) + }, testutil.WaitShort, testutil.IntervalFast, "it should delete old stats after rollup: %v", stats) } func containsWorkspaceAgentStat(stats []database.GetWorkspaceAgentStatsRow, needle database.WorkspaceAgentStat) bool { diff --git a/coderd/database/queries.sql.go b/coderd/database/queries.sql.go index 35c55dd6fe3ec..ada3f4c93052d 100644 --- a/coderd/database/queries.sql.go +++ b/coderd/database/queries.sql.go @@ -10111,7 +10111,26 @@ func (q *sqlQuerier) UpdateWorkspaceAgentStartupByID(ctx context.Context, arg Up } const deleteOldWorkspaceAgentStats = `-- name: DeleteOldWorkspaceAgentStats :exec -DELETE FROM workspace_agent_stats WHERE created_at < NOW() - INTERVAL '180 days' +DELETE FROM + workspace_agent_stats +WHERE + created_at < ( + SELECT + COALESCE( + -- When generating initial template usage stats, all the + -- raw agent stats are needed, after that only ~30 mins + -- from last rollup is needed. Deployment stats seem to + -- use between 15 mins and 1 hour of data. We keep a + -- little bit more (1 day) just in case. + MAX(start_time) - '1 days'::interval, + -- Fall back to 6 months ago if there are no template + -- usage stats so that we don't delete the data before + -- it's rolled up. + NOW() - '6 months'::interval + ) + FROM + template_usage_stats + ) ` func (q *sqlQuerier) DeleteOldWorkspaceAgentStats(ctx context.Context) error { diff --git a/coderd/database/queries/workspaceagentstats.sql b/coderd/database/queries/workspaceagentstats.sql index cf059121dec77..0ad553aed45eb 100644 --- a/coderd/database/queries/workspaceagentstats.sql +++ b/coderd/database/queries/workspaceagentstats.sql @@ -90,7 +90,26 @@ ORDER BY date ASC; -- name: DeleteOldWorkspaceAgentStats :exec -DELETE FROM workspace_agent_stats WHERE created_at < NOW() - INTERVAL '180 days'; +DELETE FROM + workspace_agent_stats +WHERE + created_at < ( + SELECT + COALESCE( + -- When generating initial template usage stats, all the + -- raw agent stats are needed, after that only ~30 mins + -- from last rollup is needed. Deployment stats seem to + -- use between 15 mins and 1 hour of data. We keep a + -- little bit more (1 day) just in case. + MAX(start_time) - '1 days'::interval, + -- Fall back to 6 months ago if there are no template + -- usage stats so that we don't delete the data before + -- it's rolled up. + NOW() - '6 months'::interval + ) + FROM + template_usage_stats + ); -- name: GetDeploymentWorkspaceAgentStats :one WITH agent_stats AS ( From 18b45c23c04d289091817c49a138c4abbf1dcd20 Mon Sep 17 00:00:00 2001 From: Mathias Fredriksson Date: Tue, 9 Apr 2024 11:33:00 +0000 Subject: [PATCH 2/2] delete in batches --- coderd/database/dbmem/dbmem.go | 24 ++++++++++++++++--- coderd/database/queries.sql.go | 9 +++++++ .../database/queries/workspaceagentstats.sql | 9 +++++++ 3 files changed, 39 insertions(+), 3 deletions(-) diff --git a/coderd/database/dbmem/dbmem.go b/coderd/database/dbmem/dbmem.go index 896e749c0e3e8..fd06b026b2dcd 100644 --- a/coderd/database/dbmem/dbmem.go +++ b/coderd/database/dbmem/dbmem.go @@ -1526,6 +1526,15 @@ func (q *FakeQuerier) DeleteOldWorkspaceAgentStats(_ context.Context) error { ) FROM template_usage_stats + ) + AND created_at < ( + -- Delete at most in batches of 3 days (with a batch size of 3 days, we + -- can clear out the previous 6 months of data in ~60 iterations) whilst + -- keeping the DB load relatively low. + SELECT + COALESCE(MIN(created_at) + '3 days'::interval, NOW()) + FROM + workspace_agent_stats ); */ @@ -1543,10 +1552,19 @@ func (q *FakeQuerier) DeleteOldWorkspaceAgentStats(_ context.Context) error { } var validStats []database.WorkspaceAgentStat + var batchLimit time.Time + for _, stat := range q.workspaceAgentStats { + if batchLimit.IsZero() || stat.CreatedAt.Before(batchLimit) { + batchLimit = stat.CreatedAt + } + } + if batchLimit.IsZero() { + batchLimit = time.Now() + } else { + batchLimit = batchLimit.AddDate(0, 0, 3) + } for _, stat := range q.workspaceAgentStats { - fmt.Println(stat.CreatedAt, limit) - if stat.CreatedAt.Before(limit) { - fmt.Println("delete") + if stat.CreatedAt.Before(limit) && stat.CreatedAt.Before(batchLimit) { continue } validStats = append(validStats, stat) diff --git a/coderd/database/queries.sql.go b/coderd/database/queries.sql.go index ada3f4c93052d..4b93cefcac3e0 100644 --- a/coderd/database/queries.sql.go +++ b/coderd/database/queries.sql.go @@ -10131,6 +10131,15 @@ WHERE FROM template_usage_stats ) + AND created_at < ( + -- Delete at most in batches of 3 days (with a batch size of 3 days, we + -- can clear out the previous 6 months of data in ~60 iterations) whilst + -- keeping the DB load relatively low. + SELECT + COALESCE(MIN(created_at) + '3 days'::interval, NOW()) + FROM + workspace_agent_stats + ) ` func (q *sqlQuerier) DeleteOldWorkspaceAgentStats(ctx context.Context) error { diff --git a/coderd/database/queries/workspaceagentstats.sql b/coderd/database/queries/workspaceagentstats.sql index 0ad553aed45eb..8b9a9c2964ef6 100644 --- a/coderd/database/queries/workspaceagentstats.sql +++ b/coderd/database/queries/workspaceagentstats.sql @@ -109,6 +109,15 @@ WHERE ) FROM template_usage_stats + ) + AND created_at < ( + -- Delete at most in batches of 3 days (with a batch size of 3 days, we + -- can clear out the previous 6 months of data in ~60 iterations) whilst + -- keeping the DB load relatively low. + SELECT + COALESCE(MIN(created_at) + '3 days'::interval, NOW()) + FROM + workspace_agent_stats ); -- name: GetDeploymentWorkspaceAgentStats :one