Skip to content

Commit e17e8aa

Browse files
authored
feat(coderd/database): keep only 1 day of workspace_agent_stats after rollup (coder#12674)
1 parent 4a6693a commit e17e8aa

File tree

6 files changed

+184
-16
lines changed

6 files changed

+184
-16
lines changed

cli/server.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -965,7 +965,7 @@ func (r *RootCmd) Server(newAPI func(context.Context, *coderd.Options) (*coderd.
965965
defer shutdownConns()
966966

967967
// Ensures that old database entries are cleaned up over time!
968-
purger := dbpurge.New(ctx, logger, options.Database)
968+
purger := dbpurge.New(ctx, logger.Named("dbpurge"), options.Database)
969969
defer purger.Close()
970970

971971
// Updates workspace usage

coderd/database/dbmem/dbmem.go

Lines changed: 55 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1506,13 +1506,65 @@ func (q *FakeQuerier) DeleteOldWorkspaceAgentStats(_ context.Context) error {
15061506
q.mutex.Lock()
15071507
defer q.mutex.Unlock()
15081508

1509+
/*
1510+
DELETE FROM
1511+
workspace_agent_stats
1512+
WHERE
1513+
created_at < (
1514+
SELECT
1515+
COALESCE(
1516+
-- When generating initial template usage stats, all the
1517+
-- raw agent stats are needed, after that only ~30 mins
1518+
-- from last rollup is needed. Deployment stats seem to
1519+
-- use between 15 mins and 1 hour of data. We keep a
1520+
-- little bit more (1 day) just in case.
1521+
MAX(start_time) - '1 days'::interval,
1522+
-- Fall back to 6 months ago if there are no template
1523+
-- usage stats so that we don't delete the data before
1524+
-- it's rolled up.
1525+
NOW() - '6 months'::interval
1526+
)
1527+
FROM
1528+
template_usage_stats
1529+
)
1530+
AND created_at < (
1531+
-- Delete at most in batches of 3 days (with a batch size of 3 days, we
1532+
-- can clear out the previous 6 months of data in ~60 iterations) whilst
1533+
-- keeping the DB load relatively low.
1534+
SELECT
1535+
COALESCE(MIN(created_at) + '3 days'::interval, NOW())
1536+
FROM
1537+
workspace_agent_stats
1538+
);
1539+
*/
1540+
15091541
now := dbtime.Now()
1510-
sixMonthInterval := 6 * 30 * 24 * time.Hour
1511-
sixMonthsAgo := now.Add(-sixMonthInterval)
1542+
var limit time.Time
1543+
// MAX
1544+
for _, stat := range q.templateUsageStats {
1545+
if stat.StartTime.After(limit) {
1546+
limit = stat.StartTime.AddDate(0, 0, -1)
1547+
}
1548+
}
1549+
// COALESCE
1550+
if limit.IsZero() {
1551+
limit = now.AddDate(0, -6, 0)
1552+
}
15121553

15131554
var validStats []database.WorkspaceAgentStat
1555+
var batchLimit time.Time
1556+
for _, stat := range q.workspaceAgentStats {
1557+
if batchLimit.IsZero() || stat.CreatedAt.Before(batchLimit) {
1558+
batchLimit = stat.CreatedAt
1559+
}
1560+
}
1561+
if batchLimit.IsZero() {
1562+
batchLimit = time.Now()
1563+
} else {
1564+
batchLimit = batchLimit.AddDate(0, 0, 3)
1565+
}
15141566
for _, stat := range q.workspaceAgentStats {
1515-
if stat.CreatedAt.Before(sixMonthsAgo) {
1567+
if stat.CreatedAt.Before(limit) && stat.CreatedAt.Before(batchLimit) {
15161568
continue
15171569
}
15181570
validStats = append(validStats, stat)

coderd/database/dbpurge/dbpurge.go

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,6 @@ const (
2424
// This is for cleaning up old, unused resources from the database that take up space.
2525
func New(ctx context.Context, logger slog.Logger, db database.Store) io.Closer {
2626
closed := make(chan struct{})
27-
logger = logger.Named("dbpurge")
2827

2928
ctx, cancelFunc := context.WithCancel(ctx)
3029
//nolint:gocritic // The system purges old db records without user input.

coderd/database/dbpurge/dbpurge_test.go

Lines changed: 70 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -11,12 +11,14 @@ import (
1111
"go.uber.org/goleak"
1212
"golang.org/x/exp/slices"
1313

14+
"cdr.dev/slog"
1415
"cdr.dev/slog/sloggers/slogtest"
1516

1617
"github.com/coder/coder/v2/coderd/database"
1718
"github.com/coder/coder/v2/coderd/database/dbgen"
1819
"github.com/coder/coder/v2/coderd/database/dbmem"
1920
"github.com/coder/coder/v2/coderd/database/dbpurge"
21+
"github.com/coder/coder/v2/coderd/database/dbrollup"
2022
"github.com/coder/coder/v2/coderd/database/dbtestutil"
2123
"github.com/coder/coder/v2/coderd/database/dbtime"
2224
"github.com/coder/coder/v2/provisionerd/proto"
@@ -40,27 +42,62 @@ func TestDeleteOldWorkspaceAgentStats(t *testing.T) {
4042
t.Parallel()
4143

4244
db, _ := dbtestutil.NewDB(t)
43-
logger := slogtest.Make(t, &slogtest.Options{IgnoreErrors: true})
45+
logger := slogtest.Make(t, &slogtest.Options{IgnoreErrors: true}).Leveled(slog.LevelDebug)
46+
47+
now := dbtime.Now()
48+
49+
defer func() {
50+
if t.Failed() {
51+
t.Logf("Test failed, printing rows...")
52+
ctx := testutil.Context(t, testutil.WaitShort)
53+
wasRows, err := db.GetWorkspaceAgentStats(ctx, now.AddDate(0, -7, 0))
54+
if err == nil {
55+
for _, row := range wasRows {
56+
t.Logf("workspace agent stat: %v", row)
57+
}
58+
}
59+
tusRows, err := db.GetTemplateUsageStats(context.Background(), database.GetTemplateUsageStatsParams{
60+
StartTime: now.AddDate(0, -7, 0),
61+
EndTime: now,
62+
})
63+
if err == nil {
64+
for _, row := range tusRows {
65+
t.Logf("template usage stat: %v", row)
66+
}
67+
}
68+
}
69+
}()
4470

4571
ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitShort)
4672
defer cancel()
4773

48-
now := dbtime.Now()
49-
5074
// given
5175
// Let's use RxBytes to identify stat entries.
5276
// Stat inserted 6 months + 1 hour ago, should be deleted.
5377
first := dbgen.WorkspaceAgentStat(t, db, database.WorkspaceAgentStat{
54-
CreatedAt: now.Add(-6*30*24*time.Hour - time.Hour),
78+
CreatedAt: now.AddDate(0, -6, 0).Add(-time.Hour),
79+
ConnectionCount: 1,
5580
ConnectionMedianLatencyMS: 1,
5681
RxBytes: 1111,
82+
SessionCountSSH: 1,
5783
})
5884

59-
// Stat inserted 6 months - 1 hour ago, should not be deleted.
85+
// Stat inserted 6 months - 1 hour ago, should not be deleted before rollup.
6086
second := dbgen.WorkspaceAgentStat(t, db, database.WorkspaceAgentStat{
61-
CreatedAt: now.Add(-5*30*24*time.Hour + time.Hour),
87+
CreatedAt: now.AddDate(0, -6, 0).Add(time.Hour),
88+
ConnectionCount: 1,
6289
ConnectionMedianLatencyMS: 1,
6390
RxBytes: 2222,
91+
SessionCountSSH: 1,
92+
})
93+
94+
// Stat inserted 6 months - 1 day - 2 hour ago, should not be deleted at all.
95+
third := dbgen.WorkspaceAgentStat(t, db, database.WorkspaceAgentStat{
96+
CreatedAt: now.AddDate(0, -6, 0).AddDate(0, 0, 1).Add(2 * time.Hour),
97+
ConnectionCount: 1,
98+
ConnectionMedianLatencyMS: 1,
99+
RxBytes: 3333,
100+
SessionCountSSH: 1,
64101
})
65102

66103
// when
@@ -70,15 +107,39 @@ func TestDeleteOldWorkspaceAgentStats(t *testing.T) {
70107
// then
71108
var stats []database.GetWorkspaceAgentStatsRow
72109
var err error
73-
require.Eventually(t, func() bool {
110+
require.Eventuallyf(t, func() bool {
74111
// Query all stats created not earlier than 7 months ago
75-
stats, err = db.GetWorkspaceAgentStats(ctx, now.Add(-7*30*24*time.Hour))
112+
stats, err = db.GetWorkspaceAgentStats(ctx, now.AddDate(0, -7, 0))
76113
if err != nil {
77114
return false
78115
}
79116
return !containsWorkspaceAgentStat(stats, first) &&
80117
containsWorkspaceAgentStat(stats, second)
81-
}, testutil.WaitShort, testutil.IntervalFast, stats)
118+
}, testutil.WaitShort, testutil.IntervalFast, "it should delete old stats: %v", stats)
119+
120+
// when
121+
events := make(chan dbrollup.Event)
122+
rolluper := dbrollup.New(logger, db, dbrollup.WithEventChannel(events))
123+
defer rolluper.Close()
124+
125+
_, _ = <-events, <-events
126+
127+
// Start a new purger to immediately trigger delete after rollup.
128+
_ = closer.Close()
129+
closer = dbpurge.New(ctx, logger, db)
130+
defer closer.Close()
131+
132+
// then
133+
require.Eventuallyf(t, func() bool {
134+
// Query all stats created not earlier than 7 months ago
135+
stats, err = db.GetWorkspaceAgentStats(ctx, now.AddDate(0, -7, 0))
136+
if err != nil {
137+
return false
138+
}
139+
return !containsWorkspaceAgentStat(stats, first) &&
140+
!containsWorkspaceAgentStat(stats, second) &&
141+
containsWorkspaceAgentStat(stats, third)
142+
}, testutil.WaitShort, testutil.IntervalFast, "it should delete old stats after rollup: %v", stats)
82143
}
83144

84145
func containsWorkspaceAgentStat(stats []database.GetWorkspaceAgentStatsRow, needle database.WorkspaceAgentStat) bool {

coderd/database/queries.sql.go

Lines changed: 29 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

coderd/database/queries/workspaceagentstats.sql

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,35 @@ ORDER BY
6666
date ASC;
6767

6868
-- name: DeleteOldWorkspaceAgentStats :exec
69-
DELETE FROM workspace_agent_stats WHERE created_at < NOW() - INTERVAL '180 days';
69+
DELETE FROM
70+
workspace_agent_stats
71+
WHERE
72+
created_at < (
73+
SELECT
74+
COALESCE(
75+
-- When generating initial template usage stats, all the
76+
-- raw agent stats are needed, after that only ~30 mins
77+
-- from last rollup is needed. Deployment stats seem to
78+
-- use between 15 mins and 1 hour of data. We keep a
79+
-- little bit more (1 day) just in case.
80+
MAX(start_time) - '1 days'::interval,
81+
-- Fall back to 6 months ago if there are no template
82+
-- usage stats so that we don't delete the data before
83+
-- it's rolled up.
84+
NOW() - '6 months'::interval
85+
)
86+
FROM
87+
template_usage_stats
88+
)
89+
AND created_at < (
90+
-- Delete at most in batches of 3 days (with a batch size of 3 days, we
91+
-- can clear out the previous 6 months of data in ~60 iterations) whilst
92+
-- keeping the DB load relatively low.
93+
SELECT
94+
COALESCE(MIN(created_at) + '3 days'::interval, NOW())
95+
FROM
96+
workspace_agent_stats
97+
);
7098

7199
-- name: GetDeploymentWorkspaceAgentStats :one
72100
WITH agent_stats AS (

0 commit comments

Comments
 (0)