From 3dbc40d1137cdd4ade0ecfbef762ef2fa47767ec Mon Sep 17 00:00:00 2001 From: Cian Johnston Date: Mon, 31 Jul 2023 20:36:22 +0100 Subject: [PATCH 01/24] RED: add tests for batchstats.Batcher --- coderd/batchstats/batcher_test.go | 103 ++++++++++++++++++++++++++++++ 1 file changed, 103 insertions(+) create mode 100644 coderd/batchstats/batcher_test.go diff --git a/coderd/batchstats/batcher_test.go b/coderd/batchstats/batcher_test.go new file mode 100644 index 0000000000000..831a3d50f0934 --- /dev/null +++ b/coderd/batchstats/batcher_test.go @@ -0,0 +1,103 @@ +package batchstats_test + +import ( + "context" + "github.com/coder/coder/codersdk/agentsdk" + "github.com/stretchr/testify/require" + "testing" + "time" + + "cdr.dev/slog/sloggers/slogtest" + "github.com/coder/coder/coderd/batchstats" + "github.com/coder/coder/coderd/database" + "github.com/coder/coder/coderd/database/dbgen" + "github.com/coder/coder/coderd/database/dbtestutil" +) + +func TestBatchStats(t *testing.T) { + t.Parallel() + // Given: a fresh batcher with no data + ctx, cancel := context.WithCancel(context.Background()) + t.Cleanup(cancel) + log := slogtest.Make(t, &slogtest.Options{IgnoreErrors: true}) + store, _ := dbtestutil.NewDB(t) + ws1 := dbgen.Workspace(t, store, database.Workspace{ + LastUsedAt: time.Now().Add(-time.Hour), + }) + ws2 := dbgen.Workspace(t, store, database.Workspace{ + LastUsedAt: time.Now().Add(-time.Hour), + }) + startedAt := time.Now() + tick := make(chan time.Time) + + b, closer := batchstats.New( + batchstats.WithStore(store), + batchstats.WithLogger(log), + batchstats.WithTicker(tick), + ) + t.Cleanup(closer) + + // When: it becomes time to report stats + done := make(chan struct{}) + t.Cleanup(func() { + close(done) + }) + go func() { + b.Run(ctx) + }() + t1 := time.Now() + tick <- t1 + + // Then: it should report no stats. + stats, err := store.GetWorkspaceAgentStats(ctx, startedAt) + require.NoError(t, err) + require.Empty(t, stats) + + // Then: workspace last used time should not be updated + updated1, err := store.GetWorkspaceByID(ctx, ws1.ID) + require.NoError(t, err) + require.Equal(t, ws1.LastUsedAt, updated1.LastUsedAt) + updated2, err := store.GetWorkspaceByID(ctx, ws2.ID) + require.NoError(t, err) + require.Equal(t, ws2.LastUsedAt, updated2.LastUsedAt) + + // When: a single data point is added for ws1 + b.Add(agentsdk.AgentMetric{}) + // And it becomes time to report stats + t2 := time.Now() + tick <- t2 + + // Then: it should report a single stat. + stats, err = store.GetWorkspaceAgentStats(ctx, startedAt) + require.NoError(t, err) + require.Len(t, stats, 1) + + // Then: ws1 last used time should be updated + updated1, err = store.GetWorkspaceByID(ctx, ws1.ID) + require.NoError(t, err) + require.NotEqual(t, ws1.LastUsedAt, updated1.LastUsedAt) + // And: ws2 last used time should not be updated + updated2, err = store.GetWorkspaceByID(ctx, ws2.ID) + require.NoError(t, err) + require.Equal(t, ws2.LastUsedAt, updated2.LastUsedAt) + + // When: a lot of data points are added for both ws1 and ws2 + // (equal to batch size) + t3 := time.Now() + for i := 0; i < batchstats.DefaultBatchSize; i++ { + b.Add(agentsdk.AgentMetric{}) + } + + // Then: it should immediately flush its stats to store. + stats, err = store.GetWorkspaceAgentStats(ctx, t3) + require.NoError(t, err) + require.Len(t, stats, batchstats.DefaultBatchSize) + + // Then: ws1 and ws2 last used time should be updated + updated1, err = store.GetWorkspaceByID(ctx, ws1.ID) + require.NoError(t, err) + require.NotEqual(t, ws1.LastUsedAt, updated1.LastUsedAt) + updated2, err = store.GetWorkspaceByID(ctx, ws2.ID) + require.NoError(t, err) + require.NotEqual(t, ws2.LastUsedAt, updated2.LastUsedAt) +} From c302132bd35fea703c9238454a34a3c19ebcbdb5 Mon Sep 17 00:00:00 2001 From: Cian Johnston Date: Mon, 31 Jul 2023 20:37:12 +0100 Subject: [PATCH 02/24] RED: add initial batchstats.Batcher implementation and some TODOs --- coderd/batchstats/batcher.go | 135 +++++++++++++++++++++++++++++++++++ coderd/coderd.go | 1 + coderd/workspaceagents.go | 3 + 3 files changed, 139 insertions(+) create mode 100644 coderd/batchstats/batcher.go diff --git a/coderd/batchstats/batcher.go b/coderd/batchstats/batcher.go new file mode 100644 index 0000000000000..ae29504e1e2bb --- /dev/null +++ b/coderd/batchstats/batcher.go @@ -0,0 +1,135 @@ +package batchstats + +import ( + "context" + "os" + "sync" + "time" + + "cdr.dev/slog" + "cdr.dev/slog/sloggers/sloghuman" + "github.com/coder/coder/coderd/database" + "github.com/coder/coder/codersdk/agentsdk" +) + +const ( + // DefaultBatchInterval is the default interval at which the batcher + // flushes its buffer. + DefaultBatchInterval = 1 * time.Second + // DefaultBatchSize is the default size of the batcher's buffer. + DefaultBatchSize = 1024 +) + +// Batcher holds a buffer of agent stats and periodically flushes them to +// its configured store. It also updates the workspace's last used time. +type Batcher struct { + store database.Store + log slog.Logger + + mu sync.RWMutex + buf []agentsdk.AgentMetric + + ticker <-chan time.Time +} + +// Option is a functional option for configuring a Batcher. +type Option func(b *Batcher) + +// WithStore sets the store to use for storing stats. +func WithStore(store database.Store) Option { + return func(b *Batcher) { + b.store = store + } +} + +// WithBatchSize sets the number of stats to store in a batch. +func WithBatchSize(size int) Option { + return func(b *Batcher) { + b.buf = make([]agentsdk.AgentMetric, 0, size) + } +} + +// WithTicker sets the ticker to use for batching stats. +func WithTicker(ticker <-chan time.Time) Option { + return func(b *Batcher) { + b.ticker = ticker + } +} + +// WithLogger sets the logger to use for logging. +func WithLogger(log slog.Logger) Option { + return func(b *Batcher) { + b.log = log + } +} + +// New creates a new Batcher. +func New(opts ...Option) (*Batcher, func()) { + b := &Batcher{} + closer := func() {} + buf := make([]agentsdk.AgentMetric, 0, DefaultBatchSize) + b.buf = buf + b.log = slog.Make(sloghuman.Sink(os.Stderr)) + for _, opt := range opts { + opt(b) + } + + if b.store == nil { + panic("batcher needs a store") + } + + if b.ticker == nil { + t := time.NewTicker(DefaultBatchInterval) + b.ticker = t.C + closer = t.Stop + } + + return b, closer +} + +// Add adds a stat to the batcher. +func (b *Batcher) Add(m agentsdk.AgentMetric) { + b.mu.Lock() + defer b.mu.Unlock() + + b.buf = append(b.buf, m) +} + +// Run runs the batcher. +func (b *Batcher) Run(ctx context.Context) { + for { + select { + case tick, ok := <-b.ticker: + if !ok { + b.log.Warn(ctx, "ticker closed, flushing batcher") + b.flush(ctx, time.Now()) + return + } + b.flush(ctx, tick) + case <-ctx.Done(): + b.log.Warn(ctx, "context done, flushing batcher") + b.flush(ctx, time.Now()) + return + } + } +} + +// flush flushes the batcher's buffer. +func (b *Batcher) flush(ctx context.Context, now time.Time) { + b.mu.Lock() + defer b.mu.Unlock() + + if len(b.buf) == 0 { + return + } + + b.log.Debug(context.Background(), "flushing batcher", slog.F("count", len(b.buf))) + // TODO(cian): update the query to batch-insert multiple stats + for range b.buf { + if _, err := b.store.InsertWorkspaceAgentStat(ctx, database.InsertWorkspaceAgentStatParams{ + // TODO: fill + }); err != nil { + b.log.Error(context.Background(), "insert workspace agent stat", slog.Error(err)) + } + } +} diff --git a/coderd/coderd.go b/coderd/coderd.go index d7b80ff273097..25b611a5ac4fb 100644 --- a/coderd/coderd.go +++ b/coderd/coderd.go @@ -159,6 +159,7 @@ type Options struct { HTTPClient *http.Client + // TODO(Cian): This may need to be passed to batchstats.Batcher instead UpdateAgentMetrics func(ctx context.Context, username, workspaceName, agentName string, metrics []agentsdk.AgentMetric) } diff --git a/coderd/workspaceagents.go b/coderd/workspaceagents.go index 8567ff1d895b3..3a972e0e90a0b 100644 --- a/coderd/workspaceagents.go +++ b/coderd/workspaceagents.go @@ -1374,6 +1374,8 @@ func convertWorkspaceAgent(derpMap *tailcfg.DERPMap, coordinator tailnet.Coordin // @Success 200 {object} agentsdk.StatsResponse // @Router /workspaceagents/me/report-stats [post] func (api *API) workspaceAgentReportStats(rw http.ResponseWriter, r *http.Request) { + // TODO: here: instead of inserting directly, we should queue up the stats for + // periodic batch insert. ctx := r.Context() workspaceAgent := httpmw.WorkspaceAgent(r) @@ -1456,6 +1458,7 @@ func (api *API) workspaceAgentReportStats(rw http.ResponseWriter, r *http.Reques }) if api.Options.UpdateAgentMetrics != nil { errGroup.Go(func() error { + // TODO(Cian): why do we need to look up the user each time we post stats? user, err := api.Database.GetUserByID(ctx, workspace.OwnerID) if err != nil { return xerrors.Errorf("can't get user: %w", err) From 554fcf06ef173285cdb9117fed4321201d43cbbd Mon Sep 17 00:00:00 2001 From: Cian Johnston Date: Mon, 31 Jul 2023 22:36:31 +0100 Subject: [PATCH 03/24] wip --- coderd/batchstats/batcher.go | 93 +++++++++++++++++++++---------- coderd/batchstats/batcher_test.go | 91 ++++++++++++++++++++++++++++-- 2 files changed, 152 insertions(+), 32 deletions(-) diff --git a/coderd/batchstats/batcher.go b/coderd/batchstats/batcher.go index ae29504e1e2bb..6fd8e1166597d 100644 --- a/coderd/batchstats/batcher.go +++ b/coderd/batchstats/batcher.go @@ -2,6 +2,9 @@ package batchstats import ( "context" + "encoding/json" + "github.com/google/uuid" + "golang.org/x/xerrors" "os" "sync" "time" @@ -13,9 +16,6 @@ import ( ) const ( - // DefaultBatchInterval is the default interval at which the batcher - // flushes its buffer. - DefaultBatchInterval = 1 * time.Second // DefaultBatchSize is the default size of the batcher's buffer. DefaultBatchSize = 1024 ) @@ -27,9 +27,12 @@ type Batcher struct { log slog.Logger mu sync.RWMutex - buf []agentsdk.AgentMetric + buf []database.InsertWorkspaceAgentStatParams + // ticker is used to periodically flush the buffer. ticker <-chan time.Time + // flushLever is used to signal the flusher to flush the buffer immediately. + flushLever chan struct{} } // Option is a functional option for configuring a Batcher. @@ -45,14 +48,14 @@ func WithStore(store database.Store) Option { // WithBatchSize sets the number of stats to store in a batch. func WithBatchSize(size int) Option { return func(b *Batcher) { - b.buf = make([]agentsdk.AgentMetric, 0, size) + b.buf = make([]database.InsertWorkspaceAgentStatParams, 0, size) } } -// WithTicker sets the ticker to use for batching stats. -func WithTicker(ticker <-chan time.Time) Option { +// WithTicker sets the flush interval. +func WithTicker(ch <-chan time.Time) Option { return func(b *Batcher) { - b.ticker = ticker + b.ticker = ch } } @@ -64,10 +67,9 @@ func WithLogger(log slog.Logger) Option { } // New creates a new Batcher. -func New(opts ...Option) (*Batcher, func()) { +func New(opts ...Option) (*Batcher, error) { b := &Batcher{} - closer := func() {} - buf := make([]agentsdk.AgentMetric, 0, DefaultBatchSize) + buf := make([]database.InsertWorkspaceAgentStatParams, 0, DefaultBatchSize) b.buf = buf b.log = slog.Make(sloghuman.Sink(os.Stderr)) for _, opt := range opts { @@ -75,40 +77,73 @@ func New(opts ...Option) (*Batcher, func()) { } if b.store == nil { - panic("batcher needs a store") + return nil, xerrors.Errorf("no store configured for batcher") } if b.ticker == nil { - t := time.NewTicker(DefaultBatchInterval) - b.ticker = t.C - closer = t.Stop + return nil, xerrors.Errorf("no ticker configured for batcher") } - return b, closer + return b, nil } -// Add adds a stat to the batcher. -func (b *Batcher) Add(m agentsdk.AgentMetric) { +// Add adds a stat to the batcher for the given workspace and agent. +func (b *Batcher) Add( + ctx context.Context, + agentID uuid.UUID, + st agentsdk.Stats, +) error { b.mu.Lock() defer b.mu.Unlock() - b.buf = append(b.buf, m) + now := database.Now() + ws, err := b.store.GetWorkspaceByAgentID(ctx, agentID) + if err != nil { + return err + } + payload, err := json.Marshal(st.ConnectionsByProto) + if err != nil { + b.log.Error(ctx, "marshal agent connections by proto", + slog.F("workspace_agent_id", agentID), + slog.Error(err), + ) + payload = json.RawMessage("{}") + } + p := database.InsertWorkspaceAgentStatParams{ + ID: uuid.New(), + CreatedAt: now, + WorkspaceID: ws.ID, + UserID: ws.OwnerID, + TemplateID: ws.TemplateID, + ConnectionsByProto: payload, + ConnectionCount: st.ConnectionCount, + RxPackets: st.RxPackets, + RxBytes: st.RxBytes, + TxPackets: st.TxPackets, + TxBytes: st.TxBytes, + SessionCountVSCode: st.SessionCountVSCode, + SessionCountJetBrains: st.SessionCountJetBrains, + SessionCountReconnectingPTY: st.SessionCountReconnectingPTY, + SessionCountSSH: st.SessionCountSSH, + ConnectionMedianLatencyMS: st.ConnectionMedianLatencyMS, + } + b.buf = append(b.buf, p) + return nil } // Run runs the batcher. func (b *Batcher) Run(ctx context.Context) { for { select { - case tick, ok := <-b.ticker: - if !ok { - b.log.Warn(ctx, "ticker closed, flushing batcher") - b.flush(ctx, time.Now()) - return - } + case tick := <-b.ticker: b.flush(ctx, tick) + case <-b.flushLever: + // If the flush lever is depressed, flush the buffer immediately. + b.log.Warn(ctx, "flushing due to full buffer", slog.F("count", len(b.buf))) + b.flush(ctx, database.Now()) case <-ctx.Done(): - b.log.Warn(ctx, "context done, flushing batcher") - b.flush(ctx, time.Now()) + b.log.Warn(ctx, "context done, flushing before exit") + b.flush(ctx, database.Now()) return } } @@ -118,12 +153,14 @@ func (b *Batcher) Run(ctx context.Context) { func (b *Batcher) flush(ctx context.Context, now time.Time) { b.mu.Lock() defer b.mu.Unlock() + // TODO(Cian): After flushing, should we somehow reset the ticker? if len(b.buf) == 0 { + b.log.Debug(ctx, "nothing to flush") return } - b.log.Debug(context.Background(), "flushing batcher", slog.F("count", len(b.buf))) + b.log.Debug(context.Background(), "flushing buffer", slog.F("count", len(b.buf))) // TODO(cian): update the query to batch-insert multiple stats for range b.buf { if _, err := b.store.InsertWorkspaceAgentStat(ctx, database.InsertWorkspaceAgentStatParams{ diff --git a/coderd/batchstats/batcher_test.go b/coderd/batchstats/batcher_test.go index 831a3d50f0934..55eecba0e68b8 100644 --- a/coderd/batchstats/batcher_test.go +++ b/coderd/batchstats/batcher_test.go @@ -3,6 +3,7 @@ package batchstats_test import ( "context" "github.com/coder/coder/codersdk/agentsdk" + "github.com/google/uuid" "github.com/stretchr/testify/require" "testing" "time" @@ -27,15 +28,18 @@ func TestBatchStats(t *testing.T) { ws2 := dbgen.Workspace(t, store, database.Workspace{ LastUsedAt: time.Now().Add(-time.Hour), }) + // TODO: link to ws1 and ws2 + agt1 := dbgen.WorkspaceAgent(t, store, database.WorkspaceAgent{}) + agt2 := dbgen.WorkspaceAgent(t, store, database.WorkspaceAgent{}) startedAt := time.Now() tick := make(chan time.Time) - b, closer := batchstats.New( + b, err := batchstats.New( batchstats.WithStore(store), batchstats.WithLogger(log), batchstats.WithTicker(tick), ) - t.Cleanup(closer) + require.NoError(t, err) // When: it becomes time to report stats done := make(chan struct{}) @@ -62,7 +66,7 @@ func TestBatchStats(t *testing.T) { require.Equal(t, ws2.LastUsedAt, updated2.LastUsedAt) // When: a single data point is added for ws1 - b.Add(agentsdk.AgentMetric{}) + require.NoError(t, b.Add(ctx, agt1.ID, randAgentSDKStats(t))) // And it becomes time to report stats t2 := time.Now() tick <- t2 @@ -85,7 +89,11 @@ func TestBatchStats(t *testing.T) { // (equal to batch size) t3 := time.Now() for i := 0; i < batchstats.DefaultBatchSize; i++ { - b.Add(agentsdk.AgentMetric{}) + if i%2 == 0 { + require.NoError(t, b.Add(ctx, agt1.ID, randAgentSDKStats(t))) + } else { + require.NoError(t, b.Add(ctx, agt2.ID, randAgentSDKStats(t))) + } } // Then: it should immediately flush its stats to store. @@ -101,3 +109,78 @@ func TestBatchStats(t *testing.T) { require.NoError(t, err) require.NotEqual(t, ws2.LastUsedAt, updated2.LastUsedAt) } + +// randAgentSDKStats returns a random agentsdk.Stats +func randAgentSDKStats(t *testing.T, opts ...func(*agentsdk.Stats)) agentsdk.Stats { + t.Helper() + var s agentsdk.Stats + for _, opt := range opts { + opt(&s) + } + return s +} + +// randInsertWorkspaceAgentStatParams returns a random InsertWorkspaceAgentStatParams +func randInsertWorkspaceAgentStatParams(t *testing.T, opts ...func(params *database.InsertWorkspaceAgentStatParams)) database.InsertWorkspaceAgentStatParams { + t.Helper() + p := database.InsertWorkspaceAgentStatParams{ + ID: uuid.New(), + CreatedAt: time.Now(), + UserID: uuid.New(), + WorkspaceID: uuid.New(), + TemplateID: uuid.New(), + AgentID: uuid.New(), + ConnectionsByProto: []byte(`{"tcp": 1}`), + ConnectionCount: 1, + RxPackets: 1, + RxBytes: 1, + TxPackets: 1, + TxBytes: 1, + SessionCountVSCode: 1, + SessionCountJetBrains: 1, + SessionCountReconnectingPTY: 0, + SessionCountSSH: 0, + ConnectionMedianLatencyMS: 0, + } + for _, opt := range opts { + opt(&p) + } + return p +} + +//type InsertWorkspaceAgentStatParams struct { +// ID uuid.UUID `db:"id" json:"id"` +// CreatedAt time.Time `db:"created_at" json:"created_at"` +// UserID uuid.UUID `db:"user_id" json:"user_id"` +// WorkspaceID uuid.UUID `db:"workspace_id" json:"workspace_id"` +// TemplateID uuid.UUID `db:"template_id" json:"template_id"` +// AgentID uuid.UUID `db:"agent_id" json:"agent_id"` +// ConnectionsByProto json.RawMessage `db:"connections_by_proto" json:"connections_by_proto"` +// ConnectionCount int64 `db:"connection_count" json:"connection_count"` +// RxPackets int64 `db:"rx_packets" json:"rx_packets"` +// RxBytes int64 `db:"rx_bytes" json:"rx_bytes"` +// TxPackets int64 `db:"tx_packets" json:"tx_packets"` +// TxBytes int64 `db:"tx_bytes" json:"tx_bytes"` +// SessionCountVSCode int64 `db:"session_count_vscode" json:"session_count_vscode"` +// SessionCountJetBrains int64 `db:"session_count_jetbrains" json:"session_count_jetbrains"` +// SessionCountReconnectingPTY int64 `db:"session_count_reconnecting_pty" json:"session_count_reconnecting_pty"` +// SessionCountSSH int64 `db:"session_count_ssh" json:"session_count_ssh"` +// ConnectionMedianLatencyMS float64 `db:"connection_median_latency_ms" json:"connection_median_latency_ms"` +//} + +//type GetWorkspaceAgentStatsRow struct { +// UserID uuid.UUID `db:"user_id" json:"user_id"` +// AgentID uuid.UUID `db:"agent_id" json:"agent_id"` +// WorkspaceID uuid.UUID `db:"workspace_id" json:"workspace_id"` +// TemplateID uuid.UUID `db:"template_id" json:"template_id"` +// AggregatedFrom time.Time `db:"aggregated_from" json:"aggregated_from"` +// WorkspaceRxBytes int64 `db:"workspace_rx_bytes" json:"workspace_rx_bytes"` +// WorkspaceTxBytes int64 `db:"workspace_tx_bytes" json:"workspace_tx_bytes"` +// WorkspaceConnectionLatency50 float64 `db:"workspace_connection_latency_50" json:"workspace_connection_latency_50"` +// WorkspaceConnectionLatency95 float64 `db:"workspace_connection_latency_95" json:"workspace_connection_latency_95"` +// AgentID_2 uuid.UUID `db:"agent_id_2" json:"agent_id_2"` +// SessionCountVSCode int64 `db:"session_count_vscode" json:"session_count_vscode"` +// SessionCountSSH int64 `db:"session_count_ssh" json:"session_count_ssh"` +// SessionCountJetBrains int64 `db:"session_count_jetbrains" json:"session_count_jetbrains"` +// SessionCountReconnectingPTY int64 `db:"session_count_reconnecting_pty" json:"session_count_reconnecting_pty"` +//} From 04d017213371bce2ca99056cb8738343c83f46fd Mon Sep 17 00:00:00 2001 From: Cian Johnston Date: Tue, 1 Aug 2023 11:03:04 +0100 Subject: [PATCH 04/24] actually insert stats --- coderd/batchstats/batcher.go | 50 +++++++++---- coderd/batchstats/batcher_test.go | 116 ++++++++++++++++++------------ 2 files changed, 104 insertions(+), 62 deletions(-) diff --git a/coderd/batchstats/batcher.go b/coderd/batchstats/batcher.go index 6fd8e1166597d..6f1c58a16e83b 100644 --- a/coderd/batchstats/batcher.go +++ b/coderd/batchstats/batcher.go @@ -3,15 +3,18 @@ package batchstats import ( "context" "encoding/json" - "github.com/google/uuid" - "golang.org/x/xerrors" "os" "sync" "time" + "github.com/google/uuid" + "golang.org/x/xerrors" + "cdr.dev/slog" "cdr.dev/slog/sloggers/sloghuman" + "github.com/coder/coder/coderd/database" + "github.com/coder/coder/coderd/database/dbauthz" "github.com/coder/coder/codersdk/agentsdk" ) @@ -33,6 +36,8 @@ type Batcher struct { ticker <-chan time.Time // flushLever is used to signal the flusher to flush the buffer immediately. flushLever chan struct{} + // flushed is used during testing to signal that a flush has completed. + flushed chan struct{} } // Option is a functional option for configuring a Batcher. @@ -66,6 +71,13 @@ func WithLogger(log slog.Logger) Option { } } +// With Flushed sets the channel to use for signaling that a flush has completed. +func WithFlushed(ch chan struct{}) Option { + return func(b *Batcher) { + b.flushed = ch + } +} + // New creates a new Batcher. func New(opts ...Option) (*Batcher, error) { b := &Batcher{} @@ -96,10 +108,13 @@ func (b *Batcher) Add( b.mu.Lock() defer b.mu.Unlock() + // TODO(Cian): add a specific dbauthz context for this. + authCtx := dbauthz.AsSystemRestricted(ctx) now := database.Now() - ws, err := b.store.GetWorkspaceByAgentID(ctx, agentID) + // TODO(Cian): cache agentID -> workspaceID? + ws, err := b.store.GetWorkspaceByAgentID(authCtx, agentID) if err != nil { - return err + return xerrors.Errorf("get workspace by agent id: %w", err) } payload, err := json.Marshal(st.ConnectionsByProto) if err != nil { @@ -133,17 +148,18 @@ func (b *Batcher) Add( // Run runs the batcher. func (b *Batcher) Run(ctx context.Context) { + authCtx := dbauthz.AsSystemRestricted(ctx) for { select { case tick := <-b.ticker: - b.flush(ctx, tick) + b.flush(authCtx, tick) case <-b.flushLever: // If the flush lever is depressed, flush the buffer immediately. b.log.Warn(ctx, "flushing due to full buffer", slog.F("count", len(b.buf))) - b.flush(ctx, database.Now()) + b.flush(authCtx, database.Now()) case <-ctx.Done(): b.log.Warn(ctx, "context done, flushing before exit") - b.flush(ctx, database.Now()) + b.flush(authCtx, database.Now()) return } } @@ -151,22 +167,26 @@ func (b *Batcher) Run(ctx context.Context) { // flush flushes the batcher's buffer. func (b *Batcher) flush(ctx context.Context, now time.Time) { + // TODO(Cian): After flushing, should we somehow reset the ticker? b.mu.Lock() defer b.mu.Unlock() - // TODO(Cian): After flushing, should we somehow reset the ticker? - + defer func() { + // Notify that a flush has completed. + if b.flushed != nil { + b.flushed <- struct{}{} + } + }() if len(b.buf) == 0 { b.log.Debug(ctx, "nothing to flush") return } - b.log.Debug(context.Background(), "flushing buffer", slog.F("count", len(b.buf))) + b.log.Debug(ctx, "flushing buffer", slog.F("count", len(b.buf))) // TODO(cian): update the query to batch-insert multiple stats - for range b.buf { - if _, err := b.store.InsertWorkspaceAgentStat(ctx, database.InsertWorkspaceAgentStatParams{ - // TODO: fill - }); err != nil { - b.log.Error(context.Background(), "insert workspace agent stat", slog.Error(err)) + for _, p := range b.buf { + if _, err := b.store.InsertWorkspaceAgentStat(ctx, p); err != nil { + b.log.Error(ctx, "insert workspace agent stat", slog.Error(err)) } } + } diff --git a/coderd/batchstats/batcher_test.go b/coderd/batchstats/batcher_test.go index 55eecba0e68b8..bc809c62beff2 100644 --- a/coderd/batchstats/batcher_test.go +++ b/coderd/batchstats/batcher_test.go @@ -2,17 +2,18 @@ package batchstats_test import ( "context" - "github.com/coder/coder/codersdk/agentsdk" - "github.com/google/uuid" - "github.com/stretchr/testify/require" "testing" "time" + "github.com/stretchr/testify/require" + "cdr.dev/slog/sloggers/slogtest" + "github.com/coder/coder/coderd/batchstats" "github.com/coder/coder/coderd/database" "github.com/coder/coder/coderd/database/dbgen" "github.com/coder/coder/coderd/database/dbtestutil" + "github.com/coder/coder/codersdk/agentsdk" ) func TestBatchStats(t *testing.T) { @@ -22,22 +23,23 @@ func TestBatchStats(t *testing.T) { t.Cleanup(cancel) log := slogtest.Make(t, &slogtest.Options{IgnoreErrors: true}) store, _ := dbtestutil.NewDB(t) - ws1 := dbgen.Workspace(t, store, database.Workspace{ - LastUsedAt: time.Now().Add(-time.Hour), - }) - ws2 := dbgen.Workspace(t, store, database.Workspace{ - LastUsedAt: time.Now().Add(-time.Hour), - }) - // TODO: link to ws1 and ws2 - agt1 := dbgen.WorkspaceAgent(t, store, database.WorkspaceAgent{}) - agt2 := dbgen.WorkspaceAgent(t, store, database.WorkspaceAgent{}) + + // Set up some test dependencies. + deps1 := setupDeps(t, store) + ws1, err := store.GetWorkspaceByID(ctx, deps1.Workspace.ID) + deps2 := setupDeps(t, store) + require.NoError(t, err) + ws2, err := store.GetWorkspaceByID(ctx, deps2.Workspace.ID) + require.NoError(t, err) startedAt := time.Now() tick := make(chan time.Time) + flushed := make(chan struct{}) b, err := batchstats.New( batchstats.WithStore(store), batchstats.WithLogger(log), batchstats.WithTicker(tick), + batchstats.WithFlushed(flushed), ) require.NoError(t, err) @@ -51,6 +53,7 @@ func TestBatchStats(t *testing.T) { }() t1 := time.Now() tick <- t1 + <-flushed // Wait for a flush to complete. // Then: it should report no stats. stats, err := store.GetWorkspaceAgentStats(ctx, startedAt) @@ -66,13 +69,14 @@ func TestBatchStats(t *testing.T) { require.Equal(t, ws2.LastUsedAt, updated2.LastUsedAt) // When: a single data point is added for ws1 - require.NoError(t, b.Add(ctx, agt1.ID, randAgentSDKStats(t))) + require.NoError(t, b.Add(ctx, deps1.Agent.ID, randAgentSDKStats(t))) // And it becomes time to report stats t2 := time.Now() tick <- t2 + <-flushed // Wait for a flush to complete. // Then: it should report a single stat. - stats, err = store.GetWorkspaceAgentStats(ctx, startedAt) + stats, err = store.GetWorkspaceAgentStats(ctx, t1) require.NoError(t, err) require.Len(t, stats, 1) @@ -87,14 +91,16 @@ func TestBatchStats(t *testing.T) { // When: a lot of data points are added for both ws1 and ws2 // (equal to batch size) - t3 := time.Now() for i := 0; i < batchstats.DefaultBatchSize; i++ { if i%2 == 0 { - require.NoError(t, b.Add(ctx, agt1.ID, randAgentSDKStats(t))) + require.NoError(t, b.Add(ctx, deps1.Agent.ID, randAgentSDKStats(t))) } else { - require.NoError(t, b.Add(ctx, agt2.ID, randAgentSDKStats(t))) + require.NoError(t, b.Add(ctx, deps2.Agent.ID, randAgentSDKStats(t))) } } + t3 := time.Now() + tick <- t3 + <-flushed // Wait for a flush to complete. // Then: it should immediately flush its stats to store. stats, err = store.GetWorkspaceAgentStats(ctx, t3) @@ -120,35 +126,7 @@ func randAgentSDKStats(t *testing.T, opts ...func(*agentsdk.Stats)) agentsdk.Sta return s } -// randInsertWorkspaceAgentStatParams returns a random InsertWorkspaceAgentStatParams -func randInsertWorkspaceAgentStatParams(t *testing.T, opts ...func(params *database.InsertWorkspaceAgentStatParams)) database.InsertWorkspaceAgentStatParams { - t.Helper() - p := database.InsertWorkspaceAgentStatParams{ - ID: uuid.New(), - CreatedAt: time.Now(), - UserID: uuid.New(), - WorkspaceID: uuid.New(), - TemplateID: uuid.New(), - AgentID: uuid.New(), - ConnectionsByProto: []byte(`{"tcp": 1}`), - ConnectionCount: 1, - RxPackets: 1, - RxBytes: 1, - TxPackets: 1, - TxBytes: 1, - SessionCountVSCode: 1, - SessionCountJetBrains: 1, - SessionCountReconnectingPTY: 0, - SessionCountSSH: 0, - ConnectionMedianLatencyMS: 0, - } - for _, opt := range opts { - opt(&p) - } - return p -} - -//type InsertWorkspaceAgentStatParams struct { +// type InsertWorkspaceAgentStatParams struct { // ID uuid.UUID `db:"id" json:"id"` // CreatedAt time.Time `db:"created_at" json:"created_at"` // UserID uuid.UUID `db:"user_id" json:"user_id"` @@ -168,7 +146,7 @@ func randInsertWorkspaceAgentStatParams(t *testing.T, opts ...func(params *datab // ConnectionMedianLatencyMS float64 `db:"connection_median_latency_ms" json:"connection_median_latency_ms"` //} -//type GetWorkspaceAgentStatsRow struct { +// type GetWorkspaceAgentStatsRow struct { // UserID uuid.UUID `db:"user_id" json:"user_id"` // AgentID uuid.UUID `db:"agent_id" json:"agent_id"` // WorkspaceID uuid.UUID `db:"workspace_id" json:"workspace_id"` @@ -184,3 +162,47 @@ func randInsertWorkspaceAgentStatParams(t *testing.T, opts ...func(params *datab // SessionCountJetBrains int64 `db:"session_count_jetbrains" json:"session_count_jetbrains"` // SessionCountReconnectingPTY int64 `db:"session_count_reconnecting_pty" json:"session_count_reconnecting_pty"` //} + +type deps struct { + Agent database.WorkspaceAgent + Template database.Template + User database.User + Workspace database.Workspace +} + +func setupDeps(t *testing.T, store database.Store) deps { + t.Helper() + + user := dbgen.User(t, store, database.User{}) + tv := dbgen.TemplateVersion(t, store, database.TemplateVersion{}) + tpl := dbgen.Template(t, store, database.Template{ + CreatedBy: user.ID, + ActiveVersionID: tv.ID, + }) + ws := dbgen.Workspace(t, store, database.Workspace{ + TemplateID: tpl.ID, + OwnerID: user.ID, + LastUsedAt: time.Now().Add(-time.Hour), + }) + pj := dbgen.ProvisionerJob(t, store, database.ProvisionerJob{ + InitiatorID: user.ID, + }) + _ = dbgen.WorkspaceBuild(t, store, database.WorkspaceBuild{ + TemplateVersionID: tv.ID, + WorkspaceID: ws.ID, + JobID: pj.ID, + }) + res := dbgen.WorkspaceResource(t, store, database.WorkspaceResource{ + Transition: database.WorkspaceTransitionStart, + JobID: pj.ID, + }) + agt := dbgen.WorkspaceAgent(t, store, database.WorkspaceAgent{ + ResourceID: res.ID, + }) + return deps{ + Agent: agt, + Template: tpl, + User: user, + Workspace: ws, + } +} From 5ee8398068cbba6a41815d04969ddcc59b37242c Mon Sep 17 00:00:00 2001 From: Cian Johnston Date: Tue, 1 Aug 2023 14:31:20 +0100 Subject: [PATCH 05/24] GREEN (with postgres) --- coderd/batchstats/batcher.go | 31 ++++-- coderd/batchstats/batcher_test.go | 171 ++++++++++++++++++++---------- 2 files changed, 134 insertions(+), 68 deletions(-) diff --git a/coderd/batchstats/batcher.go b/coderd/batchstats/batcher.go index 6f1c58a16e83b..96802c9d2090a 100644 --- a/coderd/batchstats/batcher.go +++ b/coderd/batchstats/batcher.go @@ -37,7 +37,7 @@ type Batcher struct { // flushLever is used to signal the flusher to flush the buffer immediately. flushLever chan struct{} // flushed is used during testing to signal that a flush has completed. - flushed chan struct{} + flushed chan<- bool } // Option is a functional option for configuring a Batcher. @@ -72,7 +72,9 @@ func WithLogger(log slog.Logger) Option { } // With Flushed sets the channel to use for signaling that a flush has completed. -func WithFlushed(ch chan struct{}) Option { +// This is only used for testing. +// True signifies that a flush was forced. +func WithFlushed(ch chan bool) Option { return func(b *Batcher) { b.flushed = ch } @@ -84,6 +86,7 @@ func New(opts ...Option) (*Batcher, error) { buf := make([]database.InsertWorkspaceAgentStatParams, 0, DefaultBatchSize) b.buf = buf b.log = slog.Make(sloghuman.Sink(os.Stderr)) + b.flushLever = make(chan struct{}, 1) // Buffered so that it doesn't block. for _, opt := range opts { opt(b) } @@ -126,6 +129,7 @@ func (b *Batcher) Add( } p := database.InsertWorkspaceAgentStatParams{ ID: uuid.New(), + AgentID: agentID, CreatedAt: now, WorkspaceID: ws.ID, UserID: ws.OwnerID, @@ -143,6 +147,10 @@ func (b *Batcher) Add( ConnectionMedianLatencyMS: st.ConnectionMedianLatencyMS, } b.buf = append(b.buf, p) + if len(b.buf) == cap(b.buf) { + // If the buffer is full, signal the flusher to flush immediately. + b.flushLever <- struct{}{} + } return nil } @@ -151,37 +159,38 @@ func (b *Batcher) Run(ctx context.Context) { authCtx := dbauthz.AsSystemRestricted(ctx) for { select { - case tick := <-b.ticker: - b.flush(authCtx, tick) + case <-b.ticker: + b.flush(authCtx, false, "scheduled") case <-b.flushLever: // If the flush lever is depressed, flush the buffer immediately. - b.log.Warn(ctx, "flushing due to full buffer", slog.F("count", len(b.buf))) - b.flush(authCtx, database.Now()) + b.flush(authCtx, true, "full buffer") case <-ctx.Done(): b.log.Warn(ctx, "context done, flushing before exit") - b.flush(authCtx, database.Now()) + b.flush(authCtx, true, "exit") return } } } // flush flushes the batcher's buffer. -func (b *Batcher) flush(ctx context.Context, now time.Time) { +func (b *Batcher) flush(ctx context.Context, forced bool, reason string) { // TODO(Cian): After flushing, should we somehow reset the ticker? b.mu.Lock() defer b.mu.Unlock() defer func() { // Notify that a flush has completed. if b.flushed != nil { - b.flushed <- struct{}{} + b.flushed <- forced + b.log.Debug(ctx, "notify flush") } }() + + b.log.Debug(ctx, "flushing buffer", slog.F("count", len(b.buf)), slog.F("forced", forced)) if len(b.buf) == 0 { b.log.Debug(ctx, "nothing to flush") return } - b.log.Debug(ctx, "flushing buffer", slog.F("count", len(b.buf))) // TODO(cian): update the query to batch-insert multiple stats for _, p := range b.buf { if _, err := b.store.InsertWorkspaceAgentStat(ctx, p); err != nil { @@ -189,4 +198,6 @@ func (b *Batcher) flush(ctx context.Context, now time.Time) { } } + // Reset the buffer. + b.buf = b.buf[:0] } diff --git a/coderd/batchstats/batcher_test.go b/coderd/batchstats/batcher_test.go index bc809c62beff2..903b7f289adfe 100644 --- a/coderd/batchstats/batcher_test.go +++ b/coderd/batchstats/batcher_test.go @@ -5,44 +5,47 @@ import ( "testing" "time" + "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + "cdr.dev/slog" "cdr.dev/slog/sloggers/slogtest" "github.com/coder/coder/coderd/batchstats" "github.com/coder/coder/coderd/database" "github.com/coder/coder/coderd/database/dbgen" "github.com/coder/coder/coderd/database/dbtestutil" + "github.com/coder/coder/coderd/rbac" "github.com/coder/coder/codersdk/agentsdk" + "github.com/coder/coder/cryptorand" ) func TestBatchStats(t *testing.T) { + var ( + batchSize = batchstats.DefaultBatchSize + ) t.Parallel() // Given: a fresh batcher with no data ctx, cancel := context.WithCancel(context.Background()) t.Cleanup(cancel) - log := slogtest.Make(t, &slogtest.Options{IgnoreErrors: true}) + log := slogtest.Make(t, &slogtest.Options{IgnoreErrors: true}).Leveled(slog.LevelDebug) store, _ := dbtestutil.NewDB(t) // Set up some test dependencies. - deps1 := setupDeps(t, store) - ws1, err := store.GetWorkspaceByID(ctx, deps1.Workspace.ID) - deps2 := setupDeps(t, store) - require.NoError(t, err) - ws2, err := store.GetWorkspaceByID(ctx, deps2.Workspace.ID) - require.NoError(t, err) - startedAt := time.Now() + deps := setupDeps(t, store) tick := make(chan time.Time) - flushed := make(chan struct{}) + flushed := make(chan bool) b, err := batchstats.New( batchstats.WithStore(store), + batchstats.WithBatchSize(batchSize), batchstats.WithLogger(log), batchstats.WithTicker(tick), batchstats.WithFlushed(flushed), ) require.NoError(t, err) + // Given: no data points are added for workspace // When: it becomes time to report stats done := make(chan struct{}) t.Cleanup(func() { @@ -52,80 +55,112 @@ func TestBatchStats(t *testing.T) { b.Run(ctx) }() t1 := time.Now() + // Signal a tick and wait for a flush to complete. tick <- t1 - <-flushed // Wait for a flush to complete. + f := <-flushed + require.False(t, f, "flush should not have been forced") + t.Logf("flush 1 completed") // Then: it should report no stats. - stats, err := store.GetWorkspaceAgentStats(ctx, startedAt) + stats, err := store.GetWorkspaceAgentStats(ctx, t1) require.NoError(t, err) require.Empty(t, stats) - // Then: workspace last used time should not be updated - updated1, err := store.GetWorkspaceByID(ctx, ws1.ID) - require.NoError(t, err) - require.Equal(t, ws1.LastUsedAt, updated1.LastUsedAt) - updated2, err := store.GetWorkspaceByID(ctx, ws2.ID) - require.NoError(t, err) - require.Equal(t, ws2.LastUsedAt, updated2.LastUsedAt) - - // When: a single data point is added for ws1 - require.NoError(t, b.Add(ctx, deps1.Agent.ID, randAgentSDKStats(t))) - // And it becomes time to report stats + // Given: a single data point is added for workspace t2 := time.Now() + t.Logf("inserting 1 stat") + require.NoError(t, b.Add(ctx, deps.Agent.ID, randAgentSDKStats(t))) + + // When: it becomes time to report stats + // Signal a tick and wait for a flush to complete. tick <- t2 - <-flushed // Wait for a flush to complete. + f = <-flushed // Wait for a flush to complete. + require.False(t, f, "flush should not have been forced") + t.Logf("flush 2 completed") // Then: it should report a single stat. - stats, err = store.GetWorkspaceAgentStats(ctx, t1) + stats, err = store.GetWorkspaceAgentStats(ctx, t2) require.NoError(t, err) require.Len(t, stats, 1) - // Then: ws1 last used time should be updated - updated1, err = store.GetWorkspaceByID(ctx, ws1.ID) - require.NoError(t, err) - require.NotEqual(t, ws1.LastUsedAt, updated1.LastUsedAt) - // And: ws2 last used time should not be updated - updated2, err = store.GetWorkspaceByID(ctx, ws2.ID) - require.NoError(t, err) - require.Equal(t, ws2.LastUsedAt, updated2.LastUsedAt) - - // When: a lot of data points are added for both ws1 and ws2 + // Given: a lot of data points are added for workspace // (equal to batch size) - for i := 0; i < batchstats.DefaultBatchSize; i++ { - if i%2 == 0 { - require.NoError(t, b.Add(ctx, deps1.Agent.ID, randAgentSDKStats(t))) - } else { - require.NoError(t, b.Add(ctx, deps2.Agent.ID, randAgentSDKStats(t))) - } - } t3 := time.Now() - tick <- t3 - <-flushed // Wait for a flush to complete. + t.Logf("inserting %d stats", batchSize) + for i := 0; i < batchSize; i++ { + require.NoError(t, b.Add(ctx, deps.Agent.ID, randAgentSDKStats(t))) + } + + // When: the buffer is full + // Wait for a flush to complete. This should be forced by filling the buffer. + f = <-flushed + require.True(t, f, "flush should have been forced") + t.Logf("flush 3 completed") // Then: it should immediately flush its stats to store. stats, err = store.GetWorkspaceAgentStats(ctx, t3) require.NoError(t, err) - require.Len(t, stats, batchstats.DefaultBatchSize) - - // Then: ws1 and ws2 last used time should be updated - updated1, err = store.GetWorkspaceByID(ctx, ws1.ID) - require.NoError(t, err) - require.NotEqual(t, ws1.LastUsedAt, updated1.LastUsedAt) - updated2, err = store.GetWorkspaceByID(ctx, ws2.ID) - require.NoError(t, err) - require.NotEqual(t, ws2.LastUsedAt, updated2.LastUsedAt) + if assert.Len(t, stats, 1) { + assert.Greater(t, stats[0].AggregatedFrom, t3) + assert.Equal(t, stats[0].AgentID, deps.Agent.ID) + assert.Equal(t, stats[0].WorkspaceID, deps.Workspace.ID) + assert.Equal(t, stats[0].TemplateID, deps.Template.ID) + assert.NotZero(t, stats[0].WorkspaceRxBytes) + assert.NotZero(t, stats[0].WorkspaceTxBytes) + assert.NotZero(t, stats[0].WorkspaceConnectionLatency50) + assert.NotZero(t, stats[0].WorkspaceConnectionLatency95) + assert.NotZero(t, stats[0].SessionCountVSCode) + assert.NotZero(t, stats[0].SessionCountSSH) + assert.NotZero(t, stats[0].SessionCountJetBrains) + assert.NotZero(t, stats[0].SessionCountReconnectingPTY) + } } // randAgentSDKStats returns a random agentsdk.Stats func randAgentSDKStats(t *testing.T, opts ...func(*agentsdk.Stats)) agentsdk.Stats { t.Helper() - var s agentsdk.Stats + s := agentsdk.Stats{ + ConnectionsByProto: map[string]int64{ + "ssh": mustRandInt64n(t, 9) + 1, + "vscode": mustRandInt64n(t, 9) + 1, + "jetbrains": mustRandInt64n(t, 9) + 1, + "reconnecting_pty": mustRandInt64n(t, 9) + 1, + }, + ConnectionCount: mustRandInt64n(t, 99) + 1, + ConnectionMedianLatencyMS: float64(mustRandInt64n(t, 99) + 1), + RxPackets: mustRandInt64n(t, 99) + 1, + RxBytes: mustRandInt64n(t, 99) + 1, + TxPackets: mustRandInt64n(t, 99) + 1, + TxBytes: mustRandInt64n(t, 99) + 1, + SessionCountVSCode: mustRandInt64n(t, 9) + 1, + SessionCountJetBrains: mustRandInt64n(t, 9) + 1, + SessionCountReconnectingPTY: mustRandInt64n(t, 9) + 1, + SessionCountSSH: mustRandInt64n(t, 9) + 1, + Metrics: []agentsdk.AgentMetric{}, + } for _, opt := range opts { opt(&s) } return s } +// type Stats struct { +// ConnectionsByProto map[string]int64 `json:"connections_by_proto"` +// ConnectionCount int64 `json:"connection_count"` +// ConnectionMedianLatencyMS float64 `json:"connection_median_latency_ms"` +// RxPackets int64 `json:"rx_packets"` +// RxBytes int64 `json:"rx_bytes"` +// TxPackets int64 `json:"tx_packets"` +// TxBytes int64 `json:"tx_bytes"` +// SessionCountVSCode int64 `json:"session_count_vscode"` +// SessionCountJetBrains int64 `json:"session_count_jetbrains"` +// SessionCountReconnectingPTY int64 `json:"session_count_reconnecting_pty"` +// SessionCountSSH int64 `json:"session_count_ssh"` + +// // Metrics collected by the agent +// Metrics []AgentMetric `json:"metrics"` +// } + // type InsertWorkspaceAgentStatParams struct { // ID uuid.UUID `db:"id" json:"id"` // CreatedAt time.Time `db:"created_at" json:"created_at"` @@ -173,19 +208,32 @@ type deps struct { func setupDeps(t *testing.T, store database.Store) deps { t.Helper() + org := dbgen.Organization(t, store, database.Organization{}) user := dbgen.User(t, store, database.User{}) - tv := dbgen.TemplateVersion(t, store, database.TemplateVersion{}) + _, err := store.InsertOrganizationMember(context.Background(), database.InsertOrganizationMemberParams{ + OrganizationID: org.ID, + UserID: user.ID, + Roles: []string{rbac.RoleOrgMember(org.ID)}, + }) + require.NoError(t, err) + tv := dbgen.TemplateVersion(t, store, database.TemplateVersion{ + OrganizationID: org.ID, + CreatedBy: user.ID, + }) tpl := dbgen.Template(t, store, database.Template{ CreatedBy: user.ID, + OrganizationID: org.ID, ActiveVersionID: tv.ID, }) ws := dbgen.Workspace(t, store, database.Workspace{ - TemplateID: tpl.ID, - OwnerID: user.ID, - LastUsedAt: time.Now().Add(-time.Hour), + TemplateID: tpl.ID, + OwnerID: user.ID, + OrganizationID: org.ID, + LastUsedAt: time.Now().Add(-time.Hour), }) pj := dbgen.ProvisionerJob(t, store, database.ProvisionerJob{ - InitiatorID: user.ID, + InitiatorID: user.ID, + OrganizationID: org.ID, }) _ = dbgen.WorkspaceBuild(t, store, database.WorkspaceBuild{ TemplateVersionID: tv.ID, @@ -206,3 +254,10 @@ func setupDeps(t *testing.T, store database.Store) deps { Workspace: ws, } } + +func mustRandInt64n(t *testing.T, n int64) int64 { + t.Helper() + i, err := cryptorand.Intn(int(n)) + require.NoError(t, err) + return int64(i) +} From 666261f63d150bbc9204e3236f8154c7ab14c77b Mon Sep 17 00:00:00 2001 From: Cian Johnston Date: Tue, 1 Aug 2023 14:34:24 +0100 Subject: [PATCH 06/24] test with multiple workspaces --- coderd/batchstats/batcher_test.go | 39 ++++++++++++------------------- 1 file changed, 15 insertions(+), 24 deletions(-) diff --git a/coderd/batchstats/batcher_test.go b/coderd/batchstats/batcher_test.go index 903b7f289adfe..d0054712797ae 100644 --- a/coderd/batchstats/batcher_test.go +++ b/coderd/batchstats/batcher_test.go @@ -5,7 +5,6 @@ import ( "testing" "time" - "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" "cdr.dev/slog" @@ -32,7 +31,8 @@ func TestBatchStats(t *testing.T) { store, _ := dbtestutil.NewDB(t) // Set up some test dependencies. - deps := setupDeps(t, store) + deps1 := setupDeps(t, store) + deps2 := setupDeps(t, store) tick := make(chan time.Time) flushed := make(chan bool) @@ -63,13 +63,13 @@ func TestBatchStats(t *testing.T) { // Then: it should report no stats. stats, err := store.GetWorkspaceAgentStats(ctx, t1) - require.NoError(t, err) - require.Empty(t, stats) + require.NoError(t, err, "should not error getting stats") + require.Empty(t, stats, "should have no stats for workspace") // Given: a single data point is added for workspace t2 := time.Now() t.Logf("inserting 1 stat") - require.NoError(t, b.Add(ctx, deps.Agent.ID, randAgentSDKStats(t))) + require.NoError(t, b.Add(ctx, deps1.Agent.ID, randAgentSDKStats(t))) // When: it becomes time to report stats // Signal a tick and wait for a flush to complete. @@ -80,15 +80,19 @@ func TestBatchStats(t *testing.T) { // Then: it should report a single stat. stats, err = store.GetWorkspaceAgentStats(ctx, t2) - require.NoError(t, err) - require.Len(t, stats, 1) + require.NoError(t, err, "should not error getting stats") + require.Len(t, stats, 1, "should have stats for workspace") - // Given: a lot of data points are added for workspace + // Given: a lot of data points are added for both workspaces // (equal to batch size) t3 := time.Now() t.Logf("inserting %d stats", batchSize) for i := 0; i < batchSize; i++ { - require.NoError(t, b.Add(ctx, deps.Agent.ID, randAgentSDKStats(t))) + if i%2 == 0 { + require.NoError(t, b.Add(ctx, deps1.Agent.ID, randAgentSDKStats(t))) + } else { + require.NoError(t, b.Add(ctx, deps2.Agent.ID, randAgentSDKStats(t))) + } } // When: the buffer is full @@ -99,21 +103,8 @@ func TestBatchStats(t *testing.T) { // Then: it should immediately flush its stats to store. stats, err = store.GetWorkspaceAgentStats(ctx, t3) - require.NoError(t, err) - if assert.Len(t, stats, 1) { - assert.Greater(t, stats[0].AggregatedFrom, t3) - assert.Equal(t, stats[0].AgentID, deps.Agent.ID) - assert.Equal(t, stats[0].WorkspaceID, deps.Workspace.ID) - assert.Equal(t, stats[0].TemplateID, deps.Template.ID) - assert.NotZero(t, stats[0].WorkspaceRxBytes) - assert.NotZero(t, stats[0].WorkspaceTxBytes) - assert.NotZero(t, stats[0].WorkspaceConnectionLatency50) - assert.NotZero(t, stats[0].WorkspaceConnectionLatency95) - assert.NotZero(t, stats[0].SessionCountVSCode) - assert.NotZero(t, stats[0].SessionCountSSH) - assert.NotZero(t, stats[0].SessionCountJetBrains) - assert.NotZero(t, stats[0].SessionCountReconnectingPTY) - } + require.NoError(t, err, "should not error getting stats") + require.Len(t, stats, 2, "should have stats for both workspaces") } // randAgentSDKStats returns a random agentsdk.Stats From e856a71eaa30f22a050dbd647c90eba4ef0f11b3 Mon Sep 17 00:00:00 2001 From: Cian Johnston Date: Tue, 1 Aug 2023 14:41:25 +0100 Subject: [PATCH 07/24] rm comments --- coderd/batchstats/batcher_test.go | 60 ++++--------------------------- 1 file changed, 6 insertions(+), 54 deletions(-) diff --git a/coderd/batchstats/batcher_test.go b/coderd/batchstats/batcher_test.go index d0054712797ae..dfbb8eee0e4c3 100644 --- a/coderd/batchstats/batcher_test.go +++ b/coderd/batchstats/batcher_test.go @@ -135,60 +135,7 @@ func randAgentSDKStats(t *testing.T, opts ...func(*agentsdk.Stats)) agentsdk.Sta return s } -// type Stats struct { -// ConnectionsByProto map[string]int64 `json:"connections_by_proto"` -// ConnectionCount int64 `json:"connection_count"` -// ConnectionMedianLatencyMS float64 `json:"connection_median_latency_ms"` -// RxPackets int64 `json:"rx_packets"` -// RxBytes int64 `json:"rx_bytes"` -// TxPackets int64 `json:"tx_packets"` -// TxBytes int64 `json:"tx_bytes"` -// SessionCountVSCode int64 `json:"session_count_vscode"` -// SessionCountJetBrains int64 `json:"session_count_jetbrains"` -// SessionCountReconnectingPTY int64 `json:"session_count_reconnecting_pty"` -// SessionCountSSH int64 `json:"session_count_ssh"` - -// // Metrics collected by the agent -// Metrics []AgentMetric `json:"metrics"` -// } - -// type InsertWorkspaceAgentStatParams struct { -// ID uuid.UUID `db:"id" json:"id"` -// CreatedAt time.Time `db:"created_at" json:"created_at"` -// UserID uuid.UUID `db:"user_id" json:"user_id"` -// WorkspaceID uuid.UUID `db:"workspace_id" json:"workspace_id"` -// TemplateID uuid.UUID `db:"template_id" json:"template_id"` -// AgentID uuid.UUID `db:"agent_id" json:"agent_id"` -// ConnectionsByProto json.RawMessage `db:"connections_by_proto" json:"connections_by_proto"` -// ConnectionCount int64 `db:"connection_count" json:"connection_count"` -// RxPackets int64 `db:"rx_packets" json:"rx_packets"` -// RxBytes int64 `db:"rx_bytes" json:"rx_bytes"` -// TxPackets int64 `db:"tx_packets" json:"tx_packets"` -// TxBytes int64 `db:"tx_bytes" json:"tx_bytes"` -// SessionCountVSCode int64 `db:"session_count_vscode" json:"session_count_vscode"` -// SessionCountJetBrains int64 `db:"session_count_jetbrains" json:"session_count_jetbrains"` -// SessionCountReconnectingPTY int64 `db:"session_count_reconnecting_pty" json:"session_count_reconnecting_pty"` -// SessionCountSSH int64 `db:"session_count_ssh" json:"session_count_ssh"` -// ConnectionMedianLatencyMS float64 `db:"connection_median_latency_ms" json:"connection_median_latency_ms"` -//} - -// type GetWorkspaceAgentStatsRow struct { -// UserID uuid.UUID `db:"user_id" json:"user_id"` -// AgentID uuid.UUID `db:"agent_id" json:"agent_id"` -// WorkspaceID uuid.UUID `db:"workspace_id" json:"workspace_id"` -// TemplateID uuid.UUID `db:"template_id" json:"template_id"` -// AggregatedFrom time.Time `db:"aggregated_from" json:"aggregated_from"` -// WorkspaceRxBytes int64 `db:"workspace_rx_bytes" json:"workspace_rx_bytes"` -// WorkspaceTxBytes int64 `db:"workspace_tx_bytes" json:"workspace_tx_bytes"` -// WorkspaceConnectionLatency50 float64 `db:"workspace_connection_latency_50" json:"workspace_connection_latency_50"` -// WorkspaceConnectionLatency95 float64 `db:"workspace_connection_latency_95" json:"workspace_connection_latency_95"` -// AgentID_2 uuid.UUID `db:"agent_id_2" json:"agent_id_2"` -// SessionCountVSCode int64 `db:"session_count_vscode" json:"session_count_vscode"` -// SessionCountSSH int64 `db:"session_count_ssh" json:"session_count_ssh"` -// SessionCountJetBrains int64 `db:"session_count_jetbrains" json:"session_count_jetbrains"` -// SessionCountReconnectingPTY int64 `db:"session_count_reconnecting_pty" json:"session_count_reconnecting_pty"` -//} - +// deps is a set of test dependencies. type deps struct { Agent database.WorkspaceAgent Template database.Template @@ -196,6 +143,10 @@ type deps struct { Workspace database.Workspace } +// setupDeps sets up a set of test dependencies. +// It creates an organization, user, template, workspace, and agent +// along with all the other miscellaneous plumbing required to link +// them together. func setupDeps(t *testing.T, store database.Store) deps { t.Helper() @@ -246,6 +197,7 @@ func setupDeps(t *testing.T, store database.Store) deps { } } +// mustRandInt64n returns a random int64 in the range [0, n). func mustRandInt64n(t *testing.T, n int64) int64 { t.Helper() i, err := cryptorand.Intn(int(n)) From 270497f1431739dc340f191315447b00f6f9b367 Mon Sep 17 00:00:00 2001 From: Cian Johnston Date: Tue, 1 Aug 2023 15:12:11 +0100 Subject: [PATCH 08/24] fix null IDs in FakeQuerier.GetWorkspaceAgentStats --- coderd/batchstats/batcher_test.go | 4 +++- coderd/database/dbfake/dbfake.go | 8 ++++++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/coderd/batchstats/batcher_test.go b/coderd/batchstats/batcher_test.go index dfbb8eee0e4c3..a7920d01f6d19 100644 --- a/coderd/batchstats/batcher_test.go +++ b/coderd/batchstats/batcher_test.go @@ -20,10 +20,12 @@ import ( ) func TestBatchStats(t *testing.T) { + t.Parallel() + var ( batchSize = batchstats.DefaultBatchSize ) - t.Parallel() + // Given: a fresh batcher with no data ctx, cancel := context.WithCancel(context.Background()) t.Cleanup(cancel) diff --git a/coderd/database/dbfake/dbfake.go b/coderd/database/dbfake/dbfake.go index 8e29bae4349fb..fe2cc25ba7390 100644 --- a/coderd/database/dbfake/dbfake.go +++ b/coderd/database/dbfake/dbfake.go @@ -2701,8 +2701,12 @@ func (q *FakeQuerier) GetWorkspaceAgentStats(_ context.Context, createdAfter tim } statByAgent := map[uuid.UUID]database.GetWorkspaceAgentStatsRow{} - for _, agentStat := range latestAgentStats { - stat := statByAgent[agentStat.AgentID] + for agentID, agentStat := range latestAgentStats { + stat := statByAgent[agentID] + stat.AgentID = agentStat.AgentID + stat.TemplateID = agentStat.TemplateID + stat.UserID = agentStat.UserID + stat.WorkspaceID = agentStat.WorkspaceID stat.SessionCountVSCode += agentStat.SessionCountVSCode stat.SessionCountJetBrains += agentStat.SessionCountJetBrains stat.SessionCountReconnectingPTY += agentStat.SessionCountReconnectingPTY From 1fa285cf9d935ea179f2866f230d94bb8825e716 Mon Sep 17 00:00:00 2001 From: Cian Johnston Date: Tue, 1 Aug 2023 17:32:47 +0100 Subject: [PATCH 09/24] first stab at batch query --- coderd/batchstats/batcher.go | 150 ++++++++++++++---- coderd/database/dbauthz/dbauthz.go | 4 + coderd/database/dbfake/dbfake.go | 9 ++ coderd/database/dbmetrics/dbmetrics.go | 7 + coderd/database/dbmock/dbmock.go | 14 ++ coderd/database/querier.go | 1 + coderd/database/queries.sql.go | 66 ++++++++ .../database/queries/workspaceagentstats.sql | 22 +++ 8 files changed, 238 insertions(+), 35 deletions(-) diff --git a/coderd/batchstats/batcher.go b/coderd/batchstats/batcher.go index 96802c9d2090a..0a428eabb91a2 100644 --- a/coderd/batchstats/batcher.go +++ b/coderd/batchstats/batcher.go @@ -23,14 +23,41 @@ const ( DefaultBatchSize = 1024 ) +// batchInsertParams is a struct used to batch-insert stats. +// It looks weird because of how we insert the data into the database +// (using unnest). +// Ideally we would just use COPY FROM but that isn't supported by +// sqlc using lib/pq. At a later date we should consider switching to +// pgx. See: https://docs.sqlc.dev/en/stable/guides/using-go-and-pgx.html +type batchInsertParams struct { + IDs []uuid.UUID + CreatedAts []time.Time + UserIDs []uuid.UUID + WorkspaceIDs []uuid.UUID + TemplateIDs []uuid.UUID + AgentIDs []uuid.UUID + ConnectionsByProtos []json.RawMessage + ConnectionCounts []int64 + RxPacketses []int64 + RxByteses []int64 + TxPacketses []int64 + TxByteses []int64 + SessionCountVSCodes []int64 + SessionCountJetBrainses []int64 + SessionCountReconnectingPTYs []int64 + SessionCountSSHs []int64 + ConnectionMedianLatencyMSes []float64 +} + // Batcher holds a buffer of agent stats and periodically flushes them to // its configured store. It also updates the workspace's last used time. type Batcher struct { store database.Store log slog.Logger - mu sync.RWMutex - buf []database.InsertWorkspaceAgentStatParams + mu sync.RWMutex + buf batchInsertParams + batchSize int // ticker is used to periodically flush the buffer. ticker <-chan time.Time @@ -53,7 +80,7 @@ func WithStore(store database.Store) Option { // WithBatchSize sets the number of stats to store in a batch. func WithBatchSize(size int) Option { return func(b *Batcher) { - b.buf = make([]database.InsertWorkspaceAgentStatParams, 0, size) + b.batchSize = size } } @@ -83,8 +110,6 @@ func WithFlushed(ch chan bool) Option { // New creates a new Batcher. func New(opts ...Option) (*Batcher, error) { b := &Batcher{} - buf := make([]database.InsertWorkspaceAgentStatParams, 0, DefaultBatchSize) - b.buf = buf b.log = slog.Make(sloghuman.Sink(os.Stderr)) b.flushLever = make(chan struct{}, 1) // Buffered so that it doesn't block. for _, opt := range opts { @@ -99,6 +124,30 @@ func New(opts ...Option) (*Batcher, error) { return nil, xerrors.Errorf("no ticker configured for batcher") } + if b.batchSize == 0 { + b.batchSize = DefaultBatchSize + } + + b.buf = batchInsertParams{ + IDs: make([]uuid.UUID, 0, b.batchSize), + CreatedAts: make([]time.Time, 0, b.batchSize), + UserIDs: make([]uuid.UUID, 0, b.batchSize), + WorkspaceIDs: make([]uuid.UUID, 0, b.batchSize), + TemplateIDs: make([]uuid.UUID, 0, b.batchSize), + AgentIDs: make([]uuid.UUID, 0, b.batchSize), + ConnectionsByProtos: make([]json.RawMessage, 0, b.batchSize), + ConnectionCounts: make([]int64, 0, b.batchSize), + RxPacketses: make([]int64, 0, b.batchSize), + RxByteses: make([]int64, 0, b.batchSize), + TxPacketses: make([]int64, 0, b.batchSize), + TxByteses: make([]int64, 0, b.batchSize), + SessionCountVSCodes: make([]int64, 0, b.batchSize), + SessionCountJetBrainses: make([]int64, 0, b.batchSize), + SessionCountReconnectingPTYs: make([]int64, 0, b.batchSize), + SessionCountSSHs: make([]int64, 0, b.batchSize), + ConnectionMedianLatencyMSes: make([]float64, 0, b.batchSize), + } + return b, nil } @@ -127,28 +176,27 @@ func (b *Batcher) Add( ) payload = json.RawMessage("{}") } - p := database.InsertWorkspaceAgentStatParams{ - ID: uuid.New(), - AgentID: agentID, - CreatedAt: now, - WorkspaceID: ws.ID, - UserID: ws.OwnerID, - TemplateID: ws.TemplateID, - ConnectionsByProto: payload, - ConnectionCount: st.ConnectionCount, - RxPackets: st.RxPackets, - RxBytes: st.RxBytes, - TxPackets: st.TxPackets, - TxBytes: st.TxBytes, - SessionCountVSCode: st.SessionCountVSCode, - SessionCountJetBrains: st.SessionCountJetBrains, - SessionCountReconnectingPTY: st.SessionCountReconnectingPTY, - SessionCountSSH: st.SessionCountSSH, - ConnectionMedianLatencyMS: st.ConnectionMedianLatencyMS, - } - b.buf = append(b.buf, p) - if len(b.buf) == cap(b.buf) { - // If the buffer is full, signal the flusher to flush immediately. + + b.buf.IDs = append(b.buf.IDs, uuid.New()) + b.buf.AgentIDs = append(b.buf.AgentIDs, agentID) + b.buf.CreatedAts = append(b.buf.CreatedAts, now) + b.buf.UserIDs = append(b.buf.UserIDs, ws.OwnerID) + b.buf.WorkspaceIDs = append(b.buf.WorkspaceIDs, ws.ID) + b.buf.TemplateIDs = append(b.buf.TemplateIDs, ws.TemplateID) + b.buf.ConnectionsByProtos = append(b.buf.ConnectionsByProtos, payload) + b.buf.ConnectionCounts = append(b.buf.ConnectionCounts, st.ConnectionCount) + b.buf.RxPacketses = append(b.buf.RxPacketses, st.RxPackets) + b.buf.RxByteses = append(b.buf.RxByteses, st.RxBytes) + b.buf.TxPacketses = append(b.buf.TxPacketses, st.TxPackets) + b.buf.TxByteses = append(b.buf.TxByteses, st.TxBytes) + b.buf.SessionCountVSCodes = append(b.buf.SessionCountVSCodes, st.SessionCountVSCode) + b.buf.SessionCountJetBrainses = append(b.buf.SessionCountJetBrainses, st.SessionCountJetBrains) + b.buf.SessionCountReconnectingPTYs = append(b.buf.SessionCountReconnectingPTYs, st.SessionCountReconnectingPTY) + b.buf.SessionCountSSHs = append(b.buf.SessionCountSSHs, st.SessionCountSSH) + b.buf.ConnectionMedianLatencyMSes = append(b.buf.ConnectionMedianLatencyMSes, st.ConnectionMedianLatencyMS) + + // If the buffer is full, signal the flusher to flush immediately. + if len(b.buf.IDs) == cap(b.buf.IDs) { b.flushLever <- struct{}{} } return nil @@ -185,19 +233,51 @@ func (b *Batcher) flush(ctx context.Context, forced bool, reason string) { } }() - b.log.Debug(ctx, "flushing buffer", slog.F("count", len(b.buf)), slog.F("forced", forced)) - if len(b.buf) == 0 { + b.log.Debug(ctx, "flushing buffer", slog.F("count", len(b.buf.IDs)), slog.F("forced", forced)) + if len(b.buf.IDs) == 0 { b.log.Debug(ctx, "nothing to flush") return } - // TODO(cian): update the query to batch-insert multiple stats - for _, p := range b.buf { - if _, err := b.store.InsertWorkspaceAgentStat(ctx, p); err != nil { - b.log.Error(ctx, "insert workspace agent stat", slog.Error(err)) - } + if err := b.store.InsertWorkspaceAgentStats(ctx, database.InsertWorkspaceAgentStatsParams{ + ID: b.buf.IDs, + CreatedAt: b.buf.CreatedAts, + UserID: b.buf.UserIDs, + WorkspaceID: b.buf.WorkspaceIDs, + TemplateID: b.buf.TemplateIDs, + AgentID: b.buf.AgentIDs, + ConnectionsByProto: b.buf.ConnectionsByProtos, + ConnectionCount: b.buf.ConnectionCounts, + RxPackets: b.buf.RxPacketses, + RxBytes: b.buf.RxByteses, + TxPackets: b.buf.TxPacketses, + TxBytes: b.buf.TxByteses, + SessionCountVSCode: b.buf.SessionCountVSCodes, + SessionCountJetBrains: b.buf.SessionCountJetBrainses, + SessionCountReconnectingPTY: b.buf.SessionCountReconnectingPTYs, + SessionCountSSH: b.buf.SessionCountSSHs, + ConnectionMedianLatencyMS: b.buf.ConnectionMedianLatencyMSes, + }); err != nil { + b.log.Error(ctx, "insert workspace agent stats", slog.Error(err)) } // Reset the buffer. - b.buf = b.buf[:0] + // b.buf = b.buf[:0] + b.buf.IDs = b.buf.IDs[:0] + b.buf.CreatedAts = b.buf.CreatedAts[:0] + b.buf.UserIDs = b.buf.UserIDs[:0] + b.buf.WorkspaceIDs = b.buf.WorkspaceIDs[:0] + b.buf.TemplateIDs = b.buf.TemplateIDs[:0] + b.buf.AgentIDs = b.buf.AgentIDs[:0] + b.buf.ConnectionsByProtos = b.buf.ConnectionsByProtos[:0] + b.buf.ConnectionCounts = b.buf.ConnectionCounts[:0] + b.buf.RxPacketses = b.buf.RxPacketses[:0] + b.buf.RxByteses = b.buf.RxByteses[:0] + b.buf.TxPacketses = b.buf.TxPacketses[:0] + b.buf.TxByteses = b.buf.TxByteses[:0] + b.buf.SessionCountVSCodes = b.buf.SessionCountVSCodes[:0] + b.buf.SessionCountJetBrainses = b.buf.SessionCountJetBrainses[:0] + b.buf.SessionCountReconnectingPTYs = b.buf.SessionCountReconnectingPTYs[:0] + b.buf.SessionCountSSHs = b.buf.SessionCountSSHs[:0] + b.buf.ConnectionMedianLatencyMSes = b.buf.ConnectionMedianLatencyMSes[:0] } diff --git a/coderd/database/dbauthz/dbauthz.go b/coderd/database/dbauthz/dbauthz.go index afeab08de9873..4d5a74afcc114 100644 --- a/coderd/database/dbauthz/dbauthz.go +++ b/coderd/database/dbauthz/dbauthz.go @@ -1997,6 +1997,10 @@ func (q *querier) InsertWorkspaceAgentStat(ctx context.Context, arg database.Ins return q.db.InsertWorkspaceAgentStat(ctx, arg) } +func (q *querier) InsertWorkspaceAgentStats(ctx context.Context, arg database.InsertWorkspaceAgentStatsParams) error { + panic("not implemented") +} + func (q *querier) InsertWorkspaceApp(ctx context.Context, arg database.InsertWorkspaceAppParams) (database.WorkspaceApp, error) { if err := q.authorizeContext(ctx, rbac.ActionCreate, rbac.ResourceSystem); err != nil { return database.WorkspaceApp{}, err diff --git a/coderd/database/dbfake/dbfake.go b/coderd/database/dbfake/dbfake.go index fe2cc25ba7390..cb3f5ced88cf4 100644 --- a/coderd/database/dbfake/dbfake.go +++ b/coderd/database/dbfake/dbfake.go @@ -4070,6 +4070,15 @@ func (q *FakeQuerier) InsertWorkspaceAgentStat(_ context.Context, p database.Ins return stat, nil } +func (q *FakeQuerier) InsertWorkspaceAgentStats(ctx context.Context, arg database.InsertWorkspaceAgentStatsParams) error { + err := validateDatabaseType(arg) + if err != nil { + return err + } + + panic("not implemented") +} + func (q *FakeQuerier) InsertWorkspaceApp(_ context.Context, arg database.InsertWorkspaceAppParams) (database.WorkspaceApp, error) { if err := validateDatabaseType(arg); err != nil { return database.WorkspaceApp{}, err diff --git a/coderd/database/dbmetrics/dbmetrics.go b/coderd/database/dbmetrics/dbmetrics.go index 95dde653ca547..d0002b60653ab 100644 --- a/coderd/database/dbmetrics/dbmetrics.go +++ b/coderd/database/dbmetrics/dbmetrics.go @@ -1229,6 +1229,13 @@ func (m metricsStore) InsertWorkspaceAgentStat(ctx context.Context, arg database return stat, err } +func (m metricsStore) InsertWorkspaceAgentStats(ctx context.Context, arg database.InsertWorkspaceAgentStatsParams) error { + start := time.Now() + r0 := m.s.InsertWorkspaceAgentStats(ctx, arg) + m.queryLatencies.WithLabelValues("InsertWorkspaceAgentStats").Observe(time.Since(start).Seconds()) + return r0 +} + func (m metricsStore) InsertWorkspaceApp(ctx context.Context, arg database.InsertWorkspaceAppParams) (database.WorkspaceApp, error) { start := time.Now() app, err := m.s.InsertWorkspaceApp(ctx, arg) diff --git a/coderd/database/dbmock/dbmock.go b/coderd/database/dbmock/dbmock.go index f6ee26a15f817..70fa1c29d0100 100644 --- a/coderd/database/dbmock/dbmock.go +++ b/coderd/database/dbmock/dbmock.go @@ -2583,6 +2583,20 @@ func (mr *MockStoreMockRecorder) InsertWorkspaceAgentStat(arg0, arg1 interface{} return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "InsertWorkspaceAgentStat", reflect.TypeOf((*MockStore)(nil).InsertWorkspaceAgentStat), arg0, arg1) } +// InsertWorkspaceAgentStats mocks base method. +func (m *MockStore) InsertWorkspaceAgentStats(arg0 context.Context, arg1 database.InsertWorkspaceAgentStatsParams) error { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "InsertWorkspaceAgentStats", arg0, arg1) + ret0, _ := ret[0].(error) + return ret0 +} + +// InsertWorkspaceAgentStats indicates an expected call of InsertWorkspaceAgentStats. +func (mr *MockStoreMockRecorder) InsertWorkspaceAgentStats(arg0, arg1 interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "InsertWorkspaceAgentStats", reflect.TypeOf((*MockStore)(nil).InsertWorkspaceAgentStats), arg0, arg1) +} + // InsertWorkspaceApp mocks base method. func (m *MockStore) InsertWorkspaceApp(arg0 context.Context, arg1 database.InsertWorkspaceAppParams) (database.WorkspaceApp, error) { m.ctrl.T.Helper() diff --git a/coderd/database/querier.go b/coderd/database/querier.go index be33b00bfc51b..9ceede458eb3a 100644 --- a/coderd/database/querier.go +++ b/coderd/database/querier.go @@ -220,6 +220,7 @@ type sqlcQuerier interface { InsertWorkspaceAgentLogs(ctx context.Context, arg InsertWorkspaceAgentLogsParams) ([]WorkspaceAgentLog, error) InsertWorkspaceAgentMetadata(ctx context.Context, arg InsertWorkspaceAgentMetadataParams) error InsertWorkspaceAgentStat(ctx context.Context, arg InsertWorkspaceAgentStatParams) (WorkspaceAgentStat, error) + InsertWorkspaceAgentStats(ctx context.Context, arg InsertWorkspaceAgentStatsParams) error InsertWorkspaceApp(ctx context.Context, arg InsertWorkspaceAppParams) (WorkspaceApp, error) InsertWorkspaceBuild(ctx context.Context, arg InsertWorkspaceBuildParams) error InsertWorkspaceBuildParameters(ctx context.Context, arg InsertWorkspaceBuildParametersParams) error diff --git a/coderd/database/queries.sql.go b/coderd/database/queries.sql.go index 87d86e9621fbb..bf943513849af 100644 --- a/coderd/database/queries.sql.go +++ b/coderd/database/queries.sql.go @@ -7245,6 +7245,72 @@ func (q *sqlQuerier) InsertWorkspaceAgentStat(ctx context.Context, arg InsertWor return i, err } +const insertWorkspaceAgentStats = `-- name: InsertWorkspaceAgentStats :exec +INSERT INTO + workspace_agent_stats +SELECT + unnest($1 :: uuid[]) AS id, + unnest($2 :: timestamptz[]) AS created_at, + unnest($3 :: uuid[]) AS user_id, + unnest($4 :: uuid[]) AS workspace_id, + unnest($5 :: uuid[]) AS template_id, + unnest($6 :: uuid[]) AS agent_id, + unnest($7 :: jsonb[]) AS connections_by_proto, + unnest($8 :: bigint[]) AS connection_count, + unnest($9 :: bigint[]) AS rx_packets, + unnest($10 :: bigint[]) AS rx_bytes, + unnest($11 :: bigint[]) AS tx_packets, + unnest($12 :: bigint[]) AS tx_bytes, + unnest($13 :: bigint[]) AS session_count_vscode, + unnest($14 :: bigint[]) AS session_count_jetbrains, + unnest($15 :: bigint[]) AS session_count_reconnecting_pty, + unnest($16 :: bigint[]) AS session_count_ssh, + unnest($17 :: double precision[]) AS connection_median_latency_ms +` + +type InsertWorkspaceAgentStatsParams struct { + ID []uuid.UUID `db:"id" json:"id"` + CreatedAt []time.Time `db:"created_at" json:"created_at"` + UserID []uuid.UUID `db:"user_id" json:"user_id"` + WorkspaceID []uuid.UUID `db:"workspace_id" json:"workspace_id"` + TemplateID []uuid.UUID `db:"template_id" json:"template_id"` + AgentID []uuid.UUID `db:"agent_id" json:"agent_id"` + ConnectionsByProto []json.RawMessage `db:"connections_by_proto" json:"connections_by_proto"` + ConnectionCount []int64 `db:"connection_count" json:"connection_count"` + RxPackets []int64 `db:"rx_packets" json:"rx_packets"` + RxBytes []int64 `db:"rx_bytes" json:"rx_bytes"` + TxPackets []int64 `db:"tx_packets" json:"tx_packets"` + TxBytes []int64 `db:"tx_bytes" json:"tx_bytes"` + SessionCountVSCode []int64 `db:"session_count_vscode" json:"session_count_vscode"` + SessionCountJetBrains []int64 `db:"session_count_jetbrains" json:"session_count_jetbrains"` + SessionCountReconnectingPTY []int64 `db:"session_count_reconnecting_pty" json:"session_count_reconnecting_pty"` + SessionCountSSH []int64 `db:"session_count_ssh" json:"session_count_ssh"` + ConnectionMedianLatencyMS []float64 `db:"connection_median_latency_ms" json:"connection_median_latency_ms"` +} + +func (q *sqlQuerier) InsertWorkspaceAgentStats(ctx context.Context, arg InsertWorkspaceAgentStatsParams) error { + _, err := q.db.ExecContext(ctx, insertWorkspaceAgentStats, + pq.Array(arg.ID), + pq.Array(arg.CreatedAt), + pq.Array(arg.UserID), + pq.Array(arg.WorkspaceID), + pq.Array(arg.TemplateID), + pq.Array(arg.AgentID), + pq.Array(arg.ConnectionsByProto), + pq.Array(arg.ConnectionCount), + pq.Array(arg.RxPackets), + pq.Array(arg.RxBytes), + pq.Array(arg.TxPackets), + pq.Array(arg.TxBytes), + pq.Array(arg.SessionCountVSCode), + pq.Array(arg.SessionCountJetBrains), + pq.Array(arg.SessionCountReconnectingPTY), + pq.Array(arg.SessionCountSSH), + pq.Array(arg.ConnectionMedianLatencyMS), + ) + return err +} + const getWorkspaceAppByAgentIDAndSlug = `-- name: GetWorkspaceAppByAgentIDAndSlug :one SELECT id, created_at, agent_id, display_name, icon, command, url, healthcheck_url, healthcheck_interval, healthcheck_threshold, health, subdomain, sharing_level, slug, external FROM workspace_apps WHERE agent_id = $1 AND slug = $2 ` diff --git a/coderd/database/queries/workspaceagentstats.sql b/coderd/database/queries/workspaceagentstats.sql index 1a598bd6a6263..2c2e68c7b1bd6 100644 --- a/coderd/database/queries/workspaceagentstats.sql +++ b/coderd/database/queries/workspaceagentstats.sql @@ -22,6 +22,28 @@ INSERT INTO VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17) RETURNING *; +-- name: InsertWorkspaceAgentStats :exec +INSERT INTO + workspace_agent_stats +SELECT + unnest(@id :: uuid[]) AS id, + unnest(@created_at :: timestamptz[]) AS created_at, + unnest(@user_id :: uuid[]) AS user_id, + unnest(@workspace_id :: uuid[]) AS workspace_id, + unnest(@template_id :: uuid[]) AS template_id, + unnest(@agent_id :: uuid[]) AS agent_id, + unnest(@connections_by_proto :: jsonb[]) AS connections_by_proto, + unnest(@connection_count :: bigint[]) AS connection_count, + unnest(@rx_packets :: bigint[]) AS rx_packets, + unnest(@rx_bytes :: bigint[]) AS rx_bytes, + unnest(@tx_packets :: bigint[]) AS tx_packets, + unnest(@tx_bytes :: bigint[]) AS tx_bytes, + unnest(@session_count_vscode :: bigint[]) AS session_count_vscode, + unnest(@session_count_jetbrains :: bigint[]) AS session_count_jetbrains, + unnest(@session_count_reconnecting_pty :: bigint[]) AS session_count_reconnecting_pty, + unnest(@session_count_ssh :: bigint[]) AS session_count_ssh, + unnest(@connection_median_latency_ms :: double precision[]) AS connection_median_latency_ms; + -- name: GetTemplateDAUs :many SELECT (created_at at TIME ZONE cast(@tz_offset::integer as text))::date as date, From d15a0df4cd1ab58a3d35d767e337425762082ed6 Mon Sep 17 00:00:00 2001 From: Cian Johnston Date: Wed, 2 Aug 2023 12:40:12 +0100 Subject: [PATCH 10/24] remove unnecessary type --- coderd/batchstats/batcher.go | 160 +++++++++++++---------------------- 1 file changed, 58 insertions(+), 102 deletions(-) diff --git a/coderd/batchstats/batcher.go b/coderd/batchstats/batcher.go index 0a428eabb91a2..13afbc89287a8 100644 --- a/coderd/batchstats/batcher.go +++ b/coderd/batchstats/batcher.go @@ -23,32 +23,6 @@ const ( DefaultBatchSize = 1024 ) -// batchInsertParams is a struct used to batch-insert stats. -// It looks weird because of how we insert the data into the database -// (using unnest). -// Ideally we would just use COPY FROM but that isn't supported by -// sqlc using lib/pq. At a later date we should consider switching to -// pgx. See: https://docs.sqlc.dev/en/stable/guides/using-go-and-pgx.html -type batchInsertParams struct { - IDs []uuid.UUID - CreatedAts []time.Time - UserIDs []uuid.UUID - WorkspaceIDs []uuid.UUID - TemplateIDs []uuid.UUID - AgentIDs []uuid.UUID - ConnectionsByProtos []json.RawMessage - ConnectionCounts []int64 - RxPacketses []int64 - RxByteses []int64 - TxPacketses []int64 - TxByteses []int64 - SessionCountVSCodes []int64 - SessionCountJetBrainses []int64 - SessionCountReconnectingPTYs []int64 - SessionCountSSHs []int64 - ConnectionMedianLatencyMSes []float64 -} - // Batcher holds a buffer of agent stats and periodically flushes them to // its configured store. It also updates the workspace's last used time. type Batcher struct { @@ -56,7 +30,7 @@ type Batcher struct { log slog.Logger mu sync.RWMutex - buf batchInsertParams + buf database.InsertWorkspaceAgentStatsParams batchSize int // ticker is used to periodically flush the buffer. @@ -128,24 +102,24 @@ func New(opts ...Option) (*Batcher, error) { b.batchSize = DefaultBatchSize } - b.buf = batchInsertParams{ - IDs: make([]uuid.UUID, 0, b.batchSize), - CreatedAts: make([]time.Time, 0, b.batchSize), - UserIDs: make([]uuid.UUID, 0, b.batchSize), - WorkspaceIDs: make([]uuid.UUID, 0, b.batchSize), - TemplateIDs: make([]uuid.UUID, 0, b.batchSize), - AgentIDs: make([]uuid.UUID, 0, b.batchSize), - ConnectionsByProtos: make([]json.RawMessage, 0, b.batchSize), - ConnectionCounts: make([]int64, 0, b.batchSize), - RxPacketses: make([]int64, 0, b.batchSize), - RxByteses: make([]int64, 0, b.batchSize), - TxPacketses: make([]int64, 0, b.batchSize), - TxByteses: make([]int64, 0, b.batchSize), - SessionCountVSCodes: make([]int64, 0, b.batchSize), - SessionCountJetBrainses: make([]int64, 0, b.batchSize), - SessionCountReconnectingPTYs: make([]int64, 0, b.batchSize), - SessionCountSSHs: make([]int64, 0, b.batchSize), - ConnectionMedianLatencyMSes: make([]float64, 0, b.batchSize), + b.buf = database.InsertWorkspaceAgentStatsParams{ + ID: make([]uuid.UUID, 0, b.batchSize), + CreatedAt: make([]time.Time, 0, b.batchSize), + UserID: make([]uuid.UUID, 0, b.batchSize), + WorkspaceID: make([]uuid.UUID, 0, b.batchSize), + TemplateID: make([]uuid.UUID, 0, b.batchSize), + AgentID: make([]uuid.UUID, 0, b.batchSize), + ConnectionsByProto: make([]json.RawMessage, 0, b.batchSize), + ConnectionCount: make([]int64, 0, b.batchSize), + RxPackets: make([]int64, 0, b.batchSize), + RxBytes: make([]int64, 0, b.batchSize), + TxPackets: make([]int64, 0, b.batchSize), + TxBytes: make([]int64, 0, b.batchSize), + SessionCountVSCode: make([]int64, 0, b.batchSize), + SessionCountJetBrains: make([]int64, 0, b.batchSize), + SessionCountReconnectingPTY: make([]int64, 0, b.batchSize), + SessionCountSSH: make([]int64, 0, b.batchSize), + ConnectionMedianLatencyMS: make([]float64, 0, b.batchSize), } return b, nil @@ -177,26 +151,26 @@ func (b *Batcher) Add( payload = json.RawMessage("{}") } - b.buf.IDs = append(b.buf.IDs, uuid.New()) - b.buf.AgentIDs = append(b.buf.AgentIDs, agentID) - b.buf.CreatedAts = append(b.buf.CreatedAts, now) - b.buf.UserIDs = append(b.buf.UserIDs, ws.OwnerID) - b.buf.WorkspaceIDs = append(b.buf.WorkspaceIDs, ws.ID) - b.buf.TemplateIDs = append(b.buf.TemplateIDs, ws.TemplateID) - b.buf.ConnectionsByProtos = append(b.buf.ConnectionsByProtos, payload) - b.buf.ConnectionCounts = append(b.buf.ConnectionCounts, st.ConnectionCount) - b.buf.RxPacketses = append(b.buf.RxPacketses, st.RxPackets) - b.buf.RxByteses = append(b.buf.RxByteses, st.RxBytes) - b.buf.TxPacketses = append(b.buf.TxPacketses, st.TxPackets) - b.buf.TxByteses = append(b.buf.TxByteses, st.TxBytes) - b.buf.SessionCountVSCodes = append(b.buf.SessionCountVSCodes, st.SessionCountVSCode) - b.buf.SessionCountJetBrainses = append(b.buf.SessionCountJetBrainses, st.SessionCountJetBrains) - b.buf.SessionCountReconnectingPTYs = append(b.buf.SessionCountReconnectingPTYs, st.SessionCountReconnectingPTY) - b.buf.SessionCountSSHs = append(b.buf.SessionCountSSHs, st.SessionCountSSH) - b.buf.ConnectionMedianLatencyMSes = append(b.buf.ConnectionMedianLatencyMSes, st.ConnectionMedianLatencyMS) + b.buf.ID = append(b.buf.ID, uuid.New()) + b.buf.AgentID = append(b.buf.AgentID, agentID) + b.buf.CreatedAt = append(b.buf.CreatedAt, now) + b.buf.UserID = append(b.buf.UserID, ws.OwnerID) + b.buf.WorkspaceID = append(b.buf.WorkspaceID, ws.ID) + b.buf.TemplateID = append(b.buf.TemplateID, ws.TemplateID) + b.buf.ConnectionsByProto = append(b.buf.ConnectionsByProto, payload) + b.buf.ConnectionCount = append(b.buf.ConnectionCount, st.ConnectionCount) + b.buf.RxPackets = append(b.buf.RxPackets, st.RxPackets) + b.buf.RxBytes = append(b.buf.RxBytes, st.RxBytes) + b.buf.TxPackets = append(b.buf.TxPackets, st.TxPackets) + b.buf.TxBytes = append(b.buf.TxBytes, st.TxBytes) + b.buf.SessionCountVSCode = append(b.buf.SessionCountVSCode, st.SessionCountVSCode) + b.buf.SessionCountJetBrains = append(b.buf.SessionCountJetBrains, st.SessionCountJetBrains) + b.buf.SessionCountReconnectingPTY = append(b.buf.SessionCountReconnectingPTY, st.SessionCountReconnectingPTY) + b.buf.SessionCountSSH = append(b.buf.SessionCountSSH, st.SessionCountSSH) + b.buf.ConnectionMedianLatencyMS = append(b.buf.ConnectionMedianLatencyMS, st.ConnectionMedianLatencyMS) // If the buffer is full, signal the flusher to flush immediately. - if len(b.buf.IDs) == cap(b.buf.IDs) { + if len(b.buf.ID) == cap(b.buf.ID) { b.flushLever <- struct{}{} } return nil @@ -228,56 +202,38 @@ func (b *Batcher) flush(ctx context.Context, forced bool, reason string) { defer func() { // Notify that a flush has completed. if b.flushed != nil { - b.flushed <- forced b.log.Debug(ctx, "notify flush") + b.flushed <- forced } }() - b.log.Debug(ctx, "flushing buffer", slog.F("count", len(b.buf.IDs)), slog.F("forced", forced)) - if len(b.buf.IDs) == 0 { + if len(b.buf.ID) == 0 { b.log.Debug(ctx, "nothing to flush") return } + b.log.Debug(ctx, "flushing buffer", slog.F("count", len(b.buf.ID)), slog.F("forced", forced)) - if err := b.store.InsertWorkspaceAgentStats(ctx, database.InsertWorkspaceAgentStatsParams{ - ID: b.buf.IDs, - CreatedAt: b.buf.CreatedAts, - UserID: b.buf.UserIDs, - WorkspaceID: b.buf.WorkspaceIDs, - TemplateID: b.buf.TemplateIDs, - AgentID: b.buf.AgentIDs, - ConnectionsByProto: b.buf.ConnectionsByProtos, - ConnectionCount: b.buf.ConnectionCounts, - RxPackets: b.buf.RxPacketses, - RxBytes: b.buf.RxByteses, - TxPackets: b.buf.TxPacketses, - TxBytes: b.buf.TxByteses, - SessionCountVSCode: b.buf.SessionCountVSCodes, - SessionCountJetBrains: b.buf.SessionCountJetBrainses, - SessionCountReconnectingPTY: b.buf.SessionCountReconnectingPTYs, - SessionCountSSH: b.buf.SessionCountSSHs, - ConnectionMedianLatencyMS: b.buf.ConnectionMedianLatencyMSes, - }); err != nil { + if err := b.store.InsertWorkspaceAgentStats(ctx, b.buf); err != nil { b.log.Error(ctx, "insert workspace agent stats", slog.Error(err)) } // Reset the buffer. // b.buf = b.buf[:0] - b.buf.IDs = b.buf.IDs[:0] - b.buf.CreatedAts = b.buf.CreatedAts[:0] - b.buf.UserIDs = b.buf.UserIDs[:0] - b.buf.WorkspaceIDs = b.buf.WorkspaceIDs[:0] - b.buf.TemplateIDs = b.buf.TemplateIDs[:0] - b.buf.AgentIDs = b.buf.AgentIDs[:0] - b.buf.ConnectionsByProtos = b.buf.ConnectionsByProtos[:0] - b.buf.ConnectionCounts = b.buf.ConnectionCounts[:0] - b.buf.RxPacketses = b.buf.RxPacketses[:0] - b.buf.RxByteses = b.buf.RxByteses[:0] - b.buf.TxPacketses = b.buf.TxPacketses[:0] - b.buf.TxByteses = b.buf.TxByteses[:0] - b.buf.SessionCountVSCodes = b.buf.SessionCountVSCodes[:0] - b.buf.SessionCountJetBrainses = b.buf.SessionCountJetBrainses[:0] - b.buf.SessionCountReconnectingPTYs = b.buf.SessionCountReconnectingPTYs[:0] - b.buf.SessionCountSSHs = b.buf.SessionCountSSHs[:0] - b.buf.ConnectionMedianLatencyMSes = b.buf.ConnectionMedianLatencyMSes[:0] + b.buf.ID = b.buf.ID[:0] + b.buf.CreatedAt = b.buf.CreatedAt[:0] + b.buf.UserID = b.buf.UserID[:0] + b.buf.WorkspaceID = b.buf.WorkspaceID[:0] + b.buf.TemplateID = b.buf.TemplateID[:0] + b.buf.AgentID = b.buf.AgentID[:0] + b.buf.ConnectionsByProto = b.buf.ConnectionsByProto[:0] + b.buf.ConnectionCount = b.buf.ConnectionCount[:0] + b.buf.RxPackets = b.buf.RxPackets[:0] + b.buf.RxBytes = b.buf.RxBytes[:0] + b.buf.TxPackets = b.buf.TxPackets[:0] + b.buf.TxBytes = b.buf.TxBytes[:0] + b.buf.SessionCountVSCode = b.buf.SessionCountVSCode[:0] + b.buf.SessionCountJetBrains = b.buf.SessionCountJetBrains[:0] + b.buf.SessionCountReconnectingPTY = b.buf.SessionCountReconnectingPTY[:0] + b.buf.SessionCountSSH = b.buf.SessionCountSSH[:0] + b.buf.ConnectionMedianLatencyMS = b.buf.ConnectionMedianLatencyMS[:0] } From f13300cb277725d2dba7eab092dec5e46b4cc2dc Mon Sep 17 00:00:00 2001 From: Cian Johnston Date: Wed, 2 Aug 2023 14:30:15 +0100 Subject: [PATCH 11/24] GREEN: got query working --- coderd/batchstats/batcher.go | 58 ++++++++++++------- coderd/batchstats/batcher_test.go | 11 ++++ coderd/database/queries.sql.go | 38 ++++++------ .../database/queries/workspaceagentstats.sql | 2 +- 4 files changed, 69 insertions(+), 40 deletions(-) diff --git a/coderd/batchstats/batcher.go b/coderd/batchstats/batcher.go index 13afbc89287a8..72ce10020225d 100644 --- a/coderd/batchstats/batcher.go +++ b/coderd/batchstats/batcher.go @@ -29,9 +29,12 @@ type Batcher struct { store database.Store log slog.Logger - mu sync.RWMutex - buf database.InsertWorkspaceAgentStatsParams - batchSize int + mu sync.RWMutex + buf database.InsertWorkspaceAgentStatsParams + // NOTE: we batch this separately as it's a jsonb field and + // pq.Array + unnest doesn't play nicely with this. + connectionsByProto []map[string]int64 + batchSize int // ticker is used to periodically flush the buffer. ticker <-chan time.Time @@ -102,6 +105,7 @@ func New(opts ...Option) (*Batcher, error) { b.batchSize = DefaultBatchSize } + b.connectionsByProto = make([]map[string]int64, 0, b.batchSize) b.buf = database.InsertWorkspaceAgentStatsParams{ ID: make([]uuid.UUID, 0, b.batchSize), CreatedAt: make([]time.Time, 0, b.batchSize), @@ -109,7 +113,7 @@ func New(opts ...Option) (*Batcher, error) { WorkspaceID: make([]uuid.UUID, 0, b.batchSize), TemplateID: make([]uuid.UUID, 0, b.batchSize), AgentID: make([]uuid.UUID, 0, b.batchSize), - ConnectionsByProto: make([]json.RawMessage, 0, b.batchSize), + ConnectionsByProto: json.RawMessage("[]"), ConnectionCount: make([]int64, 0, b.batchSize), RxPackets: make([]int64, 0, b.batchSize), RxBytes: make([]int64, 0, b.batchSize), @@ -142,14 +146,8 @@ func (b *Batcher) Add( if err != nil { return xerrors.Errorf("get workspace by agent id: %w", err) } - payload, err := json.Marshal(st.ConnectionsByProto) - if err != nil { - b.log.Error(ctx, "marshal agent connections by proto", - slog.F("workspace_agent_id", agentID), - slog.Error(err), - ) - payload = json.RawMessage("{}") - } + + b.connectionsByProto = append(b.connectionsByProto, st.ConnectionsByProto) b.buf.ID = append(b.buf.ID, uuid.New()) b.buf.AgentID = append(b.buf.AgentID, agentID) @@ -157,7 +155,8 @@ func (b *Batcher) Add( b.buf.UserID = append(b.buf.UserID, ws.OwnerID) b.buf.WorkspaceID = append(b.buf.WorkspaceID, ws.ID) b.buf.TemplateID = append(b.buf.TemplateID, ws.TemplateID) - b.buf.ConnectionsByProto = append(b.buf.ConnectionsByProto, payload) + // We explicitly do *not* do this. + // b.buf.ConnectionsByProto = append(b.buf.ConnectionsByProto, st.ConnectionsByProto) b.buf.ConnectionCount = append(b.buf.ConnectionCount, st.ConnectionCount) b.buf.RxPackets = append(b.buf.RxPackets, st.RxPackets) b.buf.RxBytes = append(b.buf.RxBytes, st.RxBytes) @@ -196,36 +195,55 @@ func (b *Batcher) Run(ctx context.Context) { // flush flushes the batcher's buffer. func (b *Batcher) flush(ctx context.Context, forced bool, reason string) { - // TODO(Cian): After flushing, should we somehow reset the ticker? b.mu.Lock() - defer b.mu.Unlock() + start := time.Now() defer func() { + b.mu.Unlock() // Notify that a flush has completed. if b.flushed != nil { b.log.Debug(ctx, "notify flush") b.flushed <- forced } + elapsed := time.Since(start) + b.log.Debug(ctx, "flush complete", + slog.F("count", len(b.buf.ID)), + slog.F("elapsed", elapsed), + slog.F("reason", reason), + ) }() if len(b.buf.ID) == 0 { b.log.Debug(ctx, "nothing to flush") return } - b.log.Debug(ctx, "flushing buffer", slog.F("count", len(b.buf.ID)), slog.F("forced", forced)) + b.log.Debug(ctx, "flushing buffer", slog.F("count", len(b.buf.ID)), slog.F("forced", forced), slog.F("reason", reason)) - if err := b.store.InsertWorkspaceAgentStats(ctx, b.buf); err != nil { - b.log.Error(ctx, "insert workspace agent stats", slog.Error(err)) + // marshal connections by proto + payload, err := json.Marshal(b.connectionsByProto) + if err != nil { + b.log.Error(ctx, "unable to marshal agent connections by proto, dropping data", slog.Error(err)) + b.buf.ConnectionsByProto = json.RawMessage(`[]`) + } else { + b.buf.ConnectionsByProto = payload + } + + err = b.store.InsertWorkspaceAgentStats(ctx, b.buf) + elapsed := time.Since(start) + if err != nil { + b.log.Error(ctx, "error inserting workspace agent stats", slog.Error(err), slog.F("elapsed", elapsed)) + return } // Reset the buffer. - // b.buf = b.buf[:0] + b.connectionsByProto = b.connectionsByProto[:0] + b.buf.ID = b.buf.ID[:0] b.buf.CreatedAt = b.buf.CreatedAt[:0] b.buf.UserID = b.buf.UserID[:0] b.buf.WorkspaceID = b.buf.WorkspaceID[:0] b.buf.TemplateID = b.buf.TemplateID[:0] b.buf.AgentID = b.buf.AgentID[:0] - b.buf.ConnectionsByProto = b.buf.ConnectionsByProto[:0] + b.buf.ConnectionsByProto = json.RawMessage(`[]`) b.buf.ConnectionCount = b.buf.ConnectionCount[:0] b.buf.RxPackets = b.buf.RxPackets[:0] b.buf.RxBytes = b.buf.RxBytes[:0] diff --git a/coderd/batchstats/batcher_test.go b/coderd/batchstats/batcher_test.go index a7920d01f6d19..1ae7d75c27b60 100644 --- a/coderd/batchstats/batcher_test.go +++ b/coderd/batchstats/batcher_test.go @@ -107,6 +107,17 @@ func TestBatchStats(t *testing.T) { stats, err = store.GetWorkspaceAgentStats(ctx, t3) require.NoError(t, err, "should not error getting stats") require.Len(t, stats, 2, "should have stats for both workspaces") + + // Ensure that a subsequent flush does not push stale data. + t4 := time.Now() + tick <- t4 + f = <-flushed + require.False(t, f, "flush should not have been forced") + t.Logf("flush 4 completed") + + stats, err = store.GetWorkspaceAgentStats(ctx, t4) + require.NoError(t, err, "should not error getting stats") + require.Len(t, stats, 0, "should have no stats for workspace") } // randAgentSDKStats returns a random agentsdk.Stats diff --git a/coderd/database/queries.sql.go b/coderd/database/queries.sql.go index bf943513849af..84e25f2dfca8a 100644 --- a/coderd/database/queries.sql.go +++ b/coderd/database/queries.sql.go @@ -7255,7 +7255,7 @@ SELECT unnest($4 :: uuid[]) AS workspace_id, unnest($5 :: uuid[]) AS template_id, unnest($6 :: uuid[]) AS agent_id, - unnest($7 :: jsonb[]) AS connections_by_proto, + jsonb_array_elements($7 :: jsonb) AS connections_by_proto, unnest($8 :: bigint[]) AS connection_count, unnest($9 :: bigint[]) AS rx_packets, unnest($10 :: bigint[]) AS rx_bytes, @@ -7269,23 +7269,23 @@ SELECT ` type InsertWorkspaceAgentStatsParams struct { - ID []uuid.UUID `db:"id" json:"id"` - CreatedAt []time.Time `db:"created_at" json:"created_at"` - UserID []uuid.UUID `db:"user_id" json:"user_id"` - WorkspaceID []uuid.UUID `db:"workspace_id" json:"workspace_id"` - TemplateID []uuid.UUID `db:"template_id" json:"template_id"` - AgentID []uuid.UUID `db:"agent_id" json:"agent_id"` - ConnectionsByProto []json.RawMessage `db:"connections_by_proto" json:"connections_by_proto"` - ConnectionCount []int64 `db:"connection_count" json:"connection_count"` - RxPackets []int64 `db:"rx_packets" json:"rx_packets"` - RxBytes []int64 `db:"rx_bytes" json:"rx_bytes"` - TxPackets []int64 `db:"tx_packets" json:"tx_packets"` - TxBytes []int64 `db:"tx_bytes" json:"tx_bytes"` - SessionCountVSCode []int64 `db:"session_count_vscode" json:"session_count_vscode"` - SessionCountJetBrains []int64 `db:"session_count_jetbrains" json:"session_count_jetbrains"` - SessionCountReconnectingPTY []int64 `db:"session_count_reconnecting_pty" json:"session_count_reconnecting_pty"` - SessionCountSSH []int64 `db:"session_count_ssh" json:"session_count_ssh"` - ConnectionMedianLatencyMS []float64 `db:"connection_median_latency_ms" json:"connection_median_latency_ms"` + ID []uuid.UUID `db:"id" json:"id"` + CreatedAt []time.Time `db:"created_at" json:"created_at"` + UserID []uuid.UUID `db:"user_id" json:"user_id"` + WorkspaceID []uuid.UUID `db:"workspace_id" json:"workspace_id"` + TemplateID []uuid.UUID `db:"template_id" json:"template_id"` + AgentID []uuid.UUID `db:"agent_id" json:"agent_id"` + ConnectionsByProto json.RawMessage `db:"connections_by_proto" json:"connections_by_proto"` + ConnectionCount []int64 `db:"connection_count" json:"connection_count"` + RxPackets []int64 `db:"rx_packets" json:"rx_packets"` + RxBytes []int64 `db:"rx_bytes" json:"rx_bytes"` + TxPackets []int64 `db:"tx_packets" json:"tx_packets"` + TxBytes []int64 `db:"tx_bytes" json:"tx_bytes"` + SessionCountVSCode []int64 `db:"session_count_vscode" json:"session_count_vscode"` + SessionCountJetBrains []int64 `db:"session_count_jetbrains" json:"session_count_jetbrains"` + SessionCountReconnectingPTY []int64 `db:"session_count_reconnecting_pty" json:"session_count_reconnecting_pty"` + SessionCountSSH []int64 `db:"session_count_ssh" json:"session_count_ssh"` + ConnectionMedianLatencyMS []float64 `db:"connection_median_latency_ms" json:"connection_median_latency_ms"` } func (q *sqlQuerier) InsertWorkspaceAgentStats(ctx context.Context, arg InsertWorkspaceAgentStatsParams) error { @@ -7296,7 +7296,7 @@ func (q *sqlQuerier) InsertWorkspaceAgentStats(ctx context.Context, arg InsertWo pq.Array(arg.WorkspaceID), pq.Array(arg.TemplateID), pq.Array(arg.AgentID), - pq.Array(arg.ConnectionsByProto), + arg.ConnectionsByProto, pq.Array(arg.ConnectionCount), pq.Array(arg.RxPackets), pq.Array(arg.RxBytes), diff --git a/coderd/database/queries/workspaceagentstats.sql b/coderd/database/queries/workspaceagentstats.sql index 2c2e68c7b1bd6..89d98a420eae5 100644 --- a/coderd/database/queries/workspaceagentstats.sql +++ b/coderd/database/queries/workspaceagentstats.sql @@ -32,7 +32,7 @@ SELECT unnest(@workspace_id :: uuid[]) AS workspace_id, unnest(@template_id :: uuid[]) AS template_id, unnest(@agent_id :: uuid[]) AS agent_id, - unnest(@connections_by_proto :: jsonb[]) AS connections_by_proto, + jsonb_array_elements(@connections_by_proto :: jsonb) AS connections_by_proto, unnest(@connection_count :: bigint[]) AS connection_count, unnest(@rx_packets :: bigint[]) AS rx_packets, unnest(@rx_bytes :: bigint[]) AS rx_bytes, From 0b1b0accb7cf7526eeb7bfccdc480d55c6905ee3 Mon Sep 17 00:00:00 2001 From: Cian Johnston Date: Wed, 2 Aug 2023 17:06:13 +0100 Subject: [PATCH 12/24] implement dbfake / dbauthz, adjust external API --- coderd/batchstats/batcher.go | 28 +++++++++++------------ coderd/batchstats/batcher_test.go | 6 ++--- coderd/database/dbauthz/dbauthz.go | 6 ++++- coderd/database/dbfake/dbfake.go | 36 +++++++++++++++++++++++++++++- 4 files changed, 56 insertions(+), 20 deletions(-) diff --git a/coderd/batchstats/batcher.go b/coderd/batchstats/batcher.go index 72ce10020225d..c9b5daabda694 100644 --- a/coderd/batchstats/batcher.go +++ b/coderd/batchstats/batcher.go @@ -131,32 +131,28 @@ func New(opts ...Option) (*Batcher, error) { // Add adds a stat to the batcher for the given workspace and agent. func (b *Batcher) Add( - ctx context.Context, agentID uuid.UUID, + templateID uuid.UUID, + userID uuid.UUID, + workspaceID uuid.UUID, st agentsdk.Stats, ) error { b.mu.Lock() defer b.mu.Unlock() - // TODO(Cian): add a specific dbauthz context for this. - authCtx := dbauthz.AsSystemRestricted(ctx) now := database.Now() - // TODO(Cian): cache agentID -> workspaceID? - ws, err := b.store.GetWorkspaceByAgentID(authCtx, agentID) - if err != nil { - return xerrors.Errorf("get workspace by agent id: %w", err) - } - - b.connectionsByProto = append(b.connectionsByProto, st.ConnectionsByProto) b.buf.ID = append(b.buf.ID, uuid.New()) - b.buf.AgentID = append(b.buf.AgentID, agentID) b.buf.CreatedAt = append(b.buf.CreatedAt, now) - b.buf.UserID = append(b.buf.UserID, ws.OwnerID) - b.buf.WorkspaceID = append(b.buf.WorkspaceID, ws.ID) - b.buf.TemplateID = append(b.buf.TemplateID, ws.TemplateID) - // We explicitly do *not* do this. + b.buf.AgentID = append(b.buf.AgentID, agentID) + b.buf.UserID = append(b.buf.UserID, userID) + b.buf.TemplateID = append(b.buf.TemplateID, templateID) + b.buf.WorkspaceID = append(b.buf.WorkspaceID, workspaceID) + + // Store the connections by proto separately as it's a jsonb field. We marshal on flush. // b.buf.ConnectionsByProto = append(b.buf.ConnectionsByProto, st.ConnectionsByProto) + b.connectionsByProto = append(b.connectionsByProto, st.ConnectionsByProto) + b.buf.ConnectionCount = append(b.buf.ConnectionCount, st.ConnectionCount) b.buf.RxPackets = append(b.buf.RxPackets, st.RxPackets) b.buf.RxBytes = append(b.buf.RxBytes, st.RxBytes) @@ -177,6 +173,7 @@ func (b *Batcher) Add( // Run runs the batcher. func (b *Batcher) Run(ctx context.Context) { + // nolint:gocritic authCtx := dbauthz.AsSystemRestricted(ctx) for { select { @@ -208,6 +205,7 @@ func (b *Batcher) flush(ctx context.Context, forced bool, reason string) { b.log.Debug(ctx, "flush complete", slog.F("count", len(b.buf.ID)), slog.F("elapsed", elapsed), + slog.F("forced", forced), slog.F("reason", reason), ) }() diff --git a/coderd/batchstats/batcher_test.go b/coderd/batchstats/batcher_test.go index 1ae7d75c27b60..6816cdb1bbf9c 100644 --- a/coderd/batchstats/batcher_test.go +++ b/coderd/batchstats/batcher_test.go @@ -71,7 +71,7 @@ func TestBatchStats(t *testing.T) { // Given: a single data point is added for workspace t2 := time.Now() t.Logf("inserting 1 stat") - require.NoError(t, b.Add(ctx, deps1.Agent.ID, randAgentSDKStats(t))) + require.NoError(t, b.Add(deps1.Agent.ID, deps1.User.ID, deps1.Template.ID, deps1.Workspace.ID, randAgentSDKStats(t))) // When: it becomes time to report stats // Signal a tick and wait for a flush to complete. @@ -91,9 +91,9 @@ func TestBatchStats(t *testing.T) { t.Logf("inserting %d stats", batchSize) for i := 0; i < batchSize; i++ { if i%2 == 0 { - require.NoError(t, b.Add(ctx, deps1.Agent.ID, randAgentSDKStats(t))) + require.NoError(t, b.Add(deps1.Agent.ID, deps1.User.ID, deps1.Template.ID, deps1.Workspace.ID, randAgentSDKStats(t))) } else { - require.NoError(t, b.Add(ctx, deps2.Agent.ID, randAgentSDKStats(t))) + require.NoError(t, b.Add(deps2.Agent.ID, deps2.User.ID, deps2.Template.ID, deps2.Workspace.ID, randAgentSDKStats(t))) } } diff --git a/coderd/database/dbauthz/dbauthz.go b/coderd/database/dbauthz/dbauthz.go index 4d5a74afcc114..2ea82409be1ea 100644 --- a/coderd/database/dbauthz/dbauthz.go +++ b/coderd/database/dbauthz/dbauthz.go @@ -1998,7 +1998,11 @@ func (q *querier) InsertWorkspaceAgentStat(ctx context.Context, arg database.Ins } func (q *querier) InsertWorkspaceAgentStats(ctx context.Context, arg database.InsertWorkspaceAgentStatsParams) error { - panic("not implemented") + if err := q.authorizeContext(ctx, rbac.ActionCreate, rbac.ResourceSystem); err != nil { + return err + } + + return q.db.InsertWorkspaceAgentStats(ctx, arg) } func (q *querier) InsertWorkspaceApp(ctx context.Context, arg database.InsertWorkspaceAppParams) (database.WorkspaceApp, error) { diff --git a/coderd/database/dbfake/dbfake.go b/coderd/database/dbfake/dbfake.go index cb3f5ced88cf4..794443be617fd 100644 --- a/coderd/database/dbfake/dbfake.go +++ b/coderd/database/dbfake/dbfake.go @@ -4076,7 +4076,41 @@ func (q *FakeQuerier) InsertWorkspaceAgentStats(ctx context.Context, arg databas return err } - panic("not implemented") + q.mutex.Lock() + defer q.mutex.Unlock() + + var connectionsByProto []map[string]int64 + if err := json.Unmarshal(arg.ConnectionsByProto, &connectionsByProto); err != nil { + return err + } + for i := 0; i < len(arg.ID); i++ { + cbp, err := json.Marshal(connectionsByProto[i]) + if err != nil { + return xerrors.Errorf("failed to marshal connections_by_proto: %w", err) + } + stat := database.WorkspaceAgentStat{ + ID: arg.ID[i], + CreatedAt: arg.CreatedAt[i], + WorkspaceID: arg.WorkspaceID[i], + AgentID: arg.AgentID[i], + UserID: arg.UserID[i], + ConnectionsByProto: cbp, + ConnectionCount: arg.ConnectionCount[i], + RxPackets: arg.RxPackets[i], + RxBytes: arg.RxBytes[i], + TxPackets: arg.TxPackets[i], + TxBytes: arg.TxBytes[i], + TemplateID: arg.TemplateID[i], + SessionCountVSCode: arg.SessionCountVSCode[i], + SessionCountJetBrains: arg.SessionCountJetBrains[i], + SessionCountReconnectingPTY: arg.SessionCountReconnectingPTY[i], + SessionCountSSH: arg.SessionCountSSH[i], + ConnectionMedianLatencyMS: arg.ConnectionMedianLatencyMS[i], + } + q.workspaceAgentStats = append(q.workspaceAgentStats, stat) + } + + return nil } func (q *FakeQuerier) InsertWorkspaceApp(_ context.Context, arg database.InsertWorkspaceAppParams) (database.WorkspaceApp, error) { From d5acbdefc23e882149dfec2d426aebde570ed7ac Mon Sep 17 00:00:00 2001 From: Cian Johnston Date: Wed, 2 Aug 2023 18:03:40 +0100 Subject: [PATCH 13/24] plumb it all through --- cli/server.go | 16 ++++++++++++++++ coderd/coderd.go | 12 +++++++++++- coderd/coderdtest/coderdtest.go | 16 +++++++++++++++- coderd/workspaceagents.go | 29 +++-------------------------- 4 files changed, 45 insertions(+), 28 deletions(-) diff --git a/cli/server.go b/cli/server.go index c55b58831c0a5..372495b0f5bb0 100644 --- a/cli/server.go +++ b/cli/server.go @@ -63,6 +63,7 @@ import ( "github.com/coder/coder/cli/config" "github.com/coder/coder/coderd" "github.com/coder/coder/coderd/autobuild" + "github.com/coder/coder/coderd/batchstats" "github.com/coder/coder/coderd/database" "github.com/coder/coder/coderd/database/dbfake" "github.com/coder/coder/coderd/database/dbmetrics" @@ -812,6 +813,21 @@ func (r *RootCmd) Server(newAPI func(context.Context, *coderd.Options) (*coderd. options.SwaggerEndpoint = cfg.Swagger.Enable.Value() } + batchStatsTicker := time.NewTicker(29 * time.Second) // Hard-coding. + defer batchStatsTicker.Stop() + b, err := batchstats.New( + batchstats.WithLogger(options.Logger.Named("batchstats")), + batchstats.WithStore(options.Database), + batchstats.WithTicker(batchStatsTicker.C), + ) + if err != nil { + return xerrors.Errorf("failed to create agent stats batcher: %w", err) + } + options.StatsBatcher = b + go func() { + b.Run(ctx) + }() + // We use a separate coderAPICloser so the Enterprise API // can have it's own close functions. This is cleaner // than abstracting the Coder API itself. diff --git a/coderd/coderd.go b/coderd/coderd.go index 25b611a5ac4fb..6ff0f108602bf 100644 --- a/coderd/coderd.go +++ b/coderd/coderd.go @@ -43,6 +43,7 @@ import ( "github.com/coder/coder/buildinfo" "github.com/coder/coder/coderd/audit" "github.com/coder/coder/coderd/awsidentity" + "github.com/coder/coder/coderd/batchstats" "github.com/coder/coder/coderd/database" "github.com/coder/coder/coderd/database/dbauthz" "github.com/coder/coder/coderd/database/pubsub" @@ -161,6 +162,7 @@ type Options struct { // TODO(Cian): This may need to be passed to batchstats.Batcher instead UpdateAgentMetrics func(ctx context.Context, username, workspaceName, agentName string, metrics []agentsdk.AgentMetric) + StatsBatcher *batchstats.Batcher } // @title Coder API @@ -288,6 +290,11 @@ func New(options *Options) *API { v := schedule.NewAGPLUserQuietHoursScheduleStore() options.UserQuietHoursScheduleStore.Store(&v) } + ctx, cancel := context.WithCancel(context.Background()) + + if options.StatsBatcher == nil { + panic("developer error: options.Batcher is nil") + } siteCacheDir := options.CacheDir if siteCacheDir != "" { @@ -322,7 +329,6 @@ func New(options *Options) *API { }) staticHandler.Experiments.Store(&experiments) - ctx, cancel := context.WithCancel(context.Background()) r := chi.NewRouter() // nolint:gocritic // Load deployment ID. This never changes @@ -463,6 +469,8 @@ func New(options *Options) *API { cors := httpmw.Cors(options.DeploymentValues.Dangerous.AllowAllCors.Value()) prometheusMW := httpmw.Prometheus(options.PrometheusRegistry) + api.statsBatcher = options.StatsBatcher + r.Use( httpmw.Recover(api.Logger), tracing.StatusWriterMiddleware, @@ -995,6 +1003,8 @@ type API struct { healthCheckGroup *singleflight.Group[string, *healthcheck.Report] healthCheckCache atomic.Pointer[healthcheck.Report] + + statsBatcher *batchstats.Batcher } // Close waits for all WebSocket connections to drain before returning. diff --git a/coderd/coderdtest/coderdtest.go b/coderd/coderdtest/coderdtest.go index ada8782514381..1ed56e9cd1125 100644 --- a/coderd/coderdtest/coderdtest.go +++ b/coderd/coderdtest/coderdtest.go @@ -56,6 +56,7 @@ import ( "github.com/coder/coder/coderd/audit" "github.com/coder/coder/coderd/autobuild" "github.com/coder/coder/coderd/awsidentity" + "github.com/coder/coder/coderd/batchstats" "github.com/coder/coder/coderd/database" "github.com/coder/coder/coderd/database/dbauthz" "github.com/coder/coder/coderd/database/dbtestutil" @@ -140,7 +141,8 @@ type Options struct { SwaggerEndpoint bool // Logger should only be overridden if you expect errors // as part of your test. - Logger *slog.Logger + Logger *slog.Logger + StatsBatcher *batchstats.Batcher } // New constructs a codersdk client connected to an in-memory API instance. @@ -241,6 +243,17 @@ func NewOptions(t testing.TB, options *Options) (func(http.Handler), context.Can if options.FilesRateLimit == 0 { options.FilesRateLimit = -1 } + if options.StatsBatcher == nil { + batchStatsTicker := time.NewTicker(testutil.IntervalFast) + t.Cleanup(batchStatsTicker.Stop) + options.StatsBatcher, err = batchstats.New( + batchstats.WithStore(options.Database), + batchstats.WithBatchSize(batchstats.DefaultBatchSize), + batchstats.WithLogger(slogtest.Make(t, nil).Leveled(slog.LevelDebug)), + batchstats.WithTicker(batchStatsTicker.C), + ) + require.NoError(t, err, "create stats batcher") + } var templateScheduleStore atomic.Pointer[schedule.TemplateScheduleStore] if options.TemplateScheduleStore == nil { @@ -409,6 +422,7 @@ func NewOptions(t testing.TB, options *Options) (func(http.Handler), context.Can HealthcheckFunc: options.HealthcheckFunc, HealthcheckTimeout: options.HealthcheckTimeout, HealthcheckRefresh: options.HealthcheckRefresh, + StatsBatcher: options.StatsBatcher, } } diff --git a/coderd/workspaceagents.go b/coderd/workspaceagents.go index 3a972e0e90a0b..361e5b68ac05d 100644 --- a/coderd/workspaceagents.go +++ b/coderd/workspaceagents.go @@ -1412,36 +1412,12 @@ func (api *API) workspaceAgentReportStats(rw http.ResponseWriter, r *http.Reques activityBumpWorkspace(ctx, api.Logger.Named("activity_bump"), api.Database, workspace.ID) } - payload, err := json.Marshal(req.ConnectionsByProto) - if err != nil { - api.Logger.Error(ctx, "marshal agent connections by proto", slog.F("workspace_agent_id", workspaceAgent.ID), slog.Error(err)) - payload = json.RawMessage("{}") - } - now := database.Now() var errGroup errgroup.Group errGroup.Go(func() error { - _, err = api.Database.InsertWorkspaceAgentStat(ctx, database.InsertWorkspaceAgentStatParams{ - ID: uuid.New(), - CreatedAt: now, - AgentID: workspaceAgent.ID, - WorkspaceID: workspace.ID, - UserID: workspace.OwnerID, - TemplateID: workspace.TemplateID, - ConnectionsByProto: payload, - ConnectionCount: req.ConnectionCount, - RxPackets: req.RxPackets, - RxBytes: req.RxBytes, - TxPackets: req.TxPackets, - TxBytes: req.TxBytes, - SessionCountVSCode: req.SessionCountVSCode, - SessionCountJetBrains: req.SessionCountJetBrains, - SessionCountReconnectingPTY: req.SessionCountReconnectingPTY, - SessionCountSSH: req.SessionCountSSH, - ConnectionMedianLatencyMS: req.ConnectionMedianLatencyMS, - }) - if err != nil { + if err := api.statsBatcher.Add(workspaceAgent.ID, workspace.TemplateID, workspace.OwnerID, workspace.ID, req); err != nil { + api.Logger.Error(ctx, "failed to add stats to batcher", slog.Error(err)) return xerrors.Errorf("can't insert workspace agent stat: %w", err) } return nil @@ -1464,6 +1440,7 @@ func (api *API) workspaceAgentReportStats(rw http.ResponseWriter, r *http.Reques return xerrors.Errorf("can't get user: %w", err) } + api.Logger.Debug(ctx, "updating agent metrics", slog.F("username", user.Username), slog.F("workspace", workspace.Name), slog.F("agent", workspaceAgent.Name)) api.Options.UpdateAgentMetrics(ctx, user.Username, workspace.Name, workspaceAgent.Name, req.Metrics) return nil }) From 6c8de19d388fbdead88d0bcc423d07046f8f88b0 Mon Sep 17 00:00:00 2001 From: Cian Johnston Date: Wed, 2 Aug 2023 20:33:54 +0100 Subject: [PATCH 14/24] fixup! plumb it all through --- coderd/coderdtest/coderdtest.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/coderd/coderdtest/coderdtest.go b/coderd/coderdtest/coderdtest.go index 1ed56e9cd1125..5a4a7712f11d8 100644 --- a/coderd/coderdtest/coderdtest.go +++ b/coderd/coderdtest/coderdtest.go @@ -253,6 +253,9 @@ func NewOptions(t testing.TB, options *Options) (func(http.Handler), context.Can batchstats.WithTicker(batchStatsTicker.C), ) require.NoError(t, err, "create stats batcher") + ctx, cancel := context.WithCancel(context.Background()) + t.Cleanup(cancel) + go options.StatsBatcher.Run(ctx) } var templateScheduleStore atomic.Pointer[schedule.TemplateScheduleStore] From c93b8610f537fd005fd4f4f0bf168506d7f79a9e Mon Sep 17 00:00:00 2001 From: Cian Johnston Date: Wed, 2 Aug 2023 21:15:18 +0100 Subject: [PATCH 15/24] reduce test log output --- coderd/batchstats/batcher.go | 20 ++++++++++---------- coderd/coderdtest/coderdtest.go | 9 ++++++--- 2 files changed, 16 insertions(+), 13 deletions(-) diff --git a/coderd/batchstats/batcher.go b/coderd/batchstats/batcher.go index c9b5daabda694..2fde397d08bc2 100644 --- a/coderd/batchstats/batcher.go +++ b/coderd/batchstats/batcher.go @@ -194,27 +194,27 @@ func (b *Batcher) Run(ctx context.Context) { func (b *Batcher) flush(ctx context.Context, forced bool, reason string) { b.mu.Lock() start := time.Now() + count := len(b.buf.ID) defer func() { b.mu.Unlock() // Notify that a flush has completed. if b.flushed != nil { - b.log.Debug(ctx, "notify flush") b.flushed <- forced } - elapsed := time.Since(start) - b.log.Debug(ctx, "flush complete", - slog.F("count", len(b.buf.ID)), - slog.F("elapsed", elapsed), - slog.F("forced", forced), - slog.F("reason", reason), - ) + if count > 0 { + elapsed := time.Since(start) + b.log.Debug(ctx, "flush complete", + slog.F("count", count), + slog.F("elapsed", elapsed), + slog.F("forced", forced), + slog.F("reason", reason), + ) + } }() if len(b.buf.ID) == 0 { - b.log.Debug(ctx, "nothing to flush") return } - b.log.Debug(ctx, "flushing buffer", slog.F("count", len(b.buf.ID)), slog.F("forced", forced), slog.F("reason", reason)) // marshal connections by proto payload, err := json.Marshal(b.connectionsByProto) diff --git a/coderd/coderdtest/coderdtest.go b/coderd/coderdtest/coderdtest.go index 5a4a7712f11d8..c85859f1c731b 100644 --- a/coderd/coderdtest/coderdtest.go +++ b/coderd/coderdtest/coderdtest.go @@ -51,6 +51,7 @@ import ( "tailscale.com/types/nettype" "cdr.dev/slog" + "cdr.dev/slog/sloggers/sloghuman" "cdr.dev/slog/sloggers/slogtest" "github.com/coder/coder/coderd" "github.com/coder/coder/coderd/audit" @@ -244,12 +245,14 @@ func NewOptions(t testing.TB, options *Options) (func(http.Handler), context.Can options.FilesRateLimit = -1 } if options.StatsBatcher == nil { - batchStatsTicker := time.NewTicker(testutil.IntervalFast) + batchStatsTicker := time.NewTicker(time.Hour) t.Cleanup(batchStatsTicker.Stop) options.StatsBatcher, err = batchstats.New( batchstats.WithStore(options.Database), - batchstats.WithBatchSize(batchstats.DefaultBatchSize), - batchstats.WithLogger(slogtest.Make(t, nil).Leveled(slog.LevelDebug)), + // Some tests rely on being able to read stats immediately after writing. + batchstats.WithBatchSize(1), + // Avoid cluttering up test output. + batchstats.WithLogger(slog.Make(sloghuman.Sink(io.Discard))), batchstats.WithTicker(batchStatsTicker.C), ) require.NoError(t, err, "create stats batcher") From 79860082a92795666d2d766168f6f2eed09f3d0a Mon Sep 17 00:00:00 2001 From: Cian Johnston Date: Thu, 3 Aug 2023 10:15:32 +0100 Subject: [PATCH 16/24] linter --- coderd/coderd.go | 2 ++ coderd/database/dbfake/dbfake.go | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/coderd/coderd.go b/coderd/coderd.go index 6ff0f108602bf..7487e9f6ca11e 100644 --- a/coderd/coderd.go +++ b/coderd/coderd.go @@ -183,6 +183,8 @@ type Options struct { // @in header // @name Coder-Session-Token // New constructs a Coder API handler. +// +//nolint:gocyclo func New(options *Options) *API { if options == nil { options = &Options{} diff --git a/coderd/database/dbfake/dbfake.go b/coderd/database/dbfake/dbfake.go index 0904e36152655..f1d096d89e204 100644 --- a/coderd/database/dbfake/dbfake.go +++ b/coderd/database/dbfake/dbfake.go @@ -4072,7 +4072,7 @@ func (q *FakeQuerier) InsertWorkspaceAgentStat(_ context.Context, p database.Ins return stat, nil } -func (q *FakeQuerier) InsertWorkspaceAgentStats(ctx context.Context, arg database.InsertWorkspaceAgentStatsParams) error { +func (q *FakeQuerier) InsertWorkspaceAgentStats(_ context.Context, arg database.InsertWorkspaceAgentStatsParams) error { err := validateDatabaseType(arg) if err != nil { return err From 4c1020eddfefc7e09593d9fcf7c320a7ab9e0836 Mon Sep 17 00:00:00 2001 From: Cian Johnston Date: Thu, 3 Aug 2023 10:27:16 +0100 Subject: [PATCH 17/24] fmt --- coderd/batchstats/batcher_test.go | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/coderd/batchstats/batcher_test.go b/coderd/batchstats/batcher_test.go index 6816cdb1bbf9c..8353beef82447 100644 --- a/coderd/batchstats/batcher_test.go +++ b/coderd/batchstats/batcher_test.go @@ -22,9 +22,7 @@ import ( func TestBatchStats(t *testing.T) { t.Parallel() - var ( - batchSize = batchstats.DefaultBatchSize - ) + batchSize := batchstats.DefaultBatchSize // Given: a fresh batcher with no data ctx, cancel := context.WithCancel(context.Background()) From f0a0ae5aea8c4d0aebac64a57aa30fbb68b2875a Mon Sep 17 00:00:00 2001 From: Cian Johnston Date: Thu, 3 Aug 2023 12:30:18 +0100 Subject: [PATCH 18/24] fix bugged query --- coderd/database/queries.sql.go | 20 ++++++++++++++++++- .../database/queries/workspaceagentstats.sql | 20 ++++++++++++++++++- 2 files changed, 38 insertions(+), 2 deletions(-) diff --git a/coderd/database/queries.sql.go b/coderd/database/queries.sql.go index bd4aa50cf2e0c..eb426d0c54c9f 100644 --- a/coderd/database/queries.sql.go +++ b/coderd/database/queries.sql.go @@ -7318,7 +7318,25 @@ func (q *sqlQuerier) InsertWorkspaceAgentStat(ctx context.Context, arg InsertWor const insertWorkspaceAgentStats = `-- name: InsertWorkspaceAgentStats :exec INSERT INTO - workspace_agent_stats + workspace_agent_stats ( + id, + created_at, + user_id, + workspace_id, + template_id, + agent_id, + connections_by_proto, + connection_count, + rx_packets, + rx_bytes, + tx_packets, + tx_bytes, + session_count_vscode, + session_count_jetbrains, + session_count_reconnecting_pty, + session_count_ssh, + connection_median_latency_ms + ) SELECT unnest($1 :: uuid[]) AS id, unnest($2 :: timestamptz[]) AS created_at, diff --git a/coderd/database/queries/workspaceagentstats.sql b/coderd/database/queries/workspaceagentstats.sql index 89d98a420eae5..daba093a3d9e1 100644 --- a/coderd/database/queries/workspaceagentstats.sql +++ b/coderd/database/queries/workspaceagentstats.sql @@ -24,7 +24,25 @@ VALUES -- name: InsertWorkspaceAgentStats :exec INSERT INTO - workspace_agent_stats + workspace_agent_stats ( + id, + created_at, + user_id, + workspace_id, + template_id, + agent_id, + connections_by_proto, + connection_count, + rx_packets, + rx_bytes, + tx_packets, + tx_bytes, + session_count_vscode, + session_count_jetbrains, + session_count_reconnecting_pty, + session_count_ssh, + connection_median_latency_ms + ) SELECT unnest(@id :: uuid[]) AS id, unnest(@created_at :: timestamptz[]) AS created_at, From 3d842b43986e8216a85b70f4be1dfbbc23bb74d9 Mon Sep 17 00:00:00 2001 From: Cian Johnston Date: Thu, 3 Aug 2023 12:31:20 +0100 Subject: [PATCH 19/24] todone --- coderd/workspaceagents.go | 4 ---- 1 file changed, 4 deletions(-) diff --git a/coderd/workspaceagents.go b/coderd/workspaceagents.go index 361e5b68ac05d..0f5607db73436 100644 --- a/coderd/workspaceagents.go +++ b/coderd/workspaceagents.go @@ -1374,8 +1374,6 @@ func convertWorkspaceAgent(derpMap *tailcfg.DERPMap, coordinator tailnet.Coordin // @Success 200 {object} agentsdk.StatsResponse // @Router /workspaceagents/me/report-stats [post] func (api *API) workspaceAgentReportStats(rw http.ResponseWriter, r *http.Request) { - // TODO: here: instead of inserting directly, we should queue up the stats for - // periodic batch insert. ctx := r.Context() workspaceAgent := httpmw.WorkspaceAgent(r) @@ -1434,13 +1432,11 @@ func (api *API) workspaceAgentReportStats(rw http.ResponseWriter, r *http.Reques }) if api.Options.UpdateAgentMetrics != nil { errGroup.Go(func() error { - // TODO(Cian): why do we need to look up the user each time we post stats? user, err := api.Database.GetUserByID(ctx, workspace.OwnerID) if err != nil { return xerrors.Errorf("can't get user: %w", err) } - api.Logger.Debug(ctx, "updating agent metrics", slog.F("username", user.Username), slog.F("workspace", workspace.Name), slog.F("agent", workspaceAgent.Name)) api.Options.UpdateAgentMetrics(ctx, user.Username, workspace.Name, workspaceAgent.Name, req.Metrics) return nil }) From 8b4dfdc430f17068d05c6f93f394ea34dffb7516 Mon Sep 17 00:00:00 2001 From: Cian Johnston Date: Thu, 3 Aug 2023 12:33:10 +0100 Subject: [PATCH 20/24] fixup! todone --- coderd/coderd.go | 1 - 1 file changed, 1 deletion(-) diff --git a/coderd/coderd.go b/coderd/coderd.go index 7487e9f6ca11e..c13619bf63491 100644 --- a/coderd/coderd.go +++ b/coderd/coderd.go @@ -160,7 +160,6 @@ type Options struct { HTTPClient *http.Client - // TODO(Cian): This may need to be passed to batchstats.Batcher instead UpdateAgentMetrics func(ctx context.Context, username, workspaceName, agentName string, metrics []agentsdk.AgentMetric) StatsBatcher *batchstats.Batcher } From 721b99e0aadcea2b4c3625a8befb93c51fc22814 Mon Sep 17 00:00:00 2001 From: Cian Johnston Date: Thu, 3 Aug 2023 12:40:46 +0100 Subject: [PATCH 21/24] cli/server: wait for batcher to be done --- cli/server.go | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/cli/server.go b/cli/server.go index e508511a2d198..3a31f3c24140d 100644 --- a/cli/server.go +++ b/cli/server.go @@ -825,8 +825,13 @@ func (r *RootCmd) Server(newAPI func(context.Context, *coderd.Options) (*coderd. return xerrors.Errorf("failed to create agent stats batcher: %w", err) } options.StatsBatcher = b + batcherDone := make(chan struct{}) go func() { b.Run(ctx) + close(batcherDone) + }() + defer func() { + <-batcherDone }() closeCheckInactiveUsersFunc := dormancy.CheckInactiveUsers(ctx, logger, options.Database) From 8b8a4b177e0d1801c9d64021f731bffefb5ed646 Mon Sep 17 00:00:00 2001 From: Cian Johnston Date: Thu, 3 Aug 2023 12:42:21 +0100 Subject: [PATCH 22/24] fix panic error message --- coderd/coderd.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/coderd/coderd.go b/coderd/coderd.go index c13619bf63491..3e3d3589da84c 100644 --- a/coderd/coderd.go +++ b/coderd/coderd.go @@ -294,7 +294,7 @@ func New(options *Options) *API { ctx, cancel := context.WithCancel(context.Background()) if options.StatsBatcher == nil { - panic("developer error: options.Batcher is nil") + panic("developer error: options.StatsBatcher is nil") } siteCacheDir := options.CacheDir From e82fb33cff6ce09e3370c3df8639d14a9870bda6 Mon Sep 17 00:00:00 2001 From: Cian Johnston Date: Thu, 3 Aug 2023 22:04:16 +0100 Subject: [PATCH 23/24] refactor --- cli/server.go | 16 +-- coderd/batchstats/batcher.go | 125 +++++++++++------- ...tcher_test.go => batcher_internal_test.go} | 30 ++--- coderd/coderd.go | 2 +- coderd/coderdtest/coderdtest.go | 14 +- .../prometheusmetrics_test.go | 31 ++++- 6 files changed, 120 insertions(+), 98 deletions(-) rename coderd/batchstats/{batcher_test.go => batcher_internal_test.go} (93%) diff --git a/cli/server.go b/cli/server.go index 3a31f3c24140d..15cefb364ce3e 100644 --- a/cli/server.go +++ b/cli/server.go @@ -814,25 +814,15 @@ func (r *RootCmd) Server(newAPI func(context.Context, *coderd.Options) (*coderd. options.SwaggerEndpoint = cfg.Swagger.Enable.Value() } - batchStatsTicker := time.NewTicker(29 * time.Second) // Hard-coding. - defer batchStatsTicker.Stop() - b, err := batchstats.New( + batcher, closeBatcher, err := batchstats.New(ctx, batchstats.WithLogger(options.Logger.Named("batchstats")), batchstats.WithStore(options.Database), - batchstats.WithTicker(batchStatsTicker.C), ) if err != nil { return xerrors.Errorf("failed to create agent stats batcher: %w", err) } - options.StatsBatcher = b - batcherDone := make(chan struct{}) - go func() { - b.Run(ctx) - close(batcherDone) - }() - defer func() { - <-batcherDone - }() + options.StatsBatcher = batcher + defer closeBatcher() closeCheckInactiveUsersFunc := dormancy.CheckInactiveUsers(ctx, logger, options.Database) defer closeCheckInactiveUsersFunc() diff --git a/coderd/batchstats/batcher.go b/coderd/batchstats/batcher.go index 2fde397d08bc2..b0a6df3ae0742 100644 --- a/coderd/batchstats/batcher.go +++ b/coderd/batchstats/batcher.go @@ -19,8 +19,8 @@ import ( ) const ( - // DefaultBatchSize is the default size of the batcher's buffer. - DefaultBatchSize = 1024 + defaultBufferSize = 1024 + defaultFlushInterval = time.Second ) // Batcher holds a buffer of agent stats and periodically flushes them to @@ -30,14 +30,16 @@ type Batcher struct { log slog.Logger mu sync.RWMutex - buf database.InsertWorkspaceAgentStatsParams + buf *database.InsertWorkspaceAgentStatsParams // NOTE: we batch this separately as it's a jsonb field and // pq.Array + unnest doesn't play nicely with this. connectionsByProto []map[string]int64 batchSize int - // ticker is used to periodically flush the buffer. - ticker <-chan time.Time + // tickCh is used to periodically flush the buffer. + tickCh <-chan time.Time + ticker *time.Ticker + interval time.Duration // flushLever is used to signal the flusher to flush the buffer immediately. flushLever chan struct{} // flushed is used during testing to signal that a flush has completed. @@ -61,10 +63,10 @@ func WithBatchSize(size int) Option { } } -// WithTicker sets the flush interval. -func WithTicker(ch <-chan time.Time) Option { +// WithInterval sets the interval for flushes. +func WithInterval(d time.Duration) Option { return func(b *Batcher) { - b.ticker = ch + b.interval = d } } @@ -75,17 +77,8 @@ func WithLogger(log slog.Logger) Option { } } -// With Flushed sets the channel to use for signaling that a flush has completed. -// This is only used for testing. -// True signifies that a flush was forced. -func WithFlushed(ch chan bool) Option { - return func(b *Batcher) { - b.flushed = ch - } -} - -// New creates a new Batcher. -func New(opts ...Option) (*Batcher, error) { +// New creates a new Batcher and starts it. +func New(ctx context.Context, opts ...Option) (*Batcher, func(), error) { b := &Batcher{} b.log = slog.Make(sloghuman.Sink(os.Stderr)) b.flushLever = make(chan struct{}, 1) // Buffered so that it doesn't block. @@ -94,39 +87,38 @@ func New(opts ...Option) (*Batcher, error) { } if b.store == nil { - return nil, xerrors.Errorf("no store configured for batcher") + return nil, nil, xerrors.Errorf("no store configured for batcher") } - if b.ticker == nil { - return nil, xerrors.Errorf("no ticker configured for batcher") + if b.interval == 0 { + b.interval = defaultFlushInterval } if b.batchSize == 0 { - b.batchSize = DefaultBatchSize + b.batchSize = defaultBufferSize } - b.connectionsByProto = make([]map[string]int64, 0, b.batchSize) - b.buf = database.InsertWorkspaceAgentStatsParams{ - ID: make([]uuid.UUID, 0, b.batchSize), - CreatedAt: make([]time.Time, 0, b.batchSize), - UserID: make([]uuid.UUID, 0, b.batchSize), - WorkspaceID: make([]uuid.UUID, 0, b.batchSize), - TemplateID: make([]uuid.UUID, 0, b.batchSize), - AgentID: make([]uuid.UUID, 0, b.batchSize), - ConnectionsByProto: json.RawMessage("[]"), - ConnectionCount: make([]int64, 0, b.batchSize), - RxPackets: make([]int64, 0, b.batchSize), - RxBytes: make([]int64, 0, b.batchSize), - TxPackets: make([]int64, 0, b.batchSize), - TxBytes: make([]int64, 0, b.batchSize), - SessionCountVSCode: make([]int64, 0, b.batchSize), - SessionCountJetBrains: make([]int64, 0, b.batchSize), - SessionCountReconnectingPTY: make([]int64, 0, b.batchSize), - SessionCountSSH: make([]int64, 0, b.batchSize), - ConnectionMedianLatencyMS: make([]float64, 0, b.batchSize), + if b.tickCh == nil { + b.ticker = time.NewTicker(b.interval) + b.tickCh = b.ticker.C + } + + cancelCtx, cancelFunc := context.WithCancel(ctx) + done := make(chan struct{}) + go func() { + b.run(cancelCtx) + close(done) + }() + + closer := func() { + cancelFunc() + if b.ticker != nil { + b.ticker.Stop() + } + <-done } - return b, nil + return b, closer, nil } // Add adds a stat to the batcher for the given workspace and agent. @@ -172,12 +164,13 @@ func (b *Batcher) Add( } // Run runs the batcher. -func (b *Batcher) Run(ctx context.Context) { - // nolint:gocritic +func (b *Batcher) run(ctx context.Context) { + b.initBuf(b.batchSize) + // nolint:gocritic // This is only ever used for one thing - inserting agent stats. authCtx := dbauthz.AsSystemRestricted(ctx) for { select { - case <-b.ticker: + case <-b.tickCh: b.flush(authCtx, false, "scheduled") case <-b.flushLever: // If the flush lever is depressed, flush the buffer immediately. @@ -199,7 +192,12 @@ func (b *Batcher) flush(ctx context.Context, forced bool, reason string) { b.mu.Unlock() // Notify that a flush has completed. if b.flushed != nil { - b.flushed <- forced + select { + case <-ctx.Done(): + close(b.flushed) + default: + b.flushed <- forced + } } if count > 0 { elapsed := time.Since(start) @@ -225,16 +223,42 @@ func (b *Batcher) flush(ctx context.Context, forced bool, reason string) { b.buf.ConnectionsByProto = payload } - err = b.store.InsertWorkspaceAgentStats(ctx, b.buf) + err = b.store.InsertWorkspaceAgentStats(ctx, *b.buf) elapsed := time.Since(start) if err != nil { b.log.Error(ctx, "error inserting workspace agent stats", slog.Error(err), slog.F("elapsed", elapsed)) return } - // Reset the buffer. - b.connectionsByProto = b.connectionsByProto[:0] + b.resetBuf() +} +// initBuf resets the buffer. b MUST be locked. +func (b *Batcher) initBuf(size int) { + b.buf = &database.InsertWorkspaceAgentStatsParams{ + ID: make([]uuid.UUID, 0, b.batchSize), + CreatedAt: make([]time.Time, 0, b.batchSize), + UserID: make([]uuid.UUID, 0, b.batchSize), + WorkspaceID: make([]uuid.UUID, 0, b.batchSize), + TemplateID: make([]uuid.UUID, 0, b.batchSize), + AgentID: make([]uuid.UUID, 0, b.batchSize), + ConnectionsByProto: json.RawMessage("[]"), + ConnectionCount: make([]int64, 0, b.batchSize), + RxPackets: make([]int64, 0, b.batchSize), + RxBytes: make([]int64, 0, b.batchSize), + TxPackets: make([]int64, 0, b.batchSize), + TxBytes: make([]int64, 0, b.batchSize), + SessionCountVSCode: make([]int64, 0, b.batchSize), + SessionCountJetBrains: make([]int64, 0, b.batchSize), + SessionCountReconnectingPTY: make([]int64, 0, b.batchSize), + SessionCountSSH: make([]int64, 0, b.batchSize), + ConnectionMedianLatencyMS: make([]float64, 0, b.batchSize), + } + + b.connectionsByProto = make([]map[string]int64, 0, size) +} + +func (b *Batcher) resetBuf() { b.buf.ID = b.buf.ID[:0] b.buf.CreatedAt = b.buf.CreatedAt[:0] b.buf.UserID = b.buf.UserID[:0] @@ -252,4 +276,5 @@ func (b *Batcher) flush(ctx context.Context, forced bool, reason string) { b.buf.SessionCountReconnectingPTY = b.buf.SessionCountReconnectingPTY[:0] b.buf.SessionCountSSH = b.buf.SessionCountSSH[:0] b.buf.ConnectionMedianLatencyMS = b.buf.ConnectionMedianLatencyMS[:0] + b.connectionsByProto = b.connectionsByProto[:0] } diff --git a/coderd/batchstats/batcher_test.go b/coderd/batchstats/batcher_internal_test.go similarity index 93% rename from coderd/batchstats/batcher_test.go rename to coderd/batchstats/batcher_internal_test.go index 8353beef82447..162587437d8a2 100644 --- a/coderd/batchstats/batcher_test.go +++ b/coderd/batchstats/batcher_internal_test.go @@ -1,4 +1,4 @@ -package batchstats_test +package batchstats import ( "context" @@ -10,7 +10,6 @@ import ( "cdr.dev/slog" "cdr.dev/slog/sloggers/slogtest" - "github.com/coder/coder/coderd/batchstats" "github.com/coder/coder/coderd/database" "github.com/coder/coder/coderd/database/dbgen" "github.com/coder/coder/coderd/database/dbtestutil" @@ -22,8 +21,6 @@ import ( func TestBatchStats(t *testing.T) { t.Parallel() - batchSize := batchstats.DefaultBatchSize - // Given: a fresh batcher with no data ctx, cancel := context.WithCancel(context.Background()) t.Cleanup(cancel) @@ -36,24 +33,19 @@ func TestBatchStats(t *testing.T) { tick := make(chan time.Time) flushed := make(chan bool) - b, err := batchstats.New( - batchstats.WithStore(store), - batchstats.WithBatchSize(batchSize), - batchstats.WithLogger(log), - batchstats.WithTicker(tick), - batchstats.WithFlushed(flushed), + b, closer, err := New(ctx, + WithStore(store), + WithLogger(log), + func(b *Batcher) { + b.tickCh = tick + b.flushed = flushed + }, ) require.NoError(t, err) + t.Cleanup(closer) // Given: no data points are added for workspace // When: it becomes time to report stats - done := make(chan struct{}) - t.Cleanup(func() { - close(done) - }) - go func() { - b.Run(ctx) - }() t1 := time.Now() // Signal a tick and wait for a flush to complete. tick <- t1 @@ -86,8 +78,8 @@ func TestBatchStats(t *testing.T) { // Given: a lot of data points are added for both workspaces // (equal to batch size) t3 := time.Now() - t.Logf("inserting %d stats", batchSize) - for i := 0; i < batchSize; i++ { + t.Logf("inserting %d stats", 1024) + for i := 0; i < 1024; i++ { if i%2 == 0 { require.NoError(t, b.Add(deps1.Agent.ID, deps1.User.ID, deps1.Template.ID, deps1.Workspace.ID, randAgentSDKStats(t))) } else { diff --git a/coderd/coderd.go b/coderd/coderd.go index 3e3d3589da84c..58b6c902c7dbc 100644 --- a/coderd/coderd.go +++ b/coderd/coderd.go @@ -291,7 +291,6 @@ func New(options *Options) *API { v := schedule.NewAGPLUserQuietHoursScheduleStore() options.UserQuietHoursScheduleStore.Store(&v) } - ctx, cancel := context.WithCancel(context.Background()) if options.StatsBatcher == nil { panic("developer error: options.StatsBatcher is nil") @@ -330,6 +329,7 @@ func New(options *Options) *API { }) staticHandler.Experiments.Store(&experiments) + ctx, cancel := context.WithCancel(context.Background()) r := chi.NewRouter() // nolint:gocritic // Load deployment ID. This never changes diff --git a/coderd/coderdtest/coderdtest.go b/coderd/coderdtest/coderdtest.go index 4b9cda0b0b9c7..522e343a16a7b 100644 --- a/coderd/coderdtest/coderdtest.go +++ b/coderd/coderdtest/coderdtest.go @@ -245,20 +245,16 @@ func NewOptions(t testing.TB, options *Options) (func(http.Handler), context.Can options.FilesRateLimit = -1 } if options.StatsBatcher == nil { - batchStatsTicker := time.NewTicker(time.Hour) - t.Cleanup(batchStatsTicker.Stop) - options.StatsBatcher, err = batchstats.New( + ctx, cancel := context.WithCancel(context.Background()) + t.Cleanup(cancel) + batcher, closeBatcher, err := batchstats.New(ctx, batchstats.WithStore(options.Database), - // Some tests rely on being able to read stats immediately after writing. - batchstats.WithBatchSize(1), // Avoid cluttering up test output. batchstats.WithLogger(slog.Make(sloghuman.Sink(io.Discard))), - batchstats.WithTicker(batchStatsTicker.C), ) require.NoError(t, err, "create stats batcher") - ctx, cancel := context.WithCancel(context.Background()) - t.Cleanup(cancel) - go options.StatsBatcher.Run(ctx) + options.StatsBatcher = batcher + t.Cleanup(closeBatcher) } var templateScheduleStore atomic.Pointer[schedule.TemplateScheduleStore] diff --git a/coderd/prometheusmetrics/prometheusmetrics_test.go b/coderd/prometheusmetrics/prometheusmetrics_test.go index 3ea774df1186d..ad39ec840c526 100644 --- a/coderd/prometheusmetrics/prometheusmetrics_test.go +++ b/coderd/prometheusmetrics/prometheusmetrics_test.go @@ -11,6 +11,9 @@ import ( "testing" "time" + "github.com/coder/coder/coderd/batchstats" + "github.com/coder/coder/coderd/database/dbtestutil" + "github.com/google/uuid" "github.com/prometheus/client_golang/prometheus" "github.com/stretchr/testify/assert" @@ -372,9 +375,29 @@ func TestAgents(t *testing.T) { func TestAgentStats(t *testing.T) { t.Parallel() + ctx, cancelFunc := context.WithCancel(context.Background()) + t.Cleanup(cancelFunc) + + db, pubsub := dbtestutil.NewDB(t) + log := slogtest.Make(t, nil) + + batcher, closeBatcher, err := batchstats.New(ctx, + batchstats.WithStore(db), + // We want our stats, and we want them NOW. + batchstats.WithBatchSize(1), + batchstats.WithInterval(time.Hour), + batchstats.WithLogger(log), + ) + require.NoError(t, err, "create stats batcher failed") + t.Cleanup(closeBatcher) + // Build sample workspaces with test agents and fake agent client - client, _, api := coderdtest.NewWithAPI(t, &coderdtest.Options{IncludeProvisionerDaemon: true}) - db := api.Database + client, _, _ := coderdtest.NewWithAPI(t, &coderdtest.Options{ + Database: db, + IncludeProvisionerDaemon: true, + Pubsub: pubsub, + StatsBatcher: batcher, + }) user := coderdtest.CreateFirstUser(t, client) @@ -384,11 +407,7 @@ func TestAgentStats(t *testing.T) { registry := prometheus.NewRegistry() - ctx, cancelFunc := context.WithCancel(context.Background()) - defer cancelFunc() - // given - var err error var i int64 for i = 0; i < 3; i++ { _, err = agent1.PostStats(ctx, &agentsdk.Stats{ From 35c8d92278bfc5d2d8622b5e3b68a5f21f0ac1b2 Mon Sep 17 00:00:00 2001 From: Cian Johnston Date: Fri, 4 Aug 2023 10:28:02 +0100 Subject: [PATCH 24/24] flush buffer early to avoid growing buf --- coderd/batchstats/batcher.go | 25 ++++++---- coderd/batchstats/batcher_internal_test.go | 53 ++++++++++++++-------- 2 files changed, 52 insertions(+), 26 deletions(-) diff --git a/coderd/batchstats/batcher.go b/coderd/batchstats/batcher.go index b0a6df3ae0742..fc177fd143d6a 100644 --- a/coderd/batchstats/batcher.go +++ b/coderd/batchstats/batcher.go @@ -5,6 +5,7 @@ import ( "encoding/json" "os" "sync" + "sync/atomic" "time" "github.com/google/uuid" @@ -29,7 +30,8 @@ type Batcher struct { store database.Store log slog.Logger - mu sync.RWMutex + mu sync.Mutex + // TODO: make this a buffered chan instead? buf *database.InsertWorkspaceAgentStatsParams // NOTE: we batch this separately as it's a jsonb field and // pq.Array + unnest doesn't play nicely with this. @@ -41,9 +43,10 @@ type Batcher struct { ticker *time.Ticker interval time.Duration // flushLever is used to signal the flusher to flush the buffer immediately. - flushLever chan struct{} + flushLever chan struct{} + flushForced atomic.Bool // flushed is used during testing to signal that a flush has completed. - flushed chan<- bool + flushed chan<- int } // Option is a functional option for configuring a Batcher. @@ -156,9 +159,13 @@ func (b *Batcher) Add( b.buf.SessionCountSSH = append(b.buf.SessionCountSSH, st.SessionCountSSH) b.buf.ConnectionMedianLatencyMS = append(b.buf.ConnectionMedianLatencyMS, st.ConnectionMedianLatencyMS) - // If the buffer is full, signal the flusher to flush immediately. - if len(b.buf.ID) == cap(b.buf.ID) { + // If the buffer is over 80% full, signal the flusher to flush immediately. + // We want to trigger flushes early to reduce the likelihood of + // accidentally growing the buffer over batchSize. + filled := float64(len(b.buf.ID)) / float64(b.batchSize) + if filled >= 0.8 && !b.flushForced.Load() { b.flushLever <- struct{}{} + b.flushForced.Store(true) } return nil } @@ -174,7 +181,7 @@ func (b *Batcher) run(ctx context.Context) { b.flush(authCtx, false, "scheduled") case <-b.flushLever: // If the flush lever is depressed, flush the buffer immediately. - b.flush(authCtx, true, "full buffer") + b.flush(authCtx, true, "reaching capacity") case <-ctx.Done(): b.log.Warn(ctx, "context done, flushing before exit") b.flush(authCtx, true, "exit") @@ -186,17 +193,19 @@ func (b *Batcher) run(ctx context.Context) { // flush flushes the batcher's buffer. func (b *Batcher) flush(ctx context.Context, forced bool, reason string) { b.mu.Lock() + b.flushForced.Store(true) start := time.Now() count := len(b.buf.ID) defer func() { + b.flushForced.Store(false) b.mu.Unlock() - // Notify that a flush has completed. + // Notify that a flush has completed. This only happens in tests. if b.flushed != nil { select { case <-ctx.Done(): close(b.flushed) default: - b.flushed <- forced + b.flushed <- count } } if count > 0 { diff --git a/coderd/batchstats/batcher_internal_test.go b/coderd/batchstats/batcher_internal_test.go index 162587437d8a2..a6e28f1a9f389 100644 --- a/coderd/batchstats/batcher_internal_test.go +++ b/coderd/batchstats/batcher_internal_test.go @@ -31,7 +31,7 @@ func TestBatchStats(t *testing.T) { deps1 := setupDeps(t, store) deps2 := setupDeps(t, store) tick := make(chan time.Time) - flushed := make(chan bool) + flushed := make(chan int) b, closer, err := New(ctx, WithStore(store), @@ -50,7 +50,7 @@ func TestBatchStats(t *testing.T) { // Signal a tick and wait for a flush to complete. tick <- t1 f := <-flushed - require.False(t, f, "flush should not have been forced") + require.Equal(t, 0, f, "expected no data to be flushed") t.Logf("flush 1 completed") // Then: it should report no stats. @@ -67,7 +67,7 @@ func TestBatchStats(t *testing.T) { // Signal a tick and wait for a flush to complete. tick <- t2 f = <-flushed // Wait for a flush to complete. - require.False(t, f, "flush should not have been forced") + require.Equal(t, 1, f, "expected one stat to be flushed") t.Logf("flush 2 completed") // Then: it should report a single stat. @@ -78,36 +78,53 @@ func TestBatchStats(t *testing.T) { // Given: a lot of data points are added for both workspaces // (equal to batch size) t3 := time.Now() - t.Logf("inserting %d stats", 1024) - for i := 0; i < 1024; i++ { - if i%2 == 0 { - require.NoError(t, b.Add(deps1.Agent.ID, deps1.User.ID, deps1.Template.ID, deps1.Workspace.ID, randAgentSDKStats(t))) - } else { - require.NoError(t, b.Add(deps2.Agent.ID, deps2.User.ID, deps2.Template.ID, deps2.Workspace.ID, randAgentSDKStats(t))) + done := make(chan struct{}) + + go func() { + defer close(done) + t.Logf("inserting %d stats", defaultBufferSize) + for i := 0; i < defaultBufferSize; i++ { + if i%2 == 0 { + require.NoError(t, b.Add(deps1.Agent.ID, deps1.User.ID, deps1.Template.ID, deps1.Workspace.ID, randAgentSDKStats(t))) + } else { + require.NoError(t, b.Add(deps2.Agent.ID, deps2.User.ID, deps2.Template.ID, deps2.Workspace.ID, randAgentSDKStats(t))) + } } - } + }() - // When: the buffer is full - // Wait for a flush to complete. This should be forced by filling the buffer. + // When: the buffer comes close to capacity + // Then: The buffer will force-flush once. f = <-flushed - require.True(t, f, "flush should have been forced") t.Logf("flush 3 completed") + require.Greater(t, f, 819, "expected at least 819 stats to be flushed (>=80% of buffer)") + // And we should finish inserting the stats + <-done - // Then: it should immediately flush its stats to store. stats, err = store.GetWorkspaceAgentStats(ctx, t3) require.NoError(t, err, "should not error getting stats") require.Len(t, stats, 2, "should have stats for both workspaces") - // Ensure that a subsequent flush does not push stale data. + // Ensures that a subsequent flush pushes all the remaining data t4 := time.Now() tick <- t4 - f = <-flushed - require.False(t, f, "flush should not have been forced") + f2 := <-flushed t.Logf("flush 4 completed") + expectedCount := defaultBufferSize - f + require.Equal(t, expectedCount, f2, "did not flush expected remaining rows") - stats, err = store.GetWorkspaceAgentStats(ctx, t4) + // Ensure that a subsequent flush does not push stale data. + t5 := time.Now() + tick <- t5 + f = <-flushed + require.Zero(t, f, "expected zero stats to have been flushed") + t.Logf("flush 5 completed") + + stats, err = store.GetWorkspaceAgentStats(ctx, t5) require.NoError(t, err, "should not error getting stats") require.Len(t, stats, 0, "should have no stats for workspace") + + // Ensure that buf never grew beyond what we expect + require.Equal(t, defaultBufferSize, cap(b.buf.ID), "buffer grew beyond expected capacity") } // randAgentSDKStats returns a random agentsdk.Stats