From 2f978c32ea03867cdee261e69d71de57be0e27bf Mon Sep 17 00:00:00 2001 From: Steven Masley Date: Fri, 8 Dec 2023 12:59:26 -0600 Subject: [PATCH 01/19] wip: work on agent script metrics --- agent/agent.go | 24 +++++++++++-- agent/agentscripts/agentscripts.go | 34 +++++++++++++++++-- agent/metrics.go | 14 ++++++++ coderd/database/models.go | 2 +- coderd/database/querier.go | 2 +- coderd/database/queries.sql.go | 2 +- .../database/queries/workspaceagentstats.sql | 11 ++++-- coderd/workspaceagents.go | 3 +- codersdk/agentsdk/agentsdk.go | 12 ++++++- 9 files changed, 91 insertions(+), 13 deletions(-) diff --git a/agent/agent.go b/agent/agent.go index 1b5247331b4c4..27a8d0d46d619 100644 --- a/agent/agent.go +++ b/agent/agent.go @@ -35,6 +35,8 @@ import ( "tailscale.com/types/netlogtype" "cdr.dev/slog" + "github.com/coder/retry" + "github.com/coder/coder/v2/agent/agentproc" "github.com/coder/coder/v2/agent/agentscripts" "github.com/coder/coder/v2/agent/agentssh" @@ -45,7 +47,6 @@ import ( "github.com/coder/coder/v2/codersdk" "github.com/coder/coder/v2/codersdk/agentsdk" "github.com/coder/coder/v2/tailnet" - "github.com/coder/retry" ) const ( @@ -222,8 +223,11 @@ type agent struct { connCountReconnectingPTY atomic.Int64 prometheusRegistry *prometheus.Registry - metrics *agentMetrics - syscaller agentproc.Syscaller + // metrics are prometheus registered metrics that will be collected and + // labeled in Coder with the agent + workspace. + metrics *agentMetrics + stats agentStats + syscaller agentproc.Syscaller // modifiedProcs is used for testing process priority management. modifiedProcs chan []*agentproc.Process @@ -252,6 +256,9 @@ func (a *agent) init(ctx context.Context) { Filesystem: a.filesystem, PatchLogs: a.client.PatchLogs, }) + // Register runner metrics. If the prom registry is nil, the metrics + // will not report anywhere. + a.scriptRunner.RegisterMetrics(a.prometheusRegistry) go a.runLoop(ctx) } @@ -745,6 +752,7 @@ func (a *agent) run(ctx context.Context) error { return xerrors.Errorf("init script runner: %w", err) } err = a.trackConnGoroutine(func() { + start := time.Now() err := a.scriptRunner.Execute(ctx, func(script codersdk.WorkspaceAgentScript) bool { return script.RunOnStart }) @@ -758,6 +766,16 @@ func (a *agent) run(ctx context.Context) error { } else { a.setLifecycle(ctx, codersdk.WorkspaceAgentLifecycleReady) } + + dur := time.Since(start).Nanoseconds() + // If something really look 0 ns, just set it to 1 to indicate that it ran. + // Otherwise, 0 looks like the startup script has not run yet. I don't think + // this will ever be 1ns + if dur == 0 { + dur = 1 + } + a.stats.startScriptNs.Store(dur) + a.stats.startScriptSuccess.Store(err == nil) a.scriptRunner.StartCron() }) if err != nil { diff --git a/agent/agentscripts/agentscripts.go b/agent/agentscripts/agentscripts.go index 3acc48b0a140c..045af8461ead2 100644 --- a/agent/agentscripts/agentscripts.go +++ b/agent/agentscripts/agentscripts.go @@ -13,12 +13,14 @@ import ( "sync/atomic" "time" + "github.com/prometheus/client_golang/prometheus" "github.com/robfig/cron/v3" "github.com/spf13/afero" "golang.org/x/sync/errgroup" "golang.org/x/xerrors" "cdr.dev/slog" + "github.com/coder/coder/v2/agent/agentssh" "github.com/coder/coder/v2/codersdk" "github.com/coder/coder/v2/codersdk/agentsdk" @@ -57,6 +59,11 @@ func New(opts Options) *Runner { cronCtxCancel: cronCtxCancel, cron: cron.New(cron.WithParser(parser)), closed: make(chan struct{}), + scriptsExecuted: prometheus.NewCounterVec(prometheus.CounterOpts{ + Namespace: "agent", + Subsystem: "scripts", + Name: "executed_total", + }, []string{"success"}), } } @@ -71,6 +78,18 @@ type Runner struct { cron *cron.Cron initialized atomic.Bool scripts []codersdk.WorkspaceAgentScript + + // Metrics + // scripts includes scripts that are scheduled. + scriptsExecuted *prometheus.CounterVec +} + +func (r *Runner) RegisterMetrics(reg prometheus.Registerer) { + if reg == nil { + // If no registry, do nothing. + return + } + reg.MustRegister(r.scriptsExecuted) } // Init initializes the runner with the provided scripts. @@ -90,7 +109,7 @@ func (r *Runner) Init(scripts []codersdk.WorkspaceAgentScript) error { } script := script _, err := r.cron.AddFunc(script.Cron, func() { - err := r.run(r.cronCtx, script) + err := r.trackRun(r.cronCtx, script) if err != nil { r.Logger.Warn(context.Background(), "run agent script on schedule", slog.Error(err)) } @@ -131,7 +150,7 @@ func (r *Runner) Execute(ctx context.Context, filter func(script codersdk.Worksp } script := script eg.Go(func() error { - err := r.run(ctx, script) + err := r.trackRun(ctx, script) if err != nil { return xerrors.Errorf("run agent script %q: %w", script.LogSourceID, err) } @@ -141,6 +160,17 @@ func (r *Runner) Execute(ctx context.Context, filter func(script codersdk.Worksp return eg.Wait() } +// trackRun wraps "run" with metrics. +func (r *Runner) trackRun(ctx context.Context, script codersdk.WorkspaceAgentScript) error { + err := r.run(ctx, script) + if err != nil { + r.scriptsExecuted.WithLabelValues("false").Add(1) + } else { + r.scriptsExecuted.WithLabelValues("true").Add(1) + } + return err +} + // run executes the provided script with the timeout. // If the timeout is exceeded, the process is sent an interrupt signal. // If the process does not exit after a few seconds, it is forcefully killed. diff --git a/agent/metrics.go b/agent/metrics.go index ddbe6f49beed1..5bb8718bdf749 100644 --- a/agent/metrics.go +++ b/agent/metrics.go @@ -7,6 +7,7 @@ import ( "github.com/prometheus/client_golang/prometheus" prompb "github.com/prometheus/client_model/go" + "go.uber.org/atomic" "tailscale.com/util/clientmetric" "cdr.dev/slog" @@ -14,6 +15,19 @@ import ( "github.com/coder/coder/v2/codersdk/agentsdk" ) +// agentStats unlike agentMetrics, are not prometheus metrics. Prometheus' metrics +// are sent to Coder as generic "metrics" that get labeled and reported for each +// workspace. agentStats are sent to Coder as first-class metrics that Coder decides +// how to aggregate and report. +type agentStats struct { + // startScriptNs is the time in nanoseconds that the start script(s) + // took to run. This is reported once per agent, and is collected into a + // histogram by Coder. + startScriptNs atomic.Int64 + // startScriptSuccess should be ignored if startScriptReadyMs is 0. + startScriptSuccess atomic.Bool +} + type agentMetrics struct { connectionsTotal prometheus.Counter reconnectingPTYErrors *prometheus.CounterVec diff --git a/coderd/database/models.go b/coderd/database/models.go index 19d051aba0ab1..554d08c70e990 100644 --- a/coderd/database/models.go +++ b/coderd/database/models.go @@ -1,6 +1,6 @@ // Code generated by sqlc. DO NOT EDIT. // versions: -// sqlc v1.20.0 +// sqlc v1.23.0 package database diff --git a/coderd/database/querier.go b/coderd/database/querier.go index ed4c57e258bef..f06e251a75bd9 100644 --- a/coderd/database/querier.go +++ b/coderd/database/querier.go @@ -1,6 +1,6 @@ // Code generated by sqlc. DO NOT EDIT. // versions: -// sqlc v1.20.0 +// sqlc v1.23.0 package database diff --git a/coderd/database/queries.sql.go b/coderd/database/queries.sql.go index a8931e0d29a94..a10e194cf9cfb 100644 --- a/coderd/database/queries.sql.go +++ b/coderd/database/queries.sql.go @@ -1,6 +1,6 @@ // Code generated by sqlc. DO NOT EDIT. // versions: -// sqlc v1.20.0 +// sqlc v1.23.0 package database diff --git a/coderd/database/queries/workspaceagentstats.sql b/coderd/database/queries/workspaceagentstats.sql index cf059121dec77..31263699ce79a 100644 --- a/coderd/database/queries/workspaceagentstats.sql +++ b/coderd/database/queries/workspaceagentstats.sql @@ -171,10 +171,11 @@ WITH agent_stats AS ( WHERE created_at > $1 AND connection_median_latency_ms > 0 ) AS a WHERE a.rn = 1 - GROUP BY a.user_id, a.agent_id, a.workspace_id + GROUP BY a.user_id, a.agent_id, a.workspace_id, a.template_id ) SELECT - users.username, workspace_agents.name AS agent_name, workspaces.name AS workspace_name, rx_bytes, tx_bytes, + users.username, workspace_agents.name AS agent_name, workspaces.name AS workspace_name, + workspaces.template_id AS template_id, rx_bytes, tx_bytes, session_count_vscode, session_count_ssh, session_count_jetbrains, session_count_reconnecting_pty, connection_count, connection_median_latency_ms FROM @@ -194,4 +195,8 @@ ON JOIN workspaces ON - workspaces.id = agent_stats.workspace_id; + workspaces.id = agent_stats.workspace_id +JOIN + templates +ON + templates.id = workspaces.template_id; diff --git a/coderd/workspaceagents.go b/coderd/workspaceagents.go index c862706c5620a..d8f5ec842a4db 100644 --- a/coderd/workspaceagents.go +++ b/coderd/workspaceagents.go @@ -30,6 +30,7 @@ import ( "tailscale.com/tailcfg" "cdr.dev/slog" + "github.com/coder/coder/v2/coderd/autobuild" "github.com/coder/coder/v2/coderd/database" "github.com/coder/coder/v2/coderd/database/dbauthz" @@ -1677,7 +1678,7 @@ func (api *API) workspaceAgentReportStats(rw http.ResponseWriter, r *http.Reques var nextAutostart time.Time if workspace.AutostartSchedule.String != "" { templateSchedule, err := (*(api.TemplateScheduleStore.Load())).Get(ctx, api.Database, workspace.TemplateID) - // If the template schedule fails to load, just default to bumping without the next trasition and log it. + // If the template schedule fails to load, just default to bumping without the next transition and log it. if err != nil { api.Logger.Warn(ctx, "failed to load template schedule bumping activity, defaulting to bumping by 60min", slog.F("workspace_id", workspace.ID), diff --git a/codersdk/agentsdk/agentsdk.go b/codersdk/agentsdk/agentsdk.go index 1ca60a09b12b7..9073f35622dc6 100644 --- a/codersdk/agentsdk/agentsdk.go +++ b/codersdk/agentsdk/agentsdk.go @@ -19,8 +19,9 @@ import ( "tailscale.com/tailcfg" "cdr.dev/slog" - "github.com/coder/coder/v2/codersdk" "github.com/coder/retry" + + "github.com/coder/coder/v2/codersdk" ) // ExternalLogSourceID is the statically-defined ID of a log-source that @@ -574,6 +575,15 @@ type Stats struct { // that are normal, non-tagged SSH sessions. SessionCountSSH int64 `json:"session_count_ssh"` + // Script stats relate to all scripts executed by the agent. + // StartupScriptNs is the duration in nano seconds the startup scripts + // took to execute. If there are no scripts, this still has some value > 0. + // This is because the act of "no script" still takes time to eval, and still + // has a "success" value. + StartupScriptNs int64 `json:"startup_script_ns"` + // StartupScriptSuccess is true if the startup script(s) executed successfully. + StartupScriptSuccess bool `json:"startup_script_success"` + // Metrics collected by the agent Metrics []AgentMetric `json:"metrics"` } From c567eab6aee2ec6b77858384c84fe9eb314f97c4 Mon Sep 17 00:00:00 2001 From: Steven Masley Date: Mon, 11 Dec 2023 10:40:10 -0600 Subject: [PATCH 02/19] push startup script metrics to agent --- agent/agent.go | 5 +++++ codersdk/agentsdk/agentsdk.go | 1 - 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/agent/agent.go b/agent/agent.go index 27a8d0d46d619..536a7336f92bc 100644 --- a/agent/agent.go +++ b/agent/agent.go @@ -1203,6 +1203,11 @@ func (a *agent) startReportingConnectionStats(ctx context.Context) { stats.TxPackets += int64(counts.TxPackets) } + // Load the latest startup script stats. These stats are static + // once the agent has started. + stats.StartupScriptNs = a.stats.startScriptNs.Load() + stats.StartupScriptSuccess = a.stats.startScriptSuccess.Load() + // The count of active sessions. sshStats := a.sshServer.ConnStats() stats.SessionCountSSH = sshStats.Sessions diff --git a/codersdk/agentsdk/agentsdk.go b/codersdk/agentsdk/agentsdk.go index 9073f35622dc6..b1c80f2b4be44 100644 --- a/codersdk/agentsdk/agentsdk.go +++ b/codersdk/agentsdk/agentsdk.go @@ -575,7 +575,6 @@ type Stats struct { // that are normal, non-tagged SSH sessions. SessionCountSSH int64 `json:"session_count_ssh"` - // Script stats relate to all scripts executed by the agent. // StartupScriptNs is the duration in nano seconds the startup scripts // took to execute. If there are no scripts, this still has some value > 0. // This is because the act of "no script" still takes time to eval, and still From 5d39495eceb7a16c7166b540bb4860c801361ec1 Mon Sep 17 00:00:00 2001 From: Steven Masley Date: Mon, 11 Dec 2023 16:04:43 -0600 Subject: [PATCH 03/19] Add metrics pushing to prometheus --- coderd/batchstats/batcher.go | 3 + coderd/database/dump.sql | 8 +- ...000176_agent_startup_script_stats.down.sql | 2 + .../000176_agent_startup_script_stats.up.sql | 5 + coderd/database/models.go | 4 + coderd/database/queries.sql.go | 66 +++++++--- .../database/queries/workspaceagentstats.sql | 18 ++- coderd/prometheusmetrics/collector.go | 118 +++++++++++++----- coderd/prometheusmetrics/prometheusmetrics.go | 15 +++ 9 files changed, 180 insertions(+), 59 deletions(-) create mode 100644 coderd/database/migrations/000176_agent_startup_script_stats.down.sql create mode 100644 coderd/database/migrations/000176_agent_startup_script_stats.up.sql diff --git a/coderd/batchstats/batcher.go b/coderd/batchstats/batcher.go index cc234c693e462..0bafede3cd9ae 100644 --- a/coderd/batchstats/batcher.go +++ b/coderd/batchstats/batcher.go @@ -13,6 +13,7 @@ import ( "cdr.dev/slog" "cdr.dev/slog/sloggers/sloghuman" + "github.com/coder/coder/v2/coderd/database" "github.com/coder/coder/v2/coderd/database/dbauthz" "github.com/coder/coder/v2/coderd/database/dbtime" @@ -161,6 +162,8 @@ func (b *Batcher) Add( b.buf.SessionCountReconnectingPTY = append(b.buf.SessionCountReconnectingPTY, st.SessionCountReconnectingPTY) b.buf.SessionCountSSH = append(b.buf.SessionCountSSH, st.SessionCountSSH) b.buf.ConnectionMedianLatencyMS = append(b.buf.ConnectionMedianLatencyMS, st.ConnectionMedianLatencyMS) + b.buf.StartupScriptNs = append(b.buf.StartupScriptNs, st.StartupScriptNs) + b.buf.StartupScriptSuccess = append(b.buf.StartupScriptSuccess, st.StartupScriptSuccess) // If the buffer is over 80% full, signal the flusher to flush immediately. // We want to trigger flushes early to reduce the likelihood of diff --git a/coderd/database/dump.sql b/coderd/database/dump.sql index a8b16269389d3..7f07f9adb9f84 100644 --- a/coderd/database/dump.sql +++ b/coderd/database/dump.sql @@ -954,9 +954,15 @@ CREATE TABLE workspace_agent_stats ( session_count_vscode bigint DEFAULT 0 NOT NULL, session_count_jetbrains bigint DEFAULT 0 NOT NULL, session_count_reconnecting_pty bigint DEFAULT 0 NOT NULL, - session_count_ssh bigint DEFAULT 0 NOT NULL + session_count_ssh bigint DEFAULT 0 NOT NULL, + startup_script_ns bigint DEFAULT 0 NOT NULL, + startup_script_success boolean DEFAULT false NOT NULL ); +COMMENT ON COLUMN workspace_agent_stats.startup_script_ns IS 'The time it took to run the startup script in nanoseconds. If set to 0, the startup script was not run.'; + +COMMENT ON COLUMN workspace_agent_stats.startup_script_success IS 'Whether the startup script was run successfully. Will be false if the duration is 0, but the script has not been run.'; + CREATE TABLE workspace_agents ( id uuid NOT NULL, created_at timestamp with time zone NOT NULL, diff --git a/coderd/database/migrations/000176_agent_startup_script_stats.down.sql b/coderd/database/migrations/000176_agent_startup_script_stats.down.sql new file mode 100644 index 0000000000000..558efb25cf707 --- /dev/null +++ b/coderd/database/migrations/000176_agent_startup_script_stats.down.sql @@ -0,0 +1,2 @@ +ALTER TABLE workspace_agent_stats DROP COLUMN startup_script_ns; +ALTER TABLE workspace_agent_stats DROP COLUMN startup_script_success; diff --git a/coderd/database/migrations/000176_agent_startup_script_stats.up.sql b/coderd/database/migrations/000176_agent_startup_script_stats.up.sql new file mode 100644 index 0000000000000..29faa9607008d --- /dev/null +++ b/coderd/database/migrations/000176_agent_startup_script_stats.up.sql @@ -0,0 +1,5 @@ +ALTER TABLE workspace_agent_stats ADD COLUMN startup_script_ns BIGINT NOT NULL DEFAULT 0; +ALTER TABLE workspace_agent_stats ADD COLUMN startup_script_success BOOL NOT NULL DEFAULT false; + +COMMENT ON COLUMN workspace_agent_stats.startup_script_ns IS 'The time it took to run the startup script in nanoseconds. If set to 0, the startup script was not run.'; +COMMENT ON COLUMN workspace_agent_stats.startup_script_success IS 'Whether the startup script was run successfully. Will be false if the duration is 0, but the script has not been run.'; diff --git a/coderd/database/models.go b/coderd/database/models.go index 554d08c70e990..229160e7f881e 100644 --- a/coderd/database/models.go +++ b/coderd/database/models.go @@ -2265,6 +2265,10 @@ type WorkspaceAgentStat struct { SessionCountJetBrains int64 `db:"session_count_jetbrains" json:"session_count_jetbrains"` SessionCountReconnectingPTY int64 `db:"session_count_reconnecting_pty" json:"session_count_reconnecting_pty"` SessionCountSSH int64 `db:"session_count_ssh" json:"session_count_ssh"` + // The time it took to run the startup script in nanoseconds. If set to 0, the startup script was not run. + StartupScriptNs int64 `db:"startup_script_ns" json:"startup_script_ns"` + // Whether the startup script was run successfully. Will be false if the duration is 0, but the script has not been run. + StartupScriptSuccess bool `db:"startup_script_success" json:"startup_script_success"` } type WorkspaceApp struct { diff --git a/coderd/database/queries.sql.go b/coderd/database/queries.sql.go index a10e194cf9cfb..23cf04f2e020a 100644 --- a/coderd/database/queries.sql.go +++ b/coderd/database/queries.sql.go @@ -8753,7 +8753,7 @@ WITH agent_stats AS ( coalesce(SUM(session_count_jetbrains), 0)::bigint AS session_count_jetbrains, coalesce(SUM(session_count_reconnecting_pty), 0)::bigint AS session_count_reconnecting_pty FROM ( - SELECT id, created_at, user_id, agent_id, workspace_id, template_id, connections_by_proto, connection_count, rx_packets, rx_bytes, tx_packets, tx_bytes, connection_median_latency_ms, session_count_vscode, session_count_jetbrains, session_count_reconnecting_pty, session_count_ssh, ROW_NUMBER() OVER(PARTITION BY agent_id ORDER BY created_at DESC) AS rn + SELECT id, created_at, user_id, agent_id, workspace_id, template_id, connections_by_proto, connection_count, rx_packets, rx_bytes, tx_packets, tx_bytes, connection_median_latency_ms, session_count_vscode, session_count_jetbrains, session_count_reconnecting_pty, session_count_ssh, startup_script_ns, startup_script_success, ROW_NUMBER() OVER(PARTITION BY agent_id ORDER BY created_at DESC) AS rn FROM workspace_agent_stats WHERE created_at > $1 ) AS a WHERE a.rn = 1 ) @@ -8858,7 +8858,7 @@ WITH agent_stats AS ( coalesce(SUM(session_count_jetbrains), 0)::bigint AS session_count_jetbrains, coalesce(SUM(session_count_reconnecting_pty), 0)::bigint AS session_count_reconnecting_pty FROM ( - SELECT id, created_at, user_id, agent_id, workspace_id, template_id, connections_by_proto, connection_count, rx_packets, rx_bytes, tx_packets, tx_bytes, connection_median_latency_ms, session_count_vscode, session_count_jetbrains, session_count_reconnecting_pty, session_count_ssh, ROW_NUMBER() OVER(PARTITION BY agent_id ORDER BY created_at DESC) AS rn + SELECT id, created_at, user_id, agent_id, workspace_id, template_id, connections_by_proto, connection_count, rx_packets, rx_bytes, tx_packets, tx_bytes, connection_median_latency_ms, session_count_vscode, session_count_jetbrains, session_count_reconnecting_pty, session_count_ssh, startup_script_ns, startup_script_success, ROW_NUMBER() OVER(PARTITION BY agent_id ORDER BY created_at DESC) AS rn FROM workspace_agent_stats WHERE created_at > $1 ) AS a WHERE a.rn = 1 GROUP BY a.user_id, a.agent_id, a.workspace_id, a.template_id ) @@ -8939,9 +8939,12 @@ WITH agent_stats AS ( coalesce(SUM(session_count_jetbrains), 0)::bigint AS session_count_jetbrains, coalesce(SUM(session_count_reconnecting_pty), 0)::bigint AS session_count_reconnecting_pty, coalesce(SUM(connection_count), 0)::bigint AS connection_count, - coalesce(MAX(connection_median_latency_ms), 0)::float AS connection_median_latency_ms + coalesce(MAX(connection_median_latency_ms), 0)::float AS connection_median_latency_ms, + -- TODO: Figure this out + coalesce(MAX(startup_script_ns), 0)::float AS startup_script_ns, + coalesce(MAX(startup_script_success), false)::float AS startup_script_success FROM ( - SELECT id, created_at, user_id, agent_id, workspace_id, template_id, connections_by_proto, connection_count, rx_packets, rx_bytes, tx_packets, tx_bytes, connection_median_latency_ms, session_count_vscode, session_count_jetbrains, session_count_reconnecting_pty, session_count_ssh, ROW_NUMBER() OVER(PARTITION BY agent_id ORDER BY created_at DESC) AS rn + SELECT id, created_at, user_id, agent_id, workspace_id, template_id, connections_by_proto, connection_count, rx_packets, rx_bytes, tx_packets, tx_bytes, connection_median_latency_ms, session_count_vscode, session_count_jetbrains, session_count_reconnecting_pty, session_count_ssh, startup_script_ns, startup_script_success, ROW_NUMBER() OVER(PARTITION BY agent_id ORDER BY created_at DESC) AS rn FROM workspace_agent_stats -- The greater than 0 is to support legacy agents that don't report connection_median_latency_ms. WHERE created_at > $1 AND connection_median_latency_ms > 0 @@ -8950,9 +8953,10 @@ WITH agent_stats AS ( GROUP BY a.user_id, a.agent_id, a.workspace_id ) SELECT - users.username, workspace_agents.name AS agent_name, workspaces.name AS workspace_name, rx_bytes, tx_bytes, + users.username, workspace_agents.name AS agent_name, workspaces.name AS workspace_name, + workspaces.template_id AS template_id, rx_bytes, tx_bytes, session_count_vscode, session_count_ssh, session_count_jetbrains, session_count_reconnecting_pty, - connection_count, connection_median_latency_ms + connection_count, connection_median_latency_ms, startup_script_ns, startup_script_success, templates.name AS template_name FROM agent_stats JOIN @@ -8971,20 +8975,28 @@ JOIN workspaces ON workspaces.id = agent_stats.workspace_id +JOIN + templates +ON + templates.id = workspaces.template_id ` type GetWorkspaceAgentStatsAndLabelsRow struct { - Username string `db:"username" json:"username"` - AgentName string `db:"agent_name" json:"agent_name"` - WorkspaceName string `db:"workspace_name" json:"workspace_name"` - RxBytes int64 `db:"rx_bytes" json:"rx_bytes"` - TxBytes int64 `db:"tx_bytes" json:"tx_bytes"` - SessionCountVSCode int64 `db:"session_count_vscode" json:"session_count_vscode"` - SessionCountSSH int64 `db:"session_count_ssh" json:"session_count_ssh"` - SessionCountJetBrains int64 `db:"session_count_jetbrains" json:"session_count_jetbrains"` - SessionCountReconnectingPTY int64 `db:"session_count_reconnecting_pty" json:"session_count_reconnecting_pty"` - ConnectionCount int64 `db:"connection_count" json:"connection_count"` - ConnectionMedianLatencyMS float64 `db:"connection_median_latency_ms" json:"connection_median_latency_ms"` + Username string `db:"username" json:"username"` + AgentName string `db:"agent_name" json:"agent_name"` + WorkspaceName string `db:"workspace_name" json:"workspace_name"` + TemplateID uuid.UUID `db:"template_id" json:"template_id"` + RxBytes int64 `db:"rx_bytes" json:"rx_bytes"` + TxBytes int64 `db:"tx_bytes" json:"tx_bytes"` + SessionCountVSCode int64 `db:"session_count_vscode" json:"session_count_vscode"` + SessionCountSSH int64 `db:"session_count_ssh" json:"session_count_ssh"` + SessionCountJetBrains int64 `db:"session_count_jetbrains" json:"session_count_jetbrains"` + SessionCountReconnectingPTY int64 `db:"session_count_reconnecting_pty" json:"session_count_reconnecting_pty"` + ConnectionCount int64 `db:"connection_count" json:"connection_count"` + ConnectionMedianLatencyMS float64 `db:"connection_median_latency_ms" json:"connection_median_latency_ms"` + StartupScriptNs float64 `db:"startup_script_ns" json:"startup_script_ns"` + StartupScriptSuccess float64 `db:"startup_script_success" json:"startup_script_success"` + TemplateName string `db:"template_name" json:"template_name"` } func (q *sqlQuerier) GetWorkspaceAgentStatsAndLabels(ctx context.Context, createdAt time.Time) ([]GetWorkspaceAgentStatsAndLabelsRow, error) { @@ -9000,6 +9012,7 @@ func (q *sqlQuerier) GetWorkspaceAgentStatsAndLabels(ctx context.Context, create &i.Username, &i.AgentName, &i.WorkspaceName, + &i.TemplateID, &i.RxBytes, &i.TxBytes, &i.SessionCountVSCode, @@ -9008,6 +9021,9 @@ func (q *sqlQuerier) GetWorkspaceAgentStatsAndLabels(ctx context.Context, create &i.SessionCountReconnectingPTY, &i.ConnectionCount, &i.ConnectionMedianLatencyMS, + &i.StartupScriptNs, + &i.StartupScriptSuccess, + &i.TemplateName, ); err != nil { return nil, err } @@ -9044,7 +9060,7 @@ INSERT INTO connection_median_latency_ms ) VALUES - ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17) RETURNING id, created_at, user_id, agent_id, workspace_id, template_id, connections_by_proto, connection_count, rx_packets, rx_bytes, tx_packets, tx_bytes, connection_median_latency_ms, session_count_vscode, session_count_jetbrains, session_count_reconnecting_pty, session_count_ssh + ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17) RETURNING id, created_at, user_id, agent_id, workspace_id, template_id, connections_by_proto, connection_count, rx_packets, rx_bytes, tx_packets, tx_bytes, connection_median_latency_ms, session_count_vscode, session_count_jetbrains, session_count_reconnecting_pty, session_count_ssh, startup_script_ns, startup_script_success ` type InsertWorkspaceAgentStatParams struct { @@ -9106,6 +9122,8 @@ func (q *sqlQuerier) InsertWorkspaceAgentStat(ctx context.Context, arg InsertWor &i.SessionCountJetBrains, &i.SessionCountReconnectingPTY, &i.SessionCountSSH, + &i.StartupScriptNs, + &i.StartupScriptSuccess, ) return i, err } @@ -9129,7 +9147,9 @@ INSERT INTO session_count_jetbrains, session_count_reconnecting_pty, session_count_ssh, - connection_median_latency_ms + connection_median_latency_ms, + startup_script_ns, + startup_script_success ) SELECT unnest($1 :: uuid[]) AS id, @@ -9148,7 +9168,9 @@ SELECT unnest($14 :: bigint[]) AS session_count_jetbrains, unnest($15 :: bigint[]) AS session_count_reconnecting_pty, unnest($16 :: bigint[]) AS session_count_ssh, - unnest($17 :: double precision[]) AS connection_median_latency_ms + unnest($17 :: double precision[]) AS connection_median_latency_ms, + unnest($18 :: bigint[]) AS startup_script_ns, + unnest($19 :: bool[]) AS startup_script_success ` type InsertWorkspaceAgentStatsParams struct { @@ -9169,6 +9191,8 @@ type InsertWorkspaceAgentStatsParams struct { SessionCountReconnectingPTY []int64 `db:"session_count_reconnecting_pty" json:"session_count_reconnecting_pty"` SessionCountSSH []int64 `db:"session_count_ssh" json:"session_count_ssh"` ConnectionMedianLatencyMS []float64 `db:"connection_median_latency_ms" json:"connection_median_latency_ms"` + StartupScriptNs []int64 `db:"startup_script_ns" json:"startup_script_ns"` + StartupScriptSuccess []bool `db:"startup_script_success" json:"startup_script_success"` } func (q *sqlQuerier) InsertWorkspaceAgentStats(ctx context.Context, arg InsertWorkspaceAgentStatsParams) error { @@ -9190,6 +9214,8 @@ func (q *sqlQuerier) InsertWorkspaceAgentStats(ctx context.Context, arg InsertWo pq.Array(arg.SessionCountReconnectingPTY), pq.Array(arg.SessionCountSSH), pq.Array(arg.ConnectionMedianLatencyMS), + pq.Array(arg.StartupScriptNs), + pq.Array(arg.StartupScriptSuccess), ) return err } diff --git a/coderd/database/queries/workspaceagentstats.sql b/coderd/database/queries/workspaceagentstats.sql index 31263699ce79a..1fed96145ad1f 100644 --- a/coderd/database/queries/workspaceagentstats.sql +++ b/coderd/database/queries/workspaceagentstats.sql @@ -41,7 +41,9 @@ INSERT INTO session_count_jetbrains, session_count_reconnecting_pty, session_count_ssh, - connection_median_latency_ms + connection_median_latency_ms, + startup_script_ns, + startup_script_success ) SELECT unnest(@id :: uuid[]) AS id, @@ -60,7 +62,10 @@ SELECT unnest(@session_count_jetbrains :: bigint[]) AS session_count_jetbrains, unnest(@session_count_reconnecting_pty :: bigint[]) AS session_count_reconnecting_pty, unnest(@session_count_ssh :: bigint[]) AS session_count_ssh, - unnest(@connection_median_latency_ms :: double precision[]) AS connection_median_latency_ms; + unnest(@connection_median_latency_ms :: double precision[]) AS connection_median_latency_ms, + unnest(@startup_script_ns :: bigint[]) AS startup_script_ns, + unnest(@startup_script_success :: bool[]) AS startup_script_success +; -- name: GetTemplateDAUs :many SELECT @@ -163,7 +168,10 @@ WITH agent_stats AS ( coalesce(SUM(session_count_jetbrains), 0)::bigint AS session_count_jetbrains, coalesce(SUM(session_count_reconnecting_pty), 0)::bigint AS session_count_reconnecting_pty, coalesce(SUM(connection_count), 0)::bigint AS connection_count, - coalesce(MAX(connection_median_latency_ms), 0)::float AS connection_median_latency_ms + coalesce(MAX(connection_median_latency_ms), 0)::float AS connection_median_latency_ms, + -- TODO: Figure this out + coalesce(MAX(startup_script_ns), 0)::float AS startup_script_ns, + coalesce(MAX(startup_script_success), false)::float AS startup_script_success FROM ( SELECT *, ROW_NUMBER() OVER(PARTITION BY agent_id ORDER BY created_at DESC) AS rn FROM workspace_agent_stats @@ -171,13 +179,13 @@ WITH agent_stats AS ( WHERE created_at > $1 AND connection_median_latency_ms > 0 ) AS a WHERE a.rn = 1 - GROUP BY a.user_id, a.agent_id, a.workspace_id, a.template_id + GROUP BY a.user_id, a.agent_id, a.workspace_id ) SELECT users.username, workspace_agents.name AS agent_name, workspaces.name AS workspace_name, workspaces.template_id AS template_id, rx_bytes, tx_bytes, session_count_vscode, session_count_ssh, session_count_jetbrains, session_count_reconnecting_pty, - connection_count, connection_median_latency_ms + connection_count, connection_median_latency_ms, startup_script_ns, startup_script_success, templates.name AS template_name FROM agent_stats JOIN diff --git a/coderd/prometheusmetrics/collector.go b/coderd/prometheusmetrics/collector.go index 45eb479640970..31ab398004b42 100644 --- a/coderd/prometheusmetrics/collector.go +++ b/coderd/prometheusmetrics/collector.go @@ -6,9 +6,20 @@ import ( "github.com/prometheus/client_golang/prometheus" ) -// CachedGaugeVec is a wrapper for the prometheus.GaugeVec which allows +type cachableMetric interface { + prometheus.Collector + Reset() + + // Process commits the staged changes to the metric. No error can be returned, + // just do best effort to process the records. + Process(records []vectorRecord) +} + +var _ prometheus.Collector = new(CachedMetric) + +// CachedMetric is a wrapper for the prometheus.MetricVec which allows // for staging changes in the metrics vector. Calling "WithLabelValues(...)" -// will update the internal gauge value, but it will not be returned by +// will update the internal metric value, but it will not be returned by // "Collect(...)" until the "Commit()" method is called. The "Commit()" method // resets the internal gauge and applies all staged changes to it. // @@ -16,46 +27,31 @@ import ( // that the Prometheus collector receives incomplete metrics, collected // in the middle of metrics recalculation, between "Reset()" and the last // "WithLabelValues()" call. -type CachedGaugeVec struct { +type CachedMetric struct { m sync.Mutex - gaugeVec *prometheus.GaugeVec - records []vectorRecord -} - -var _ prometheus.Collector = new(CachedGaugeVec) - -type VectorOperation int - -const ( - VectorOperationAdd VectorOperation = iota - VectorOperationSet -) - -type vectorRecord struct { - operation VectorOperation - value float64 - labelValues []string + metric cachableMetric + records []vectorRecord } -func NewCachedGaugeVec(gaugeVec *prometheus.GaugeVec) *CachedGaugeVec { - return &CachedGaugeVec{ - gaugeVec: gaugeVec, +func newCachedMetric(metric cachableMetric) *CachedMetric { + return &CachedMetric{ + metric: metric, } } -func (v *CachedGaugeVec) Describe(desc chan<- *prometheus.Desc) { - v.gaugeVec.Describe(desc) +func (v *CachedMetric) Describe(desc chan<- *prometheus.Desc) { + v.metric.Describe(desc) } -func (v *CachedGaugeVec) Collect(ch chan<- prometheus.Metric) { +func (v *CachedMetric) Collect(ch chan<- prometheus.Metric) { v.m.Lock() defer v.m.Unlock() - v.gaugeVec.Collect(ch) + v.metric.Collect(ch) } -func (v *CachedGaugeVec) WithLabelValues(operation VectorOperation, value float64, labelValues ...string) { +func (v *CachedMetric) WithLabelValues(operation VectorOperation, value float64, labelValues ...string) { switch operation { case VectorOperationAdd, VectorOperationSet: default: @@ -75,20 +71,76 @@ func (v *CachedGaugeVec) WithLabelValues(operation VectorOperation, value float6 // Commit will set the internal value as the cached value to return from "Collect()". // The internal metric value is completely reset, so the caller should expect // the gauge to be empty for the next 'WithLabelValues' values. -func (v *CachedGaugeVec) Commit() { +func (v *CachedMetric) Commit() { v.m.Lock() defer v.m.Unlock() - v.gaugeVec.Reset() - for _, record := range v.records { - g := v.gaugeVec.WithLabelValues(record.labelValues...) + v.metric.Reset() + v.metric.Process(v.records) + + v.records = nil +} + +type CachedHistogramVec struct { +} + +// CachedGaugeVec is a gauge instance of a cached metric. +type cachedGaugeVec struct { + *prometheus.GaugeVec +} + +func NewCachedGaugeVec(gaugeVec *prometheus.GaugeVec) *CachedMetric { + return newCachedMetric(&cachedGaugeVec{ + GaugeVec: gaugeVec, + }) +} + +func (v *cachedGaugeVec) Process(records []vectorRecord) { + for _, record := range records { + g := v.GaugeVec.WithLabelValues(record.labelValues...) switch record.operation { case VectorOperationAdd: g.Add(record.value) case VectorOperationSet: g.Set(record.value) + default: + // ignore unsupported vectors. } } +} - v.records = nil +type cachedHistogramVec struct { + *prometheus.HistogramVec +} + +func NewCachedHistogramVec(gaugeVec *prometheus.HistogramVec) *CachedMetric { + return newCachedMetric(&cachedHistogramVec{ + HistogramVec: gaugeVec, + }) +} + +func (v *cachedHistogramVec) Process(records []vectorRecord) { + for _, record := range records { + g := v.HistogramVec.WithLabelValues(record.labelValues...) + switch record.operation { + case VectorOperationObserve: + g.Observe(record.value) + default: + // ignore unsupported vectors. + } + } +} + +type VectorOperation int + +const ( + VectorOperationAdd VectorOperation = iota + VectorOperationSet + VectorOperationObserve +) + +type vectorRecord struct { + operation VectorOperation + value float64 + labelValues []string } diff --git a/coderd/prometheusmetrics/prometheusmetrics.go b/coderd/prometheusmetrics/prometheusmetrics.go index 7145c2afa3b39..a2f97c0d2b36d 100644 --- a/coderd/prometheusmetrics/prometheusmetrics.go +++ b/coderd/prometheusmetrics/prometheusmetrics.go @@ -17,6 +17,7 @@ import ( "tailscale.com/tailcfg" "cdr.dev/slog" + "github.com/coder/coder/v2/coderd/database" "github.com/coder/coder/v2/coderd/database/dbauthz" "github.com/coder/coder/v2/coderd/database/dbtime" @@ -24,6 +25,7 @@ import ( ) const ( + templateIDLabel = "template_id" agentNameLabel = "agent_name" usernameLabel = "username" workspaceNameLabel = "workspace_name" @@ -438,6 +440,17 @@ func AgentStats(ctx context.Context, logger slog.Logger, registerer prometheus.R return nil, err } + agentStartupScriptNs := NewCachedGaugeVec(prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: "coderd", + Subsystem: "agentstats", + Name: "startup_script_ns", + Help: "Amount of time taken to run the startup script in nanoseconds", + }, []string{agentNameLabel, usernameLabel, workspaceNameLabel})) + err = registerer.Register(agentStatsSessionCountVSCodeGauge) + if err != nil { + return nil, err + } + ctx, cancelFunc := context.WithCancel(ctx) done := make(chan struct{}) @@ -474,6 +487,8 @@ func AgentStats(ctx context.Context, logger slog.Logger, registerer prometheus.R agentStatsSessionCountReconnectingPTYGauge.WithLabelValues(VectorOperationSet, float64(agentStat.SessionCountReconnectingPTY), agentStat.AgentName, agentStat.Username, agentStat.WorkspaceName) agentStatsSessionCountSSHGauge.WithLabelValues(VectorOperationSet, float64(agentStat.SessionCountSSH), agentStat.AgentName, agentStat.Username, agentStat.WorkspaceName) agentStatsSessionCountVSCodeGauge.WithLabelValues(VectorOperationSet, float64(agentStat.SessionCountVSCode), agentStat.AgentName, agentStat.Username, agentStat.WorkspaceName) + + agentStartupScriptNs.WithLabelValues(VectorOperationObserve, float64(agentStat.StartupScriptNs), agentStat.AgentName, agentStat.Username, agentStat.WorkspaceName) } if len(stats) > 0 { From 07fe74a91840c7b76909196e08d3320b18014198 Mon Sep 17 00:00:00 2001 From: Steven Masley Date: Mon, 11 Dec 2023 16:44:58 -0600 Subject: [PATCH 04/19] make metric a normal prom metric exported, rather than a first class stat --- agent/agent.go | 15 +-- agent/metrics.go | 27 ++-- coderd/batchstats/batcher.go | 2 - coderd/database/dump.sql | 8 +- ...000176_agent_startup_script_stats.down.sql | 2 - .../000176_agent_startup_script_stats.up.sql | 5 - coderd/database/models.go | 4 - coderd/database/queries.sql.go | 66 +++------- .../database/queries/workspaceagentstats.sql | 25 +--- coderd/prometheusmetrics/collector.go | 118 +++++------------- coderd/prometheusmetrics/prometheusmetrics.go | 17 +-- codersdk/agentsdk/agentsdk.go | 8 -- 12 files changed, 82 insertions(+), 215 deletions(-) delete mode 100644 coderd/database/migrations/000176_agent_startup_script_stats.down.sql delete mode 100644 coderd/database/migrations/000176_agent_startup_script_stats.up.sql diff --git a/agent/agent.go b/agent/agent.go index 536a7336f92bc..0cfa3124c3369 100644 --- a/agent/agent.go +++ b/agent/agent.go @@ -226,7 +226,6 @@ type agent struct { // metrics are prometheus registered metrics that will be collected and // labeled in Coder with the agent + workspace. metrics *agentMetrics - stats agentStats syscaller agentproc.Syscaller // modifiedProcs is used for testing process priority management. @@ -767,15 +766,18 @@ func (a *agent) run(ctx context.Context) error { a.setLifecycle(ctx, codersdk.WorkspaceAgentLifecycleReady) } - dur := time.Since(start).Nanoseconds() + dur := time.Since(start).Seconds() // If something really look 0 ns, just set it to 1 to indicate that it ran. // Otherwise, 0 looks like the startup script has not run yet. I don't think // this will ever be 1ns if dur == 0 { dur = 1 } - a.stats.startScriptNs.Store(dur) - a.stats.startScriptSuccess.Store(err == nil) + label := "false" + if err == nil { + label = "true" + } + a.metrics.startScriptNs.WithLabelValues(label).Set(float64(dur)) a.scriptRunner.StartCron() }) if err != nil { @@ -1203,11 +1205,6 @@ func (a *agent) startReportingConnectionStats(ctx context.Context) { stats.TxPackets += int64(counts.TxPackets) } - // Load the latest startup script stats. These stats are static - // once the agent has started. - stats.StartupScriptNs = a.stats.startScriptNs.Load() - stats.StartupScriptSuccess = a.stats.startScriptSuccess.Load() - // The count of active sessions. sshStats := a.sshServer.ConnStats() stats.SessionCountSSH = sshStats.Sessions diff --git a/agent/metrics.go b/agent/metrics.go index 5bb8718bdf749..995bd41e897be 100644 --- a/agent/metrics.go +++ b/agent/metrics.go @@ -7,7 +7,6 @@ import ( "github.com/prometheus/client_golang/prometheus" prompb "github.com/prometheus/client_model/go" - "go.uber.org/atomic" "tailscale.com/util/clientmetric" "cdr.dev/slog" @@ -15,22 +14,15 @@ import ( "github.com/coder/coder/v2/codersdk/agentsdk" ) -// agentStats unlike agentMetrics, are not prometheus metrics. Prometheus' metrics -// are sent to Coder as generic "metrics" that get labeled and reported for each -// workspace. agentStats are sent to Coder as first-class metrics that Coder decides -// how to aggregate and report. -type agentStats struct { - // startScriptNs is the time in nanoseconds that the start script(s) - // took to run. This is reported once per agent, and is collected into a - // histogram by Coder. - startScriptNs atomic.Int64 - // startScriptSuccess should be ignored if startScriptReadyMs is 0. - startScriptSuccess atomic.Bool -} + type agentMetrics struct { connectionsTotal prometheus.Counter reconnectingPTYErrors *prometheus.CounterVec + // startScriptNs is the time in nanoseconds that the start script(s) + // took to run. This is reported once per agent, and is collected into a + // histogram by Coder. + startScriptNs *prometheus.GaugeVec } func newAgentMetrics(registerer prometheus.Registerer) *agentMetrics { @@ -49,9 +41,18 @@ func newAgentMetrics(registerer prometheus.Registerer) *agentMetrics { ) registerer.MustRegister(reconnectingPTYErrors) + startScriptNs := prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: "coderd", + Subsystem: "agentstats", + Name: "startup_script_s", + Help: "Amount of time taken to run the startup script in seconds.", + }, []string{"success"}) + registerer.MustRegister(startScriptNs) + return &agentMetrics{ connectionsTotal: connectionsTotal, reconnectingPTYErrors: reconnectingPTYErrors, + startScriptNs: startScriptNs, } } diff --git a/coderd/batchstats/batcher.go b/coderd/batchstats/batcher.go index 0bafede3cd9ae..654b648b28584 100644 --- a/coderd/batchstats/batcher.go +++ b/coderd/batchstats/batcher.go @@ -162,8 +162,6 @@ func (b *Batcher) Add( b.buf.SessionCountReconnectingPTY = append(b.buf.SessionCountReconnectingPTY, st.SessionCountReconnectingPTY) b.buf.SessionCountSSH = append(b.buf.SessionCountSSH, st.SessionCountSSH) b.buf.ConnectionMedianLatencyMS = append(b.buf.ConnectionMedianLatencyMS, st.ConnectionMedianLatencyMS) - b.buf.StartupScriptNs = append(b.buf.StartupScriptNs, st.StartupScriptNs) - b.buf.StartupScriptSuccess = append(b.buf.StartupScriptSuccess, st.StartupScriptSuccess) // If the buffer is over 80% full, signal the flusher to flush immediately. // We want to trigger flushes early to reduce the likelihood of diff --git a/coderd/database/dump.sql b/coderd/database/dump.sql index 7f07f9adb9f84..a8b16269389d3 100644 --- a/coderd/database/dump.sql +++ b/coderd/database/dump.sql @@ -954,15 +954,9 @@ CREATE TABLE workspace_agent_stats ( session_count_vscode bigint DEFAULT 0 NOT NULL, session_count_jetbrains bigint DEFAULT 0 NOT NULL, session_count_reconnecting_pty bigint DEFAULT 0 NOT NULL, - session_count_ssh bigint DEFAULT 0 NOT NULL, - startup_script_ns bigint DEFAULT 0 NOT NULL, - startup_script_success boolean DEFAULT false NOT NULL + session_count_ssh bigint DEFAULT 0 NOT NULL ); -COMMENT ON COLUMN workspace_agent_stats.startup_script_ns IS 'The time it took to run the startup script in nanoseconds. If set to 0, the startup script was not run.'; - -COMMENT ON COLUMN workspace_agent_stats.startup_script_success IS 'Whether the startup script was run successfully. Will be false if the duration is 0, but the script has not been run.'; - CREATE TABLE workspace_agents ( id uuid NOT NULL, created_at timestamp with time zone NOT NULL, diff --git a/coderd/database/migrations/000176_agent_startup_script_stats.down.sql b/coderd/database/migrations/000176_agent_startup_script_stats.down.sql deleted file mode 100644 index 558efb25cf707..0000000000000 --- a/coderd/database/migrations/000176_agent_startup_script_stats.down.sql +++ /dev/null @@ -1,2 +0,0 @@ -ALTER TABLE workspace_agent_stats DROP COLUMN startup_script_ns; -ALTER TABLE workspace_agent_stats DROP COLUMN startup_script_success; diff --git a/coderd/database/migrations/000176_agent_startup_script_stats.up.sql b/coderd/database/migrations/000176_agent_startup_script_stats.up.sql deleted file mode 100644 index 29faa9607008d..0000000000000 --- a/coderd/database/migrations/000176_agent_startup_script_stats.up.sql +++ /dev/null @@ -1,5 +0,0 @@ -ALTER TABLE workspace_agent_stats ADD COLUMN startup_script_ns BIGINT NOT NULL DEFAULT 0; -ALTER TABLE workspace_agent_stats ADD COLUMN startup_script_success BOOL NOT NULL DEFAULT false; - -COMMENT ON COLUMN workspace_agent_stats.startup_script_ns IS 'The time it took to run the startup script in nanoseconds. If set to 0, the startup script was not run.'; -COMMENT ON COLUMN workspace_agent_stats.startup_script_success IS 'Whether the startup script was run successfully. Will be false if the duration is 0, but the script has not been run.'; diff --git a/coderd/database/models.go b/coderd/database/models.go index 229160e7f881e..554d08c70e990 100644 --- a/coderd/database/models.go +++ b/coderd/database/models.go @@ -2265,10 +2265,6 @@ type WorkspaceAgentStat struct { SessionCountJetBrains int64 `db:"session_count_jetbrains" json:"session_count_jetbrains"` SessionCountReconnectingPTY int64 `db:"session_count_reconnecting_pty" json:"session_count_reconnecting_pty"` SessionCountSSH int64 `db:"session_count_ssh" json:"session_count_ssh"` - // The time it took to run the startup script in nanoseconds. If set to 0, the startup script was not run. - StartupScriptNs int64 `db:"startup_script_ns" json:"startup_script_ns"` - // Whether the startup script was run successfully. Will be false if the duration is 0, but the script has not been run. - StartupScriptSuccess bool `db:"startup_script_success" json:"startup_script_success"` } type WorkspaceApp struct { diff --git a/coderd/database/queries.sql.go b/coderd/database/queries.sql.go index 23cf04f2e020a..a10e194cf9cfb 100644 --- a/coderd/database/queries.sql.go +++ b/coderd/database/queries.sql.go @@ -8753,7 +8753,7 @@ WITH agent_stats AS ( coalesce(SUM(session_count_jetbrains), 0)::bigint AS session_count_jetbrains, coalesce(SUM(session_count_reconnecting_pty), 0)::bigint AS session_count_reconnecting_pty FROM ( - SELECT id, created_at, user_id, agent_id, workspace_id, template_id, connections_by_proto, connection_count, rx_packets, rx_bytes, tx_packets, tx_bytes, connection_median_latency_ms, session_count_vscode, session_count_jetbrains, session_count_reconnecting_pty, session_count_ssh, startup_script_ns, startup_script_success, ROW_NUMBER() OVER(PARTITION BY agent_id ORDER BY created_at DESC) AS rn + SELECT id, created_at, user_id, agent_id, workspace_id, template_id, connections_by_proto, connection_count, rx_packets, rx_bytes, tx_packets, tx_bytes, connection_median_latency_ms, session_count_vscode, session_count_jetbrains, session_count_reconnecting_pty, session_count_ssh, ROW_NUMBER() OVER(PARTITION BY agent_id ORDER BY created_at DESC) AS rn FROM workspace_agent_stats WHERE created_at > $1 ) AS a WHERE a.rn = 1 ) @@ -8858,7 +8858,7 @@ WITH agent_stats AS ( coalesce(SUM(session_count_jetbrains), 0)::bigint AS session_count_jetbrains, coalesce(SUM(session_count_reconnecting_pty), 0)::bigint AS session_count_reconnecting_pty FROM ( - SELECT id, created_at, user_id, agent_id, workspace_id, template_id, connections_by_proto, connection_count, rx_packets, rx_bytes, tx_packets, tx_bytes, connection_median_latency_ms, session_count_vscode, session_count_jetbrains, session_count_reconnecting_pty, session_count_ssh, startup_script_ns, startup_script_success, ROW_NUMBER() OVER(PARTITION BY agent_id ORDER BY created_at DESC) AS rn + SELECT id, created_at, user_id, agent_id, workspace_id, template_id, connections_by_proto, connection_count, rx_packets, rx_bytes, tx_packets, tx_bytes, connection_median_latency_ms, session_count_vscode, session_count_jetbrains, session_count_reconnecting_pty, session_count_ssh, ROW_NUMBER() OVER(PARTITION BY agent_id ORDER BY created_at DESC) AS rn FROM workspace_agent_stats WHERE created_at > $1 ) AS a WHERE a.rn = 1 GROUP BY a.user_id, a.agent_id, a.workspace_id, a.template_id ) @@ -8939,12 +8939,9 @@ WITH agent_stats AS ( coalesce(SUM(session_count_jetbrains), 0)::bigint AS session_count_jetbrains, coalesce(SUM(session_count_reconnecting_pty), 0)::bigint AS session_count_reconnecting_pty, coalesce(SUM(connection_count), 0)::bigint AS connection_count, - coalesce(MAX(connection_median_latency_ms), 0)::float AS connection_median_latency_ms, - -- TODO: Figure this out - coalesce(MAX(startup_script_ns), 0)::float AS startup_script_ns, - coalesce(MAX(startup_script_success), false)::float AS startup_script_success + coalesce(MAX(connection_median_latency_ms), 0)::float AS connection_median_latency_ms FROM ( - SELECT id, created_at, user_id, agent_id, workspace_id, template_id, connections_by_proto, connection_count, rx_packets, rx_bytes, tx_packets, tx_bytes, connection_median_latency_ms, session_count_vscode, session_count_jetbrains, session_count_reconnecting_pty, session_count_ssh, startup_script_ns, startup_script_success, ROW_NUMBER() OVER(PARTITION BY agent_id ORDER BY created_at DESC) AS rn + SELECT id, created_at, user_id, agent_id, workspace_id, template_id, connections_by_proto, connection_count, rx_packets, rx_bytes, tx_packets, tx_bytes, connection_median_latency_ms, session_count_vscode, session_count_jetbrains, session_count_reconnecting_pty, session_count_ssh, ROW_NUMBER() OVER(PARTITION BY agent_id ORDER BY created_at DESC) AS rn FROM workspace_agent_stats -- The greater than 0 is to support legacy agents that don't report connection_median_latency_ms. WHERE created_at > $1 AND connection_median_latency_ms > 0 @@ -8953,10 +8950,9 @@ WITH agent_stats AS ( GROUP BY a.user_id, a.agent_id, a.workspace_id ) SELECT - users.username, workspace_agents.name AS agent_name, workspaces.name AS workspace_name, - workspaces.template_id AS template_id, rx_bytes, tx_bytes, + users.username, workspace_agents.name AS agent_name, workspaces.name AS workspace_name, rx_bytes, tx_bytes, session_count_vscode, session_count_ssh, session_count_jetbrains, session_count_reconnecting_pty, - connection_count, connection_median_latency_ms, startup_script_ns, startup_script_success, templates.name AS template_name + connection_count, connection_median_latency_ms FROM agent_stats JOIN @@ -8975,28 +8971,20 @@ JOIN workspaces ON workspaces.id = agent_stats.workspace_id -JOIN - templates -ON - templates.id = workspaces.template_id ` type GetWorkspaceAgentStatsAndLabelsRow struct { - Username string `db:"username" json:"username"` - AgentName string `db:"agent_name" json:"agent_name"` - WorkspaceName string `db:"workspace_name" json:"workspace_name"` - TemplateID uuid.UUID `db:"template_id" json:"template_id"` - RxBytes int64 `db:"rx_bytes" json:"rx_bytes"` - TxBytes int64 `db:"tx_bytes" json:"tx_bytes"` - SessionCountVSCode int64 `db:"session_count_vscode" json:"session_count_vscode"` - SessionCountSSH int64 `db:"session_count_ssh" json:"session_count_ssh"` - SessionCountJetBrains int64 `db:"session_count_jetbrains" json:"session_count_jetbrains"` - SessionCountReconnectingPTY int64 `db:"session_count_reconnecting_pty" json:"session_count_reconnecting_pty"` - ConnectionCount int64 `db:"connection_count" json:"connection_count"` - ConnectionMedianLatencyMS float64 `db:"connection_median_latency_ms" json:"connection_median_latency_ms"` - StartupScriptNs float64 `db:"startup_script_ns" json:"startup_script_ns"` - StartupScriptSuccess float64 `db:"startup_script_success" json:"startup_script_success"` - TemplateName string `db:"template_name" json:"template_name"` + Username string `db:"username" json:"username"` + AgentName string `db:"agent_name" json:"agent_name"` + WorkspaceName string `db:"workspace_name" json:"workspace_name"` + RxBytes int64 `db:"rx_bytes" json:"rx_bytes"` + TxBytes int64 `db:"tx_bytes" json:"tx_bytes"` + SessionCountVSCode int64 `db:"session_count_vscode" json:"session_count_vscode"` + SessionCountSSH int64 `db:"session_count_ssh" json:"session_count_ssh"` + SessionCountJetBrains int64 `db:"session_count_jetbrains" json:"session_count_jetbrains"` + SessionCountReconnectingPTY int64 `db:"session_count_reconnecting_pty" json:"session_count_reconnecting_pty"` + ConnectionCount int64 `db:"connection_count" json:"connection_count"` + ConnectionMedianLatencyMS float64 `db:"connection_median_latency_ms" json:"connection_median_latency_ms"` } func (q *sqlQuerier) GetWorkspaceAgentStatsAndLabels(ctx context.Context, createdAt time.Time) ([]GetWorkspaceAgentStatsAndLabelsRow, error) { @@ -9012,7 +9000,6 @@ func (q *sqlQuerier) GetWorkspaceAgentStatsAndLabels(ctx context.Context, create &i.Username, &i.AgentName, &i.WorkspaceName, - &i.TemplateID, &i.RxBytes, &i.TxBytes, &i.SessionCountVSCode, @@ -9021,9 +9008,6 @@ func (q *sqlQuerier) GetWorkspaceAgentStatsAndLabels(ctx context.Context, create &i.SessionCountReconnectingPTY, &i.ConnectionCount, &i.ConnectionMedianLatencyMS, - &i.StartupScriptNs, - &i.StartupScriptSuccess, - &i.TemplateName, ); err != nil { return nil, err } @@ -9060,7 +9044,7 @@ INSERT INTO connection_median_latency_ms ) VALUES - ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17) RETURNING id, created_at, user_id, agent_id, workspace_id, template_id, connections_by_proto, connection_count, rx_packets, rx_bytes, tx_packets, tx_bytes, connection_median_latency_ms, session_count_vscode, session_count_jetbrains, session_count_reconnecting_pty, session_count_ssh, startup_script_ns, startup_script_success + ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17) RETURNING id, created_at, user_id, agent_id, workspace_id, template_id, connections_by_proto, connection_count, rx_packets, rx_bytes, tx_packets, tx_bytes, connection_median_latency_ms, session_count_vscode, session_count_jetbrains, session_count_reconnecting_pty, session_count_ssh ` type InsertWorkspaceAgentStatParams struct { @@ -9122,8 +9106,6 @@ func (q *sqlQuerier) InsertWorkspaceAgentStat(ctx context.Context, arg InsertWor &i.SessionCountJetBrains, &i.SessionCountReconnectingPTY, &i.SessionCountSSH, - &i.StartupScriptNs, - &i.StartupScriptSuccess, ) return i, err } @@ -9147,9 +9129,7 @@ INSERT INTO session_count_jetbrains, session_count_reconnecting_pty, session_count_ssh, - connection_median_latency_ms, - startup_script_ns, - startup_script_success + connection_median_latency_ms ) SELECT unnest($1 :: uuid[]) AS id, @@ -9168,9 +9148,7 @@ SELECT unnest($14 :: bigint[]) AS session_count_jetbrains, unnest($15 :: bigint[]) AS session_count_reconnecting_pty, unnest($16 :: bigint[]) AS session_count_ssh, - unnest($17 :: double precision[]) AS connection_median_latency_ms, - unnest($18 :: bigint[]) AS startup_script_ns, - unnest($19 :: bool[]) AS startup_script_success + unnest($17 :: double precision[]) AS connection_median_latency_ms ` type InsertWorkspaceAgentStatsParams struct { @@ -9191,8 +9169,6 @@ type InsertWorkspaceAgentStatsParams struct { SessionCountReconnectingPTY []int64 `db:"session_count_reconnecting_pty" json:"session_count_reconnecting_pty"` SessionCountSSH []int64 `db:"session_count_ssh" json:"session_count_ssh"` ConnectionMedianLatencyMS []float64 `db:"connection_median_latency_ms" json:"connection_median_latency_ms"` - StartupScriptNs []int64 `db:"startup_script_ns" json:"startup_script_ns"` - StartupScriptSuccess []bool `db:"startup_script_success" json:"startup_script_success"` } func (q *sqlQuerier) InsertWorkspaceAgentStats(ctx context.Context, arg InsertWorkspaceAgentStatsParams) error { @@ -9214,8 +9190,6 @@ func (q *sqlQuerier) InsertWorkspaceAgentStats(ctx context.Context, arg InsertWo pq.Array(arg.SessionCountReconnectingPTY), pq.Array(arg.SessionCountSSH), pq.Array(arg.ConnectionMedianLatencyMS), - pq.Array(arg.StartupScriptNs), - pq.Array(arg.StartupScriptSuccess), ) return err } diff --git a/coderd/database/queries/workspaceagentstats.sql b/coderd/database/queries/workspaceagentstats.sql index 1fed96145ad1f..cf059121dec77 100644 --- a/coderd/database/queries/workspaceagentstats.sql +++ b/coderd/database/queries/workspaceagentstats.sql @@ -41,9 +41,7 @@ INSERT INTO session_count_jetbrains, session_count_reconnecting_pty, session_count_ssh, - connection_median_latency_ms, - startup_script_ns, - startup_script_success + connection_median_latency_ms ) SELECT unnest(@id :: uuid[]) AS id, @@ -62,10 +60,7 @@ SELECT unnest(@session_count_jetbrains :: bigint[]) AS session_count_jetbrains, unnest(@session_count_reconnecting_pty :: bigint[]) AS session_count_reconnecting_pty, unnest(@session_count_ssh :: bigint[]) AS session_count_ssh, - unnest(@connection_median_latency_ms :: double precision[]) AS connection_median_latency_ms, - unnest(@startup_script_ns :: bigint[]) AS startup_script_ns, - unnest(@startup_script_success :: bool[]) AS startup_script_success -; + unnest(@connection_median_latency_ms :: double precision[]) AS connection_median_latency_ms; -- name: GetTemplateDAUs :many SELECT @@ -168,10 +163,7 @@ WITH agent_stats AS ( coalesce(SUM(session_count_jetbrains), 0)::bigint AS session_count_jetbrains, coalesce(SUM(session_count_reconnecting_pty), 0)::bigint AS session_count_reconnecting_pty, coalesce(SUM(connection_count), 0)::bigint AS connection_count, - coalesce(MAX(connection_median_latency_ms), 0)::float AS connection_median_latency_ms, - -- TODO: Figure this out - coalesce(MAX(startup_script_ns), 0)::float AS startup_script_ns, - coalesce(MAX(startup_script_success), false)::float AS startup_script_success + coalesce(MAX(connection_median_latency_ms), 0)::float AS connection_median_latency_ms FROM ( SELECT *, ROW_NUMBER() OVER(PARTITION BY agent_id ORDER BY created_at DESC) AS rn FROM workspace_agent_stats @@ -182,10 +174,9 @@ WITH agent_stats AS ( GROUP BY a.user_id, a.agent_id, a.workspace_id ) SELECT - users.username, workspace_agents.name AS agent_name, workspaces.name AS workspace_name, - workspaces.template_id AS template_id, rx_bytes, tx_bytes, + users.username, workspace_agents.name AS agent_name, workspaces.name AS workspace_name, rx_bytes, tx_bytes, session_count_vscode, session_count_ssh, session_count_jetbrains, session_count_reconnecting_pty, - connection_count, connection_median_latency_ms, startup_script_ns, startup_script_success, templates.name AS template_name + connection_count, connection_median_latency_ms FROM agent_stats JOIN @@ -203,8 +194,4 @@ ON JOIN workspaces ON - workspaces.id = agent_stats.workspace_id -JOIN - templates -ON - templates.id = workspaces.template_id; + workspaces.id = agent_stats.workspace_id; diff --git a/coderd/prometheusmetrics/collector.go b/coderd/prometheusmetrics/collector.go index 31ab398004b42..45eb479640970 100644 --- a/coderd/prometheusmetrics/collector.go +++ b/coderd/prometheusmetrics/collector.go @@ -6,20 +6,9 @@ import ( "github.com/prometheus/client_golang/prometheus" ) -type cachableMetric interface { - prometheus.Collector - Reset() - - // Process commits the staged changes to the metric. No error can be returned, - // just do best effort to process the records. - Process(records []vectorRecord) -} - -var _ prometheus.Collector = new(CachedMetric) - -// CachedMetric is a wrapper for the prometheus.MetricVec which allows +// CachedGaugeVec is a wrapper for the prometheus.GaugeVec which allows // for staging changes in the metrics vector. Calling "WithLabelValues(...)" -// will update the internal metric value, but it will not be returned by +// will update the internal gauge value, but it will not be returned by // "Collect(...)" until the "Commit()" method is called. The "Commit()" method // resets the internal gauge and applies all staged changes to it. // @@ -27,31 +16,46 @@ var _ prometheus.Collector = new(CachedMetric) // that the Prometheus collector receives incomplete metrics, collected // in the middle of metrics recalculation, between "Reset()" and the last // "WithLabelValues()" call. -type CachedMetric struct { +type CachedGaugeVec struct { m sync.Mutex - metric cachableMetric - records []vectorRecord + gaugeVec *prometheus.GaugeVec + records []vectorRecord +} + +var _ prometheus.Collector = new(CachedGaugeVec) + +type VectorOperation int + +const ( + VectorOperationAdd VectorOperation = iota + VectorOperationSet +) + +type vectorRecord struct { + operation VectorOperation + value float64 + labelValues []string } -func newCachedMetric(metric cachableMetric) *CachedMetric { - return &CachedMetric{ - metric: metric, +func NewCachedGaugeVec(gaugeVec *prometheus.GaugeVec) *CachedGaugeVec { + return &CachedGaugeVec{ + gaugeVec: gaugeVec, } } -func (v *CachedMetric) Describe(desc chan<- *prometheus.Desc) { - v.metric.Describe(desc) +func (v *CachedGaugeVec) Describe(desc chan<- *prometheus.Desc) { + v.gaugeVec.Describe(desc) } -func (v *CachedMetric) Collect(ch chan<- prometheus.Metric) { +func (v *CachedGaugeVec) Collect(ch chan<- prometheus.Metric) { v.m.Lock() defer v.m.Unlock() - v.metric.Collect(ch) + v.gaugeVec.Collect(ch) } -func (v *CachedMetric) WithLabelValues(operation VectorOperation, value float64, labelValues ...string) { +func (v *CachedGaugeVec) WithLabelValues(operation VectorOperation, value float64, labelValues ...string) { switch operation { case VectorOperationAdd, VectorOperationSet: default: @@ -71,76 +75,20 @@ func (v *CachedMetric) WithLabelValues(operation VectorOperation, value float64, // Commit will set the internal value as the cached value to return from "Collect()". // The internal metric value is completely reset, so the caller should expect // the gauge to be empty for the next 'WithLabelValues' values. -func (v *CachedMetric) Commit() { +func (v *CachedGaugeVec) Commit() { v.m.Lock() defer v.m.Unlock() - v.metric.Reset() - v.metric.Process(v.records) - - v.records = nil -} - -type CachedHistogramVec struct { -} - -// CachedGaugeVec is a gauge instance of a cached metric. -type cachedGaugeVec struct { - *prometheus.GaugeVec -} - -func NewCachedGaugeVec(gaugeVec *prometheus.GaugeVec) *CachedMetric { - return newCachedMetric(&cachedGaugeVec{ - GaugeVec: gaugeVec, - }) -} - -func (v *cachedGaugeVec) Process(records []vectorRecord) { - for _, record := range records { - g := v.GaugeVec.WithLabelValues(record.labelValues...) + v.gaugeVec.Reset() + for _, record := range v.records { + g := v.gaugeVec.WithLabelValues(record.labelValues...) switch record.operation { case VectorOperationAdd: g.Add(record.value) case VectorOperationSet: g.Set(record.value) - default: - // ignore unsupported vectors. - } - } -} - -type cachedHistogramVec struct { - *prometheus.HistogramVec -} - -func NewCachedHistogramVec(gaugeVec *prometheus.HistogramVec) *CachedMetric { - return newCachedMetric(&cachedHistogramVec{ - HistogramVec: gaugeVec, - }) -} - -func (v *cachedHistogramVec) Process(records []vectorRecord) { - for _, record := range records { - g := v.HistogramVec.WithLabelValues(record.labelValues...) - switch record.operation { - case VectorOperationObserve: - g.Observe(record.value) - default: - // ignore unsupported vectors. } } -} - -type VectorOperation int -const ( - VectorOperationAdd VectorOperation = iota - VectorOperationSet - VectorOperationObserve -) - -type vectorRecord struct { - operation VectorOperation - value float64 - labelValues []string + v.records = nil } diff --git a/coderd/prometheusmetrics/prometheusmetrics.go b/coderd/prometheusmetrics/prometheusmetrics.go index a2f97c0d2b36d..e1928fec5fa15 100644 --- a/coderd/prometheusmetrics/prometheusmetrics.go +++ b/coderd/prometheusmetrics/prometheusmetrics.go @@ -25,7 +25,7 @@ import ( ) const ( - templateIDLabel = "template_id" + templateNameLabel = "template_name" agentNameLabel = "agent_name" usernameLabel = "username" workspaceNameLabel = "workspace_name" @@ -156,7 +156,7 @@ func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Regis Subsystem: "agents", Name: "up", Help: "The number of active agents per workspace.", - }, []string{usernameLabel, workspaceNameLabel, "template_name", "template_version"})) + }, []string{usernameLabel, workspaceNameLabel, templateNameLabel, "template_version"})) err := registerer.Register(agentsGauge) if err != nil { return nil, err @@ -440,17 +440,6 @@ func AgentStats(ctx context.Context, logger slog.Logger, registerer prometheus.R return nil, err } - agentStartupScriptNs := NewCachedGaugeVec(prometheus.NewGaugeVec(prometheus.GaugeOpts{ - Namespace: "coderd", - Subsystem: "agentstats", - Name: "startup_script_ns", - Help: "Amount of time taken to run the startup script in nanoseconds", - }, []string{agentNameLabel, usernameLabel, workspaceNameLabel})) - err = registerer.Register(agentStatsSessionCountVSCodeGauge) - if err != nil { - return nil, err - } - ctx, cancelFunc := context.WithCancel(ctx) done := make(chan struct{}) @@ -487,8 +476,6 @@ func AgentStats(ctx context.Context, logger slog.Logger, registerer prometheus.R agentStatsSessionCountReconnectingPTYGauge.WithLabelValues(VectorOperationSet, float64(agentStat.SessionCountReconnectingPTY), agentStat.AgentName, agentStat.Username, agentStat.WorkspaceName) agentStatsSessionCountSSHGauge.WithLabelValues(VectorOperationSet, float64(agentStat.SessionCountSSH), agentStat.AgentName, agentStat.Username, agentStat.WorkspaceName) agentStatsSessionCountVSCodeGauge.WithLabelValues(VectorOperationSet, float64(agentStat.SessionCountVSCode), agentStat.AgentName, agentStat.Username, agentStat.WorkspaceName) - - agentStartupScriptNs.WithLabelValues(VectorOperationObserve, float64(agentStat.StartupScriptNs), agentStat.AgentName, agentStat.Username, agentStat.WorkspaceName) } if len(stats) > 0 { diff --git a/codersdk/agentsdk/agentsdk.go b/codersdk/agentsdk/agentsdk.go index b1c80f2b4be44..cb01a9ec3fae2 100644 --- a/codersdk/agentsdk/agentsdk.go +++ b/codersdk/agentsdk/agentsdk.go @@ -575,14 +575,6 @@ type Stats struct { // that are normal, non-tagged SSH sessions. SessionCountSSH int64 `json:"session_count_ssh"` - // StartupScriptNs is the duration in nano seconds the startup scripts - // took to execute. If there are no scripts, this still has some value > 0. - // This is because the act of "no script" still takes time to eval, and still - // has a "success" value. - StartupScriptNs int64 `json:"startup_script_ns"` - // StartupScriptSuccess is true if the startup script(s) executed successfully. - StartupScriptSuccess bool `json:"startup_script_success"` - // Metrics collected by the agent Metrics []AgentMetric `json:"metrics"` } From 753f1a658baaf58bd48015c75774e58a35cc2db2 Mon Sep 17 00:00:00 2001 From: Steven Masley Date: Mon, 11 Dec 2023 16:51:14 -0600 Subject: [PATCH 05/19] linting/fmt/naming --- agent/agent.go | 8 +------- agent/metrics.go | 15 ++++++--------- coderd/database/models.go | 2 +- coderd/database/querier.go | 2 +- coderd/database/queries.sql.go | 2 +- codersdk/agentsdk/agentsdk.go | 3 +-- 6 files changed, 11 insertions(+), 21 deletions(-) diff --git a/agent/agent.go b/agent/agent.go index 0cfa3124c3369..2df12368f9243 100644 --- a/agent/agent.go +++ b/agent/agent.go @@ -767,17 +767,11 @@ func (a *agent) run(ctx context.Context) error { } dur := time.Since(start).Seconds() - // If something really look 0 ns, just set it to 1 to indicate that it ran. - // Otherwise, 0 looks like the startup script has not run yet. I don't think - // this will ever be 1ns - if dur == 0 { - dur = 1 - } label := "false" if err == nil { label = "true" } - a.metrics.startScriptNs.WithLabelValues(label).Set(float64(dur)) + a.metrics.startScriptSeconds.WithLabelValues(label).Set(dur) a.scriptRunner.StartCron() }) if err != nil { diff --git a/agent/metrics.go b/agent/metrics.go index 995bd41e897be..4ac81f937bd7c 100644 --- a/agent/metrics.go +++ b/agent/metrics.go @@ -14,15 +14,12 @@ import ( "github.com/coder/coder/v2/codersdk/agentsdk" ) - - type agentMetrics struct { connectionsTotal prometheus.Counter reconnectingPTYErrors *prometheus.CounterVec - // startScriptNs is the time in nanoseconds that the start script(s) - // took to run. This is reported once per agent, and is collected into a - // histogram by Coder. - startScriptNs *prometheus.GaugeVec + // startScriptSeconds is the time in seconds that the start script(s) + // took to run. This is reported once per agent. + startScriptSeconds *prometheus.GaugeVec } func newAgentMetrics(registerer prometheus.Registerer) *agentMetrics { @@ -41,18 +38,18 @@ func newAgentMetrics(registerer prometheus.Registerer) *agentMetrics { ) registerer.MustRegister(reconnectingPTYErrors) - startScriptNs := prometheus.NewGaugeVec(prometheus.GaugeOpts{ + startScriptSeconds := prometheus.NewGaugeVec(prometheus.GaugeOpts{ Namespace: "coderd", Subsystem: "agentstats", Name: "startup_script_s", Help: "Amount of time taken to run the startup script in seconds.", }, []string{"success"}) - registerer.MustRegister(startScriptNs) + registerer.MustRegister(startScriptSeconds) return &agentMetrics{ connectionsTotal: connectionsTotal, reconnectingPTYErrors: reconnectingPTYErrors, - startScriptNs: startScriptNs, + startScriptSeconds: startScriptSeconds, } } diff --git a/coderd/database/models.go b/coderd/database/models.go index 554d08c70e990..19d051aba0ab1 100644 --- a/coderd/database/models.go +++ b/coderd/database/models.go @@ -1,6 +1,6 @@ // Code generated by sqlc. DO NOT EDIT. // versions: -// sqlc v1.23.0 +// sqlc v1.20.0 package database diff --git a/coderd/database/querier.go b/coderd/database/querier.go index f06e251a75bd9..ed4c57e258bef 100644 --- a/coderd/database/querier.go +++ b/coderd/database/querier.go @@ -1,6 +1,6 @@ // Code generated by sqlc. DO NOT EDIT. // versions: -// sqlc v1.23.0 +// sqlc v1.20.0 package database diff --git a/coderd/database/queries.sql.go b/coderd/database/queries.sql.go index a10e194cf9cfb..a8931e0d29a94 100644 --- a/coderd/database/queries.sql.go +++ b/coderd/database/queries.sql.go @@ -1,6 +1,6 @@ // Code generated by sqlc. DO NOT EDIT. // versions: -// sqlc v1.23.0 +// sqlc v1.20.0 package database diff --git a/codersdk/agentsdk/agentsdk.go b/codersdk/agentsdk/agentsdk.go index cb01a9ec3fae2..1ca60a09b12b7 100644 --- a/codersdk/agentsdk/agentsdk.go +++ b/codersdk/agentsdk/agentsdk.go @@ -19,9 +19,8 @@ import ( "tailscale.com/tailcfg" "cdr.dev/slog" - "github.com/coder/retry" - "github.com/coder/coder/v2/codersdk" + "github.com/coder/retry" ) // ExternalLogSourceID is the statically-defined ID of a log-source that From aafd29e11d2356e6c3fb9f4e971fdcc98b367373 Mon Sep 17 00:00:00 2001 From: Steven Masley Date: Mon, 11 Dec 2023 17:06:13 -0600 Subject: [PATCH 06/19] add template_name label to workspace agent stats --- coderd/coderd.go | 3 +- coderd/database/dbauthz/dbauthz.go | 3 +- coderd/database/dbmem/dbmem.go | 17 ++++++++-- coderd/database/dbmetrics/dbmetrics.go | 2 +- coderd/database/dbmock/dbmock.go | 4 +-- coderd/database/modelmethods.go | 4 +++ coderd/database/models.go | 2 +- coderd/database/querier.go | 4 +-- coderd/database/queries.sql.go | 45 +++++++++++++++----------- coderd/database/queries/workspaces.sql | 5 ++- coderd/prometheusmetrics/aggregator.go | 10 ++++-- coderd/workspaceagents.go | 11 ++++--- 12 files changed, 74 insertions(+), 36 deletions(-) diff --git a/coderd/coderd.go b/coderd/coderd.go index 747ac04ee8407..f751d25bd654e 100644 --- a/coderd/coderd.go +++ b/coderd/coderd.go @@ -40,6 +40,7 @@ import ( "github.com/coder/coder/v2/coderd/healthcheck/derphealth" "cdr.dev/slog" + "github.com/coder/coder/v2/buildinfo" "github.com/coder/coder/v2/cli/clibase" "github.com/coder/coder/v2/coderd/audit" @@ -168,7 +169,7 @@ type Options struct { HTTPClient *http.Client - UpdateAgentMetrics func(ctx context.Context, username, workspaceName, agentName string, metrics []agentsdk.AgentMetric) + UpdateAgentMetrics func(ctx context.Context, username, workspaceName, agentName, templateName string, metrics []agentsdk.AgentMetric) StatsBatcher *batchstats.Batcher WorkspaceAppsStatsCollectorOptions workspaceapps.StatsCollectorOptions diff --git a/coderd/database/dbauthz/dbauthz.go b/coderd/database/dbauthz/dbauthz.go index 5b186ff671202..ff27db0b27153 100644 --- a/coderd/database/dbauthz/dbauthz.go +++ b/coderd/database/dbauthz/dbauthz.go @@ -16,6 +16,7 @@ import ( "github.com/open-policy-agent/opa/topdown" "cdr.dev/slog" + "github.com/coder/coder/v2/coderd/database" "github.com/coder/coder/v2/coderd/database/dbtime" "github.com/coder/coder/v2/coderd/httpapi/httpapiconstraints" @@ -1923,7 +1924,7 @@ func (q *querier) GetWorkspaceBuildsCreatedAfter(ctx context.Context, createdAt return q.db.GetWorkspaceBuildsCreatedAfter(ctx, createdAt) } -func (q *querier) GetWorkspaceByAgentID(ctx context.Context, agentID uuid.UUID) (database.Workspace, error) { +func (q *querier) GetWorkspaceByAgentID(ctx context.Context, agentID uuid.UUID) (database.GetWorkspaceByAgentIDRow, error) { return fetch(q.log, q.auth, q.db.GetWorkspaceByAgentID)(ctx, agentID) } diff --git a/coderd/database/dbmem/dbmem.go b/coderd/database/dbmem/dbmem.go index b6536677c1140..0f8a2349f7e8c 100644 --- a/coderd/database/dbmem/dbmem.go +++ b/coderd/database/dbmem/dbmem.go @@ -4296,11 +4296,24 @@ func (q *FakeQuerier) GetWorkspaceBuildsCreatedAfter(_ context.Context, after ti return workspaceBuilds, nil } -func (q *FakeQuerier) GetWorkspaceByAgentID(ctx context.Context, agentID uuid.UUID) (database.Workspace, error) { +func (q *FakeQuerier) GetWorkspaceByAgentID(ctx context.Context, agentID uuid.UUID) (database.GetWorkspaceByAgentIDRow, error) { q.mutex.RLock() defer q.mutex.RUnlock() - return q.getWorkspaceByAgentIDNoLock(ctx, agentID) + w, err := q.getWorkspaceByAgentIDNoLock(ctx, agentID) + if err != nil { + return database.GetWorkspaceByAgentIDRow{}, err + } + + tpl, err := q.getTemplateByIDNoLock(ctx, w.TemplateID) + if err != nil { + return database.GetWorkspaceByAgentIDRow{}, err + } + + return database.GetWorkspaceByAgentIDRow{ + Workspace: w, + TemplateName: tpl.Name, + }, nil } func (q *FakeQuerier) GetWorkspaceByID(ctx context.Context, id uuid.UUID) (database.Workspace, error) { diff --git a/coderd/database/dbmetrics/dbmetrics.go b/coderd/database/dbmetrics/dbmetrics.go index 09770f4ec6bef..4ad82b3f224ab 100644 --- a/coderd/database/dbmetrics/dbmetrics.go +++ b/coderd/database/dbmetrics/dbmetrics.go @@ -1131,7 +1131,7 @@ func (m metricsStore) GetWorkspaceBuildsCreatedAfter(ctx context.Context, create return builds, err } -func (m metricsStore) GetWorkspaceByAgentID(ctx context.Context, agentID uuid.UUID) (database.Workspace, error) { +func (m metricsStore) GetWorkspaceByAgentID(ctx context.Context, agentID uuid.UUID) (database.GetWorkspaceByAgentIDRow, error) { start := time.Now() workspace, err := m.s.GetWorkspaceByAgentID(ctx, agentID) m.queryLatencies.WithLabelValues("GetWorkspaceByAgentID").Observe(time.Since(start).Seconds()) diff --git a/coderd/database/dbmock/dbmock.go b/coderd/database/dbmock/dbmock.go index 1b5b3ca259df9..abbf188ae64e8 100644 --- a/coderd/database/dbmock/dbmock.go +++ b/coderd/database/dbmock/dbmock.go @@ -2359,10 +2359,10 @@ func (mr *MockStoreMockRecorder) GetWorkspaceBuildsCreatedAfter(arg0, arg1 inter } // GetWorkspaceByAgentID mocks base method. -func (m *MockStore) GetWorkspaceByAgentID(arg0 context.Context, arg1 uuid.UUID) (database.Workspace, error) { +func (m *MockStore) GetWorkspaceByAgentID(arg0 context.Context, arg1 uuid.UUID) (database.GetWorkspaceByAgentIDRow, error) { m.ctrl.T.Helper() ret := m.ctrl.Call(m, "GetWorkspaceByAgentID", arg0, arg1) - ret0, _ := ret[0].(database.Workspace) + ret0, _ := ret[0].(database.GetWorkspaceByAgentIDRow) ret1, _ := ret[1].(error) return ret0, ret1 } diff --git a/coderd/database/modelmethods.go b/coderd/database/modelmethods.go index 8d15af65aada1..685c138c95288 100644 --- a/coderd/database/modelmethods.go +++ b/coderd/database/modelmethods.go @@ -148,6 +148,10 @@ func (g Group) RBACObject() rbac.Object { InOrg(g.OrganizationID) } +func (w GetWorkspaceByAgentIDRow) RBACObject() rbac.Object { + return w.Workspace.RBACObject() +} + func (w Workspace) RBACObject() rbac.Object { return rbac.ResourceWorkspace.WithID(w.ID). InOrg(w.OrganizationID). diff --git a/coderd/database/models.go b/coderd/database/models.go index 19d051aba0ab1..554d08c70e990 100644 --- a/coderd/database/models.go +++ b/coderd/database/models.go @@ -1,6 +1,6 @@ // Code generated by sqlc. DO NOT EDIT. // versions: -// sqlc v1.20.0 +// sqlc v1.23.0 package database diff --git a/coderd/database/querier.go b/coderd/database/querier.go index ed4c57e258bef..7c1864521243c 100644 --- a/coderd/database/querier.go +++ b/coderd/database/querier.go @@ -1,6 +1,6 @@ // Code generated by sqlc. DO NOT EDIT. // versions: -// sqlc v1.20.0 +// sqlc v1.23.0 package database @@ -232,7 +232,7 @@ type sqlcQuerier interface { GetWorkspaceBuildParameters(ctx context.Context, workspaceBuildID uuid.UUID) ([]WorkspaceBuildParameter, error) GetWorkspaceBuildsByWorkspaceID(ctx context.Context, arg GetWorkspaceBuildsByWorkspaceIDParams) ([]WorkspaceBuild, error) GetWorkspaceBuildsCreatedAfter(ctx context.Context, createdAt time.Time) ([]WorkspaceBuild, error) - GetWorkspaceByAgentID(ctx context.Context, agentID uuid.UUID) (Workspace, error) + GetWorkspaceByAgentID(ctx context.Context, agentID uuid.UUID) (GetWorkspaceByAgentIDRow, error) GetWorkspaceByID(ctx context.Context, id uuid.UUID) (Workspace, error) GetWorkspaceByOwnerIDAndName(ctx context.Context, arg GetWorkspaceByOwnerIDAndNameParams) (Workspace, error) GetWorkspaceByWorkspaceAppID(ctx context.Context, workspaceAppID uuid.UUID) (Workspace, error) diff --git a/coderd/database/queries.sql.go b/coderd/database/queries.sql.go index a8931e0d29a94..6e0ebcb5c745a 100644 --- a/coderd/database/queries.sql.go +++ b/coderd/database/queries.sql.go @@ -1,6 +1,6 @@ // Code generated by sqlc. DO NOT EDIT. // versions: -// sqlc v1.20.0 +// sqlc v1.23.0 package database @@ -10566,9 +10566,12 @@ func (q *sqlQuerier) GetDeploymentWorkspaceStats(ctx context.Context) (GetDeploy const getWorkspaceByAgentID = `-- name: GetWorkspaceByAgentID :one SELECT - id, created_at, updated_at, owner_id, organization_id, template_id, deleted, name, autostart_schedule, ttl, last_used_at, dormant_at, deleting_at, automatic_updates + workspaces.id, workspaces.created_at, workspaces.updated_at, workspaces.owner_id, workspaces.organization_id, workspaces.template_id, workspaces.deleted, workspaces.name, workspaces.autostart_schedule, workspaces.ttl, workspaces.last_used_at, workspaces.dormant_at, workspaces.deleting_at, workspaces.automatic_updates, + templates.name as template_name FROM workspaces +INNER JOIN + templates ON workspaces.template_id = templates.id WHERE workspaces.id = ( SELECT @@ -10594,24 +10597,30 @@ WHERE ) ` -func (q *sqlQuerier) GetWorkspaceByAgentID(ctx context.Context, agentID uuid.UUID) (Workspace, error) { +type GetWorkspaceByAgentIDRow struct { + Workspace Workspace `db:"workspace" json:"workspace"` + TemplateName string `db:"template_name" json:"template_name"` +} + +func (q *sqlQuerier) GetWorkspaceByAgentID(ctx context.Context, agentID uuid.UUID) (GetWorkspaceByAgentIDRow, error) { row := q.db.QueryRowContext(ctx, getWorkspaceByAgentID, agentID) - var i Workspace + var i GetWorkspaceByAgentIDRow err := row.Scan( - &i.ID, - &i.CreatedAt, - &i.UpdatedAt, - &i.OwnerID, - &i.OrganizationID, - &i.TemplateID, - &i.Deleted, - &i.Name, - &i.AutostartSchedule, - &i.Ttl, - &i.LastUsedAt, - &i.DormantAt, - &i.DeletingAt, - &i.AutomaticUpdates, + &i.Workspace.ID, + &i.Workspace.CreatedAt, + &i.Workspace.UpdatedAt, + &i.Workspace.OwnerID, + &i.Workspace.OrganizationID, + &i.Workspace.TemplateID, + &i.Workspace.Deleted, + &i.Workspace.Name, + &i.Workspace.AutostartSchedule, + &i.Workspace.Ttl, + &i.Workspace.LastUsedAt, + &i.Workspace.DormantAt, + &i.Workspace.DeletingAt, + &i.Workspace.AutomaticUpdates, + &i.TemplateName, ) return i, err } diff --git a/coderd/database/queries/workspaces.sql b/coderd/database/queries/workspaces.sql index 7862497ebfc46..d9ff657fd21dc 100644 --- a/coderd/database/queries/workspaces.sql +++ b/coderd/database/queries/workspaces.sql @@ -46,9 +46,12 @@ WHERE -- name: GetWorkspaceByAgentID :one SELECT - * + sqlc.embed(workspaces), + templates.name as template_name FROM workspaces +INNER JOIN + templates ON workspaces.template_id = templates.id WHERE workspaces.id = ( SELECT diff --git a/coderd/prometheusmetrics/aggregator.go b/coderd/prometheusmetrics/aggregator.go index b1091b2451405..5b8dc251a94f0 100644 --- a/coderd/prometheusmetrics/aggregator.go +++ b/coderd/prometheusmetrics/aggregator.go @@ -47,6 +47,7 @@ type updateRequest struct { username string workspaceName string agentName string + templateName string metrics []agentsdk.AgentMetric @@ -59,6 +60,7 @@ type annotatedMetric struct { username string workspaceName string agentName string + templateName string expiryDate time.Time } @@ -74,7 +76,7 @@ func (am *annotatedMetric) asPrometheus() (prometheus.Metric, error) { labelValues := make([]string, 0, len(agentMetricsLabels)+len(am.Labels)) labels = append(labels, agentMetricsLabels...) - labelValues = append(labelValues, am.username, am.workspaceName, am.agentName) + labelValues = append(labelValues, am.username, am.workspaceName, am.agentName, am.templateName) for _, l := range am.Labels { labels = append(labels, l.Name) @@ -160,6 +162,7 @@ func (ma *MetricsAggregator) Run(ctx context.Context) func() { username: req.username, workspaceName: req.workspaceName, agentName: req.agentName, + templateName: req.templateName, AgentMetric: m, @@ -227,7 +230,7 @@ func (ma *MetricsAggregator) Run(ctx context.Context) func() { func (*MetricsAggregator) Describe(_ chan<- *prometheus.Desc) { } -var agentMetricsLabels = []string{usernameLabel, workspaceNameLabel, agentNameLabel} +var agentMetricsLabels = []string{usernameLabel, workspaceNameLabel, agentNameLabel, templateNameLabel} func (ma *MetricsAggregator) Collect(ch chan<- prometheus.Metric) { output := make(chan []prometheus.Metric, 1) @@ -246,12 +249,13 @@ func (ma *MetricsAggregator) Collect(ch chan<- prometheus.Metric) { } } -func (ma *MetricsAggregator) Update(ctx context.Context, username, workspaceName, agentName string, metrics []agentsdk.AgentMetric) { +func (ma *MetricsAggregator) Update(ctx context.Context, username, workspaceName, agentName, templateName string, metrics []agentsdk.AgentMetric) { select { case ma.updateCh <- updateRequest{ username: username, workspaceName: workspaceName, agentName: agentName, + templateName: templateName, metrics: metrics, timestamp: time.Now(), diff --git a/coderd/workspaceagents.go b/coderd/workspaceagents.go index d8f5ec842a4db..dfa20294a3a58 100644 --- a/coderd/workspaceagents.go +++ b/coderd/workspaceagents.go @@ -569,7 +569,7 @@ func (api *API) workspaceAgentLogs(rw http.ResponseWriter, r *http.Request) { return } - workspace, err := api.Database.GetWorkspaceByAgentID(ctx, workspaceAgent.ID) + row, err := api.Database.GetWorkspaceByAgentID(ctx, workspaceAgent.ID) if err != nil { httpapi.Write(ctx, rw, http.StatusInternalServerError, codersdk.Response{ Message: "Internal error fetching workspace by agent id.", @@ -577,6 +577,7 @@ func (api *API) workspaceAgentLogs(rw http.ResponseWriter, r *http.Request) { }) return } + workspace := row.Workspace api.WebsocketWaitMutex.Lock() api.WebsocketWaitGroup.Add(1) @@ -1645,7 +1646,7 @@ func (api *API) workspaceAgentReportStats(rw http.ResponseWriter, r *http.Reques ctx := r.Context() workspaceAgent := httpmw.WorkspaceAgent(r) - workspace, err := api.Database.GetWorkspaceByAgentID(ctx, workspaceAgent.ID) + row, err := api.Database.GetWorkspaceByAgentID(ctx, workspaceAgent.ID) if err != nil { httpapi.Write(ctx, rw, http.StatusBadRequest, codersdk.Response{ Message: "Failed to get workspace.", @@ -1653,6 +1654,7 @@ func (api *API) workspaceAgentReportStats(rw http.ResponseWriter, r *http.Reques }) return } + workspace := row.Workspace var req agentsdk.Stats if !httpapi.Read(ctx, rw, r, &req) { @@ -1724,7 +1726,7 @@ func (api *API) workspaceAgentReportStats(rw http.ResponseWriter, r *http.Reques return xerrors.Errorf("can't get user: %w", err) } - api.Options.UpdateAgentMetrics(ctx, user.Username, workspace.Name, workspaceAgent.Name, req.Metrics) + api.Options.UpdateAgentMetrics(ctx, user.Username, workspace.Name, workspaceAgent.Name, row.TemplateName, req.Metrics) return nil }) } @@ -2100,7 +2102,7 @@ func (api *API) workspaceAgentReportLifecycle(rw http.ResponseWriter, r *http.Re ctx := r.Context() workspaceAgent := httpmw.WorkspaceAgent(r) - workspace, err := api.Database.GetWorkspaceByAgentID(ctx, workspaceAgent.ID) + row, err := api.Database.GetWorkspaceByAgentID(ctx, workspaceAgent.ID) if err != nil { httpapi.Write(ctx, rw, http.StatusBadRequest, codersdk.Response{ Message: "Failed to get workspace.", @@ -2108,6 +2110,7 @@ func (api *API) workspaceAgentReportLifecycle(rw http.ResponseWriter, r *http.Re }) return } + workspace := row.Workspace var req agentsdk.PostLifecycleRequest if !httpapi.Read(ctx, rw, r, &req) { From 6c5a560883cf73da35185c18c4ddc76ac424e11b Mon Sep 17 00:00:00 2001 From: Steven Masley Date: Mon, 11 Dec 2023 17:31:05 -0600 Subject: [PATCH 07/19] Fix prom tests --- agent/agent_test.go | 12 ++++++++++++ coderd/database/dbauthz/dbauthz_test.go | 6 +++++- coderd/database/models.go | 2 +- coderd/database/querier.go | 2 +- coderd/database/queries.sql.go | 2 +- coderd/prometheusmetrics/aggregator_test.go | 17 +++++++++++++---- 6 files changed, 33 insertions(+), 8 deletions(-) diff --git a/agent/agent_test.go b/agent/agent_test.go index 19e28346adcde..fc0700dd6c8f4 100644 --- a/agent/agent_test.go +++ b/agent/agent_test.go @@ -46,6 +46,7 @@ import ( "cdr.dev/slog" "cdr.dev/slog/sloggers/sloghuman" "cdr.dev/slog/sloggers/slogtest" + "github.com/coder/coder/v2/agent" "github.com/coder/coder/v2/agent/agentproc" "github.com/coder/coder/v2/agent/agentproc/agentproctest" @@ -2235,6 +2236,17 @@ func TestAgent_Metrics_SSH(t *testing.T) { Type: agentsdk.AgentMetricTypeCounter, Value: 0, }, + { + Name: "coderd_agentstats_startup_script_s", + Type: agentsdk.AgentMetricTypeGauge, + Value: 0, + Labels: []agentsdk.AgentMetricLabel{ + { + Name: "success", + Value: "true", + }, + }, + }, } var actual []*promgo.MetricFamily diff --git a/coderd/database/dbauthz/dbauthz_test.go b/coderd/database/dbauthz/dbauthz_test.go index c52606f5436ca..01a18ef2a3122 100644 --- a/coderd/database/dbauthz/dbauthz_test.go +++ b/coderd/database/dbauthz/dbauthz_test.go @@ -13,6 +13,7 @@ import ( "golang.org/x/xerrors" "cdr.dev/slog" + "github.com/coder/coder/v2/coderd/coderdtest" "github.com/coder/coder/v2/coderd/database" "github.com/coder/coder/v2/coderd/database/dbauthz" @@ -1063,7 +1064,10 @@ func (s *MethodTestSuite) TestWorkspace() { check.Args(ws.ID).Asserts(ws, rbac.ActionRead).Returns(b) })) s.Run("GetWorkspaceAgentByID", s.Subtest(func(db database.Store, check *expects) { - ws := dbgen.Workspace(s.T(), db, database.Workspace{}) + tpl := dbgen.Template(s.T(), db, database.Template{}) + ws := dbgen.Workspace(s.T(), db, database.Workspace{ + TemplateID: tpl.ID, + }) build := dbgen.WorkspaceBuild(s.T(), db, database.WorkspaceBuild{WorkspaceID: ws.ID, JobID: uuid.New()}) res := dbgen.WorkspaceResource(s.T(), db, database.WorkspaceResource{JobID: build.JobID}) agt := dbgen.WorkspaceAgent(s.T(), db, database.WorkspaceAgent{ResourceID: res.ID}) diff --git a/coderd/database/models.go b/coderd/database/models.go index 554d08c70e990..19d051aba0ab1 100644 --- a/coderd/database/models.go +++ b/coderd/database/models.go @@ -1,6 +1,6 @@ // Code generated by sqlc. DO NOT EDIT. // versions: -// sqlc v1.23.0 +// sqlc v1.20.0 package database diff --git a/coderd/database/querier.go b/coderd/database/querier.go index 7c1864521243c..55160babf8e4c 100644 --- a/coderd/database/querier.go +++ b/coderd/database/querier.go @@ -1,6 +1,6 @@ // Code generated by sqlc. DO NOT EDIT. // versions: -// sqlc v1.23.0 +// sqlc v1.20.0 package database diff --git a/coderd/database/queries.sql.go b/coderd/database/queries.sql.go index 6e0ebcb5c745a..3d9fd195c5cdd 100644 --- a/coderd/database/queries.sql.go +++ b/coderd/database/queries.sql.go @@ -1,6 +1,6 @@ // Code generated by sqlc. DO NOT EDIT. // versions: -// sqlc v1.23.0 +// sqlc v1.20.0 package database diff --git a/coderd/prometheusmetrics/aggregator_test.go b/coderd/prometheusmetrics/aggregator_test.go index 45f0de14851c3..db6a8e233755d 100644 --- a/coderd/prometheusmetrics/aggregator_test.go +++ b/coderd/prometheusmetrics/aggregator_test.go @@ -2,6 +2,7 @@ package prometheusmetrics_test import ( "context" + "sort" "sync/atomic" "testing" "time" @@ -12,6 +13,7 @@ import ( "github.com/stretchr/testify/require" "cdr.dev/slog/sloggers/slogtest" + "github.com/coder/coder/v2/coderd/prometheusmetrics" "github.com/coder/coder/v2/codersdk/agentsdk" "github.com/coder/coder/v2/cryptorand" @@ -22,6 +24,7 @@ const ( testWorkspaceName = "yogi-workspace" testUsername = "yogi-bear" testAgentName = "main-agent" + testTemplateName = "main-template" ) func TestUpdateMetrics_MetricsDoNotExpire(t *testing.T) { @@ -58,6 +61,7 @@ func TestUpdateMetrics_MetricsDoNotExpire(t *testing.T) { {Name: "agent_name", Value: testAgentName}, {Name: "username", Value: testUsername}, {Name: "workspace_name", Value: testWorkspaceName}, + {Name: "template_name", Value: testTemplateName}, } expected := []agentsdk.AgentMetric{ {Name: "a_counter_one", Type: agentsdk.AgentMetricTypeCounter, Value: 1, Labels: commonLabels}, @@ -69,13 +73,14 @@ func TestUpdateMetrics_MetricsDoNotExpire(t *testing.T) { {Name: "hello", Value: "world"}, {Name: "username", Value: testUsername}, {Name: "workspace_name", Value: testWorkspaceName}, + {Name: "template_name", Value: testTemplateName}, }}, {Name: "d_gauge_four", Type: agentsdk.AgentMetricTypeGauge, Value: 6, Labels: commonLabels}, } // when - metricsAggregator.Update(ctx, testUsername, testWorkspaceName, testAgentName, given1) - metricsAggregator.Update(ctx, testUsername, testWorkspaceName, testAgentName, given2) + metricsAggregator.Update(ctx, testUsername, testWorkspaceName, testAgentName, testTemplateName, given1) + metricsAggregator.Update(ctx, testUsername, testWorkspaceName, testAgentName, testTemplateName, given2) // then require.Eventually(t, func() bool { @@ -119,6 +124,10 @@ func verifyCollectedMetrics(t *testing.T, expected []agentsdk.AgentMetric, actua } dtoLabels := asMetricAgentLabels(d.GetLabel()) + // dto labels are sorted in alphabetical order. + sort.Slice(e.Labels, func(i, j int) bool { + return e.Labels[i].Name < e.Labels[j].Name + }) require.Equal(t, e.Labels, dtoLabels, d.String()) } return true @@ -154,7 +163,7 @@ func TestUpdateMetrics_MetricsExpire(t *testing.T) { } // when - metricsAggregator.Update(ctx, testUsername, testWorkspaceName, testAgentName, given) + metricsAggregator.Update(ctx, testUsername, testWorkspaceName, testAgentName, testTemplateName, given) time.Sleep(time.Millisecond * 10) // Ensure that metric is expired @@ -220,7 +229,7 @@ func Benchmark_MetricsAggregator_Run(b *testing.B) { b.Logf("N=%d sending %d metrics", b.N, numMetrics) var nGot atomic.Int64 b.StartTimer() - metricsAggregator.Update(ctx, testUsername, testWorkspaceName, testAgentName, metrics) + metricsAggregator.Update(ctx, testUsername, testWorkspaceName, testAgentName, testTemplateName, metrics) for i := 0; i < numMetrics; i++ { select { case <-ctx.Done(): From ad3f47f7f9187ff993ff7e0837a8cafb41067c49 Mon Sep 17 00:00:00 2001 From: Steven Masley Date: Mon, 11 Dec 2023 17:37:46 -0600 Subject: [PATCH 08/19] Update prom docs --- docs/admin/prometheus.md | 136 +++++++++++++++++----------------- scripts/metricsdocgen/metrics | 12 ++- 2 files changed, 78 insertions(+), 70 deletions(-) diff --git a/docs/admin/prometheus.md b/docs/admin/prometheus.md index f5faf6c0d035c..a3483ee78b075 100644 --- a/docs/admin/prometheus.md +++ b/docs/admin/prometheus.md @@ -78,72 +78,74 @@ spec: -| Name | Type | Description | Labels | -| ----------------------------------------------------- | --------- | ------------------------------------------------------------------ | ----------------------------------------------------------------------------------- | -| `coderd_agents_apps` | gauge | Agent applications with statuses. | `agent_name` `app_name` `health` `username` `workspace_name` | -| `coderd_agents_connection_latencies_seconds` | gauge | Agent connection latencies in seconds. | `agent_name` `derp_region` `preferred` `username` `workspace_name` | -| `coderd_agents_connections` | gauge | Agent connections with statuses. | `agent_name` `lifecycle_state` `status` `tailnet_node` `username` `workspace_name` | -| `coderd_agents_up` | gauge | The number of active agents per workspace. | `username` `workspace_name` | -| `coderd_agentstats_connection_count` | gauge | The number of established connections by agent | `agent_name` `username` `workspace_name` | -| `coderd_agentstats_connection_median_latency_seconds` | gauge | The median agent connection latency | `agent_name` `username` `workspace_name` | -| `coderd_agentstats_rx_bytes` | gauge | Agent Rx bytes | `agent_name` `username` `workspace_name` | -| `coderd_agentstats_session_count_jetbrains` | gauge | The number of session established by JetBrains | `agent_name` `username` `workspace_name` | -| `coderd_agentstats_session_count_reconnecting_pty` | gauge | The number of session established by reconnecting PTY | `agent_name` `username` `workspace_name` | -| `coderd_agentstats_session_count_ssh` | gauge | The number of session established by SSH | `agent_name` `username` `workspace_name` | -| `coderd_agentstats_session_count_vscode` | gauge | The number of session established by VSCode | `agent_name` `username` `workspace_name` | -| `coderd_agentstats_tx_bytes` | gauge | Agent Tx bytes | `agent_name` `username` `workspace_name` | -| `coderd_api_active_users_duration_hour` | gauge | The number of users that have been active within the last hour. | | -| `coderd_api_concurrent_requests` | gauge | The number of concurrent API requests. | | -| `coderd_api_concurrent_websockets` | gauge | The total number of concurrent API websockets. | | -| `coderd_api_request_latencies_seconds` | histogram | Latency distribution of requests in seconds. | `method` `path` | -| `coderd_api_requests_processed_total` | counter | The total number of processed API requests | `code` `method` `path` | -| `coderd_api_websocket_durations_seconds` | histogram | Websocket duration distribution of requests in seconds. | `path` | -| `coderd_api_workspace_latest_build_total` | gauge | The latest workspace builds with a status. | `status` | -| `coderd_insights_applications_usage_seconds` | gauge | The application usage per template. | `application_name` `slug` `template_name` | -| `coderd_insights_parameters` | gauge | The parameter usage per template. | `parameter_name` `parameter_type` `parameter_value` `template_name` | -| `coderd_insights_templates_active_users` | gauge | The number of active users of the template. | `template_name` | -| `coderd_license_active_users` | gauge | The number of active users. | | -| `coderd_license_limit_users` | gauge | The user seats limit based on the active Coder license. | | -| `coderd_license_user_limit_enabled` | gauge | Returns 1 if the current license enforces the user limit. | | -| `coderd_metrics_collector_agents_execution_seconds` | histogram | Histogram for duration of agents metrics collection in seconds. | | -| `coderd_provisionerd_job_timings_seconds` | histogram | The provisioner job time duration in seconds. | `provisioner` `status` | -| `coderd_provisionerd_jobs_current` | gauge | The number of currently running provisioner jobs. | `provisioner` | -| `coderd_workspace_builds_total` | counter | The number of workspaces started, updated, or deleted. | `action` `owner_email` `status` `template_name` `template_version` `workspace_name` | -| `go_gc_duration_seconds` | summary | A summary of the pause duration of garbage collection cycles. | | -| `go_goroutines` | gauge | Number of goroutines that currently exist. | | -| `go_info` | gauge | Information about the Go environment. | `version` | -| `go_memstats_alloc_bytes` | gauge | Number of bytes allocated and still in use. | | -| `go_memstats_alloc_bytes_total` | counter | Total number of bytes allocated, even if freed. | | -| `go_memstats_buck_hash_sys_bytes` | gauge | Number of bytes used by the profiling bucket hash table. | | -| `go_memstats_frees_total` | counter | Total number of frees. | | -| `go_memstats_gc_sys_bytes` | gauge | Number of bytes used for garbage collection system metadata. | | -| `go_memstats_heap_alloc_bytes` | gauge | Number of heap bytes allocated and still in use. | | -| `go_memstats_heap_idle_bytes` | gauge | Number of heap bytes waiting to be used. | | -| `go_memstats_heap_inuse_bytes` | gauge | Number of heap bytes that are in use. | | -| `go_memstats_heap_objects` | gauge | Number of allocated objects. | | -| `go_memstats_heap_released_bytes` | gauge | Number of heap bytes released to OS. | | -| `go_memstats_heap_sys_bytes` | gauge | Number of heap bytes obtained from system. | | -| `go_memstats_last_gc_time_seconds` | gauge | Number of seconds since 1970 of last garbage collection. | | -| `go_memstats_lookups_total` | counter | Total number of pointer lookups. | | -| `go_memstats_mallocs_total` | counter | Total number of mallocs. | | -| `go_memstats_mcache_inuse_bytes` | gauge | Number of bytes in use by mcache structures. | | -| `go_memstats_mcache_sys_bytes` | gauge | Number of bytes used for mcache structures obtained from system. | | -| `go_memstats_mspan_inuse_bytes` | gauge | Number of bytes in use by mspan structures. | | -| `go_memstats_mspan_sys_bytes` | gauge | Number of bytes used for mspan structures obtained from system. | | -| `go_memstats_next_gc_bytes` | gauge | Number of heap bytes when next garbage collection will take place. | | -| `go_memstats_other_sys_bytes` | gauge | Number of bytes used for other system allocations. | | -| `go_memstats_stack_inuse_bytes` | gauge | Number of bytes in use by the stack allocator. | | -| `go_memstats_stack_sys_bytes` | gauge | Number of bytes obtained from system for stack allocator. | | -| `go_memstats_sys_bytes` | gauge | Number of bytes obtained from system. | | -| `go_threads` | gauge | Number of OS threads created. | | -| `process_cpu_seconds_total` | counter | Total user and system CPU time spent in seconds. | | -| `process_max_fds` | gauge | Maximum number of open file descriptors. | | -| `process_open_fds` | gauge | Number of open file descriptors. | | -| `process_resident_memory_bytes` | gauge | Resident memory size in bytes. | | -| `process_start_time_seconds` | gauge | Start time of the process since unix epoch in seconds. | | -| `process_virtual_memory_bytes` | gauge | Virtual memory size in bytes. | | -| `process_virtual_memory_max_bytes` | gauge | Maximum amount of virtual memory available in bytes. | | -| `promhttp_metric_handler_requests_in_flight` | gauge | Current number of scrapes being served. | | -| `promhttp_metric_handler_requests_total` | counter | Total number of scrapes by HTTP status code. | `code` | +| Name | Type | Description | Labels | +| ----------------------------------------------------- | --------- | ------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------- | +| `agent_scripts_executed_total` | counter | Total number of scripts executed by the Coder agent. Includes cron scheduled scripts. | `agent_name` `success` `template_name` `username` `workspace_name` | +| `coderd_agents_apps` | gauge | Agent applications with statuses. | `agent_name` `app_name` `health` `username` `workspace_name` | +| `coderd_agents_connection_latencies_seconds` | gauge | Agent connection latencies in seconds. | `agent_name` `derp_region` `preferred` `username` `workspace_name` | +| `coderd_agents_connections` | gauge | Agent connections with statuses. | `agent_name` `lifecycle_state` `status` `tailnet_node` `username` `workspace_name` | +| `coderd_agents_up` | gauge | The number of active agents per workspace. | `template_name` `username` `workspace_name` | +| `coderd_agentstats_connection_count` | gauge | The number of established connections by agent | `agent_name` `username` `workspace_name` | +| `coderd_agentstats_connection_median_latency_seconds` | gauge | The median agent connection latency | `agent_name` `username` `workspace_name` | +| `coderd_agentstats_rx_bytes` | gauge | Agent Rx bytes | `agent_name` `username` `workspace_name` | +| `coderd_agentstats_session_count_jetbrains` | gauge | The number of session established by JetBrains | `agent_name` `username` `workspace_name` | +| `coderd_agentstats_session_count_reconnecting_pty` | gauge | The number of session established by reconnecting PTY | `agent_name` `username` `workspace_name` | +| `coderd_agentstats_session_count_ssh` | gauge | The number of session established by SSH | `agent_name` `username` `workspace_name` | +| `coderd_agentstats_session_count_vscode` | gauge | The number of session established by VSCode | `agent_name` `username` `workspace_name` | +| `coderd_agentstats_startup_script_s` | gauge | The number of seconds the startup script took to execute. | `agent_name` `success` `template_name` `username` `workspace_name` | +| `coderd_agentstats_tx_bytes` | gauge | Agent Tx bytes | `agent_name` `username` `workspace_name` | +| `coderd_api_active_users_duration_hour` | gauge | The number of users that have been active within the last hour. | | +| `coderd_api_concurrent_requests` | gauge | The number of concurrent API requests. | | +| `coderd_api_concurrent_websockets` | gauge | The total number of concurrent API websockets. | | +| `coderd_api_request_latencies_seconds` | histogram | Latency distribution of requests in seconds. | `method` `path` | +| `coderd_api_requests_processed_total` | counter | The total number of processed API requests | `code` `method` `path` | +| `coderd_api_websocket_durations_seconds` | histogram | Websocket duration distribution of requests in seconds. | `path` | +| `coderd_api_workspace_latest_build_total` | gauge | The latest workspace builds with a status. | `status` | +| `coderd_insights_applications_usage_seconds` | gauge | The application usage per template. | `application_name` `slug` `template_name` | +| `coderd_insights_parameters` | gauge | The parameter usage per template. | `parameter_name` `parameter_type` `parameter_value` `template_name` | +| `coderd_insights_templates_active_users` | gauge | The number of active users of the template. | `template_name` | +| `coderd_license_active_users` | gauge | The number of active users. | | +| `coderd_license_limit_users` | gauge | The user seats limit based on the active Coder license. | | +| `coderd_license_user_limit_enabled` | gauge | Returns 1 if the current license enforces the user limit. | | +| `coderd_metrics_collector_agents_execution_seconds` | histogram | Histogram for duration of agents metrics collection in seconds. | | +| `coderd_provisionerd_job_timings_seconds` | histogram | The provisioner job time duration in seconds. | `provisioner` `status` | +| `coderd_provisionerd_jobs_current` | gauge | The number of currently running provisioner jobs. | `provisioner` | +| `coderd_workspace_builds_total` | counter | The number of workspaces started, updated, or deleted. | `action` `owner_email` `status` `template_name` `template_version` `workspace_name` | +| `go_gc_duration_seconds` | summary | A summary of the pause duration of garbage collection cycles. | | +| `go_goroutines` | gauge | Number of goroutines that currently exist. | | +| `go_info` | gauge | Information about the Go environment. | `version` | +| `go_memstats_alloc_bytes` | gauge | Number of bytes allocated and still in use. | | +| `go_memstats_alloc_bytes_total` | counter | Total number of bytes allocated, even if freed. | | +| `go_memstats_buck_hash_sys_bytes` | gauge | Number of bytes used by the profiling bucket hash table. | | +| `go_memstats_frees_total` | counter | Total number of frees. | | +| `go_memstats_gc_sys_bytes` | gauge | Number of bytes used for garbage collection system metadata. | | +| `go_memstats_heap_alloc_bytes` | gauge | Number of heap bytes allocated and still in use. | | +| `go_memstats_heap_idle_bytes` | gauge | Number of heap bytes waiting to be used. | | +| `go_memstats_heap_inuse_bytes` | gauge | Number of heap bytes that are in use. | | +| `go_memstats_heap_objects` | gauge | Number of allocated objects. | | +| `go_memstats_heap_released_bytes` | gauge | Number of heap bytes released to OS. | | +| `go_memstats_heap_sys_bytes` | gauge | Number of heap bytes obtained from system. | | +| `go_memstats_last_gc_time_seconds` | gauge | Number of seconds since 1970 of last garbage collection. | | +| `go_memstats_lookups_total` | counter | Total number of pointer lookups. | | +| `go_memstats_mallocs_total` | counter | Total number of mallocs. | | +| `go_memstats_mcache_inuse_bytes` | gauge | Number of bytes in use by mcache structures. | | +| `go_memstats_mcache_sys_bytes` | gauge | Number of bytes used for mcache structures obtained from system. | | +| `go_memstats_mspan_inuse_bytes` | gauge | Number of bytes in use by mspan structures. | | +| `go_memstats_mspan_sys_bytes` | gauge | Number of bytes used for mspan structures obtained from system. | | +| `go_memstats_next_gc_bytes` | gauge | Number of heap bytes when next garbage collection will take place. | | +| `go_memstats_other_sys_bytes` | gauge | Number of bytes used for other system allocations. | | +| `go_memstats_stack_inuse_bytes` | gauge | Number of bytes in use by the stack allocator. | | +| `go_memstats_stack_sys_bytes` | gauge | Number of bytes obtained from system for stack allocator. | | +| `go_memstats_sys_bytes` | gauge | Number of bytes obtained from system. | | +| `go_threads` | gauge | Number of OS threads created. | | +| `process_cpu_seconds_total` | counter | Total user and system CPU time spent in seconds. | | +| `process_max_fds` | gauge | Maximum number of open file descriptors. | | +| `process_open_fds` | gauge | Number of open file descriptors. | | +| `process_resident_memory_bytes` | gauge | Resident memory size in bytes. | | +| `process_start_time_seconds` | gauge | Start time of the process since unix epoch in seconds. | | +| `process_virtual_memory_bytes` | gauge | Virtual memory size in bytes. | | +| `process_virtual_memory_max_bytes` | gauge | Maximum amount of virtual memory available in bytes. | | +| `promhttp_metric_handler_requests_in_flight` | gauge | Current number of scrapes being served. | | +| `promhttp_metric_handler_requests_total` | counter | Total number of scrapes by HTTP status code. | `code` | diff --git a/scripts/metricsdocgen/metrics b/scripts/metricsdocgen/metrics index d55e5cd9669b7..148af2b2051db 100644 --- a/scripts/metricsdocgen/metrics +++ b/scripts/metricsdocgen/metrics @@ -15,9 +15,15 @@ coderd_agents_connections{agent_name="main",lifecycle_state="start_timeout",stat coderd_agents_connections{agent_name="main",lifecycle_state="start_timeout",status="connected",tailnet_node="nodeid:3779bd45d00be0eb",username="admin",workspace_name="workspace-1"} 1 # HELP coderd_agents_up The number of active agents per workspace. # TYPE coderd_agents_up gauge -coderd_agents_up{username="admin",workspace_name="workspace-1"} 1 -coderd_agents_up{username="admin",workspace_name="workspace-2"} 1 -coderd_agents_up{username="admin",workspace_name="workspace-3"} 1 +coderd_agents_up{template_name="docker", username="admin",workspace_name="workspace-1"} 1 +coderd_agents_up{template_name="docker", username="admin",workspace_name="workspace-2"} 1 +coderd_agents_up{template_name="gcp", username="admin",workspace_name="workspace-3"} 1 +# HELP coderd_agentstats_startup_script_s The number of seconds the startup script took to execute. +# TYPE coderd_agentstats_startup_script_s gauge +coderd_agentstats_startup_script_s{agent_name="main",success="true",template_name="docker",username="admin",workspace_name="workspace-1"} 1.969900304 +# HELP agent_scripts_executed_total Total number of scripts executed by the Coder agent. Includes cron scheduled scripts. +# TYPE agent_scripts_executed_total counter +agent_scripts_executed_total{agent_name="main",success="true",template_name="docker",username="admin",workspace_name="workspace-1"} 1 # HELP coderd_agentstats_connection_count The number of established connections by agent # TYPE coderd_agentstats_connection_count gauge coderd_agentstats_connection_count{agent_name="main",username="admin",workspace_name="workspace1"} 2 From 19428fa2e3af94553bccce31af1c83e2d5fa62b3 Mon Sep 17 00:00:00 2001 From: Steven Masley Date: Mon, 11 Dec 2023 17:51:00 -0600 Subject: [PATCH 09/19] seed template id in test --- coderd/database/dbauthz/dbauthz_test.go | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/coderd/database/dbauthz/dbauthz_test.go b/coderd/database/dbauthz/dbauthz_test.go index 01a18ef2a3122..3a5e534a503ee 100644 --- a/coderd/database/dbauthz/dbauthz_test.go +++ b/coderd/database/dbauthz/dbauthz_test.go @@ -1074,7 +1074,10 @@ func (s *MethodTestSuite) TestWorkspace() { check.Args(agt.ID).Asserts(ws, rbac.ActionRead).Returns(agt) })) s.Run("GetWorkspaceAgentByInstanceID", s.Subtest(func(db database.Store, check *expects) { - ws := dbgen.Workspace(s.T(), db, database.Workspace{}) + tpl := dbgen.Template(s.T(), db, database.Template{}) + ws := dbgen.Workspace(s.T(), db, database.Workspace{ + TemplateID: tpl.ID, + }) build := dbgen.WorkspaceBuild(s.T(), db, database.WorkspaceBuild{WorkspaceID: ws.ID, JobID: uuid.New()}) res := dbgen.WorkspaceResource(s.T(), db, database.WorkspaceResource{JobID: build.JobID}) agt := dbgen.WorkspaceAgent(s.T(), db, database.WorkspaceAgent{ResourceID: res.ID}) From 8bfa9ad8a5df033cca717e2024962f3ffd9045dd Mon Sep 17 00:00:00 2001 From: Steven Masley Date: Mon, 11 Dec 2023 18:04:14 -0600 Subject: [PATCH 10/19] fixup! seed template id in test --- coderd/database/dbauthz/dbauthz_test.go | 40 ++++++++++++++++++++----- 1 file changed, 32 insertions(+), 8 deletions(-) diff --git a/coderd/database/dbauthz/dbauthz_test.go b/coderd/database/dbauthz/dbauthz_test.go index 3a5e534a503ee..604a333d3146b 100644 --- a/coderd/database/dbauthz/dbauthz_test.go +++ b/coderd/database/dbauthz/dbauthz_test.go @@ -1084,7 +1084,10 @@ func (s *MethodTestSuite) TestWorkspace() { check.Args(agt.AuthInstanceID.String).Asserts(ws, rbac.ActionRead).Returns(agt) })) s.Run("UpdateWorkspaceAgentLifecycleStateByID", s.Subtest(func(db database.Store, check *expects) { - ws := dbgen.Workspace(s.T(), db, database.Workspace{}) + tpl := dbgen.Template(s.T(), db, database.Template{}) + ws := dbgen.Workspace(s.T(), db, database.Workspace{ + TemplateID: tpl.ID, + }) build := dbgen.WorkspaceBuild(s.T(), db, database.WorkspaceBuild{WorkspaceID: ws.ID, JobID: uuid.New()}) res := dbgen.WorkspaceResource(s.T(), db, database.WorkspaceResource{JobID: build.JobID}) agt := dbgen.WorkspaceAgent(s.T(), db, database.WorkspaceAgent{ResourceID: res.ID}) @@ -1094,7 +1097,10 @@ func (s *MethodTestSuite) TestWorkspace() { }).Asserts(ws, rbac.ActionUpdate).Returns() })) s.Run("UpdateWorkspaceAgentLogOverflowByID", s.Subtest(func(db database.Store, check *expects) { - ws := dbgen.Workspace(s.T(), db, database.Workspace{}) + tpl := dbgen.Template(s.T(), db, database.Template{}) + ws := dbgen.Workspace(s.T(), db, database.Workspace{ + TemplateID: tpl.ID, + }) build := dbgen.WorkspaceBuild(s.T(), db, database.WorkspaceBuild{WorkspaceID: ws.ID, JobID: uuid.New()}) res := dbgen.WorkspaceResource(s.T(), db, database.WorkspaceResource{JobID: build.JobID}) agt := dbgen.WorkspaceAgent(s.T(), db, database.WorkspaceAgent{ResourceID: res.ID}) @@ -1104,7 +1110,10 @@ func (s *MethodTestSuite) TestWorkspace() { }).Asserts(ws, rbac.ActionUpdate).Returns() })) s.Run("UpdateWorkspaceAgentStartupByID", s.Subtest(func(db database.Store, check *expects) { - ws := dbgen.Workspace(s.T(), db, database.Workspace{}) + tpl := dbgen.Template(s.T(), db, database.Template{}) + ws := dbgen.Workspace(s.T(), db, database.Workspace{ + TemplateID: tpl.ID, + }) build := dbgen.WorkspaceBuild(s.T(), db, database.WorkspaceBuild{WorkspaceID: ws.ID, JobID: uuid.New()}) res := dbgen.WorkspaceResource(s.T(), db, database.WorkspaceResource{JobID: build.JobID}) agt := dbgen.WorkspaceAgent(s.T(), db, database.WorkspaceAgent{ResourceID: res.ID}) @@ -1116,7 +1125,10 @@ func (s *MethodTestSuite) TestWorkspace() { }).Asserts(ws, rbac.ActionUpdate).Returns() })) s.Run("GetWorkspaceAgentLogsAfter", s.Subtest(func(db database.Store, check *expects) { - ws := dbgen.Workspace(s.T(), db, database.Workspace{}) + tpl := dbgen.Template(s.T(), db, database.Template{}) + ws := dbgen.Workspace(s.T(), db, database.Workspace{ + TemplateID: tpl.ID, + }) build := dbgen.WorkspaceBuild(s.T(), db, database.WorkspaceBuild{WorkspaceID: ws.ID, JobID: uuid.New()}) res := dbgen.WorkspaceResource(s.T(), db, database.WorkspaceResource{JobID: build.JobID}) agt := dbgen.WorkspaceAgent(s.T(), db, database.WorkspaceAgent{ResourceID: res.ID}) @@ -1125,7 +1137,10 @@ func (s *MethodTestSuite) TestWorkspace() { }).Asserts(ws, rbac.ActionRead).Returns([]database.WorkspaceAgentLog{}) })) s.Run("GetWorkspaceAppByAgentIDAndSlug", s.Subtest(func(db database.Store, check *expects) { - ws := dbgen.Workspace(s.T(), db, database.Workspace{}) + tpl := dbgen.Template(s.T(), db, database.Template{}) + ws := dbgen.Workspace(s.T(), db, database.Workspace{ + TemplateID: tpl.ID, + }) build := dbgen.WorkspaceBuild(s.T(), db, database.WorkspaceBuild{WorkspaceID: ws.ID, JobID: uuid.New()}) res := dbgen.WorkspaceResource(s.T(), db, database.WorkspaceResource{JobID: build.JobID}) agt := dbgen.WorkspaceAgent(s.T(), db, database.WorkspaceAgent{ResourceID: res.ID}) @@ -1137,7 +1152,10 @@ func (s *MethodTestSuite) TestWorkspace() { }).Asserts(ws, rbac.ActionRead).Returns(app) })) s.Run("GetWorkspaceAppsByAgentID", s.Subtest(func(db database.Store, check *expects) { - ws := dbgen.Workspace(s.T(), db, database.Workspace{}) + tpl := dbgen.Template(s.T(), db, database.Template{}) + ws := dbgen.Workspace(s.T(), db, database.Workspace{ + TemplateID: tpl.ID, + }) build := dbgen.WorkspaceBuild(s.T(), db, database.WorkspaceBuild{WorkspaceID: ws.ID, JobID: uuid.New()}) res := dbgen.WorkspaceResource(s.T(), db, database.WorkspaceResource{JobID: build.JobID}) agt := dbgen.WorkspaceAgent(s.T(), db, database.WorkspaceAgent{ResourceID: res.ID}) @@ -1178,11 +1196,17 @@ func (s *MethodTestSuite) TestWorkspace() { check.Args(database.GetWorkspaceBuildsByWorkspaceIDParams{WorkspaceID: ws.ID}).Asserts(ws, rbac.ActionRead) // ordering })) s.Run("GetWorkspaceByAgentID", s.Subtest(func(db database.Store, check *expects) { - ws := dbgen.Workspace(s.T(), db, database.Workspace{}) + tpl := dbgen.Template(s.T(), db, database.Template{}) + ws := dbgen.Workspace(s.T(), db, database.Workspace{ + TemplateID: tpl.ID, + }) build := dbgen.WorkspaceBuild(s.T(), db, database.WorkspaceBuild{WorkspaceID: ws.ID, JobID: uuid.New()}) res := dbgen.WorkspaceResource(s.T(), db, database.WorkspaceResource{JobID: build.JobID}) agt := dbgen.WorkspaceAgent(s.T(), db, database.WorkspaceAgent{ResourceID: res.ID}) - check.Args(agt.ID).Asserts(ws, rbac.ActionRead).Returns(ws) + check.Args(agt.ID).Asserts(ws, rbac.ActionRead).Returns(database.GetWorkspaceByAgentIDRow{ + Workspace: ws, + TemplateName: tpl.Name, + }) })) s.Run("GetWorkspaceByOwnerIDAndName", s.Subtest(func(db database.Store, check *expects) { ws := dbgen.Workspace(s.T(), db, database.Workspace{}) From 044b9423208aee873ddecdd10c55e2486c753755 Mon Sep 17 00:00:00 2001 From: Steven Masley Date: Wed, 13 Dec 2023 09:08:37 -0600 Subject: [PATCH 11/19] PR feedback, _s -> seconds, start -> startup --- agent/agent.go | 2 +- agent/agent_test.go | 2 +- agent/metrics.go | 12 ++++++------ coderd/batchstats/batcher.go | 1 - docs/admin/prometheus.md | 4 ++-- scripts/metricsdocgen/metrics | 6 +++--- 6 files changed, 13 insertions(+), 14 deletions(-) diff --git a/agent/agent.go b/agent/agent.go index 2df12368f9243..43fa2f1ccd969 100644 --- a/agent/agent.go +++ b/agent/agent.go @@ -771,7 +771,7 @@ func (a *agent) run(ctx context.Context) error { if err == nil { label = "true" } - a.metrics.startScriptSeconds.WithLabelValues(label).Set(dur) + a.metrics.startupScriptSeconds.WithLabelValues(label).Set(dur) a.scriptRunner.StartCron() }) if err != nil { diff --git a/agent/agent_test.go b/agent/agent_test.go index fc0700dd6c8f4..69a4a1ac91abb 100644 --- a/agent/agent_test.go +++ b/agent/agent_test.go @@ -2237,7 +2237,7 @@ func TestAgent_Metrics_SSH(t *testing.T) { Value: 0, }, { - Name: "coderd_agentstats_startup_script_s", + Name: "coderd_agentstats_startup_script_seconds", Type: agentsdk.AgentMetricTypeGauge, Value: 0, Labels: []agentsdk.AgentMetricLabel{ diff --git a/agent/metrics.go b/agent/metrics.go index 4ac81f937bd7c..d987bad9a50c0 100644 --- a/agent/metrics.go +++ b/agent/metrics.go @@ -17,9 +17,9 @@ import ( type agentMetrics struct { connectionsTotal prometheus.Counter reconnectingPTYErrors *prometheus.CounterVec - // startScriptSeconds is the time in seconds that the start script(s) + // startupScriptSeconds is the time in seconds that the start script(s) // took to run. This is reported once per agent. - startScriptSeconds *prometheus.GaugeVec + startupScriptSeconds *prometheus.GaugeVec } func newAgentMetrics(registerer prometheus.Registerer) *agentMetrics { @@ -38,18 +38,18 @@ func newAgentMetrics(registerer prometheus.Registerer) *agentMetrics { ) registerer.MustRegister(reconnectingPTYErrors) - startScriptSeconds := prometheus.NewGaugeVec(prometheus.GaugeOpts{ + startupScriptSeconds := prometheus.NewGaugeVec(prometheus.GaugeOpts{ Namespace: "coderd", Subsystem: "agentstats", - Name: "startup_script_s", + Name: "startup_script_seconds", Help: "Amount of time taken to run the startup script in seconds.", }, []string{"success"}) - registerer.MustRegister(startScriptSeconds) + registerer.MustRegister(startupScriptSeconds) return &agentMetrics{ connectionsTotal: connectionsTotal, reconnectingPTYErrors: reconnectingPTYErrors, - startScriptSeconds: startScriptSeconds, + startupScriptSeconds: startupScriptSeconds, } } diff --git a/coderd/batchstats/batcher.go b/coderd/batchstats/batcher.go index 654b648b28584..cc234c693e462 100644 --- a/coderd/batchstats/batcher.go +++ b/coderd/batchstats/batcher.go @@ -13,7 +13,6 @@ import ( "cdr.dev/slog" "cdr.dev/slog/sloggers/sloghuman" - "github.com/coder/coder/v2/coderd/database" "github.com/coder/coder/v2/coderd/database/dbauthz" "github.com/coder/coder/v2/coderd/database/dbtime" diff --git a/docs/admin/prometheus.md b/docs/admin/prometheus.md index a3483ee78b075..9bf2e662c10f6 100644 --- a/docs/admin/prometheus.md +++ b/docs/admin/prometheus.md @@ -79,7 +79,7 @@ spec: | Name | Type | Description | Labels | -| ----------------------------------------------------- | --------- | ------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------- | +|-------------------------------------------------------| --------- | ------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------- | | `agent_scripts_executed_total` | counter | Total number of scripts executed by the Coder agent. Includes cron scheduled scripts. | `agent_name` `success` `template_name` `username` `workspace_name` | | `coderd_agents_apps` | gauge | Agent applications with statuses. | `agent_name` `app_name` `health` `username` `workspace_name` | | `coderd_agents_connection_latencies_seconds` | gauge | Agent connection latencies in seconds. | `agent_name` `derp_region` `preferred` `username` `workspace_name` | @@ -92,7 +92,7 @@ spec: | `coderd_agentstats_session_count_reconnecting_pty` | gauge | The number of session established by reconnecting PTY | `agent_name` `username` `workspace_name` | | `coderd_agentstats_session_count_ssh` | gauge | The number of session established by SSH | `agent_name` `username` `workspace_name` | | `coderd_agentstats_session_count_vscode` | gauge | The number of session established by VSCode | `agent_name` `username` `workspace_name` | -| `coderd_agentstats_startup_script_s` | gauge | The number of seconds the startup script took to execute. | `agent_name` `success` `template_name` `username` `workspace_name` | +| `coderd_agentstats_startup_script_seconds` | gauge | The number of seconds the startup script took to execute. | `agent_name` `success` `template_name` `username` `workspace_name` | | `coderd_agentstats_tx_bytes` | gauge | Agent Tx bytes | `agent_name` `username` `workspace_name` | | `coderd_api_active_users_duration_hour` | gauge | The number of users that have been active within the last hour. | | | `coderd_api_concurrent_requests` | gauge | The number of concurrent API requests. | | diff --git a/scripts/metricsdocgen/metrics b/scripts/metricsdocgen/metrics index 148af2b2051db..06889bce35c39 100644 --- a/scripts/metricsdocgen/metrics +++ b/scripts/metricsdocgen/metrics @@ -18,9 +18,9 @@ coderd_agents_connections{agent_name="main",lifecycle_state="start_timeout",stat coderd_agents_up{template_name="docker", username="admin",workspace_name="workspace-1"} 1 coderd_agents_up{template_name="docker", username="admin",workspace_name="workspace-2"} 1 coderd_agents_up{template_name="gcp", username="admin",workspace_name="workspace-3"} 1 -# HELP coderd_agentstats_startup_script_s The number of seconds the startup script took to execute. -# TYPE coderd_agentstats_startup_script_s gauge -coderd_agentstats_startup_script_s{agent_name="main",success="true",template_name="docker",username="admin",workspace_name="workspace-1"} 1.969900304 +# HELP coderd_agentstats_startup_script_seconds The number of seconds the startup script took to execute. +# TYPE coderd_agentstats_startup_script_seconds gauge +coderd_agentstats_startup_script_seconds{agent_name="main",success="true",template_name="docker",username="admin",workspace_name="workspace-1"} 1.969900304 # HELP agent_scripts_executed_total Total number of scripts executed by the Coder agent. Includes cron scheduled scripts. # TYPE agent_scripts_executed_total counter agent_scripts_executed_total{agent_name="main",success="true",template_name="docker",username="admin",workspace_name="workspace-1"} 1 From 2db41b6da9a8611ba8875841681edf7bafc407c1 Mon Sep 17 00:00:00 2001 From: Steven Masley Date: Wed, 13 Dec 2023 09:13:37 -0600 Subject: [PATCH 12/19] change string params to a struct --- coderd/coderd.go | 3 ++- coderd/prometheusmetrics/aggregator.go | 18 +++++++++++++----- coderd/workspaceagents.go | 9 +++++++-- 3 files changed, 22 insertions(+), 8 deletions(-) diff --git a/coderd/coderd.go b/coderd/coderd.go index f751d25bd654e..3985d0fd447d6 100644 --- a/coderd/coderd.go +++ b/coderd/coderd.go @@ -38,6 +38,7 @@ import ( _ "github.com/coder/coder/v2/coderd/apidoc" "github.com/coder/coder/v2/coderd/externalauth" "github.com/coder/coder/v2/coderd/healthcheck/derphealth" + "github.com/coder/coder/v2/coderd/prometheusmetrics" "cdr.dev/slog" @@ -169,7 +170,7 @@ type Options struct { HTTPClient *http.Client - UpdateAgentMetrics func(ctx context.Context, username, workspaceName, agentName, templateName string, metrics []agentsdk.AgentMetric) + UpdateAgentMetrics func(ctx context.Context, labels prometheusmetrics.UpdateAgentMetricsLabels, metrics []agentsdk.AgentMetric) StatsBatcher *batchstats.Batcher WorkspaceAppsStatsCollectorOptions workspaceapps.StatsCollectorOptions diff --git a/coderd/prometheusmetrics/aggregator.go b/coderd/prometheusmetrics/aggregator.go index 5b8dc251a94f0..c466a3c61d227 100644 --- a/coderd/prometheusmetrics/aggregator.go +++ b/coderd/prometheusmetrics/aggregator.go @@ -249,13 +249,21 @@ func (ma *MetricsAggregator) Collect(ch chan<- prometheus.Metric) { } } -func (ma *MetricsAggregator) Update(ctx context.Context, username, workspaceName, agentName, templateName string, metrics []agentsdk.AgentMetric) { +// UpdateAgentMetricsLabels are the labels used to decorate an agent's metrics. +type UpdateAgentMetricsLabels struct { + Username string + WorkspaceName string + AgentName string + TemplateName string +} + +func (ma *MetricsAggregator) Update(ctx context.Context, labels UpdateAgentMetricsLabels, metrics []agentsdk.AgentMetric) { select { case ma.updateCh <- updateRequest{ - username: username, - workspaceName: workspaceName, - agentName: agentName, - templateName: templateName, + username: labels.Username, + workspaceName: labels.WorkspaceName, + agentName: labels.AgentName, + templateName: labels.TemplateName, metrics: metrics, timestamp: time.Now(), diff --git a/coderd/workspaceagents.go b/coderd/workspaceagents.go index dfa20294a3a58..2d25916c05b27 100644 --- a/coderd/workspaceagents.go +++ b/coderd/workspaceagents.go @@ -30,7 +30,6 @@ import ( "tailscale.com/tailcfg" "cdr.dev/slog" - "github.com/coder/coder/v2/coderd/autobuild" "github.com/coder/coder/v2/coderd/database" "github.com/coder/coder/v2/coderd/database/dbauthz" @@ -38,6 +37,7 @@ import ( "github.com/coder/coder/v2/coderd/externalauth" "github.com/coder/coder/v2/coderd/httpapi" "github.com/coder/coder/v2/coderd/httpmw" + "github.com/coder/coder/v2/coderd/prometheusmetrics" "github.com/coder/coder/v2/coderd/rbac" "github.com/coder/coder/v2/coderd/util/ptr" "github.com/coder/coder/v2/codersdk" @@ -1726,7 +1726,12 @@ func (api *API) workspaceAgentReportStats(rw http.ResponseWriter, r *http.Reques return xerrors.Errorf("can't get user: %w", err) } - api.Options.UpdateAgentMetrics(ctx, user.Username, workspace.Name, workspaceAgent.Name, row.TemplateName, req.Metrics) + api.Options.UpdateAgentMetrics(ctx, prometheusmetrics.UpdateAgentMetricsLabels{ + Username: user.Username, + WorkspaceName: workspace.Name, + AgentName: workspaceAgent.Name, + TemplateName: row.TemplateName, + }, req.Metrics) return nil }) } From dd324015346830e5dc84579e1922c9aa420c1f70 Mon Sep 17 00:00:00 2001 From: Steven Masley Date: Wed, 13 Dec 2023 09:15:47 -0600 Subject: [PATCH 13/19] Change the name of the labels argument struct --- coderd/coderd.go | 2 +- coderd/prometheusmetrics/aggregator.go | 19 ++++++++++--------- coderd/workspaceagents.go | 2 +- 3 files changed, 12 insertions(+), 11 deletions(-) diff --git a/coderd/coderd.go b/coderd/coderd.go index 3985d0fd447d6..0fc27139f2efd 100644 --- a/coderd/coderd.go +++ b/coderd/coderd.go @@ -170,7 +170,7 @@ type Options struct { HTTPClient *http.Client - UpdateAgentMetrics func(ctx context.Context, labels prometheusmetrics.UpdateAgentMetricsLabels, metrics []agentsdk.AgentMetric) + UpdateAgentMetrics func(ctx context.Context, labels prometheusmetrics.AgentMetricLabels, metrics []agentsdk.AgentMetric) StatsBatcher *batchstats.Batcher WorkspaceAppsStatsCollectorOptions workspaceapps.StatsCollectorOptions diff --git a/coderd/prometheusmetrics/aggregator.go b/coderd/prometheusmetrics/aggregator.go index c466a3c61d227..d3d19bf2391a7 100644 --- a/coderd/prometheusmetrics/aggregator.go +++ b/coderd/prometheusmetrics/aggregator.go @@ -232,6 +232,15 @@ func (*MetricsAggregator) Describe(_ chan<- *prometheus.Desc) { var agentMetricsLabels = []string{usernameLabel, workspaceNameLabel, agentNameLabel, templateNameLabel} +// AgentMetricLabels are the labels used to decorate an agent's metrics. +// This list should match the list of labels in agentMetricsLabels. +type AgentMetricLabels struct { + Username string + WorkspaceName string + AgentName string + TemplateName string +} + func (ma *MetricsAggregator) Collect(ch chan<- prometheus.Metric) { output := make(chan []prometheus.Metric, 1) @@ -249,15 +258,7 @@ func (ma *MetricsAggregator) Collect(ch chan<- prometheus.Metric) { } } -// UpdateAgentMetricsLabels are the labels used to decorate an agent's metrics. -type UpdateAgentMetricsLabels struct { - Username string - WorkspaceName string - AgentName string - TemplateName string -} - -func (ma *MetricsAggregator) Update(ctx context.Context, labels UpdateAgentMetricsLabels, metrics []agentsdk.AgentMetric) { +func (ma *MetricsAggregator) Update(ctx context.Context, labels AgentMetricLabels, metrics []agentsdk.AgentMetric) { select { case ma.updateCh <- updateRequest{ username: labels.Username, diff --git a/coderd/workspaceagents.go b/coderd/workspaceagents.go index 2d25916c05b27..daabb19479f9b 100644 --- a/coderd/workspaceagents.go +++ b/coderd/workspaceagents.go @@ -1726,7 +1726,7 @@ func (api *API) workspaceAgentReportStats(rw http.ResponseWriter, r *http.Reques return xerrors.Errorf("can't get user: %w", err) } - api.Options.UpdateAgentMetrics(ctx, prometheusmetrics.UpdateAgentMetricsLabels{ + api.Options.UpdateAgentMetrics(ctx, prometheusmetrics.AgentMetricLabels{ Username: user.Username, WorkspaceName: workspace.Name, AgentName: workspaceAgent.Name, From b4a3607762c98a279fd75c3694e1ddd7304e2f31 Mon Sep 17 00:00:00 2001 From: Steven Masley Date: Wed, 13 Dec 2023 09:17:22 -0600 Subject: [PATCH 14/19] Clarify comment --- agent/agentscripts/agentscripts.go | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/agent/agentscripts/agentscripts.go b/agent/agentscripts/agentscripts.go index 045af8461ead2..e72d8120bbe58 100644 --- a/agent/agentscripts/agentscripts.go +++ b/agent/agentscripts/agentscripts.go @@ -79,8 +79,9 @@ type Runner struct { initialized atomic.Bool scripts []codersdk.WorkspaceAgentScript - // Metrics - // scripts includes scripts that are scheduled. + // scriptsExecuted includes all scripts executed by the workspace agent. + // Agents execute startup scripts, and scripts on a cron schedule. Both of + // which will increment this counter. scriptsExecuted *prometheus.CounterVec } From c5da250cc9b77ddb6ee8ca0dfb9ab5a43db7f3d4 Mon Sep 17 00:00:00 2001 From: Steven Masley Date: Wed, 13 Dec 2023 09:18:14 -0600 Subject: [PATCH 15/19] Remove extra words --- agent/agentscripts/agentscripts.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/agent/agentscripts/agentscripts.go b/agent/agentscripts/agentscripts.go index e72d8120bbe58..8b3aaf9a22c3d 100644 --- a/agent/agentscripts/agentscripts.go +++ b/agent/agentscripts/agentscripts.go @@ -79,9 +79,9 @@ type Runner struct { initialized atomic.Bool scripts []codersdk.WorkspaceAgentScript - // scriptsExecuted includes all scripts executed by the workspace agent. - // Agents execute startup scripts, and scripts on a cron schedule. Both of - // which will increment this counter. + // scriptsExecuted includes all scripts executed by the workspace agent. Agents + // execute startup scripts, and scripts on a cron schedule. Both will increment + // this counter. scriptsExecuted *prometheus.CounterVec } From eb1d173bd2a7eb394ef65f559f80eecff6a70301 Mon Sep 17 00:00:00 2001 From: Steven Masley Date: Wed, 13 Dec 2023 09:19:22 -0600 Subject: [PATCH 16/19] measure workspace startup closer to script finish in lines of code --- agent/agent.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/agent/agent.go b/agent/agent.go index 43fa2f1ccd969..4a7b9a827b187 100644 --- a/agent/agent.go +++ b/agent/agent.go @@ -755,6 +755,8 @@ func (a *agent) run(ctx context.Context) error { err := a.scriptRunner.Execute(ctx, func(script codersdk.WorkspaceAgentScript) bool { return script.RunOnStart }) + // Measure the time immediately after the script has finished + dur := time.Since(start).Seconds() if err != nil { a.logger.Warn(ctx, "startup script(s) failed", slog.Error(err)) if errors.Is(err, agentscripts.ErrTimeout) { @@ -766,7 +768,6 @@ func (a *agent) run(ctx context.Context) error { a.setLifecycle(ctx, codersdk.WorkspaceAgentLifecycleReady) } - dur := time.Since(start).Seconds() label := "false" if err == nil { label = "true" From e8034367800b96548f1a9ac93f2ea22d7e99fa63 Mon Sep 17 00:00:00 2001 From: Steven Masley Date: Wed, 13 Dec 2023 09:34:07 -0600 Subject: [PATCH 17/19] Fix unit test call args --- coderd/prometheusmetrics/aggregator_test.go | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/coderd/prometheusmetrics/aggregator_test.go b/coderd/prometheusmetrics/aggregator_test.go index db6a8e233755d..2318622ef14c3 100644 --- a/coderd/prometheusmetrics/aggregator_test.go +++ b/coderd/prometheusmetrics/aggregator_test.go @@ -27,6 +27,15 @@ const ( testTemplateName = "main-template" ) +var ( + testLabels = prometheusmetrics.AgentMetricLabels{ + Username: testUsername, + WorkspaceName: testWorkspaceName, + AgentName: testAgentName, + TemplateName: testTemplateName, + } +) + func TestUpdateMetrics_MetricsDoNotExpire(t *testing.T) { t.Parallel() @@ -79,8 +88,8 @@ func TestUpdateMetrics_MetricsDoNotExpire(t *testing.T) { } // when - metricsAggregator.Update(ctx, testUsername, testWorkspaceName, testAgentName, testTemplateName, given1) - metricsAggregator.Update(ctx, testUsername, testWorkspaceName, testAgentName, testTemplateName, given2) + metricsAggregator.Update(ctx, testLabels, given1) + metricsAggregator.Update(ctx, testLabels, given2) // then require.Eventually(t, func() bool { @@ -163,7 +172,7 @@ func TestUpdateMetrics_MetricsExpire(t *testing.T) { } // when - metricsAggregator.Update(ctx, testUsername, testWorkspaceName, testAgentName, testTemplateName, given) + metricsAggregator.Update(ctx, testLabels, given) time.Sleep(time.Millisecond * 10) // Ensure that metric is expired @@ -229,7 +238,7 @@ func Benchmark_MetricsAggregator_Run(b *testing.B) { b.Logf("N=%d sending %d metrics", b.N, numMetrics) var nGot atomic.Int64 b.StartTimer() - metricsAggregator.Update(ctx, testUsername, testWorkspaceName, testAgentName, testTemplateName, metrics) + metricsAggregator.Update(ctx, testLabels, metrics) for i := 0; i < numMetrics; i++ { select { case <-ctx.Done(): From 99483fe370261e7d8ab57fe1f998d5faadb03e95 Mon Sep 17 00:00:00 2001 From: Steven Masley Date: Wed, 13 Dec 2023 09:44:47 -0600 Subject: [PATCH 18/19] Make gen --- docs/admin/prometheus.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/admin/prometheus.md b/docs/admin/prometheus.md index 9bf2e662c10f6..06bed3bd222a1 100644 --- a/docs/admin/prometheus.md +++ b/docs/admin/prometheus.md @@ -79,7 +79,7 @@ spec: | Name | Type | Description | Labels | -|-------------------------------------------------------| --------- | ------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------- | +| ----------------------------------------------------- | --------- | ------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------- | | `agent_scripts_executed_total` | counter | Total number of scripts executed by the Coder agent. Includes cron scheduled scripts. | `agent_name` `success` `template_name` `username` `workspace_name` | | `coderd_agents_apps` | gauge | Agent applications with statuses. | `agent_name` `app_name` `health` `username` `workspace_name` | | `coderd_agents_connection_latencies_seconds` | gauge | Agent connection latencies in seconds. | `agent_name` `derp_region` `preferred` `username` `workspace_name` | From 2f89b6293c0796e9b8928aab7061139c7ae33dba Mon Sep 17 00:00:00 2001 From: Steven Masley Date: Wed, 13 Dec 2023 09:56:49 -0600 Subject: [PATCH 19/19] Formatting --- coderd/prometheusmetrics/aggregator_test.go | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/coderd/prometheusmetrics/aggregator_test.go b/coderd/prometheusmetrics/aggregator_test.go index 2318622ef14c3..ec305b9d44f14 100644 --- a/coderd/prometheusmetrics/aggregator_test.go +++ b/coderd/prometheusmetrics/aggregator_test.go @@ -27,14 +27,12 @@ const ( testTemplateName = "main-template" ) -var ( - testLabels = prometheusmetrics.AgentMetricLabels{ - Username: testUsername, - WorkspaceName: testWorkspaceName, - AgentName: testAgentName, - TemplateName: testTemplateName, - } -) +var testLabels = prometheusmetrics.AgentMetricLabels{ + Username: testUsername, + WorkspaceName: testWorkspaceName, + AgentName: testAgentName, + TemplateName: testTemplateName, +} func TestUpdateMetrics_MetricsDoNotExpire(t *testing.T) { t.Parallel()