diff --git a/cli/server.go b/cli/server.go
index 7d4261a2e2a7f..c93064f34c8ef 100644
--- a/cli/server.go
+++ b/cli/server.go
@@ -889,6 +889,15 @@ func (r *RootCmd) Server(newAPI func(context.Context, *coderd.Options) (*coderd.
 			return xerrors.Errorf("create coder API: %w", err)
 		}
 
+		if cfg.Prometheus.Enable {
+			// Agent metrics require a reference to the tailnet coordinator, so they must be initialized after the Coder API.
+			closeAgentsFunc, err := prometheusmetrics.Agents(ctx, logger, options.PrometheusRegistry, coderAPI.Database, &coderAPI.TailnetCoordinator, options.DERPMap, coderAPI.Options.AgentInactiveDisconnectTimeout, 0)
+			if err != nil {
+				return xerrors.Errorf("register agents prometheus metric: %w", err)
+			}
+			defer closeAgentsFunc()
+		}
+
 		client := codersdk.New(localURL)
 		if localURL.Scheme == "https" && isLocalhost(localURL.Hostname()) {
 			// The certificate will likely be self-signed or for a different
diff --git a/coderd/prometheusmetrics/collector.go b/coderd/prometheusmetrics/collector.go
new file mode 100644
index 0000000000000..8839553a1ffdd
--- /dev/null
+++ b/coderd/prometheusmetrics/collector.go
@@ -0,0 +1,95 @@
+package prometheusmetrics
+
+import (
+	"sync"
+
+	"github.com/prometheus/client_golang/prometheus"
+)
+
+// CachedGaugeVec is a wrapper around prometheus.GaugeVec which allows
+// staging changes in the metrics vector. Calling "WithLabelValues(...)"
+// will update the internal gauge value, but it will not be returned by
+// "Collect(...)" until the "Commit()" method is called. The "Commit()" method
+// resets the internal gauge and applies all staged changes to it.
+//
+// The use of CachedGaugeVec is recommended when there is a risk
+// that the Prometheus collector receives incomplete metrics, collected
+// in the middle of metrics recalculation, between "Reset()" and the last
+// "WithLabelValues()" call.
+type CachedGaugeVec struct {
+	m sync.Mutex
+
+	gaugeVec *prometheus.GaugeVec
+	records  []vectorRecord
+}
+
+var _ prometheus.Collector = new(CachedGaugeVec)
+
+type VectorOperation int
+
+const (
+	VectorOperationAdd VectorOperation = iota
+	VectorOperationSet
+)
+
+type vectorRecord struct {
+	operation   VectorOperation
+	value       float64
+	labelValues []string
+}
+
+func NewCachedGaugeVec(gaugeVec *prometheus.GaugeVec) *CachedGaugeVec {
+	return &CachedGaugeVec{
+		gaugeVec: gaugeVec,
+	}
+}
+
+func (v *CachedGaugeVec) Describe(desc chan<- *prometheus.Desc) {
+	v.gaugeVec.Describe(desc)
+}
+
+func (v *CachedGaugeVec) Collect(ch chan<- prometheus.Metric) {
+	v.m.Lock()
+	defer v.m.Unlock()
+
+	v.gaugeVec.Collect(ch)
+}
+
+func (v *CachedGaugeVec) WithLabelValues(operation VectorOperation, value float64, labelValues ...string) {
+	switch operation {
+	case VectorOperationAdd:
+	case VectorOperationSet:
+	default:
+		panic("unsupported vector operation")
+	}
+
+	v.m.Lock()
+	defer v.m.Unlock()
+
+	v.records = append(v.records, vectorRecord{
+		operation:   operation,
+		value:       value,
+		labelValues: labelValues,
+	})
+}
+
+// Commit applies the staged records to the internal gauge, making them the
+// cached values returned from "Collect()". The gauge is completely reset
+// first and the staged records are cleared afterwards, so the caller must
+// stage a full set of "WithLabelValues" calls before every commit.
+func (v *CachedGaugeVec) Commit() {
+	v.m.Lock()
+	defer v.m.Unlock()
+
+	v.gaugeVec.Reset()
+	for _, record := range v.records {
+		g := v.gaugeVec.WithLabelValues(record.labelValues...)
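+		// Replay each staged record onto the freshly reset gauge: "Add"
+		// accumulates across records sharing the same label values, while
+		// "Set" overwrites whatever was staged before it.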
+ switch record.operation { + case VectorOperationAdd: + g.Add(record.value) + case VectorOperationSet: + g.Set(record.value) + } + } + + v.records = nil +} diff --git a/coderd/prometheusmetrics/collector_test.go b/coderd/prometheusmetrics/collector_test.go new file mode 100644 index 0000000000000..9d63f6669113d --- /dev/null +++ b/coderd/prometheusmetrics/collector_test.go @@ -0,0 +1,140 @@ +package prometheusmetrics_test + +import ( + "sort" + "testing" + + "github.com/prometheus/client_golang/prometheus" + dto "github.com/prometheus/client_model/go" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/coder/coder/coderd/prometheusmetrics" +) + +func TestCollector_Add(t *testing.T) { + t.Parallel() + + // given + agentsGauge := prometheusmetrics.NewCachedGaugeVec(prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: "coderd", + Subsystem: "agents", + Name: "up", + Help: "The number of active agents per workspace.", + }, []string{"username", "workspace_name"})) + + // when + agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationAdd, 7, "first user", "my workspace") + agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationAdd, 23, "second user", "your workspace") + agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationAdd, 1, "first user", "my workspace") + agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationAdd, 25, "second user", "your workspace") + agentsGauge.Commit() + + // then + ch := make(chan prometheus.Metric, 2) + agentsGauge.Collect(ch) + + metrics := collectAndSortMetrics(t, agentsGauge, 2) + + assert.Equal(t, "first user", metrics[0].Label[0].GetValue()) // Username + assert.Equal(t, "my workspace", metrics[0].Label[1].GetValue()) // Workspace name + assert.Equal(t, 8, int(metrics[0].Gauge.GetValue())) // Metric value + + assert.Equal(t, "second user", metrics[1].Label[0].GetValue()) // Username + assert.Equal(t, "your workspace", metrics[1].Label[1].GetValue()) // Workspace name + assert.Equal(t, 48, int(metrics[1].Gauge.GetValue())) // Metric value +} + +func TestCollector_Set(t *testing.T) { + t.Parallel() + + // given + agentsGauge := prometheusmetrics.NewCachedGaugeVec(prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: "coderd", + Subsystem: "agents", + Name: "up", + Help: "The number of active agents per workspace.", + }, []string{"username", "workspace_name"})) + + // when + agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationSet, 3, "first user", "my workspace") + agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationSet, 4, "second user", "your workspace") + agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationSet, 5, "first user", "my workspace") + agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationSet, 6, "second user", "your workspace") + agentsGauge.Commit() + + // then + ch := make(chan prometheus.Metric, 2) + agentsGauge.Collect(ch) + + metrics := collectAndSortMetrics(t, agentsGauge, 2) + + assert.Equal(t, "first user", metrics[0].Label[0].GetValue()) // Username + assert.Equal(t, "my workspace", metrics[0].Label[1].GetValue()) // Workspace name + assert.Equal(t, 5, int(metrics[0].Gauge.GetValue())) // Metric value + + assert.Equal(t, "second user", metrics[1].Label[0].GetValue()) // Username + assert.Equal(t, "your workspace", metrics[1].Label[1].GetValue()) // Workspace name + assert.Equal(t, 6, int(metrics[1].Gauge.GetValue())) // Metric value +} + +func TestCollector_Set_Add(t *testing.T) { + t.Parallel() + + // given + agentsGauge := 
prometheusmetrics.NewCachedGaugeVec(prometheus.NewGaugeVec(prometheus.GaugeOpts{
+		Namespace: "coderd",
+		Subsystem: "agents",
+		Name:      "up",
+		Help:      "The number of active agents per workspace.",
+	}, []string{"username", "workspace_name"}))
+
+	// when
+	agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationAdd, 9, "first user", "my workspace")
+	agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationAdd, 8, "second user", "your workspace")
+	agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationAdd, 7, "first user", "my workspace")
+	agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationAdd, 6, "second user", "your workspace")
+	agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationSet, 5, "first user", "my workspace")
+	agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationSet, 4, "second user", "your workspace")
+	agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationAdd, 3, "first user", "my workspace")
+	agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationAdd, 2, "second user", "your workspace")
+	agentsGauge.Commit()
+
+	// then
+	ch := make(chan prometheus.Metric, 2)
+	agentsGauge.Collect(ch)
+
+	metrics := collectAndSortMetrics(t, agentsGauge, 2)
+
+	assert.Equal(t, "first user", metrics[0].Label[0].GetValue())    // Username
+	assert.Equal(t, "my workspace", metrics[0].Label[1].GetValue())  // Workspace name
+	assert.Equal(t, 8, int(metrics[0].Gauge.GetValue()))             // Metric value
+
+	assert.Equal(t, "second user", metrics[1].Label[0].GetValue())    // Username
+	assert.Equal(t, "your workspace", metrics[1].Label[1].GetValue()) // Workspace name
+	assert.Equal(t, 6, int(metrics[1].Gauge.GetValue()))              // Metric value
+}
+
+func collectAndSortMetrics(t *testing.T, collector prometheus.Collector, count int) []dto.Metric {
+	ch := make(chan prometheus.Metric, count)
+	defer close(ch)
+
+	var metrics []dto.Metric
+
+	collector.Collect(ch)
+	for i := 0; i < count; i++ {
+		m := <-ch
+
+		var metric dto.Metric
+		err := m.Write(&metric)
+		require.NoError(t, err)
+
+		metrics = append(metrics, metric)
+	}
+
+	// Ensure the metrics always come back in the same order.
+	sort.Slice(metrics, func(i, j int) bool {
+		return metrics[i].Label[0].GetValue() < metrics[j].Label[0].GetValue()
+	})
+	return metrics
+}
diff --git a/coderd/prometheusmetrics/prometheusmetrics.go b/coderd/prometheusmetrics/prometheusmetrics.go
index 536522bf73e04..83e4af90d0765 100644
--- a/coderd/prometheusmetrics/prometheusmetrics.go
+++ b/coderd/prometheusmetrics/prometheusmetrics.go
@@ -2,13 +2,24 @@ package prometheusmetrics
 
 import (
 	"context"
+	"database/sql"
+	"errors"
+	"fmt"
+	"strconv"
+	"strings"
+	"sync/atomic"
 	"time"
 
 	"github.com/google/uuid"
 	"github.com/prometheus/client_golang/prometheus"
+	"tailscale.com/tailcfg"
+
+	"cdr.dev/slog"
 
 	"github.com/coder/coder/coderd"
 	"github.com/coder/coder/coderd/database"
+	"github.com/coder/coder/coderd/database/dbauthz"
+	"github.com/coder/coder/tailnet"
 )
 
 // ActiveUsers tracks the number of users that have authenticated within the past hour.
@@ -106,3 +117,175 @@ func Workspaces(ctx context.Context, registerer prometheus.Registerer, db databa
 	}()
 	return cancelFunc, nil
 }
+
+// Agents tracks the total number of workspace agents, along with their
+// connection statuses, connection latencies, and application healths.
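+// The duration argument controls how often the metrics are recalculated; a
+// zero value falls back to the 1-minute default. The returned
+// context.CancelFunc stops the collection goroutine.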
+func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Registerer, db database.Store, coordinator *atomic.Pointer[tailnet.Coordinator], derpMap *tailcfg.DERPMap, agentInactiveDisconnectTimeout, duration time.Duration) (context.CancelFunc, error) { + if duration == 0 { + duration = 1 * time.Minute + } + + agentsGauge := NewCachedGaugeVec(prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: "coderd", + Subsystem: "agents", + Name: "up", + Help: "The number of active agents per workspace.", + }, []string{"username", "workspace_name"})) + err := registerer.Register(agentsGauge) + if err != nil { + return nil, err + } + + agentsConnectionsGauge := NewCachedGaugeVec(prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: "coderd", + Subsystem: "agents", + Name: "connections", + Help: "Agent connections with statuses.", + }, []string{"agent_name", "username", "workspace_name", "status", "lifecycle_state", "tailnet_node"})) + err = registerer.Register(agentsConnectionsGauge) + if err != nil { + return nil, err + } + + agentsConnectionLatenciesGauge := NewCachedGaugeVec(prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: "coderd", + Subsystem: "agents", + Name: "connection_latencies_seconds", + Help: "Agent connection latencies in seconds.", + }, []string{"agent_id", "username", "workspace_name", "derp_region", "preferred"})) + err = registerer.Register(agentsConnectionLatenciesGauge) + if err != nil { + return nil, err + } + + agentsAppsGauge := NewCachedGaugeVec(prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: "coderd", + Subsystem: "agents", + Name: "apps", + Help: "Agent applications with statuses.", + }, []string{"agent_name", "username", "workspace_name", "app_name", "health"})) + err = registerer.Register(agentsAppsGauge) + if err != nil { + return nil, err + } + + metricsCollectorAgents := prometheus.NewHistogram(prometheus.HistogramOpts{ + Namespace: "coderd", + Subsystem: "prometheusmetrics", + Name: "agents_execution_seconds", + Help: "Histogram for duration of agents metrics collection in seconds.", + Buckets: []float64{0.001, 0.005, 0.010, 0.025, 0.050, 0.100, 0.500, 1, 5, 10, 30}, + }) + err = registerer.Register(metricsCollectorAgents) + if err != nil { + return nil, err + } + + // nolint:gocritic // Prometheus must collect metrics for all Coder users. 
+ ctx, cancelFunc := context.WithCancel(dbauthz.AsSystemRestricted(ctx)) + ticker := time.NewTicker(duration) + go func() { + defer ticker.Stop() + for { + select { + case <-ctx.Done(): + return + case <-ticker.C: + } + + logger.Debug(ctx, "Agent metrics collection is starting") + timer := prometheus.NewTimer(metricsCollectorAgents) + + workspaceRows, err := db.GetWorkspaces(ctx, database.GetWorkspacesParams{ + AgentInactiveDisconnectTimeoutSeconds: int64(agentInactiveDisconnectTimeout.Seconds()), + }) + if err != nil { + logger.Error(ctx, "can't get workspace rows", slog.Error(err)) + continue + } + + for _, workspace := range workspaceRows { + user, err := db.GetUserByID(ctx, workspace.OwnerID) + if err != nil { + logger.Error(ctx, "can't get user", slog.F("user_id", workspace.OwnerID), slog.Error(err)) + agentsGauge.WithLabelValues(VectorOperationAdd, 0, user.Username, workspace.Name) + continue + } + + agents, err := db.GetWorkspaceAgentsInLatestBuildByWorkspaceID(ctx, workspace.ID) + if err != nil { + logger.Error(ctx, "can't get workspace agents", slog.F("workspace_id", workspace.ID), slog.Error(err)) + agentsGauge.WithLabelValues(VectorOperationAdd, 0, user.Username, workspace.Name) + continue + } + + if len(agents) == 0 { + logger.Debug(ctx, "workspace agents are unavailable", slog.F("workspace_id", workspace.ID)) + agentsGauge.WithLabelValues(VectorOperationAdd, 0, user.Username, workspace.Name) + continue + } + + for _, agent := range agents { + // Collect information about agents + agentsGauge.WithLabelValues(VectorOperationAdd, 1, user.Username, workspace.Name) + + connectionStatus := agent.Status(agentInactiveDisconnectTimeout) + node := (*coordinator.Load()).Node(agent.ID) + + tailnetNode := "unknown" + if node != nil { + tailnetNode = node.ID.String() + } + + agentsConnectionsGauge.WithLabelValues(VectorOperationSet, 1, agent.Name, user.Username, workspace.Name, string(connectionStatus.Status), string(agent.LifecycleState), tailnetNode) + + if node == nil { + logger.Debug(ctx, "can't read in-memory node for agent", slog.F("agent_id", agent.ID)) + } else { + // Collect information about connection latencies + for rawRegion, latency := range node.DERPLatency { + regionParts := strings.SplitN(rawRegion, "-", 2) + regionID, err := strconv.Atoi(regionParts[0]) + if err != nil { + logger.Error(ctx, "can't convert DERP region", slog.F("agent_id", agent.ID), slog.F("raw_region", rawRegion), slog.Error(err)) + continue + } + + region, found := derpMap.Regions[regionID] + if !found { + // It's possible that a workspace agent is using an old DERPMap + // and reports regions that do not exist. If that's the case, + // report the region as unknown! 
+								region = &tailcfg.DERPRegion{
+									RegionID:   regionID,
+									RegionName: fmt.Sprintf("Unnamed %d", regionID),
+								}
+							}
+
+							agentsConnectionLatenciesGauge.WithLabelValues(VectorOperationSet, latency, agent.Name, user.Username, workspace.Name, region.RegionName, fmt.Sprintf("%v", node.PreferredDERP == regionID))
+						}
+					}
+
+					// Collect information about registered applications
+					apps, err := db.GetWorkspaceAppsByAgentID(ctx, agent.ID)
+					if err != nil && !errors.Is(err, sql.ErrNoRows) {
+						logger.Error(ctx, "can't get workspace apps", slog.F("agent_id", agent.ID), slog.Error(err))
+						continue
+					}
+
+					for _, app := range apps {
+						agentsAppsGauge.WithLabelValues(VectorOperationAdd, 1, agent.Name, user.Username, workspace.Name, app.DisplayName, string(app.Health))
+					}
+				}
+			}
+
+			agentsGauge.Commit()
+			agentsConnectionsGauge.Commit()
+			agentsConnectionLatenciesGauge.Commit()
+			agentsAppsGauge.Commit()
+
+			logger.Debug(ctx, "Agent metrics collection is done")
+			// ObserveDuration records the elapsed time into metricsCollectorAgents exactly once.
+			timer.ObserveDuration()
+		}
+	}()
+	return cancelFunc, nil
+}
diff --git a/coderd/prometheusmetrics/prometheusmetrics_test.go b/coderd/prometheusmetrics/prometheusmetrics_test.go
index a0b375ccf8622..e765c5f2a1128 100644
--- a/coderd/prometheusmetrics/prometheusmetrics_test.go
+++ b/coderd/prometheusmetrics/prometheusmetrics_test.go
@@ -3,6 +3,7 @@ package prometheusmetrics_test
 import (
 	"context"
 	"database/sql"
+	"sync/atomic"
 	"testing"
 	"time"
 
@@ -11,11 +12,18 @@ import (
 	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/require"
 
+	"cdr.dev/slog/sloggers/slogtest"
+
+	"github.com/coder/coder/coderd/coderdtest"
 	"github.com/coder/coder/coderd/database"
 	"github.com/coder/coder/coderd/database/dbfake"
 	"github.com/coder/coder/coderd/database/dbgen"
 	"github.com/coder/coder/coderd/prometheusmetrics"
 	"github.com/coder/coder/codersdk"
+	"github.com/coder/coder/provisioner/echo"
+	"github.com/coder/coder/provisionersdk/proto"
+	"github.com/coder/coder/tailnet"
+	"github.com/coder/coder/tailnet/tailnettest"
 	"github.com/coder/coder/testutil"
 )
 
@@ -239,3 +247,108 @@ func TestWorkspaces(t *testing.T) {
 		})
 	}
 }
+
+func TestAgents(t *testing.T) {
+	t.Parallel()
+
+	// Build a sample workspace with a test agent and a fake application
+	client, _, api := coderdtest.NewWithAPI(t, &coderdtest.Options{IncludeProvisionerDaemon: true})
+	db := api.Database
+
+	user := coderdtest.CreateFirstUser(t, client)
+	version := coderdtest.CreateTemplateVersion(t, client, user.OrganizationID, &echo.Responses{
+		Parse:         echo.ParseComplete,
+		ProvisionPlan: echo.ProvisionComplete,
+		ProvisionApply: []*proto.Provision_Response{{
+			Type: &proto.Provision_Response_Complete{
+				Complete: &proto.Provision_Complete{
+					Resources: []*proto.Resource{{
+						Name: "example",
+						Type: "aws_instance",
+						Agents: []*proto.Agent{{
+							Id:        uuid.NewString(),
+							Name:      "testagent",
+							Directory: t.TempDir(),
+							Auth: &proto.Agent_Token{
+								Token: uuid.NewString(),
+							},
+							Apps: []*proto.App{
+								{
+									Slug:         "fake-app",
+									DisplayName:  "Fake application",
+									SharingLevel: proto.AppSharingLevel_OWNER,
+									// Hopefully this IP and port don't exist.
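+									// Without a healthcheck configured, the app reports its
+									// health as "disabled", which the assertions below expect.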
+ Url: "http://127.1.0.1:65535", + }, + }, + }}, + }}, + }, + }, + }}, + }) + template := coderdtest.CreateTemplate(t, client, user.OrganizationID, version.ID) + coderdtest.AwaitTemplateVersionJob(t, client, version.ID) + workspace := coderdtest.CreateWorkspace(t, client, user.OrganizationID, template.ID) + coderdtest.AwaitWorkspaceBuildJob(t, client, workspace.LatestBuild.ID) + + // given + coordinator := tailnet.NewCoordinator() + coordinatorPtr := atomic.Pointer[tailnet.Coordinator]{} + coordinatorPtr.Store(&coordinator) + derpMap := tailnettest.RunDERPAndSTUN(t) + agentInactiveDisconnectTimeout := 1 * time.Hour // don't need to focus on this value in tests + registry := prometheus.NewRegistry() + + // when + cancel, err := prometheusmetrics.Agents(context.Background(), slogtest.Make(t, nil), registry, db, &coordinatorPtr, derpMap, agentInactiveDisconnectTimeout, time.Millisecond) + t.Cleanup(cancel) + + // then + require.NoError(t, err) + + var agentsUp bool + var agentsConnections bool + var agentsApps bool + var agentsExecutionInSeconds bool + require.Eventually(t, func() bool { + metrics, err := registry.Gather() + assert.NoError(t, err) + + if len(metrics) < 1 { + return false + } + + for _, metric := range metrics { + switch metric.GetName() { + case "coderd_agents_up": + assert.Equal(t, "testuser", metric.Metric[0].Label[0].GetValue()) // Username + assert.Equal(t, workspace.Name, metric.Metric[0].Label[1].GetValue()) // Workspace name + assert.Equal(t, 1, int(metric.Metric[0].Gauge.GetValue())) // Metric value + agentsUp = true + case "coderd_agents_connections": + assert.Equal(t, "testagent", metric.Metric[0].Label[0].GetValue()) // Agent name + assert.Equal(t, "created", metric.Metric[0].Label[1].GetValue()) // Lifecycle state + assert.Equal(t, "connecting", metric.Metric[0].Label[2].GetValue()) // Status + assert.Equal(t, "unknown", metric.Metric[0].Label[3].GetValue()) // Tailnet node + assert.Equal(t, "testuser", metric.Metric[0].Label[4].GetValue()) // Username + assert.Equal(t, workspace.Name, metric.Metric[0].Label[5].GetValue()) // Workspace name + assert.Equal(t, 1, int(metric.Metric[0].Gauge.GetValue())) // Metric value + agentsConnections = true + case "coderd_agents_apps": + assert.Equal(t, "testagent", metric.Metric[0].Label[0].GetValue()) // Agent name + assert.Equal(t, "Fake application", metric.Metric[0].Label[1].GetValue()) // App name + assert.Equal(t, "disabled", metric.Metric[0].Label[2].GetValue()) // Health + assert.Equal(t, "testuser", metric.Metric[0].Label[3].GetValue()) // Username + assert.Equal(t, workspace.Name, metric.Metric[0].Label[4].GetValue()) // Workspace name + assert.Equal(t, 1, int(metric.Metric[0].Gauge.GetValue())) // Metric value + agentsApps = true + case "coderd_prometheusmetrics_agents_execution_seconds": + agentsExecutionInSeconds = true + default: + require.FailNowf(t, "unexpected metric collected", "metric: %s", metric.GetName()) + } + } + return agentsUp && agentsConnections && agentsApps && agentsExecutionInSeconds + }, testutil.WaitShort, testutil.IntervalFast) +} diff --git a/docs/admin/prometheus.md b/docs/admin/prometheus.md index f35ba5d1c5182..2898f8f4a469c 100644 --- a/docs/admin/prometheus.md +++ b/docs/admin/prometheus.md @@ -29,53 +29,58 @@ The environment variable `CODER_PROMETHEUS_ENABLE` will be enabled automatically -| Name | Type | Description | Labels | -| -------------------------------------------- | --------- | ------------------------------------------------------------------ | 
----------------------------------------------------------------------------------- | -| `coderd_api_active_users_duration_hour` | gauge | The number of users that have been active within the last hour. | | -| `coderd_api_concurrent_requests` | gauge | The number of concurrent API requests. | | -| `coderd_api_concurrent_websockets` | gauge | The total number of concurrent API websockets. | | -| `coderd_api_request_latencies_seconds` | histogram | Latency distribution of requests in seconds. | `method` `path` | -| `coderd_api_requests_processed_total` | counter | The total number of processed API requests | `code` `method` `path` | -| `coderd_api_websocket_durations_seconds` | histogram | Websocket duration distribution of requests in seconds. | `path` | -| `coderd_api_workspace_latest_build_total` | gauge | The latest workspace builds with a status. | `status` | -| `coderd_provisionerd_job_timings_seconds` | histogram | The provisioner job time duration in seconds. | `provisioner` `status` | -| `coderd_provisionerd_jobs_current` | gauge | The number of currently running provisioner jobs. | `provisioner` | -| `coderd_workspace_builds_total` | counter | The number of workspaces started, updated, or deleted. | `action` `owner_email` `status` `template_name` `template_version` `workspace_name` | -| `go_gc_duration_seconds` | summary | A summary of the pause duration of garbage collection cycles. | | -| `go_goroutines` | gauge | Number of goroutines that currently exist. | | -| `go_info` | gauge | Information about the Go environment. | `version` | -| `go_memstats_alloc_bytes` | gauge | Number of bytes allocated and still in use. | | -| `go_memstats_alloc_bytes_total` | counter | Total number of bytes allocated, even if freed. | | -| `go_memstats_buck_hash_sys_bytes` | gauge | Number of bytes used by the profiling bucket hash table. | | -| `go_memstats_frees_total` | counter | Total number of frees. | | -| `go_memstats_gc_sys_bytes` | gauge | Number of bytes used for garbage collection system metadata. | | -| `go_memstats_heap_alloc_bytes` | gauge | Number of heap bytes allocated and still in use. | | -| `go_memstats_heap_idle_bytes` | gauge | Number of heap bytes waiting to be used. | | -| `go_memstats_heap_inuse_bytes` | gauge | Number of heap bytes that are in use. | | -| `go_memstats_heap_objects` | gauge | Number of allocated objects. | | -| `go_memstats_heap_released_bytes` | gauge | Number of heap bytes released to OS. | | -| `go_memstats_heap_sys_bytes` | gauge | Number of heap bytes obtained from system. | | -| `go_memstats_last_gc_time_seconds` | gauge | Number of seconds since 1970 of last garbage collection. | | -| `go_memstats_lookups_total` | counter | Total number of pointer lookups. | | -| `go_memstats_mallocs_total` | counter | Total number of mallocs. | | -| `go_memstats_mcache_inuse_bytes` | gauge | Number of bytes in use by mcache structures. | | -| `go_memstats_mcache_sys_bytes` | gauge | Number of bytes used for mcache structures obtained from system. | | -| `go_memstats_mspan_inuse_bytes` | gauge | Number of bytes in use by mspan structures. | | -| `go_memstats_mspan_sys_bytes` | gauge | Number of bytes used for mspan structures obtained from system. | | -| `go_memstats_next_gc_bytes` | gauge | Number of heap bytes when next garbage collection will take place. | | -| `go_memstats_other_sys_bytes` | gauge | Number of bytes used for other system allocations. | | -| `go_memstats_stack_inuse_bytes` | gauge | Number of bytes in use by the stack allocator. 
| |
-| `go_memstats_stack_sys_bytes` | gauge | Number of bytes obtained from system for stack allocator. | |
-| `go_memstats_sys_bytes` | gauge | Number of bytes obtained from system. | |
-| `go_threads` | gauge | Number of OS threads created. | |
-| `process_cpu_seconds_total` | counter | Total user and system CPU time spent in seconds. | |
-| `process_max_fds` | gauge | Maximum number of open file descriptors. | |
-| `process_open_fds` | gauge | Number of open file descriptors. | |
-| `process_resident_memory_bytes` | gauge | Resident memory size in bytes. | |
-| `process_start_time_seconds` | gauge | Start time of the process since unix epoch in seconds. | |
-| `process_virtual_memory_bytes` | gauge | Virtual memory size in bytes. | |
-| `process_virtual_memory_max_bytes` | gauge | Maximum amount of virtual memory available in bytes. | |
-| `promhttp_metric_handler_requests_in_flight` | gauge | Current number of scrapes being served. | |
-| `promhttp_metric_handler_requests_total` | counter | Total number of scrapes by HTTP status code. | `code` |
+| Name | Type | Description | Labels |
+| --------------------------------------------------- | --------- | ------------------------------------------------------------------ | ----------------------------------------------------------------------------------- |
+| `coderd_agents_apps` | gauge | Agent applications with statuses. | `agent_name` `app_name` `health` `username` `workspace_name` |
+| `coderd_agents_connection_latencies_seconds` | gauge | Agent connection latencies in seconds. | `agent_id` `derp_region` `preferred` `username` `workspace_name` |
+| `coderd_agents_connections` | gauge | Agent connections with statuses. | `agent_name` `lifecycle_state` `status` `tailnet_node` `username` `workspace_name` |
+| `coderd_agents_up` | gauge | The number of active agents per workspace. | `username` `workspace_name` |
+| `coderd_api_active_users_duration_hour` | gauge | The number of users that have been active within the last hour. | |
+| `coderd_api_concurrent_requests` | gauge | The number of concurrent API requests. | |
+| `coderd_api_concurrent_websockets` | gauge | The total number of concurrent API websockets. | |
+| `coderd_api_request_latencies_seconds` | histogram | Latency distribution of requests in seconds. | `method` `path` |
+| `coderd_api_requests_processed_total` | counter | The total number of processed API requests | `code` `method` `path` |
+| `coderd_api_websocket_durations_seconds` | histogram | Websocket duration distribution of requests in seconds. | `path` |
+| `coderd_api_workspace_latest_build_total` | gauge | The latest workspace builds with a status. | `status` |
+| `coderd_prometheusmetrics_agents_execution_seconds` | histogram | Histogram for duration of agents metrics collection in seconds. | |
+| `coderd_provisionerd_job_timings_seconds` | histogram | The provisioner job time duration in seconds. | `provisioner` `status` |
+| `coderd_provisionerd_jobs_current` | gauge | The number of currently running provisioner jobs. | `provisioner` |
+| `coderd_workspace_builds_total` | counter | The number of workspaces started, updated, or deleted. | `action` `owner_email` `status` `template_name` `template_version` `workspace_name` |
+| `go_gc_duration_seconds` | summary | A summary of the pause duration of garbage collection cycles. | |
+| `go_goroutines` | gauge | Number of goroutines that currently exist. | |
+| `go_info` | gauge | Information about the Go environment.
| `version` | +| `go_memstats_alloc_bytes` | gauge | Number of bytes allocated and still in use. | | +| `go_memstats_alloc_bytes_total` | counter | Total number of bytes allocated, even if freed. | | +| `go_memstats_buck_hash_sys_bytes` | gauge | Number of bytes used by the profiling bucket hash table. | | +| `go_memstats_frees_total` | counter | Total number of frees. | | +| `go_memstats_gc_sys_bytes` | gauge | Number of bytes used for garbage collection system metadata. | | +| `go_memstats_heap_alloc_bytes` | gauge | Number of heap bytes allocated and still in use. | | +| `go_memstats_heap_idle_bytes` | gauge | Number of heap bytes waiting to be used. | | +| `go_memstats_heap_inuse_bytes` | gauge | Number of heap bytes that are in use. | | +| `go_memstats_heap_objects` | gauge | Number of allocated objects. | | +| `go_memstats_heap_released_bytes` | gauge | Number of heap bytes released to OS. | | +| `go_memstats_heap_sys_bytes` | gauge | Number of heap bytes obtained from system. | | +| `go_memstats_last_gc_time_seconds` | gauge | Number of seconds since 1970 of last garbage collection. | | +| `go_memstats_lookups_total` | counter | Total number of pointer lookups. | | +| `go_memstats_mallocs_total` | counter | Total number of mallocs. | | +| `go_memstats_mcache_inuse_bytes` | gauge | Number of bytes in use by mcache structures. | | +| `go_memstats_mcache_sys_bytes` | gauge | Number of bytes used for mcache structures obtained from system. | | +| `go_memstats_mspan_inuse_bytes` | gauge | Number of bytes in use by mspan structures. | | +| `go_memstats_mspan_sys_bytes` | gauge | Number of bytes used for mspan structures obtained from system. | | +| `go_memstats_next_gc_bytes` | gauge | Number of heap bytes when next garbage collection will take place. | | +| `go_memstats_other_sys_bytes` | gauge | Number of bytes used for other system allocations. | | +| `go_memstats_stack_inuse_bytes` | gauge | Number of bytes in use by the stack allocator. | | +| `go_memstats_stack_sys_bytes` | gauge | Number of bytes obtained from system for stack allocator. | | +| `go_memstats_sys_bytes` | gauge | Number of bytes obtained from system. | | +| `go_threads` | gauge | Number of OS threads created. | | +| `process_cpu_seconds_total` | counter | Total user and system CPU time spent in seconds. | | +| `process_max_fds` | gauge | Maximum number of open file descriptors. | | +| `process_open_fds` | gauge | Number of open file descriptors. | | +| `process_resident_memory_bytes` | gauge | Resident memory size in bytes. | | +| `process_start_time_seconds` | gauge | Start time of the process since unix epoch in seconds. | | +| `process_virtual_memory_bytes` | gauge | Virtual memory size in bytes. | | +| `process_virtual_memory_max_bytes` | gauge | Maximum amount of virtual memory available in bytes. | | +| `promhttp_metric_handler_requests_in_flight` | gauge | Current number of scrapes being served. | | +| `promhttp_metric_handler_requests_total` | counter | Total number of scrapes by HTTP status code. | `code` | diff --git a/scripts/metricsdocgen/metrics b/scripts/metricsdocgen/metrics index 50bbc87990dda..7e598b17abe56 100644 --- a/scripts/metricsdocgen/metrics +++ b/scripts/metricsdocgen/metrics @@ -1,3 +1,23 @@ +# HELP coderd_agents_apps Agent applications with statuses. 
+# TYPE coderd_agents_apps gauge
+coderd_agents_apps{agent_name="main",app_name="code-server",health="healthy",username="admin",workspace_name="workspace-1"} 1
+coderd_agents_apps{agent_name="main",app_name="code-server",health="healthy",username="admin",workspace_name="workspace-2"} 1
+coderd_agents_apps{agent_name="main",app_name="code-server",health="healthy",username="admin",workspace_name="workspace-3"} 1
+# HELP coderd_agents_connection_latencies_seconds Agent connection latencies in seconds.
+# TYPE coderd_agents_connection_latencies_seconds gauge
+coderd_agents_connection_latencies_seconds{agent_id="main",derp_region="Coder Embedded Relay",preferred="true",username="admin",workspace_name="workspace-1"} 0.03018125
+coderd_agents_connection_latencies_seconds{agent_id="main",derp_region="Coder Embedded Relay",preferred="true",username="admin",workspace_name="workspace-2"} 0.028658416
+coderd_agents_connection_latencies_seconds{agent_id="main",derp_region="Coder Embedded Relay",preferred="true",username="admin",workspace_name="workspace-3"} 0.028041416
+# HELP coderd_agents_connections Agent connections with statuses.
+# TYPE coderd_agents_connections gauge
+coderd_agents_connections{agent_name="main",lifecycle_state="ready",status="connected",tailnet_node="nodeid:16966f7df70d8cc5",username="admin",workspace_name="workspace-3"} 1
+coderd_agents_connections{agent_name="main",lifecycle_state="start_timeout",status="connected",tailnet_node="nodeid:3237d00938be23e3",username="admin",workspace_name="workspace-2"} 1
+coderd_agents_connections{agent_name="main",lifecycle_state="start_timeout",status="connected",tailnet_node="nodeid:3779bd45d00be0eb",username="admin",workspace_name="workspace-1"} 1
+# HELP coderd_agents_up The number of active agents per workspace.
+# TYPE coderd_agents_up gauge
+coderd_agents_up{username="admin",workspace_name="workspace-1"} 1
+coderd_agents_up{username="admin",workspace_name="workspace-2"} 1
+coderd_agents_up{username="admin",workspace_name="workspace-3"} 1
 # HELP coderd_api_websocket_durations_seconds Websocket duration distribution of requests in seconds.
 # TYPE coderd_api_websocket_durations_seconds histogram
 coderd_api_websocket_durations_seconds_bucket{path="/api/v2/workspaceagents/me/coordinate",le="0.001"} 0
@@ -568,6 +588,22 @@ coderd_api_requests_processed_total{code="401",method="POST",path="/api/v2/files
 # HELP coderd_api_workspace_latest_build_total The latest workspace builds with a status.
 # TYPE coderd_api_workspace_latest_build_total gauge
 coderd_api_workspace_latest_build_total{status="succeeded"} 1
+# HELP coderd_prometheusmetrics_agents_execution_seconds Histogram for duration of agents metrics collection in seconds.
+# TYPE coderd_prometheusmetrics_agents_execution_seconds histogram
+coderd_prometheusmetrics_agents_execution_seconds_bucket{le="0.001"} 0
+coderd_prometheusmetrics_agents_execution_seconds_bucket{le="0.005"} 0
+coderd_prometheusmetrics_agents_execution_seconds_bucket{le="0.01"} 0
+coderd_prometheusmetrics_agents_execution_seconds_bucket{le="0.025"} 0
+coderd_prometheusmetrics_agents_execution_seconds_bucket{le="0.05"} 2
+coderd_prometheusmetrics_agents_execution_seconds_bucket{le="0.1"} 2
+coderd_prometheusmetrics_agents_execution_seconds_bucket{le="0.5"} 2
+coderd_prometheusmetrics_agents_execution_seconds_bucket{le="1"} 2
+coderd_prometheusmetrics_agents_execution_seconds_bucket{le="5"} 2
+coderd_prometheusmetrics_agents_execution_seconds_bucket{le="10"} 2
+coderd_prometheusmetrics_agents_execution_seconds_bucket{le="30"} 2
+coderd_prometheusmetrics_agents_execution_seconds_bucket{le="+Inf"} 2
+coderd_prometheusmetrics_agents_execution_seconds_sum 0.0592915
+coderd_prometheusmetrics_agents_execution_seconds_count 2
 # HELP coderd_provisionerd_job_timings_seconds The provisioner job time duration in seconds.
 # TYPE coderd_provisionerd_job_timings_seconds histogram
 coderd_provisionerd_job_timings_seconds_bucket{provisioner="terraform",status="success",le="1"} 0
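
Reviewer note: below is a minimal, self-contained sketch of the staged-commit flow that CachedGaugeVec introduces, handy for poking at the collector outside of coderd. Everything in it is illustrative; the registry, label values, and main function are local to this example rather than part of the change.

package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"

	"github.com/coder/coder/coderd/prometheusmetrics"
)

func main() {
	registry := prometheus.NewRegistry()

	// Same shape as the coderd_agents_up gauge added in this change.
	agentsUp := prometheusmetrics.NewCachedGaugeVec(prometheus.NewGaugeVec(prometheus.GaugeOpts{
		Namespace: "coderd",
		Subsystem: "agents",
		Name:      "up",
		Help:      "The number of active agents per workspace.",
	}, []string{"username", "workspace_name"}))
	if err := registry.Register(agentsUp); err != nil {
		panic(err)
	}

	// Stage two increments. Nothing is visible to Gather() yet, because
	// records are only applied to the inner gauge on Commit().
	agentsUp.WithLabelValues(prometheusmetrics.VectorOperationAdd, 1, "admin", "workspace-1")
	agentsUp.WithLabelValues(prometheusmetrics.VectorOperationAdd, 1, "admin", "workspace-1")

	// Commit resets the inner gauge and replays the staged records, so a
	// scrape now reports coderd_agents_up{...} 2.
	agentsUp.Commit()

	families, err := registry.Gather()
	if err != nil {
		panic(err)
	}
	for _, mf := range families {
		fmt.Println(mf.GetName(), mf.GetMetric()[0].GetGauge().GetValue())
	}
}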