From ee52c33be44d32c1eb6b786fe337c7caace506d1 Mon Sep 17 00:00:00 2001 From: Cian Johnston Date: Wed, 22 Feb 2023 17:14:51 -0700 Subject: [PATCH 1/2] feat(api): add prometheus metric coderd_workspace_builds_total --- .../provisionerdserver/provisionerdserver.go | 2 + .../provisionerdserver_test.go | 2 + docs/admin/prometheus.md | 95 ++++++++++--------- provisionerd/provisionerd.go | 6 ++ provisionerd/runner/runner.go | 12 +++ provisionersdk/proto/provisioner.pb.go | 27 +++++- provisionersdk/proto/provisioner.proto | 2 + scripts/metricsdocgen/metrics | 5 + 8 files changed, 101 insertions(+), 50 deletions(-) diff --git a/coderd/provisionerdserver/provisionerdserver.go b/coderd/provisionerdserver/provisionerdserver.go index d740d325a1f65..c1f7045921baa 100644 --- a/coderd/provisionerdserver/provisionerdserver.go +++ b/coderd/provisionerdserver/provisionerdserver.go @@ -210,6 +210,8 @@ func (server *Server) AcquireJob(ctx context.Context, _ *proto.Empty) (*proto.Ac WorkspaceOwnerEmail: owner.Email, WorkspaceId: workspace.ID.String(), WorkspaceOwnerId: owner.ID.String(), + TemplateName: template.Name, + TemplateVersion: templateVersion.Name, }, }, } diff --git a/coderd/provisionerdserver/provisionerdserver_test.go b/coderd/provisionerdserver/provisionerdserver_test.go index e456d5d4e0c4e..83f661563f60d 100644 --- a/coderd/provisionerdserver/provisionerdserver_test.go +++ b/coderd/provisionerdserver/provisionerdserver_test.go @@ -206,6 +206,8 @@ func TestAcquireJob(t *testing.T) { WorkspaceOwnerEmail: user.Email, WorkspaceId: workspace.ID.String(), WorkspaceOwnerId: user.ID.String(), + TemplateName: template.Name, + TemplateVersion: version.Name, }, }, }) diff --git a/docs/admin/prometheus.md b/docs/admin/prometheus.md index de162c99461d5..f35ba5d1c5182 100644 --- a/docs/admin/prometheus.md +++ b/docs/admin/prometheus.md @@ -29,52 +29,53 @@ The environment variable `CODER_PROMETHEUS_ENABLE` will be enabled automatically -| Name | Type | Description | Labels | -| -------------------------------------------- | --------- | ------------------------------------------------------------------ | ---------------------- | -| `coderd_api_active_users_duration_hour` | gauge | The number of users that have been active within the last hour. | | -| `coderd_api_concurrent_requests` | gauge | The number of concurrent API requests. | | -| `coderd_api_concurrent_websockets` | gauge | The total number of concurrent API websockets. | | -| `coderd_api_request_latencies_seconds` | histogram | Latency distribution of requests in seconds. | `method` `path` | -| `coderd_api_requests_processed_total` | counter | The total number of processed API requests | `code` `method` `path` | -| `coderd_api_websocket_durations_seconds` | histogram | Websocket duration distribution of requests in seconds. | `path` | -| `coderd_api_workspace_latest_build_total` | gauge | The latest workspace builds with a status. | `status` | -| `coderd_provisionerd_job_timings_seconds` | histogram | The provisioner job time duration in seconds. | `provisioner` `status` | -| `coderd_provisionerd_jobs_current` | gauge | The number of currently running provisioner jobs. | `provisioner` | -| `go_gc_duration_seconds` | summary | A summary of the pause duration of garbage collection cycles. | | -| `go_goroutines` | gauge | Number of goroutines that currently exist. | | -| `go_info` | gauge | Information about the Go environment. | `version` | -| `go_memstats_alloc_bytes` | gauge | Number of bytes allocated and still in use. | | -| `go_memstats_alloc_bytes_total` | counter | Total number of bytes allocated, even if freed. | | -| `go_memstats_buck_hash_sys_bytes` | gauge | Number of bytes used by the profiling bucket hash table. | | -| `go_memstats_frees_total` | counter | Total number of frees. | | -| `go_memstats_gc_sys_bytes` | gauge | Number of bytes used for garbage collection system metadata. | | -| `go_memstats_heap_alloc_bytes` | gauge | Number of heap bytes allocated and still in use. | | -| `go_memstats_heap_idle_bytes` | gauge | Number of heap bytes waiting to be used. | | -| `go_memstats_heap_inuse_bytes` | gauge | Number of heap bytes that are in use. | | -| `go_memstats_heap_objects` | gauge | Number of allocated objects. | | -| `go_memstats_heap_released_bytes` | gauge | Number of heap bytes released to OS. | | -| `go_memstats_heap_sys_bytes` | gauge | Number of heap bytes obtained from system. | | -| `go_memstats_last_gc_time_seconds` | gauge | Number of seconds since 1970 of last garbage collection. | | -| `go_memstats_lookups_total` | counter | Total number of pointer lookups. | | -| `go_memstats_mallocs_total` | counter | Total number of mallocs. | | -| `go_memstats_mcache_inuse_bytes` | gauge | Number of bytes in use by mcache structures. | | -| `go_memstats_mcache_sys_bytes` | gauge | Number of bytes used for mcache structures obtained from system. | | -| `go_memstats_mspan_inuse_bytes` | gauge | Number of bytes in use by mspan structures. | | -| `go_memstats_mspan_sys_bytes` | gauge | Number of bytes used for mspan structures obtained from system. | | -| `go_memstats_next_gc_bytes` | gauge | Number of heap bytes when next garbage collection will take place. | | -| `go_memstats_other_sys_bytes` | gauge | Number of bytes used for other system allocations. | | -| `go_memstats_stack_inuse_bytes` | gauge | Number of bytes in use by the stack allocator. | | -| `go_memstats_stack_sys_bytes` | gauge | Number of bytes obtained from system for stack allocator. | | -| `go_memstats_sys_bytes` | gauge | Number of bytes obtained from system. | | -| `go_threads` | gauge | Number of OS threads created. | | -| `process_cpu_seconds_total` | counter | Total user and system CPU time spent in seconds. | | -| `process_max_fds` | gauge | Maximum number of open file descriptors. | | -| `process_open_fds` | gauge | Number of open file descriptors. | | -| `process_resident_memory_bytes` | gauge | Resident memory size in bytes. | | -| `process_start_time_seconds` | gauge | Start time of the process since unix epoch in seconds. | | -| `process_virtual_memory_bytes` | gauge | Virtual memory size in bytes. | | -| `process_virtual_memory_max_bytes` | gauge | Maximum amount of virtual memory available in bytes. | | -| `promhttp_metric_handler_requests_in_flight` | gauge | Current number of scrapes being served. | | -| `promhttp_metric_handler_requests_total` | counter | Total number of scrapes by HTTP status code. | `code` | +| Name | Type | Description | Labels | +| -------------------------------------------- | --------- | ------------------------------------------------------------------ | ----------------------------------------------------------------------------------- | +| `coderd_api_active_users_duration_hour` | gauge | The number of users that have been active within the last hour. | | +| `coderd_api_concurrent_requests` | gauge | The number of concurrent API requests. | | +| `coderd_api_concurrent_websockets` | gauge | The total number of concurrent API websockets. | | +| `coderd_api_request_latencies_seconds` | histogram | Latency distribution of requests in seconds. | `method` `path` | +| `coderd_api_requests_processed_total` | counter | The total number of processed API requests | `code` `method` `path` | +| `coderd_api_websocket_durations_seconds` | histogram | Websocket duration distribution of requests in seconds. | `path` | +| `coderd_api_workspace_latest_build_total` | gauge | The latest workspace builds with a status. | `status` | +| `coderd_provisionerd_job_timings_seconds` | histogram | The provisioner job time duration in seconds. | `provisioner` `status` | +| `coderd_provisionerd_jobs_current` | gauge | The number of currently running provisioner jobs. | `provisioner` | +| `coderd_workspace_builds_total` | counter | The number of workspaces started, updated, or deleted. | `action` `owner_email` `status` `template_name` `template_version` `workspace_name` | +| `go_gc_duration_seconds` | summary | A summary of the pause duration of garbage collection cycles. | | +| `go_goroutines` | gauge | Number of goroutines that currently exist. | | +| `go_info` | gauge | Information about the Go environment. | `version` | +| `go_memstats_alloc_bytes` | gauge | Number of bytes allocated and still in use. | | +| `go_memstats_alloc_bytes_total` | counter | Total number of bytes allocated, even if freed. | | +| `go_memstats_buck_hash_sys_bytes` | gauge | Number of bytes used by the profiling bucket hash table. | | +| `go_memstats_frees_total` | counter | Total number of frees. | | +| `go_memstats_gc_sys_bytes` | gauge | Number of bytes used for garbage collection system metadata. | | +| `go_memstats_heap_alloc_bytes` | gauge | Number of heap bytes allocated and still in use. | | +| `go_memstats_heap_idle_bytes` | gauge | Number of heap bytes waiting to be used. | | +| `go_memstats_heap_inuse_bytes` | gauge | Number of heap bytes that are in use. | | +| `go_memstats_heap_objects` | gauge | Number of allocated objects. | | +| `go_memstats_heap_released_bytes` | gauge | Number of heap bytes released to OS. | | +| `go_memstats_heap_sys_bytes` | gauge | Number of heap bytes obtained from system. | | +| `go_memstats_last_gc_time_seconds` | gauge | Number of seconds since 1970 of last garbage collection. | | +| `go_memstats_lookups_total` | counter | Total number of pointer lookups. | | +| `go_memstats_mallocs_total` | counter | Total number of mallocs. | | +| `go_memstats_mcache_inuse_bytes` | gauge | Number of bytes in use by mcache structures. | | +| `go_memstats_mcache_sys_bytes` | gauge | Number of bytes used for mcache structures obtained from system. | | +| `go_memstats_mspan_inuse_bytes` | gauge | Number of bytes in use by mspan structures. | | +| `go_memstats_mspan_sys_bytes` | gauge | Number of bytes used for mspan structures obtained from system. | | +| `go_memstats_next_gc_bytes` | gauge | Number of heap bytes when next garbage collection will take place. | | +| `go_memstats_other_sys_bytes` | gauge | Number of bytes used for other system allocations. | | +| `go_memstats_stack_inuse_bytes` | gauge | Number of bytes in use by the stack allocator. | | +| `go_memstats_stack_sys_bytes` | gauge | Number of bytes obtained from system for stack allocator. | | +| `go_memstats_sys_bytes` | gauge | Number of bytes obtained from system. | | +| `go_threads` | gauge | Number of OS threads created. | | +| `process_cpu_seconds_total` | counter | Total user and system CPU time spent in seconds. | | +| `process_max_fds` | gauge | Maximum number of open file descriptors. | | +| `process_open_fds` | gauge | Number of open file descriptors. | | +| `process_resident_memory_bytes` | gauge | Resident memory size in bytes. | | +| `process_start_time_seconds` | gauge | Start time of the process since unix epoch in seconds. | | +| `process_virtual_memory_bytes` | gauge | Virtual memory size in bytes. | | +| `process_virtual_memory_max_bytes` | gauge | Maximum amount of virtual memory available in bytes. | | +| `promhttp_metric_handler_requests_in_flight` | gauge | Current number of scrapes being served. | | +| `promhttp_metric_handler_requests_total` | counter | Total number of scrapes by HTTP status code. | `code` | diff --git a/provisionerd/provisionerd.go b/provisionerd/provisionerd.go index ce9fe9f8788fb..be5e602b6326e 100644 --- a/provisionerd/provisionerd.go +++ b/provisionerd/provisionerd.go @@ -157,6 +157,12 @@ func NewMetrics(reg prometheus.Registerer) Metrics { 60 * 60, // 1hr }, }, []string{"provisioner", "status"}), + WorkspaceBuilds: auto.NewCounterVec(prometheus.CounterOpts{ + Namespace: "coderd", + Subsystem: "", // Explicitly empty to make this a top-level metric. + Name: "workspace_builds_total", + Help: "The number of workspaces started, updated, or deleted.", + }, []string{"owner_email", "workspace_name", "template_name", "template_version", "action", "status"}), }, } } diff --git a/provisionerd/runner/runner.go b/provisionerd/runner/runner.go index 8b418948053b4..5fcf81313f43b 100644 --- a/provisionerd/runner/runner.go +++ b/provisionerd/runner/runner.go @@ -77,6 +77,8 @@ type Metrics struct { ConcurrentJobs *prometheus.GaugeVec // JobTimings also counts the total amount of jobs. JobTimings *prometheus.HistogramVec + // WorkspaceBuilds counts workspace build successes and failures. + WorkspaceBuilds *prometheus.CounterVec } type JobUpdater interface { @@ -161,6 +163,16 @@ func (r *Runner) Run() { } concurrentGauge.Dec() + if build := r.job.GetWorkspaceBuild(); build != nil { + r.metrics.WorkspaceBuilds.WithLabelValues( + build.Metadata.WorkspaceOwnerEmail, + build.Metadata.WorkspaceName, + build.Metadata.TemplateName, + build.Metadata.TemplateVersion, + build.Metadata.WorkspaceTransition.String(), + status, + ).Inc() + } r.metrics.JobTimings.WithLabelValues(r.job.Provisioner, status).Observe(time.Since(start).Seconds()) }() diff --git a/provisionersdk/proto/provisioner.pb.go b/provisionersdk/proto/provisioner.pb.go index aef691ec3fca1..b30dd23392485 100644 --- a/provisionersdk/proto/provisioner.pb.go +++ b/provisionersdk/proto/provisioner.pb.go @@ -1976,6 +1976,8 @@ type Provision_Metadata struct { WorkspaceId string `protobuf:"bytes,5,opt,name=workspace_id,json=workspaceId,proto3" json:"workspace_id,omitempty"` WorkspaceOwnerId string `protobuf:"bytes,6,opt,name=workspace_owner_id,json=workspaceOwnerId,proto3" json:"workspace_owner_id,omitempty"` WorkspaceOwnerEmail string `protobuf:"bytes,7,opt,name=workspace_owner_email,json=workspaceOwnerEmail,proto3" json:"workspace_owner_email,omitempty"` + TemplateName string `protobuf:"bytes,8,opt,name=template_name,json=templateName,proto3" json:"template_name,omitempty"` + TemplateVersion string `protobuf:"bytes,9,opt,name=template_version,json=templateVersion,proto3" json:"template_version,omitempty"` } func (x *Provision_Metadata) Reset() { @@ -2059,6 +2061,20 @@ func (x *Provision_Metadata) GetWorkspaceOwnerEmail() string { return "" } +func (x *Provision_Metadata) GetTemplateName() string { + if x != nil { + return x.TemplateName + } + return "" +} + +func (x *Provision_Metadata) GetTemplateVersion() string { + if x != nil { + return x.TemplateVersion + } + return "" +} + // Config represents execution configuration shared by both Plan and // Apply commands. type Provision_Config struct { @@ -2793,8 +2809,8 @@ var file_provisionersdk_proto_provisioner_proto_rawDesc = []byte{ 0x0b, 0x32, 0x1b, 0x2e, 0x70, 0x72, 0x6f, 0x76, 0x69, 0x73, 0x69, 0x6f, 0x6e, 0x65, 0x72, 0x2e, 0x50, 0x61, 0x72, 0x73, 0x65, 0x2e, 0x43, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74, 0x65, 0x48, 0x00, 0x52, 0x08, 0x63, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74, 0x65, 0x42, 0x06, 0x0a, 0x04, 0x74, 0x79, - 0x70, 0x65, 0x22, 0xc7, 0x0a, 0x0a, 0x09, 0x50, 0x72, 0x6f, 0x76, 0x69, 0x73, 0x69, 0x6f, 0x6e, - 0x1a, 0xd1, 0x02, 0x0a, 0x08, 0x4d, 0x65, 0x74, 0x61, 0x64, 0x61, 0x74, 0x61, 0x12, 0x1b, 0x0a, + 0x70, 0x65, 0x22, 0x97, 0x0b, 0x0a, 0x09, 0x50, 0x72, 0x6f, 0x76, 0x69, 0x73, 0x69, 0x6f, 0x6e, + 0x1a, 0xa1, 0x03, 0x0a, 0x08, 0x4d, 0x65, 0x74, 0x61, 0x64, 0x61, 0x74, 0x61, 0x12, 0x1b, 0x0a, 0x09, 0x63, 0x6f, 0x64, 0x65, 0x72, 0x5f, 0x75, 0x72, 0x6c, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x08, 0x63, 0x6f, 0x64, 0x65, 0x72, 0x55, 0x72, 0x6c, 0x12, 0x53, 0x0a, 0x14, 0x77, 0x6f, 0x72, 0x6b, 0x73, 0x70, 0x61, 0x63, 0x65, 0x5f, 0x74, 0x72, 0x61, 0x6e, 0x73, 0x69, 0x74, 0x69, @@ -2815,7 +2831,12 @@ var file_provisionersdk_proto_provisioner_proto_rawDesc = []byte{ 0x12, 0x32, 0x0a, 0x15, 0x77, 0x6f, 0x72, 0x6b, 0x73, 0x70, 0x61, 0x63, 0x65, 0x5f, 0x6f, 0x77, 0x6e, 0x65, 0x72, 0x5f, 0x65, 0x6d, 0x61, 0x69, 0x6c, 0x18, 0x07, 0x20, 0x01, 0x28, 0x09, 0x52, 0x13, 0x77, 0x6f, 0x72, 0x6b, 0x73, 0x70, 0x61, 0x63, 0x65, 0x4f, 0x77, 0x6e, 0x65, 0x72, 0x45, - 0x6d, 0x61, 0x69, 0x6c, 0x1a, 0x79, 0x0a, 0x06, 0x43, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x12, 0x1c, + 0x6d, 0x61, 0x69, 0x6c, 0x12, 0x23, 0x0a, 0x0d, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, + 0x5f, 0x6e, 0x61, 0x6d, 0x65, 0x18, 0x08, 0x20, 0x01, 0x28, 0x09, 0x52, 0x0c, 0x74, 0x65, 0x6d, + 0x70, 0x6c, 0x61, 0x74, 0x65, 0x4e, 0x61, 0x6d, 0x65, 0x12, 0x29, 0x0a, 0x10, 0x74, 0x65, 0x6d, + 0x70, 0x6c, 0x61, 0x74, 0x65, 0x5f, 0x76, 0x65, 0x72, 0x73, 0x69, 0x6f, 0x6e, 0x18, 0x09, 0x20, + 0x01, 0x28, 0x09, 0x52, 0x0f, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x56, 0x65, 0x72, + 0x73, 0x69, 0x6f, 0x6e, 0x1a, 0x79, 0x0a, 0x06, 0x43, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x12, 0x1c, 0x0a, 0x09, 0x64, 0x69, 0x72, 0x65, 0x63, 0x74, 0x6f, 0x72, 0x79, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x09, 0x64, 0x69, 0x72, 0x65, 0x63, 0x74, 0x6f, 0x72, 0x79, 0x12, 0x14, 0x0a, 0x05, 0x73, 0x74, 0x61, 0x74, 0x65, 0x18, 0x02, 0x20, 0x01, 0x28, 0x0c, 0x52, 0x05, 0x73, 0x74, 0x61, diff --git a/provisionersdk/proto/provisioner.proto b/provisionersdk/proto/provisioner.proto index e9f714033688d..b61ce175db15d 100644 --- a/provisionersdk/proto/provisioner.proto +++ b/provisionersdk/proto/provisioner.proto @@ -220,6 +220,8 @@ message Provision { string workspace_id = 5; string workspace_owner_id = 6; string workspace_owner_email = 7; + string template_name = 8; + string template_version = 9; } // Config represents execution configuration shared by both Plan and diff --git a/scripts/metricsdocgen/metrics b/scripts/metricsdocgen/metrics index 59cb16b797f72..50bbc87990dda 100644 --- a/scripts/metricsdocgen/metrics +++ b/scripts/metricsdocgen/metrics @@ -584,6 +584,11 @@ coderd_provisionerd_job_timings_seconds_count{provisioner="terraform",status="su # HELP coderd_provisionerd_jobs_current The number of currently running provisioner jobs. # TYPE coderd_provisionerd_jobs_current gauge coderd_provisionerd_jobs_current{provisioner="terraform"} 0 +# HELP coderd_workspace_builds_total The number of workspaces started, updated, or deleted. +# TYPE coderd_workspace_builds_total counter +coderd_workspace_builds_total{action="START",owner_email="admin@coder.com",status="failed",template_name="docker",template_version="gallant_wright0",workspace_name="test1"} 1 +coderd_workspace_builds_total{action="START",owner_email="admin@coder.com",status="success",template_name="docker",template_version="gallant_wright0",workspace_name="test1"} 1 +coderd_workspace_builds_total{action="STOP",owner_email="admin@coder.com",status="success",template_name="docker",template_version="gallant_wright0",workspace_name="test1"} 1 # HELP go_gc_duration_seconds A summary of the pause duration of garbage collection cycles. # TYPE go_gc_duration_seconds summary go_gc_duration_seconds{quantile="0"} 2.4056e-05 From c6b3fb7aae448ce3ab3b2fe0a39b4a044166f37d Mon Sep 17 00:00:00 2001 From: Cian Johnston Date: Wed, 22 Feb 2023 17:23:41 -0700 Subject: [PATCH 2/2] fixup! feat(api): add prometheus metric coderd_workspace_builds_total --- provisionersdk/proto/provisioner.proto | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/provisionersdk/proto/provisioner.proto b/provisionersdk/proto/provisioner.proto index b61ce175db15d..6b935f27db531 100644 --- a/provisionersdk/proto/provisioner.proto +++ b/provisionersdk/proto/provisioner.proto @@ -220,8 +220,8 @@ message Provision { string workspace_id = 5; string workspace_owner_id = 6; string workspace_owner_email = 7; - string template_name = 8; - string template_version = 9; + string template_name = 8; + string template_version = 9; } // Config represents execution configuration shared by both Plan and