From f62cf1663c70043ece072d026bc51d69070eb46c Mon Sep 17 00:00:00 2001 From: Cian Johnston Date: Mon, 15 May 2023 14:55:34 +0100 Subject: [PATCH 01/19] add prometheus metrics endpoint to workspacetraffic command --- cli/scaletest.go | 53 +++++++++++++------ ..._scaletest_workspace-traffic_--help.golden | 3 ++ scaletest/terraform/coder.tf | 2 +- scaletest/workspacetraffic/config.go | 13 +++++ scaletest/workspacetraffic/metrics.go | 48 +++++++++++++++++ scaletest/workspacetraffic/run.go | 24 ++++++--- 6 files changed, 120 insertions(+), 23 deletions(-) create mode 100644 scaletest/workspacetraffic/metrics.go diff --git a/cli/scaletest.go b/cli/scaletest.go index 67186da2212fa..30a20730a9840 100644 --- a/cli/scaletest.go +++ b/cli/scaletest.go @@ -14,9 +14,13 @@ import ( "time" "github.com/google/uuid" + "github.com/prometheus/client_golang/prometheus" "go.opentelemetry.io/otel/trace" "golang.org/x/xerrors" + "cdr.dev/slog" + "cdr.dev/slog/sloggers/sloghuman" + "github.com/coder/coder/cli/clibase" "github.com/coder/coder/cli/cliui" "github.com/coder/coder/coderd/httpapi" @@ -896,13 +900,14 @@ func (r *RootCmd) scaletestCreateWorkspaces() *clibase.Cmd { func (r *RootCmd) scaletestWorkspaceTraffic() *clibase.Cmd { var ( - tickInterval time.Duration - bytesPerTick int64 - client = &codersdk.Client{} - tracingFlags = &scaletestTracingFlags{} - strategy = &scaletestStrategyFlags{} - cleanupStrategy = &scaletestStrategyFlags{cleanup: true} - output = &scaletestOutputFlags{} + tickInterval time.Duration + bytesPerTick int64 + prometheusAddress string + client = &codersdk.Client{} + tracingFlags = &scaletestTracingFlags{} + strategy = &scaletestStrategyFlags{} + cleanupStrategy = &scaletestStrategyFlags{cleanup: true} + output = &scaletestOutputFlags{} ) cmd := &clibase.Cmd{ @@ -913,6 +918,12 @@ func (r *RootCmd) scaletestWorkspaceTraffic() *clibase.Cmd { ), Handler: func(inv *clibase.Invocation) error { ctx := inv.Context() + reg := prometheus.NewRegistry() + metrics := 
workspacetraffic.NewMetrics(reg) + + logger := slog.Make(sloghuman.Sink(io.Discard)) + prometheusSrvClose := ServeHandler(ctx, logger, prometheusMetricsHandler(), prometheusAddress, "prometheus") + defer prometheusSrvClose() // Bypass rate limiting client.HTTPClient = &http.Client{ @@ -955,9 +966,10 @@ func (r *RootCmd) scaletestWorkspaceTraffic() *clibase.Cmd { th := harness.NewTestHarness(strategy.toStrategy(), cleanupStrategy.toStrategy()) for idx, ws := range workspaces { var ( - agentID uuid.UUID - name = "workspace-traffic" - id = strconv.Itoa(idx) + agentID uuid.UUID + agentName string + name = "workspace-traffic" + id = strconv.Itoa(idx) ) for _, res := range ws.LatestBuild.Resources { @@ -965,6 +977,7 @@ func (r *RootCmd) scaletestWorkspaceTraffic() *clibase.Cmd { continue } agentID = res.Agents[0].ID + agentName = res.Agents[0].Name } if agentID == uuid.Nil { @@ -974,16 +987,19 @@ func (r *RootCmd) scaletestWorkspaceTraffic() *clibase.Cmd { // Setup our workspace agent connection. 
config := workspacetraffic.Config{ - AgentID: agentID, - BytesPerTick: bytesPerTick, - Duration: strategy.timeout, - TickInterval: tickInterval, + AgentID: agentID, + AgentName: agentName, + BytesPerTick: bytesPerTick, + Duration: strategy.timeout, + TickInterval: tickInterval, + WorkspaceName: ws.Name, + WorkspaceOwner: ws.OwnerName, } if err := config.Validate(); err != nil { return xerrors.Errorf("validate config: %w", err) } - var runner harness.Runnable = workspacetraffic.NewRunner(client, config) + var runner harness.Runnable = workspacetraffic.NewRunner(client, config, metrics) if tracingEnabled { runner = &runnableTraceWrapper{ tracer: tracer, @@ -1034,6 +1050,13 @@ func (r *RootCmd) scaletestWorkspaceTraffic() *clibase.Cmd { Description: "How often to send traffic.", Value: clibase.DurationOf(&tickInterval), }, + { + Flag: "prometheus-address", + Env: "CODER_SCALETEST_PROMETHEUS_ADDRESS", + Default: "0.0.0.0:2112", + Description: "Address on which to expose prometheus metrics.", + Value: clibase.StringOf(&prometheusAddress), + }, } tracingFlags.attach(&cmd.Options) diff --git a/cli/testdata/coder_scaletest_workspace-traffic_--help.golden b/cli/testdata/coder_scaletest_workspace-traffic_--help.golden index b7de6ca960e97..e0740e800bf73 100644 --- a/cli/testdata/coder_scaletest_workspace-traffic_--help.golden +++ b/cli/testdata/coder_scaletest_workspace-traffic_--help.golden @@ -27,6 +27,9 @@ Generate traffic to scaletest workspaces through coderd Output format specs in the format "[:]". Not specifying a path will default to stdout. Available formats: text, json. + --prometheus-address string, $CODER_SCALETEST_PROMETHEUS_ADDRESS (default: 0.0.0.0:2112) + Address on which to expose prometheus metrics. + --tick-interval duration, $CODER_SCALETEST_WORKSPACE_TRAFFIC_TICK_INTERVAL (default: 100ms) How often to send traffic. 
diff --git a/scaletest/terraform/coder.tf b/scaletest/terraform/coder.tf index d86aa2a7fe1ad..76bbcaa6e14bf 100644 --- a/scaletest/terraform/coder.tf +++ b/scaletest/terraform/coder.tf @@ -150,7 +150,7 @@ resource "null_resource" "coder-monitoring-manifest_apply" { provisioner "local-exec" { working_dir = "${abspath(path.module)}/.coderv2" command = < 0 { w.bytesRead.Add(int64(n)) + w.metrics.BytesRead.WithLabelValues(w.labels...).Add(float64(n)) } return n, err } func (w *countReadWriter) Write(p []byte) (int, error) { + start := time.Now() n, err := w.ReadWriter.Write(p) - if err == nil { + w.metrics.WriteLatencyMS.WithLabelValues(w.labels...).Observe(time.Since(start).Seconds()) + if n > 0 { w.bytesWritten.Add(int64(n)) + w.metrics.BytesWritten.WithLabelValues(w.labels...).Add(float64(n)) } return n, err } From 00b9ecaf199fd3e3efc494cdffd958500ba98853 Mon Sep 17 00:00:00 2001 From: Cian Johnston Date: Tue, 16 May 2023 13:49:29 +0100 Subject: [PATCH 02/19] use self-managed prometheus --- .gitignore | 1 + scaletest/terraform/coder.tf | 28 ------ scaletest/terraform/gcp_cluster.tf | 2 +- scaletest/terraform/prometheus.tf | 145 +++++++++++++++++++++++++++++ scaletest/terraform/vars.tf | 30 ++++++ 5 files changed, 177 insertions(+), 29 deletions(-) create mode 100644 scaletest/terraform/prometheus.tf diff --git a/.gitignore b/.gitignore index 69b58c4cee458..29b297a9e41ec 100644 --- a/.gitignore +++ b/.gitignore @@ -59,3 +59,4 @@ site/stats/ ./scaletest/terraform/.terraform ./scaletest/terraform/.terraform.lock.hcl terraform.tfstate.* +**/*.tfvars diff --git a/scaletest/terraform/coder.tf b/scaletest/terraform/coder.tf index 76bbcaa6e14bf..2486f753f76c8 100644 --- a/scaletest/terraform/coder.tf +++ b/scaletest/terraform/coder.tf @@ -128,34 +128,6 @@ EOF ] } -resource "local_file" "coder-monitoring-manifest" { - filename = "${path.module}/.coderv2/coder-monitoring.yaml" - content = < Date: Tue, 16 May 2023 20:36:21 +0100 Subject: [PATCH 03/19] plumb in prom properly 
--- cli/scaletest.go | 3 ++- scaletest/workspacetraffic/metrics.go | 13 +++++++------ scaletest/workspacetraffic/run.go | 4 ++-- 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/cli/scaletest.go b/cli/scaletest.go index 30a20730a9840..69bcf0184b29e 100644 --- a/cli/scaletest.go +++ b/cli/scaletest.go @@ -919,7 +919,7 @@ func (r *RootCmd) scaletestWorkspaceTraffic() *clibase.Cmd { Handler: func(inv *clibase.Invocation) error { ctx := inv.Context() reg := prometheus.NewRegistry() - metrics := workspacetraffic.NewMetrics(reg) + metrics := workspacetraffic.NewMetrics(reg, "username", "workspace_name", "agent_name") logger := slog.Make(sloghuman.Sink(io.Discard)) prometheusSrvClose := ServeHandler(ctx, logger, prometheusMetricsHandler(), prometheusAddress, "prometheus") @@ -994,6 +994,7 @@ func (r *RootCmd) scaletestWorkspaceTraffic() *clibase.Cmd { TickInterval: tickInterval, WorkspaceName: ws.Name, WorkspaceOwner: ws.OwnerName, + Registry: reg, } if err := config.Validate(); err != nil { diff --git a/scaletest/workspacetraffic/metrics.go b/scaletest/workspacetraffic/metrics.go index 7dca36eedcfb0..a8e012da20c10 100644 --- a/scaletest/workspacetraffic/metrics.go +++ b/scaletest/workspacetraffic/metrics.go @@ -8,35 +8,36 @@ type Metrics struct { Errors prometheus.CounterVec ReadLatencyMS prometheus.HistogramVec WriteLatencyMS prometheus.HistogramVec + LabelNames []string } -func NewMetrics(reg prometheus.Registerer) *Metrics { +func NewMetrics(reg prometheus.Registerer, labelNames ...string) *Metrics { m := &Metrics{ BytesRead: *prometheus.NewCounterVec(prometheus.CounterOpts{ Namespace: "coderd", Subsystem: "scaletest", Name: "bytes_read", - }, []string{"username", "workspace_name", "agent_name"}), + }, labelNames), BytesWritten: *prometheus.NewCounterVec(prometheus.CounterOpts{ Namespace: "coderd", Subsystem: "scaletest", Name: "bytes_written", - }, []string{"username", "workspace_name", "agent_name"}), + }, labelNames), Errors: 
*prometheus.NewCounterVec(prometheus.CounterOpts{ Namespace: "coderd", Subsystem: "scaletest", Name: "errors", - }, []string{"username", "workspace_name", "agent_name"}), + }, labelNames), ReadLatencyMS: *prometheus.NewHistogramVec(prometheus.HistogramOpts{ Namespace: "coderd", Subsystem: "scaletest", Name: "read_latency_seconds", - }, []string{"username", "workspace_name", "agent_name"}), + }, labelNames), WriteLatencyMS: *prometheus.NewHistogramVec(prometheus.HistogramOpts{ Namespace: "coderd", Subsystem: "scaletest", Name: "write_latency_seconds", - }, []string{"username", "workspace_name", "agent_name"}), + }, labelNames), } reg.MustRegister(m.BytesRead) diff --git a/scaletest/workspacetraffic/run.go b/scaletest/workspacetraffic/run.go index d0ea38efe3e39..9f3bc8c532cb9 100644 --- a/scaletest/workspacetraffic/run.go +++ b/scaletest/workspacetraffic/run.go @@ -92,7 +92,7 @@ func (r *Runner) Run(ctx context.Context, _ string, logs io.Writer) error { }() // Wrap the conn in a countReadWriter so we can monitor bytes sent/rcvd. - crw := countReadWriter{ReadWriter: conn} + crw := countReadWriter{ReadWriter: conn, metrics: r.metrics, labels: []string{r.cfg.WorkspaceOwner, r.cfg.WorkspaceName, r.cfg.AgentName}} // Create a ticker for sending data to the PTY. 
tick := time.NewTicker(tickInterval) @@ -188,7 +188,7 @@ type countReadWriter struct { io.ReadWriter bytesRead atomic.Int64 bytesWritten atomic.Int64 - metrics Metrics + metrics *Metrics labels []string } From f00b8d7ce7343d842b68795e814d5aaab3759ae7 Mon Sep 17 00:00:00 2001 From: Cian Johnston Date: Tue, 16 May 2023 20:44:10 +0100 Subject: [PATCH 04/19] fix label on second pod monitor --- scaletest/terraform/prometheus.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scaletest/terraform/prometheus.tf b/scaletest/terraform/prometheus.tf index 0c4b3fbbdcb91..2fddae32d40f5 100644 --- a/scaletest/terraform/prometheus.tf +++ b/scaletest/terraform/prometheus.tf @@ -126,7 +126,7 @@ metadata: spec: selector: matchLabels: - app.kubernetes.io/name: coder + app.kubernetes.io/name: coder-scaletest-workspace-traffic podMetricsEndpoints: - port: prometheus-http interval: 15s From 69bc5adb3e927fa433cd32db490da03d5f69533b Mon Sep 17 00:00:00 2001 From: Cian Johnston Date: Tue, 16 May 2023 21:28:55 +0100 Subject: [PATCH 05/19] do not clobber existing prometheus --- cli/scaletest.go | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/cli/scaletest.go b/cli/scaletest.go index 69bcf0184b29e..503139cc09524 100644 --- a/cli/scaletest.go +++ b/cli/scaletest.go @@ -15,6 +15,7 @@ import ( "github.com/google/uuid" "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promhttp" "go.opentelemetry.io/otel/trace" "golang.org/x/xerrors" @@ -900,14 +901,14 @@ func (r *RootCmd) scaletestCreateWorkspaces() *clibase.Cmd { func (r *RootCmd) scaletestWorkspaceTraffic() *clibase.Cmd { var ( - tickInterval time.Duration - bytesPerTick int64 - prometheusAddress string - client = &codersdk.Client{} - tracingFlags = &scaletestTracingFlags{} - strategy = &scaletestStrategyFlags{} - cleanupStrategy = &scaletestStrategyFlags{cleanup: true} - output = &scaletestOutputFlags{} + tickInterval 
time.Duration + bytesPerTick int64 + scaletestPrometheusAddress string + client = &codersdk.Client{} + tracingFlags = &scaletestTracingFlags{} + strategy = &scaletestStrategyFlags{} + cleanupStrategy = &scaletestStrategyFlags{cleanup: true} + output = &scaletestOutputFlags{} ) cmd := &clibase.Cmd{ @@ -922,7 +923,7 @@ func (r *RootCmd) scaletestWorkspaceTraffic() *clibase.Cmd { metrics := workspacetraffic.NewMetrics(reg, "username", "workspace_name", "agent_name") logger := slog.Make(sloghuman.Sink(io.Discard)) - prometheusSrvClose := ServeHandler(ctx, logger, prometheusMetricsHandler(), prometheusAddress, "prometheus") + prometheusSrvClose := ServeHandler(ctx, logger, promhttp.HandlerFor(reg, promhttp.HandlerOpts{}), scaletestPrometheusAddress, "prometheus") defer prometheusSrvClose() // Bypass rate limiting @@ -1052,11 +1053,11 @@ func (r *RootCmd) scaletestWorkspaceTraffic() *clibase.Cmd { Value: clibase.DurationOf(&tickInterval), }, { - Flag: "prometheus-address", + Flag: "scaletest-prometheus-address", Env: "CODER_SCALETEST_PROMETHEUS_ADDRESS", - Default: "0.0.0.0:2112", - Description: "Address on which to expose prometheus metrics.", - Value: clibase.StringOf(&prometheusAddress), + Default: "0.0.0.0:21112", + Description: "Address on which to expose scaletest prometheus metrics.", + Value: clibase.StringOf(&scaletestPrometheusAddress), }, } From 57d338d5f2474abdaa20049a55eed3526aded0a0 Mon Sep 17 00:00:00 2001 From: Cian Johnston Date: Wed, 17 May 2023 11:52:17 +0100 Subject: [PATCH 06/19] remote_write all metrics --- scaletest/terraform/vars.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scaletest/terraform/vars.tf b/scaletest/terraform/vars.tf index 1d28c4c1c1d7e..41c83be84348c 100644 --- a/scaletest/terraform/vars.tf +++ b/scaletest/terraform/vars.tf @@ -150,7 +150,7 @@ variable "prometheus_remote_write_insecure_skip_verify" { variable "prometheus_remote_write_metrics_regex" { description = "Allowlist regex of metrics for Prometheus 
remote write." - default = ".*(coderd_|pg_database_size_bytes|pg_stat_|coderloader_|go_goroutines|go_memstats_heap_inuse_bytes|kube_pod_container_resource_requests|kube_pod_container_resource_limits|container_cpu_usage_seconds_total|container_memory_working_set_bytes).*" + default = ".*" } variable "prometheus_remote_write_send_interval" { From a1d14dffd923750a6d8298aac2fcf62ecd5db122 Mon Sep 17 00:00:00 2001 From: Cian Johnston Date: Wed, 17 May 2023 11:59:25 +0100 Subject: [PATCH 07/19] add convenience script to create traffic inside cluster --- scaletest/terraform/coder_workspacetraffic.sh | 65 +++++++++++++++++++ 1 file changed, 65 insertions(+) create mode 100755 scaletest/terraform/coder_workspacetraffic.sh diff --git a/scaletest/terraform/coder_workspacetraffic.sh b/scaletest/terraform/coder_workspacetraffic.sh new file mode 100755 index 0000000000000..a7659f92b47a9 --- /dev/null +++ b/scaletest/terraform/coder_workspacetraffic.sh @@ -0,0 +1,65 @@ +#!/usr/bin/env bash + +set -euo pipefail + +if [[ $# -lt 1 ]]; then + echo "Usage: $0 " + exit 1 +fi + +# Allow toggling verbose output +[[ -n ${VERBOSE:-} ]] && set -x + +LOADTEST_NAME="$1" +CODER_TOKEN=$(./coder_shim.sh tokens create) +CODER_URL="http://coder.coder-${LOADTEST_NAME}.svc.cluster.local" + +cat < Date: Wed, 17 May 2023 15:36:10 +0100 Subject: [PATCH 08/19] fixup! 
add convenience script to create traffic inside cluster --- scaletest/terraform/coder_workspacetraffic.sh | 11 +++++++++-- scaletest/terraform/prometheus.tf | 8 ++++---- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/scaletest/terraform/coder_workspacetraffic.sh b/scaletest/terraform/coder_workspacetraffic.sh index a7659f92b47a9..df528e8ce8446 100755 --- a/scaletest/terraform/coder_workspacetraffic.sh +++ b/scaletest/terraform/coder_workspacetraffic.sh @@ -36,12 +36,20 @@ spec: - command: - sh - -c - - "curl -fsSL $CODER_URL/bin/coder-linux-amd64 -o /tmp/coder && chmod +x /tmp/coder && /tmp/coder --url=$CODER_URL --token=$CODER_TOKEN scaletest workspace-traffic --concurrency 0 --job-timeout=30m --scaletest-prometheus-address 0.0.0.0:21112" + - "curl -fsSL $CODER_URL/bin/coder-linux-amd64 -o /tmp/coder && chmod +x /tmp/coder && /tmp/coder --url=$CODER_URL --token=$CODER_TOKEN scaletest workspace-traffic" env: - name: CODER_URL value: $CODER_URL - name: CODER_TOKEN value: $CODER_TOKEN + - name: CODER_SCALETEST_PROMETHEUS_ADDRESS + value: "0.0.0.0:21112" + - name: CODER_SCALETEST_JOB_TIMEOUT + value: "30m" + - name: CODER_SCALETEST_CONCURRENCY + value: "0" + - name: CODER_SCALETEST_WORKSPACE_TRAFFIC_BYTES_PER_TICK + value: "2048" ports: - containerPort: 21112 name: prometheus-http @@ -62,4 +70,3 @@ spec: - port: prometheus-http interval: 15s EOF - diff --git a/scaletest/terraform/prometheus.tf b/scaletest/terraform/prometheus.tf index 2fddae32d40f5..c54e69e63cd1c 100644 --- a/scaletest/terraform/prometheus.tf +++ b/scaletest/terraform/prometheus.tf @@ -102,9 +102,9 @@ prometheus: # after creating a cluster, and we want this to be brought up # with a single command. 
resource "local_file" "coder-monitoring-manifest" { - filename = "${path.module}/.coderv2/coder-monitoring.yaml" - depends_on = [ helm_release.prometheus-chart ] - content = < Date: Wed, 17 May 2023 15:56:37 +0100 Subject: [PATCH 09/19] make fmt, make gen --- .prettierignore | 1 + .../coder_scaletest_workspace-traffic_--help.golden | 4 ++-- docs/cli/scaletest_workspace-traffic.md | 10 ++++++++++ scaletest/terraform/README.md | 7 +++++-- scaletest/terraform/coder_workspacetraffic.sh | 1 + site/.eslintignore | 1 + site/.prettierignore | 1 + 7 files changed, 21 insertions(+), 4 deletions(-) diff --git a/.prettierignore b/.prettierignore index cc4a83b0231a8..d96e9df947ddd 100644 --- a/.prettierignore +++ b/.prettierignore @@ -62,6 +62,7 @@ site/stats/ ./scaletest/terraform/.terraform ./scaletest/terraform/.terraform.lock.hcl terraform.tfstate.* +**/*.tfvars # .prettierignore.include: # Helm templates contain variables that are invalid YAML and can't be formatted # by Prettier. diff --git a/cli/testdata/coder_scaletest_workspace-traffic_--help.golden b/cli/testdata/coder_scaletest_workspace-traffic_--help.golden index e0740e800bf73..a59d5744e4a0c 100644 --- a/cli/testdata/coder_scaletest_workspace-traffic_--help.golden +++ b/cli/testdata/coder_scaletest_workspace-traffic_--help.golden @@ -27,8 +27,8 @@ Generate traffic to scaletest workspaces through coderd Output format specs in the format "[:]". Not specifying a path will default to stdout. Available formats: text, json. - --prometheus-address string, $CODER_SCALETEST_PROMETHEUS_ADDRESS (default: 0.0.0.0:2112) - Address on which to expose prometheus metrics. + --scaletest-prometheus-address string, $CODER_SCALETEST_PROMETHEUS_ADDRESS (default: 0.0.0.0:21112) + Address on which to expose scaletest prometheus metrics. --tick-interval duration, $CODER_SCALETEST_WORKSPACE_TRAFFIC_TICK_INTERVAL (default: 100ms) How often to send traffic. 
diff --git a/docs/cli/scaletest_workspace-traffic.md b/docs/cli/scaletest_workspace-traffic.md index 5303847345a49..dfde115fa8d5a 100644 --- a/docs/cli/scaletest_workspace-traffic.md +++ b/docs/cli/scaletest_workspace-traffic.md @@ -82,6 +82,16 @@ Timeout per job. Jobs may take longer to complete under higher concurrency limit Output format specs in the format "[:]". Not specifying a path will default to stdout. Available formats: text, json. +### --scaletest-prometheus-address + +| | | +| ----------- | ------------------------------------------------ | +| Type | string | +| Environment | $CODER_SCALETEST_PROMETHEUS_ADDRESS | +| Default | 0.0.0.0:21112 | + +Address on which to expose scaletest prometheus metrics. + ### --tick-interval | | | diff --git a/scaletest/terraform/README.md b/scaletest/terraform/README.md index f5a2bc376d9c2..3933c6f8c4bcf 100644 --- a/scaletest/terraform/README.md +++ b/scaletest/terraform/README.md @@ -32,9 +32,12 @@ project_id = "some_google_project_id" 1. Run `coder_init.sh ` to setup an initial user and a pre-configured Kubernetes template. It will also download the Coder CLI from the Coder instance locally. -1. Do whatever you need to do with the Coder instance. +1. Do whatever you need to do with the Coder instance: - > To run Coder commands against the instance, you can use `coder_shim.sh `. + > Note: To run Coder commands against the instance, you can use `coder_shim.sh `. > You don't need to run `coder login` yourself. + - To create workspaces, run `./coder_shim.sh scaletest create-workspaces --template="kubernetes" --count=N` + - To generate workspace traffic, run `./coder_trafficgen.sh `. This will keep running until you delete the pod `coder-scaletest-workspace-traffic`. + 1. When you are finished, you can run `terraform destroy -var-file=override.tfvars`. 
diff --git a/scaletest/terraform/coder_workspacetraffic.sh b/scaletest/terraform/coder_workspacetraffic.sh index df528e8ce8446..b979ca04a8be5 100755 --- a/scaletest/terraform/coder_workspacetraffic.sh +++ b/scaletest/terraform/coder_workspacetraffic.sh @@ -13,6 +13,7 @@ fi LOADTEST_NAME="$1" CODER_TOKEN=$(./coder_shim.sh tokens create) CODER_URL="http://coder.coder-${LOADTEST_NAME}.svc.cluster.local" +export KUBECONFIG="${PWD}/.coderv2/${LOADTEST_NAME}-cluster.kubeconfig" cat < Date: Wed, 17 May 2023 16:23:51 +0100 Subject: [PATCH 10/19] Update scaletest/terraform/prometheus.tf --- scaletest/terraform/prometheus.tf | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/scaletest/terraform/prometheus.tf b/scaletest/terraform/prometheus.tf index c54e69e63cd1c..83816e39626b9 100644 --- a/scaletest/terraform/prometheus.tf +++ b/scaletest/terraform/prometheus.tf @@ -117,19 +117,6 @@ spec: podMetricsEndpoints: - port: prometheus-http interval: 30s ---- -apiVersion: monitoring.coreos.com/v1 -kind: PodMonitor -metadata: - namespace: ${kubernetes_namespace.coder_namespace.metadata.0.name} - name: coder-workspacetraffic-monitoring -spec: - selector: - matchLabels: - app.kubernetes.io/name: coder-scaletest-workspace-traffic - podMetricsEndpoints: - - port: prometheus-http - interval: 15s EOF } From d54af33c10b0747bf38e319f8f86281d87dc9c65 Mon Sep 17 00:00:00 2001 From: Cian Johnston Date: Wed, 24 May 2023 16:54:59 +0100 Subject: [PATCH 11/19] address PR comments --- cli/scaletest.go | 2 +- docs/cli/scaletest_workspace-traffic.md | 2 +- scaletest/workspacetraffic/metrics.go | 20 ++++++++++---------- scaletest/workspacetraffic/run.go | 4 ++-- 4 files changed, 14 insertions(+), 14 deletions(-) diff --git a/cli/scaletest.go b/cli/scaletest.go index 503139cc09524..033628385c53d 100644 --- a/cli/scaletest.go +++ b/cli/scaletest.go @@ -1056,7 +1056,7 @@ func (r *RootCmd) scaletestWorkspaceTraffic() *clibase.Cmd { Flag: "scaletest-prometheus-address", Env: 
"CODER_SCALETEST_PROMETHEUS_ADDRESS", Default: "0.0.0.0:21112", - Description: "Address on which to expose scaletest prometheus metrics.", + Description: "Address on which to expose scaletest Prometheus metrics.", Value: clibase.StringOf(&scaletestPrometheusAddress), }, } diff --git a/docs/cli/scaletest_workspace-traffic.md b/docs/cli/scaletest_workspace-traffic.md index dfde115fa8d5a..11c234129687b 100644 --- a/docs/cli/scaletest_workspace-traffic.md +++ b/docs/cli/scaletest_workspace-traffic.md @@ -90,7 +90,7 @@ Output format specs in the format "[:]". Not specifying a path wil | Environment | $CODER_SCALETEST_PROMETHEUS_ADDRESS | | Default | 0.0.0.0:21112 | -Address on which to expose scaletest prometheus metrics. +Address on which to expose scaletest Prometheus metrics. ### --tick-interval diff --git a/scaletest/workspacetraffic/metrics.go b/scaletest/workspacetraffic/metrics.go index a8e012da20c10..2d6ceb2efc13f 100644 --- a/scaletest/workspacetraffic/metrics.go +++ b/scaletest/workspacetraffic/metrics.go @@ -3,12 +3,12 @@ package workspacetraffic import "github.com/prometheus/client_golang/prometheus" type Metrics struct { - BytesRead prometheus.CounterVec - BytesWritten prometheus.CounterVec - Errors prometheus.CounterVec - ReadLatencyMS prometheus.HistogramVec - WriteLatencyMS prometheus.HistogramVec - LabelNames []string + BytesRead prometheus.CounterVec + BytesWritten prometheus.CounterVec + Errors prometheus.CounterVec + ReadLatencySeconds prometheus.HistogramVec + WriteLatencySeconds prometheus.HistogramVec + LabelNames []string } func NewMetrics(reg prometheus.Registerer, labelNames ...string) *Metrics { @@ -28,12 +28,12 @@ func NewMetrics(reg prometheus.Registerer, labelNames ...string) *Metrics { Subsystem: "scaletest", Name: "errors", }, labelNames), - ReadLatencyMS: *prometheus.NewHistogramVec(prometheus.HistogramOpts{ + ReadLatencySeconds: *prometheus.NewHistogramVec(prometheus.HistogramOpts{ Namespace: "coderd", Subsystem: "scaletest", Name: 
"read_latency_seconds", }, labelNames), - WriteLatencyMS: *prometheus.NewHistogramVec(prometheus.HistogramOpts{ + WriteLatencySeconds: *prometheus.NewHistogramVec(prometheus.HistogramOpts{ Namespace: "coderd", Subsystem: "scaletest", Name: "write_latency_seconds", @@ -43,7 +43,7 @@ func NewMetrics(reg prometheus.Registerer, labelNames ...string) *Metrics { reg.MustRegister(m.BytesRead) reg.MustRegister(m.BytesWritten) reg.MustRegister(m.Errors) - reg.MustRegister(m.ReadLatencyMS) - reg.MustRegister(m.WriteLatencyMS) + reg.MustRegister(m.ReadLatencySeconds) + reg.MustRegister(m.WriteLatencySeconds) return m } diff --git a/scaletest/workspacetraffic/run.go b/scaletest/workspacetraffic/run.go index 9f3bc8c532cb9..11037c28fa627 100644 --- a/scaletest/workspacetraffic/run.go +++ b/scaletest/workspacetraffic/run.go @@ -195,7 +195,7 @@ type countReadWriter struct { func (w *countReadWriter) Read(p []byte) (int, error) { start := time.Now() n, err := w.ReadWriter.Read(p) - w.metrics.ReadLatencyMS.WithLabelValues(w.labels...).Observe(time.Since(start).Seconds()) + w.metrics.ReadLatencySeconds.WithLabelValues(w.labels...).Observe(time.Since(start).Seconds()) if n > 0 { w.bytesRead.Add(int64(n)) w.metrics.BytesRead.WithLabelValues(w.labels...).Add(float64(n)) @@ -206,7 +206,7 @@ func (w *countReadWriter) Read(p []byte) (int, error) { func (w *countReadWriter) Write(p []byte) (int, error) { start := time.Now() n, err := w.ReadWriter.Write(p) - w.metrics.WriteLatencyMS.WithLabelValues(w.labels...).Observe(time.Since(start).Seconds()) + w.metrics.WriteLatencySeconds.WithLabelValues(w.labels...).Observe(time.Since(start).Seconds()) if n > 0 { w.bytesWritten.Add(int64(n)) w.metrics.BytesWritten.WithLabelValues(w.labels...).Add(float64(n)) From 8c48d2b4845abee661202cde656d909b4c516d1b Mon Sep 17 00:00:00 2001 From: Cian Johnston Date: Thu, 25 May 2023 15:17:51 +0100 Subject: [PATCH 12/19] move test to workspacetraffic package --- cli/scaletest_test.go | 60 ++----------- 
scaletest/workspacetraffic/run_test.go | 115 +++++++++++++++++++++++++ 2 files changed, 121 insertions(+), 54 deletions(-) create mode 100644 scaletest/workspacetraffic/run_test.go diff --git a/cli/scaletest_test.go b/cli/scaletest_test.go index b026e7636b0f3..db7588e8b22a2 100644 --- a/cli/scaletest_test.go +++ b/cli/scaletest_test.go @@ -8,17 +8,12 @@ import ( "path/filepath" "testing" - "github.com/google/uuid" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" - "github.com/coder/coder/agent" "github.com/coder/coder/cli/clitest" "github.com/coder/coder/coderd/coderdtest" "github.com/coder/coder/codersdk" - "github.com/coder/coder/codersdk/agentsdk" - "github.com/coder/coder/provisioner/echo" - "github.com/coder/coder/provisionersdk/proto" "github.com/coder/coder/pty/ptytest" "github.com/coder/coder/scaletest/harness" "github.com/coder/coder/testutil" @@ -205,70 +200,27 @@ param3: 1 }) } -// This test pretends to stand up a workspace and run a no-op traffic generation test. -// It's not a real test, but it's useful for debugging. -// We do not perform any cleanup. +// This test just validates that the CLI command accepts its known arguments. 
+// A more comprehensive test is performed in workspacetraffic/run_test.go func TestScaleTestWorkspaceTraffic(t *testing.T) { t.Parallel() ctx, cancelFunc := context.WithTimeout(context.Background(), testutil.WaitMedium) defer cancelFunc() - client := coderdtest.New(t, &coderdtest.Options{IncludeProvisionerDaemon: true}) - user := coderdtest.CreateFirstUser(t, client) - - authToken := uuid.NewString() - version := coderdtest.CreateTemplateVersion(t, client, user.OrganizationID, &echo.Responses{ - Parse: echo.ParseComplete, - ProvisionPlan: echo.ProvisionComplete, - ProvisionApply: []*proto.Provision_Response{{ - Type: &proto.Provision_Response_Complete{ - Complete: &proto.Provision_Complete{ - Resources: []*proto.Resource{{ - Name: "example", - Type: "aws_instance", - Agents: []*proto.Agent{{ - Id: uuid.NewString(), - Name: "agent", - Auth: &proto.Agent_Token{ - Token: authToken, - }, - Apps: []*proto.App{}, - }}, - }}, - }, - }, - }}, - }) - template := coderdtest.CreateTemplate(t, client, user.OrganizationID, version.ID) - coderdtest.AwaitTemplateVersionJob(t, client, version.ID) - - ws := coderdtest.CreateWorkspace(t, client, user.OrganizationID, template.ID, func(cwr *codersdk.CreateWorkspaceRequest) { - cwr.Name = "scaletest-test" - }) - coderdtest.AwaitWorkspaceBuildJob(t, client, ws.LatestBuild.ID) - - agentClient := agentsdk.New(client.URL) - agentClient.SetSessionToken(authToken) - agentCloser := agent.New(agent.Options{ - Client: agentClient, - }) - t.Cleanup(func() { - _ = agentCloser.Close() - }) - - coderdtest.AwaitWorkspaceAgents(t, client, ws.ID) + client := coderdtest.New(t, nil) + _ = coderdtest.CreateFirstUser(t, client) inv, root := clitest.New(t, "scaletest", "workspace-traffic", "--timeout", "1s", "--bytes-per-tick", "1024", "--tick-interval", "100ms", + "--scaletest-prometheus-address", "127.0.0.1:0", ) clitest.SetupConfig(t, client, root) var stdout, stderr bytes.Buffer inv.Stdout = &stdout inv.Stderr = &stderr err := 
inv.WithContext(ctx).Run() - require.NoError(t, err) - require.Contains(t, stdout.String(), "Pass: 1") + require.ErrorContains(t, err, "no scaletest workspaces exist") } diff --git a/scaletest/workspacetraffic/run_test.go b/scaletest/workspacetraffic/run_test.go new file mode 100644 index 0000000000000..8362f64d2907c --- /dev/null +++ b/scaletest/workspacetraffic/run_test.go @@ -0,0 +1,115 @@ +package workspacetraffic_test + +import ( + "context" + "strings" + "testing" + "time" + + "github.com/coder/coder/agent" + "github.com/coder/coder/coderd/coderdtest" + "github.com/coder/coder/codersdk" + "github.com/coder/coder/codersdk/agentsdk" + "github.com/coder/coder/provisioner/echo" + "github.com/coder/coder/provisionersdk/proto" + "github.com/coder/coder/scaletest/workspacetraffic" + "github.com/coder/coder/testutil" + + "github.com/google/uuid" + "github.com/prometheus/client_golang/prometheus" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestRun(t *testing.T) { + t.Parallel() + + // We need to stand up an in-memory coderd and run a fake workspace. 
+ var ( + client = coderdtest.New(t, &coderdtest.Options{IncludeProvisionerDaemon: true}) + user = coderdtest.CreateFirstUser(t, client) + authToken = uuid.NewString() + agentName = "agent" + version = coderdtest.CreateTemplateVersion(t, client, user.OrganizationID, &echo.Responses{ + Parse: echo.ParseComplete, + ProvisionPlan: echo.ProvisionComplete, + ProvisionApply: []*proto.Provision_Response{{ + Type: &proto.Provision_Response_Complete{ + Complete: &proto.Provision_Complete{ + Resources: []*proto.Resource{{ + Name: "example", + Type: "aws_instance", + Agents: []*proto.Agent{{ + // Agent ID gets generated no matter what we say ¯\_(ツ)_/¯ + Name: agentName, + Auth: &proto.Agent_Token{ + Token: authToken, + }, + Apps: []*proto.App{}, + }}, + }}, + }, + }, + }}, + }) + template = coderdtest.CreateTemplate(t, client, user.OrganizationID, version.ID) + _ = coderdtest.AwaitTemplateVersionJob(t, client, version.ID) + // In order to be picked up as a scaletest workspace, the workspace must be named specifically + ws = coderdtest.CreateWorkspace(t, client, user.OrganizationID, template.ID, func(cwr *codersdk.CreateWorkspaceRequest) { + cwr.Name = "scaletest-test" + }) + _ = coderdtest.AwaitWorkspaceBuildJob(t, client, ws.LatestBuild.ID) + ) + + // We also need a running agent to run this test. + agentClient := agentsdk.New(client.URL) + agentClient.SetSessionToken(authToken) + agentCloser := agent.New(agent.Options{ + Client: agentClient, + }) + ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitMedium) + + t.Cleanup(cancel) + t.Cleanup(func() { + _ = agentCloser.Close() + }) + // Make sure the agent is connected before we go any further. + resources := coderdtest.AwaitWorkspaceAgents(t, client, ws.ID) + var agentID uuid.UUID + for _, res := range resources { + for _, agt := range res.Agents { + agentID = agt.ID + } + } + require.NotEqual(t, uuid.Nil, agentID, "did not expect agentID to be nil") + + // Now we can start the runner. 
+ reg := prometheus.NewRegistry() + metrics := workspacetraffic.NewMetrics(reg, "username", "workspace_name", "agent_name") + runner := workspacetraffic.NewRunner(client, workspacetraffic.Config{ + AgentID: agentID, + AgentName: agentName, + WorkspaceName: ws.Name, + WorkspaceOwner: ws.OwnerName, + BytesPerTick: 1024, + TickInterval: testutil.IntervalMedium, + Duration: testutil.WaitMedium - time.Second, + Registry: reg, + }, metrics) + + var logs strings.Builder + require.NoError(t, runner.Run(ctx, "", &logs), "unexpected error calling Run()") + + var collected []prometheus.Metric + collectCh := make(chan prometheus.Metric) + go func() { + for metric := range collectCh { + collected = append(collected, metric) + } + }() + reg.Collect(collectCh) + assert.NotEmpty(t, collected) + for _, m := range collected { + assert.NotZero(t, m.Desc()) + } +} From e6917e6e2bb00fac2de3a4c80ec16ea41db57e68 Mon Sep 17 00:00:00 2001 From: Cian Johnston Date: Thu, 25 May 2023 15:26:02 +0100 Subject: [PATCH 13/19] update golden files --- cli/testdata/coder_scaletest_workspace-traffic_--help.golden | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cli/testdata/coder_scaletest_workspace-traffic_--help.golden b/cli/testdata/coder_scaletest_workspace-traffic_--help.golden index a59d5744e4a0c..55448f9a38d4d 100644 --- a/cli/testdata/coder_scaletest_workspace-traffic_--help.golden +++ b/cli/testdata/coder_scaletest_workspace-traffic_--help.golden @@ -28,7 +28,7 @@ Generate traffic to scaletest workspaces through coderd a path will default to stdout. Available formats: text, json. --scaletest-prometheus-address string, $CODER_SCALETEST_PROMETHEUS_ADDRESS (default: 0.0.0.0:21112) - Address on which to expose scaletest prometheus metrics. + Address on which to expose scaletest Prometheus metrics. --tick-interval duration, $CODER_SCALETEST_WORKSPACE_TRAFFIC_TICK_INTERVAL (default: 100ms) How often to send traffic. 
From ea71c4f387e85340a058cd389d37ae5cdd81e812 Mon Sep 17 00:00:00 2001 From: Cian Johnston Date: Thu, 25 May 2023 15:27:00 +0100 Subject: [PATCH 14/19] separate prom address var --- cli/scaletest.go | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/cli/scaletest.go b/cli/scaletest.go index 033628385c53d..feb603ad1a63b 100644 --- a/cli/scaletest.go +++ b/cli/scaletest.go @@ -904,11 +904,12 @@ func (r *RootCmd) scaletestWorkspaceTraffic() *clibase.Cmd { tickInterval time.Duration bytesPerTick int64 scaletestPrometheusAddress string - client = &codersdk.Client{} - tracingFlags = &scaletestTracingFlags{} - strategy = &scaletestStrategyFlags{} - cleanupStrategy = &scaletestStrategyFlags{cleanup: true} - output = &scaletestOutputFlags{} + + client = &codersdk.Client{} + tracingFlags = &scaletestTracingFlags{} + strategy = &scaletestStrategyFlags{} + cleanupStrategy = &scaletestStrategyFlags{cleanup: true} + output = &scaletestOutputFlags{} ) cmd := &clibase.Cmd{ From 399e506da727fc6dc73b6e918d023a7d15a2bbc3 Mon Sep 17 00:00:00 2001 From: Cian Johnston Date: Fri, 26 May 2023 12:09:04 +0100 Subject: [PATCH 15/19] break out errors in to read/write --- scaletest/workspacetraffic/metrics.go | 31 ++++++++++++++++----------- 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/scaletest/workspacetraffic/metrics.go b/scaletest/workspacetraffic/metrics.go index 2d6ceb2efc13f..ce9fdc6caf028 100644 --- a/scaletest/workspacetraffic/metrics.go +++ b/scaletest/workspacetraffic/metrics.go @@ -3,9 +3,10 @@ package workspacetraffic import "github.com/prometheus/client_golang/prometheus" type Metrics struct { - BytesRead prometheus.CounterVec - BytesWritten prometheus.CounterVec - Errors prometheus.CounterVec + BytesReadTotal prometheus.CounterVec + BytesWrittenTotal prometheus.CounterVec + ReadErrorsTotal prometheus.CounterVec + WriteErrorsTotal prometheus.CounterVec ReadLatencySeconds prometheus.HistogramVec WriteLatencySeconds 
prometheus.HistogramVec LabelNames []string @@ -13,20 +14,25 @@ type Metrics struct { func NewMetrics(reg prometheus.Registerer, labelNames ...string) *Metrics { m := &Metrics{ - BytesRead: *prometheus.NewCounterVec(prometheus.CounterOpts{ + BytesReadTotal: *prometheus.NewCounterVec(prometheus.CounterOpts{ Namespace: "coderd", Subsystem: "scaletest", - Name: "bytes_read", + Name: "bytes_read_total", }, labelNames), - BytesWritten: *prometheus.NewCounterVec(prometheus.CounterOpts{ + BytesWrittenTotal: *prometheus.NewCounterVec(prometheus.CounterOpts{ Namespace: "coderd", Subsystem: "scaletest", - Name: "bytes_written", + Name: "bytes_written_total", }, labelNames), - Errors: *prometheus.NewCounterVec(prometheus.CounterOpts{ + ReadErrorsTotal: *prometheus.NewCounterVec(prometheus.CounterOpts{ Namespace: "coderd", Subsystem: "scaletest", - Name: "errors", + Name: "read_errors_total", + }, labelNames), + WriteErrorsTotal: *prometheus.NewCounterVec(prometheus.CounterOpts{ + Namespace: "coderd", + Subsystem: "scaletest", + Name: "write_errors_total", }, labelNames), ReadLatencySeconds: *prometheus.NewHistogramVec(prometheus.HistogramOpts{ Namespace: "coderd", @@ -40,9 +46,10 @@ func NewMetrics(reg prometheus.Registerer, labelNames ...string) *Metrics { }, labelNames), } - reg.MustRegister(m.BytesRead) - reg.MustRegister(m.BytesWritten) - reg.MustRegister(m.Errors) + reg.MustRegister(m.BytesReadTotal) + reg.MustRegister(m.BytesWrittenTotal) + reg.MustRegister(m.ReadErrorsTotal) + reg.MustRegister(m.WriteErrorsTotal) reg.MustRegister(m.ReadLatencySeconds) reg.MustRegister(m.WriteLatencySeconds) return m From ec96a0016b9af15ff8a03092cf63b78fea7cfca7 Mon Sep 17 00:00:00 2001 From: Cian Johnston Date: Fri, 26 May 2023 12:09:42 +0100 Subject: [PATCH 16/19] ignore certain errors such as websocket closing or context cancellation --- scaletest/workspacetraffic/run.go | 67 +++++++++++++++++++++---------- 1 file changed, 45 insertions(+), 22 deletions(-) diff --git 
a/scaletest/workspacetraffic/run.go b/scaletest/workspacetraffic/run.go index 11037c28fa627..866a93a72c5f1 100644 --- a/scaletest/workspacetraffic/run.go +++ b/scaletest/workspacetraffic/run.go @@ -3,8 +3,8 @@ package workspacetraffic import ( "context" "encoding/json" + "errors" "io" - "sync/atomic" "time" "github.com/google/uuid" @@ -19,6 +19,8 @@ import ( "github.com/coder/coder/cryptorand" "github.com/coder/coder/scaletest/harness" "github.com/coder/coder/scaletest/loadtestutil" + + promtest "github.com/prometheus/client_golang/prometheus/testutil" ) type Runner struct { @@ -49,6 +51,16 @@ func (r *Runner) Run(ctx context.Context, _ string, logs io.Writer) error { r.client.Logger = logger r.client.LogBodies = true + // Initialize our metrics eagerly. This is mainly so that we can test for the + // presence of a zero-valued metric as opposed to the absence of a metric. + lvs := []string{r.cfg.WorkspaceOwner, r.cfg.WorkspaceName, r.cfg.AgentName} + r.metrics.BytesReadTotal.WithLabelValues(lvs...).Add(0) + r.metrics.BytesWrittenTotal.WithLabelValues(lvs...).Add(0) + r.metrics.ReadErrorsTotal.WithLabelValues(lvs...).Add(0) + r.metrics.WriteErrorsTotal.WithLabelValues(lvs...).Add(0) + r.metrics.ReadLatencySeconds.WithLabelValues(lvs...).Observe(0) + r.metrics.WriteLatencySeconds.WithLabelValues(lvs...).Observe(0) + var ( agentID = r.cfg.AgentID reconnect = uuid.New() @@ -92,7 +104,7 @@ func (r *Runner) Run(ctx context.Context, _ string, logs io.Writer) error { }() // Wrap the conn in a countReadWriter so we can monitor bytes sent/rcvd. - crw := countReadWriter{ReadWriter: conn, metrics: r.metrics, labels: []string{r.cfg.WorkspaceOwner, r.cfg.WorkspaceName, r.cfg.AgentName}} + crw := countReadWriter{ReadWriter: conn, metrics: r.metrics, labels: lvs} // Create a ticker for sending data to the PTY. 
tick := time.NewTicker(tickInterval) @@ -133,11 +145,12 @@ func (r *Runner) Run(ctx context.Context, _ string, logs io.Writer) error { } duration := time.Since(start) - - logger.Info(ctx, "results", + logger.Info(ctx, "Test Results", slog.F("duration", duration), - slog.F("sent", crw.BytesWritten()), - slog.F("rcvd", crw.BytesRead()), + slog.F("bytes_read_total", promtest.ToFloat64(r.metrics.BytesReadTotal.WithLabelValues(lvs...))), + slog.F("bytes_written_total", promtest.ToFloat64(r.metrics.BytesWrittenTotal.WithLabelValues(lvs...))), + slog.F("read_errors_total", promtest.ToFloat64(r.metrics.ReadErrorsTotal.WithLabelValues(lvs...))), + slog.F("write_errors_total", promtest.ToFloat64(r.metrics.WriteErrorsTotal.WithLabelValues(lvs...))), ) return nil @@ -186,19 +199,19 @@ func writeRandomData(dst io.Writer, size int64, tick <-chan time.Time) error { // countReadWriter wraps an io.ReadWriter and counts the number of bytes read and written. type countReadWriter struct { io.ReadWriter - bytesRead atomic.Int64 - bytesWritten atomic.Int64 - metrics *Metrics - labels []string + metrics *Metrics + labels []string } func (w *countReadWriter) Read(p []byte) (int, error) { start := time.Now() n, err := w.ReadWriter.Read(p) + if reportableErr(err) { + w.metrics.ReadErrorsTotal.WithLabelValues(w.labels...).Inc() + } w.metrics.ReadLatencySeconds.WithLabelValues(w.labels...).Observe(time.Since(start).Seconds()) if n > 0 { - w.bytesRead.Add(int64(n)) - w.metrics.BytesRead.WithLabelValues(w.labels...).Add(float64(n)) + w.metrics.BytesReadTotal.WithLabelValues(w.labels...).Add(float64(n)) } return n, err } @@ -206,22 +219,16 @@ func (w *countReadWriter) Read(p []byte) (int, error) { func (w *countReadWriter) Write(p []byte) (int, error) { start := time.Now() n, err := w.ReadWriter.Write(p) + if reportableErr(err) { + w.metrics.WriteErrorsTotal.WithLabelValues(w.labels...).Inc() + } w.metrics.WriteLatencySeconds.WithLabelValues(w.labels...).Observe(time.Since(start).Seconds()) if n > 0 { - w.bytesWritten.Add(int64(n)) - 
w.metrics.BytesWritten.WithLabelValues(w.labels...).Add(float64(n)) + w.metrics.BytesWrittenTotal.WithLabelValues(w.labels...).Add(float64(n)) } return n, err } -func (w *countReadWriter) BytesRead() int64 { - return w.bytesRead.Load() -} - -func (w *countReadWriter) BytesWritten() int64 { - return w.bytesWritten.Load() -} - func mustRandStr(l int64) string { if l < 1 { l = 1 @@ -232,3 +239,19 @@ func mustRandStr(l int64) string { } return randStr } + +// some errors we want to report in metrics; others we want to ignore +// such as websocket.StatusNormalClosure or context.Canceled +func reportableErr(err error) bool { + if err == nil { + return false + } + if xerrors.Is(err, context.Canceled) { + return false + } + var wsErr websocket.CloseError + if errors.As(err, &wsErr) { + return wsErr.Code != websocket.StatusNormalClosure + } + return false +} From a4ecd0c248664ffe1d6248006f9f328d643990a5 Mon Sep 17 00:00:00 2001 From: Cian Johnston Date: Fri, 26 May 2023 12:10:07 +0100 Subject: [PATCH 17/19] test metrics more closely --- scaletest/workspacetraffic/run_test.go | 95 +++++++++++++++++++++----- 1 file changed, 78 insertions(+), 17 deletions(-) diff --git a/scaletest/workspacetraffic/run_test.go b/scaletest/workspacetraffic/run_test.go index 8362f64d2907c..c5e3dd246fc03 100644 --- a/scaletest/workspacetraffic/run_test.go +++ b/scaletest/workspacetraffic/run_test.go @@ -17,6 +17,7 @@ import ( "github.com/google/uuid" "github.com/prometheus/client_golang/prometheus" + dto "github.com/prometheus/client_model/go" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" ) @@ -27,10 +28,10 @@ func TestRun(t *testing.T) { // We need to stand up an in-memory coderd and run a fake workspace. 
var ( client = coderdtest.New(t, &coderdtest.Options{IncludeProvisionerDaemon: true}) - user = coderdtest.CreateFirstUser(t, client) + firstUser = coderdtest.CreateFirstUser(t, client) authToken = uuid.NewString() agentName = "agent" - version = coderdtest.CreateTemplateVersion(t, client, user.OrganizationID, &echo.Responses{ + version = coderdtest.CreateTemplateVersion(t, client, firstUser.OrganizationID, &echo.Responses{ Parse: echo.ParseComplete, ProvisionPlan: echo.ProvisionComplete, ProvisionApply: []*proto.Provision_Response{{ @@ -52,10 +53,10 @@ func TestRun(t *testing.T) { }, }}, }) - template = coderdtest.CreateTemplate(t, client, user.OrganizationID, version.ID) + template = coderdtest.CreateTemplate(t, client, firstUser.OrganizationID, version.ID) _ = coderdtest.AwaitTemplateVersionJob(t, client, version.ID) // In order to be picked up as a scaletest workspace, the workspace must be named specifically - ws = coderdtest.CreateWorkspace(t, client, user.OrganizationID, template.ID, func(cwr *codersdk.CreateWorkspaceRequest) { + ws = coderdtest.CreateWorkspace(t, client, firstUser.OrganizationID, template.ID, func(cwr *codersdk.CreateWorkspaceRequest) { cwr.Name = "scaletest-test" }) _ = coderdtest.AwaitWorkspaceBuildJob(t, client, ws.LatestBuild.ID) @@ -67,12 +68,15 @@ func TestRun(t *testing.T) { agentCloser := agent.New(agent.Options{ Client: agentClient, }) - ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitMedium) - + ctx, cancel := context.WithCancel(context.Background()) t.Cleanup(cancel) t.Cleanup(func() { _ = agentCloser.Close() }) + // We actually need to know the full user and not just the UserID / OrgID + user, err := client.User(ctx, firstUser.UserID.String()) + require.NoError(t, err, "get first user") + // Make sure the agent is connected before we go any further. 
resources := coderdtest.AwaitWorkspaceAgents(t, client, ws.ID) var agentID uuid.UUID @@ -84,6 +88,13 @@ func TestRun(t *testing.T) { require.NotEqual(t, uuid.Nil, agentID, "did not expect agentID to be nil") // Now we can start the runner. + var ( + bytesPerTick = 1024 + tickInterval = 1000 * time.Millisecond + cancelAfter = 1500 * time.Millisecond + fudgeRead = 2 // We also read some newlines + fudgeWrite = 12 // The ReconnectingPTY payload incurs some overhead + ) reg := prometheus.NewRegistry() metrics := workspacetraffic.NewMetrics(reg, "username", "workspace_name", "agent_name") runner := workspacetraffic.NewRunner(client, workspacetraffic.Config{ @@ -91,25 +102,75 @@ func TestRun(t *testing.T) { AgentName: agentName, WorkspaceName: ws.Name, WorkspaceOwner: ws.OwnerName, - BytesPerTick: 1024, - TickInterval: testutil.IntervalMedium, - Duration: testutil.WaitMedium - time.Second, + BytesPerTick: int64(bytesPerTick), + TickInterval: tickInterval, + Duration: testutil.WaitLong, Registry: reg, }, metrics) var logs strings.Builder + // Stop the test after one 'tick'. This will cause an EOF. + go func() { + <-time.After(cancelAfter) + cancel() + }() require.NoError(t, runner.Run(ctx, "", &logs), "unexpected error calling Run()") - var collected []prometheus.Metric - collectCh := make(chan prometheus.Metric) + // We want to ensure the metrics are somewhat accurate. + lvs := []string{user.Username, ws.Name, agentName} + assert.InDelta(t, bytesPerTick+fudgeWrite, toFloat64(t, metrics.BytesWrittenTotal.WithLabelValues(lvs...)), 0.1) + assert.InDelta(t, bytesPerTick+fudgeRead, toFloat64(t, metrics.BytesReadTotal.WithLabelValues(lvs...)), 0.1) + // Latency should report non-zero values. + assert.NotZero(t, toFloat64(t, metrics.ReadLatencySeconds)) + assert.NotZero(t, toFloat64(t, metrics.WriteLatencySeconds)) + // Should not report any errors! 
+ assert.Zero(t, toFloat64(t, metrics.ReadErrorsTotal.WithLabelValues(lvs...))) + assert.Zero(t, toFloat64(t, metrics.WriteErrorsTotal.WithLabelValues(lvs...))) +} + +// toFloat64 version of Prometheus' testutil.ToFloat64 that integrates with +// github.com/stretchr/testify/require and handles histograms (somewhat) +func toFloat64(t testing.TB, c prometheus.Collector) float64 { + var ( + m prometheus.Metric + mCount int + mChan = make(chan prometheus.Metric) + done = make(chan struct{}) + ) + go func() { - for metric := range collectCh { - collected = append(collected, metric) + for m = range mChan { + mCount++ } + close(done) }() - reg.Collect(collectCh) - assert.NotEmpty(t, collected) - for _, m := range collected { - assert.NotZero(t, m.Desc()) + + c.Collect(mChan) + close(mChan) + <-done + + require.Equal(t, 1, mCount, "expected exactly 1 metric but got %d", mCount) + + pb := &dto.Metric{} + require.NoError(t, m.Write(pb), "unexpected error collecting metrics") + + if pb.Gauge != nil { + return pb.Gauge.GetValue() + } + if pb.Counter != nil { + return pb.Counter.GetValue() + } + if pb.Untyped != nil { + return pb.Untyped.GetValue() + } + if pb.Histogram != nil { + // If no samples, just return zero. + if pb.Histogram.GetSampleCount() == 0 { + return 0 + } + // Average is sufficient for testing purposes. + return pb.Histogram.GetSampleSum() / float64(pb.Histogram.GetSampleCount()) } + require.Failf(t, "unsupported metric type", "collected a non-gauge/counter/untyped/histogram metric: %s", pb) + return 0 } From fe0ecfc847a8a8940f67fc3b091b09c2ab87bcca Mon Sep 17 00:00:00 2001 From: Cian Johnston Date: Fri, 26 May 2023 12:25:58 +0100 Subject: [PATCH 18/19] fixup! 
test metrics more closely --- scaletest/workspacetraffic/run_test.go | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/scaletest/workspacetraffic/run_test.go b/scaletest/workspacetraffic/run_test.go index c5e3dd246fc03..e53d408bcd428 100644 --- a/scaletest/workspacetraffic/run_test.go +++ b/scaletest/workspacetraffic/run_test.go @@ -92,7 +92,6 @@ func TestRun(t *testing.T) { bytesPerTick = 1024 tickInterval = 1000 * time.Millisecond cancelAfter = 1500 * time.Millisecond - fudgeRead = 2 // We also read some newlines fudgeWrite = 12 // The ReconnectingPTY payload incurs some overhead ) reg := prometheus.NewRegistry() @@ -119,7 +118,9 @@ func TestRun(t *testing.T) { // We want to ensure the metrics are somewhat accurate. lvs := []string{user.Username, ws.Name, agentName} assert.InDelta(t, bytesPerTick+fudgeWrite, toFloat64(t, metrics.BytesWrittenTotal.WithLabelValues(lvs...)), 0.1) - assert.InDelta(t, bytesPerTick+fudgeRead, toFloat64(t, metrics.BytesReadTotal.WithLabelValues(lvs...)), 0.1) + // Read is highly variable, depending on how far we read before stopping. + // Just ensure it's not zero. + assert.NotZero(t, toFloat64(t, metrics.BytesReadTotal.WithLabelValues(lvs...))) // Latency should report non-zero values. 
assert.NotZero(t, toFloat64(t, metrics.ReadLatencySeconds)) assert.NotZero(t, toFloat64(t, metrics.WriteLatencySeconds)) From 239ba960706431261e1bd6f24b6f2f97a5b1d64c Mon Sep 17 00:00:00 2001 From: Cian Johnston Date: Fri, 26 May 2023 13:04:27 +0100 Subject: [PATCH 19/19] add wait for prometheus metrics --- cli/scaletest.go | 11 +++++++++++ cli/scaletest_test.go | 1 + .../coder_scaletest_workspace-traffic_--help.golden | 4 ++++ docs/cli/scaletest_workspace-traffic.md | 10 ++++++++++ 4 files changed, 26 insertions(+) diff --git a/cli/scaletest.go b/cli/scaletest.go index feb603ad1a63b..0977ab0f7028b 100644 --- a/cli/scaletest.go +++ b/cli/scaletest.go @@ -904,6 +904,7 @@ func (r *RootCmd) scaletestWorkspaceTraffic() *clibase.Cmd { tickInterval time.Duration bytesPerTick int64 scaletestPrometheusAddress string + scaletestPrometheusWait time.Duration client = &codersdk.Client{} tracingFlags = &scaletestTracingFlags{} @@ -956,6 +957,9 @@ func (r *RootCmd) scaletestWorkspaceTraffic() *clibase.Cmd { _, _ = fmt.Fprintln(inv.Stderr, "\nUploading traces...") if err := closeTracing(ctx); err != nil { _, _ = fmt.Fprintf(inv.Stderr, "\nError uploading traces: %+v\n", err) } + // Wait for prometheus metrics to be scraped + _, _ = fmt.Fprintf(inv.Stderr, "Waiting %s for prometheus metrics to be scraped\n", scaletestPrometheusWait) + <-time.After(scaletestPrometheusWait) }() tracer := tracerProvider.Tracer(scaletestTracerName) @@ -1060,6 +1064,13 @@ func (r *RootCmd) scaletestWorkspaceTraffic() *clibase.Cmd { Description: "Address on which to expose scaletest Prometheus metrics.", Value: clibase.StringOf(&scaletestPrometheusAddress), }, + { + Flag: "scaletest-prometheus-wait", + Env: "CODER_SCALETEST_PROMETHEUS_WAIT", + Default: "5s", + Description: "How long to wait before exiting in order to allow Prometheus metrics to be scraped.", + Value: clibase.DurationOf(&scaletestPrometheusWait), + }, } tracingFlags.attach(&cmd.Options) diff --git a/cli/scaletest_test.go 
b/cli/scaletest_test.go index db7588e8b22a2..b1473b64c990b 100644 --- a/cli/scaletest_test.go +++ b/cli/scaletest_test.go @@ -216,6 +216,7 @@ func TestScaleTestWorkspaceTraffic(t *testing.T) { "--bytes-per-tick", "1024", "--tick-interval", "100ms", "--scaletest-prometheus-address", "127.0.0.1:0", + "--scaletest-prometheus-wait", "0s", ) clitest.SetupConfig(t, client, root) var stdout, stderr bytes.Buffer diff --git a/cli/testdata/coder_scaletest_workspace-traffic_--help.golden b/cli/testdata/coder_scaletest_workspace-traffic_--help.golden index 55448f9a38d4d..04f7688937516 100644 --- a/cli/testdata/coder_scaletest_workspace-traffic_--help.golden +++ b/cli/testdata/coder_scaletest_workspace-traffic_--help.golden @@ -30,6 +30,10 @@ Generate traffic to scaletest workspaces through coderd --scaletest-prometheus-address string, $CODER_SCALETEST_PROMETHEUS_ADDRESS (default: 0.0.0.0:21112) Address on which to expose scaletest Prometheus metrics. + --scaletest-prometheus-wait duration, $CODER_SCALETEST_PROMETHEUS_WAIT (default: 5s) + How long to wait before exiting in order to allow Prometheus metrics + to be scraped. + --tick-interval duration, $CODER_SCALETEST_WORKSPACE_TRAFFIC_TICK_INTERVAL (default: 100ms) How often to send traffic. diff --git a/docs/cli/scaletest_workspace-traffic.md b/docs/cli/scaletest_workspace-traffic.md index 11c234129687b..399885f0c1002 100644 --- a/docs/cli/scaletest_workspace-traffic.md +++ b/docs/cli/scaletest_workspace-traffic.md @@ -92,6 +92,16 @@ Output format specs in the format "[:]". Not specifying a path wil Address on which to expose scaletest Prometheus metrics. +### --scaletest-prometheus-wait + +| | | +| ----------- | --------------------------------------------- | +| Type | duration | +| Environment | $CODER_SCALETEST_PROMETHEUS_WAIT | +| Default | 5s | + +How long to wait before exiting in order to allow Prometheus metrics to be scraped. + ### --tick-interval | | |