From 43fe44db509e1aa3944d93130dbfc6ad7e5bda7a Mon Sep 17 00:00:00 2001 From: Ethan <39577870+ethanndickson@users.noreply.github.com> Date: Thu, 28 Aug 2025 19:07:50 +1000 Subject: [PATCH 1/6] chore: delete scaletest infrastructure (#19603) We've successfully migrated the latest iteration of our scaletest infrastructure (`scaletest/terraform/action`) to https://github.com/coder/scaletest (private repo). This PR removes the older iterations, and the scripts for spinning up & running the load generators against that infrastructure (`scaletest.sh`). The tooling for generating load against a Coder deployment remains untouched, as does the public documentation for that tooling (i.e. `coder exp scaletest`). If we ever need that old scaletest Terraform code, it's always in the git history! --- docs/admin/infrastructure/scale-utility.md | 2 +- scaletest/README.md | 109 ----- scaletest/scaletest.sh | 240 ----------- scaletest/terraform/action/.gitignore | 1 - scaletest/terraform/action/cf_dns.tf | 21 - .../terraform/action/coder_helm_values.tftpl | 120 ------ scaletest/terraform/action/coder_proxies.tf | 102 ----- scaletest/terraform/action/coder_templates.tf | 340 ---------------- scaletest/terraform/action/coder_traffic.tf | 228 ----------- .../terraform/action/coder_workspaces.tf | 180 --------- scaletest/terraform/action/gcp_clusters.tf | 162 -------- scaletest/terraform/action/gcp_db.tf | 89 ----- scaletest/terraform/action/gcp_project.tf | 27 -- scaletest/terraform/action/gcp_vpc.tf | 154 ------- scaletest/terraform/action/k8s_coder_asia.tf | 131 ------ .../terraform/action/k8s_coder_europe.tf | 131 ------ .../terraform/action/k8s_coder_primary.tf | 160 -------- scaletest/terraform/action/kubeconfig.tftpl | 17 - scaletest/terraform/action/main.tf | 141 ------- scaletest/terraform/action/prometheus.tf | 174 -------- .../action/prometheus_helm_values.tftpl | 38 -- scaletest/terraform/action/scenarios.tf | 74 ---- scaletest/terraform/action/tls.tf | 13 - 
scaletest/terraform/action/vars.tf | 112 ------ scaletest/terraform/infra/gcp_cluster.tf | 186 --------- scaletest/terraform/infra/gcp_db.tf | 88 ---- scaletest/terraform/infra/gcp_project.tf | 27 -- scaletest/terraform/infra/gcp_vpc.tf | 39 -- scaletest/terraform/infra/main.tf | 20 - scaletest/terraform/infra/outputs.tf | 73 ---- scaletest/terraform/infra/vars.tf | 107 ----- scaletest/terraform/k8s/cert-manager.tf | 67 ---- scaletest/terraform/k8s/coder.tf | 375 ------------------ scaletest/terraform/k8s/main.tf | 35 -- scaletest/terraform/k8s/otel.tf | 69 ---- scaletest/terraform/k8s/prometheus.tf | 173 -------- scaletest/terraform/k8s/vars.tf | 219 ---------- scaletest/terraform/scenario-large.tfvars | 9 - scaletest/terraform/scenario-medium.tfvars | 7 - scaletest/terraform/scenario-small.tfvars | 6 - scaletest/terraform/secrets.tfvars.tpl | 4 - 41 files changed, 1 insertion(+), 4269 deletions(-) delete mode 100644 scaletest/README.md delete mode 100755 scaletest/scaletest.sh delete mode 100644 scaletest/terraform/action/.gitignore delete mode 100644 scaletest/terraform/action/cf_dns.tf delete mode 100644 scaletest/terraform/action/coder_helm_values.tftpl delete mode 100644 scaletest/terraform/action/coder_proxies.tf delete mode 100644 scaletest/terraform/action/coder_templates.tf delete mode 100644 scaletest/terraform/action/coder_traffic.tf delete mode 100644 scaletest/terraform/action/coder_workspaces.tf delete mode 100644 scaletest/terraform/action/gcp_clusters.tf delete mode 100644 scaletest/terraform/action/gcp_db.tf delete mode 100644 scaletest/terraform/action/gcp_project.tf delete mode 100644 scaletest/terraform/action/gcp_vpc.tf delete mode 100644 scaletest/terraform/action/k8s_coder_asia.tf delete mode 100644 scaletest/terraform/action/k8s_coder_europe.tf delete mode 100644 scaletest/terraform/action/k8s_coder_primary.tf delete mode 100644 scaletest/terraform/action/kubeconfig.tftpl delete mode 100644 scaletest/terraform/action/main.tf delete mode 
100644 scaletest/terraform/action/prometheus.tf delete mode 100644 scaletest/terraform/action/prometheus_helm_values.tftpl delete mode 100644 scaletest/terraform/action/scenarios.tf delete mode 100644 scaletest/terraform/action/tls.tf delete mode 100644 scaletest/terraform/action/vars.tf delete mode 100644 scaletest/terraform/infra/gcp_cluster.tf delete mode 100644 scaletest/terraform/infra/gcp_db.tf delete mode 100644 scaletest/terraform/infra/gcp_project.tf delete mode 100644 scaletest/terraform/infra/gcp_vpc.tf delete mode 100644 scaletest/terraform/infra/main.tf delete mode 100644 scaletest/terraform/infra/outputs.tf delete mode 100644 scaletest/terraform/infra/vars.tf delete mode 100644 scaletest/terraform/k8s/cert-manager.tf delete mode 100644 scaletest/terraform/k8s/coder.tf delete mode 100644 scaletest/terraform/k8s/main.tf delete mode 100644 scaletest/terraform/k8s/otel.tf delete mode 100644 scaletest/terraform/k8s/prometheus.tf delete mode 100644 scaletest/terraform/k8s/vars.tf delete mode 100644 scaletest/terraform/scenario-large.tfvars delete mode 100644 scaletest/terraform/scenario-medium.tfvars delete mode 100644 scaletest/terraform/scenario-small.tfvars delete mode 100644 scaletest/terraform/secrets.tfvars.tpl diff --git a/docs/admin/infrastructure/scale-utility.md b/docs/admin/infrastructure/scale-utility.md index b66e7fca41394..6945b54bf559e 100644 --- a/docs/admin/infrastructure/scale-utility.md +++ b/docs/admin/infrastructure/scale-utility.md @@ -44,7 +44,7 @@ environments. > for your users. > To avoid potential outages and orphaned resources, we recommend that you run > scale tests on a secondary "staging" environment or a dedicated -> [Kubernetes playground cluster](https://github.com/coder/coder/tree/main/scaletest/terraform). +> Kubernetes playground cluster. > > Run it against a production environment at your own risk. 
diff --git a/scaletest/README.md b/scaletest/README.md deleted file mode 100644 index 9fa475ae29ab5..0000000000000 --- a/scaletest/README.md +++ /dev/null @@ -1,109 +0,0 @@ -# Scale Testing - -This folder contains CLI commands, Terraform code, and scripts to aid in performing load tests of Coder. -At a high level, it performs the following steps: - -- Using the Terraform code in `./terraform`, stands up a preconfigured Google Cloud environment - consisting of a VPC, GKE Cluster, and CloudSQL instance. - > **Note: You must have an existing Google Cloud project available.** -- Creates a dedicated namespace for Coder and installs Coder using the Helm chart in this namespace. -- Configures the Coder deployment with random credentials and a predefined Kubernetes template. - > **Note:** These credentials are stored in `${PROJECT_ROOT}/scaletest/.coderv2/coder.env`. -- Creates a number of workspaces and waits for them to all start successfully. These workspaces - are ephemeral and do not contain any persistent resources. -- Waits for 10 minutes to allow things to settle and establish a baseline. -- Generates web terminal traffic to all workspaces for 30 minutes. -- Directly after traffic generation, captures goroutine and heap snapshots of the Coder deployment. -- Tears down all resources (unless `--skip-cleanup` is specified). - -## Usage - -The main entrypoint is the `scaletest.sh` script. - -```console -$ scaletest.sh --help -Usage: scaletest.sh --name --project --num-workspaces --scenario [--dry-run] [--skip-cleanup] -``` - -### Required arguments - -- `--name`: Name for the loadtest. This is added as a prefix to resources created by Terraform (e.g. `joe-big-loadtest`). -- `--project`: Google Cloud project in which to create the resources (example: `my-loadtest-project`). -- `--num-workspaces`: Number of workspaces to create (example: `10`). -- `--scenario`: Deployment scenario to use (example: `small`). See `terraform/scenario-*.tfvars`. 
- -> **Note:** In order to capture Prometheus metrics, you must define the environment variables -> `SCALETEST_PROMETHEUS_REMOTE_WRITE_USER` and `SCALETEST_PROMETHEUS_REMOTE_WRITE_PASSWORD`. - -### Optional arguments - -- `--dry-run`: Do not perform any action and instead print what would be executed. -- `--skip-cleanup`: Do not perform any cleanup. You will be responsible for deleting any resources this creates. - -### Environment Variables - -All of the above arguments may be specified as environment variables. Consult the script for details. - -### Prometheus Metrics - -To capture Prometheus metrics from the loadtest, two environment variables are required: - -- `SCALETEST_PROMETHEUS_REMOTE_WRITE_USER` -- `SCALETEST_PROMETHEUS_REMOTE_WRITE_PASSWORD` - -### Enterprise License - -To add an Enterprise license, set the `SCALETEST_CODER_LICENSE` environment variable to the JWT string - -## Scenarios - -A scenario defines a number of variables that override the default Terraform variables. -A number of existing scenarios are provided in `scaletest/terraform/scenario-*.tfvars`. - -For example, `scenario-small.tfvars` includes the following variable definitions: - -```hcl -nodepool_machine_type_coder = "t2d-standard-2" -nodepool_machine_type_workspaces = "t2d-standard-2" -coder_cpu = "1000m" # Leaving 1 CPU for system workloads -coder_mem = "4Gi" # Leaving 4GB for system workloads -``` - -To create your own scenario, simply add a new file `terraform/scenario-$SCENARIO_NAME.tfvars`. -In this file, override variables as required, consulting `vars.tf` as needed. -You can then use this scenario by specifying `--scenario $SCENARIO_NAME`. -For example, if your scenario file were named `scenario-big-whopper2x.tfvars`, you would specify -`--scenario=big-whopper2x`. - -## Utility scripts - -A number of utility scripts are provided in `lib`, and are used by `scaletest.sh`: - -- `coder_shim.sh`: a convenience script to run the `coder` binary with a predefined config root. 
- This is intended to allow running Coder CLI commands against the loadtest cluster without - modifying a user's existing Coder CLI configuration. -- `coder_init.sh`: Performs first-time user setup of an existing Coder instance, generating - a random password for the admin user. The admin user is named `admin@coder.com` by default. - Credentials are written to `scaletest/.coderv2/coder.env`. -- `coder_workspacetraffic.sh`: Runs traffic generation against the loadtest cluster and creates - a monitoring manifest for the traffic generation pod. This pod will restart automatically - after the traffic generation has completed. - -## Grafana Dashboard - -A sample Grafana dashboard is provided in `scaletest_dashboard.json`. This dashboard is intended -to be imported into an existing Grafana instance. It provides a number of useful metrics: - -- **Control Plane Resources**: CPU, memory, and network usage for the Coder deployment, as well as the number of pod restarts. -- **Database**: Rows inserted/updated/deleted/returned, active connections, and transactions per second. Fine-grained `sqlQuerier` metrics are provided for Coder's database as well, broken down my query method. -- **HTTP requests**: Number of HTTP requests per second, broken down by status code and path. -- **Workspace Resources**: CPU, memory, and network usage for all workspaces. -- **Workspace Agents**: Workspace agent network usage, connection latency, and number of active connections. -- **Workspace Traffic**: Statistics related to workspace traffic generation. -- **Internals**: Provisioner job timings, concurrency, workspace builds, and AuthZ duration. - -A subset of these metrics may be useful for a production deployment, but some are only useful -for load testing. - -> **Note:** in particular, `sqlQuerier` metrics produce a large number of time series and may cause -> increased charges in your metrics provider. 
diff --git a/scaletest/scaletest.sh b/scaletest/scaletest.sh deleted file mode 100755 index dd0a6cb4f450c..0000000000000 --- a/scaletest/scaletest.sh +++ /dev/null @@ -1,240 +0,0 @@ -#!/usr/bin/env bash - -[[ -n ${VERBOSE:-} ]] && set -x -set -euo pipefail - -PROJECT_ROOT="$(git rev-parse --show-toplevel)" -# shellcheck source=scripts/lib.sh -source "${PROJECT_ROOT}/scripts/lib.sh" - -DRY_RUN="${DRY_RUN:-0}" -SCALETEST_NAME="${SCALETEST_NAME:-}" -SCALETEST_NUM_WORKSPACES="${SCALETEST_NUM_WORKSPACES:-}" -SCALETEST_SCENARIO="${SCALETEST_SCENARIO:-}" -SCALETEST_PROJECT="${SCALETEST_PROJECT:-}" -SCALETEST_PROMETHEUS_REMOTE_WRITE_USER="${SCALETEST_PROMETHEUS_REMOTE_WRITE_USER:-}" -SCALETEST_PROMETHEUS_REMOTE_WRITE_PASSWORD="${SCALETEST_PROMETHEUS_REMOTE_WRITE_PASSWORD:-}" -SCALETEST_CODER_LICENSE="${SCALETEST_CODER_LICENSE:-}" -SCALETEST_SKIP_CLEANUP="${SCALETEST_SKIP_CLEANUP:-0}" -SCALETEST_CREATE_CONCURRENCY="${SCALETEST_CREATE_CONCURRENCY:-10}" -SCALETEST_TRAFFIC_BYTES_PER_TICK="${SCALETEST_TRAFFIC_BYTES_PER_TICK:-1024}" -SCALETEST_TRAFFIC_TICK_INTERVAL="${SCALETEST_TRAFFIC_TICK_INTERVAL:-10s}" -SCALETEST_DESTROY="${SCALETEST_DESTROY:-0}" - -script_name=$(basename "$0") -args="$(getopt -o "" -l create-concurrency:,destroy,dry-run,help,name:,num-workspaces:,project:,scenario:,skip-cleanup,traffic-bytes-per-tick:,traffic-tick-interval:, -- "$@")" -eval set -- "$args" -while true; do - case "$1" in - --create-concurrency) - SCALETEST_CREATE_CONCURRENCY="$2" - shift 2 - ;; - --destroy) - SCALETEST_DESTROY=1 - shift - ;; - --dry-run) - DRY_RUN=1 - shift - ;; - --help) - echo "Usage: $script_name --name --project --num-workspaces --scenario [--create-concurrency ] [--destroy] [--dry-run] [--skip-cleanup] [--traffic-bytes-per-tick ] [--traffic-tick-interval ]" - exit 1 - ;; - --name) - SCALETEST_NAME="$2" - shift 2 - ;; - --num-workspaces) - SCALETEST_NUM_WORKSPACES="$2" - shift 2 - ;; - --project) - SCALETEST_PROJECT="$2" - shift 2 - ;; - --scenario) - 
SCALETEST_SCENARIO="$2" - shift 2 - ;; - --skip-cleanup) - SCALETEST_SKIP_CLEANUP=1 - shift - ;; - --traffic-bytes-per-tick) - SCALETEST_TRAFFIC_BYTES_PER_TICK="$2" - shift 2 - ;; - --traffic-tick-interval) - SCALETEST_TRAFFIC_TICK_INTERVAL="$2" - shift 2 - ;; - --) - shift - break - ;; - *) - error "Unrecognized option: $1" - ;; - esac -done - -dependencies gcloud kubectl terraform - -if [[ -z "${SCALETEST_NAME}" ]]; then - echo "Must specify --name" - exit 1 -fi - -if [[ -z "${SCALETEST_PROJECT}" ]]; then - echo "Must specify --project" - exit 1 -fi - -if [[ -z "${SCALETEST_NUM_WORKSPACES}" ]]; then - echo "Must specify --num-workspaces" - exit 1 -fi - -if [[ -z "${SCALETEST_SCENARIO}" ]]; then - echo "Must specify --scenario" - exit 1 -fi - -if [[ -z "${SCALETEST_PROMETHEUS_REMOTE_WRITE_USER}" ]] || [[ -z "${SCALETEST_PROMETHEUS_REMOTE_WRITE_PASSWORD}" ]]; then - echo "SCALETEST_PROMETHEUS_REMOTE_WRITE_USER or SCALETEST_PROMETHEUS_REMOTE_WRITE_PASSWORD not specified." - echo "No prometheus metrics will be collected!" - read -p "Continue (y/N)? " -n1 -r - if [[ "${REPLY}" != [yY] ]]; then - exit 1 - fi -fi - -SCALETEST_SCENARIO_VARS="${PROJECT_ROOT}/scaletest/terraform/scenario-${SCALETEST_SCENARIO}.tfvars" -if [[ ! -f "${SCALETEST_SCENARIO_VARS}" ]]; then - echo "Scenario ${SCALETEST_SCENARIO_VARS} not found." - echo "Please create it or choose another scenario:" - find "${PROJECT_ROOT}/scaletest/terraform" -type f -name 'scenario-*.tfvars' - exit 1 -fi - -if [[ "${SCALETEST_SKIP_CLEANUP}" == 1 ]]; then - log "WARNING: you told me to not clean up after myself, so this is now your job!" 
-fi - -CONFIG_DIR="${PROJECT_ROOT}/scaletest/.coderv2" -if [[ -d "${CONFIG_DIR}" ]] && files=$(ls -qAH -- "${CONFIG_DIR}") && [[ -z "$files" ]]; then - echo "Cleaning previous configuration" - maybedryrun "$DRY_RUN" rm -fv "${CONFIG_DIR}/*" -fi -maybedryrun "$DRY_RUN" mkdir -p "${CONFIG_DIR}" - -SCALETEST_SCENARIO_VARS="${PROJECT_ROOT}/scaletest/terraform/scenario-${SCALETEST_SCENARIO}.tfvars" -SCALETEST_SECRETS="${PROJECT_ROOT}/scaletest/terraform/secrets.tfvars" -SCALETEST_SECRETS_TEMPLATE="${PROJECT_ROOT}/scaletest/terraform/secrets.tfvars.tpl" - -log "Writing scaletest secrets to file." -SCALETEST_NAME="${SCALETEST_NAME}" \ - SCALETEST_PROJECT="${SCALETEST_PROJECT}" \ - SCALETEST_PROMETHEUS_REMOTE_WRITE_USER="${SCALETEST_PROMETHEUS_REMOTE_WRITE_USER}" \ - SCALETEST_PROMETHEUS_REMOTE_WRITE_PASSWORD="${SCALETEST_PROMETHEUS_REMOTE_WRITE_PASSWORD}" \ - envsubst <"${SCALETEST_SECRETS_TEMPLATE}" >"${SCALETEST_SECRETS}" - -pushd "${PROJECT_ROOT}/scaletest/terraform" - -echo "Initializing terraform." -maybedryrun "$DRY_RUN" terraform init - -echo "Setting up infrastructure." -maybedryrun "$DRY_RUN" terraform apply --var-file="${SCALETEST_SCENARIO_VARS}" --var-file="${SCALETEST_SECRETS}" --var state=started --auto-approve - -if [[ "${DRY_RUN}" != 1 ]]; then - SCALETEST_CODER_URL=$(<"${CONFIG_DIR}/url") -else - SCALETEST_CODER_URL="http://coder.dryrun.local:3000" -fi -KUBECONFIG="${PROJECT_ROOT}/scaletest/.coderv2/${SCALETEST_NAME}-cluster.kubeconfig" -echo "Waiting for Coder deployment at ${SCALETEST_CODER_URL} to become ready" -max_attempts=10 -for attempt in $(seq 1 $max_attempts); do - maybedryrun "$DRY_RUN" curl --silent --fail --output /dev/null "${SCALETEST_CODER_URL}/api/v2/buildinfo" - curl_status=$? - if [[ $curl_status -eq 0 ]]; then - break - fi - if attempt -eq $max_attempts; then - echo - echo "Coder deployment failed to become ready in time!" 
- exit 1 - fi - echo "Coder deployment not ready yet (${attempt}/${max_attempts}), sleeping 3 seconds" - maybedryrun "$DRY_RUN" sleep 3 -done - -echo "Initializing Coder deployment." -DRY_RUN="$DRY_RUN" "${PROJECT_ROOT}/scaletest/lib/coder_init.sh" "${SCALETEST_CODER_URL}" - -if [[ -n "${SCALETEST_CODER_LICENSE}" ]]; then - echo "Applying Coder Enterprise License" - DRY_RUN="$DRY_RUN" "${PROJECT_ROOT}/scaletest/lib/coder_shim.sh" license add -l "${SCALETEST_CODER_LICENSE}" -fi - -echo "Creating ${SCALETEST_NUM_WORKSPACES} workspaces." -DRY_RUN="$DRY_RUN" "${PROJECT_ROOT}/scaletest/lib/coder_shim.sh" exp scaletest create-workspaces \ - --count "${SCALETEST_NUM_WORKSPACES}" \ - --template=kubernetes \ - --concurrency "${SCALETEST_CREATE_CONCURRENCY}" \ - --no-cleanup - -echo "Sleeping 10 minutes to establish a baseline measurement." -maybedryrun "$DRY_RUN" sleep 600 - -echo "Sending traffic to workspaces" -maybedryrun "$DRY_RUN" "${PROJECT_ROOT}/scaletest/lib/coder_workspacetraffic.sh" \ - --name "${SCALETEST_NAME}" \ - --traffic-bytes-per-tick "${SCALETEST_TRAFFIC_BYTES_PER_TICK}" \ - --traffic-tick-interval "${SCALETEST_TRAFFIC_TICK_INTERVAL}" -maybedryrun "$DRY_RUN" kubectl --kubeconfig="${KUBECONFIG}" -n "coder-${SCALETEST_NAME}" wait pods coder-scaletest-workspace-traffic --for condition=Ready - -echo "Sleeping 15 minutes for traffic generation" -maybedryrun "$DRY_RUN" sleep 900 - -echo "Starting pprof" -maybedryrun "$DRY_RUN" kubectl -n "coder-${SCALETEST_NAME}" port-forward deployment/coder 6061:6060 & -pfpid=$! -maybedryrun "$DRY_RUN" trap "kill $pfpid" EXIT - -echo "Waiting for pprof endpoint to become available" -pprof_attempt_counter=0 -while ! maybedryrun "$DRY_RUN" timeout 1 bash -c "echo > /dev/tcp/localhost/6061"; do - if [[ $pprof_attempt_counter -eq 10 ]]; then - echo - echo "pprof failed to become ready in time!" 
- exit 1 - fi - ((pprof_attempt_counter += 1)) - maybedryrun "$DRY_RUN" sleep 3 -done - -echo "Taking pprof snapshots" -maybedryrun "$DRY_RUN" curl --silent --fail --output "${SCALETEST_NAME}-heap.pprof.gz" http://localhost:6061/debug/pprof/heap -maybedryrun "$DRY_RUN" curl --silent --fail --output "${SCALETEST_NAME}-goroutine.pprof.gz" http://localhost:6061/debug/pprof/goroutine -# No longer need to port-forward -maybedryrun "$DRY_RUN" kill "$pfpid" -maybedryrun "$DRY_RUN" trap - EXIT - -if [[ "${SCALETEST_SKIP_CLEANUP}" == 1 ]]; then - echo "Leaving resources up for you to inspect." - echo "Please don't forget to clean up afterwards:" - echo "cd terraform && terraform destroy --var-file=${SCALETEST_SCENARIO_VARS} --var-file=${SCALETEST_SECRETS} --auto-approve" - exit 0 -fi - -if [[ "${SCALETEST_DESTROY}" == 1 ]]; then - echo "Destroying infrastructure" - maybedryrun "$DRY_RUN" terraform destroy --var-file="${SCALETEST_SCENARIO_VARS}" --var-file="${SCALETEST_SECRETS}" --auto-approve -else - echo "Scaling down infrastructure" - maybedryrun "$DRY_RUN" terraform apply --var-file="${SCALETEST_SCENARIO_VARS}" --var-file="${SCALETEST_SECRETS}" --var state=stopped --auto-approve -fi diff --git a/scaletest/terraform/action/.gitignore b/scaletest/terraform/action/.gitignore deleted file mode 100644 index c45cf41694258..0000000000000 --- a/scaletest/terraform/action/.gitignore +++ /dev/null @@ -1 +0,0 @@ -*.tfvars diff --git a/scaletest/terraform/action/cf_dns.tf b/scaletest/terraform/action/cf_dns.tf deleted file mode 100644 index 126c35c12cc76..0000000000000 --- a/scaletest/terraform/action/cf_dns.tf +++ /dev/null @@ -1,21 +0,0 @@ -data "cloudflare_zone" "domain" { - name = var.cloudflare_domain -} - -resource "cloudflare_record" "coder" { - for_each = local.deployments - zone_id = data.cloudflare_zone.domain.zone_id - name = "${each.value.subdomain}.${var.cloudflare_domain}" - content = google_compute_address.coder[each.key].address - type = "A" - ttl = 3600 -} - 
-resource "cloudflare_record" "coder_wildcard" { - for_each = local.deployments - zone_id = data.cloudflare_zone.domain.id - name = each.value.wildcard_subdomain - content = cloudflare_record.coder[each.key].name - type = "CNAME" - ttl = 3600 -} diff --git a/scaletest/terraform/action/coder_helm_values.tftpl b/scaletest/terraform/action/coder_helm_values.tftpl deleted file mode 100644 index 3fc8d5dfd4226..0000000000000 --- a/scaletest/terraform/action/coder_helm_values.tftpl +++ /dev/null @@ -1,120 +0,0 @@ -coder: - workspaceProxy: ${workspace_proxy} - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: "cloud.google.com/gke-nodepool" - operator: "In" - values: ["${node_pool}"] - podAntiAffinity: - preferredDuringSchedulingIgnoredDuringExecution: - - weight: 1 - podAffinityTerm: - topologyKey: "kubernetes.io/hostname" - labelSelector: - matchExpressions: - - key: "app.kubernetes.io/instance" - operator: "In" - values: ["${release_name}"] - env: - %{~ if workspace_proxy ~} - - name: "CODER_ACCESS_URL" - value: "${access_url}" - - name: "CODER_WILDCARD_ACCESS_URL" - value: "${wildcard_access_url}" - - name: CODER_PRIMARY_ACCESS_URL - value: "${primary_url}" - - name: CODER_PROXY_SESSION_TOKEN - valueFrom: - secretKeyRef: - key: token - name: "${proxy_token}" - %{~ endif ~} - %{~ if provisionerd ~} - - name: "CODER_URL" - value: "${access_url}" - - name: "CODER_PROVISIONERD_TAGS" - value: "scope=organization,deployment=${deployment}" - - name: "CODER_PROVISIONER_DAEMON_NAME" - valueFrom: - fieldRef: - fieldPath: metadata.name - - name: "CODER_CONFIG_DIR" - value: "/tmp/config" - %{~ endif ~} - %{~ if !workspace_proxy && !provisionerd ~} - - name: "CODER_ACCESS_URL" - value: "${access_url}" - - name: "CODER_WILDCARD_ACCESS_URL" - value: "${wildcard_access_url}" - - name: "CODER_PG_CONNECTION_URL" - valueFrom: - secretKeyRef: - name: "${db_secret}" - key: url - - name: 
"CODER_PROVISIONER_DAEMONS" - value: "0" - - name: CODER_PROVISIONER_DAEMON_PSK - valueFrom: - secretKeyRef: - key: psk - name: "${provisionerd_psk}" - - name: "CODER_PROMETHEUS_COLLECT_AGENT_STATS" - value: "true" - - name: "CODER_PROMETHEUS_COLLECT_DB_METRICS" - value: "true" - - name: "CODER_PPROF_ENABLE" - value: "true" - %{~ endif ~} - - name: "CODER_CACHE_DIRECTORY" - value: "/tmp/coder" - - name: "CODER_TELEMETRY_ENABLE" - value: "false" - - name: "CODER_LOGGING_HUMAN" - value: "/dev/null" - - name: "CODER_LOGGING_STACKDRIVER" - value: "/dev/stderr" - - name: "CODER_PROMETHEUS_ENABLE" - value: "true" - - name: "CODER_VERBOSE" - value: "true" - - name: "CODER_EXPERIMENTS" - value: "${experiments}" - - name: "CODER_DANGEROUS_DISABLE_RATE_LIMITS" - value: "true" - - name: "CODER_DANGEROUS_ALLOW_PATH_APP_SITE_OWNER_ACCESS" - value: "true" - image: - repo: ${image_repo} - tag: ${image_tag} - replicaCount: "${replicas}" - resources: - requests: - cpu: "${cpu_request}" - memory: "${mem_request}" - limits: - cpu: "${cpu_limit}" - memory: "${mem_limit}" - securityContext: - readOnlyRootFilesystem: true - %{~ if !provisionerd ~} - service: - enable: true - sessionAffinity: None - loadBalancerIP: "${ip_address}" - %{~ endif ~} - volumeMounts: - - mountPath: "/tmp" - name: cache - readOnly: false - volumes: - - emptyDir: - sizeLimit: 1024Mi - name: cache - %{~ if !provisionerd ~} - tls: - secretNames: - - "${tls_secret_name}" - %{~ endif ~} diff --git a/scaletest/terraform/action/coder_proxies.tf b/scaletest/terraform/action/coder_proxies.tf deleted file mode 100644 index 6af3ef82bb392..0000000000000 --- a/scaletest/terraform/action/coder_proxies.tf +++ /dev/null @@ -1,102 +0,0 @@ -data "http" "coder_healthy" { - url = local.deployments.primary.url - // Wait up to 5 minutes for DNS to propagate - retry { - attempts = 30 - min_delay_ms = 10000 - } - - lifecycle { - postcondition { - condition = self.status_code == 200 - error_message = "${self.url} returned an unhealthy 
status code" - } - } - - depends_on = [helm_release.coder_primary, cloudflare_record.coder["primary"]] -} - -resource "null_resource" "api_key" { - provisioner "local-exec" { - interpreter = ["/bin/bash", "-c"] - command = < ${path.module}/.coderv2/session_token - -api_key=$(curl '${local.deployments.primary.url}/api/v2/users/me/keys/tokens' \ - -H "Coder-Session-Token: $${session_token}" \ - --data-raw '{"token_name":"terraform","scope":"all"}' \ - --insecure --silent | jq -r .key) - -echo -n $${api_key} > ${path.module}/.coderv2/api_key -EOF - } - - depends_on = [data.http.coder_healthy] -} - -data "local_file" "api_key" { - filename = "${path.module}/.coderv2/api_key" - depends_on = [null_resource.api_key] -} - -resource "null_resource" "license" { - provisioner "local-exec" { - interpreter = ["/bin/bash", "-c"] - command = < ${path.module}/.coderv2/europe_proxy_token -EOF - } - - depends_on = [null_resource.license] -} - -data "local_file" "europe_proxy_token" { - filename = "${path.module}/.coderv2/europe_proxy_token" - depends_on = [null_resource.europe_proxy_token] -} - -resource "null_resource" "asia_proxy_token" { - provisioner "local-exec" { - interpreter = ["/bin/bash", "-c"] - command = < ${path.module}/.coderv2/asia_proxy_token -EOF - } - - depends_on = [null_resource.license] -} - -data "local_file" "asia_proxy_token" { - filename = "${path.module}/.coderv2/asia_proxy_token" - depends_on = [null_resource.asia_proxy_token] -} diff --git a/scaletest/terraform/action/coder_templates.tf b/scaletest/terraform/action/coder_templates.tf deleted file mode 100644 index d27c25844b91e..0000000000000 --- a/scaletest/terraform/action/coder_templates.tf +++ /dev/null @@ -1,340 +0,0 @@ -resource "local_file" "kubernetes_template" { - filename = "${path.module}/.coderv2/templates/kubernetes/main.tf" - content = < Date: Thu, 28 Aug 2025 12:37:13 +0100 Subject: [PATCH 2/6] chore(coderd/database/dbauthz): refactor TestPing, TestNew, TestInTX to use dbmock (#19604) Part 
of https://github.com/coder/internal/issues/869 --- coderd/database/dbauthz/dbauthz_test.go | 71 +++++++++++++------------ 1 file changed, 37 insertions(+), 34 deletions(-) diff --git a/coderd/database/dbauthz/dbauthz_test.go b/coderd/database/dbauthz/dbauthz_test.go index cda914cc47617..7321f9dfbd6e9 100644 --- a/coderd/database/dbauthz/dbauthz_test.go +++ b/coderd/database/dbauthz/dbauthz_test.go @@ -73,7 +73,9 @@ func TestAsNoActor(t *testing.T) { func TestPing(t *testing.T) { t.Parallel() - db, _ := dbtestutil.NewDB(t) + db := dbmock.NewMockStore(gomock.NewController(t)) + db.EXPECT().Wrappers().Times(1).Return([]string{}) + db.EXPECT().Ping(gomock.Any()).Times(1).Return(time.Second, nil) q := dbauthz.New(db, &coderdtest.RecordingAuthorizer{}, slog.Make(), coderdtest.AccessControlStorePointer()) _, err := q.Ping(context.Background()) require.NoError(t, err, "must not error") @@ -83,34 +85,39 @@ func TestPing(t *testing.T) { func TestInTX(t *testing.T) { t.Parallel() - db, _ := dbtestutil.NewDB(t) + var ( + ctrl = gomock.NewController(t) + db = dbmock.NewMockStore(ctrl) + mTx = dbmock.NewMockStore(ctrl) // to record the 'in tx' calls + faker = gofakeit.New(0) + w = testutil.Fake(t, faker, database.Workspace{}) + actor = rbac.Subject{ + ID: uuid.NewString(), + Roles: rbac.RoleIdentifiers{rbac.RoleOwner()}, + Groups: []string{}, + Scope: rbac.ScopeAll, + } + ctx = dbauthz.As(context.Background(), actor) + ) + + db.EXPECT().Wrappers().Times(1).Return([]string{}) // called by dbauthz.New q := dbauthz.New(db, &coderdtest.RecordingAuthorizer{ Wrapped: (&coderdtest.FakeAuthorizer{}).AlwaysReturn(xerrors.New("custom error")), }, slog.Make(), coderdtest.AccessControlStorePointer()) - actor := rbac.Subject{ - ID: uuid.NewString(), - Roles: rbac.RoleIdentifiers{rbac.RoleOwner()}, - Groups: []string{}, - Scope: rbac.ScopeAll, - } - u := dbgen.User(t, db, database.User{}) - o := dbgen.Organization(t, db, database.Organization{}) - tpl := dbgen.Template(t, db, 
database.Template{ - CreatedBy: u.ID, - OrganizationID: o.ID, - }) - w := dbgen.Workspace(t, db, database.WorkspaceTable{ - OwnerID: u.ID, - TemplateID: tpl.ID, - OrganizationID: o.ID, - }) - ctx := dbauthz.As(context.Background(), actor) + + db.EXPECT().InTx(gomock.Any(), gomock.Any()).Times(1).DoAndReturn( + func(f func(database.Store) error, _ *database.TxOptions) error { + return f(mTx) + }, + ) + mTx.EXPECT().Wrappers().Times(1).Return([]string{}) + mTx.EXPECT().GetWorkspaceByID(gomock.Any(), gomock.Any()).Times(1).Return(w, nil) err := q.InTx(func(tx database.Store) error { // The inner tx should use the parent's authz _, err := tx.GetWorkspaceByID(ctx, w.ID) return err }, nil) - require.Error(t, err, "must error") + require.ErrorContains(t, err, "custom error", "must be our custom error") require.ErrorAs(t, err, &dbauthz.NotAuthorizedError{}, "must be an authorized error") require.True(t, dbauthz.IsNotAuthorizedError(err), "must be an authorized error") } @@ -120,24 +127,18 @@ func TestNew(t *testing.T) { t.Parallel() var ( - db, _ = dbtestutil.NewDB(t) + ctrl = gomock.NewController(t) + db = dbmock.NewMockStore(ctrl) + faker = gofakeit.New(0) rec = &coderdtest.RecordingAuthorizer{ Wrapped: &coderdtest.FakeAuthorizer{}, } subj = rbac.Subject{} ctx = dbauthz.As(context.Background(), rbac.Subject{}) ) - u := dbgen.User(t, db, database.User{}) - org := dbgen.Organization(t, db, database.Organization{}) - tpl := dbgen.Template(t, db, database.Template{ - OrganizationID: org.ID, - CreatedBy: u.ID, - }) - exp := dbgen.Workspace(t, db, database.WorkspaceTable{ - OwnerID: u.ID, - OrganizationID: org.ID, - TemplateID: tpl.ID, - }) + db.EXPECT().Wrappers().Times(1).Return([]string{}).Times(2) // two calls to New() + exp := testutil.Fake(t, faker, database.Workspace{}) + db.EXPECT().GetWorkspaceByID(gomock.Any(), exp.ID).Times(1).Return(exp, nil) // Double wrap should not cause an actual double wrap. So only 1 rbac call // should be made. 
az := dbauthz.New(db, rec, slog.Make(), coderdtest.AccessControlStorePointer()) @@ -145,7 +146,7 @@ func TestNew(t *testing.T) { w, err := az.GetWorkspaceByID(ctx, exp.ID) require.NoError(t, err, "must not error") - require.Equal(t, exp, w.WorkspaceTable(), "must be equal") + require.Equal(t, exp, w, "must be equal") rec.AssertActor(t, subj, rec.Pair(policy.ActionRead, exp)) require.NoError(t, rec.AllAsserted(), "should only be 1 rbac call") @@ -154,6 +155,8 @@ func TestNew(t *testing.T) { // TestDBAuthzRecursive is a simple test to search for infinite recursion // bugs. It isn't perfect, and only catches a subset of the possible bugs // as only the first db call will be made. But it is better than nothing. +// This can be removed when all tests in this package are migrated to +// dbmock as it will immediately detect recursive calls. func TestDBAuthzRecursive(t *testing.T) { t.Parallel() db, _ := dbtestutil.NewDB(t) From 347ab5b3480db6c698292ad9773f45cd2be5408f Mon Sep 17 00:00:00 2001 From: Danielle Maywood Date: Thu, 28 Aug 2025 12:58:02 +0100 Subject: [PATCH 3/6] fix(coderd/taskname): ensure generated name is within 32 byte limit (#19612) The previous logic verified a generated name was valid, _and then appended a suffix to it_. This was flawed as it would allow a 32 character name, and then append an extra 5 characters to it. Instead we now append the suffix _and then_ verify it is valid. 
---
 coderd/taskname/taskname.go | 22 ++++++++++++++--------
 1 file changed, 14 insertions(+), 8 deletions(-)

diff --git a/coderd/taskname/taskname.go b/coderd/taskname/taskname.go
index dff57dfd0c7f5..734c23eb3dd76 100644
--- a/coderd/taskname/taskname.go
+++ b/coderd/taskname/taskname.go
@@ -24,7 +24,7 @@ const (
 Requirements:
 - Only lowercase letters, numbers, and hyphens
 - Start with "task-"
-- Maximum 28 characters total
+- Maximum 27 characters total
 - Descriptive of the main task
 
 Examples:
@@ -145,17 +145,23 @@ func Generate(ctx context.Context, prompt string, opts ...Option) (string, error
 		return "", ErrNoNameGenerated
 	}
 
-	generatedName := acc.Messages()[0].Content
-
-	if err := codersdk.NameValid(generatedName); err != nil {
-		return "", xerrors.Errorf("generated name %v not valid: %w", generatedName, err)
+	taskName := acc.Messages()[0].Content
+	if taskName == "task-unnamed" {
+		return "", ErrNoNameGenerated
 	}
 
-	if generatedName == "task-unnamed" {
-		return "", ErrNoNameGenerated
+	// We append a suffix to the end of the task name to reduce
+	// the chance of collisions. We truncate the task name
+	// to a maximum of 27 bytes, so that when we append the
+	// 5 byte suffix (`-` and 4 byte hex slug), it should
+	// remain within the 32 byte workspace name limit.
+	taskName = taskName[:min(len(taskName), 27)]
+	taskName = fmt.Sprintf("%s-%s", taskName, generateSuffix())
+	if err := codersdk.NameValid(taskName); err != nil {
+		return "", xerrors.Errorf("generated name %v not valid: %w", taskName, err)
 	}
 
-	return fmt.Sprintf("%s-%s", generatedName, generateSuffix()), nil
+	return taskName, nil
 }
 
 func anthropicDataStream(ctx context.Context, client anthropic.Client, model anthropic.Model, input []aisdk.Message) (aisdk.DataStream, error) {

From 8d6a3223448dcb8b7a0646592cffaa46364e959a Mon Sep 17 00:00:00 2001
From: Cian Johnston
Date: Thu, 28 Aug 2025 12:58:36 +0100
Subject: [PATCH 4/6] chore(docs): document automatic task naming (#19614)

Updates our experimental AI docs on how to automatically generate task
names.

---------

Co-authored-by: Ben Potter
---
 docs/ai-coder/tasks.md | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/docs/ai-coder/tasks.md b/docs/ai-coder/tasks.md
index 43c4becdf8be1..ef47a6b3fb874 100644
--- a/docs/ai-coder/tasks.md
+++ b/docs/ai-coder/tasks.md
@@ -82,6 +82,10 @@ If a workspace app has the special `"preview"` slug, a navbar will appear above
 
 We plan to introduce more customization options in future releases.
 
+## Automatically name your tasks
+
+Coder can automatically generate a name for your tasks if you set the `ANTHROPIC_API_KEY` environment variable on the Coder server. Otherwise, tasks will be given randomly generated names.
+
 ## Opting out of Tasks
 
 If you tried Tasks and decided you don't want to use it, you can hide the Tasks tab by starting `coder server` with the `CODER_HIDE_AI_TASKS=true` environment variable or the `--hide-ai-tasks` flag.
From 9fd33a765307b6e2ab0a0da5c59d38ad06d897df Mon Sep 17 00:00:00 2001 From: Kacper Sawicki Date: Thu, 28 Aug 2025 14:51:43 +0200 Subject: [PATCH 5/6] chore(docs): set external workspaces as premium feature in manifest.json (#19615) --- docs/manifest.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/manifest.json b/docs/manifest.json index 4d2a62c994c88..d2cd11ace699b 100644 --- a/docs/manifest.json +++ b/docs/manifest.json @@ -542,7 +542,7 @@ "title": "External Workspaces", "description": "Learn how to manage external workspaces", "path": "./admin/templates/managing-templates/external-workspaces.md", - "state": ["early access"] + "state": ["premium", "early access"] } ] }, From 0ab345ca845a51deaf1201a97983219d2a467351 Mon Sep 17 00:00:00 2001 From: Susana Ferreira Date: Thu, 28 Aug 2025 15:00:26 +0100 Subject: [PATCH 6/6] feat: add prebuild timing metrics to Prometheus (#19503) ## Description This PR introduces one counter and two histograms related to workspace creation and claiming. The goal is to provide clearer observability into how workspaces are created (regular vs prebuild) and the time cost of those operations. ### `coderd_workspace_creation_total` * Metric type: Counter * Name: `coderd_workspace_creation_total` * Labels: `organization_name`, `template_name`, `preset_name` This counter tracks whether a regular workspace (not created from a prebuild pool) was created using a preset or not. Currently, we already expose `coderd_prebuilt_workspaces_claimed_total` for claimed prebuilt workspaces, but we lack a comparable metric for regular workspace creations. This metric fills that gap, making it possible to compare regular creations against claims. Implementation notes: * Exposed as a `coderd_` metric, consistent with other workspace-related metrics (e.g. `coderd_api_workspace_latest_build`: https://github.com/coder/coder/blob/main/coderd/prometheusmetrics/prometheusmetrics.go#L149). 
* Every `defaultRefreshRate` (1 minute), DB query `GetRegularWorkspaceCreateMetrics` is executed to fetch all regular workspaces (not created from a prebuild pool).
* The counter is updated with the total from all time (not just since metric introduction). This differs from the histograms below, which only accumulate from their introduction forward.

### `coderd_workspace_creation_duration_seconds` & `coderd_prebuilt_workspace_claim_duration_seconds`

* Metric types: Histogram
* Names:
  * `coderd_workspace_creation_duration_seconds`
    * Labels: `organization_name`, `template_name`, `preset_name`, `type` (`regular`, `prebuild`)
  * `coderd_prebuilt_workspace_claim_duration_seconds`
    * Labels: `organization_name`, `template_name`, `preset_name`

We already have `coderd_provisionerd_workspace_build_timings_seconds`, which tracks build run times for all workspace builds handled by the provisioner daemon. However, in the context of this issue, we are only interested in creation and claim build times, not all transitions; additionally, this metric does not include `preset_name`, and adding it there would significantly increase cardinality. Therefore, separate more focused metrics are introduced here:

* `coderd_workspace_creation_duration_seconds`: Build time to create a workspace (either a regular workspace or the build into a prebuild pool, for prebuild initial provisioning build).
* `coderd_prebuilt_workspace_claim_duration_seconds`: Time to claim a prebuilt workspace from the pool.

The reason for two separate histograms is that:

* Creation (regular or prebuild): provisioning builds with similar time magnitude, generally expected to take longer than a claim operation.
* Claim: expected to be a much faster provisioning build.

#### Native histogram usage

Provisioning times vary widely between projects. Using static buckets risks unbalanced or poorly informative histograms.
To address this, these metrics use [Prometheus native histograms](https://prometheus.io/docs/specs/native_histograms/): * First introduced in Prometheus v2.40.0 * Recommended stable usage from v2.45+ * Requires Go client `prometheus/client_golang` v1.15.0+ * Experimental and must be explicitly enabled on the server (`--enable-feature=native-histograms`) For compatibility, we also retain a classic bucket definition (aligned with the existing provisioner metric: https://github.com/coder/coder/blob/main/provisionerd/provisionerd.go#L182-L189). * If native histograms are enabled, Prometheus ingests the high-resolution histogram. * If not, it falls back to the predefined buckets. Implementation notes: * Unlike the counter, these histograms are updated in real-time at workspace build job completion. * They reflect data only from the point of introduction forward (no historical backfill). ## Relates to Closes: https://github.com/coder/coder/issues/19528 Native histograms tested in observability stack: https://github.com/coder/observability/pull/50 --- cli/server.go | 18 +- coderd/coderd.go | 3 + coderd/coderdtest/coderdtest.go | 3 + coderd/database/dbauthz/dbauthz.go | 7 + coderd/database/dbauthz/dbauthz_test.go | 4 + coderd/database/dbmetrics/querymetrics.go | 7 + coderd/database/dbmock/dbmock.go | 15 ++ coderd/database/querier.go | 3 + coderd/database/queries.sql.go | 71 ++++++- coderd/database/queries/prebuilds.sql | 2 +- coderd/database/queries/workspaces.sql | 33 ++++ coderd/prometheusmetrics/prometheusmetrics.go | 33 ++++ .../prometheusmetrics_test.go | 102 ++++++++++ coderd/provisionerdserver/metrics.go | 177 ++++++++++++++++++ .../provisionerdserver/provisionerdserver.go | 48 +++++ .../provisionerdserver_test.go | 1 + docs/admin/integrations/prometheus.md | 19 ++ .../prebuilt-workspaces.md | 1 + enterprise/coderd/provisionerdaemons.go | 1 + enterprise/coderd/workspaces_test.go | 128 +++++++++++++ scripts/metricsdocgen/metrics | 31 +++ 21 files changed, 699 
insertions(+), 8 deletions(-) create mode 100644 coderd/provisionerdserver/metrics.go diff --git a/cli/server.go b/cli/server.go index f9e744761b22e..5018007e2b4e8 100644 --- a/cli/server.go +++ b/cli/server.go @@ -62,12 +62,6 @@ import ( "github.com/coder/serpent" "github.com/coder/wgtunnel/tunnelsdk" - "github.com/coder/coder/v2/coderd/entitlements" - "github.com/coder/coder/v2/coderd/notifications/reports" - "github.com/coder/coder/v2/coderd/runtimeconfig" - "github.com/coder/coder/v2/coderd/webpush" - "github.com/coder/coder/v2/codersdk/drpcsdk" - "github.com/coder/coder/v2/buildinfo" "github.com/coder/coder/v2/cli/clilog" "github.com/coder/coder/v2/cli/cliui" @@ -83,15 +77,19 @@ import ( "github.com/coder/coder/v2/coderd/database/migrations" "github.com/coder/coder/v2/coderd/database/pubsub" "github.com/coder/coder/v2/coderd/devtunnel" + "github.com/coder/coder/v2/coderd/entitlements" "github.com/coder/coder/v2/coderd/externalauth" "github.com/coder/coder/v2/coderd/gitsshkey" "github.com/coder/coder/v2/coderd/httpmw" "github.com/coder/coder/v2/coderd/jobreaper" "github.com/coder/coder/v2/coderd/notifications" + "github.com/coder/coder/v2/coderd/notifications/reports" "github.com/coder/coder/v2/coderd/oauthpki" "github.com/coder/coder/v2/coderd/prometheusmetrics" "github.com/coder/coder/v2/coderd/prometheusmetrics/insights" "github.com/coder/coder/v2/coderd/promoauth" + "github.com/coder/coder/v2/coderd/provisionerdserver" + "github.com/coder/coder/v2/coderd/runtimeconfig" "github.com/coder/coder/v2/coderd/schedule" "github.com/coder/coder/v2/coderd/telemetry" "github.com/coder/coder/v2/coderd/tracing" @@ -99,9 +97,11 @@ import ( "github.com/coder/coder/v2/coderd/util/ptr" "github.com/coder/coder/v2/coderd/util/slice" stringutil "github.com/coder/coder/v2/coderd/util/strings" + "github.com/coder/coder/v2/coderd/webpush" "github.com/coder/coder/v2/coderd/workspaceapps/appurl" "github.com/coder/coder/v2/coderd/workspacestats" "github.com/coder/coder/v2/codersdk" 
+ "github.com/coder/coder/v2/codersdk/drpcsdk" "github.com/coder/coder/v2/cryptorand" "github.com/coder/coder/v2/provisioner/echo" "github.com/coder/coder/v2/provisioner/terraform" @@ -280,6 +280,12 @@ func enablePrometheus( } } + provisionerdserverMetrics := provisionerdserver.NewMetrics(logger) + if err := provisionerdserverMetrics.Register(options.PrometheusRegistry); err != nil { + return nil, xerrors.Errorf("failed to register provisionerd_server metrics: %w", err) + } + options.ProvisionerdServerMetrics = provisionerdserverMetrics + //nolint:revive return ServeHandler( ctx, logger, promhttp.InstrumentMetricHandler( diff --git a/coderd/coderd.go b/coderd/coderd.go index 724952bde7bb9..053880ce31b89 100644 --- a/coderd/coderd.go +++ b/coderd/coderd.go @@ -241,6 +241,8 @@ type Options struct { UpdateAgentMetrics func(ctx context.Context, labels prometheusmetrics.AgentMetricLabels, metrics []*agentproto.Stats_Metric) StatsBatcher workspacestats.Batcher + ProvisionerdServerMetrics *provisionerdserver.Metrics + // WorkspaceAppAuditSessionTimeout allows changing the timeout for audit // sessions. Raising or lowering this value will directly affect the write // load of the audit log table. This is used for testing. Default 1 hour. @@ -1930,6 +1932,7 @@ func (api *API) CreateInMemoryTaggedProvisionerDaemon(dialCtx context.Context, n }, api.NotificationsEnqueuer, &api.PrebuildsReconciler, + api.ProvisionerdServerMetrics, ) if err != nil { return nil, err diff --git a/coderd/coderdtest/coderdtest.go b/coderd/coderdtest/coderdtest.go index 34ba84a85e33a..f773053c3a56c 100644 --- a/coderd/coderdtest/coderdtest.go +++ b/coderd/coderdtest/coderdtest.go @@ -184,6 +184,8 @@ type Options struct { OIDCConvertKeyCache cryptokeys.SigningKeycache Clock quartz.Clock TelemetryReporter telemetry.Reporter + + ProvisionerdServerMetrics *provisionerdserver.Metrics } // New constructs a codersdk client connected to an in-memory API instance. 
@@ -604,6 +606,7 @@ func NewOptions(t testing.TB, options *Options) (func(http.Handler), context.Can Clock: options.Clock, AppEncryptionKeyCache: options.APIKeyEncryptionCache, OIDCConvertKeyCache: options.OIDCConvertKeyCache, + ProvisionerdServerMetrics: options.ProvisionerdServerMetrics, } } diff --git a/coderd/database/dbauthz/dbauthz.go b/coderd/database/dbauthz/dbauthz.go index d1363c974214f..53c58a5de15a7 100644 --- a/coderd/database/dbauthz/dbauthz.go +++ b/coderd/database/dbauthz/dbauthz.go @@ -2699,6 +2699,13 @@ func (q *querier) GetQuotaConsumedForUser(ctx context.Context, params database.G return q.db.GetQuotaConsumedForUser(ctx, params) } +func (q *querier) GetRegularWorkspaceCreateMetrics(ctx context.Context) ([]database.GetRegularWorkspaceCreateMetricsRow, error) { + if err := q.authorizeContext(ctx, policy.ActionRead, rbac.ResourceWorkspace.All()); err != nil { + return nil, err + } + return q.db.GetRegularWorkspaceCreateMetrics(ctx) +} + func (q *querier) GetReplicaByID(ctx context.Context, id uuid.UUID) (database.Replica, error) { if err := q.authorizeContext(ctx, policy.ActionRead, rbac.ResourceSystem); err != nil { return database.Replica{}, err diff --git a/coderd/database/dbauthz/dbauthz_test.go b/coderd/database/dbauthz/dbauthz_test.go index 7321f9dfbd6e9..68bed8f2ef5e9 100644 --- a/coderd/database/dbauthz/dbauthz_test.go +++ b/coderd/database/dbauthz/dbauthz_test.go @@ -2177,6 +2177,10 @@ func (s *MethodTestSuite) TestWorkspace() { dbm.EXPECT().GetWorkspaceAgentDevcontainersByAgentID(gomock.Any(), agt.ID).Return([]database.WorkspaceAgentDevcontainer{d}, nil).AnyTimes() check.Args(agt.ID).Asserts(w, policy.ActionRead).Returns([]database.WorkspaceAgentDevcontainer{d}) })) + s.Run("GetRegularWorkspaceCreateMetrics", s.Subtest(func(_ database.Store, check *expects) { + check.Args(). 
+ Asserts(rbac.ResourceWorkspace.All(), policy.ActionRead) + })) } func (s *MethodTestSuite) TestWorkspacePortSharing() { diff --git a/coderd/database/dbmetrics/querymetrics.go b/coderd/database/dbmetrics/querymetrics.go index 4b5e953d771dd..3f729acdccf23 100644 --- a/coderd/database/dbmetrics/querymetrics.go +++ b/coderd/database/dbmetrics/querymetrics.go @@ -1356,6 +1356,13 @@ func (m queryMetricsStore) GetQuotaConsumedForUser(ctx context.Context, ownerID return consumed, err } +func (m queryMetricsStore) GetRegularWorkspaceCreateMetrics(ctx context.Context) ([]database.GetRegularWorkspaceCreateMetricsRow, error) { + start := time.Now() + r0, r1 := m.s.GetRegularWorkspaceCreateMetrics(ctx) + m.queryLatencies.WithLabelValues("GetRegularWorkspaceCreateMetrics").Observe(time.Since(start).Seconds()) + return r0, r1 +} + func (m queryMetricsStore) GetReplicaByID(ctx context.Context, id uuid.UUID) (database.Replica, error) { start := time.Now() replica, err := m.s.GetReplicaByID(ctx, id) diff --git a/coderd/database/dbmock/dbmock.go b/coderd/database/dbmock/dbmock.go index 02415d6cb8ea4..4f01933baf42b 100644 --- a/coderd/database/dbmock/dbmock.go +++ b/coderd/database/dbmock/dbmock.go @@ -2851,6 +2851,21 @@ func (mr *MockStoreMockRecorder) GetQuotaConsumedForUser(ctx, arg any) *gomock.C return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetQuotaConsumedForUser", reflect.TypeOf((*MockStore)(nil).GetQuotaConsumedForUser), ctx, arg) } +// GetRegularWorkspaceCreateMetrics mocks base method. +func (m *MockStore) GetRegularWorkspaceCreateMetrics(ctx context.Context) ([]database.GetRegularWorkspaceCreateMetricsRow, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "GetRegularWorkspaceCreateMetrics", ctx) + ret0, _ := ret[0].([]database.GetRegularWorkspaceCreateMetricsRow) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// GetRegularWorkspaceCreateMetrics indicates an expected call of GetRegularWorkspaceCreateMetrics. 
+func (mr *MockStoreMockRecorder) GetRegularWorkspaceCreateMetrics(ctx any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetRegularWorkspaceCreateMetrics", reflect.TypeOf((*MockStore)(nil).GetRegularWorkspaceCreateMetrics), ctx) +} + // GetReplicaByID mocks base method. func (m *MockStore) GetReplicaByID(ctx context.Context, id uuid.UUID) (database.Replica, error) { m.ctrl.T.Helper() diff --git a/coderd/database/querier.go b/coderd/database/querier.go index 28ed7609c53d6..6e955b82b0bce 100644 --- a/coderd/database/querier.go +++ b/coderd/database/querier.go @@ -306,6 +306,9 @@ type sqlcQuerier interface { GetProvisionerLogsAfterID(ctx context.Context, arg GetProvisionerLogsAfterIDParams) ([]ProvisionerJobLog, error) GetQuotaAllowanceForUser(ctx context.Context, arg GetQuotaAllowanceForUserParams) (int64, error) GetQuotaConsumedForUser(ctx context.Context, arg GetQuotaConsumedForUserParams) (int64, error) + // Count regular workspaces: only those whose first successful 'start' build + // was not initiated by the prebuild system user. 
+ GetRegularWorkspaceCreateMetrics(ctx context.Context) ([]GetRegularWorkspaceCreateMetricsRow, error) GetReplicaByID(ctx context.Context, id uuid.UUID) (Replica, error) GetReplicasUpdatedAfter(ctx context.Context, updatedAt time.Time) ([]Replica, error) GetRunningPrebuiltWorkspaces(ctx context.Context) ([]GetRunningPrebuiltWorkspacesRow, error) diff --git a/coderd/database/queries.sql.go b/coderd/database/queries.sql.go index d527d90887093..d5495c4df5a8c 100644 --- a/coderd/database/queries.sql.go +++ b/coderd/database/queries.sql.go @@ -7309,7 +7309,7 @@ const getPrebuildMetrics = `-- name: GetPrebuildMetrics :many SELECT t.name as template_name, tvp.name as preset_name, - o.name as organization_name, + o.name as organization_name, COUNT(*) as created_count, COUNT(*) FILTER (WHERE pj.job_status = 'failed'::provisioner_job_status) as failed_count, COUNT(*) FILTER ( @@ -20131,6 +20131,75 @@ func (q *sqlQuerier) GetDeploymentWorkspaceStats(ctx context.Context) (GetDeploy return i, err } +const getRegularWorkspaceCreateMetrics = `-- name: GetRegularWorkspaceCreateMetrics :many +WITH first_success_build AS ( + -- Earliest successful 'start' build per workspace + SELECT DISTINCT ON (wb.workspace_id) + wb.workspace_id, + wb.template_version_preset_id, + wb.initiator_id + FROM workspace_builds wb + JOIN provisioner_jobs pj ON pj.id = wb.job_id + WHERE + wb.transition = 'start'::workspace_transition + AND pj.job_status = 'succeeded'::provisioner_job_status + ORDER BY wb.workspace_id, wb.build_number, wb.id +) +SELECT + t.name AS template_name, + COALESCE(tvp.name, '') AS preset_name, + o.name AS organization_name, + COUNT(*) AS created_count +FROM first_success_build fsb + JOIN workspaces w ON w.id = fsb.workspace_id + JOIN templates t ON t.id = w.template_id + LEFT JOIN template_version_presets tvp ON tvp.id = fsb.template_version_preset_id + JOIN organizations o ON o.id = w.organization_id +WHERE + NOT t.deleted + -- Exclude workspaces whose first successful start was 
the prebuilds system user + AND fsb.initiator_id != 'c42fdf75-3097-471c-8c33-fb52454d81c0'::uuid +GROUP BY t.name, COALESCE(tvp.name, ''), o.name +ORDER BY t.name, preset_name, o.name +` + +type GetRegularWorkspaceCreateMetricsRow struct { + TemplateName string `db:"template_name" json:"template_name"` + PresetName string `db:"preset_name" json:"preset_name"` + OrganizationName string `db:"organization_name" json:"organization_name"` + CreatedCount int64 `db:"created_count" json:"created_count"` +} + +// Count regular workspaces: only those whose first successful 'start' build +// was not initiated by the prebuild system user. +func (q *sqlQuerier) GetRegularWorkspaceCreateMetrics(ctx context.Context) ([]GetRegularWorkspaceCreateMetricsRow, error) { + rows, err := q.db.QueryContext(ctx, getRegularWorkspaceCreateMetrics) + if err != nil { + return nil, err + } + defer rows.Close() + var items []GetRegularWorkspaceCreateMetricsRow + for rows.Next() { + var i GetRegularWorkspaceCreateMetricsRow + if err := rows.Scan( + &i.TemplateName, + &i.PresetName, + &i.OrganizationName, + &i.CreatedCount, + ); err != nil { + return nil, err + } + items = append(items, i) + } + if err := rows.Close(); err != nil { + return nil, err + } + if err := rows.Err(); err != nil { + return nil, err + } + return items, nil +} + const getWorkspaceACLByID = `-- name: GetWorkspaceACLByID :one SELECT group_acl as groups, diff --git a/coderd/database/queries/prebuilds.sql b/coderd/database/queries/prebuilds.sql index 8654453554e8c..2ad7f41d41fea 100644 --- a/coderd/database/queries/prebuilds.sql +++ b/coderd/database/queries/prebuilds.sql @@ -230,7 +230,7 @@ HAVING COUNT(*) = @hard_limit::bigint; SELECT t.name as template_name, tvp.name as preset_name, - o.name as organization_name, + o.name as organization_name, COUNT(*) as created_count, COUNT(*) FILTER (WHERE pj.job_status = 'failed'::provisioner_job_status) as failed_count, COUNT(*) FILTER ( diff --git 
a/coderd/database/queries/workspaces.sql b/coderd/database/queries/workspaces.sql index 802bded5b836b..80d8c7b920d74 100644 --- a/coderd/database/queries/workspaces.sql +++ b/coderd/database/queries/workspaces.sql @@ -923,3 +923,36 @@ SET user_acl = @user_acl WHERE id = @id; + +-- name: GetRegularWorkspaceCreateMetrics :many +-- Count regular workspaces: only those whose first successful 'start' build +-- was not initiated by the prebuild system user. +WITH first_success_build AS ( + -- Earliest successful 'start' build per workspace + SELECT DISTINCT ON (wb.workspace_id) + wb.workspace_id, + wb.template_version_preset_id, + wb.initiator_id + FROM workspace_builds wb + JOIN provisioner_jobs pj ON pj.id = wb.job_id + WHERE + wb.transition = 'start'::workspace_transition + AND pj.job_status = 'succeeded'::provisioner_job_status + ORDER BY wb.workspace_id, wb.build_number, wb.id +) +SELECT + t.name AS template_name, + COALESCE(tvp.name, '') AS preset_name, + o.name AS organization_name, + COUNT(*) AS created_count +FROM first_success_build fsb + JOIN workspaces w ON w.id = fsb.workspace_id + JOIN templates t ON t.id = w.template_id + LEFT JOIN template_version_presets tvp ON tvp.id = fsb.template_version_preset_id + JOIN organizations o ON o.id = w.organization_id +WHERE + NOT t.deleted + -- Exclude workspaces whose first successful start was the prebuilds system user + AND fsb.initiator_id != 'c42fdf75-3097-471c-8c33-fb52454d81c0'::uuid +GROUP BY t.name, COALESCE(tvp.name, ''), o.name +ORDER BY t.name, preset_name, o.name; diff --git a/coderd/prometheusmetrics/prometheusmetrics.go b/coderd/prometheusmetrics/prometheusmetrics.go index 6ea8615f3779a..ed55e4598dc21 100644 --- a/coderd/prometheusmetrics/prometheusmetrics.go +++ b/coderd/prometheusmetrics/prometheusmetrics.go @@ -165,6 +165,18 @@ func Workspaces(ctx context.Context, logger slog.Logger, registerer prometheus.R return nil, err } + workspaceCreationTotal := prometheus.NewCounterVec( + prometheus.CounterOpts{ 
+ Namespace: "coderd", + Name: "workspace_creation_total", + Help: "Total regular (non-prebuilt) workspace creations by organization, template, and preset.", + }, + []string{"organization_name", "template_name", "preset_name"}, + ) + if err := registerer.Register(workspaceCreationTotal); err != nil { + return nil, err + } + ctx, cancelFunc := context.WithCancel(ctx) done := make(chan struct{}) @@ -200,6 +212,27 @@ func Workspaces(ctx context.Context, logger slog.Logger, registerer prometheus.R string(w.LatestBuildTransition), ).Add(1) } + + // Update regular workspaces (without a prebuild transition) creation counter + regularWorkspaces, err := db.GetRegularWorkspaceCreateMetrics(ctx) + if err != nil { + if errors.Is(err, sql.ErrNoRows) { + workspaceCreationTotal.Reset() + } else { + logger.Warn(ctx, "failed to load regular workspaces for metrics", slog.Error(err)) + } + return + } + + workspaceCreationTotal.Reset() + + for _, regularWorkspace := range regularWorkspaces { + workspaceCreationTotal.WithLabelValues( + regularWorkspace.OrganizationName, + regularWorkspace.TemplateName, + regularWorkspace.PresetName, + ).Add(float64(regularWorkspace.CreatedCount)) + } } // Use time.Nanosecond to force an initial tick. 
It will be reset to the diff --git a/coderd/prometheusmetrics/prometheusmetrics_test.go b/coderd/prometheusmetrics/prometheusmetrics_test.go index 28046c1dff3fb..3d8704f92460d 100644 --- a/coderd/prometheusmetrics/prometheusmetrics_test.go +++ b/coderd/prometheusmetrics/prometheusmetrics_test.go @@ -424,6 +424,107 @@ func TestWorkspaceLatestBuildStatuses(t *testing.T) { } } +func TestWorkspaceCreationTotal(t *testing.T) { + t.Parallel() + + for _, tc := range []struct { + Name string + Database func() database.Store + ExpectedWorkspaces int + }{ + { + Name: "None", + Database: func() database.Store { + db, _ := dbtestutil.NewDB(t) + return db + }, + ExpectedWorkspaces: 0, + }, + { + // Should count only the successfully created workspaces + Name: "Multiple", + Database: func() database.Store { + db, _ := dbtestutil.NewDB(t) + u := dbgen.User(t, db, database.User{}) + org := dbgen.Organization(t, db, database.Organization{}) + insertTemplates(t, db, u, org) + insertCanceled(t, db, u, org) + insertFailed(t, db, u, org) + insertFailed(t, db, u, org) + insertSuccess(t, db, u, org) + insertSuccess(t, db, u, org) + insertSuccess(t, db, u, org) + insertRunning(t, db, u, org) + return db + }, + ExpectedWorkspaces: 3, + }, + { + // Should not include prebuilt workspaces + Name: "MultipleWithPrebuild", + Database: func() database.Store { + ctx := context.Background() + db, _ := dbtestutil.NewDB(t) + u := dbgen.User(t, db, database.User{}) + prebuildUser, err := db.GetUserByID(ctx, database.PrebuildsSystemUserID) + require.NoError(t, err) + org := dbgen.Organization(t, db, database.Organization{}) + insertTemplates(t, db, u, org) + insertCanceled(t, db, u, org) + insertFailed(t, db, u, org) + insertSuccess(t, db, u, org) + insertSuccess(t, db, prebuildUser, org) + insertRunning(t, db, u, org) + return db + }, + ExpectedWorkspaces: 1, + }, + { + // Should include deleted workspaces + Name: "MultipleWithDeleted", + Database: func() database.Store { + db, _ := 
dbtestutil.NewDB(t) + u := dbgen.User(t, db, database.User{}) + org := dbgen.Organization(t, db, database.Organization{}) + insertTemplates(t, db, u, org) + insertCanceled(t, db, u, org) + insertFailed(t, db, u, org) + insertSuccess(t, db, u, org) + insertRunning(t, db, u, org) + insertDeleted(t, db, u, org) + return db + }, + ExpectedWorkspaces: 2, + }, + } { + t.Run(tc.Name, func(t *testing.T) { + t.Parallel() + registry := prometheus.NewRegistry() + closeFunc, err := prometheusmetrics.Workspaces(context.Background(), testutil.Logger(t), registry, tc.Database(), testutil.IntervalFast) + require.NoError(t, err) + t.Cleanup(closeFunc) + + require.Eventually(t, func() bool { + metrics, err := registry.Gather() + assert.NoError(t, err) + + sum := 0 + for _, m := range metrics { + if m.GetName() != "coderd_workspace_creation_total" { + continue + } + for _, metric := range m.Metric { + sum += int(metric.GetCounter().GetValue()) + } + } + + t.Logf("count = %d, expected == %d", sum, tc.ExpectedWorkspaces) + return sum == tc.ExpectedWorkspaces + }, testutil.WaitShort, testutil.IntervalFast) + }) + } +} + func TestAgents(t *testing.T) { t.Parallel() @@ -897,6 +998,7 @@ func insertRunning(t *testing.T, db database.Store, u database.User, org databas Transition: database.WorkspaceTransitionStart, Reason: database.BuildReasonInitiator, TemplateVersionID: templateVersionID, + InitiatorID: u.ID, }) require.NoError(t, err) // This marks the job as started. 
diff --git a/coderd/provisionerdserver/metrics.go b/coderd/provisionerdserver/metrics.go new file mode 100644 index 0000000000000..67bd997055e1a --- /dev/null +++ b/coderd/provisionerdserver/metrics.go @@ -0,0 +1,177 @@ +package provisionerdserver + +import ( + "context" + "time" + + "github.com/prometheus/client_golang/prometheus" + + "cdr.dev/slog" +) + +type Metrics struct { + logger slog.Logger + workspaceCreationTimings *prometheus.HistogramVec + workspaceClaimTimings *prometheus.HistogramVec +} + +type WorkspaceTimingType int + +const ( + Unsupported WorkspaceTimingType = iota + WorkspaceCreation + PrebuildCreation + PrebuildClaim +) + +const ( + workspaceTypeRegular = "regular" + workspaceTypePrebuild = "prebuild" +) + +type WorkspaceTimingFlags struct { + IsPrebuild bool + IsClaim bool + IsFirstBuild bool +} + +func NewMetrics(logger slog.Logger) *Metrics { + log := logger.Named("provisionerd_server_metrics") + + return &Metrics{ + logger: log, + workspaceCreationTimings: prometheus.NewHistogramVec(prometheus.HistogramOpts{ + Namespace: "coderd", + Name: "workspace_creation_duration_seconds", + Help: "Time to create a workspace by organization, template, preset, and type (regular or prebuild).", + Buckets: []float64{ + 1, // 1s + 10, + 30, + 60, // 1min + 60 * 5, + 60 * 10, + 60 * 30, // 30min + 60 * 60, // 1hr + }, + NativeHistogramBucketFactor: 1.1, + // Max number of native buckets kept at once to bound memory. + NativeHistogramMaxBucketNumber: 100, + // Merge/flush small buckets periodically to control churn. + NativeHistogramMinResetDuration: time.Hour, + // Treat tiny values as zero (helps with noisy near-zero latencies). 
+ NativeHistogramZeroThreshold: 0, + NativeHistogramMaxZeroThreshold: 0, + }, []string{"organization_name", "template_name", "preset_name", "type"}), + workspaceClaimTimings: prometheus.NewHistogramVec(prometheus.HistogramOpts{ + Namespace: "coderd", + Name: "prebuilt_workspace_claim_duration_seconds", + Help: "Time to claim a prebuilt workspace by organization, template, and preset.", + // Higher resolution between 1–5m to show typical prebuild claim times. + // Cap at 5m since longer claims diminish prebuild value. + Buckets: []float64{ + 1, // 1s + 5, + 10, + 20, + 30, + 60, // 1m + 120, // 2m + 180, // 3m + 240, // 4m + 300, // 5m + }, + NativeHistogramBucketFactor: 1.1, + // Max number of native buckets kept at once to bound memory. + NativeHistogramMaxBucketNumber: 100, + // Merge/flush small buckets periodically to control churn. + NativeHistogramMinResetDuration: time.Hour, + // Treat tiny values as zero (helps with noisy near-zero latencies). + NativeHistogramZeroThreshold: 0, + NativeHistogramMaxZeroThreshold: 0, + }, []string{"organization_name", "template_name", "preset_name"}), + } +} + +func (m *Metrics) Register(reg prometheus.Registerer) error { + if err := reg.Register(m.workspaceCreationTimings); err != nil { + return err + } + return reg.Register(m.workspaceClaimTimings) +} + +func (f WorkspaceTimingFlags) count() int { + count := 0 + if f.IsPrebuild { + count++ + } + if f.IsClaim { + count++ + } + if f.IsFirstBuild { + count++ + } + return count +} + +// getWorkspaceTimingType returns the type of the workspace build: +// - isPrebuild: if the workspace build corresponds to the creation of a prebuilt workspace +// - isClaim: if the workspace build corresponds to the claim of a prebuilt workspace +// - isWorkspaceFirstBuild: if the workspace build corresponds to the creation of a regular workspace +// (not created from the prebuild pool) +func getWorkspaceTimingType(flags WorkspaceTimingFlags) WorkspaceTimingType { + switch { + case 
flags.IsPrebuild: + return PrebuildCreation + case flags.IsClaim: + return PrebuildClaim + case flags.IsFirstBuild: + return WorkspaceCreation + default: + return Unsupported + } +} + +// UpdateWorkspaceTimingsMetrics updates the workspace timing metrics based on the workspace build type +func (m *Metrics) UpdateWorkspaceTimingsMetrics( + ctx context.Context, + flags WorkspaceTimingFlags, + organizationName string, + templateName string, + presetName string, + buildTime float64, +) { + m.logger.Debug(ctx, "update workspace timings metrics", + "organizationName", organizationName, + "templateName", templateName, + "presetName", presetName, + "isPrebuild", flags.IsPrebuild, + "isClaim", flags.IsClaim, + "isWorkspaceFirstBuild", flags.IsFirstBuild) + + if flags.count() > 1 { + m.logger.Warn(ctx, "invalid workspace timing flags", + "isPrebuild", flags.IsPrebuild, + "isClaim", flags.IsClaim, + "isWorkspaceFirstBuild", flags.IsFirstBuild) + return + } + + workspaceTimingType := getWorkspaceTimingType(flags) + switch workspaceTimingType { + case WorkspaceCreation: + // Regular workspace creation (without prebuild pool) + m.workspaceCreationTimings. + WithLabelValues(organizationName, templateName, presetName, workspaceTypeRegular).Observe(buildTime) + case PrebuildCreation: + // Prebuilt workspace creation duration + m.workspaceCreationTimings. + WithLabelValues(organizationName, templateName, presetName, workspaceTypePrebuild).Observe(buildTime) + case PrebuildClaim: + // Prebuilt workspace claim duration + m.workspaceClaimTimings. 
+ WithLabelValues(organizationName, templateName, presetName).Observe(buildTime) + default: + m.logger.Warn(ctx, "unsupported workspace timing flags") + } +} diff --git a/coderd/provisionerdserver/provisionerdserver.go b/coderd/provisionerdserver/provisionerdserver.go index 938fdf1774008..4685dad881674 100644 --- a/coderd/provisionerdserver/provisionerdserver.go +++ b/coderd/provisionerdserver/provisionerdserver.go @@ -129,6 +129,8 @@ type server struct { heartbeatInterval time.Duration heartbeatFn func(ctx context.Context) error + + metrics *Metrics } // We use the null byte (0x00) in generating a canonical map key for tags, so @@ -178,6 +180,7 @@ func NewServer( options Options, enqueuer notifications.Enqueuer, prebuildsOrchestrator *atomic.Pointer[prebuilds.ReconciliationOrchestrator], + metrics *Metrics, ) (proto.DRPCProvisionerDaemonServer, error) { // Fail-fast if pointers are nil if lifecycleCtx == nil { @@ -248,6 +251,7 @@ func NewServer( heartbeatFn: options.HeartbeatFn, PrebuildsOrchestrator: prebuildsOrchestrator, UsageInserter: usageInserter, + metrics: metrics, } if s.heartbeatFn == nil { @@ -2281,6 +2285,50 @@ func (s *server) completeWorkspaceBuildJob(ctx context.Context, job database.Pro } } + // Update workspace (regular and prebuild) timing metrics + if s.metrics != nil { + // Only consider 'start' workspace builds + if workspaceBuild.Transition == database.WorkspaceTransitionStart { + // Get the updated job to report the metrics with correct data + updatedJob, err := s.Database.GetProvisionerJobByID(ctx, jobID) + if err != nil { + s.Logger.Error(ctx, "get updated job from database", slog.Error(err)) + } else + // Only consider 'succeeded' provisioner jobs + if updatedJob.JobStatus == database.ProvisionerJobStatusSucceeded { + presetName := "" + if workspaceBuild.TemplateVersionPresetID.Valid { + preset, err := s.Database.GetPresetByID(ctx, workspaceBuild.TemplateVersionPresetID.UUID) + if err != nil { + if !errors.Is(err, sql.ErrNoRows) { + 
s.Logger.Error(ctx, "get preset by ID for workspace timing metrics", slog.Error(err)) + } + } else { + presetName = preset.Name + } + } + + buildTime := updatedJob.CompletedAt.Time.Sub(updatedJob.StartedAt.Time).Seconds() + s.metrics.UpdateWorkspaceTimingsMetrics( + ctx, + WorkspaceTimingFlags{ + // Is a prebuilt workspace creation build + IsPrebuild: input.PrebuiltWorkspaceBuildStage.IsPrebuild(), + // Is a prebuilt workspace claim build + IsClaim: input.PrebuiltWorkspaceBuildStage.IsPrebuiltWorkspaceClaim(), + // Is a regular workspace creation build + // Only consider the first build number for regular workspaces + IsFirstBuild: workspaceBuild.BuildNumber == 1, + }, + workspace.OrganizationName, + workspace.TemplateName, + presetName, + buildTime, + ) + } + } + } + msg, err := json.Marshal(wspubsub.WorkspaceEvent{ Kind: wspubsub.WorkspaceEventKindStateChange, WorkspaceID: workspace.ID, diff --git a/coderd/provisionerdserver/provisionerdserver_test.go b/coderd/provisionerdserver/provisionerdserver_test.go index 98af0bb86a73f..914f6dd024193 100644 --- a/coderd/provisionerdserver/provisionerdserver_test.go +++ b/coderd/provisionerdserver/provisionerdserver_test.go @@ -4144,6 +4144,7 @@ func setup(t *testing.T, ignoreLogErrors bool, ov *overrides) (proto.DRPCProvisi }, notifEnq, &op, + provisionerdserver.NewMetrics(logger), ) require.NoError(t, err) return srv, db, ps, daemon diff --git a/docs/admin/integrations/prometheus.md b/docs/admin/integrations/prometheus.md index ac88c8c5beda7..47fbc575c7c2e 100644 --- a/docs/admin/integrations/prometheus.md +++ b/docs/admin/integrations/prometheus.md @@ -143,9 +143,12 @@ deployment. They will always be available from the agent. | `coderd_oauth2_external_requests_rate_limit_total` | gauge | DEPRECATED: use coderd_oauth2_external_requests_rate_limit instead | `name` `resource` | | `coderd_oauth2_external_requests_rate_limit_used` | gauge | The number of requests made in this interval. 
| `name` `resource` | | `coderd_oauth2_external_requests_total` | counter | The total number of api calls made to external oauth2 providers. 'status_code' will be 0 if the request failed with no response. | `name` `source` `status_code` | +| `coderd_prebuilt_workspace_claim_duration_seconds` | histogram | Time to claim a prebuilt workspace by organization, template, and preset. | `organization_name` `preset_name` `template_name` | | `coderd_provisionerd_job_timings_seconds` | histogram | The provisioner job time duration in seconds. | `provisioner` `status` | | `coderd_provisionerd_jobs_current` | gauge | The number of currently running provisioner jobs. | `provisioner` | | `coderd_workspace_builds_total` | counter | The number of workspaces started, updated, or deleted. | `action` `owner_email` `status` `template_name` `template_version` `workspace_name` | +| `coderd_workspace_creation_duration_seconds` | histogram | Time to create a workspace by organization, template, preset, and type (regular or prebuild). | `organization_name` `preset_name` `template_name` `type` | +| `coderd_workspace_creation_total` | counter | Total regular (non-prebuilt) workspace creations by organization, template, and preset. | `organization_name` `preset_name` `template_name` | | `coderd_workspace_latest_build_status` | gauge | The current workspace statuses by template, transition, and owner. | `status` `template_name` `template_version` `workspace_owner` `workspace_transition` | | `go_gc_duration_seconds` | summary | A summary of the pause duration of garbage collection cycles. | | | `go_goroutines` | gauge | Number of goroutines that currently exist. | | @@ -185,3 +188,19 @@ deployment. They will always be available from the agent. | `promhttp_metric_handler_requests_total` | counter | Total number of scrapes by HTTP status code. 
| `code` | + +### Note on Prometheus native histogram support + +The following metrics support native histograms: + +* `coderd_workspace_creation_duration_seconds` +* `coderd_prebuilt_workspace_claim_duration_seconds` + +Native histograms are an **experimental** Prometheus feature that removes the need to predefine bucket boundaries and allows higher-resolution buckets that adapt to deployment characteristics. +Whether a metric is exposed as classic or native depends entirely on the Prometheus server configuration (see [Prometheus docs](https://prometheus.io/docs/specs/native_histograms/) for details): + +* If native histograms are enabled, Prometheus ingests the high-resolution histogram. +* If not, it falls back to the predefined buckets. + +⚠️ Important: classic and native histograms cannot be aggregated together. If Prometheus is switched from classic to native at a certain point in time, dashboards may need to account for that transition. +For this reason, it’s recommended to follow [Prometheus’ migration guidelines](https://prometheus.io/docs/specs/native_histograms/#migration-considerations) when moving from classic to native histograms. diff --git a/docs/admin/templates/extending-templates/prebuilt-workspaces.md b/docs/admin/templates/extending-templates/prebuilt-workspaces.md index bf80ca479254a..61734679d4c7d 100644 --- a/docs/admin/templates/extending-templates/prebuilt-workspaces.md +++ b/docs/admin/templates/extending-templates/prebuilt-workspaces.md @@ -300,6 +300,7 @@ Coder provides several metrics to monitor your prebuilt workspaces: - `coderd_prebuilt_workspaces_desired` (gauge): Target number of prebuilt workspaces that should be available. - `coderd_prebuilt_workspaces_running` (gauge): Current number of prebuilt workspaces in a `running` state. - `coderd_prebuilt_workspaces_eligible` (gauge): Current number of prebuilt workspaces eligible to be claimed. 
+- `coderd_prebuilt_workspace_claim_duration_seconds` ([_native histogram_](https://prometheus.io/docs/specs/native_histograms) support): Time to claim a prebuilt workspace from the prebuild pool. #### Logs diff --git a/enterprise/coderd/provisionerdaemons.go b/enterprise/coderd/provisionerdaemons.go index 65b03a7d6b864..be03af29293f9 100644 --- a/enterprise/coderd/provisionerdaemons.go +++ b/enterprise/coderd/provisionerdaemons.go @@ -361,6 +361,7 @@ func (api *API) provisionerDaemonServe(rw http.ResponseWriter, r *http.Request) }, api.NotificationsEnqueuer, &api.AGPL.PrebuildsReconciler, + api.ProvisionerdServerMetrics, ) if err != nil { if !xerrors.Is(err, context.Canceled) { diff --git a/enterprise/coderd/workspaces_test.go b/enterprise/coderd/workspaces_test.go index 12a45cba952e2..31821bb798de9 100644 --- a/enterprise/coderd/workspaces_test.go +++ b/enterprise/coderd/workspaces_test.go @@ -26,6 +26,7 @@ import ( "github.com/coder/coder/v2/coderd/audit" "github.com/coder/coder/v2/coderd/autobuild" "github.com/coder/coder/v2/coderd/coderdtest" + "github.com/coder/coder/v2/coderd/coderdtest/promhelp" "github.com/coder/coder/v2/coderd/database" "github.com/coder/coder/v2/coderd/database/dbauthz" "github.com/coder/coder/v2/coderd/database/dbfake" @@ -2873,6 +2874,133 @@ func TestPrebuildActivityBump(t *testing.T) { require.Zero(t, workspace.LatestBuild.MaxDeadline) } +func TestWorkspaceProvisionerdServerMetrics(t *testing.T) { + t.Parallel() + + // Setup + log := testutil.Logger(t) + reg := prometheus.NewRegistry() + provisionerdserverMetrics := provisionerdserver.NewMetrics(log) + err := provisionerdserverMetrics.Register(reg) + require.NoError(t, err) + client, db, owner := coderdenttest.NewWithDatabase(t, &coderdenttest.Options{ + Options: &coderdtest.Options{ + IncludeProvisionerDaemon: true, + ProvisionerdServerMetrics: provisionerdserverMetrics, + }, + LicenseOptions: &coderdenttest.LicenseOptions{ + Features: license.Features{ + 
codersdk.FeatureWorkspacePrebuilds: 1, + }, + }, + }) + + // Given: a template and a template version with a preset without prebuild instances + presetNoPrebuildID := uuid.New() + versionNoPrebuild := coderdtest.CreateTemplateVersion(t, client, owner.OrganizationID, nil) + _ = coderdtest.AwaitTemplateVersionJobCompleted(t, client, versionNoPrebuild.ID) + templateNoPrebuild := coderdtest.CreateTemplate(t, client, owner.OrganizationID, versionNoPrebuild.ID) + presetNoPrebuild := dbgen.Preset(t, db, database.InsertPresetParams{ + ID: presetNoPrebuildID, + TemplateVersionID: versionNoPrebuild.ID, + }) + + // Given: a template and a template version with a preset with a prebuild instance + presetPrebuildID := uuid.New() + versionPrebuild := coderdtest.CreateTemplateVersion(t, client, owner.OrganizationID, nil) + _ = coderdtest.AwaitTemplateVersionJobCompleted(t, client, versionPrebuild.ID) + templatePrebuild := coderdtest.CreateTemplate(t, client, owner.OrganizationID, versionPrebuild.ID) + presetPrebuild := dbgen.Preset(t, db, database.InsertPresetParams{ + ID: presetPrebuildID, + TemplateVersionID: versionPrebuild.ID, + DesiredInstances: sql.NullInt32{Int32: 1, Valid: true}, + }) + // Given: a prebuild workspace + wb := dbfake.WorkspaceBuild(t, db, database.WorkspaceTable{ + OwnerID: database.PrebuildsSystemUserID, + TemplateID: templatePrebuild.ID, + }).Seed(database.WorkspaceBuild{ + TemplateVersionID: versionPrebuild.ID, + TemplateVersionPresetID: uuid.NullUUID{ + UUID: presetPrebuildID, + Valid: true, + }, + }).WithAgent(func(agent []*proto.Agent) []*proto.Agent { + return agent + }).Do() + + // Mark the prebuilt workspace's agent as ready so the prebuild can be claimed + // nolint:gocritic + ctx := dbauthz.AsSystemRestricted(testutil.Context(t, testutil.WaitLong)) + agent, err := db.GetWorkspaceAgentAndLatestBuildByAuthToken(ctx, uuid.MustParse(wb.AgentToken)) + require.NoError(t, err) + err = db.UpdateWorkspaceAgentLifecycleStateByID(ctx, 
database.UpdateWorkspaceAgentLifecycleStateByIDParams{ + ID: agent.WorkspaceAgent.ID, + LifecycleState: database.WorkspaceAgentLifecycleStateReady, + }) + require.NoError(t, err) + + organizationName, err := client.Organization(ctx, owner.OrganizationID) + require.NoError(t, err) + user, err := client.User(ctx, "testUser") + require.NoError(t, err) + + // Given: no histogram value for prebuilt workspaces claim + prebuiltWorkspaceHistogramMetric := promhelp.MetricValue(t, reg, "coderd_prebuilt_workspace_claim_duration_seconds", prometheus.Labels{ + "organization_name": organizationName.Name, + "template_name": templatePrebuild.Name, + "preset_name": presetPrebuild.Name, + }) + require.Nil(t, prebuiltWorkspaceHistogramMetric) + + // Given: the prebuilt workspace is claimed by a user + claimedWorkspace, err := client.CreateUserWorkspace(ctx, user.ID.String(), codersdk.CreateWorkspaceRequest{ + TemplateVersionID: versionPrebuild.ID, + TemplateVersionPresetID: presetPrebuildID, + Name: coderdtest.RandomUsername(t), + }) + require.NoError(t, err) + coderdtest.AwaitWorkspaceBuildJobCompleted(t, client, claimedWorkspace.LatestBuild.ID) + require.Equal(t, wb.Workspace.ID, claimedWorkspace.ID) + + // Then: the histogram value for prebuilt workspace claim should be updated + prebuiltWorkspaceHistogram := promhelp.HistogramValue(t, reg, "coderd_prebuilt_workspace_claim_duration_seconds", prometheus.Labels{ + "organization_name": organizationName.Name, + "template_name": templatePrebuild.Name, + "preset_name": presetPrebuild.Name, + }) + require.NotNil(t, prebuiltWorkspaceHistogram) + require.Equal(t, uint64(1), prebuiltWorkspaceHistogram.GetSampleCount()) + + // Given: no histogram value for regular workspaces creation + regularWorkspaceHistogramMetric := promhelp.MetricValue(t, reg, "coderd_workspace_creation_duration_seconds", prometheus.Labels{ + "organization_name": organizationName.Name, + "template_name": templateNoPrebuild.Name, + "preset_name": presetNoPrebuild.Name, + 
"type": "regular", + }) + require.Nil(t, regularWorkspaceHistogramMetric) + + // Given: a user creates a regular workspace (without prebuild pool) + regularWorkspace, err := client.CreateUserWorkspace(ctx, user.ID.String(), codersdk.CreateWorkspaceRequest{ + TemplateVersionID: versionNoPrebuild.ID, + TemplateVersionPresetID: presetNoPrebuildID, + Name: coderdtest.RandomUsername(t), + }) + require.NoError(t, err) + coderdtest.AwaitWorkspaceBuildJobCompleted(t, client, regularWorkspace.LatestBuild.ID) + + // Then: the histogram value for regular workspace creation should be updated + regularWorkspaceHistogram := promhelp.HistogramValue(t, reg, "coderd_workspace_creation_duration_seconds", prometheus.Labels{ + "organization_name": organizationName.Name, + "template_name": templateNoPrebuild.Name, + "preset_name": presetNoPrebuild.Name, + "type": "regular", + }) + require.NotNil(t, regularWorkspaceHistogram) + require.Equal(t, uint64(1), regularWorkspaceHistogram.GetSampleCount()) +} + // TestWorkspaceTemplateParamsChange tests a workspace with a parameter that // validation changes on apply. The params used in create workspace are invalid // according to the static params on import. 
diff --git a/scripts/metricsdocgen/metrics b/scripts/metricsdocgen/metrics index 35110a9834efb..20e24d9caa136 100644 --- a/scripts/metricsdocgen/metrics +++ b/scripts/metricsdocgen/metrics @@ -715,6 +715,37 @@ coderd_workspace_latest_build_status{status="failed",template_name="docker",temp coderd_workspace_builds_total{action="START",owner_email="admin@coder.com",status="failed",template_name="docker",template_version="gallant_wright0",workspace_name="test1"} 1 coderd_workspace_builds_total{action="START",owner_email="admin@coder.com",status="success",template_name="docker",template_version="gallant_wright0",workspace_name="test1"} 1 coderd_workspace_builds_total{action="STOP",owner_email="admin@coder.com",status="success",template_name="docker",template_version="gallant_wright0",workspace_name="test1"} 1 +# HELP coderd_workspace_creation_total Total regular (non-prebuilt) workspace creations by organization, template, and preset. +# TYPE coderd_workspace_creation_total counter +coderd_workspace_creation_total{organization_name="{organization}",preset_name="",template_name="docker"} 1 +# HELP coderd_workspace_creation_duration_seconds Time to create a workspace by organization, template, preset, and type (regular or prebuild). 
+# TYPE coderd_workspace_creation_duration_seconds histogram +coderd_workspace_creation_duration_seconds_bucket{organization_name="{organization}",preset_name="Falkenstein",template_name="docker",type="prebuild",le="1"} 0 +coderd_workspace_creation_duration_seconds_bucket{organization_name="{organization}",preset_name="Falkenstein",template_name="docker",type="prebuild",le="10"} 1 +coderd_workspace_creation_duration_seconds_bucket{organization_name="{organization}",preset_name="Falkenstein",template_name="docker",type="prebuild",le="30"} 1 +coderd_workspace_creation_duration_seconds_bucket{organization_name="{organization}",preset_name="Falkenstein",template_name="docker",type="prebuild",le="60"} 1 +coderd_workspace_creation_duration_seconds_bucket{organization_name="{organization}",preset_name="Falkenstein",template_name="docker",type="prebuild",le="300"} 1 +coderd_workspace_creation_duration_seconds_bucket{organization_name="{organization}",preset_name="Falkenstein",template_name="docker",type="prebuild",le="600"} 1 +coderd_workspace_creation_duration_seconds_bucket{organization_name="{organization}",preset_name="Falkenstein",template_name="docker",type="prebuild",le="1800"} 1 +coderd_workspace_creation_duration_seconds_bucket{organization_name="{organization}",preset_name="Falkenstein",template_name="docker",type="prebuild",le="3600"} 1 +coderd_workspace_creation_duration_seconds_bucket{organization_name="{organization}",preset_name="Falkenstein",template_name="docker",type="prebuild",le="+Inf"} 1 +coderd_workspace_creation_duration_seconds_sum{organization_name="{organization}",preset_name="Falkenstein",template_name="template-example",type="prebuild"} 4.406214 +coderd_workspace_creation_duration_seconds_count{organization_name="{organization}",preset_name="Falkenstein",template_name="template-example",type="prebuild"} 1 +# HELP coderd_prebuilt_workspace_claim_duration_seconds Time to claim a prebuilt workspace by organization, template, and preset. 
+# TYPE coderd_prebuilt_workspace_claim_duration_seconds histogram +coderd_prebuilt_workspace_claim_duration_seconds_bucket{organization_name="{organization}",preset_name="Falkenstein",template_name="docker",le="1"} 0 +coderd_prebuilt_workspace_claim_duration_seconds_bucket{organization_name="{organization}",preset_name="Falkenstein",template_name="docker",le="5"} 1 +coderd_prebuilt_workspace_claim_duration_seconds_bucket{organization_name="{organization}",preset_name="Falkenstein",template_name="docker",le="10"} 1 +coderd_prebuilt_workspace_claim_duration_seconds_bucket{organization_name="{organization}",preset_name="Falkenstein",template_name="docker",le="20"} 1 +coderd_prebuilt_workspace_claim_duration_seconds_bucket{organization_name="{organization}",preset_name="Falkenstein",template_name="docker",le="30"} 1 +coderd_prebuilt_workspace_claim_duration_seconds_bucket{organization_name="{organization}",preset_name="Falkenstein",template_name="docker",le="60"} 1 +coderd_prebuilt_workspace_claim_duration_seconds_bucket{organization_name="{organization}",preset_name="Falkenstein",template_name="docker",le="120"} 1 +coderd_prebuilt_workspace_claim_duration_seconds_bucket{organization_name="{organization}",preset_name="Falkenstein",template_name="docker",le="180"} 1 +coderd_prebuilt_workspace_claim_duration_seconds_bucket{organization_name="{organization}",preset_name="Falkenstein",template_name="docker",le="240"} 1 +coderd_prebuilt_workspace_claim_duration_seconds_bucket{organization_name="{organization}",preset_name="Falkenstein",template_name="docker",le="300"} 1 +coderd_prebuilt_workspace_claim_duration_seconds_bucket{organization_name="{organization}",preset_name="Falkenstein",template_name="docker",le="+Inf"} 1 +coderd_prebuilt_workspace_claim_duration_seconds_sum{organization_name="{organization}",preset_name="Falkenstein",template_name="docker"} 4.860075 
+coderd_prebuilt_workspace_claim_duration_seconds_count{organization_name="{organization}",preset_name="Falkenstein",template_name="docker"} 1 # HELP go_gc_duration_seconds A summary of the pause duration of garbage collection cycles. # TYPE go_gc_duration_seconds summary go_gc_duration_seconds{quantile="0"} 2.4056e-05