From f77606d2ce598ed38799b17df0177c6502b6ac62 Mon Sep 17 00:00:00 2001 From: Danny Kopping Date: Mon, 4 Nov 2024 08:23:03 +0000 Subject: [PATCH 01/41] Update actions/upload-artifact See https://github.blog/changelog/2024-02-13-deprecation-notice-v1-and-v2-of-the-artifact-actions/ Signed-off-by: Danny Kopping --- .github/workflows/nightly-build.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/nightly-build.yaml b/.github/workflows/nightly-build.yaml index 40a6935..13316df 100644 --- a/.github/workflows/nightly-build.yaml +++ b/.github/workflows/nightly-build.yaml @@ -35,7 +35,7 @@ jobs: - name: Upload script output if: failure() - uses: actions/upload-artifact@v2 + uses: actions/upload-artifact@v4 with: name: script-output path: output.log From 0c9503e39df903fb5b5afa0b2afaa7b9821520b6 Mon Sep 17 00:00:00 2001 From: Danny Kopping Date: Mon, 4 Nov 2024 08:25:25 +0000 Subject: [PATCH 02/41] Always upload script output Signed-off-by: Danny Kopping --- .github/workflows/nightly-build.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/nightly-build.yaml b/.github/workflows/nightly-build.yaml index 13316df..b95af20 100644 --- a/.github/workflows/nightly-build.yaml +++ b/.github/workflows/nightly-build.yaml @@ -34,7 +34,6 @@ jobs: continue-on-error: true - name: Upload script output - if: failure() uses: actions/upload-artifact@v4 with: name: script-output From f841254ef412e09d604453492ce0fea8688802fc Mon Sep 17 00:00:00 2001 From: Danny Kopping Date: Mon, 4 Nov 2024 08:26:55 +0000 Subject: [PATCH 03/41] Do not continue on build error Signed-off-by: Danny Kopping --- .github/workflows/nightly-build.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/nightly-build.yaml b/.github/workflows/nightly-build.yaml index b95af20..ea31b55 100644 --- a/.github/workflows/nightly-build.yaml +++ b/.github/workflows/nightly-build.yaml @@ -31,7 +31,7 @@ jobs: - name: make build run: | make 
build > output.log 2>&1 - continue-on-error: true + continue-on-error: false - name: Upload script output uses: actions/upload-artifact@v4 From 490afea1e2cbc6d7af44227b0809394893606787 Mon Sep 17 00:00:00 2001 From: Danny Kopping Date: Mon, 4 Nov 2024 08:35:38 +0000 Subject: [PATCH 04/41] Update version constraints to match any patch releases only, not minors Fixes #27 Signed-off-by: Danny Kopping --- coder-observability/Chart.lock | 8 ++++---- coder-observability/Chart.yaml | 8 ++++---- compiled/resources.yaml | 10 +++++----- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/coder-observability/Chart.lock b/coder-observability/Chart.lock index 2aeb46c..45c04a9 100644 --- a/coder-observability/Chart.lock +++ b/coder-observability/Chart.lock @@ -4,12 +4,12 @@ dependencies: version: 7.3.12 - name: prometheus repository: https://prometheus-community.github.io/helm-charts - version: 25.24.1 + version: 25.24.2 - name: loki repository: https://grafana.github.io/helm-charts - version: 6.7.3 + version: 6.7.4 - name: grafana-agent repository: https://grafana.github.io/helm-charts version: 0.37.0 -digest: sha256:10a5d2b617b691e0ed87ca9e31c86618e05ca3b8031ddb3b417610f47e8bb069 -generated: "2024-07-26T10:52:20.819468+02:00" +digest: sha256:05e0dae0200cabf5cb9e2cfb18a4e166fcaceefaf39827addff4299b18c31d4e +generated: "2024-11-04T08:35:11.202671Z" diff --git a/coder-observability/Chart.yaml b/coder-observability/Chart.yaml index 8de1651..9e40bfa 100644 --- a/coder-observability/Chart.yaml +++ b/coder-observability/Chart.yaml @@ -8,20 +8,20 @@ dependencies: - name: grafana condition: grafana.enabled repository: https://grafana.github.io/helm-charts - version: '^v7.3.7' + version: '~v7.3.7' - name: prometheus condition: prometheus.enabled repository: https://prometheus-community.github.io/helm-charts - version: '^v25.18.0' + version: '~v25.24.1' - name: loki condition: loki.enabled repository: https://grafana.github.io/helm-charts - version: '^v6.3.4' + version: 
'~v6.7.3' - name: grafana-agent alias: grafana-agent condition: grafana-agent.enabled repository: https://grafana.github.io/helm-charts - version: '^0.37.0' + version: '~0.37.0' maintainers: - name: Coder Technologies, Inc. url: https://github.com/coder/observability/issues diff --git a/compiled/resources.yaml b/compiled/resources.yaml index c2d7968..2b93b3d 100644 --- a/compiled/resources.yaml +++ b/compiled/resources.yaml @@ -10162,7 +10162,7 @@ spec: template: metadata: annotations: - checksum/config: 308677931777ae343a565387f5edecd66c53876ef7d120e9df3c6196c1884b30 + checksum/config: bc7add19cdc0df1566dec1bf8f9421082357d4393124d6ea2df28d7e5888cc8a labels: app.kubernetes.io/name: loki app.kubernetes.io/instance: coder-observability @@ -10246,7 +10246,7 @@ spec: template: metadata: annotations: - checksum/config: 616b4c9b39f90d71edfd156f86c7f57751f662542a83f3e345c30496c3da7d27 + checksum/config: 4dbe50185304244ab527314b7723d048ea4544f97d0a4d8e0192863861811005 prometheus.io/scrape: "true" labels: app.kubernetes.io/part-of: memberlist @@ -10725,7 +10725,7 @@ spec: template: metadata: annotations: - checksum/config: 616b4c9b39f90d71edfd156f86c7f57751f662542a83f3e345c30496c3da7d27 + checksum/config: 4dbe50185304244ab527314b7723d048ea4544f97d0a4d8e0192863861811005 prometheus.io/scrape: "true" labels: app.kubernetes.io/name: loki @@ -11042,7 +11042,7 @@ spec: template: metadata: annotations: - checksum/config: 616b4c9b39f90d71edfd156f86c7f57751f662542a83f3e345c30496c3da7d27 + checksum/config: 4dbe50185304244ab527314b7723d048ea4544f97d0a4d8e0192863861811005 prometheus.io/scrape: "true" labels: app.kubernetes.io/name: loki @@ -11242,7 +11242,7 @@ spec: app.kubernetes.io/name: prometheus app.kubernetes.io/instance: coder-observability app.kubernetes.io/version: v2.53.1 - helm.sh/chart: prometheus-25.24.1 + helm.sh/chart: prometheus-25.24.2 app.kubernetes.io/managed-by: Helm app.kubernetes.io/part-of: prometheus spec: From b5eabbae78d013fabdc8fb4ee9dd377778118808 Mon Sep 17 
00:00:00 2001 From: Danny Kopping Date: Mon, 4 Nov 2024 08:40:10 +0000 Subject: [PATCH 05/41] Update README.md Signed-off-by: Danny Kopping --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index f479746..28080d1 100644 --- a/README.md +++ b/README.md @@ -203,10 +203,10 @@ stringData: | Repository | Name | Version | |------------|------|---------| -| https://grafana.github.io/helm-charts | grafana | ^v7.3.7 | -| https://grafana.github.io/helm-charts | grafana-agent(grafana-agent) | ^0.37.0 | -| https://grafana.github.io/helm-charts | loki | ^v6.3.4 | -| https://prometheus-community.github.io/helm-charts | prometheus | ^v25.18.0 | +| https://grafana.github.io/helm-charts | grafana | ~v7.3.7 | +| https://grafana.github.io/helm-charts | grafana-agent(grafana-agent) | ~0.37.0 | +| https://grafana.github.io/helm-charts | loki | ~v6.7.3 | +| https://prometheus-community.github.io/helm-charts | prometheus | ~v25.24.1 | Each subchart can be disabled by setting the `enabled` field to `false`. 
From 729a2018f0cd812575f73a9773e461eea825392b Mon Sep 17 00:00:00 2001 From: Eric Date: Wed, 15 Jan 2025 20:25:02 +0000 Subject: [PATCH 06/41] feat: add statefulset image overrides Signed-off-by: Danny Kopping --- coder-observability/Chart.lock | 2 +- .../templates/statefulset-postgres-exporter.yaml | 4 ++-- .../templates/statefulset-runbook-viewer.yaml | 4 ++-- .../templates/statefulset-sql-exporter.yaml | 4 ++-- coder-observability/values.yaml | 11 +++++++++++ 5 files changed, 18 insertions(+), 7 deletions(-) diff --git a/coder-observability/Chart.lock b/coder-observability/Chart.lock index 45c04a9..059ed11 100644 --- a/coder-observability/Chart.lock +++ b/coder-observability/Chart.lock @@ -12,4 +12,4 @@ dependencies: repository: https://grafana.github.io/helm-charts version: 0.37.0 digest: sha256:05e0dae0200cabf5cb9e2cfb18a4e166fcaceefaf39827addff4299b18c31d4e -generated: "2024-11-04T08:35:11.202671Z" +generated: "2025-01-15T20:35:57.318818175Z" diff --git a/coder-observability/templates/statefulset-postgres-exporter.yaml b/coder-observability/templates/statefulset-postgres-exporter.yaml index bcef353..13a4cbf 100644 --- a/coder-observability/templates/statefulset-postgres-exporter.yaml +++ b/coder-observability/templates/statefulset-postgres-exporter.yaml @@ -19,8 +19,8 @@ spec: app.kubernetes.io/name: "database-stats" spec: containers: - - name: postgres-exporter - image: quay.io/prometheuscommunity/postgres-exporter + - name: {{ .Values.global.postgres.exporter.containerName }} + image: {{ .Values.global.postgres.exporter.image }} args: - --collector.long_running_transactions ports: diff --git a/coder-observability/templates/statefulset-runbook-viewer.yaml b/coder-observability/templates/statefulset-runbook-viewer.yaml index 0ab2179..f2e5aed 100644 --- a/coder-observability/templates/statefulset-runbook-viewer.yaml +++ b/coder-observability/templates/statefulset-runbook-viewer.yaml @@ -18,8 +18,8 @@ spec: app: runbook-viewer spec: containers: - - name: 
madness - image: dannyben/madness + - name: {{ .Values.runbook.containerName }} + image: {{ .Values.runbook.image }} ports: - containerPort: 3000 name: madness diff --git a/coder-observability/templates/statefulset-sql-exporter.yaml b/coder-observability/templates/statefulset-sql-exporter.yaml index 3ef64c2..6846058 100644 --- a/coder-observability/templates/statefulset-sql-exporter.yaml +++ b/coder-observability/templates/statefulset-sql-exporter.yaml @@ -20,8 +20,8 @@ spec: app.kubernetes.io/name: "database-stats" spec: containers: - - name: sql-exporter - image: burningalchemist/sql_exporter + - name: {{ .Values.sqlExporter.containerName }} + image: {{ .Values.sqlExporter.image }} args: - -config.file=/cfg/config.yaml ports: diff --git a/coder-observability/values.yaml b/coder-observability/values.yaml index c66a6ac..ee3cc48 100644 --- a/coder-observability/values.yaml +++ b/coder-observability/values.yaml @@ -115,6 +115,9 @@ global: sslmode: disable # ensure that your secret has a field named `PGPASSWORD` mountSecret: "secret-postgres" + exporter: + containerName: "postgres-exporter" + image: "quay.io/prometheuscommunity/postgres-exporter" # global.postgres.alerts -- alerts for postgres alerts: @@ -146,6 +149,14 @@ global: # global.dashboards.queryTimeout -- how long until a query in Grafana will timeout after queryTimeout: 900 +runbook: + containerName: "madness" + image: "dannyben/madness" + +sqlExporter: + containerName: "sql-exporter" + image: "burningalchemist/sql_exporter" + grafana-agent: enabled: true fullnameOverride: grafana-agent From 0cac5bb750cd823376020563efcd679ed62c0d3c Mon Sep 17 00:00:00 2001 From: Danny Kopping Date: Wed, 15 Jan 2025 20:43:48 +0000 Subject: [PATCH 07/41] make readme Signed-off-by: Danny Kopping --- README.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 28080d1..8acaa1b 100644 --- a/README.md +++ b/README.md @@ -242,7 +242,7 @@ values which are defined 
[here](https://github.com/grafana/helm-charts/tree/main | global.dashboards.timerange | string | `"12h"` | how far back dashboards should look | | global.externalScheme | string | `"http"` | | | global.externalZone | string | `"svc.cluster.local"` | | -| global.postgres | object | `{"alerts":{"groups":{"Basic":{"delay":"1m","enabled":true},"Connections":{"delay":"5m","enabled":true,"thresholds":{"critical":0.9,"notify":0.5,"warning":0.8}},"Notifications":{"delay":"15m","enabled":true,"thresholds":{"critical":0.9,"notify":0.5,"warning":0.8}}}},"database":"coder","hostname":"localhost","mountSecret":"secret-postgres","password":null,"port":5432,"sslmode":"disable","username":"coder"}` | postgres connection information NOTE: these settings are global so we can parameterise some values which get rendered by subcharts | +| global.postgres | object | `{"alerts":{"groups":{"Basic":{"delay":"1m","enabled":true},"Connections":{"delay":"5m","enabled":true,"thresholds":{"critical":0.9,"notify":0.5,"warning":0.8}},"Notifications":{"delay":"15m","enabled":true,"thresholds":{"critical":0.9,"notify":0.5,"warning":0.8}}}},"database":"coder","exporter":{"containerName":"postgres-exporter","image":"quay.io/prometheuscommunity/postgres-exporter"},"hostname":"localhost","mountSecret":"secret-postgres","password":null,"port":5432,"sslmode":"disable","username":"coder"}` | postgres connection information NOTE: these settings are global so we can parameterise some values which get rendered by subcharts | | global.postgres.alerts | object | `{"groups":{"Basic":{"delay":"1m","enabled":true},"Connections":{"delay":"5m","enabled":true,"thresholds":{"critical":0.9,"notify":0.5,"warning":0.8}},"Notifications":{"delay":"15m","enabled":true,"thresholds":{"critical":0.9,"notify":0.5,"warning":0.8}}}}` | alerts for postgres | | global.telemetry | object | `{"metrics":{"scrape_interval":"15s","scrape_timeout":"12s"}}` | control telemetry collection | | global.telemetry.metrics | object | 
`{"scrape_interval":"15s","scrape_timeout":"12s"}` | control metric collection | @@ -466,4 +466,8 @@ values which are defined [here](https://github.com/grafana/helm-charts/tree/main | prometheus.serverFiles."prometheus.yml".rule_files[0] | string | `"/etc/config/alerts/*.yaml"` | | | prometheus.serverFiles."prometheus.yml".scrape_configs | string | `nil` | | | prometheus.testFramework.enabled | bool | `false` | | +| runbook.containerName | string | `"madness"` | | +| runbook.image | string | `"dannyben/madness"` | | +| sqlExporter.containerName | string | `"sql-exporter"` | | +| sqlExporter.image | string | `"burningalchemist/sql_exporter"` | | From 9a759d60dc043c142c5f580685f44dd410c52e51 Mon Sep 17 00:00:00 2001 From: Eric Date: Wed, 15 Jan 2025 20:49:21 +0000 Subject: [PATCH 08/41] rm container name overrides Signed-off-by: Danny Kopping --- .vscode/settings.json | 3 +++ README.md | 6 ++---- coder-observability/Chart.lock | 2 +- .../templates/statefulset-postgres-exporter.yaml | 2 +- .../templates/statefulset-runbook-viewer.yaml | 4 ++-- coder-observability/templates/statefulset-sql-exporter.yaml | 2 +- coder-observability/values.yaml | 5 +---- 7 files changed, 11 insertions(+), 13 deletions(-) create mode 100644 .vscode/settings.json diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..082b194 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,3 @@ +{ + "makefile.configureOnOpen": false +} \ No newline at end of file diff --git a/README.md b/README.md index 8acaa1b..cdb096b 100644 --- a/README.md +++ b/README.md @@ -242,7 +242,7 @@ values which are defined [here](https://github.com/grafana/helm-charts/tree/main | global.dashboards.timerange | string | `"12h"` | how far back dashboards should look | | global.externalScheme | string | `"http"` | | | global.externalZone | string | `"svc.cluster.local"` | | -| global.postgres | object | 
`{"alerts":{"groups":{"Basic":{"delay":"1m","enabled":true},"Connections":{"delay":"5m","enabled":true,"thresholds":{"critical":0.9,"notify":0.5,"warning":0.8}},"Notifications":{"delay":"15m","enabled":true,"thresholds":{"critical":0.9,"notify":0.5,"warning":0.8}}}},"database":"coder","exporter":{"containerName":"postgres-exporter","image":"quay.io/prometheuscommunity/postgres-exporter"},"hostname":"localhost","mountSecret":"secret-postgres","password":null,"port":5432,"sslmode":"disable","username":"coder"}` | postgres connection information NOTE: these settings are global so we can parameterise some values which get rendered by subcharts | +| global.postgres | object | `{"alerts":{"groups":{"Basic":{"delay":"1m","enabled":true},"Connections":{"delay":"5m","enabled":true,"thresholds":{"critical":0.9,"notify":0.5,"warning":0.8}},"Notifications":{"delay":"15m","enabled":true,"thresholds":{"critical":0.9,"notify":0.5,"warning":0.8}}}},"database":"coder","exporter":{"image":"quay.io/prometheuscommunity/postgres-exporter"},"hostname":"localhost","mountSecret":"secret-postgres","password":null,"port":5432,"sslmode":"disable","username":"coder"}` | postgres connection information NOTE: these settings are global so we can parameterise some values which get rendered by subcharts | | global.postgres.alerts | object | `{"groups":{"Basic":{"delay":"1m","enabled":true},"Connections":{"delay":"5m","enabled":true,"thresholds":{"critical":0.9,"notify":0.5,"warning":0.8}},"Notifications":{"delay":"15m","enabled":true,"thresholds":{"critical":0.9,"notify":0.5,"warning":0.8}}}}` | alerts for postgres | | global.telemetry | object | `{"metrics":{"scrape_interval":"15s","scrape_timeout":"12s"}}` | control telemetry collection | | global.telemetry.metrics | object | `{"scrape_interval":"15s","scrape_timeout":"12s"}` | control metric collection | @@ -466,8 +466,6 @@ values which are defined [here](https://github.com/grafana/helm-charts/tree/main | 
prometheus.serverFiles."prometheus.yml".rule_files[0] | string | `"/etc/config/alerts/*.yaml"` | | | prometheus.serverFiles."prometheus.yml".scrape_configs | string | `nil` | | | prometheus.testFramework.enabled | bool | `false` | | -| runbook.containerName | string | `"madness"` | | -| runbook.image | string | `"dannyben/madness"` | | -| sqlExporter.containerName | string | `"sql-exporter"` | | +| runbookViewer.image | string | `"dannyben/madness"` | | | sqlExporter.image | string | `"burningalchemist/sql_exporter"` | | diff --git a/coder-observability/Chart.lock b/coder-observability/Chart.lock index 059ed11..1782a88 100644 --- a/coder-observability/Chart.lock +++ b/coder-observability/Chart.lock @@ -12,4 +12,4 @@ dependencies: repository: https://grafana.github.io/helm-charts version: 0.37.0 digest: sha256:05e0dae0200cabf5cb9e2cfb18a4e166fcaceefaf39827addff4299b18c31d4e -generated: "2025-01-15T20:35:57.318818175Z" +generated: "2025-01-16T07:54:38.036598102Z" diff --git a/coder-observability/templates/statefulset-postgres-exporter.yaml b/coder-observability/templates/statefulset-postgres-exporter.yaml index 13a4cbf..229c650 100644 --- a/coder-observability/templates/statefulset-postgres-exporter.yaml +++ b/coder-observability/templates/statefulset-postgres-exporter.yaml @@ -19,7 +19,7 @@ spec: app.kubernetes.io/name: "database-stats" spec: containers: - - name: {{ .Values.global.postgres.exporter.containerName }} + - name: postgres-exporter image: {{ .Values.global.postgres.exporter.image }} args: - --collector.long_running_transactions diff --git a/coder-observability/templates/statefulset-runbook-viewer.yaml b/coder-observability/templates/statefulset-runbook-viewer.yaml index f2e5aed..64f50e4 100644 --- a/coder-observability/templates/statefulset-runbook-viewer.yaml +++ b/coder-observability/templates/statefulset-runbook-viewer.yaml @@ -18,8 +18,8 @@ spec: app: runbook-viewer spec: containers: - - name: {{ .Values.runbook.containerName }} - image: {{ 
.Values.runbook.image }} + - name: madness + image: {{ .Values.runbookViewer.image }} ports: - containerPort: 3000 name: madness diff --git a/coder-observability/templates/statefulset-sql-exporter.yaml b/coder-observability/templates/statefulset-sql-exporter.yaml index 6846058..628339e 100644 --- a/coder-observability/templates/statefulset-sql-exporter.yaml +++ b/coder-observability/templates/statefulset-sql-exporter.yaml @@ -20,7 +20,7 @@ spec: app.kubernetes.io/name: "database-stats" spec: containers: - - name: {{ .Values.sqlExporter.containerName }} + - name: sql-exporter image: {{ .Values.sqlExporter.image }} args: - -config.file=/cfg/config.yaml diff --git a/coder-observability/values.yaml b/coder-observability/values.yaml index ee3cc48..bcc35da 100644 --- a/coder-observability/values.yaml +++ b/coder-observability/values.yaml @@ -116,7 +116,6 @@ global: # ensure that your secret has a field named `PGPASSWORD` mountSecret: "secret-postgres" exporter: - containerName: "postgres-exporter" image: "quay.io/prometheuscommunity/postgres-exporter" # global.postgres.alerts -- alerts for postgres @@ -149,12 +148,10 @@ global: # global.dashboards.queryTimeout -- how long until a query in Grafana will timeout after queryTimeout: 900 -runbook: - containerName: "madness" +runbookViewer: image: "dannyben/madness" sqlExporter: - containerName: "sql-exporter" image: "burningalchemist/sql_exporter" grafana-agent: From c823932f62798ef9683536fb1f6037a93df827d2 Mon Sep 17 00:00:00 2001 From: Danny Kopping Date: Thu, 16 Jan 2025 08:03:20 +0000 Subject: [PATCH 09/41] Update go version so prometheus rules linter passes Signed-off-by: Danny Kopping --- .github/workflows/lint.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/lint.yaml b/.github/workflows/lint.yaml index 7a73b63..4efb8fb 100644 --- a/.github/workflows/lint.yaml +++ b/.github/workflows/lint.yaml @@ -17,7 +17,7 @@ jobs: - name: Setup Go uses: actions/setup-go@v2 with: - go-version: 
1.22 + go-version: 1.23 - name: Install Helm uses: azure/setup-helm@v4 From 76a8f346a5fe81ab41adc32657a541e629208355 Mon Sep 17 00:00:00 2001 From: Danny Kopping Date: Thu, 16 Jan 2025 08:19:29 +0000 Subject: [PATCH 10/41] Removing unnecessary file Signed-off-by: Danny Kopping --- .vscode/settings.json | 3 --- 1 file changed, 3 deletions(-) delete mode 100644 .vscode/settings.json diff --git a/.vscode/settings.json b/.vscode/settings.json deleted file mode 100644 index 082b194..0000000 --- a/.vscode/settings.json +++ /dev/null @@ -1,3 +0,0 @@ -{ - "makefile.configureOnOpen": false -} \ No newline at end of file From a81a7181b124c7ea7ab8f234f4ec1b11cf4c0ee3 Mon Sep 17 00:00:00 2001 From: JD Baudean Date: Mon, 17 Feb 2025 15:10:22 -0500 Subject: [PATCH 11/41] Add Ingress config to README.md --- README.md | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/README.md b/README.md index cdb096b..9840f29 100644 --- a/README.md +++ b/README.md @@ -199,6 +199,22 @@ stringData: password: "" # this matches the "passwordKey" field above ``` +To add an Ingress for Grafana, define this in your `values.yaml`: + +```yaml +grafana: + grafana.ini: + server: + domain: observability.example.com + root_url: "%(protocol)s://%(domain)s/grafana" + serve_from_sub_path: true + ingress: + enabled: true + hosts: + - "observability.example.com" + path: "/" +``` + ## Subcharts | Repository | Name | Version | From a2ae0cf1ffea15c3728ea19947e2360f2fe5b208 Mon Sep 17 00:00:00 2001 From: Danny Kopping Date: Fri, 18 Apr 2025 12:00:04 +0000 Subject: [PATCH 12/41] Correcting yq v4 syntax Signed-off-by: Danny Kopping --- scripts/compile.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/compile.sh b/scripts/compile.sh index 13515de..6e6474b 100755 --- a/scripts/compile.sh +++ b/scripts/compile.sh @@ -13,7 +13,7 @@ helm dependency update coder-observability/ # We *expect* that the versions will change in the rendered template output, so we ignore those, but # if 
there are changes to the manifests themselves then we need to fail the build to force manual review. helm template --namespace coder-observability -f coder-observability/values.yaml coder-observability coder-observability/ | \ - yq 'del(.spec.template.spec.containers[].image, .metadata.labels."helm.sh/chart", .metadata.labels."app.kubernetes.io/version")' - \ + yq e 'del(.spec.template.spec.containers[].image, .metadata.labels."helm.sh/chart", .metadata.labels."app.kubernetes.io/version")' - \ > compiled/resources.yaml check_unstaged "compiled" \ No newline at end of file From b41f9da85c76318d7e56a8f42fbd0bd35b7e0f20 Mon Sep 17 00:00:00 2001 From: Danny Kopping Date: Fri, 18 Apr 2025 11:36:24 +0000 Subject: [PATCH 13/41] Overwrite default prometheus relabel configs Signed-off-by: Danny Kopping --- coder-observability/values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/coder-observability/values.yaml b/coder-observability/values.yaml index bcc35da..d035dd8 100644 --- a/coder-observability/values.yaml +++ b/coder-observability/values.yaml @@ -452,7 +452,7 @@ prometheus: serverFiles: prometheus.yml: # disables scraping of metrics by the Prometheus helm chart since this is managed by the collector - scrape_configs: + scrape_configs: [] # use custom rule files to be able to render templates (can't do that in values.yaml, unless that value is evaluated by a tpl call) rule_files: - /etc/config/alerts/*.yaml From 4f2edce67e2f18fa20084159a4e898bd5667d167 Mon Sep 17 00:00:00 2001 From: Danny Kopping Date: Fri, 18 Apr 2025 12:35:22 +0000 Subject: [PATCH 14/41] make lint Signed-off-by: Danny Kopping --- compiled/resources.yaml | 301 +--------------------------------------- 1 file changed, 1 insertion(+), 300 deletions(-) diff --git a/compiled/resources.yaml b/compiled/resources.yaml index 2b93b3d..ccdf33b 100644 --- a/compiled/resources.yaml +++ b/compiled/resources.yaml @@ -735,306 +735,7 @@ data: scrape_timeout: 10s rule_files: - 
/etc/config/alerts/*.yaml - scrape_configs: - - job_name: prometheus - static_configs: - - targets: - - localhost:9090 - - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token - job_name: kubernetes-apiservers - kubernetes_sd_configs: - - role: endpoints - relabel_configs: - - action: keep - regex: default;kubernetes;https - source_labels: - - __meta_kubernetes_namespace - - __meta_kubernetes_service_name - - __meta_kubernetes_endpoint_port_name - scheme: https - tls_config: - ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt - insecure_skip_verify: true - - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token - job_name: kubernetes-nodes - kubernetes_sd_configs: - - role: node - relabel_configs: - - action: labelmap - regex: __meta_kubernetes_node_label_(.+) - - replacement: kubernetes.default.svc:443 - target_label: __address__ - - regex: (.+) - replacement: /api/v1/nodes/$1/proxy/metrics - source_labels: - - __meta_kubernetes_node_name - target_label: __metrics_path__ - scheme: https - tls_config: - ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt - insecure_skip_verify: true - - bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token - job_name: kubernetes-nodes-cadvisor - kubernetes_sd_configs: - - role: node - relabel_configs: - - action: labelmap - regex: __meta_kubernetes_node_label_(.+) - - replacement: kubernetes.default.svc:443 - target_label: __address__ - - regex: (.+) - replacement: /api/v1/nodes/$1/proxy/metrics/cadvisor - source_labels: - - __meta_kubernetes_node_name - target_label: __metrics_path__ - scheme: https - tls_config: - ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt - insecure_skip_verify: true - - honor_labels: true - job_name: kubernetes-service-endpoints - kubernetes_sd_configs: - - role: endpoints - relabel_configs: - - action: keep - regex: true - source_labels: - - __meta_kubernetes_service_annotation_prometheus_io_scrape - - action: drop - 
regex: true - source_labels: - - __meta_kubernetes_service_annotation_prometheus_io_scrape_slow - - action: replace - regex: (https?) - source_labels: - - __meta_kubernetes_service_annotation_prometheus_io_scheme - target_label: __scheme__ - - action: replace - regex: (.+) - source_labels: - - __meta_kubernetes_service_annotation_prometheus_io_path - target_label: __metrics_path__ - - action: replace - regex: (.+?)(?::\d+)?;(\d+) - replacement: $1:$2 - source_labels: - - __address__ - - __meta_kubernetes_service_annotation_prometheus_io_port - target_label: __address__ - - action: labelmap - regex: __meta_kubernetes_service_annotation_prometheus_io_param_(.+) - replacement: __param_$1 - - action: labelmap - regex: __meta_kubernetes_service_label_(.+) - - action: replace - source_labels: - - __meta_kubernetes_namespace - target_label: namespace - - action: replace - source_labels: - - __meta_kubernetes_service_name - target_label: service - - action: replace - source_labels: - - __meta_kubernetes_pod_node_name - target_label: node - - honor_labels: true - job_name: kubernetes-service-endpoints-slow - kubernetes_sd_configs: - - role: endpoints - relabel_configs: - - action: keep - regex: true - source_labels: - - __meta_kubernetes_service_annotation_prometheus_io_scrape_slow - - action: replace - regex: (https?) 
- source_labels: - - __meta_kubernetes_service_annotation_prometheus_io_scheme - target_label: __scheme__ - - action: replace - regex: (.+) - source_labels: - - __meta_kubernetes_service_annotation_prometheus_io_path - target_label: __metrics_path__ - - action: replace - regex: (.+?)(?::\d+)?;(\d+) - replacement: $1:$2 - source_labels: - - __address__ - - __meta_kubernetes_service_annotation_prometheus_io_port - target_label: __address__ - - action: labelmap - regex: __meta_kubernetes_service_annotation_prometheus_io_param_(.+) - replacement: __param_$1 - - action: labelmap - regex: __meta_kubernetes_service_label_(.+) - - action: replace - source_labels: - - __meta_kubernetes_namespace - target_label: namespace - - action: replace - source_labels: - - __meta_kubernetes_service_name - target_label: service - - action: replace - source_labels: - - __meta_kubernetes_pod_node_name - target_label: node - scrape_interval: 5m - scrape_timeout: 30s - - honor_labels: true - job_name: prometheus-pushgateway - kubernetes_sd_configs: - - role: service - relabel_configs: - - action: keep - regex: pushgateway - source_labels: - - __meta_kubernetes_service_annotation_prometheus_io_probe - - honor_labels: true - job_name: kubernetes-services - kubernetes_sd_configs: - - role: service - metrics_path: /probe - params: - module: - - http_2xx - relabel_configs: - - action: keep - regex: true - source_labels: - - __meta_kubernetes_service_annotation_prometheus_io_probe - - source_labels: - - __address__ - target_label: __param_target - - replacement: blackbox - target_label: __address__ - - source_labels: - - __param_target - target_label: instance - - action: labelmap - regex: __meta_kubernetes_service_label_(.+) - - source_labels: - - __meta_kubernetes_namespace - target_label: namespace - - source_labels: - - __meta_kubernetes_service_name - target_label: service - - honor_labels: true - job_name: kubernetes-pods - kubernetes_sd_configs: - - role: pod - relabel_configs: - - action: 
keep - regex: true - source_labels: - - __meta_kubernetes_pod_annotation_prometheus_io_scrape - - action: drop - regex: true - source_labels: - - __meta_kubernetes_pod_annotation_prometheus_io_scrape_slow - - action: replace - regex: (https?) - source_labels: - - __meta_kubernetes_pod_annotation_prometheus_io_scheme - target_label: __scheme__ - - action: replace - regex: (.+) - source_labels: - - __meta_kubernetes_pod_annotation_prometheus_io_path - target_label: __metrics_path__ - - action: replace - regex: (\d+);(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4}) - replacement: '[$2]:$1' - source_labels: - - __meta_kubernetes_pod_annotation_prometheus_io_port - - __meta_kubernetes_pod_ip - target_label: __address__ - - action: replace - regex: (\d+);((([0-9]+?)(\.|$)){4}) - replacement: $2:$1 - source_labels: - - __meta_kubernetes_pod_annotation_prometheus_io_port - - __meta_kubernetes_pod_ip - target_label: __address__ - - action: labelmap - regex: __meta_kubernetes_pod_annotation_prometheus_io_param_(.+) - replacement: __param_$1 - - action: labelmap - regex: __meta_kubernetes_pod_label_(.+) - - action: replace - source_labels: - - __meta_kubernetes_namespace - target_label: namespace - - action: replace - source_labels: - - __meta_kubernetes_pod_name - target_label: pod - - action: drop - regex: Pending|Succeeded|Failed|Completed - source_labels: - - __meta_kubernetes_pod_phase - - action: replace - source_labels: - - __meta_kubernetes_pod_node_name - target_label: node - - honor_labels: true - job_name: kubernetes-pods-slow - kubernetes_sd_configs: - - role: pod - relabel_configs: - - action: keep - regex: true - source_labels: - - __meta_kubernetes_pod_annotation_prometheus_io_scrape_slow - - action: replace - regex: (https?) 
- source_labels: - - __meta_kubernetes_pod_annotation_prometheus_io_scheme - target_label: __scheme__ - - action: replace - regex: (.+) - source_labels: - - __meta_kubernetes_pod_annotation_prometheus_io_path - target_label: __metrics_path__ - - action: replace - regex: (\d+);(([A-Fa-f0-9]{1,4}::?){1,7}[A-Fa-f0-9]{1,4}) - replacement: '[$2]:$1' - source_labels: - - __meta_kubernetes_pod_annotation_prometheus_io_port - - __meta_kubernetes_pod_ip - target_label: __address__ - - action: replace - regex: (\d+);((([0-9]+?)(\.|$)){4}) - replacement: $2:$1 - source_labels: - - __meta_kubernetes_pod_annotation_prometheus_io_port - - __meta_kubernetes_pod_ip - target_label: __address__ - - action: labelmap - regex: __meta_kubernetes_pod_annotation_prometheus_io_param_(.+) - replacement: __param_$1 - - action: labelmap - regex: __meta_kubernetes_pod_label_(.+) - - action: replace - source_labels: - - __meta_kubernetes_namespace - target_label: namespace - - action: replace - source_labels: - - __meta_kubernetes_pod_name - target_label: pod - - action: drop - regex: Pending|Succeeded|Failed|Completed - source_labels: - - __meta_kubernetes_pod_phase - - action: replace - source_labels: - - __meta_kubernetes_pod_node_name - target_label: node - scrape_interval: 5m - scrape_timeout: 30s + scrape_configs: [] alerting: alertmanagers: - kubernetes_sd_configs: From 3133f19118a52e9f3d1497775ac0641e6914f9c6 Mon Sep 17 00:00:00 2001 From: Danny Kopping Date: Fri, 18 Apr 2025 12:39:30 +0000 Subject: [PATCH 15/41] Version check dependencies, update docs Signed-off-by: Danny Kopping --- README.md | 2 +- scripts/compile.sh | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index cdb096b..645119f 100644 --- a/README.md +++ b/README.md @@ -464,7 +464,7 @@ values which are defined [here](https://github.com/grafana/helm-charts/tree/main | prometheus.server.service.type | string | `"ClusterIP"` | | | prometheus.server.statefulSet.enabled | bool | 
`true` | | | prometheus.serverFiles."prometheus.yml".rule_files[0] | string | `"/etc/config/alerts/*.yaml"` | | -| prometheus.serverFiles."prometheus.yml".scrape_configs | string | `nil` | | +| prometheus.serverFiles."prometheus.yml".scrape_configs | list | `[]` | | | prometheus.testFramework.enabled | bool | `false` | | | runbookViewer.image | string | `"dannyben/madness"` | | | sqlExporter.image | string | `"burningalchemist/sql_exporter"` | | diff --git a/scripts/compile.sh b/scripts/compile.sh index 6e6474b..91a81f5 100755 --- a/scripts/compile.sh +++ b/scripts/compile.sh @@ -1,6 +1,12 @@ #!/usr/bin/env bash set -euo pipefail +# check versions +HELM_VERSION=3.14 +YQ_VERSION=4.42 +[[ "$(helm version)" == *v${HELM_VERSION}* ]] || { echo "Expected helm version v${HELM_VERSION} but got $(helm version)" >&2; exit 1; } +[[ "$(yq --version)" == *v${YQ_VERSION}* ]] || { echo "Expected yq version v${YQ_VERSION} but got $(yq --version)" >&2; exit 1; } + source "$(dirname "${BASH_SOURCE[0]}")/lib.sh" helm repo add prometheus-community https://prometheus-community.github.io/helm-charts From 80e8924911ec6429bdd8b6014b81dc6b3373b6ce Mon Sep 17 00:00:00 2001 From: Danny Kopping Date: Fri, 18 Apr 2025 12:58:42 +0000 Subject: [PATCH 16/41] add prebuilds dashboard Signed-off-by: Danny Kopping --- .../dashboards/_dashboards_prebuilds.json.tpl | 1064 +++++++++++++++++ .../configmap-dashboards-prebuilds.yaml | 7 + coder-observability/values.yaml | 4 + 3 files changed, 1075 insertions(+) create mode 100644 coder-observability/templates/dashboards/_dashboards_prebuilds.json.tpl create mode 100644 coder-observability/templates/dashboards/configmap-dashboards-prebuilds.yaml diff --git a/coder-observability/templates/dashboards/_dashboards_prebuilds.json.tpl b/coder-observability/templates/dashboards/_dashboards_prebuilds.json.tpl new file mode 100644 index 0000000..1602e28 --- /dev/null +++ b/coder-observability/templates/dashboards/_dashboards_prebuilds.json.tpl @@ -0,0 +1,1064 @@ +{{ 
define "prebuilds-dashboard.json" }} +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": 10, + "links": [], + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [ + { + "options": { + "0": { + "color": "orange", + "index": 2, + "text": "Not enabled" + }, + "1": { + "color": "green", + "index": 0, + "text": "Enabled" + } + }, + "type": "value" + }, + { + "options": { + "match": "null", + "result": { + "color": "orange", + "index": 1, + "text": "Not enabled" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 0, + "y": 0 + }, + "id": 15, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "center", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "text": { + "valueSize": 15 + }, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "10.4.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "min(coderd_experiments{experiment=\"workspace-prebuilds\"})", + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Experiment enabled?", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "text", + "mode": 
"fixed" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 4, + "y": 0 + }, + "id": 49, + "interval": "30s", + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "orientation": "vertical", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "10.4.3", + "repeatDirection": "v", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(coderd_prebuilds_desired) or vector(0)", + "instant": true, + "interval": "", + "legendFormat": "Desired", + "range": false, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(coderd_prebuilds_running) or vector(0)", + "hide": false, + "instant": true, + "interval": "", + "legendFormat": "Running", + "range": false, + "refId": "D" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(coderd_prebuilds_eligible) or vector(0)", + "hide": false, + "instant": true, + "interval": "", + "legendFormat": "Eligible", + "range": false, + "refId": "E" + } + ], + "title": "Current: Global", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "text", + "mode": "fixed" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 4, 
+ "x": 8, + "y": 0 + }, + "id": 48, + "interval": "30s", + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "orientation": "vertical", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "10.4.3", + "repeatDirection": "v", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(coderd_prebuilds_created_total) or vector(0)", + "hide": false, + "instant": true, + "interval": "", + "legendFormat": "Created", + "range": false, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(coderd_prebuilds_failed_total) or vector(0)", + "hide": false, + "instant": true, + "interval": "", + "legendFormat": "Failed", + "range": false, + "refId": "C" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(coderd_prebuilds_claimed_total) or vector(0)", + "hide": false, + "instant": true, + "interval": "", + "legendFormat": "Claimed", + "range": false, + "refId": "A" + } + ], + "title": "All Time: Global", + "type": "stat" + }, + { + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 4 + }, + "id": 2, + "panels": [], + "repeat": "template", + "repeatDirection": "h", + "title": "$template", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "text", + "mode": "fixed" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 4, + "x": 0, + "y": 5 + }, + "id": 31, + 
"interval": "30s", + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "orientation": "vertical", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "10.4.3", + "repeat": "preset", + "repeatDirection": "v", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(coderd_prebuilds_desired{template_name=~\"$template\", preset_name=~\"$preset\"}) or vector(0)", + "instant": true, + "interval": "", + "legendFormat": "Desired", + "range": false, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(coderd_prebuilds_running{template_name=~\"$template\", preset_name=~\"$preset\"}) or vector(0)", + "hide": false, + "instant": true, + "interval": "", + "legendFormat": "Running", + "range": false, + "refId": "D" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(coderd_prebuilds_eligible{template_name=~\"$template\", preset_name=~\"$preset\"}) or vector(0)", + "hide": false, + "instant": true, + "interval": "", + "legendFormat": "Eligible", + "range": false, + "refId": "E" + } + ], + "title": "Current: $preset", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMax": 10, + "axisSoftMin": 0, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 18, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, 
+ "insertNulls": false, + "lineInterpolation": "smooth", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 0, + "fieldMinMax": false, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Desired" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "purple", + "mode": "fixed" + } + }, + { + "id": "custom.lineStyle", + "value": { + "dash": [ + 10, + 10 + ], + "fill": "dash" + } + }, + { + "id": "custom.fillOpacity", + "value": 85 + }, + { + "id": "custom.fillBelowTo", + "value": "Running" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Running" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "yellow", + "mode": "fixed" + } + }, + { + "id": "custom.fillBelowTo", + "value": "Eligible" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Eligible" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 7, + "w": 8, + "x": 4, + "y": 5 + }, + "id": 5, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "10.4.3", + "repeat": "preset", + "repeatDirection": "v", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(coderd_prebuilds_desired{template_name=~\"$template\", preset_name=~\"$preset\"}) or vector(0)", + "instant": false, + "interval": "", + "legendFormat": "Desired", + 
"range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(coderd_prebuilds_running{template_name=~\"$template\", preset_name=~\"$preset\"}) or vector(0)", + "hide": false, + "instant": false, + "interval": "", + "legendFormat": "Running", + "range": true, + "refId": "D" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(coderd_prebuilds_eligible{template_name=~\"$template\", preset_name=~\"$preset\"}) or vector(0)", + "hide": false, + "instant": false, + "interval": "", + "legendFormat": "Eligible", + "range": true, + "refId": "E" + } + ], + "title": "Change over range: $preset", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMax": 10, + "axisSoftMin": 0, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 13, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 0, + "fieldMinMax": false, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Failed" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "red", + "mode": "fixed" + } + } + 
] + }, + { + "matcher": { + "id": "byName", + "options": "Created" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "blue", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Desired" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "purple", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Running" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "yellow", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Eligible" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Claimed" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-green", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 7, + "w": 8, + "x": 12, + "y": 5 + }, + "id": 38, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "10.4.3", + "repeat": "preset", + "repeatDirection": "v", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "floor(sum(increase(coderd_prebuilds_created_total{template_name=~\"$template\", preset_name=~\"$preset\"}[$__rate_interval]))) or vector(0)", + "hide": false, + "instant": false, + "interval": "", + "legendFormat": "Created", + "range": true, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "floor(sum(increase(coderd_prebuilds_failed_total{template_name=~\"$template\", preset_name=~\"$preset\"}[$__rate_interval]))) or vector(0)", + "hide": false, + "instant": false, + "interval": "", + "legendFormat": "Failed", + "range": true, + "refId": "C" + }, + { + 
"datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "floor(sum(increase(coderd_prebuilds_claimed_total{template_name=~\"$template\", preset_name=~\"$preset\"}[$__rate_interval]))) or vector(0)", + "hide": false, + "instant": false, + "interval": "", + "legendFormat": "Claimed", + "range": true, + "refId": "F" + } + ], + "title": "Change over range: $preset", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "text", + "mode": "fixed" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 4, + "x": 20, + "y": 5 + }, + "id": 1, + "interval": "30s", + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "orientation": "vertical", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "10.4.3", + "repeat": "preset", + "repeatDirection": "v", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(coderd_prebuilds_created_total{template_name=~\"$template\", preset_name=~\"$preset\"}) or vector(0)", + "hide": false, + "instant": true, + "interval": "", + "legendFormat": "Created", + "range": false, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(coderd_prebuilds_failed_total{template_name=~\"$template\", preset_name=~\"$preset\"}) or vector(0)", + "hide": false, + "instant": true, + "interval": "", + "legendFormat": "Failed", + "range": false, + "refId": "C" 
+ }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(coderd_prebuilds_claimed_total{template_name=~\"$template\", preset_name=~\"$preset\"}) or vector(0)", + "hide": false, + "instant": true, + "interval": "", + "legendFormat": "Claimed", + "range": false, + "refId": "A" + } + ], + "title": "All Time: $preset", + "type": "stat" + } + ], + "refresh": "{{- include "dashboard-refresh" . -}}", + "schemaVersion": 39, + "tags": [], + "templating": { + "list": [ + { + "allValue": "", + "current": { + "selected": false, + "text": "k8s", + "value": "k8s" + }, + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "definition": "label_values(coderd_prebuilds_desired,template_name)", + "hide": 0, + "includeAll": false, + "label": "Template", + "multi": false, + "name": "template", + "options": [], + "query": { + "qryType": 1, + "query": "label_values(coderd_prebuilds_desired,template_name)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + }, + { + "allValue": "", + "current": { + "selected": false, + "text": [ + "All" + ], + "value": [ + "$__all" + ] + }, + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "definition": "label_values(coderd_prebuilds_desired{template_name=~\"$template\"},preset_name)", + "hide": 0, + "includeAll": true, + "label": "Preset", + "multi": true, + "name": "preset", + "options": [], + "query": { + "qryType": 1, + "query": "label_values(coderd_prebuilds_desired{template_name=~\"$template\"},preset_name)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + } + ] + }, + "time": { + "from": "now-{{- include "dashboard-range" . 
-}}", + "to": "now" + }, + "timepicker": {}, + "timezone": "browser", + "title": "Prebuilds", + "uid": "cej6jysyme22oa", + "version": 13, + "weekStart": "" +} +{{ end }} \ No newline at end of file diff --git a/coder-observability/templates/dashboards/configmap-dashboards-prebuilds.yaml b/coder-observability/templates/dashboards/configmap-dashboards-prebuilds.yaml new file mode 100644 index 0000000..14d5908 --- /dev/null +++ b/coder-observability/templates/dashboards/configmap-dashboards-prebuilds.yaml @@ -0,0 +1,7 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: dashboards-prebuilds + namespace: {{ .Release.Namespace }} +data: + prebuilds.json: |- {{- include "prebuilds-dashboard.json" . | trim | nindent 4 }} \ No newline at end of file diff --git a/coder-observability/values.yaml b/coder-observability/values.yaml index d035dd8..c33d14a 100644 --- a/coder-observability/values.yaml +++ b/coder-observability/values.yaml @@ -414,6 +414,10 @@ grafana: mountPath: /var/lib/grafana/dashboards/coder/4 configMap: dashboards-workspace-detail readOnly: false + - name: dashboards-prebuilds + mountPath: /var/lib/grafana/dashboards/coder/5 + configMap: dashboards-prebuilds + readOnly: false prometheus: enabled: true From 2eab6ae85e6e3c321252c204678982fa312b0138 Mon Sep 17 00:00:00 2001 From: Danny Kopping Date: Fri, 18 Apr 2025 13:05:14 +0000 Subject: [PATCH 17/41] make lint Signed-off-by: Danny Kopping --- compiled/resources.yaml | 1078 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 1078 insertions(+) diff --git a/compiled/resources.yaml b/compiled/resources.yaml index ccdf33b..3083dea 100644 --- a/compiled/resources.yaml +++ b/compiled/resources.yaml @@ -2436,6 +2436,1077 @@ data: "weekStart": "" } --- +# Source: coder-observability/templates/dashboards/configmap-dashboards-prebuilds.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: dashboards-prebuilds + namespace: coder-observability +data: + prebuilds.json: |- + { + "annotations": { + "list": [ + { + 
"builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": 10, + "links": [], + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [ + { + "options": { + "0": { + "color": "orange", + "index": 2, + "text": "Not enabled" + }, + "1": { + "color": "green", + "index": 0, + "text": "Enabled" + } + }, + "type": "value" + }, + { + "options": { + "match": "null", + "result": { + "color": "orange", + "index": 1, + "text": "Not enabled" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 0, + "y": 0 + }, + "id": 15, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "center", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "text": { + "valueSize": 15 + }, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "10.4.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "min(coderd_experiments{experiment=\"workspace-prebuilds\"})", + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Experiment enabled?", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "text", + "mode": "fixed" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": 
[ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 4, + "y": 0 + }, + "id": 49, + "interval": "30s", + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "orientation": "vertical", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "10.4.3", + "repeatDirection": "v", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(coderd_prebuilds_desired) or vector(0)", + "instant": true, + "interval": "", + "legendFormat": "Desired", + "range": false, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(coderd_prebuilds_running) or vector(0)", + "hide": false, + "instant": true, + "interval": "", + "legendFormat": "Running", + "range": false, + "refId": "D" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(coderd_prebuilds_eligible) or vector(0)", + "hide": false, + "instant": true, + "interval": "", + "legendFormat": "Eligible", + "range": false, + "refId": "E" + } + ], + "title": "Current: Global", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "text", + "mode": "fixed" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 4, + "x": 8, + "y": 0 + }, + "id": 48, + "interval": "30s", + "options": { + 
"colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "orientation": "vertical", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "10.4.3", + "repeatDirection": "v", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(coderd_prebuilds_created_total) or vector(0)", + "hide": false, + "instant": true, + "interval": "", + "legendFormat": "Created", + "range": false, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(coderd_prebuilds_failed_total) or vector(0)", + "hide": false, + "instant": true, + "interval": "", + "legendFormat": "Failed", + "range": false, + "refId": "C" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(coderd_prebuilds_claimed_total) or vector(0)", + "hide": false, + "instant": true, + "interval": "", + "legendFormat": "Claimed", + "range": false, + "refId": "A" + } + ], + "title": "All Time: Global", + "type": "stat" + }, + { + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 4 + }, + "id": 2, + "panels": [], + "repeat": "template", + "repeatDirection": "h", + "title": "$template", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "text", + "mode": "fixed" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 4, + "x": 0, + "y": 5 + }, + "id": 31, + "interval": "30s", + "options": { + "colorMode": "value", + "graphMode": 
"area", + "justifyMode": "center", + "orientation": "vertical", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "10.4.3", + "repeat": "preset", + "repeatDirection": "v", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(coderd_prebuilds_desired{template_name=~\"$template\", preset_name=~\"$preset\"}) or vector(0)", + "instant": true, + "interval": "", + "legendFormat": "Desired", + "range": false, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(coderd_prebuilds_running{template_name=~\"$template\", preset_name=~\"$preset\"}) or vector(0)", + "hide": false, + "instant": true, + "interval": "", + "legendFormat": "Running", + "range": false, + "refId": "D" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(coderd_prebuilds_eligible{template_name=~\"$template\", preset_name=~\"$preset\"}) or vector(0)", + "hide": false, + "instant": true, + "interval": "", + "legendFormat": "Eligible", + "range": false, + "refId": "E" + } + ], + "title": "Current: $preset", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMax": 10, + "axisSoftMin": 0, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 18, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineStyle": 
{ + "fill": "solid" + }, + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 0, + "fieldMinMax": false, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Desired" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "purple", + "mode": "fixed" + } + }, + { + "id": "custom.lineStyle", + "value": { + "dash": [ + 10, + 10 + ], + "fill": "dash" + } + }, + { + "id": "custom.fillOpacity", + "value": 85 + }, + { + "id": "custom.fillBelowTo", + "value": "Running" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Running" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "yellow", + "mode": "fixed" + } + }, + { + "id": "custom.fillBelowTo", + "value": "Eligible" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Eligible" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 7, + "w": 8, + "x": 4, + "y": 5 + }, + "id": 5, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "10.4.3", + "repeat": "preset", + "repeatDirection": "v", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(coderd_prebuilds_desired{template_name=~\"$template\", preset_name=~\"$preset\"}) or vector(0)", + "instant": false, + "interval": "", + "legendFormat": "Desired", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": 
"prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(coderd_prebuilds_running{template_name=~\"$template\", preset_name=~\"$preset\"}) or vector(0)", + "hide": false, + "instant": false, + "interval": "", + "legendFormat": "Running", + "range": true, + "refId": "D" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(coderd_prebuilds_eligible{template_name=~\"$template\", preset_name=~\"$preset\"}) or vector(0)", + "hide": false, + "instant": false, + "interval": "", + "legendFormat": "Eligible", + "range": true, + "refId": "E" + } + ], + "title": "Change over range: $preset", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMax": 10, + "axisSoftMin": 0, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 13, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 0, + "fieldMinMax": false, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Failed" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "red", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Created" 
+ }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "blue", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Desired" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "purple", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Running" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "yellow", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Eligible" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Claimed" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "dark-green", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 7, + "w": 8, + "x": 12, + "y": 5 + }, + "id": 38, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "10.4.3", + "repeat": "preset", + "repeatDirection": "v", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "floor(sum(increase(coderd_prebuilds_created_total{template_name=~\"$template\", preset_name=~\"$preset\"}[$__rate_interval]))) or vector(0)", + "hide": false, + "instant": false, + "interval": "", + "legendFormat": "Created", + "range": true, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "floor(sum(increase(coderd_prebuilds_failed_total{template_name=~\"$template\", preset_name=~\"$preset\"}[$__rate_interval]))) or vector(0)", + "hide": false, + "instant": false, + "interval": "", + "legendFormat": "Failed", + "range": true, + "refId": "C" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + 
"editorMode": "code", + "expr": "floor(sum(increase(coderd_prebuilds_claimed_total{template_name=~\"$template\", preset_name=~\"$preset\"}[$__rate_interval]))) or vector(0)", + "hide": false, + "instant": false, + "interval": "", + "legendFormat": "Claimed", + "range": true, + "refId": "F" + } + ], + "title": "Change over range: $preset", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "text", + "mode": "fixed" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 4, + "x": 20, + "y": 5 + }, + "id": 1, + "interval": "30s", + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "center", + "orientation": "vertical", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "10.4.3", + "repeat": "preset", + "repeatDirection": "v", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(coderd_prebuilds_created_total{template_name=~\"$template\", preset_name=~\"$preset\"}) or vector(0)", + "hide": false, + "instant": true, + "interval": "", + "legendFormat": "Created", + "range": false, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(coderd_prebuilds_failed_total{template_name=~\"$template\", preset_name=~\"$preset\"}) or vector(0)", + "hide": false, + "instant": true, + "interval": "", + "legendFormat": "Failed", + "range": false, + "refId": "C" + }, + { + "datasource": { + "type": "prometheus", + "uid": 
"prometheus" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(coderd_prebuilds_claimed_total{template_name=~\"$template\", preset_name=~\"$preset\"}) or vector(0)", + "hide": false, + "instant": true, + "interval": "", + "legendFormat": "Claimed", + "range": false, + "refId": "A" + } + ], + "title": "All Time: $preset", + "type": "stat" + } + ], + "refresh": "30s", + "schemaVersion": 39, + "tags": [], + "templating": { + "list": [ + { + "allValue": "", + "current": { + "selected": false, + "text": "k8s", + "value": "k8s" + }, + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "definition": "label_values(coderd_prebuilds_desired,template_name)", + "hide": 0, + "includeAll": false, + "label": "Template", + "multi": false, + "name": "template", + "options": [], + "query": { + "qryType": 1, + "query": "label_values(coderd_prebuilds_desired,template_name)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + }, + { + "allValue": "", + "current": { + "selected": false, + "text": [ + "All" + ], + "value": [ + "$__all" + ] + }, + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "definition": "label_values(coderd_prebuilds_desired{template_name=~\"$template\"},preset_name)", + "hide": 0, + "includeAll": true, + "label": "Preset", + "multi": true, + "name": "preset", + "options": [], + "query": { + "qryType": 1, + "query": "label_values(coderd_prebuilds_desired{template_name=~\"$template\"},preset_name)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + } + ] + }, + "time": { + "from": "now-12h", + "to": "now" + }, + "timepicker": {}, + "timezone": "browser", + "title": "Prebuilds", + "uid": "cej6jysyme22oa", + "version": 13, + "weekStart": "" + } +--- # Source: 
coder-observability/templates/dashboards/configmap-dashboards-provisionerd.yaml apiVersion: v1 kind: ConfigMap @@ -10219,6 +11290,10 @@ spec: mountPath: /var/lib/grafana/dashboards/coder/4 subPath: readOnly: false + - name: dashboards-prebuilds + mountPath: /var/lib/grafana/dashboards/coder/5 + subPath: + readOnly: false - name: storage mountPath: "/var/lib/grafana" - name: config @@ -10288,6 +11363,9 @@ spec: - name: dashboards-workspace-detail configMap: name: dashboards-workspace-detail + - name: dashboards-prebuilds + configMap: + name: dashboards-prebuilds - name: dashboards-infra configMap: name: grafana-dashboards-infra From f24e2c91bc895ce9caec0973a586ee9afc32397f Mon Sep 17 00:00:00 2001 From: Danny Kopping Date: Fri, 18 Apr 2025 13:07:46 +0000 Subject: [PATCH 18/41] Update README Signed-off-by: Danny Kopping --- README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/README.md b/README.md index 645119f..e785abf 100644 --- a/README.md +++ b/README.md @@ -360,6 +360,10 @@ values which are defined [here](https://github.com/grafana/helm-charts/tree/main | grafana.extraConfigmapMounts[4].mountPath | string | `"/var/lib/grafana/dashboards/coder/4"` | | | grafana.extraConfigmapMounts[4].name | string | `"dashboards-workspace-detail"` | | | grafana.extraConfigmapMounts[4].readOnly | bool | `false` | | +| grafana.extraConfigmapMounts[5].configMap | string | `"dashboards-prebuilds"` | | +| grafana.extraConfigmapMounts[5].mountPath | string | `"/var/lib/grafana/dashboards/coder/5"` | | +| grafana.extraConfigmapMounts[5].name | string | `"dashboards-prebuilds"` | | +| grafana.extraConfigmapMounts[5].readOnly | bool | `false` | | | grafana.fullnameOverride | string | `"grafana"` | | | grafana.persistence.enabled | bool | `true` | | | grafana.persistence.size | string | `"10Gi"` | | From 383aaf2efb62ab743f4977b95ca221f2251cd8c4 Mon Sep 17 00:00:00 2001 From: Danny Kopping Date: Fri, 18 Apr 2025 13:27:51 +0000 Subject: [PATCH 19/41] Review comments 
Signed-off-by: Danny Kopping --- .../dashboards/_dashboards_prebuilds.json.tpl | 26 +++++-------------- compiled/resources.yaml | 26 +++++-------------- 2 files changed, 12 insertions(+), 40 deletions(-) diff --git a/coder-observability/templates/dashboards/_dashboards_prebuilds.json.tpl b/coder-observability/templates/dashboards/_dashboards_prebuilds.json.tpl index 1602e28..cdf45a0 100644 --- a/coder-observability/templates/dashboards/_dashboards_prebuilds.json.tpl +++ b/coder-observability/templates/dashboards/_dashboards_prebuilds.json.tpl @@ -286,7 +286,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "sum(coderd_prebuilds_created_total) or vector(0)", + "expr": "sum(max by (template_name, preset_name) (coderd_prebuilds_created_total)) or vector(0)", "hide": false, "instant": true, "interval": "", @@ -301,7 +301,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "sum(coderd_prebuilds_failed_total) or vector(0)", + "expr": "sum(max by (template_name, preset_name) (coderd_prebuilds_failed_total)) or vector(0)", "hide": false, "instant": true, "interval": "", @@ -316,7 +316,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "sum(coderd_prebuilds_claimed_total) or vector(0)", + "expr": "sum(max by (template_name, preset_name) (coderd_prebuilds_claimed_total)) or vector(0)", "hide": false, "instant": true, "interval": "", @@ -938,7 +938,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "sum(coderd_prebuilds_created_total{template_name=~\"$template\", preset_name=~\"$preset\"}) or vector(0)", + "expr": "max(coderd_prebuilds_created_total{template_name=~\"$template\", preset_name=~\"$preset\"}) or vector(0)", "hide": false, "instant": true, "interval": "", @@ -953,7 +953,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "sum(coderd_prebuilds_failed_total{template_name=~\"$template\", preset_name=~\"$preset\"}) or vector(0)", + "expr": "max(coderd_prebuilds_failed_total{template_name=~\"$template\", preset_name=~\"$preset\"}) 
or vector(0)", "hide": false, "instant": true, "interval": "", @@ -968,7 +968,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "sum(coderd_prebuilds_claimed_total{template_name=~\"$template\", preset_name=~\"$preset\"}) or vector(0)", + "expr": "max(coderd_prebuilds_claimed_total{template_name=~\"$template\", preset_name=~\"$preset\"}) or vector(0)", "hide": false, "instant": true, "interval": "", @@ -988,11 +988,6 @@ "list": [ { "allValue": "", - "current": { - "selected": false, - "text": "k8s", - "value": "k8s" - }, "datasource": { "type": "prometheus", "uid": "prometheus" @@ -1017,15 +1012,6 @@ }, { "allValue": "", - "current": { - "selected": false, - "text": [ - "All" - ], - "value": [ - "$__all" - ] - }, "datasource": { "type": "prometheus", "uid": "prometheus" diff --git a/compiled/resources.yaml b/compiled/resources.yaml index 3083dea..34ac956 100644 --- a/compiled/resources.yaml +++ b/compiled/resources.yaml @@ -2731,7 +2731,7 @@ data: }, "editorMode": "code", "exemplar": false, - "expr": "sum(coderd_prebuilds_created_total) or vector(0)", + "expr": "sum(max by (template_name, preset_name) (coderd_prebuilds_created_total)) or vector(0)", "hide": false, "instant": true, "interval": "", @@ -2746,7 +2746,7 @@ data: }, "editorMode": "code", "exemplar": false, - "expr": "sum(coderd_prebuilds_failed_total) or vector(0)", + "expr": "sum(max by (template_name, preset_name) (coderd_prebuilds_failed_total)) or vector(0)", "hide": false, "instant": true, "interval": "", @@ -2761,7 +2761,7 @@ data: }, "editorMode": "code", "exemplar": false, - "expr": "sum(coderd_prebuilds_claimed_total) or vector(0)", + "expr": "sum(max by (template_name, preset_name) (coderd_prebuilds_claimed_total)) or vector(0)", "hide": false, "instant": true, "interval": "", @@ -3383,7 +3383,7 @@ data: }, "editorMode": "code", "exemplar": false, - "expr": "sum(coderd_prebuilds_created_total{template_name=~\"$template\", preset_name=~\"$preset\"}) or vector(0)", + "expr": 
"max(coderd_prebuilds_created_total{template_name=~\"$template\", preset_name=~\"$preset\"}) or vector(0)", "hide": false, "instant": true, "interval": "", @@ -3398,7 +3398,7 @@ data: }, "editorMode": "code", "exemplar": false, - "expr": "sum(coderd_prebuilds_failed_total{template_name=~\"$template\", preset_name=~\"$preset\"}) or vector(0)", + "expr": "max(coderd_prebuilds_failed_total{template_name=~\"$template\", preset_name=~\"$preset\"}) or vector(0)", "hide": false, "instant": true, "interval": "", @@ -3413,7 +3413,7 @@ data: }, "editorMode": "code", "exemplar": false, - "expr": "sum(coderd_prebuilds_claimed_total{template_name=~\"$template\", preset_name=~\"$preset\"}) or vector(0)", + "expr": "max(coderd_prebuilds_claimed_total{template_name=~\"$template\", preset_name=~\"$preset\"}) or vector(0)", "hide": false, "instant": true, "interval": "", @@ -3433,11 +3433,6 @@ data: "list": [ { "allValue": "", - "current": { - "selected": false, - "text": "k8s", - "value": "k8s" - }, "datasource": { "type": "prometheus", "uid": "prometheus" @@ -3462,15 +3457,6 @@ data: }, { "allValue": "", - "current": { - "selected": false, - "text": [ - "All" - ], - "value": [ - "$__all" - ] - }, "datasource": { "type": "prometheus", "uid": "prometheus" From 413aabe8ca38ff8562a510213d5bc7095330709f Mon Sep 17 00:00:00 2001 From: Danny Kopping Date: Fri, 18 Apr 2025 14:03:47 +0000 Subject: [PATCH 20/41] Add CoderdIneligiblePrebuilds alert Signed-off-by: Danny Kopping --- README.md | 2 +- coder-observability/runbooks/coderd.md | 7 ++++++ .../configmap-prometheus-alerts.yaml | 23 ++++++++++++++++++- coder-observability/values.yaml | 5 ++++ compiled/resources.yaml | 17 ++++++++++---- 5 files changed, 47 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 645119f..d33b398 100644 --- a/README.md +++ b/README.md @@ -228,7 +228,7 @@ values which are defined [here](https://github.com/grafana/helm-charts/tree/main | Key | Type | Default | Description | 
|-----|------|---------|-------------| -| global.coder.alerts | object | `{"coderd":{"groups":{"CPU":{"delay":"10m","enabled":true,"period":"10m","thresholds":{"critical":0.9,"warning":0.8}},"Memory":{"delay":"10m","enabled":true,"thresholds":{"critical":0.9,"warning":0.8}},"Replicas":{"delay":"5m","enabled":true,"thresholds":{"critical":1,"notify":3,"warning":2}},"Restarts":{"delay":"1m","enabled":true,"period":"10m","thresholds":{"critical":3,"notify":1,"warning":2}},"WorkspaceBuildFailures":{"delay":"10m","enabled":true,"period":"10m","thresholds":{"critical":10,"notify":2,"warning":5}}}},"enterprise":{"groups":{"Licences":{"delay":"1m","enabled":true,"thresholds":{"critical":1,"warning":0.9}}}},"provisionerd":{"groups":{"Replicas":{"delay":"5m","enabled":true,"thresholds":{"critical":1,"notify":3,"warning":2}}}}}` | alerts for the various aspects of Coder | +| global.coder.alerts | object | `{"coderd":{"groups":{"CPU":{"delay":"10m","enabled":true,"period":"10m","thresholds":{"critical":0.9,"warning":0.8}},"IneligiblePrebuilds":{"delay":"10m","enabled":true,"thresholds":{"notify":1}},"Memory":{"delay":"10m","enabled":true,"thresholds":{"critical":0.9,"warning":0.8}},"Replicas":{"delay":"5m","enabled":true,"thresholds":{"critical":1,"notify":3,"warning":2}},"Restarts":{"delay":"1m","enabled":true,"period":"10m","thresholds":{"critical":3,"notify":1,"warning":2}},"WorkspaceBuildFailures":{"delay":"10m","enabled":true,"period":"10m","thresholds":{"critical":10,"notify":2,"warning":5}}}},"enterprise":{"groups":{"Licences":{"delay":"1m","enabled":true,"thresholds":{"critical":1,"warning":0.9}}}},"provisionerd":{"groups":{"Replicas":{"delay":"5m","enabled":true,"thresholds":{"critical":1,"notify":3,"warning":2}}}}}` | alerts for the various aspects of Coder | | global.coder.coderdSelector | string | `"pod=~`coder.*`, pod!~`.*provisioner.*`"` | series selector for Prometheus/Loki to locate provisioner pods. ensure this uses backticks for quotes! 
| | global.coder.controlPlaneNamespace | string | `"coder"` | the namespace into which the control plane has been deployed. | | global.coder.externalProvisionersNamespace | string | `"coder"` | the namespace into which any external provisioners have been deployed. | diff --git a/coder-observability/runbooks/coderd.md b/coder-observability/runbooks/coderd.md index 62c80f5..ef5eb71 100644 --- a/coder-observability/runbooks/coderd.md +++ b/coder-observability/runbooks/coderd.md @@ -76,3 +76,10 @@ Terraform plugin. Your Enterprise license is approaching or has exceeded the number of seats purchased. Please contact your Coder sales contact, or visit https://coder.com/contact/sales. + +## CoderdIneligiblePrebuilds + +Prebuilds only become eligible to be claimed by users once the workspace's agent is a) running and b) all of its startup +scripts have completed. + +If a prebuilt workspace is not eligible, view its agent logs to diagnose the problem. \ No newline at end of file diff --git a/coder-observability/templates/configmap-prometheus-alerts.yaml b/coder-observability/templates/configmap-prometheus-alerts.yaml index eec7171..3311be7 100644 --- a/coder-observability/templates/configmap-prometheus-alerts.yaml +++ b/coder-observability/templates/configmap-prometheus-alerts.yaml @@ -4,7 +4,7 @@ metadata: name: metrics-alerts namespace: {{ .Release.Namespace }} data: - {{- $service := dict "service" "coder" -}} + {{- $service := dict "service" "coderd" -}} {{- with .Values.global.coder.alerts.coderd }} {{/* start-section */}} coderd.yaml: |- @@ -104,6 +104,27 @@ data: {{- end }} {{- end }} + {{- with .groups.IneligiblePrebuilds }} + {{- $group := . 
}} + {{- if .enabled }} + - name: Coderd Ineligible Prebuilds + rules: + {{ $alert := "CoderdIneligiblePrebuilds" }} + {{- range $severity, $threshold := .thresholds }} + - alert: {{ $alert }} + expr: max by (template_name, preset_name) (coderd_prebuilds_running - coderd_prebuilds_eligible) > 0 + for: {{ $group.delay }} + annotations: + summary: > + {{ `{{ $value }}` }} prebuilt workspace(s) are currently ineligible for claiming for the "{{ `{{ $labels.template_name }}` }}" template and "{{ `{{ $labels.preset_name }}` }}" preset. + This usually indicates that the agent has not started correctly, or is still running its startup scripts after an extended period of time. + labels: + severity: {{ $severity }} + runbook_url: {{ template "runbook-url" (deepCopy $ | merge (dict "alert" $alert) $service) }} + {{- end }} + {{- end }} + {{- end }} + {{- end }} {{/* end-section */}} diff --git a/coder-observability/values.yaml b/coder-observability/values.yaml index d035dd8..c2298fb 100644 --- a/coder-observability/values.yaml +++ b/coder-observability/values.yaml @@ -76,6 +76,11 @@ global: notify: 2 warning: 5 critical: 10 + IneligiblePrebuilds: + enabled: true + delay: 10m + thresholds: + notify: 1 provisionerd: groups: Replicas: diff --git a/compiled/resources.yaml b/compiled/resources.yaml index ccdf33b..9583951 100644 --- a/compiled/resources.yaml +++ b/compiled/resources.yaml @@ -777,8 +777,8 @@ metadata: name: metrics-alerts namespace: coder-observability data: - coderd.yaml: "groups:\n- name: CPU Usage\n rules:\n \n - alert: CoderdCPUUsage\n expr: max by (pod) (rate(container_cpu_usage_seconds_total{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}[10m])) / max by(pod) (kube_pod_container_resource_limits{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`, resource=\"cpu\"}) > 0.9\n for: 10m\n annotations:\n summary: The Coder instance {{ $labels.pod }} is using high amounts of CPU, which may impact application performance.\n labels:\n severity: 
critical\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coder#coderdcpuusage\n - alert: CoderdCPUUsage\n expr: max by (pod) (rate(container_cpu_usage_seconds_total{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}[10m])) / max by(pod) (kube_pod_container_resource_limits{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`, resource=\"cpu\"}) > 0.8\n for: 10m\n annotations:\n summary: The Coder instance {{ $labels.pod }} is using high amounts of CPU, which may impact application performance.\n labels:\n severity: warning\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coder#coderdcpuusage\n- name: Memory Usage\n rules:\n \n - alert: CoderdMemoryUsage\n expr: max by (pod) (container_memory_working_set_bytes{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}) / max by (pod) (kube_pod_container_resource_limits{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`, resource=\"memory\"}) > 0.9\n for: 10m\n annotations:\n summary: The Coder instance {{ $labels.pod }} is using high amounts of memory, which may lead to an Out-Of-Memory (OOM) error.\n labels:\n severity: critical\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coder#coderdmemoryusage\n - alert: CoderdMemoryUsage\n expr: max by (pod) (container_memory_working_set_bytes{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}) / max by (pod) (kube_pod_container_resource_limits{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`, resource=\"memory\"}) > 0.8\n for: 10m\n annotations:\n summary: The Coder instance {{ $labels.pod }} is using high amounts of memory, which may lead to an Out-Of-Memory (OOM) error.\n labels:\n severity: warning\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coder#coderdmemoryusage\n- name: Pod Restarts\n rules:\n \n - alert: CoderdRestarts\n expr: sum by(pod) (increase(kube_pod_container_status_restarts_total{pod=~`coder.*`, 
pod!~`.*provisioner.*`, namespace=`coder`}[10m])) > 3\n for: 1m\n annotations:\n summary: The Coder instance {{ $labels.pod }} has restarted multiple times in the last 10m, which may indicate a CrashLoop.\n labels:\n severity: critical\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coder#coderdrestarts\n - alert: CoderdRestarts\n expr: sum by(pod) (increase(kube_pod_container_status_restarts_total{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}[10m])) > 1\n for: 1m\n annotations:\n summary: The Coder instance {{ $labels.pod }} has restarted multiple times in the last 10m, which may indicate a CrashLoop.\n labels:\n severity: notify\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coder#coderdrestarts\n - alert: CoderdRestarts\n expr: sum by(pod) (increase(kube_pod_container_status_restarts_total{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}[10m])) > 2\n for: 1m\n annotations:\n summary: The Coder instance {{ $labels.pod }} has restarted multiple times in the last 10m, which may indicate a CrashLoop.\n labels:\n severity: warning\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coder#coderdrestarts\n- name: Coderd Replicas\n rules:\n \n - alert: CoderdReplicas\n expr: sum(up{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}) < 1\n for: 5m\n annotations:\n summary: Number of alive coderd replicas is below the threshold = 1.\n labels:\n severity: critical\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coder#coderdreplicas\n - alert: CoderdReplicas\n expr: sum(up{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}) < 3\n for: 5m\n annotations:\n summary: Number of alive coderd replicas is below the threshold = 3.\n labels:\n severity: notify\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coder#coderdreplicas\n - alert: CoderdReplicas\n expr: sum(up{pod=~`coder.*`, pod!~`.*provisioner.*`, 
namespace=`coder`}) < 2\n for: 5m\n annotations:\n summary: Number of alive coderd replicas is below the threshold = 2.\n labels:\n severity: warning\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coder#coderdreplicas\n- name: Coderd Workspace Build Failures\n rules:\n \n - alert: CoderdWorkspaceBuildFailures\n expr: sum(increase(coderd_workspace_builds_total{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`, status=\"failed\" }[10m])) > 10\n for: 10m\n annotations:\n summary: Workspace builds have failed multiple times in the last 10m, which may indicate a broken Coder template.\n labels:\n severity: critical\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coder#coderdworkspacebuildfailures\n - alert: CoderdWorkspaceBuildFailures\n expr: sum(increase(coderd_workspace_builds_total{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`, status=\"failed\" }[10m])) > 2\n for: 10m\n annotations:\n summary: Workspace builds have failed multiple times in the last 10m, which may indicate a broken Coder template.\n labels:\n severity: notify\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coder#coderdworkspacebuildfailures\n - alert: CoderdWorkspaceBuildFailures\n expr: sum(increase(coderd_workspace_builds_total{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`, status=\"failed\" }[10m])) > 5\n for: 10m\n annotations:\n summary: Workspace builds have failed multiple times in the last 10m, which may indicate a broken Coder template.\n labels:\n severity: warning\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coder#coderdworkspacebuildfailures " - provisionerd.yaml: "groups:\n- name: Provisionerd Replicas\n rules:\n \n - alert: ProvisionerdReplicas\n expr: sum(coderd_provisionerd_num_daemons{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}) < 1\n for: 5m\n annotations:\n summary: Number of alive provisionerd replicas is below the 
threshold = 1.\n labels:\n severity: critical\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coder#provisionerdreplicas\n - alert: ProvisionerdReplicas\n expr: sum(coderd_provisionerd_num_daemons{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}) < 3\n for: 5m\n annotations:\n summary: Number of alive provisionerd replicas is below the threshold = 3.\n labels:\n severity: notify\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coder#provisionerdreplicas\n - alert: ProvisionerdReplicas\n expr: sum(coderd_provisionerd_num_daemons{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}) < 2\n for: 5m\n annotations:\n summary: Number of alive provisionerd replicas is below the threshold = 2.\n labels:\n severity: warning\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coder#provisionerdreplicas " + coderd.yaml: "groups:\n- name: CPU Usage\n rules:\n \n - alert: CoderdCPUUsage\n expr: max by (pod) (rate(container_cpu_usage_seconds_total{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}[10m])) / max by(pod) (kube_pod_container_resource_limits{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`, resource=\"cpu\"}) > 0.9\n for: 10m\n annotations:\n summary: The Coder instance {{ $labels.pod }} is using high amounts of CPU, which may impact application performance.\n labels:\n severity: critical\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdcpuusage\n - alert: CoderdCPUUsage\n expr: max by (pod) (rate(container_cpu_usage_seconds_total{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}[10m])) / max by(pod) (kube_pod_container_resource_limits{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`, resource=\"cpu\"}) > 0.8\n for: 10m\n annotations:\n summary: The Coder instance {{ $labels.pod }} is using high amounts of CPU, which may impact application performance.\n labels:\n severity: warning\n runbook_url: 
http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdcpuusage\n- name: Memory Usage\n rules:\n \n - alert: CoderdMemoryUsage\n expr: max by (pod) (container_memory_working_set_bytes{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}) / max by (pod) (kube_pod_container_resource_limits{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`, resource=\"memory\"}) > 0.9\n for: 10m\n annotations:\n summary: The Coder instance {{ $labels.pod }} is using high amounts of memory, which may lead to an Out-Of-Memory (OOM) error.\n labels:\n severity: critical\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdmemoryusage\n - alert: CoderdMemoryUsage\n expr: max by (pod) (container_memory_working_set_bytes{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}) / max by (pod) (kube_pod_container_resource_limits{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`, resource=\"memory\"}) > 0.8\n for: 10m\n annotations:\n summary: The Coder instance {{ $labels.pod }} is using high amounts of memory, which may lead to an Out-Of-Memory (OOM) error.\n labels:\n severity: warning\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdmemoryusage\n- name: Pod Restarts\n rules:\n \n - alert: CoderdRestarts\n expr: sum by(pod) (increase(kube_pod_container_status_restarts_total{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}[10m])) > 3\n for: 1m\n annotations:\n summary: The Coder instance {{ $labels.pod }} has restarted multiple times in the last 10m, which may indicate a CrashLoop.\n labels:\n severity: critical\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdrestarts\n - alert: CoderdRestarts\n expr: sum by(pod) (increase(kube_pod_container_status_restarts_total{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}[10m])) > 1\n for: 1m\n annotations:\n summary: The Coder instance {{ $labels.pod }} has restarted multiple 
times in the last 10m, which may indicate a CrashLoop.\n labels:\n severity: notify\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdrestarts\n - alert: CoderdRestarts\n expr: sum by(pod) (increase(kube_pod_container_status_restarts_total{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}[10m])) > 2\n for: 1m\n annotations:\n summary: The Coder instance {{ $labels.pod }} has restarted multiple times in the last 10m, which may indicate a CrashLoop.\n labels:\n severity: warning\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdrestarts\n- name: Coderd Replicas\n rules:\n \n - alert: CoderdReplicas\n expr: sum(up{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}) < 1\n for: 5m\n annotations:\n summary: Number of alive coderd replicas is below the threshold = 1.\n labels:\n severity: critical\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdreplicas\n - alert: CoderdReplicas\n expr: sum(up{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}) < 3\n for: 5m\n annotations:\n summary: Number of alive coderd replicas is below the threshold = 3.\n labels:\n severity: notify\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdreplicas\n - alert: CoderdReplicas\n expr: sum(up{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}) < 2\n for: 5m\n annotations:\n summary: Number of alive coderd replicas is below the threshold = 2.\n labels:\n severity: warning\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdreplicas\n- name: Coderd Workspace Build Failures\n rules:\n \n - alert: CoderdWorkspaceBuildFailures\n expr: sum(increase(coderd_workspace_builds_total{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`, status=\"failed\" }[10m])) > 10\n for: 10m\n annotations:\n summary: Workspace builds have failed multiple times in the last 10m, which may indicate 
a broken Coder template.\n labels:\n severity: critical\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdworkspacebuildfailures\n - alert: CoderdWorkspaceBuildFailures\n expr: sum(increase(coderd_workspace_builds_total{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`, status=\"failed\" }[10m])) > 2\n for: 10m\n annotations:\n summary: Workspace builds have failed multiple times in the last 10m, which may indicate a broken Coder template.\n labels:\n severity: notify\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdworkspacebuildfailures\n - alert: CoderdWorkspaceBuildFailures\n expr: sum(increase(coderd_workspace_builds_total{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`, status=\"failed\" }[10m])) > 5\n for: 10m\n annotations:\n summary: Workspace builds have failed multiple times in the last 10m, which may indicate a broken Coder template.\n labels:\n severity: warning\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdworkspacebuildfailures\n- name: Coderd Ineligible Prebuilds\n rules:\n \n - alert: CoderdIneligiblePrebuilds\n expr: max by (template_name, preset_name) (coderd_prebuilds_running - coderd_prebuilds_eligible) > 0\n for: 10m\n annotations:\n summary: >\n {{ $value }} prebuilt workspace(s) are currently ineligible for claiming for the \"{{ $labels.template_name }}\" template and \"{{ $labels.preset_name }}\" preset.\n This usually indicates that the agent has not started correctly, or is still running its startup scripts after an extended period of time.\n labels:\n severity: notify\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdineligibleprebuilds " + provisionerd.yaml: "groups:\n- name: Provisionerd Replicas\n rules:\n \n - alert: ProvisionerdReplicas\n expr: sum(coderd_provisionerd_num_daemons{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}) < 1\n for: 5m\n 
annotations:\n summary: Number of alive provisionerd replicas is below the threshold = 1.\n labels:\n severity: critical\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#provisionerdreplicas\n - alert: ProvisionerdReplicas\n expr: sum(coderd_provisionerd_num_daemons{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}) < 3\n for: 5m\n annotations:\n summary: Number of alive provisionerd replicas is below the threshold = 3.\n labels:\n severity: notify\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#provisionerdreplicas\n - alert: ProvisionerdReplicas\n expr: sum(coderd_provisionerd_num_daemons{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}) < 2\n for: 5m\n annotations:\n summary: Number of alive provisionerd replicas is below the threshold = 2.\n labels:\n severity: warning\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#provisionerdreplicas " enterprise.yaml: "groups:\n - name: Licences\n rules:\n \n - alert: CoderLicenseSeats\n expr: 'max(coderd_license_active_users) / max(coderd_license_limit_users) >=1'\n for: 1m\n annotations:\n summary: Your Coder enterprise licence usage is now at {{ $value | humanizePercentage }} capacity.\n labels:\n severity: critical\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/enterprise#coderlicenseseats\n - alert: CoderLicenseSeats\n expr: 'max(coderd_license_active_users) / max(coderd_license_limit_users) >=0.9'\n for: 1m\n annotations:\n summary: Your Coder enterprise licence usage is now at {{ $value | humanizePercentage }} capacity.\n labels:\n severity: warning\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/enterprise#coderlicenseseats " postgres.yaml: "groups:\n- name: Notifications\n rules:\n \n - alert: PostgresNotificationQueueFillingUp\n expr: pg_pubsub_usage > 0.9\n for: 15m\n annotations:\n summary: The postgres instance {{ $labels.instance }} has a 
notification that is filling up, which may impact application performance.\n labels:\n severity: critical\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/postgres#postgresnotificationqueuefillingup\n - alert: PostgresNotificationQueueFillingUp\n expr: pg_pubsub_usage > 0.5\n for: 15m\n annotations:\n summary: The postgres instance {{ $labels.instance }} has a notification that is filling up, which may impact application performance.\n labels:\n severity: notify\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/postgres#postgresnotificationqueuefillingup\n - alert: PostgresNotificationQueueFillingUp\n expr: pg_pubsub_usage > 0.8\n for: 15m\n annotations:\n summary: The postgres instance {{ $labels.instance }} has a notification that is filling up, which may impact application performance.\n labels:\n severity: warning\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/postgres#postgresnotificationqueuefillingup\n\n- name: Liveness\n rules:\n \n - alert: PostgresDown\n expr: pg_up == 0\n for: 1m\n annotations:\n summary: The postgres instance {{ $labels.instance }} is down!\n labels:\n severity: critical\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/postgres#postgresdown\n\n\n- name: Connections\n rules:\n \n - alert: PostgresConnectionsRunningLow\n expr: sum by (datname, instance) (pg_stat_activity_count) > on () group_left() (pg_settings_max_connections * 0.9)\n for: 5m\n labels:\n summary: The postgres instance {{ $labels.instance }} is running low on connections which may impact application performance.\n severity: critical\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/postgres#postgresconnectionsrunninglow\n - alert: PostgresConnectionsRunningLow\n expr: sum by (datname, instance) (pg_stat_activity_count) > on () group_left() (pg_settings_max_connections * 0.5)\n for: 5m\n labels:\n summary: The postgres instance {{ 
$labels.instance }} is running low on connections which may impact application performance.\n severity: notify\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/postgres#postgresconnectionsrunninglow\n - alert: PostgresConnectionsRunningLow\n expr: sum by (datname, instance) (pg_stat_activity_count) > on () group_left() (pg_settings_max_connections * 0.8)\n for: 5m\n labels:\n summary: The postgres instance {{ $labels.instance }} is running low on connections which may impact application performance.\n severity: warning\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/postgres#postgresconnectionsrunninglow" --- @@ -789,9 +789,9 @@ metadata: name: runbooks namespace: coder-observability annotations: - checksum/config: 7ab4d06d5b454cc584880f9b58fb50fe85f2803bbd0021edb957c5a2d73b640e + checksum/config: c7f0ed6a8142ac725babf921fbbc4275738149cf4da49b3d4311b37f61994267 data: - coderd.md: | + coderd.md: |- # Coderd Runbooks ## CoderdCPUUsage @@ -870,6 +870,13 @@ data: Your Enterprise license is approaching or has exceeded the number of seats purchased. Please contact your Coder sales contact, or visit https://coder.com/contact/sales. + + ## CoderdIneligiblePrebuilds + + Prebuilds only become eligible to be claimed by users once the workspace's agent is a) running and b) all of its startup + scripts have completed. + + If a prebuilt workspace is not eligible, view its agent logs to diagnose the problem. 
postgres.md: | # Postgres Runbooks @@ -11114,7 +11121,7 @@ spec: template: metadata: annotations: - checksum/config: 7ab4d06d5b454cc584880f9b58fb50fe85f2803bbd0021edb957c5a2d73b640e + checksum/config: c7f0ed6a8142ac725babf921fbbc4275738149cf4da49b3d4311b37f61994267 labels: app: runbook-viewer spec: From b6af003113ca2636c7a0b180017d302117c60387 Mon Sep 17 00:00:00 2001 From: Danny Kopping Date: Mon, 21 Apr 2025 13:56:08 +0200 Subject: [PATCH 21/41] Adding changes from https://github.com/coder/observability/pull/30 to README template Signed-off-by: Danny Kopping --- README.gotmpl | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/README.gotmpl b/README.gotmpl index 840296e..411d638 100644 --- a/README.gotmpl +++ b/README.gotmpl @@ -199,6 +199,22 @@ stringData: password: "" # this matches the "passwordKey" field above ``` +To add an Ingress for Grafana, define this in your `values.yaml`: + +```yaml +grafana: + grafana.ini: + server: + domain: observability.example.com + root_url: "%(protocol)s://%(domain)s/grafana" + serve_from_sub_path: true + ingress: + enabled: true + hosts: + - "observability.example.com" + path: "/" +``` + ## Subcharts {{ template "chart.requirementsTable" . 
}} From 71298a0be0f992a1b3a72960a14b77de022cbbdf Mon Sep 17 00:00:00 2001 From: Danny Kopping Date: Mon, 21 Apr 2025 13:49:43 +0200 Subject: [PATCH 22/41] Fix enabled panel Signed-off-by: Danny Kopping --- .../templates/dashboards/_dashboards_prebuilds.json.tpl | 4 ++-- compiled/resources.yaml | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/coder-observability/templates/dashboards/_dashboards_prebuilds.json.tpl b/coder-observability/templates/dashboards/_dashboards_prebuilds.json.tpl index cdf45a0..85f68af 100644 --- a/coder-observability/templates/dashboards/_dashboards_prebuilds.json.tpl +++ b/coder-observability/templates/dashboards/_dashboards_prebuilds.json.tpl @@ -111,9 +111,9 @@ }, "editorMode": "code", "expr": "min(coderd_experiments{experiment=\"workspace-prebuilds\"})", - "instant": false, + "instant": true, "legendFormat": "__auto", - "range": true, + "range": false, "refId": "A" } ], diff --git a/compiled/resources.yaml b/compiled/resources.yaml index 7deb644..392d777 100644 --- a/compiled/resources.yaml +++ b/compiled/resources.yaml @@ -2563,9 +2563,9 @@ data: }, "editorMode": "code", "expr": "min(coderd_experiments{experiment=\"workspace-prebuilds\"})", - "instant": false, + "instant": true, "legendFormat": "__auto", - "range": true, + "range": false, "refId": "A" } ], From 89b30fc986af123f5fc68609093cc93f8a9e616f Mon Sep 17 00:00:00 2001 From: Danny Kopping Date: Mon, 21 Apr 2025 13:49:55 +0200 Subject: [PATCH 23/41] Add CoderdUnprovisionedPrebuild alert Signed-off-by: Danny Kopping --- README.md | 2 +- coder-observability/runbooks/coderd.md | 52 ++++++++++++++++- .../configmap-prometheus-alerts.yaml | 20 +++++++ coder-observability/values.yaml | 5 ++ compiled/resources.yaml | 56 ++++++++++++++++++- 5 files changed, 130 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index ec3ae09..616428a 100644 --- a/README.md +++ b/README.md @@ -244,7 +244,7 @@ values which are defined 
[here](https://github.com/grafana/helm-charts/tree/main | Key | Type | Default | Description | |-----|------|---------|-------------| -| global.coder.alerts | object | `{"coderd":{"groups":{"CPU":{"delay":"10m","enabled":true,"period":"10m","thresholds":{"critical":0.9,"warning":0.8}},"IneligiblePrebuilds":{"delay":"10m","enabled":true,"thresholds":{"notify":1}},"Memory":{"delay":"10m","enabled":true,"thresholds":{"critical":0.9,"warning":0.8}},"Replicas":{"delay":"5m","enabled":true,"thresholds":{"critical":1,"notify":3,"warning":2}},"Restarts":{"delay":"1m","enabled":true,"period":"10m","thresholds":{"critical":3,"notify":1,"warning":2}},"WorkspaceBuildFailures":{"delay":"10m","enabled":true,"period":"10m","thresholds":{"critical":10,"notify":2,"warning":5}}}},"enterprise":{"groups":{"Licences":{"delay":"1m","enabled":true,"thresholds":{"critical":1,"warning":0.9}}}},"provisionerd":{"groups":{"Replicas":{"delay":"5m","enabled":true,"thresholds":{"critical":1,"notify":3,"warning":2}}}}}` | alerts for the various aspects of Coder | +| global.coder.alerts | object | 
`{"coderd":{"groups":{"CPU":{"delay":"10m","enabled":true,"period":"10m","thresholds":{"critical":0.9,"warning":0.8}},"IneligiblePrebuilds":{"delay":"10m","enabled":true,"thresholds":{"notify":1}},"Memory":{"delay":"10m","enabled":true,"thresholds":{"critical":0.9,"warning":0.8}},"Replicas":{"delay":"5m","enabled":true,"thresholds":{"critical":1,"notify":3,"warning":2}},"Restarts":{"delay":"1m","enabled":true,"period":"10m","thresholds":{"critical":3,"notify":1,"warning":2}},"UnprovisionedPrebuiltWorkspaces":{"delay":"10m","enabled":true,"thresholds":{"warn":1}},"WorkspaceBuildFailures":{"delay":"10m","enabled":true,"period":"10m","thresholds":{"critical":10,"notify":2,"warning":5}}}},"enterprise":{"groups":{"Licences":{"delay":"1m","enabled":true,"thresholds":{"critical":1,"warning":0.9}}}},"provisionerd":{"groups":{"Replicas":{"delay":"5m","enabled":true,"thresholds":{"critical":1,"notify":3,"warning":2}}}}}` | alerts for the various aspects of Coder | | global.coder.coderdSelector | string | `"pod=~`coder.*`, pod!~`.*provisioner.*`"` | series selector for Prometheus/Loki to locate provisioner pods. ensure this uses backticks for quotes! | | global.coder.controlPlaneNamespace | string | `"coder"` | the namespace into which the control plane has been deployed. | | global.coder.externalProvisionersNamespace | string | `"coder"` | the namespace into which any external provisioners have been deployed. | diff --git a/coder-observability/runbooks/coderd.md b/coder-observability/runbooks/coderd.md index ef5eb71..4a42444 100644 --- a/coder-observability/runbooks/coderd.md +++ b/coder-observability/runbooks/coderd.md @@ -82,4 +82,54 @@ Please contact your Coder sales contact, or visit https://coder.com/contact/sale Prebuilds only become eligible to be claimed by users once the workspace's agent is a) running and b) all of its startup scripts have completed. -If a prebuilt workspace is not eligible, view its agent logs to diagnose the problem. 
\ No newline at end of file +If a prebuilt workspace is not eligible, view its agent logs to diagnose the problem. + +## CoderdUnprovisionedPrebuiltWorkspaces + +The number of running prebuilt workspaces is lower than the desired number of instances. This could be for several reasons, +ordered by likelihood: + +### Experiment/License + +The prebuilds feature is currently gated behind an experiment *and* a premium license. + +Ensure that the prebuilds experiment is enabled with `CODER_EXPERIMENTS=workspace-prebuilds`, and that you have a premium +license added. + +### Preset Validation Issue + +Templates which have prebuilds configured will require a configured preset defined, with ALL of the required parameters +set in the preset. If any of these are missing, or any of the parameters - as defined - fail validation, then the prebuilds +subsystem will refuse to attempt a workspace build. + +Consult the coderd logs for more information; look out for errors or warnings from the prebuilds subsystem. + +### Template Misconfiguration or Error + +Prebuilt workspaces cannot be provisioned due to some issue at `terraform apply`-time. This could be due to misconfigured +cloud resources, improper authorization, or any number of other issues. + +Visit the Workspaces page, change the search term to `owner:prebuilds`, and view the previously failed builds. The +error will likely be quite obvious. + +### Provisioner Latency + +If your provisioners are overloaded and cannot process provisioner jobs quickly enough, prebuilt workspaces may be affected. +There is no prioritization at present for prebuilt workspace jobs. + +Ensure your provisioners are appropriately resourced (i.e. you have enough instances) to handle the concurrent build demand. 
+ +### Use of Workspace Tags + +If you are using `coder_workspace_tags` ([docs](https://coder.com/docs/admin/templates/extending-templates/workspace-tags)) +in your template, chances are you do not have any provisioners running or they are under-resourced (see **Provisioner Latency**). + +Ensure your running provisioners are configured with your desired tags. + +### Reconciliation Loop Issue + +The prebuilds subsystem runs a _reconciliation loop_ which monitors the state of prebuilt workspaces to ensure the desired +number of instances are present at all times. Workspace Prebuilds is currently a BETA feature and so there could be a bug +in this _reconciliation loop_, which should be reported to Coder. + +Examine your coderd logs for any errors or warnings relating to prebuilds. \ No newline at end of file diff --git a/coder-observability/templates/configmap-prometheus-alerts.yaml b/coder-observability/templates/configmap-prometheus-alerts.yaml index 3311be7..c70b0b0 100644 --- a/coder-observability/templates/configmap-prometheus-alerts.yaml +++ b/coder-observability/templates/configmap-prometheus-alerts.yaml @@ -125,6 +125,26 @@ data: {{- end }} {{- end }} + {{- with .groups.UnprovisionedPrebuiltWorkspaces }} + {{- $group := . }} + {{- if .enabled }} + - name: Coderd Unprovisioned Prebuilt Workspaces + rules: + {{ $alert := "CoderdUnprovisionedPrebuiltWorkspaces" }} + {{- range $severity, $threshold := .thresholds }} + - alert: {{ $alert }} + expr: max by (template_name, preset_name) (coderd_prebuilds_desired - coderd_prebuilds_running) > 0 + for: {{ $group.delay }} + annotations: + summary: > + {{ `{{ $value }}` }} prebuilt workspace(s) not yet been provisioned for the "{{ `{{ $labels.template_name }}` }}" template and "{{ `{{ $labels.preset_name }}` }}" preset. 
+ labels: + severity: {{ $severity }} + runbook_url: {{ template "runbook-url" (deepCopy $ | merge (dict "alert" $alert) $service) }} + {{- end }} + {{- end }} + {{- end }} + {{- end }} {{/* end-section */}} diff --git a/coder-observability/values.yaml b/coder-observability/values.yaml index 25a04c5..f35e12b 100644 --- a/coder-observability/values.yaml +++ b/coder-observability/values.yaml @@ -81,6 +81,11 @@ global: delay: 10m thresholds: notify: 1 + UnprovisionedPrebuiltWorkspaces: + enabled: true + delay: 10m + thresholds: + warn: 1 provisionerd: groups: Replicas: diff --git a/compiled/resources.yaml b/compiled/resources.yaml index 392d777..e4fd17d 100644 --- a/compiled/resources.yaml +++ b/compiled/resources.yaml @@ -777,7 +777,7 @@ metadata: name: metrics-alerts namespace: coder-observability data: - coderd.yaml: "groups:\n- name: CPU Usage\n rules:\n \n - alert: CoderdCPUUsage\n expr: max by (pod) (rate(container_cpu_usage_seconds_total{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}[10m])) / max by(pod) (kube_pod_container_resource_limits{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`, resource=\"cpu\"}) > 0.9\n for: 10m\n annotations:\n summary: The Coder instance {{ $labels.pod }} is using high amounts of CPU, which may impact application performance.\n labels:\n severity: critical\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdcpuusage\n - alert: CoderdCPUUsage\n expr: max by (pod) (rate(container_cpu_usage_seconds_total{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}[10m])) / max by(pod) (kube_pod_container_resource_limits{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`, resource=\"cpu\"}) > 0.8\n for: 10m\n annotations:\n summary: The Coder instance {{ $labels.pod }} is using high amounts of CPU, which may impact application performance.\n labels:\n severity: warning\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdcpuusage\n- name: 
Memory Usage\n rules:\n \n - alert: CoderdMemoryUsage\n expr: max by (pod) (container_memory_working_set_bytes{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}) / max by (pod) (kube_pod_container_resource_limits{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`, resource=\"memory\"}) > 0.9\n for: 10m\n annotations:\n summary: The Coder instance {{ $labels.pod }} is using high amounts of memory, which may lead to an Out-Of-Memory (OOM) error.\n labels:\n severity: critical\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdmemoryusage\n - alert: CoderdMemoryUsage\n expr: max by (pod) (container_memory_working_set_bytes{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}) / max by (pod) (kube_pod_container_resource_limits{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`, resource=\"memory\"}) > 0.8\n for: 10m\n annotations:\n summary: The Coder instance {{ $labels.pod }} is using high amounts of memory, which may lead to an Out-Of-Memory (OOM) error.\n labels:\n severity: warning\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdmemoryusage\n- name: Pod Restarts\n rules:\n \n - alert: CoderdRestarts\n expr: sum by(pod) (increase(kube_pod_container_status_restarts_total{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}[10m])) > 3\n for: 1m\n annotations:\n summary: The Coder instance {{ $labels.pod }} has restarted multiple times in the last 10m, which may indicate a CrashLoop.\n labels:\n severity: critical\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdrestarts\n - alert: CoderdRestarts\n expr: sum by(pod) (increase(kube_pod_container_status_restarts_total{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}[10m])) > 1\n for: 1m\n annotations:\n summary: The Coder instance {{ $labels.pod }} has restarted multiple times in the last 10m, which may indicate a CrashLoop.\n labels:\n severity: notify\n 
runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdrestarts\n - alert: CoderdRestarts\n expr: sum by(pod) (increase(kube_pod_container_status_restarts_total{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}[10m])) > 2\n for: 1m\n annotations:\n summary: The Coder instance {{ $labels.pod }} has restarted multiple times in the last 10m, which may indicate a CrashLoop.\n labels:\n severity: warning\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdrestarts\n- name: Coderd Replicas\n rules:\n \n - alert: CoderdReplicas\n expr: sum(up{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}) < 1\n for: 5m\n annotations:\n summary: Number of alive coderd replicas is below the threshold = 1.\n labels:\n severity: critical\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdreplicas\n - alert: CoderdReplicas\n expr: sum(up{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}) < 3\n for: 5m\n annotations:\n summary: Number of alive coderd replicas is below the threshold = 3.\n labels:\n severity: notify\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdreplicas\n - alert: CoderdReplicas\n expr: sum(up{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}) < 2\n for: 5m\n annotations:\n summary: Number of alive coderd replicas is below the threshold = 2.\n labels:\n severity: warning\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdreplicas\n- name: Coderd Workspace Build Failures\n rules:\n \n - alert: CoderdWorkspaceBuildFailures\n expr: sum(increase(coderd_workspace_builds_total{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`, status=\"failed\" }[10m])) > 10\n for: 10m\n annotations:\n summary: Workspace builds have failed multiple times in the last 10m, which may indicate a broken Coder template.\n labels:\n severity: critical\n runbook_url: 
http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdworkspacebuildfailures\n - alert: CoderdWorkspaceBuildFailures\n expr: sum(increase(coderd_workspace_builds_total{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`, status=\"failed\" }[10m])) > 2\n for: 10m\n annotations:\n summary: Workspace builds have failed multiple times in the last 10m, which may indicate a broken Coder template.\n labels:\n severity: notify\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdworkspacebuildfailures\n - alert: CoderdWorkspaceBuildFailures\n expr: sum(increase(coderd_workspace_builds_total{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`, status=\"failed\" }[10m])) > 5\n for: 10m\n annotations:\n summary: Workspace builds have failed multiple times in the last 10m, which may indicate a broken Coder template.\n labels:\n severity: warning\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdworkspacebuildfailures\n- name: Coderd Ineligible Prebuilds\n rules:\n \n - alert: CoderdIneligiblePrebuilds\n expr: max by (template_name, preset_name) (coderd_prebuilds_running - coderd_prebuilds_eligible) > 0\n for: 10m\n annotations:\n summary: >\n {{ $value }} prebuilt workspace(s) are currently ineligible for claiming for the \"{{ $labels.template_name }}\" template and \"{{ $labels.preset_name }}\" preset.\n This usually indicates that the agent has not started correctly, or is still running its startup scripts after an extended period of time.\n labels:\n severity: notify\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdineligibleprebuilds " + coderd.yaml: "groups:\n- name: CPU Usage\n rules:\n \n - alert: CoderdCPUUsage\n expr: max by (pod) (rate(container_cpu_usage_seconds_total{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}[10m])) / max by(pod) (kube_pod_container_resource_limits{pod=~`coder.*`, pod!~`.*provisioner.*`, 
namespace=`coder`, resource=\"cpu\"}) > 0.9\n for: 10m\n annotations:\n summary: The Coder instance {{ $labels.pod }} is using high amounts of CPU, which may impact application performance.\n labels:\n severity: critical\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdcpuusage\n - alert: CoderdCPUUsage\n expr: max by (pod) (rate(container_cpu_usage_seconds_total{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}[10m])) / max by(pod) (kube_pod_container_resource_limits{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`, resource=\"cpu\"}) > 0.8\n for: 10m\n annotations:\n summary: The Coder instance {{ $labels.pod }} is using high amounts of CPU, which may impact application performance.\n labels:\n severity: warning\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdcpuusage\n- name: Memory Usage\n rules:\n \n - alert: CoderdMemoryUsage\n expr: max by (pod) (container_memory_working_set_bytes{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}) / max by (pod) (kube_pod_container_resource_limits{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`, resource=\"memory\"}) > 0.9\n for: 10m\n annotations:\n summary: The Coder instance {{ $labels.pod }} is using high amounts of memory, which may lead to an Out-Of-Memory (OOM) error.\n labels:\n severity: critical\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdmemoryusage\n - alert: CoderdMemoryUsage\n expr: max by (pod) (container_memory_working_set_bytes{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}) / max by (pod) (kube_pod_container_resource_limits{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`, resource=\"memory\"}) > 0.8\n for: 10m\n annotations:\n summary: The Coder instance {{ $labels.pod }} is using high amounts of memory, which may lead to an Out-Of-Memory (OOM) error.\n labels:\n severity: warning\n runbook_url: 
http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdmemoryusage\n- name: Pod Restarts\n rules:\n \n - alert: CoderdRestarts\n expr: sum by(pod) (increase(kube_pod_container_status_restarts_total{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}[10m])) > 3\n for: 1m\n annotations:\n summary: The Coder instance {{ $labels.pod }} has restarted multiple times in the last 10m, which may indicate a CrashLoop.\n labels:\n severity: critical\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdrestarts\n - alert: CoderdRestarts\n expr: sum by(pod) (increase(kube_pod_container_status_restarts_total{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}[10m])) > 1\n for: 1m\n annotations:\n summary: The Coder instance {{ $labels.pod }} has restarted multiple times in the last 10m, which may indicate a CrashLoop.\n labels:\n severity: notify\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdrestarts\n - alert: CoderdRestarts\n expr: sum by(pod) (increase(kube_pod_container_status_restarts_total{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}[10m])) > 2\n for: 1m\n annotations:\n summary: The Coder instance {{ $labels.pod }} has restarted multiple times in the last 10m, which may indicate a CrashLoop.\n labels:\n severity: warning\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdrestarts\n- name: Coderd Replicas\n rules:\n \n - alert: CoderdReplicas\n expr: sum(up{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}) < 1\n for: 5m\n annotations:\n summary: Number of alive coderd replicas is below the threshold = 1.\n labels:\n severity: critical\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdreplicas\n - alert: CoderdReplicas\n expr: sum(up{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}) < 3\n for: 5m\n annotations:\n summary: Number of alive coderd replicas is 
below the threshold = 3.\n labels:\n severity: notify\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdreplicas\n - alert: CoderdReplicas\n expr: sum(up{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}) < 2\n for: 5m\n annotations:\n summary: Number of alive coderd replicas is below the threshold = 2.\n labels:\n severity: warning\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdreplicas\n- name: Coderd Workspace Build Failures\n rules:\n \n - alert: CoderdWorkspaceBuildFailures\n expr: sum(increase(coderd_workspace_builds_total{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`, status=\"failed\" }[10m])) > 10\n for: 10m\n annotations:\n summary: Workspace builds have failed multiple times in the last 10m, which may indicate a broken Coder template.\n labels:\n severity: critical\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdworkspacebuildfailures\n - alert: CoderdWorkspaceBuildFailures\n expr: sum(increase(coderd_workspace_builds_total{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`, status=\"failed\" }[10m])) > 2\n for: 10m\n annotations:\n summary: Workspace builds have failed multiple times in the last 10m, which may indicate a broken Coder template.\n labels:\n severity: notify\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdworkspacebuildfailures\n - alert: CoderdWorkspaceBuildFailures\n expr: sum(increase(coderd_workspace_builds_total{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`, status=\"failed\" }[10m])) > 5\n for: 10m\n annotations:\n summary: Workspace builds have failed multiple times in the last 10m, which may indicate a broken Coder template.\n labels:\n severity: warning\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdworkspacebuildfailures\n- name: Coderd Ineligible Prebuilds\n rules:\n \n - alert: 
CoderdIneligiblePrebuilds\n expr: max by (template_name, preset_name) (coderd_prebuilds_running - coderd_prebuilds_eligible) > 0\n for: 10m\n annotations:\n summary: >\n {{ $value }} prebuilt workspace(s) are currently ineligible for claiming for the \"{{ $labels.template_name }}\" template and \"{{ $labels.preset_name }}\" preset.\n This usually indicates that the agent has not started correctly, or is still running its startup scripts after an extended period of time.\n labels:\n severity: notify\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdineligibleprebuilds\n- name: Coderd Unprovisioned Prebuilt Workspaces\n rules:\n \n - alert: CoderdUnprovisionedPrebuiltWorkspaces\n expr: max by (template_name, preset_name) (coderd_prebuilds_desired - coderd_prebuilds_running) > 0\n for: 10m\n annotations:\n summary: >\n {{ $value }} prebuilt workspace(s) not yet been provisioned for the \"{{ $labels.template_name }}\" template and \"{{ $labels.preset_name }}\" preset.\n labels:\n severity: warn\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdunprovisionedprebuiltworkspaces " provisionerd.yaml: "groups:\n- name: Provisionerd Replicas\n rules:\n \n - alert: ProvisionerdReplicas\n expr: sum(coderd_provisionerd_num_daemons{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}) < 1\n for: 5m\n annotations:\n summary: Number of alive provisionerd replicas is below the threshold = 1.\n labels:\n severity: critical\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#provisionerdreplicas\n - alert: ProvisionerdReplicas\n expr: sum(coderd_provisionerd_num_daemons{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}) < 3\n for: 5m\n annotations:\n summary: Number of alive provisionerd replicas is below the threshold = 3.\n labels:\n severity: notify\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#provisionerdreplicas\n - alert: 
ProvisionerdReplicas\n expr: sum(coderd_provisionerd_num_daemons{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}) < 2\n for: 5m\n annotations:\n summary: Number of alive provisionerd replicas is below the threshold = 2.\n labels:\n severity: warning\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#provisionerdreplicas " enterprise.yaml: "groups:\n - name: Licences\n rules:\n \n - alert: CoderLicenseSeats\n expr: 'max(coderd_license_active_users) / max(coderd_license_limit_users) >=1'\n for: 1m\n annotations:\n summary: Your Coder enterprise licence usage is now at {{ $value | humanizePercentage }} capacity.\n labels:\n severity: critical\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/enterprise#coderlicenseseats\n - alert: CoderLicenseSeats\n expr: 'max(coderd_license_active_users) / max(coderd_license_limit_users) >=0.9'\n for: 1m\n annotations:\n summary: Your Coder enterprise licence usage is now at {{ $value | humanizePercentage }} capacity.\n labels:\n severity: warning\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/enterprise#coderlicenseseats " postgres.yaml: "groups:\n- name: Notifications\n rules:\n \n - alert: PostgresNotificationQueueFillingUp\n expr: pg_pubsub_usage > 0.9\n for: 15m\n annotations:\n summary: The postgres instance {{ $labels.instance }} has a notification that is filling up, which may impact application performance.\n labels:\n severity: critical\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/postgres#postgresnotificationqueuefillingup\n - alert: PostgresNotificationQueueFillingUp\n expr: pg_pubsub_usage > 0.5\n for: 15m\n annotations:\n summary: The postgres instance {{ $labels.instance }} has a notification that is filling up, which may impact application performance.\n labels:\n severity: notify\n runbook_url: 
http://runbook-viewer.coder-observability.svc.cluster.local/postgres#postgresnotificationqueuefillingup\n - alert: PostgresNotificationQueueFillingUp\n expr: pg_pubsub_usage > 0.8\n for: 15m\n annotations:\n summary: The postgres instance {{ $labels.instance }} has a notification that is filling up, which may impact application performance.\n labels:\n severity: warning\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/postgres#postgresnotificationqueuefillingup\n\n- name: Liveness\n rules:\n \n - alert: PostgresDown\n expr: pg_up == 0\n for: 1m\n annotations:\n summary: The postgres instance {{ $labels.instance }} is down!\n labels:\n severity: critical\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/postgres#postgresdown\n\n\n- name: Connections\n rules:\n \n - alert: PostgresConnectionsRunningLow\n expr: sum by (datname, instance) (pg_stat_activity_count) > on () group_left() (pg_settings_max_connections * 0.9)\n for: 5m\n labels:\n summary: The postgres instance {{ $labels.instance }} is running low on connections which may impact application performance.\n severity: critical\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/postgres#postgresconnectionsrunninglow\n - alert: PostgresConnectionsRunningLow\n expr: sum by (datname, instance) (pg_stat_activity_count) > on () group_left() (pg_settings_max_connections * 0.5)\n for: 5m\n labels:\n summary: The postgres instance {{ $labels.instance }} is running low on connections which may impact application performance.\n severity: notify\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/postgres#postgresconnectionsrunninglow\n - alert: PostgresConnectionsRunningLow\n expr: sum by (datname, instance) (pg_stat_activity_count) > on () group_left() (pg_settings_max_connections * 0.8)\n for: 5m\n labels:\n summary: The postgres instance {{ $labels.instance }} is running low on connections which may impact application 
performance.\n severity: warning\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/postgres#postgresconnectionsrunninglow" @@ -789,7 +789,7 @@ metadata: name: runbooks namespace: coder-observability annotations: - checksum/config: c7f0ed6a8142ac725babf921fbbc4275738149cf4da49b3d4311b37f61994267 + checksum/config: b0c41033d0385ee3d46488f08e85bcef0d939614dcb99194e0c5913dbf0c2c33 data: coderd.md: |- # Coderd Runbooks @@ -877,6 +877,56 @@ data: scripts have completed. If a prebuilt workspace is not eligible, view its agent logs to diagnose the problem. + + ## CoderdUnprovisionedPrebuiltWorkspaces + + The number of running prebuilt workspaces is lower than the desired instances. This could be for several reasons, + ordered by likelihood: + + ### Experiment/License + + The prebuilds feature is currently gated behind an experiment *and* a premium license. + + Ensure that the prebuilds experiment is enabled with `CODER_EXPERIMENTS=workspace-prebuilds`, and that you have a premium + license added. + + ### Preset Validation Issue + + Templates which have prebuilds configured will require a configured preset defined, with ALL of the required parameters + set in the preset. If any of these are missing, or any of the parameters - as defined - fail validation, then the prebuilds + subsystem will refuse to attempt a workspace build. + + Consult the coderd logs for more information; look out for errors or warnings from the prebuilds subsystem. + + ### Template Misconfiguration or Error + + Prebuilt workspaces cannot be provisioned due to some issue at `terraform apply`-time. This could be due to misconfigured + cloud resources, improper authorization, or any number of other issues. + + Visit the Workspaces page, change the search term to `owner:prebuilds`, and view the previously failed builds. The + error will likely be quite obvious. 
+ + ### Provisioner Latency + + If your provisioners are overloaded and cannot process provisioner jobs quickly enough, prebuilt workspaces may be affected. + There is no prioritization at present for prebuilt workspace jobs. + + Ensure your provisioners are appropriately resourced (i.e. you have enough instances) to handle the concurrent build demand. + + ### Use of Workspace Tags + + If you are using `coder_workspace_tags` ([docs](https://coder.com/docs/admin/templates/extending-templates/workspace-tags)) + in your template, chances are you do not have any provisioners running or they are under-resourced (see **Provisioner Latency**). + + Ensure your running provisioners are configured with your desired tags. + + ### Reconciliation Loop Issue + + The prebuilds subsystem runs a _reconciliation loop_ which monitors the state of prebuilt workspaces to ensure the desired + number of instances are present at all times. Workspace Prebuilds is currently a BETA feature and so there could be a bug + in this _reconciliation loop_, which should be reported to Coder. + + Examine your coderd logs for any errors or warnings relating to prebuilds. 
postgres.md: | # Postgres Runbooks @@ -12185,7 +12235,7 @@ spec: template: metadata: annotations: - checksum/config: c7f0ed6a8142ac725babf921fbbc4275738149cf4da49b3d4311b37f61994267 + checksum/config: b0c41033d0385ee3d46488f08e85bcef0d939614dcb99194e0c5913dbf0c2c33 labels: app: runbook-viewer spec: From 391c7f6bdca5a9ad688c20644a6a59d751a9f7c4 Mon Sep 17 00:00:00 2001 From: Danny Kopping Date: Mon, 21 Apr 2025 16:21:41 +0200 Subject: [PATCH 24/41] Rename panels Signed-off-by: Danny Kopping --- .../templates/dashboards/_dashboards_prebuilds.json.tpl | 4 ++-- compiled/resources.yaml | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/coder-observability/templates/dashboards/_dashboards_prebuilds.json.tpl b/coder-observability/templates/dashboards/_dashboards_prebuilds.json.tpl index 85f68af..29410a2 100644 --- a/coder-observability/templates/dashboards/_dashboards_prebuilds.json.tpl +++ b/coder-observability/templates/dashboards/_dashboards_prebuilds.json.tpl @@ -645,7 +645,7 @@ "refId": "E" } ], - "title": "Change over range: $preset", + "title": "Pool Capacity: $preset", "type": "timeseries" }, { @@ -871,7 +871,7 @@ "refId": "F" } ], - "title": "Change over range: $preset", + "title": "Pool Operations: $preset", "type": "timeseries" }, { diff --git a/compiled/resources.yaml b/compiled/resources.yaml index e4fd17d..ecd9f6d 100644 --- a/compiled/resources.yaml +++ b/compiled/resources.yaml @@ -3147,7 +3147,7 @@ data: "refId": "E" } ], - "title": "Change over range: $preset", + "title": "Pool Capacity: $preset", "type": "timeseries" }, { @@ -3373,7 +3373,7 @@ data: "refId": "F" } ], - "title": "Change over range: $preset", + "title": "Pool Operations: $preset", "type": "timeseries" }, { From 696de9f58584cf291b07f1c683659180d7ba5b8e Mon Sep 17 00:00:00 2001 From: Danny Kopping Date: Mon, 28 Apr 2025 15:59:13 +0200 Subject: [PATCH 25/41] CHANGELOG for v0.3.0 Signed-off-by: Danny Kopping --- CHANGELOG.md | 6 +++++- 1 file changed, 5 insertions(+), 1 
deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f97b441..8bb9049 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,9 @@ # CHANGELOG +## v0.3.0 + +- Adding prebuilt workspace dashboard & alerts + ## v0.2.1 - Upgraded subcharts @@ -23,4 +27,4 @@ ## v0.0.1 -- Initial release \ No newline at end of file +- Initial release From 65195d2ca99b8bf411d067387d6e782b667d7cfd Mon Sep 17 00:00:00 2001 From: Danny Kopping Date: Wed, 30 Apr 2025 16:01:34 +0200 Subject: [PATCH 26/41] chore: correct metric names to use renamed ones Signed-off-by: Danny Kopping --- .../configmap-prometheus-alerts.yaml | 4 +- .../dashboards/_dashboards_prebuilds.json.tpl | 44 +++++++++--------- compiled/resources.yaml | 46 +++++++++---------- scripts/compile.sh | 2 +- 4 files changed, 48 insertions(+), 48 deletions(-) diff --git a/coder-observability/templates/configmap-prometheus-alerts.yaml b/coder-observability/templates/configmap-prometheus-alerts.yaml index c70b0b0..bf9bcc4 100644 --- a/coder-observability/templates/configmap-prometheus-alerts.yaml +++ b/coder-observability/templates/configmap-prometheus-alerts.yaml @@ -112,7 +112,7 @@ data: {{ $alert := "CoderdIneligiblePrebuilds" }} {{- range $severity, $threshold := .thresholds }} - alert: {{ $alert }} - expr: max by (template_name, preset_name) (coderd_prebuilds_running - coderd_prebuilds_eligible) > 0 + expr: max by (template_name, preset_name) (coderd_prebuilt_workspaces_running - coderd_prebuilt_workspaces_eligible) > 0 for: {{ $group.delay }} annotations: summary: > @@ -133,7 +133,7 @@ data: {{ $alert := "CoderdUnprovisionedPrebuiltWorkspaces" }} {{- range $severity, $threshold := .thresholds }} - alert: {{ $alert }} - expr: max by (template_name, preset_name) (coderd_prebuilds_desired - coderd_prebuilds_running) > 0 + expr: max by (template_name, preset_name) (coderd_prebuilt_workspaces_desired - coderd_prebuilt_workspaces_running) > 0 for: {{ $group.delay }} annotations: summary: > diff --git 
a/coder-observability/templates/dashboards/_dashboards_prebuilds.json.tpl b/coder-observability/templates/dashboards/_dashboards_prebuilds.json.tpl index 29410a2..cfc236f 100644 --- a/coder-observability/templates/dashboards/_dashboards_prebuilds.json.tpl +++ b/coder-observability/templates/dashboards/_dashboards_prebuilds.json.tpl @@ -182,7 +182,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "sum(coderd_prebuilds_desired) or vector(0)", + "expr": "sum(coderd_prebuilt_workspaces_desired) or vector(0)", "instant": true, "interval": "", "legendFormat": "Desired", @@ -196,7 +196,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "sum(coderd_prebuilds_running) or vector(0)", + "expr": "sum(coderd_prebuilt_workspaces_running) or vector(0)", "hide": false, "instant": true, "interval": "", @@ -211,7 +211,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "sum(coderd_prebuilds_eligible) or vector(0)", + "expr": "sum(coderd_prebuilt_workspaces_eligible) or vector(0)", "hide": false, "instant": true, "interval": "", @@ -286,7 +286,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "sum(max by (template_name, preset_name) (coderd_prebuilds_created_total)) or vector(0)", + "expr": "sum(max by (template_name, preset_name) (coderd_prebuilt_workspaces_created_total)) or vector(0)", "hide": false, "instant": true, "interval": "", @@ -301,7 +301,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "sum(max by (template_name, preset_name) (coderd_prebuilds_failed_total)) or vector(0)", + "expr": "sum(max by (template_name, preset_name) (coderd_prebuilt_workspaces_failed_total)) or vector(0)", "hide": false, "instant": true, "interval": "", @@ -316,7 +316,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "sum(max by (template_name, preset_name) (coderd_prebuilds_claimed_total)) or vector(0)", + "expr": "sum(max by (template_name, preset_name) (coderd_prebuilt_workspaces_claimed_total)) or vector(0)", "hide": false, "instant": true, 
"interval": "", @@ -405,7 +405,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "sum(coderd_prebuilds_desired{template_name=~\"$template\", preset_name=~\"$preset\"}) or vector(0)", + "expr": "sum(coderd_prebuilt_workspaces_desired{template_name=~\"$template\", preset_name=~\"$preset\"}) or vector(0)", "instant": true, "interval": "", "legendFormat": "Desired", @@ -419,7 +419,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "sum(coderd_prebuilds_running{template_name=~\"$template\", preset_name=~\"$preset\"}) or vector(0)", + "expr": "sum(coderd_prebuilt_workspaces_running{template_name=~\"$template\", preset_name=~\"$preset\"}) or vector(0)", "hide": false, "instant": true, "interval": "", @@ -434,7 +434,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "sum(coderd_prebuilds_eligible{template_name=~\"$template\", preset_name=~\"$preset\"}) or vector(0)", + "expr": "sum(coderd_prebuilt_workspaces_eligible{template_name=~\"$template\", preset_name=~\"$preset\"}) or vector(0)", "hide": false, "instant": true, "interval": "", @@ -609,7 +609,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "sum(coderd_prebuilds_desired{template_name=~\"$template\", preset_name=~\"$preset\"}) or vector(0)", + "expr": "sum(coderd_prebuilt_workspaces_desired{template_name=~\"$template\", preset_name=~\"$preset\"}) or vector(0)", "instant": false, "interval": "", "legendFormat": "Desired", @@ -622,7 +622,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "sum(coderd_prebuilds_running{template_name=~\"$template\", preset_name=~\"$preset\"}) or vector(0)", + "expr": "sum(coderd_prebuilt_workspaces_running{template_name=~\"$template\", preset_name=~\"$preset\"}) or vector(0)", "hide": false, "instant": false, "interval": "", @@ -636,7 +636,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "sum(coderd_prebuilds_eligible{template_name=~\"$template\", preset_name=~\"$preset\"}) or vector(0)", + "expr": 
"sum(coderd_prebuilt_workspaces_eligible{template_name=~\"$template\", preset_name=~\"$preset\"}) or vector(0)", "hide": false, "instant": false, "interval": "", @@ -834,7 +834,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "floor(sum(increase(coderd_prebuilds_created_total{template_name=~\"$template\", preset_name=~\"$preset\"}[$__rate_interval]))) or vector(0)", + "expr": "floor(sum(increase(coderd_prebuilt_workspaces_created_total{template_name=~\"$template\", preset_name=~\"$preset\"}[$__rate_interval]))) or vector(0)", "hide": false, "instant": false, "interval": "", @@ -848,7 +848,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "floor(sum(increase(coderd_prebuilds_failed_total{template_name=~\"$template\", preset_name=~\"$preset\"}[$__rate_interval]))) or vector(0)", + "expr": "floor(sum(increase(coderd_prebuilt_workspaces_failed_total{template_name=~\"$template\", preset_name=~\"$preset\"}[$__rate_interval]))) or vector(0)", "hide": false, "instant": false, "interval": "", @@ -862,7 +862,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "floor(sum(increase(coderd_prebuilds_claimed_total{template_name=~\"$template\", preset_name=~\"$preset\"}[$__rate_interval]))) or vector(0)", + "expr": "floor(sum(increase(coderd_prebuilt_workspaces_claimed_total{template_name=~\"$template\", preset_name=~\"$preset\"}[$__rate_interval]))) or vector(0)", "hide": false, "instant": false, "interval": "", @@ -938,7 +938,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "max(coderd_prebuilds_created_total{template_name=~\"$template\", preset_name=~\"$preset\"}) or vector(0)", + "expr": "max(coderd_prebuilt_workspaces_created_total{template_name=~\"$template\", preset_name=~\"$preset\"}) or vector(0)", "hide": false, "instant": true, "interval": "", @@ -953,7 +953,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "max(coderd_prebuilds_failed_total{template_name=~\"$template\", preset_name=~\"$preset\"}) or vector(0)", + "expr": 
"max(coderd_prebuilt_workspaces_failed_total{template_name=~\"$template\", preset_name=~\"$preset\"}) or vector(0)", "hide": false, "instant": true, "interval": "", @@ -968,7 +968,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "max(coderd_prebuilds_claimed_total{template_name=~\"$template\", preset_name=~\"$preset\"}) or vector(0)", + "expr": "max(coderd_prebuilt_workspaces_claimed_total{template_name=~\"$template\", preset_name=~\"$preset\"}) or vector(0)", "hide": false, "instant": true, "interval": "", @@ -992,7 +992,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values(coderd_prebuilds_desired,template_name)", + "definition": "label_values(coderd_prebuilt_workspaces_desired,template_name)", "hide": 0, "includeAll": false, "label": "Template", @@ -1001,7 +1001,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values(coderd_prebuilds_desired,template_name)", + "query": "label_values(coderd_prebuilt_workspaces_desired,template_name)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, @@ -1016,7 +1016,7 @@ "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values(coderd_prebuilds_desired{template_name=~\"$template\"},preset_name)", + "definition": "label_values(coderd_prebuilt_workspaces_desired{template_name=~\"$template\"},preset_name)", "hide": 0, "includeAll": true, "label": "Preset", @@ -1025,7 +1025,7 @@ "options": [], "query": { "qryType": 1, - "query": "label_values(coderd_prebuilds_desired{template_name=~\"$template\"},preset_name)", + "query": "label_values(coderd_prebuilt_workspaces_desired{template_name=~\"$template\"},preset_name)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, diff --git a/compiled/resources.yaml b/compiled/resources.yaml index ecd9f6d..4270a40 100644 --- a/compiled/resources.yaml +++ b/compiled/resources.yaml @@ -777,7 +777,7 @@ metadata: name: metrics-alerts namespace: coder-observability data: - coderd.yaml: 
"groups:\n- name: CPU Usage\n rules:\n \n - alert: CoderdCPUUsage\n expr: max by (pod) (rate(container_cpu_usage_seconds_total{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}[10m])) / max by(pod) (kube_pod_container_resource_limits{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`, resource=\"cpu\"}) > 0.9\n for: 10m\n annotations:\n summary: The Coder instance {{ $labels.pod }} is using high amounts of CPU, which may impact application performance.\n labels:\n severity: critical\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdcpuusage\n - alert: CoderdCPUUsage\n expr: max by (pod) (rate(container_cpu_usage_seconds_total{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}[10m])) / max by(pod) (kube_pod_container_resource_limits{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`, resource=\"cpu\"}) > 0.8\n for: 10m\n annotations:\n summary: The Coder instance {{ $labels.pod }} is using high amounts of CPU, which may impact application performance.\n labels:\n severity: warning\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdcpuusage\n- name: Memory Usage\n rules:\n \n - alert: CoderdMemoryUsage\n expr: max by (pod) (container_memory_working_set_bytes{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}) / max by (pod) (kube_pod_container_resource_limits{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`, resource=\"memory\"}) > 0.9\n for: 10m\n annotations:\n summary: The Coder instance {{ $labels.pod }} is using high amounts of memory, which may lead to an Out-Of-Memory (OOM) error.\n labels:\n severity: critical\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdmemoryusage\n - alert: CoderdMemoryUsage\n expr: max by (pod) (container_memory_working_set_bytes{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}) / max by (pod) (kube_pod_container_resource_limits{pod=~`coder.*`, 
pod!~`.*provisioner.*`, namespace=`coder`, resource=\"memory\"}) > 0.8\n for: 10m\n annotations:\n summary: The Coder instance {{ $labels.pod }} is using high amounts of memory, which may lead to an Out-Of-Memory (OOM) error.\n labels:\n severity: warning\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdmemoryusage\n- name: Pod Restarts\n rules:\n \n - alert: CoderdRestarts\n expr: sum by(pod) (increase(kube_pod_container_status_restarts_total{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}[10m])) > 3\n for: 1m\n annotations:\n summary: The Coder instance {{ $labels.pod }} has restarted multiple times in the last 10m, which may indicate a CrashLoop.\n labels:\n severity: critical\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdrestarts\n - alert: CoderdRestarts\n expr: sum by(pod) (increase(kube_pod_container_status_restarts_total{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}[10m])) > 1\n for: 1m\n annotations:\n summary: The Coder instance {{ $labels.pod }} has restarted multiple times in the last 10m, which may indicate a CrashLoop.\n labels:\n severity: notify\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdrestarts\n - alert: CoderdRestarts\n expr: sum by(pod) (increase(kube_pod_container_status_restarts_total{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}[10m])) > 2\n for: 1m\n annotations:\n summary: The Coder instance {{ $labels.pod }} has restarted multiple times in the last 10m, which may indicate a CrashLoop.\n labels:\n severity: warning\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdrestarts\n- name: Coderd Replicas\n rules:\n \n - alert: CoderdReplicas\n expr: sum(up{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}) < 1\n for: 5m\n annotations:\n summary: Number of alive coderd replicas is below the threshold = 1.\n labels:\n severity: critical\n 
runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdreplicas\n - alert: CoderdReplicas\n expr: sum(up{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}) < 3\n for: 5m\n annotations:\n summary: Number of alive coderd replicas is below the threshold = 3.\n labels:\n severity: notify\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdreplicas\n - alert: CoderdReplicas\n expr: sum(up{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}) < 2\n for: 5m\n annotations:\n summary: Number of alive coderd replicas is below the threshold = 2.\n labels:\n severity: warning\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdreplicas\n- name: Coderd Workspace Build Failures\n rules:\n \n - alert: CoderdWorkspaceBuildFailures\n expr: sum(increase(coderd_workspace_builds_total{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`, status=\"failed\" }[10m])) > 10\n for: 10m\n annotations:\n summary: Workspace builds have failed multiple times in the last 10m, which may indicate a broken Coder template.\n labels:\n severity: critical\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdworkspacebuildfailures\n - alert: CoderdWorkspaceBuildFailures\n expr: sum(increase(coderd_workspace_builds_total{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`, status=\"failed\" }[10m])) > 2\n for: 10m\n annotations:\n summary: Workspace builds have failed multiple times in the last 10m, which may indicate a broken Coder template.\n labels:\n severity: notify\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdworkspacebuildfailures\n - alert: CoderdWorkspaceBuildFailures\n expr: sum(increase(coderd_workspace_builds_total{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`, status=\"failed\" }[10m])) > 5\n for: 10m\n annotations:\n summary: Workspace builds have failed multiple times in 
the last 10m, which may indicate a broken Coder template.\n labels:\n severity: warning\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdworkspacebuildfailures\n- name: Coderd Ineligible Prebuilds\n rules:\n \n - alert: CoderdIneligiblePrebuilds\n expr: max by (template_name, preset_name) (coderd_prebuilds_running - coderd_prebuilds_eligible) > 0\n for: 10m\n annotations:\n summary: >\n {{ $value }} prebuilt workspace(s) are currently ineligible for claiming for the \"{{ $labels.template_name }}\" template and \"{{ $labels.preset_name }}\" preset.\n This usually indicates that the agent has not started correctly, or is still running its startup scripts after an extended period of time.\n labels:\n severity: notify\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdineligibleprebuilds\n- name: Coderd Unprovisioned Prebuilt Workspaces\n rules:\n \n - alert: CoderdUnprovisionedPrebuiltWorkspaces\n expr: max by (template_name, preset_name) (coderd_prebuilds_desired - coderd_prebuilds_running) > 0\n for: 10m\n annotations:\n summary: >\n {{ $value }} prebuilt workspace(s) not yet been provisioned for the \"{{ $labels.template_name }}\" template and \"{{ $labels.preset_name }}\" preset.\n labels:\n severity: warn\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdunprovisionedprebuiltworkspaces " + coderd.yaml: "groups:\n- name: CPU Usage\n rules:\n \n - alert: CoderdCPUUsage\n expr: max by (pod) (rate(container_cpu_usage_seconds_total{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}[10m])) / max by(pod) (kube_pod_container_resource_limits{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`, resource=\"cpu\"}) > 0.9\n for: 10m\n annotations:\n summary: The Coder instance {{ $labels.pod }} is using high amounts of CPU, which may impact application performance.\n labels:\n severity: critical\n runbook_url: 
http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdcpuusage\n - alert: CoderdCPUUsage\n expr: max by (pod) (rate(container_cpu_usage_seconds_total{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}[10m])) / max by(pod) (kube_pod_container_resource_limits{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`, resource=\"cpu\"}) > 0.8\n for: 10m\n annotations:\n summary: The Coder instance {{ $labels.pod }} is using high amounts of CPU, which may impact application performance.\n labels:\n severity: warning\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdcpuusage\n- name: Memory Usage\n rules:\n \n - alert: CoderdMemoryUsage\n expr: max by (pod) (container_memory_working_set_bytes{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}) / max by (pod) (kube_pod_container_resource_limits{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`, resource=\"memory\"}) > 0.9\n for: 10m\n annotations:\n summary: The Coder instance {{ $labels.pod }} is using high amounts of memory, which may lead to an Out-Of-Memory (OOM) error.\n labels:\n severity: critical\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdmemoryusage\n - alert: CoderdMemoryUsage\n expr: max by (pod) (container_memory_working_set_bytes{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}) / max by (pod) (kube_pod_container_resource_limits{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`, resource=\"memory\"}) > 0.8\n for: 10m\n annotations:\n summary: The Coder instance {{ $labels.pod }} is using high amounts of memory, which may lead to an Out-Of-Memory (OOM) error.\n labels:\n severity: warning\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdmemoryusage\n- name: Pod Restarts\n rules:\n \n - alert: CoderdRestarts\n expr: sum by(pod) (increase(kube_pod_container_status_restarts_total{pod=~`coder.*`, pod!~`.*provisioner.*`, 
namespace=`coder`}[10m])) > 3\n for: 1m\n annotations:\n summary: The Coder instance {{ $labels.pod }} has restarted multiple times in the last 10m, which may indicate a CrashLoop.\n labels:\n severity: critical\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdrestarts\n - alert: CoderdRestarts\n expr: sum by(pod) (increase(kube_pod_container_status_restarts_total{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}[10m])) > 1\n for: 1m\n annotations:\n summary: The Coder instance {{ $labels.pod }} has restarted multiple times in the last 10m, which may indicate a CrashLoop.\n labels:\n severity: notify\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdrestarts\n - alert: CoderdRestarts\n expr: sum by(pod) (increase(kube_pod_container_status_restarts_total{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}[10m])) > 2\n for: 1m\n annotations:\n summary: The Coder instance {{ $labels.pod }} has restarted multiple times in the last 10m, which may indicate a CrashLoop.\n labels:\n severity: warning\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdrestarts\n- name: Coderd Replicas\n rules:\n \n - alert: CoderdReplicas\n expr: sum(up{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}) < 1\n for: 5m\n annotations:\n summary: Number of alive coderd replicas is below the threshold = 1.\n labels:\n severity: critical\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdreplicas\n - alert: CoderdReplicas\n expr: sum(up{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}) < 3\n for: 5m\n annotations:\n summary: Number of alive coderd replicas is below the threshold = 3.\n labels:\n severity: notify\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdreplicas\n - alert: CoderdReplicas\n expr: sum(up{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}) < 2\n 
for: 5m\n annotations:\n summary: Number of alive coderd replicas is below the threshold = 2.\n labels:\n severity: warning\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdreplicas\n- name: Coderd Workspace Build Failures\n rules:\n \n - alert: CoderdWorkspaceBuildFailures\n expr: sum(increase(coderd_workspace_builds_total{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`, status=\"failed\" }[10m])) > 10\n for: 10m\n annotations:\n summary: Workspace builds have failed multiple times in the last 10m, which may indicate a broken Coder template.\n labels:\n severity: critical\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdworkspacebuildfailures\n - alert: CoderdWorkspaceBuildFailures\n expr: sum(increase(coderd_workspace_builds_total{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`, status=\"failed\" }[10m])) > 2\n for: 10m\n annotations:\n summary: Workspace builds have failed multiple times in the last 10m, which may indicate a broken Coder template.\n labels:\n severity: notify\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdworkspacebuildfailures\n - alert: CoderdWorkspaceBuildFailures\n expr: sum(increase(coderd_workspace_builds_total{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`, status=\"failed\" }[10m])) > 5\n for: 10m\n annotations:\n summary: Workspace builds have failed multiple times in the last 10m, which may indicate a broken Coder template.\n labels:\n severity: warning\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdworkspacebuildfailures\n- name: Coderd Ineligible Prebuilds\n rules:\n \n - alert: CoderdIneligiblePrebuilds\n expr: max by (template_name, preset_name) (coderd_prebuilt_workspaces_running - coderd_prebuilt_workspaces_eligible) > 0\n for: 10m\n annotations:\n summary: >\n {{ $value }} prebuilt workspace(s) are currently ineligible for claiming for the 
\"{{ $labels.template_name }}\" template and \"{{ $labels.preset_name }}\" preset.\n This usually indicates that the agent has not started correctly, or is still running its startup scripts after an extended period of time.\n labels:\n severity: notify\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdineligibleprebuilds\n- name: Coderd Unprovisioned Prebuilt Workspaces\n rules:\n \n - alert: CoderdUnprovisionedPrebuiltWorkspaces\n expr: max by (template_name, preset_name) (coderd_prebuilt_workspaces_desired - coderd_prebuilt_workspaces_running) > 0\n for: 10m\n annotations:\n summary: >\n {{ $value }} prebuilt workspace(s) not yet been provisioned for the \"{{ $labels.template_name }}\" template and \"{{ $labels.preset_name }}\" preset.\n labels:\n severity: warn\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#coderdunprovisionedprebuiltworkspaces " provisionerd.yaml: "groups:\n- name: Provisionerd Replicas\n rules:\n \n - alert: ProvisionerdReplicas\n expr: sum(coderd_provisionerd_num_daemons{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}) < 1\n for: 5m\n annotations:\n summary: Number of alive provisionerd replicas is below the threshold = 1.\n labels:\n severity: critical\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#provisionerdreplicas\n - alert: ProvisionerdReplicas\n expr: sum(coderd_provisionerd_num_daemons{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}) < 3\n for: 5m\n annotations:\n summary: Number of alive provisionerd replicas is below the threshold = 3.\n labels:\n severity: notify\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#provisionerdreplicas\n - alert: ProvisionerdReplicas\n expr: sum(coderd_provisionerd_num_daemons{pod=~`coder.*`, pod!~`.*provisioner.*`, namespace=`coder`}) < 2\n for: 5m\n annotations:\n summary: Number of alive provisionerd replicas is below the threshold = 2.\n 
labels:\n severity: warning\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/coderd#provisionerdreplicas " enterprise.yaml: "groups:\n - name: Licences\n rules:\n \n - alert: CoderLicenseSeats\n expr: 'max(coderd_license_active_users) / max(coderd_license_limit_users) >=1'\n for: 1m\n annotations:\n summary: Your Coder enterprise licence usage is now at {{ $value | humanizePercentage }} capacity.\n labels:\n severity: critical\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/enterprise#coderlicenseseats\n - alert: CoderLicenseSeats\n expr: 'max(coderd_license_active_users) / max(coderd_license_limit_users) >=0.9'\n for: 1m\n annotations:\n summary: Your Coder enterprise licence usage is now at {{ $value | humanizePercentage }} capacity.\n labels:\n severity: warning\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/enterprise#coderlicenseseats " postgres.yaml: "groups:\n- name: Notifications\n rules:\n \n - alert: PostgresNotificationQueueFillingUp\n expr: pg_pubsub_usage > 0.9\n for: 15m\n annotations:\n summary: The postgres instance {{ $labels.instance }} has a notification that is filling up, which may impact application performance.\n labels:\n severity: critical\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/postgres#postgresnotificationqueuefillingup\n - alert: PostgresNotificationQueueFillingUp\n expr: pg_pubsub_usage > 0.5\n for: 15m\n annotations:\n summary: The postgres instance {{ $labels.instance }} has a notification that is filling up, which may impact application performance.\n labels:\n severity: notify\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/postgres#postgresnotificationqueuefillingup\n - alert: PostgresNotificationQueueFillingUp\n expr: pg_pubsub_usage > 0.8\n for: 15m\n annotations:\n summary: The postgres instance {{ $labels.instance }} has a notification that is filling up, which may impact application 
performance.\n labels:\n severity: warning\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/postgres#postgresnotificationqueuefillingup\n\n- name: Liveness\n rules:\n \n - alert: PostgresDown\n expr: pg_up == 0\n for: 1m\n annotations:\n summary: The postgres instance {{ $labels.instance }} is down!\n labels:\n severity: critical\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/postgres#postgresdown\n\n\n- name: Connections\n rules:\n \n - alert: PostgresConnectionsRunningLow\n expr: sum by (datname, instance) (pg_stat_activity_count) > on () group_left() (pg_settings_max_connections * 0.9)\n for: 5m\n labels:\n summary: The postgres instance {{ $labels.instance }} is running low on connections which may impact application performance.\n severity: critical\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/postgres#postgresconnectionsrunninglow\n - alert: PostgresConnectionsRunningLow\n expr: sum by (datname, instance) (pg_stat_activity_count) > on () group_left() (pg_settings_max_connections * 0.5)\n for: 5m\n labels:\n summary: The postgres instance {{ $labels.instance }} is running low on connections which may impact application performance.\n severity: notify\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/postgres#postgresconnectionsrunninglow\n - alert: PostgresConnectionsRunningLow\n expr: sum by (datname, instance) (pg_stat_activity_count) > on () group_left() (pg_settings_max_connections * 0.8)\n for: 5m\n labels:\n summary: The postgres instance {{ $labels.instance }} is running low on connections which may impact application performance.\n severity: warning\n runbook_url: http://runbook-viewer.coder-observability.svc.cluster.local/postgres#postgresconnectionsrunninglow" @@ -2684,7 +2684,7 @@ data: }, "editorMode": "code", "exemplar": false, - "expr": "sum(coderd_prebuilds_desired) or vector(0)", + "expr": "sum(coderd_prebuilt_workspaces_desired) or 
vector(0)", "instant": true, "interval": "", "legendFormat": "Desired", @@ -2698,7 +2698,7 @@ data: }, "editorMode": "code", "exemplar": false, - "expr": "sum(coderd_prebuilds_running) or vector(0)", + "expr": "sum(coderd_prebuilt_workspaces_running) or vector(0)", "hide": false, "instant": true, "interval": "", @@ -2713,7 +2713,7 @@ data: }, "editorMode": "code", "exemplar": false, - "expr": "sum(coderd_prebuilds_eligible) or vector(0)", + "expr": "sum(coderd_prebuilt_workspaces_eligible) or vector(0)", "hide": false, "instant": true, "interval": "", @@ -2788,7 +2788,7 @@ data: }, "editorMode": "code", "exemplar": false, - "expr": "sum(max by (template_name, preset_name) (coderd_prebuilds_created_total)) or vector(0)", + "expr": "sum(max by (template_name, preset_name) (coderd_prebuilt_workspaces_created_total)) or vector(0)", "hide": false, "instant": true, "interval": "", @@ -2803,7 +2803,7 @@ data: }, "editorMode": "code", "exemplar": false, - "expr": "sum(max by (template_name, preset_name) (coderd_prebuilds_failed_total)) or vector(0)", + "expr": "sum(max by (template_name, preset_name) (coderd_prebuilt_workspaces_failed_total)) or vector(0)", "hide": false, "instant": true, "interval": "", @@ -2818,7 +2818,7 @@ data: }, "editorMode": "code", "exemplar": false, - "expr": "sum(max by (template_name, preset_name) (coderd_prebuilds_claimed_total)) or vector(0)", + "expr": "sum(max by (template_name, preset_name) (coderd_prebuilt_workspaces_claimed_total)) or vector(0)", "hide": false, "instant": true, "interval": "", @@ -2907,7 +2907,7 @@ data: }, "editorMode": "code", "exemplar": false, - "expr": "sum(coderd_prebuilds_desired{template_name=~\"$template\", preset_name=~\"$preset\"}) or vector(0)", + "expr": "sum(coderd_prebuilt_workspaces_desired{template_name=~\"$template\", preset_name=~\"$preset\"}) or vector(0)", "instant": true, "interval": "", "legendFormat": "Desired", @@ -2921,7 +2921,7 @@ data: }, "editorMode": "code", "exemplar": false, - "expr": 
"sum(coderd_prebuilds_running{template_name=~\"$template\", preset_name=~\"$preset\"}) or vector(0)", + "expr": "sum(coderd_prebuilt_workspaces_running{template_name=~\"$template\", preset_name=~\"$preset\"}) or vector(0)", "hide": false, "instant": true, "interval": "", @@ -2936,7 +2936,7 @@ data: }, "editorMode": "code", "exemplar": false, - "expr": "sum(coderd_prebuilds_eligible{template_name=~\"$template\", preset_name=~\"$preset\"}) or vector(0)", + "expr": "sum(coderd_prebuilt_workspaces_eligible{template_name=~\"$template\", preset_name=~\"$preset\"}) or vector(0)", "hide": false, "instant": true, "interval": "", @@ -3111,7 +3111,7 @@ data: "uid": "prometheus" }, "editorMode": "code", - "expr": "sum(coderd_prebuilds_desired{template_name=~\"$template\", preset_name=~\"$preset\"}) or vector(0)", + "expr": "sum(coderd_prebuilt_workspaces_desired{template_name=~\"$template\", preset_name=~\"$preset\"}) or vector(0)", "instant": false, "interval": "", "legendFormat": "Desired", @@ -3124,7 +3124,7 @@ data: "uid": "prometheus" }, "editorMode": "code", - "expr": "sum(coderd_prebuilds_running{template_name=~\"$template\", preset_name=~\"$preset\"}) or vector(0)", + "expr": "sum(coderd_prebuilt_workspaces_running{template_name=~\"$template\", preset_name=~\"$preset\"}) or vector(0)", "hide": false, "instant": false, "interval": "", @@ -3138,7 +3138,7 @@ data: "uid": "prometheus" }, "editorMode": "code", - "expr": "sum(coderd_prebuilds_eligible{template_name=~\"$template\", preset_name=~\"$preset\"}) or vector(0)", + "expr": "sum(coderd_prebuilt_workspaces_eligible{template_name=~\"$template\", preset_name=~\"$preset\"}) or vector(0)", "hide": false, "instant": false, "interval": "", @@ -3336,7 +3336,7 @@ data: "uid": "prometheus" }, "editorMode": "code", - "expr": "floor(sum(increase(coderd_prebuilds_created_total{template_name=~\"$template\", preset_name=~\"$preset\"}[$__rate_interval]))) or vector(0)", + "expr": 
"floor(sum(increase(coderd_prebuilt_workspaces_created_total{template_name=~\"$template\", preset_name=~\"$preset\"}[$__rate_interval]))) or vector(0)", "hide": false, "instant": false, "interval": "", @@ -3350,7 +3350,7 @@ data: "uid": "prometheus" }, "editorMode": "code", - "expr": "floor(sum(increase(coderd_prebuilds_failed_total{template_name=~\"$template\", preset_name=~\"$preset\"}[$__rate_interval]))) or vector(0)", + "expr": "floor(sum(increase(coderd_prebuilt_workspaces_failed_total{template_name=~\"$template\", preset_name=~\"$preset\"}[$__rate_interval]))) or vector(0)", "hide": false, "instant": false, "interval": "", @@ -3364,7 +3364,7 @@ data: "uid": "prometheus" }, "editorMode": "code", - "expr": "floor(sum(increase(coderd_prebuilds_claimed_total{template_name=~\"$template\", preset_name=~\"$preset\"}[$__rate_interval]))) or vector(0)", + "expr": "floor(sum(increase(coderd_prebuilt_workspaces_claimed_total{template_name=~\"$template\", preset_name=~\"$preset\"}[$__rate_interval]))) or vector(0)", "hide": false, "instant": false, "interval": "", @@ -3440,7 +3440,7 @@ data: }, "editorMode": "code", "exemplar": false, - "expr": "max(coderd_prebuilds_created_total{template_name=~\"$template\", preset_name=~\"$preset\"}) or vector(0)", + "expr": "max(coderd_prebuilt_workspaces_created_total{template_name=~\"$template\", preset_name=~\"$preset\"}) or vector(0)", "hide": false, "instant": true, "interval": "", @@ -3455,7 +3455,7 @@ data: }, "editorMode": "code", "exemplar": false, - "expr": "max(coderd_prebuilds_failed_total{template_name=~\"$template\", preset_name=~\"$preset\"}) or vector(0)", + "expr": "max(coderd_prebuilt_workspaces_failed_total{template_name=~\"$template\", preset_name=~\"$preset\"}) or vector(0)", "hide": false, "instant": true, "interval": "", @@ -3470,7 +3470,7 @@ data: }, "editorMode": "code", "exemplar": false, - "expr": "max(coderd_prebuilds_claimed_total{template_name=~\"$template\", preset_name=~\"$preset\"}) or vector(0)", + 
"expr": "max(coderd_prebuilt_workspaces_claimed_total{template_name=~\"$template\", preset_name=~\"$preset\"}) or vector(0)", "hide": false, "instant": true, "interval": "", @@ -3494,7 +3494,7 @@ data: "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values(coderd_prebuilds_desired,template_name)", + "definition": "label_values(coderd_prebuilt_workspaces_desired,template_name)", "hide": 0, "includeAll": false, "label": "Template", @@ -3503,7 +3503,7 @@ data: "options": [], "query": { "qryType": 1, - "query": "label_values(coderd_prebuilds_desired,template_name)", + "query": "label_values(coderd_prebuilt_workspaces_desired,template_name)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, @@ -3518,7 +3518,7 @@ data: "type": "prometheus", "uid": "prometheus" }, - "definition": "label_values(coderd_prebuilds_desired{template_name=~\"$template\"},preset_name)", + "definition": "label_values(coderd_prebuilt_workspaces_desired{template_name=~\"$template\"},preset_name)", "hide": 0, "includeAll": true, "label": "Preset", @@ -3527,7 +3527,7 @@ data: "options": [], "query": { "qryType": 1, - "query": "label_values(coderd_prebuilds_desired{template_name=~\"$template\"},preset_name)", + "query": "label_values(coderd_prebuilt_workspaces_desired{template_name=~\"$template\"},preset_name)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, diff --git a/scripts/compile.sh b/scripts/compile.sh index 91a81f5..a00ce4f 100755 --- a/scripts/compile.sh +++ b/scripts/compile.sh @@ -2,7 +2,7 @@ set -euo pipefail # check versions -HELM_VERSION=3.14 +HELM_VERSION=3.17 YQ_VERSION=4.42 [[ "$(helm version)" == *v${HELM_VERSION}* ]] || { echo "Expected helm version v${HELM_VERSION} but got $(helm version)" >&2; exit 1; } [[ "$(yq --version)" == *v${YQ_VERSION}* ]] || { echo "Expected yq version v${YQ_VERSION} but got $(yq --version)" >&2; exit 1; } From 8d0b3f61254f0944cbda93cb7f5cb0cc72097502 Mon Sep 17 00:00:00 2001 From: Danny 
Kopping Date: Wed, 30 Apr 2025 16:04:24 +0200 Subject: [PATCH 27/41] chore: fix helm version Signed-off-by: Danny Kopping --- .github/workflows/lint.yaml | 2 +- .github/workflows/nightly-build.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/lint.yaml b/.github/workflows/lint.yaml index 4efb8fb..4433626 100644 --- a/.github/workflows/lint.yaml +++ b/.github/workflows/lint.yaml @@ -22,7 +22,7 @@ jobs: - name: Install Helm uses: azure/setup-helm@v4 with: - version: v3.14.4 + version: v3.17.1 - name: Install yq run: | diff --git a/.github/workflows/nightly-build.yaml b/.github/workflows/nightly-build.yaml index ea31b55..e64f92c 100644 --- a/.github/workflows/nightly-build.yaml +++ b/.github/workflows/nightly-build.yaml @@ -21,7 +21,7 @@ jobs: - name: Install Helm uses: azure/setup-helm@v4 with: - version: v3.14.4 + version: v3.17.1 - name: Install yq run: | From 728804949d29c6dfafb120bbb44387d166b78b83 Mon Sep 17 00:00:00 2001 From: Danny Kopping Date: Wed, 30 Apr 2025 16:51:28 +0200 Subject: [PATCH 28/41] chore: correct calculations with multiple coderd replicas Signed-off-by: Danny Kopping --- .../dashboards/_dashboards_prebuilds.json.tpl | 24 +++++++++---------- compiled/resources.yaml | 24 +++++++++---------- 2 files changed, 24 insertions(+), 24 deletions(-) diff --git a/coder-observability/templates/dashboards/_dashboards_prebuilds.json.tpl b/coder-observability/templates/dashboards/_dashboards_prebuilds.json.tpl index cfc236f..976ce22 100644 --- a/coder-observability/templates/dashboards/_dashboards_prebuilds.json.tpl +++ b/coder-observability/templates/dashboards/_dashboards_prebuilds.json.tpl @@ -182,7 +182,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "sum(coderd_prebuilt_workspaces_desired) or vector(0)", + "expr": "max(coderd_prebuilt_workspaces_desired) or vector(0)", "instant": true, "interval": "", "legendFormat": "Desired", @@ -196,7 +196,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": 
"sum(coderd_prebuilt_workspaces_running) or vector(0)", + "expr": "max(coderd_prebuilt_workspaces_running) or vector(0)", "hide": false, "instant": true, "interval": "", @@ -211,7 +211,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "sum(coderd_prebuilt_workspaces_eligible) or vector(0)", + "expr": "max(coderd_prebuilt_workspaces_eligible) or vector(0)", "hide": false, "instant": true, "interval": "", @@ -405,7 +405,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "sum(coderd_prebuilt_workspaces_desired{template_name=~\"$template\", preset_name=~\"$preset\"}) or vector(0)", + "expr": "max(coderd_prebuilt_workspaces_desired{template_name=~\"$template\", preset_name=~\"$preset\"}) or vector(0)", "instant": true, "interval": "", "legendFormat": "Desired", @@ -419,7 +419,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "sum(coderd_prebuilt_workspaces_running{template_name=~\"$template\", preset_name=~\"$preset\"}) or vector(0)", + "expr": "max(coderd_prebuilt_workspaces_running{template_name=~\"$template\", preset_name=~\"$preset\"}) or vector(0)", "hide": false, "instant": true, "interval": "", @@ -434,7 +434,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "sum(coderd_prebuilt_workspaces_eligible{template_name=~\"$template\", preset_name=~\"$preset\"}) or vector(0)", + "expr": "max(coderd_prebuilt_workspaces_eligible{template_name=~\"$template\", preset_name=~\"$preset\"}) or vector(0)", "hide": false, "instant": true, "interval": "", @@ -609,7 +609,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "sum(coderd_prebuilt_workspaces_desired{template_name=~\"$template\", preset_name=~\"$preset\"}) or vector(0)", + "expr": "max(coderd_prebuilt_workspaces_desired{template_name=~\"$template\", preset_name=~\"$preset\"}) or vector(0)", "instant": false, "interval": "", "legendFormat": "Desired", @@ -622,7 +622,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": 
"sum(coderd_prebuilt_workspaces_running{template_name=~\"$template\", preset_name=~\"$preset\"}) or vector(0)", + "expr": "max(coderd_prebuilt_workspaces_running{template_name=~\"$template\", preset_name=~\"$preset\"}) or vector(0)", "hide": false, "instant": false, "interval": "", @@ -636,7 +636,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "sum(coderd_prebuilt_workspaces_eligible{template_name=~\"$template\", preset_name=~\"$preset\"}) or vector(0)", + "expr": "max(coderd_prebuilt_workspaces_eligible{template_name=~\"$template\", preset_name=~\"$preset\"}) or vector(0)", "hide": false, "instant": false, "interval": "", @@ -834,7 +834,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "floor(sum(increase(coderd_prebuilt_workspaces_created_total{template_name=~\"$template\", preset_name=~\"$preset\"}[$__rate_interval]))) or vector(0)", + "expr": "floor(max(increase(coderd_prebuilt_workspaces_created_total{template_name=~\"$template\", preset_name=~\"$preset\"}[$__rate_interval]))) or vector(0)", "hide": false, "instant": false, "interval": "", @@ -848,7 +848,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "floor(sum(increase(coderd_prebuilt_workspaces_failed_total{template_name=~\"$template\", preset_name=~\"$preset\"}[$__rate_interval]))) or vector(0)", + "expr": "floor(max(increase(coderd_prebuilt_workspaces_failed_total{template_name=~\"$template\", preset_name=~\"$preset\"}[$__rate_interval]))) or vector(0)", "hide": false, "instant": false, "interval": "", @@ -862,7 +862,7 @@ "uid": "prometheus" }, "editorMode": "code", - "expr": "floor(sum(increase(coderd_prebuilt_workspaces_claimed_total{template_name=~\"$template\", preset_name=~\"$preset\"}[$__rate_interval]))) or vector(0)", + "expr": "floor(max(increase(coderd_prebuilt_workspaces_claimed_total{template_name=~\"$template\", preset_name=~\"$preset\"}[$__rate_interval]))) or vector(0)", "hide": false, "instant": false, "interval": "", diff --git a/compiled/resources.yaml 
b/compiled/resources.yaml index 4270a40..4286f03 100644 --- a/compiled/resources.yaml +++ b/compiled/resources.yaml @@ -2684,7 +2684,7 @@ data: }, "editorMode": "code", "exemplar": false, - "expr": "sum(coderd_prebuilt_workspaces_desired) or vector(0)", + "expr": "max(coderd_prebuilt_workspaces_desired) or vector(0)", "instant": true, "interval": "", "legendFormat": "Desired", @@ -2698,7 +2698,7 @@ data: }, "editorMode": "code", "exemplar": false, - "expr": "sum(coderd_prebuilt_workspaces_running) or vector(0)", + "expr": "max(coderd_prebuilt_workspaces_running) or vector(0)", "hide": false, "instant": true, "interval": "", @@ -2713,7 +2713,7 @@ data: }, "editorMode": "code", "exemplar": false, - "expr": "sum(coderd_prebuilt_workspaces_eligible) or vector(0)", + "expr": "max(coderd_prebuilt_workspaces_eligible) or vector(0)", "hide": false, "instant": true, "interval": "", @@ -2907,7 +2907,7 @@ data: }, "editorMode": "code", "exemplar": false, - "expr": "sum(coderd_prebuilt_workspaces_desired{template_name=~\"$template\", preset_name=~\"$preset\"}) or vector(0)", + "expr": "max(coderd_prebuilt_workspaces_desired{template_name=~\"$template\", preset_name=~\"$preset\"}) or vector(0)", "instant": true, "interval": "", "legendFormat": "Desired", @@ -2921,7 +2921,7 @@ data: }, "editorMode": "code", "exemplar": false, - "expr": "sum(coderd_prebuilt_workspaces_running{template_name=~\"$template\", preset_name=~\"$preset\"}) or vector(0)", + "expr": "max(coderd_prebuilt_workspaces_running{template_name=~\"$template\", preset_name=~\"$preset\"}) or vector(0)", "hide": false, "instant": true, "interval": "", @@ -2936,7 +2936,7 @@ data: }, "editorMode": "code", "exemplar": false, - "expr": "sum(coderd_prebuilt_workspaces_eligible{template_name=~\"$template\", preset_name=~\"$preset\"}) or vector(0)", + "expr": "max(coderd_prebuilt_workspaces_eligible{template_name=~\"$template\", preset_name=~\"$preset\"}) or vector(0)", "hide": false, "instant": true, "interval": "", @@ 
-3111,7 +3111,7 @@ data: "uid": "prometheus" }, "editorMode": "code", - "expr": "sum(coderd_prebuilt_workspaces_desired{template_name=~\"$template\", preset_name=~\"$preset\"}) or vector(0)", + "expr": "max(coderd_prebuilt_workspaces_desired{template_name=~\"$template\", preset_name=~\"$preset\"}) or vector(0)", "instant": false, "interval": "", "legendFormat": "Desired", @@ -3124,7 +3124,7 @@ data: "uid": "prometheus" }, "editorMode": "code", - "expr": "sum(coderd_prebuilt_workspaces_running{template_name=~\"$template\", preset_name=~\"$preset\"}) or vector(0)", + "expr": "max(coderd_prebuilt_workspaces_running{template_name=~\"$template\", preset_name=~\"$preset\"}) or vector(0)", "hide": false, "instant": false, "interval": "", @@ -3138,7 +3138,7 @@ data: "uid": "prometheus" }, "editorMode": "code", - "expr": "sum(coderd_prebuilt_workspaces_eligible{template_name=~\"$template\", preset_name=~\"$preset\"}) or vector(0)", + "expr": "max(coderd_prebuilt_workspaces_eligible{template_name=~\"$template\", preset_name=~\"$preset\"}) or vector(0)", "hide": false, "instant": false, "interval": "", @@ -3336,7 +3336,7 @@ data: "uid": "prometheus" }, "editorMode": "code", - "expr": "floor(sum(increase(coderd_prebuilt_workspaces_created_total{template_name=~\"$template\", preset_name=~\"$preset\"}[$__rate_interval]))) or vector(0)", + "expr": "floor(max(increase(coderd_prebuilt_workspaces_created_total{template_name=~\"$template\", preset_name=~\"$preset\"}[$__rate_interval]))) or vector(0)", "hide": false, "instant": false, "interval": "", @@ -3350,7 +3350,7 @@ data: "uid": "prometheus" }, "editorMode": "code", - "expr": "floor(sum(increase(coderd_prebuilt_workspaces_failed_total{template_name=~\"$template\", preset_name=~\"$preset\"}[$__rate_interval]))) or vector(0)", + "expr": "floor(max(increase(coderd_prebuilt_workspaces_failed_total{template_name=~\"$template\", preset_name=~\"$preset\"}[$__rate_interval]))) or vector(0)", "hide": false, "instant": false, "interval": 
"", @@ -3364,7 +3364,7 @@ data: "uid": "prometheus" }, "editorMode": "code", - "expr": "floor(sum(increase(coderd_prebuilt_workspaces_claimed_total{template_name=~\"$template\", preset_name=~\"$preset\"}[$__rate_interval]))) or vector(0)", + "expr": "floor(max(increase(coderd_prebuilt_workspaces_claimed_total{template_name=~\"$template\", preset_name=~\"$preset\"}[$__rate_interval]))) or vector(0)", "hide": false, "instant": false, "interval": "", From 2220dd62d35a6e2b4f82119febf8297957715e68 Mon Sep 17 00:00:00 2001 From: Danny Kopping Date: Wed, 14 May 2025 13:42:27 +0200 Subject: [PATCH 29/41] chore: fix global count when multiple templates/presets have prebuilds Signed-off-by: Danny Kopping --- .../templates/dashboards/_dashboards_prebuilds.json.tpl | 6 +++--- compiled/resources.yaml | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/coder-observability/templates/dashboards/_dashboards_prebuilds.json.tpl b/coder-observability/templates/dashboards/_dashboards_prebuilds.json.tpl index 976ce22..938b501 100644 --- a/coder-observability/templates/dashboards/_dashboards_prebuilds.json.tpl +++ b/coder-observability/templates/dashboards/_dashboards_prebuilds.json.tpl @@ -182,7 +182,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "max(coderd_prebuilt_workspaces_desired) or vector(0)", + "expr": "sum(max(coderd_prebuilt_workspaces_desired) by (template_name, preset_name)) or vector(0)", "instant": true, "interval": "", "legendFormat": "Desired", @@ -196,7 +196,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "max(coderd_prebuilt_workspaces_running) or vector(0)", + "expr": "sum(max(coderd_prebuilt_workspaces_running) by (template_name, preset_name)) or vector(0)", "hide": false, "instant": true, "interval": "", @@ -211,7 +211,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "max(coderd_prebuilt_workspaces_eligible) or vector(0)", + "expr": "sum(max(coderd_prebuilt_workspaces_eligible) by (template_name, preset_name)) 
or vector(0)", "hide": false, "instant": true, "interval": "", diff --git a/compiled/resources.yaml b/compiled/resources.yaml index 4286f03..6f4518e 100644 --- a/compiled/resources.yaml +++ b/compiled/resources.yaml @@ -2684,7 +2684,7 @@ data: }, "editorMode": "code", "exemplar": false, - "expr": "max(coderd_prebuilt_workspaces_desired) or vector(0)", + "expr": "sum(max(coderd_prebuilt_workspaces_desired) by (template_name, preset_name)) or vector(0)", "instant": true, "interval": "", "legendFormat": "Desired", @@ -2698,7 +2698,7 @@ data: }, "editorMode": "code", "exemplar": false, - "expr": "max(coderd_prebuilt_workspaces_running) or vector(0)", + "expr": "sum(max(coderd_prebuilt_workspaces_running) by (template_name, preset_name)) or vector(0)", "hide": false, "instant": true, "interval": "", @@ -2713,7 +2713,7 @@ data: }, "editorMode": "code", "exemplar": false, - "expr": "max(coderd_prebuilt_workspaces_eligible) or vector(0)", + "expr": "sum(max(coderd_prebuilt_workspaces_eligible) by (template_name, preset_name)) or vector(0)", "hide": false, "instant": true, "interval": "", From a3678c9281e46d26d735e4eadee719998dccb345 Mon Sep 17 00:00:00 2001 From: noratanxz Date: Fri, 16 May 2025 12:47:44 +0100 Subject: [PATCH 30/41] fix: postgres connection string to use db name set in values --- coder-observability/templates/_helpers.tpl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/coder-observability/templates/_helpers.tpl b/coder-observability/templates/_helpers.tpl index a182b3d..12ab859 100644 --- a/coder-observability/templates/_helpers.tpl +++ b/coder-observability/templates/_helpers.tpl @@ -64,9 +64,9 @@ Create the name of the service account to use {{/* Postgres connector string */}} {{- define "postgres-connector-string" -}} {{- if .Values.global.postgres.password -}} -postgresql://{{ .Values.global.postgres.username }}:{{ urlquery .Values.global.postgres.password }}@{{ .Values.global.postgres.hostname }}:{{ .Values.global.postgres.port 
}}/postgres?sslmode={{ .Values.global.postgres.sslmode }} +postgresql://{{ .Values.global.postgres.username }}:{{ urlquery .Values.global.postgres.password }}@{{ .Values.global.postgres.hostname }}:{{ .Values.global.postgres.port }}/{{ .Values.global.postgres.database }}?sslmode={{ .Values.global.postgres.sslmode }} {{- else if .Values.global.postgres.mountSecret -}} -postgresql://{{ .Values.global.postgres.username }}@{{ .Values.global.postgres.hostname }}:{{ .Values.global.postgres.port }}/postgres?sslmode={{ .Values.global.postgres.sslmode }} +postgresql://{{ .Values.global.postgres.username }}@{{ .Values.global.postgres.hostname }}:{{ .Values.global.postgres.port }}/{{ .Values.global.postgres.database }}?sslmode={{ .Values.global.postgres.sslmode }} {{- else -}} {{ fail "either postgres.password or postgres.mountSecret must be defined" }} {{- end -}} From c1d15d74f0791c519cbdeeedc2deb7beeee20919 Mon Sep 17 00:00:00 2001 From: Danny Kopping Date: Fri, 23 May 2025 09:19:38 +0200 Subject: [PATCH 31/41] chore: make lint Signed-off-by: Danny Kopping --- compiled/resources.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/compiled/resources.yaml b/compiled/resources.yaml index 6f4518e..dea0d3e 100644 --- a/compiled/resources.yaml +++ b/compiled/resources.yaml @@ -994,7 +994,7 @@ data: global: target: name: postgres - data_source_name: 'postgresql://coder@localhost:5432/postgres?sslmode=disable' + data_source_name: 'postgresql://coder@localhost:5432/coder?sslmode=disable' collectors: - notify collectors: @@ -12215,7 +12215,7 @@ spec: name: exporter env: - name: DATA_SOURCE_NAME - value: 'postgresql://coder@localhost:5432/postgres?sslmode=disable' + value: 'postgresql://coder@localhost:5432/coder?sslmode=disable' envFrom: - secretRef: name: secret-postgres @@ -12270,7 +12270,7 @@ spec: metadata: annotations: prometheus.io/scrape: 'true' - checksum/config: e12c0044ef2cab3438ffdc8f5e16c24c5acf5ee36dcc8bee77294f27e53ce4a2 + checksum/config: 
71bb9e7579b6e138ae28c623aa29d72025be00387da6c1b8dd5aa168c96ca1e0 labels: app: sql-exporter app.kubernetes.io/name: "database-stats" From ce59115577b2678818a7c04a4bf07b50657f584b Mon Sep 17 00:00:00 2001 From: Danny Kopping Date: Mon, 26 May 2025 13:53:38 +0200 Subject: [PATCH 32/41] chore: upgrade grafana to latest patch (v10.4.19) Signed-off-by: Danny Kopping --- coder-observability/values.yaml | 2 ++ compiled/resources.yaml | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/coder-observability/values.yaml b/coder-observability/values.yaml index f35e12b..653b829 100644 --- a/coder-observability/values.yaml +++ b/coder-observability/values.yaml @@ -275,6 +275,8 @@ grafana-agent: grafana: enabled: true + image: + tag: 10.4.19 fullnameOverride: grafana useStatefulSet: true replicas: 1 diff --git a/compiled/resources.yaml b/compiled/resources.yaml index dea0d3e..aff5679 100644 --- a/compiled/resources.yaml +++ b/compiled/resources.yaml @@ -11247,8 +11247,8 @@ spec: app.kubernetes.io/name: grafana app.kubernetes.io/instance: coder-observability annotations: - checksum/config: 4b5f6512e962f90e1dcdfbecb3713a10f3a998745141a4fc1adfcbb4cff23282 - checksum/dashboards-json-config: 3f59a9bfe9e7e9b7e6ca4ea81afd7bac7a8d78eadb7edbb44be4a327efd1d931 + checksum/config: 2828a490315379b00f2116ebe6a20dd3ca9a4d5ce5839f037c1eb0a4501ecb18 + checksum/dashboards-json-config: 010b57348b6dd1f09007330c03d22a0570022534712646511cad39a9e3cb4bb7 checksum/sc-dashboard-provider-config: 01ba4719c80b6fe911b091a7c05124b64eeece964e09c058ef8f9805daca546b kubectl.kubernetes.io/default-container: grafana spec: From 9f7fb99470f0915ddcfdf1b965aa47c5453a862c Mon Sep 17 00:00:00 2001 From: Danny Kopping Date: Mon, 26 May 2025 14:40:06 +0200 Subject: [PATCH 33/41] chore: make lint Signed-off-by: Danny Kopping --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 616428a..ba5fde2 100644 --- a/README.md +++ b/README.md @@ -381,6 +381,7 @@ values which are 
defined [here](https://github.com/grafana/helm-charts/tree/main | grafana.extraConfigmapMounts[5].name | string | `"dashboards-prebuilds"` | | | grafana.extraConfigmapMounts[5].readOnly | bool | `false` | | | grafana.fullnameOverride | string | `"grafana"` | | +| grafana.image.tag | string | `"10.4.19"` | | | grafana.persistence.enabled | bool | `true` | | | grafana.persistence.size | string | `"10Gi"` | | | grafana.replicas | int | `1` | | From ee6d508d4354eed5ed3806e1c3211fdbb3fb3972 Mon Sep 17 00:00:00 2001 From: noratanxz Date: Wed, 4 Jun 2025 10:49:04 +0100 Subject: [PATCH 34/41] feat: SSL connectivity support --- coder-observability/templates/_helpers.tpl | 8 ++++++-- .../templates/statefulset-postgres-exporter.yaml | 8 +++++++- coder-observability/values.yaml | 13 +++++++++++++ 3 files changed, 26 insertions(+), 3 deletions(-) diff --git a/coder-observability/templates/_helpers.tpl b/coder-observability/templates/_helpers.tpl index 12ab859..f4b455e 100644 --- a/coder-observability/templates/_helpers.tpl +++ b/coder-observability/templates/_helpers.tpl @@ -63,10 +63,14 @@ Create the name of the service account to use {{/* Postgres connector string */}} {{- define "postgres-connector-string" -}} -{{- if .Values.global.postgres.password -}} +{{- if and .Values.global.postgres.password (eq .Values.global.postgres.sslmode "disable") -}} postgresql://{{ .Values.global.postgres.username }}:{{ urlquery .Values.global.postgres.password }}@{{ .Values.global.postgres.hostname }}:{{ .Values.global.postgres.port }}/{{ .Values.global.postgres.database }}?sslmode={{ .Values.global.postgres.sslmode }} -{{- else if .Values.global.postgres.mountSecret -}} +{{- else if and .Values.global.postgres.password (ne .Values.global.postgres.sslmode "disable") -}} +postgresql://{{ .Values.global.postgres.username }}:{{ urlquery .Values.global.postgres.password }}@{{ .Values.global.postgres.hostname }}:{{ .Values.global.postgres.port }}/{{ .Values.global.postgres.database }}?sslmode={{ 
.Values.global.postgres.sslmode }}&sslrootcert={{ .Values.global.postgres.sslrootcert }} +{{- else if and .Values.global.postgres.mountSecret (eq .Values.global.postgres.sslmode "disable") -}} postgresql://{{ .Values.global.postgres.username }}@{{ .Values.global.postgres.hostname }}:{{ .Values.global.postgres.port }}/{{ .Values.global.postgres.database }}?sslmode={{ .Values.global.postgres.sslmode }} +{{- else if and .Values.global.postgres.mountSecret (ne .Values.global.postgres.sslmode "disable") -}} +postgresql://{{ .Values.global.postgres.username }}@{{ .Values.global.postgres.hostname }}:{{ .Values.global.postgres.port }}/{{ .Values.global.postgres.database }}?sslmode={{ .Values.global.postgres.sslmode }}&sslrootcert={{ .Values.global.postgres.sslrootcert }} {{- else -}} {{ fail "either postgres.password or postgres.mountSecret must be defined" }} {{- end -}} diff --git a/coder-observability/templates/statefulset-postgres-exporter.yaml b/coder-observability/templates/statefulset-postgres-exporter.yaml index 229c650..4f33f25 100644 --- a/coder-observability/templates/statefulset-postgres-exporter.yaml +++ b/coder-observability/templates/statefulset-postgres-exporter.yaml @@ -29,4 +29,10 @@ spec: env: - name: DATA_SOURCE_NAME value: '{{ include "postgres-connector-string" . }}' - {{ include "postgres-secret-mount" . | nindent 10 }} \ No newline at end of file + {{ include "postgres-secret-mount" . 
| nindent 10 }} + + volumeMounts: + {{ toYaml .Values.global.postgres.volumeMounts | nindent 12 }} + + volumes: + {{ toYaml .Values.global.postgres.volumes | nindent 8 }} \ No newline at end of file diff --git a/coder-observability/values.yaml b/coder-observability/values.yaml index 653b829..42ac2bc 100644 --- a/coder-observability/values.yaml +++ b/coder-observability/values.yaml @@ -123,11 +123,24 @@ global: password: database: coder sslmode: disable + # add root cert path if using SSL + # sslrootcert: /home/coder/.postgresql/rootcert.pem + # ensure that your secret has a field named `PGPASSWORD` mountSecret: "secret-postgres" exporter: image: "quay.io/prometheuscommunity/postgres-exporter" + volumes: + - name: "pg-certs-mount" + configMap: + name: "pg-certs-mount-config-map" + + volumeMounts: + - name: "pg-certs-mount" + mountPath: "/home/coder/.postgresql" + readOnly: true + # global.postgres.alerts -- alerts for postgres alerts: groups: From 965a534fe76cbd12d758925b31481adc30cf1bb3 Mon Sep 17 00:00:00 2001 From: noratanxz Date: Wed, 4 Jun 2025 10:55:32 +0100 Subject: [PATCH 35/41] chore: make lint --- compiled/resources.yaml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/compiled/resources.yaml b/compiled/resources.yaml index aff5679..503f485 100644 --- a/compiled/resources.yaml +++ b/compiled/resources.yaml @@ -12219,6 +12219,14 @@ spec: envFrom: - secretRef: name: secret-postgres + volumeMounts: + - mountPath: /home/coder/.postgresql + name: pg-certs-mount + readOnly: true + volumes: + - configMap: + name: pg-certs-mount-config-map + name: pg-certs-mount --- # Source: coder-observability/templates/statefulset-runbook-viewer.yaml apiVersion: apps/v1 From b4f803634c02dfeab013add82ff76d32a30a838e Mon Sep 17 00:00:00 2001 From: noratanxz Date: Wed, 4 Jun 2025 10:59:13 +0100 Subject: [PATCH 36/41] chore: uncomment value --- coder-observability/values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/coder-observability/values.yaml 
b/coder-observability/values.yaml index 42ac2bc..c4277d7 100644 --- a/coder-observability/values.yaml +++ b/coder-observability/values.yaml @@ -124,7 +124,7 @@ global: database: coder sslmode: disable # add root cert path if using SSL - # sslrootcert: /home/coder/.postgresql/rootcert.pem + sslrootcert: /home/coder/.postgresql/rootcert.pem # ensure that your secret has a field named `PGPASSWORD` mountSecret: "secret-postgres" From 91e58e22889020db99268a8c2643fe5a415e7cd2 Mon Sep 17 00:00:00 2001 From: Eric Date: Wed, 4 Jun 2025 11:29:36 +0000 Subject: [PATCH 37/41] update readme --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index ba5fde2..1edc211 100644 --- a/README.md +++ b/README.md @@ -258,7 +258,7 @@ values which are defined [here](https://github.com/grafana/helm-charts/tree/main | global.dashboards.timerange | string | `"12h"` | how far back dashboards should look | | global.externalScheme | string | `"http"` | | | global.externalZone | string | `"svc.cluster.local"` | | -| global.postgres | object | `{"alerts":{"groups":{"Basic":{"delay":"1m","enabled":true},"Connections":{"delay":"5m","enabled":true,"thresholds":{"critical":0.9,"notify":0.5,"warning":0.8}},"Notifications":{"delay":"15m","enabled":true,"thresholds":{"critical":0.9,"notify":0.5,"warning":0.8}}}},"database":"coder","exporter":{"image":"quay.io/prometheuscommunity/postgres-exporter"},"hostname":"localhost","mountSecret":"secret-postgres","password":null,"port":5432,"sslmode":"disable","username":"coder"}` | postgres connection information NOTE: these settings are global so we can parameterise some values which get rendered by subcharts | +| global.postgres | object | 
`{"alerts":{"groups":{"Basic":{"delay":"1m","enabled":true},"Connections":{"delay":"5m","enabled":true,"thresholds":{"critical":0.9,"notify":0.5,"warning":0.8}},"Notifications":{"delay":"15m","enabled":true,"thresholds":{"critical":0.9,"notify":0.5,"warning":0.8}}}},"database":"coder","exporter":{"image":"quay.io/prometheuscommunity/postgres-exporter"},"hostname":"localhost","mountSecret":"secret-postgres","password":null,"port":5432,"sslmode":"disable","sslrootcert":"/home/coder/.postgresql/rootcert.pem","username":"coder","volumeMounts":[{"mountPath":"/home/coder/.postgresql","name":"pg-certs-mount","readOnly":true}],"volumes":[{"configMap":{"name":"pg-certs-mount-config-map"},"name":"pg-certs-mount"}]}` | postgres connection information NOTE: these settings are global so we can parameterise some values which get rendered by subcharts | | global.postgres.alerts | object | `{"groups":{"Basic":{"delay":"1m","enabled":true},"Connections":{"delay":"5m","enabled":true,"thresholds":{"critical":0.9,"notify":0.5,"warning":0.8}},"Notifications":{"delay":"15m","enabled":true,"thresholds":{"critical":0.9,"notify":0.5,"warning":0.8}}}}` | alerts for postgres | | global.telemetry | object | `{"metrics":{"scrape_interval":"15s","scrape_timeout":"12s"}}` | control telemetry collection | | global.telemetry.metrics | object | `{"scrape_interval":"15s","scrape_timeout":"12s"}` | control metric collection | From 847beb3af40ac92f4a501f4d1a46a28e4b36404f Mon Sep 17 00:00:00 2001 From: Danny Kopping Date: Thu, 5 Jun 2025 09:28:44 +0200 Subject: [PATCH 38/41] Add CODEOWNERS Signed-off-by: Danny Kopping --- CODEOWNERS | 1 + 1 file changed, 1 insertion(+) create mode 100644 CODEOWNERS diff --git a/CODEOWNERS b/CODEOWNERS new file mode 100644 index 0000000..4521512 --- /dev/null +++ b/CODEOWNERS @@ -0,0 +1 @@ +* @dannykopping \ No newline at end of file From 6db14cffa7248050469002721825e3ba903669c1 Mon Sep 17 00:00:00 2001 From: "blink-so[bot]" 
<211532188+blink-so[bot]@users.noreply.github.com> Date: Thu, 5 Jun 2025 19:11:37 +0000 Subject: [PATCH 39/41] feat: make postgres SSL certificate paths configurable - Remove hardcoded /home/coder/.postgresql paths - Make sslrootcert, volumes, and volumeMounts configurable with no defaults - Add conditional rendering to avoid empty volume mounts - Update postgres connector string to handle optional sslrootcert - Update README documentation Fixes hardcoded certificate paths introduced in PR #40 --- README.md | 2 +- coder-observability/templates/_helpers.tpl | 8 ++++++ .../statefulset-postgres-exporter.yaml | 8 +++--- coder-observability/values.yaml | 27 +++++++++++-------- 4 files changed, 30 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index 1edc211..1a80c26 100644 --- a/README.md +++ b/README.md @@ -258,7 +258,7 @@ values which are defined [here](https://github.com/grafana/helm-charts/tree/main | global.dashboards.timerange | string | `"12h"` | how far back dashboards should look | | global.externalScheme | string | `"http"` | | | global.externalZone | string | `"svc.cluster.local"` | | -| global.postgres | object | `{"alerts":{"groups":{"Basic":{"delay":"1m","enabled":true},"Connections":{"delay":"5m","enabled":true,"thresholds":{"critical":0.9,"notify":0.5,"warning":0.8}},"Notifications":{"delay":"15m","enabled":true,"thresholds":{"critical":0.9,"notify":0.5,"warning":0.8}}}},"database":"coder","exporter":{"image":"quay.io/prometheuscommunity/postgres-exporter"},"hostname":"localhost","mountSecret":"secret-postgres","password":null,"port":5432,"sslmode":"disable","sslrootcert":"/home/coder/.postgresql/rootcert.pem","username":"coder","volumeMounts":[{"mountPath":"/home/coder/.postgresql","name":"pg-certs-mount","readOnly":true}],"volumes":[{"configMap":{"name":"pg-certs-mount-config-map"},"name":"pg-certs-mount"}]}` | postgres connection information NOTE: these settings are global so we can parameterise some values which get rendered by 
subcharts | +| global.postgres | object | `{"alerts":{"groups":{"Basic":{"delay":"1m","enabled":true},"Connections":{"delay":"5m","enabled":true,"thresholds":{"critical":0.9,"notify":0.5,"warning":0.8}},"Notifications":{"delay":"15m","enabled":true,"thresholds":{"critical":0.9,"notify":0.5,"warning":0.8}}}},"database":"coder","exporter":{"image":"quay.io/prometheuscommunity/postgres-exporter"},"hostname":"localhost","mountSecret":"secret-postgres","password":null,"port":5432,"sslmode":"disable","sslrootcert":null,"username":"coder","volumeMounts":[],"volumes":[]}` | postgres connection information NOTE: these settings are global so we can parameterise some values which get rendered by subcharts | | global.postgres.alerts | object | `{"groups":{"Basic":{"delay":"1m","enabled":true},"Connections":{"delay":"5m","enabled":true,"thresholds":{"critical":0.9,"notify":0.5,"warning":0.8}},"Notifications":{"delay":"15m","enabled":true,"thresholds":{"critical":0.9,"notify":0.5,"warning":0.8}}}}` | alerts for postgres | | global.telemetry | object | `{"metrics":{"scrape_interval":"15s","scrape_timeout":"12s"}}` | control telemetry collection | | global.telemetry.metrics | object | `{"scrape_interval":"15s","scrape_timeout":"12s"}` | control metric collection | diff --git a/coder-observability/templates/_helpers.tpl b/coder-observability/templates/_helpers.tpl index f4b455e..0d8578d 100644 --- a/coder-observability/templates/_helpers.tpl +++ b/coder-observability/templates/_helpers.tpl @@ -66,12 +66,20 @@ Create the name of the service account to use {{- if and .Values.global.postgres.password (eq .Values.global.postgres.sslmode "disable") -}} postgresql://{{ .Values.global.postgres.username }}:{{ urlquery .Values.global.postgres.password }}@{{ .Values.global.postgres.hostname }}:{{ .Values.global.postgres.port }}/{{ .Values.global.postgres.database }}?sslmode={{ .Values.global.postgres.sslmode }} {{- else if and .Values.global.postgres.password (ne 
.Values.global.postgres.sslmode "disable") -}} +{{- if .Values.global.postgres.sslrootcert -}} postgresql://{{ .Values.global.postgres.username }}:{{ urlquery .Values.global.postgres.password }}@{{ .Values.global.postgres.hostname }}:{{ .Values.global.postgres.port }}/{{ .Values.global.postgres.database }}?sslmode={{ .Values.global.postgres.sslmode }}&sslrootcert={{ .Values.global.postgres.sslrootcert }} +{{- else -}} +postgresql://{{ .Values.global.postgres.username }}:{{ urlquery .Values.global.postgres.password }}@{{ .Values.global.postgres.hostname }}:{{ .Values.global.postgres.port }}/{{ .Values.global.postgres.database }}?sslmode={{ .Values.global.postgres.sslmode }} +{{- end -}} {{- else if and .Values.global.postgres.mountSecret (eq .Values.global.postgres.sslmode "disable") -}} postgresql://{{ .Values.global.postgres.username }}@{{ .Values.global.postgres.hostname }}:{{ .Values.global.postgres.port }}/{{ .Values.global.postgres.database }}?sslmode={{ .Values.global.postgres.sslmode }} {{- else if and .Values.global.postgres.mountSecret (ne .Values.global.postgres.sslmode "disable") -}} +{{- if .Values.global.postgres.sslrootcert -}} postgresql://{{ .Values.global.postgres.username }}@{{ .Values.global.postgres.hostname }}:{{ .Values.global.postgres.port }}/{{ .Values.global.postgres.database }}?sslmode={{ .Values.global.postgres.sslmode }}&sslrootcert={{ .Values.global.postgres.sslrootcert }} {{- else -}} +postgresql://{{ .Values.global.postgres.username }}@{{ .Values.global.postgres.hostname }}:{{ .Values.global.postgres.port }}/{{ .Values.global.postgres.database }}?sslmode={{ .Values.global.postgres.sslmode }} +{{- end -}} +{{- else -}} {{ fail "either postgres.password or postgres.mountSecret must be defined" }} {{- end -}} {{- end }} diff --git a/coder-observability/templates/statefulset-postgres-exporter.yaml b/coder-observability/templates/statefulset-postgres-exporter.yaml index 4f33f25..a1f6e55 100644 --- 
a/coder-observability/templates/statefulset-postgres-exporter.yaml +++ b/coder-observability/templates/statefulset-postgres-exporter.yaml @@ -30,9 +30,11 @@ spec: - name: DATA_SOURCE_NAME value: '{{ include "postgres-connector-string" . }}' {{ include "postgres-secret-mount" . | nindent 10 }} - + {{- if .Values.global.postgres.volumeMounts }} volumeMounts: {{ toYaml .Values.global.postgres.volumeMounts | nindent 12 }} - + {{- end }} + {{- if .Values.global.postgres.volumes }} volumes: - {{ toYaml .Values.global.postgres.volumes | nindent 8 }} \ No newline at end of file + {{ toYaml .Values.global.postgres.volumes | nindent 8 }} + {{- end }} \ No newline at end of file diff --git a/coder-observability/values.yaml b/coder-observability/values.yaml index c4277d7..0413cce 100644 --- a/coder-observability/values.yaml +++ b/coder-observability/values.yaml @@ -123,23 +123,28 @@ global: password: database: coder sslmode: disable - # add root cert path if using SSL - sslrootcert: /home/coder/.postgresql/rootcert.pem + # SSL root certificate path - only required when sslmode != "disable" + # Example: /path/to/certs/rootcert.pem + sslrootcert: # ensure that your secret has a field named `PGPASSWORD` mountSecret: "secret-postgres" exporter: image: "quay.io/prometheuscommunity/postgres-exporter" - volumes: - - name: "pg-certs-mount" - configMap: - name: "pg-certs-mount-config-map" - - volumeMounts: - - name: "pg-certs-mount" - mountPath: "/home/coder/.postgresql" - readOnly: true + # volumes and volumeMounts for SSL certificates + # Only required when using SSL connections (sslmode != "disable") + # Example configuration: + # volumes: + # - name: "pg-certs-mount" + # configMap: + # name: "pg-certs-mount-config-map" + # volumeMounts: + # - name: "pg-certs-mount" + # mountPath: "/path/to/certs" + # readOnly: true + volumes: [] + volumeMounts: [] # global.postgres.alerts -- alerts for postgres alerts: From 6e8c46d38973c550bc4286165cf7a70e707313df Mon Sep 17 00:00:00 2001 From: 
"blink-so[bot]" <211532188+blink-so[bot]@users.noreply.github.com> Date: Thu, 5 Jun 2025 19:15:05 +0000 Subject: [PATCH 40/41] refactor: make comments more brief in values.yaml - Remove example configuration comments - Keep only essential comments for SSL certificate configuration --- coder-observability/values.yaml | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/coder-observability/values.yaml b/coder-observability/values.yaml index 0413cce..6d06981 100644 --- a/coder-observability/values.yaml +++ b/coder-observability/values.yaml @@ -124,7 +124,6 @@ global: database: coder sslmode: disable # SSL root certificate path - only required when sslmode != "disable" - # Example: /path/to/certs/rootcert.pem sslrootcert: # ensure that your secret has a field named `PGPASSWORD` @@ -133,16 +132,6 @@ global: image: "quay.io/prometheuscommunity/postgres-exporter" # volumes and volumeMounts for SSL certificates - # Only required when using SSL connections (sslmode != "disable") - # Example configuration: - # volumes: - # - name: "pg-certs-mount" - # configMap: - # name: "pg-certs-mount-config-map" - # volumeMounts: - # - name: "pg-certs-mount" - # mountPath: "/path/to/certs" - # readOnly: true volumes: [] volumeMounts: [] From 3560b220cdb8362a4033bd83830018dc227e0bb5 Mon Sep 17 00:00:00 2001 From: Eric Date: Thu, 5 Jun 2025 19:49:59 +0000 Subject: [PATCH 41/41] update compiled resources.yaml --- compiled/resources.yaml | 8 -------- 1 file changed, 8 deletions(-) diff --git a/compiled/resources.yaml b/compiled/resources.yaml index 503f485..aff5679 100644 --- a/compiled/resources.yaml +++ b/compiled/resources.yaml @@ -12219,14 +12219,6 @@ spec: envFrom: - secretRef: name: secret-postgres - volumeMounts: - - mountPath: /home/coder/.postgresql - name: pg-certs-mount - readOnly: true - volumes: - - configMap: - name: pg-certs-mount-config-map - name: pg-certs-mount --- # Source: coder-observability/templates/statefulset-runbook-viewer.yaml apiVersion: apps/v1