From c4ff5090866f4739a6c73a3e47ac8ee759859e8a Mon Sep 17 00:00:00 2001 From: Garrett Delfosse Date: Wed, 22 Jan 2025 19:44:39 +0000 Subject: [PATCH 1/5] feat: remote write prometheus data --- scaletest/terraform/action/coder_traffic.tf | 17 +---- scaletest/terraform/action/gcp_clusters.tf | 10 +-- scaletest/terraform/action/gcp_db.tf | 2 +- scaletest/terraform/action/gcp_vpc.tf | 24 ++----- scaletest/terraform/action/prometheus.tf | 78 +++++++++++++++++++++ scaletest/terraform/action/vars.tf | 5 ++ scaletest/terraform/infra/gcp_db.tf | 2 +- scaletest/terraform/infra/gcp_vpc.tf | 4 +- 8 files changed, 100 insertions(+), 42 deletions(-) create mode 100644 scaletest/terraform/action/prometheus.tf diff --git a/scaletest/terraform/action/coder_traffic.tf b/scaletest/terraform/action/coder_traffic.tf index bea829427af82..ebc809273c598 100644 --- a/scaletest/terraform/action/coder_traffic.tf +++ b/scaletest/terraform/action/coder_traffic.tf @@ -5,7 +5,6 @@ locals { traffic_types = { ssh = { - wait_duration = "0m" duration = "30m" job_timeout = "35m" flags = [ @@ -13,13 +12,11 @@ locals { ] } webterminal = { - wait_duration = "5m" duration = "25m" job_timeout = "30m" flags = [] } app = { - wait_duration = "10m" duration = "20m" job_timeout = "25m" flags = [ @@ -39,14 +36,6 @@ resource "time_sleep" "wait_baseline" { create_duration = local.wait_baseline_duration } -resource "time_sleep" "wait_traffic" { - for_each = local.traffic_types - - depends_on = [time_sleep.wait_baseline] - - create_duration = local.traffic_types[each.key].wait_duration -} - resource "kubernetes_job" "workspace_traffic_primary" { provider = kubernetes.primary @@ -106,7 +95,7 @@ resource "kubernetes_job" "workspace_traffic_primary" { create = local.traffic_types[each.key].job_timeout } - depends_on = [time_sleep.wait_baseline, time_sleep.wait_traffic[each.key]] + depends_on = [time_sleep.wait_baseline] } resource "kubernetes_job" "workspace_traffic_europe" { @@ -169,7 +158,7 @@ resource "kubernetes_job" "workspace_traffic_europe" { create = local.traffic_types[each.key].job_timeout } - depends_on = [time_sleep.wait_baseline, time_sleep.wait_traffic[each.key]] + depends_on = [time_sleep.wait_baseline] } resource "kubernetes_job" "workspace_traffic_asia" { @@ -232,5 +221,5 @@ resource "kubernetes_job" "workspace_traffic_asia" { create = local.traffic_types[each.key].job_timeout } - depends_on = [time_sleep.wait_baseline, time_sleep.wait_traffic[each.key]] + depends_on = [time_sleep.wait_baseline] } diff --git a/scaletest/terraform/action/gcp_clusters.tf b/scaletest/terraform/action/gcp_clusters.tf index c41d06c6c1c83..5681ff8b44ce5 100644 --- a/scaletest/terraform/action/gcp_clusters.tf +++ b/scaletest/terraform/action/gcp_clusters.tf @@ -10,21 +10,21 @@ locals { url = "http://${var.name}-scaletest.${var.cloudflare_domain}" region = "us-east1" zone = "us-east1-c" - cidr = "10.200.0.0/24" + subnet = "scaletest" } europe = { subdomain = "${var.name}-europe-scaletest" url = "http://${var.name}-europe-scaletest.${var.cloudflare_domain}" region = "europe-west1" zone = "europe-west1-b" - cidr = "10.201.0.0/24" + subnet = "scaletest" } asia = { subdomain = "${var.name}-asia-scaletest" url = "http://${var.name}-asia-scaletest.${var.cloudflare_domain}" region = "asia-southeast1" zone = "asia-southeast1-a" - cidr = "10.202.0.0/24" + subnet = "scaletest" } } node_pools = { @@ -72,8 +72,8 @@ resource "google_container_cluster" "cluster" { name = "${var.name}-${each.key}" location = each.value.zone project = var.project_id - network = google_compute_network.vpc.name - subnetwork = google_compute_subnetwork.subnet[each.key].name + network = local.vpc_name + subnetwork = local.subnet_name networking_mode = "VPC_NATIVE" default_max_pods_per_node = 256 ip_allocation_policy { # Required with networking_mode=VPC_NATIVE diff --git a/scaletest/terraform/action/gcp_db.tf b/scaletest/terraform/action/gcp_db.tf index 0443fa771fe65..318c1d027f64f 100644 --- a/scaletest/terraform/action/gcp_db.tf +++ b/scaletest/terraform/action/gcp_db.tf @@ -23,7 +23,7 @@ resource "google_sql_database_instance" "db" { ip_configuration { ipv4_enabled = false - private_network = google_compute_network.vpc.id + private_network = "projects/${var.project_id}/global/networks/${local.vpc_name}" } insights_config { diff --git a/scaletest/terraform/action/gcp_vpc.tf b/scaletest/terraform/action/gcp_vpc.tf index c9fd412aa3cb4..4bd29a14008e3 100644 --- a/scaletest/terraform/action/gcp_vpc.tf +++ b/scaletest/terraform/action/gcp_vpc.tf @@ -1,20 +1,6 @@ - -resource "google_compute_network" "vpc" { - project = var.project_id - name = var.name - auto_create_subnetworks = "false" - depends_on = [ - google_project_service.api["compute.googleapis.com"] - ] -} - -resource "google_compute_subnetwork" "subnet" { - for_each = local.deployments - name = "${var.name}-${each.key}" - project = var.project_id - region = each.value.region - network = google_compute_network.vpc.name - ip_cidr_range = each.value.cidr +locals { + vpc_name = "scaletest" + subnet_name = "scaletest" } resource "google_compute_address" "coder" { @@ -32,11 +18,11 @@ resource "google_compute_global_address" "sql_peering" { purpose = "VPC_PEERING" address_type = "INTERNAL" prefix_length = 16 - network = google_compute_network.vpc.id + network = local.vpc_name } resource "google_service_networking_connection" "private_vpc_connection" { - network = google_compute_network.vpc.id + network = local.vpc_name service = "servicenetworking.googleapis.com" reserved_peering_ranges = [google_compute_global_address.sql_peering.name] } diff --git a/scaletest/terraform/action/prometheus.tf b/scaletest/terraform/action/prometheus.tf new file mode 100644 index 0000000000000..4b65230282626 --- /dev/null +++ b/scaletest/terraform/action/prometheus.tf @@ -0,0 +1,78 @@ +locals { + prometheus_helm_repo = "https://charts.bitnami.com/bitnami" + prometheus_helm_chart = "kube-prometheus" + prometheus_exporter_helm_repo = "https://prometheus-community.github.io/helm-charts" + prometheus_exporter_helm_chart = "prometheus-postgres-exporter" + prometheus_release_name = "prometheus" + prometheus_exporter_release_name = "prometheus-postgres-exporter" + prometheus_namespace = "prometheus" + prometheus_remote_write_send_interval = "15s" + prometheus_remote_write_metrics_regex = ".*" +} + +resource "kubernetes_namespace" "prometheus_namespace_primary" { + provider = kubernetes.primary + + metadata { + name = local.prometheus_namespace + } + lifecycle { + ignore_changes = [timeouts, wait_for_default_service_account] + } +} + +resource "helm_release" "prometheus_chart_primary" { + provider = helm.primary + + repository = local.prometheus_helm_repo + chart = local.prometheus_helm_chart + name = local.prometheus_release_name + namespace = kubernetes_namespace.prometheus_namespace_primary.metadata.0.name + values = [< Date: Wed, 22 Jan 2025 23:51:42 +0000 Subject: [PATCH 2/5] use bitnami chart --- scaletest/terraform/action/coder_traffic.tf | 3 + scaletest/terraform/action/gcp_db.tf | 2 +- scaletest/terraform/action/gcp_vpc.tf | 3 +- scaletest/terraform/action/prometheus.tf | 176 +++++++++++++----- .../action/prometheus_helm_values.tftpl | 44 +++++ 5 files changed, 175 insertions(+), 53 deletions(-) create mode 100644 scaletest/terraform/action/prometheus_helm_values.tftpl diff --git a/scaletest/terraform/action/coder_traffic.tf b/scaletest/terraform/action/coder_traffic.tf index ebc809273c598..f51661b7edfde 100644 --- a/scaletest/terraform/action/coder_traffic.tf +++ b/scaletest/terraform/action/coder_traffic.tf @@ -31,6 +31,9 @@ resource "time_sleep" "wait_baseline" { kubernetes_job.create_workspaces_primary, kubernetes_job.create_workspaces_europe, kubernetes_job.create_workspaces_asia, + helm_release.prometheus_chart_primary, + helm_release.prometheus_chart_europe, + helm_release.prometheus_chart_asia, ] create_duration = local.wait_baseline_duration diff --git a/scaletest/terraform/action/gcp_db.tf b/scaletest/terraform/action/gcp_db.tf index 318c1d027f64f..9eb17464e1ce9 100644 --- a/scaletest/terraform/action/gcp_db.tf +++ b/scaletest/terraform/action/gcp_db.tf @@ -23,7 +23,7 @@ resource "google_sql_database_instance" "db" { ip_configuration { ipv4_enabled = false - private_network = "projects/${var.project_id}/global/networks/${local.vpc_name}" + private_network = local.vpc_id } insights_config { diff --git a/scaletest/terraform/action/gcp_vpc.tf b/scaletest/terraform/action/gcp_vpc.tf index 4bd29a14008e3..d0b7c36760c18 100644 --- a/scaletest/terraform/action/gcp_vpc.tf +++ b/scaletest/terraform/action/gcp_vpc.tf @@ -1,5 +1,6 @@ locals { vpc_name = "scaletest" + vpc_id = "projects/${var.project_id}/global/networks/${local.vpc_name}" subnet_name = "scaletest" } @@ -22,7 +23,7 @@ resource "google_compute_global_address" "sql_peering" { } resource "google_service_networking_connection" "private_vpc_connection" { - network = local.vpc_name + network = local.vpc_id service = "servicenetworking.googleapis.com" reserved_peering_ranges = [google_compute_global_address.sql_peering.name] } diff --git a/scaletest/terraform/action/prometheus.tf b/scaletest/terraform/action/prometheus.tf index 4b65230282626..d7c94f9fb384c 100644 --- a/scaletest/terraform/action/prometheus.tf +++ b/scaletest/terraform/action/prometheus.tf @@ -1,10 +1,7 @@ locals { - prometheus_helm_repo = "https://charts.bitnami.com/bitnami" + prometheus_helm_repo = "oci://registry-1.docker.io/bitnamicharts" prometheus_helm_chart = "kube-prometheus" - prometheus_exporter_helm_repo = "https://prometheus-community.github.io/helm-charts" - prometheus_exporter_helm_chart = "prometheus-postgres-exporter" prometheus_release_name = "prometheus" - prometheus_exporter_release_name = "prometheus-postgres-exporter" prometheus_namespace = "prometheus" prometheus_remote_write_send_interval = "15s" prometheus_remote_write_metrics_regex = ".*" @@ -28,51 +25,128 @@ resource "helm_release" "prometheus_chart_primary" { chart = local.prometheus_helm_chart name = local.prometheus_release_name namespace = kubernetes_namespace.prometheus_namespace_primary.metadata.0.name - values = [< Date: Wed, 22 Jan 2025 23:53:55 +0000 Subject: [PATCH 3/5] fix replace --- scaletest/terraform/infra/gcp_db.tf | 2 +- scaletest/terraform/infra/gcp_vpc.tf | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/scaletest/terraform/infra/gcp_db.tf b/scaletest/terraform/infra/gcp_db.tf index 2de7dd1ab2770..4d13b262c615f 100644 --- a/scaletest/terraform/infra/gcp_db.tf +++ b/scaletest/terraform/infra/gcp_db.tf @@ -22,7 +22,7 @@ resource "google_sql_database_instance" "db" { ip_configuration { ipv4_enabled = false - private_network = "scaletest" + private_network = google_compute_network.vpc.id } insights_config { diff --git a/scaletest/terraform/infra/gcp_vpc.tf b/scaletest/terraform/infra/gcp_vpc.tf index 63d63403b3ce5..b125c60cfd25a 100644 --- a/scaletest/terraform/infra/gcp_vpc.tf +++ b/scaletest/terraform/infra/gcp_vpc.tf @@ -21,7 +21,7 @@ resource "google_compute_global_address" "sql_peering" { purpose = "VPC_PEERING" address_type = "INTERNAL" prefix_length = 16 - network = "scaletest" + network = google_compute_network.vpc.id } resource "google_compute_address" "coder" { @@ -33,7 +33,7 @@ resource "google_compute_address" "coder" { } resource "google_service_networking_connection" "private_vpc_connection" { - network = "scaletest" + network = google_compute_network.vpc.id service = "servicenetworking.googleapis.com" reserved_peering_ranges = [google_compute_global_address.sql_peering.name] } From c4937806d9a29a65ee6572a41654e35046ba568d Mon Sep 17 00:00:00 2001 From: Garrett Delfosse Date: Wed, 22 Jan 2025 23:54:11 +0000 Subject: [PATCH 4/5] fmt --- scaletest/terraform/action/coder_traffic.tf | 14 ++++----- scaletest/terraform/action/gcp_vpc.tf | 4 +-- scaletest/terraform/action/prometheus.tf | 32 ++++++++++----------- 3 files changed, 25 insertions(+), 25 deletions(-) diff --git a/scaletest/terraform/action/coder_traffic.tf b/scaletest/terraform/action/coder_traffic.tf index f51661b7edfde..b477f3847a6d6 100644 --- a/scaletest/terraform/action/coder_traffic.tf +++ b/scaletest/terraform/action/coder_traffic.tf @@ -5,20 +5,20 @@ locals { traffic_types = { ssh = { - duration = "30m" - job_timeout = "35m" + duration = "30m" + job_timeout = "35m" flags = [ "--ssh", ] } webterminal = { - duration = "25m" - job_timeout = "30m" - flags = [] + duration = "25m" + job_timeout = "30m" + flags = [] } app = { - duration = "20m" - job_timeout = "25m" + duration = "20m" + job_timeout = "25m" flags = [ "--app=wsec", ] diff --git a/scaletest/terraform/action/gcp_vpc.tf b/scaletest/terraform/action/gcp_vpc.tf index d0b7c36760c18..10624edaddf91 100644 --- a/scaletest/terraform/action/gcp_vpc.tf +++ b/scaletest/terraform/action/gcp_vpc.tf @@ -1,6 +1,6 @@ locals { - vpc_name = "scaletest" - vpc_id = "projects/${var.project_id}/global/networks/${local.vpc_name}" + vpc_name = "scaletest" + vpc_id = "projects/${var.project_id}/global/networks/${local.vpc_name}" subnet_name = "scaletest" } diff --git a/scaletest/terraform/action/prometheus.tf b/scaletest/terraform/action/prometheus.tf index d7c94f9fb384c..8de2b7e90fda4 100644 --- a/scaletest/terraform/action/prometheus.tf +++ b/scaletest/terraform/action/prometheus.tf @@ -1,8 +1,8 @@ locals { - prometheus_helm_repo = "oci://registry-1.docker.io/bitnamicharts" - prometheus_helm_chart = "kube-prometheus" - prometheus_release_name = "prometheus" - prometheus_namespace = "prometheus" + prometheus_helm_repo = "oci://registry-1.docker.io/bitnamicharts" + prometheus_helm_chart = "kube-prometheus" + prometheus_release_name = "prometheus" + prometheus_namespace = "prometheus" prometheus_remote_write_send_interval = "15s" prometheus_remote_write_metrics_regex = ".*" } @@ -26,9 +26,9 @@ resource "helm_release" "prometheus_chart_primary" { name = local.prometheus_release_name namespace = kubernetes_namespace.prometheus_namespace_primary.metadata.0.name values = [templatefile("${path.module}/prometheus_helm_values.tftpl", { - nodepool = google_container_node_pool.node_pool["primary_misc"].name, - cluster = "primary", - prometheus_remote_write_url = var.prometheus_remote_write_url, + nodepool = google_container_node_pool.node_pool["primary_misc"].name, + cluster = "primary", + prometheus_remote_write_url = var.prometheus_remote_write_url, prometheus_remote_write_metrics_regex = local.prometheus_remote_write_metrics_regex, prometheus_remote_write_send_interval = local.prometheus_remote_write_send_interval, })] @@ -52,7 +52,7 @@ spec: interval: 30s YAML - depends_on = [ helm_release.prometheus_chart_primary ] + depends_on = [helm_release.prometheus_chart_primary] } resource "kubernetes_namespace" "prometheus_namespace_europe" { @@ -74,9 +74,9 @@ resource "helm_release" "prometheus_chart_europe" { name = local.prometheus_release_name namespace = kubernetes_namespace.prometheus_namespace_europe.metadata.0.name values = [templatefile("${path.module}/prometheus_helm_values.tftpl", { - nodepool = google_container_node_pool.node_pool["europe_misc"].name, - cluster = "europe", - prometheus_remote_write_url = var.prometheus_remote_write_url, + nodepool = google_container_node_pool.node_pool["europe_misc"].name, + cluster = "europe", + prometheus_remote_write_url = var.prometheus_remote_write_url, prometheus_remote_write_metrics_regex = local.prometheus_remote_write_metrics_regex, prometheus_remote_write_send_interval = local.prometheus_remote_write_send_interval, })] @@ -100,7 +100,7 @@ spec: interval: 30s YAML - depends_on = [ helm_release.prometheus_chart_europe ] + depends_on = [helm_release.prometheus_chart_europe] } resource "kubernetes_namespace" "prometheus_namespace_asia" { @@ -122,9 +122,9 @@ resource "helm_release" "prometheus_chart_asia" { name = local.prometheus_release_name namespace = kubernetes_namespace.prometheus_namespace_asia.metadata.0.name values = [templatefile("${path.module}/prometheus_helm_values.tftpl", { - nodepool = google_container_node_pool.node_pool["asia_misc"].name, - cluster = "asia", - prometheus_remote_write_url = var.prometheus_remote_write_url, + nodepool = google_container_node_pool.node_pool["asia_misc"].name, + cluster = "asia", + prometheus_remote_write_url = var.prometheus_remote_write_url, prometheus_remote_write_metrics_regex = local.prometheus_remote_write_metrics_regex, prometheus_remote_write_send_interval = local.prometheus_remote_write_send_interval, })] @@ -148,5 +148,5 @@ spec: interval: 30s YAML - depends_on = [ helm_release.prometheus_chart_asia ] + depends_on = [helm_release.prometheus_chart_asia] } From c43d65f7803602e566333fd0546dc13830727417 Mon Sep 17 00:00:00 2001 From: Garrett Delfosse Date: Thu, 23 Jan 2025 16:51:52 +0000 Subject: [PATCH 5/5] use community chart --- scaletest/terraform/action/prometheus.tf | 44 +++---------------- .../action/prometheus_helm_values.tftpl | 42 +++++++----------- 2 files changed, 22 insertions(+), 64 deletions(-) diff --git a/scaletest/terraform/action/prometheus.tf b/scaletest/terraform/action/prometheus.tf index 8de2b7e90fda4..de22a5c949684 100644 --- a/scaletest/terraform/action/prometheus.tf +++ b/scaletest/terraform/action/prometheus.tf @@ -1,30 +1,18 @@ locals { - prometheus_helm_repo = "oci://registry-1.docker.io/bitnamicharts" - prometheus_helm_chart = "kube-prometheus" + prometheus_helm_repo = "https://prometheus-community.github.io/helm-charts" + prometheus_helm_chart = "kube-prometheus-stack" prometheus_release_name = "prometheus" - prometheus_namespace = "prometheus" prometheus_remote_write_send_interval = "15s" prometheus_remote_write_metrics_regex = ".*" } -resource "kubernetes_namespace" "prometheus_namespace_primary" { - provider = kubernetes.primary - - metadata { - name = local.prometheus_namespace - } - lifecycle { - ignore_changes = [timeouts, wait_for_default_service_account] - } -} - resource "helm_release" "prometheus_chart_primary" { provider = helm.primary repository = local.prometheus_helm_repo chart = local.prometheus_helm_chart name = local.prometheus_release_name - namespace = kubernetes_namespace.prometheus_namespace_primary.metadata.0.name + namespace = kubernetes_namespace.coder_primary.metadata.0.name values = [templatefile("${path.module}/prometheus_helm_values.tftpl", { nodepool = google_container_node_pool.node_pool["primary_misc"].name, cluster = "primary", @@ -55,24 +43,13 @@ YAML depends_on = [helm_release.prometheus_chart_primary] } -resource "kubernetes_namespace" "prometheus_namespace_europe" { - provider = kubernetes.europe - - metadata { - name = local.prometheus_namespace - } - lifecycle { - ignore_changes = [timeouts, wait_for_default_service_account] - } -} - resource "helm_release" "prometheus_chart_europe" { provider = helm.europe repository = local.prometheus_helm_repo chart = local.prometheus_helm_chart name = local.prometheus_release_name - namespace = kubernetes_namespace.prometheus_namespace_europe.metadata.0.name + namespace = kubernetes_namespace.coder_europe.metadata.0.name values = [templatefile("${path.module}/prometheus_helm_values.tftpl", { nodepool = google_container_node_pool.node_pool["europe_misc"].name, cluster = "europe", @@ -103,24 +80,13 @@ YAML depends_on = [helm_release.prometheus_chart_europe] } -resource "kubernetes_namespace" "prometheus_namespace_asia" { - provider = kubernetes.asia - - metadata { - name = local.prometheus_namespace - } - lifecycle { - ignore_changes = [timeouts, wait_for_default_service_account] - } -} - resource "helm_release" "prometheus_chart_asia" { provider = helm.asia repository = local.prometheus_helm_repo chart = local.prometheus_helm_chart name = local.prometheus_release_name - namespace = kubernetes_namespace.prometheus_namespace_asia.metadata.0.name + namespace = kubernetes_namespace.coder_asia.metadata.0.name values = [templatefile("${path.module}/prometheus_helm_values.tftpl", { nodepool = google_container_node_pool.node_pool["asia_misc"].name, cluster = "asia", diff --git a/scaletest/terraform/action/prometheus_helm_values.tftpl b/scaletest/terraform/action/prometheus_helm_values.tftpl index 4729aa5d7103d..1db6d1e2c9bef 100644 --- a/scaletest/terraform/action/prometheus_helm_values.tftpl +++ b/scaletest/terraform/action/prometheus_helm_values.tftpl @@ -1,15 +1,8 @@ alertmanager: enabled: false -blackboxExporter: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: "cloud.google.com/gke-nodepool" - operator: "In" - values: ["${nodepool}"] -operator: +grafana: + enabled: false +prometheusOperator: affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: @@ -27,18 +20,17 @@ prometheus: - key: "cloud.google.com/gke-nodepool" operator: "In" values: ["${nodepool}"] - externalLabels: - cluster: "${cluster}" - persistence: - enabled: true - storageClass: standard - remoteWrite: - - url: "${prometheus_remote_write_url}" - tlsConfig: - insecureSkipVerify: true - writeRelabelConfigs: - - sourceLabels: [__name__] - regex: "${prometheus_remote_write_metrics_regex}" - action: keep - metadataConfig: - sendInterval: "${prometheus_remote_write_send_interval}" + prometheusSpec: + externalLabels: + cluster: "${cluster}" + podMonitorSelectorNilUsesHelmValues: false + remoteWrite: + - url: "${prometheus_remote_write_url}" + tlsConfig: + insecureSkipVerify: true + writeRelabelConfigs: + - sourceLabels: [__name__] + regex: "${prometheus_remote_write_metrics_regex}" + action: keep + metadataConfig: + sendInterval: "${prometheus_remote_write_send_interval}"