Commit 1e8cc2c

feat: scaletest: scale down nodegroups by default (#8276)
* feat: allow scaling down scaletest environments
* fix buggy namespace deletion
* misc fixes to scaletest.sh
* make namespace destruction a no-op, as the cluster will be gone anyway
1 parent a6bd85d commit 1e8cc2c

File tree

5 files changed: 109 additions & 25 deletions


scaletest/scaletest.sh

Lines changed: 31 additions & 7 deletions
@@ -18,23 +18,28 @@ SCALETEST_CODER_LICENSE="${SCALETEST_CODER_LICENSE:-}"
 SCALETEST_SKIP_CLEANUP="${SCALETEST_SKIP_CLEANUP:-0}"
 SCALETEST_CREATE_CONCURRENCY="${SCALETEST_CREATE_CONCURRENCY:-10}"
 SCALETEST_TRAFFIC_BYTES_PER_TICK="${SCALETEST_TRAFFIC_BYTES_PER_TICK:-1024}"
-SCALETEST_TRAFFIC_TICK_INTERVAL="${SCALETEST_TRAFFIC_TICK_INTERVAL:-10}"
+SCALETEST_TRAFFIC_TICK_INTERVAL="${SCALETEST_TRAFFIC_TICK_INTERVAL:-10s}"
+SCALETEST_DESTROY="${SCALETEST_DESTROY:-0}"
 
 script_name=$(basename "$0")
-args="$(getopt -o "" -l create-concurrency:,dry-run,help,name:,num-workspaces:,project:,scenario:,skip-cleanup,traffic-bytes-per-tick:,traffic-tick-interval:, -- "$@")"
+args="$(getopt -o "" -l create-concurrency:,destroy,dry-run,help,name:,num-workspaces:,project:,scenario:,skip-cleanup,traffic-bytes-per-tick:,traffic-tick-interval:, -- "$@")"
 eval set -- "$args"
 while true; do
   case "$1" in
   --create-concurrency)
     SCALETEST_CREATE_CONCURRENCY="$2"
     shift 2
     ;;
+  --destroy)
+    SCALETEST_DESTROY=1
+    shift
+    ;;
   --dry-run)
     DRY_RUN=1
     shift
     ;;
   --help)
-    echo "Usage: $script_name --name <name> --project <project> --num-workspaces <num-workspaces> --scenario <scenario> [--dry-run] [--skip-cleanup] [--create-concurrency=<create-concurrency>]"
+    echo "Usage: $script_name --name <name> --project <project> --num-workspaces <num-workspaces> --scenario <scenario> [--create-concurrency <create-concurrency>] [--destroy] [--dry-run] [--skip-cleanup] [--traffic-bytes-per-tick <number>] [--traffic-tick-interval <duration>]"
     exit 1
     ;;
   --name)

@@ -142,7 +147,7 @@ echo "Initializing terraform."
 maybedryrun "$DRY_RUN" terraform init
 
 echo "Setting up infrastructure."
-maybedryrun "$DRY_RUN" terraform apply --var-file="${SCALETEST_SCENARIO_VARS}" --var-file="${SCALETEST_SECRETS}" --auto-approve
+maybedryrun "$DRY_RUN" terraform apply --var-file="${SCALETEST_SCENARIO_VARS}" --var-file="${SCALETEST_SECRETS}" --var state=started --auto-approve
 
 if [[ "${DRY_RUN}" != 1 ]]; then
   SCALETEST_CODER_URL=$(<"${CONFIG_DIR}/url")

@@ -151,7 +156,21 @@ else
 fi
 KUBECONFIG="${PROJECT_ROOT}/scaletest/.coderv2/${SCALETEST_NAME}-cluster.kubeconfig"
 echo "Waiting for Coder deployment at ${SCALETEST_CODER_URL} to become ready"
-maybedryrun "$DRY_RUN" kubectl --kubeconfig="${KUBECONFIG}" -n "coder-${SCALETEST_NAME}" rollout status deployment/coder
+max_attempts=10
+for attempt in $(seq 1 $max_attempts); do
+  maybedryrun "$DRY_RUN" curl --silent --fail --output /dev/null "${SCALETEST_CODER_URL}/api/v2/buildinfo"
+  curl_status=$?
+  if [[ $curl_status -eq 0 ]]; then
+    break
+  fi
+  if [[ $attempt -eq $max_attempts ]]; then
+    echo
+    echo "Coder deployment failed to become ready in time!"
+    exit 1
+  fi
+  echo "Coder deployment not ready yet (${attempt}/${max_attempts}), sleeping 3 seconds"
+  maybedryrun "$DRY_RUN" sleep 3
+done
 
 echo "Initializing Coder deployment."
 DRY_RUN="$DRY_RUN" "${PROJECT_ROOT}/scaletest/lib/coder_init.sh" "${SCALETEST_CODER_URL}"

@@ -212,5 +231,10 @@ if [[ "${SCALETEST_SKIP_CLEANUP}" == 1 ]]; then
   exit 0
 fi
 
-echo "Cleaning up"
-maybedryrun "$DRY_RUN" terraform destroy --var-file="${SCALETEST_SCENARIO_VARS}" --var-file="${SCALETEST_SECRETS}" --auto-approve
+if [[ "${SCALETEST_DESTROY}" == 1 ]]; then
+  echo "Destroying infrastructure"
+  maybedryrun "$DRY_RUN" terraform destroy --var-file="${SCALETEST_SCENARIO_VARS}" --var-file="${SCALETEST_SECRETS}" --auto-approve
+else
+  echo "Scaling down infrastructure"
+  maybedryrun "$DRY_RUN" terraform apply --var-file="${SCALETEST_SCENARIO_VARS}" --var-file="${SCALETEST_SECRETS}" --var state=stopped --auto-approve
+fi
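
In short, after this change a finished run scales the node pools down to zero by default and tears everything down only on request. A usage sketch (the name, project, and scenario values below are made-up placeholders):

```shell
# Default: run the scaletest, then scale the node pools down to zero,
# keeping the cluster and Terraform state around for the next run.
./scaletest/scaletest.sh \
  --name test1 \
  --project my-gcp-project \
  --num-workspaces 10 \
  --scenario small \
  --traffic-bytes-per-tick 1024 \
  --traffic-tick-interval 10s

# Pass --destroy (or export SCALETEST_DESTROY=1) to destroy the
# infrastructure outright instead of scaling it down.
./scaletest/scaletest.sh --destroy \
  --name test1 \
  --project my-gcp-project \
  --num-workspaces 10 \
  --scenario small
```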

scaletest/terraform/coder.tf

Lines changed: 19 additions & 6 deletions
@@ -25,13 +25,24 @@ provider "helm" {
   }
 }
 
-resource "kubernetes_namespace" "coder_namespace" {
-  metadata {
-    name = local.coder_namespace
+resource "null_resource" "coder_namespace" {
+  triggers = {
+    namespace       = local.coder_namespace
+    kubeconfig_path = local.cluster_kubeconfig_path
   }
   depends_on = [
     google_container_node_pool.coder
   ]
+  provisioner "local-exec" {
+    when    = create
+    command = <<EOF
+KUBECONFIG=${self.triggers.kubeconfig_path} kubectl create namespace ${self.triggers.namespace}
+EOF
+  }
+  provisioner "local-exec" {
+    when    = destroy
+    command = "true"
+  }
 }
 
 resource "random_password" "coder-postgres-password" {

@@ -46,8 +57,9 @@ resource "kubernetes_secret" "coder-db" {
   type = "" # Opaque
   metadata {
     name      = "coder-db-url"
-    namespace = kubernetes_namespace.coder_namespace.metadata.0.name
+    namespace = local.coder_namespace
   }
+  depends_on = [null_resource.coder_namespace]
   data = {
     url = "postgres://${google_sql_user.coder.name}:${urlencode(random_password.coder-postgres-password.result)}@${google_sql_database_instance.db.private_ip_address}/${google_sql_database.coder.name}?sslmode=disable"
   }

@@ -58,9 +70,10 @@ resource "helm_release" "coder-chart" {
   chart     = local.coder_helm_chart
   name      = local.coder_release_name
   version   = var.coder_chart_version
-  namespace = kubernetes_namespace.coder_namespace.metadata.0.name
+  namespace = local.coder_namespace
   depends_on = [
     google_container_node_pool.coder,
+    null_resource.coder_namespace
   ]
   values = [<<EOF
 coder:

@@ -176,7 +189,7 @@ resource "local_file" "kubernetes_template" {
   count = data.coder_workspace.me.start_count
   metadata {
     name      = "coder-$${lower(data.coder_workspace.me.owner)}-$${lower(data.coder_workspace.me.name)}"
-    namespace = "${kubernetes_namespace.coder_namespace.metadata.0.name}"
+    namespace = "${local.coder_namespace}"
     labels = {
       "app.kubernetes.io/name"     = "coder-workspace"
       "app.kubernetes.io/instance" = "coder-workspace-$${lower(data.coder_workspace.me.owner)}-$${lower(data.coder_workspace.me.name)}"

scaletest/terraform/gcp_cluster.tf

Lines changed: 29 additions & 3 deletions
@@ -2,6 +2,10 @@ data "google_compute_default_service_account" "default" {
   project = var.project_id
 }
 
+locals {
+  cluster_kubeconfig_path = "${abspath(path.module)}/../.coderv2/${var.name}-cluster.kubeconfig"
+}
+
 resource "google_container_cluster" "primary" {
   name     = var.name
   location = var.zone

@@ -40,7 +44,7 @@ resource "google_container_node_pool" "coder" {
   location   = var.zone
   project    = var.project_id
   cluster    = google_container_cluster.primary.name
-  node_count = var.nodepool_size_coder
+  node_count = var.state == "stopped" ? 0 : var.nodepool_size_coder
   node_config {
     oauth_scopes = [
       "https://www.googleapis.com/auth/logging.write",

@@ -70,7 +74,7 @@ resource "google_container_node_pool" "workspaces" {
   location   = var.zone
   project    = var.project_id
   cluster    = google_container_cluster.primary.name
-  node_count = var.nodepool_size_workspaces
+  node_count = var.state == "stopped" ? 0 : var.nodepool_size_workspaces
   node_config {
     oauth_scopes = [
       "https://www.googleapis.com/auth/logging.write",

@@ -100,7 +104,7 @@ resource "google_container_node_pool" "misc" {
   location   = var.zone
   project    = var.project_id
   cluster    = google_container_cluster.primary.name
-  node_count = var.nodepool_size_misc
+  node_count = var.state == "stopped" ? 0 : var.nodepool_size_misc
   node_config {
     oauth_scopes = [
       "https://www.googleapis.com/auth/logging.write",

@@ -124,3 +128,25 @@ resource "google_container_node_pool" "misc" {
     }
   }
 }
+
+resource "null_resource" "cluster_kubeconfig" {
+  depends_on = [google_container_cluster.primary]
+  triggers = {
+    path       = local.cluster_kubeconfig_path
+    name       = google_container_cluster.primary.name
+    project_id = var.project_id
+    zone       = var.zone
+  }
+  provisioner "local-exec" {
+    command = <<EOF
+KUBECONFIG=${self.triggers.path} gcloud container clusters get-credentials ${self.triggers.name} --project=${self.triggers.project_id} --zone=${self.triggers.zone}
+EOF
+  }
+
+  provisioner "local-exec" {
+    when    = destroy
+    command = <<EOF
+rm -f ${self.triggers.path}
+EOF
+  }
+}
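
The new `state` variable gates each pool's `node_count`, so flipping it between `started` and `stopped` scales the pools without recreating the cluster. This is roughly what scaletest.sh now runs for setup and teardown (the `--var-file` names below stand in for `${SCALETEST_SCENARIO_VARS}` and `${SCALETEST_SECRETS}`):

```shell
# Scale all node pools down to zero while keeping the cluster,
# database, and Terraform state intact (the new default teardown):
terraform apply --var-file=scenario.tfvars --var-file=secrets.tfvars \
  --var state=stopped --auto-approve

# Scale back up for the next run:
terraform apply --var-file=scenario.tfvars --var-file=secrets.tfvars \
  --var state=started --auto-approve
```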

scaletest/terraform/prometheus.tf

Lines changed: 21 additions & 9 deletions
@@ -10,13 +10,24 @@ locals {
 }
 
 # Create a namespace to hold our Prometheus deployment.
-resource "kubernetes_namespace" "prometheus_namespace" {
-  metadata {
-    name = local.prometheus_namespace
+resource "null_resource" "prometheus_namespace" {
+  triggers = {
+    namespace       = local.prometheus_namespace
+    kubeconfig_path = local.cluster_kubeconfig_path
   }
   depends_on = [
     google_container_node_pool.misc
   ]
+  provisioner "local-exec" {
+    when    = create
+    command = <<EOF
+KUBECONFIG=${self.triggers.kubeconfig_path} kubectl create namespace ${self.triggers.namespace}
+EOF
+  }
+  provisioner "local-exec" {
+    when    = destroy
+    command = "true"
+  }
 }
 
 # Create a secret to store the remote write key

@@ -25,7 +36,7 @@ resource "kubernetes_secret" "prometheus-credentials" {
   type = "kubernetes.io/basic-auth"
   metadata {
     name      = "prometheus-credentials"
-    namespace = kubernetes_namespace.prometheus_namespace.metadata.0.name
+    namespace = local.prometheus_namespace
   }
 
   data = {

@@ -39,7 +50,8 @@ resource "helm_release" "prometheus-chart" {
   repository = local.prometheus_helm_repo
   chart      = local.prometheus_helm_chart
   name       = local.prometheus_release_name
-  namespace  = kubernetes_namespace.prometheus_namespace.metadata.0.name
+  namespace  = local.prometheus_namespace
+  depends_on = [null_resource.prometheus_namespace]
   values = [<<EOF
 alertmanager:
   enabled: false

@@ -102,8 +114,9 @@ resource "kubernetes_secret" "prometheus-postgres-password" {
   type = "kubernetes.io/basic-auth"
   metadata {
     name      = "prometheus-postgres"
-    namespace = kubernetes_namespace.prometheus_namespace.metadata.0.name
+    namespace = local.prometheus_namespace
   }
+  depends_on = [null_resource.prometheus_namespace]
   data = {
     username = google_sql_user.prometheus.name
     password = google_sql_user.prometheus.password

@@ -152,7 +165,7 @@ resource "local_file" "coder-monitoring-manifest" {
 apiVersion: monitoring.coreos.com/v1
 kind: PodMonitor
 metadata:
-  namespace: ${kubernetes_namespace.coder_namespace.metadata.0.name}
+  namespace: ${local.prometheus_namespace}
   name: coder-monitoring
 spec:
   selector:

@@ -168,8 +181,7 @@ resource "null_resource" "coder-monitoring-manifest_apply" {
   provisioner "local-exec" {
     working_dir = "${abspath(path.module)}/../.coderv2"
     command     = <<EOF
-KUBECONFIG=${var.name}-cluster.kubeconfig gcloud container clusters get-credentials ${google_container_cluster.primary.name} --project=${var.project_id} --zone=${var.zone} && \
-KUBECONFIG=${var.name}-cluster.kubeconfig kubectl apply -f ${abspath(local_file.coder-monitoring-manifest.filename)}
+KUBECONFIG=${local.cluster_kubeconfig_path} kubectl apply -f ${abspath(local_file.coder-monitoring-manifest.filename)}
 EOF
   }
   depends_on = [helm_release.prometheus-chart]
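
Since `null_resource.cluster_kubeconfig` now writes the kubeconfig as part of cluster creation, applying the monitoring manifest no longer needs its own `gcloud container clusters get-credentials` step. The provisioner reduces to a single kubectl call along these lines (both paths below are illustrative):

```shell
# Apply the PodMonitor manifest using the kubeconfig already written
# to disk by null_resource.cluster_kubeconfig:
KUBECONFIG=scaletest/.coderv2/test1-cluster.kubeconfig \
  kubectl apply -f scaletest/.coderv2/coder-monitoring-manifest.yaml
```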

scaletest/terraform/vars.tf

Lines changed: 9 additions & 0 deletions
@@ -1,3 +1,12 @@
+variable "state" {
+  description = "The state of the cluster. Valid values are 'started' and 'stopped'."
+  validation {
+    condition     = contains(["started", "stopped"], var.state)
+    error_message = "value must be one of 'started' or 'stopped'"
+  }
+  default = "started"
+}
+
 variable "project_id" {
   description = "The project in which to provision resources"
 }
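
The validation block rejects any value other than the two allowed states at plan time. For example (the error text below is approximate):

```shell
terraform plan --var state=paused
# Error: Invalid value for variable
# value must be one of 'started' or 'stopped'
```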
