feat: scaletest: scale down nodegroups by default #8276

Merged (4 commits, Jun 30, 2023)
38 changes: 31 additions & 7 deletions scaletest/scaletest.sh
@@ -18,23 +18,28 @@ SCALETEST_CODER_LICENSE="${SCALETEST_CODER_LICENSE:-}"
SCALETEST_SKIP_CLEANUP="${SCALETEST_SKIP_CLEANUP:-0}"
SCALETEST_CREATE_CONCURRENCY="${SCALETEST_CREATE_CONCURRENCY:-10}"
SCALETEST_TRAFFIC_BYTES_PER_TICK="${SCALETEST_TRAFFIC_BYTES_PER_TICK:-1024}"
SCALETEST_TRAFFIC_TICK_INTERVAL="${SCALETEST_TRAFFIC_TICK_INTERVAL:-10}"
SCALETEST_TRAFFIC_TICK_INTERVAL="${SCALETEST_TRAFFIC_TICK_INTERVAL:-10s}"
SCALETEST_DESTROY="${SCALETEST_DESTROY:-0}"

script_name=$(basename "$0")
args="$(getopt -o "" -l create-concurrency:,dry-run,help,name:,num-workspaces:,project:,scenario:,skip-cleanup,traffic-bytes-per-tick:,traffic-tick-interval:, -- "$@")"
args="$(getopt -o "" -l create-concurrency:,destroy,dry-run,help,name:,num-workspaces:,project:,scenario:,skip-cleanup,traffic-bytes-per-tick:,traffic-tick-interval:, -- "$@")"
eval set -- "$args"
while true; do
case "$1" in
--create-concurrency)
SCALETEST_CREATE_CONCURRENCY="$2"
shift 2
;;
--destroy)
SCALETEST_DESTROY=1
shift
;;
--dry-run)
DRY_RUN=1
shift
;;
--help)
echo "Usage: $script_name --name <name> --project <project> --num-workspaces <num-workspaces> --scenario <scenario> [--dry-run] [--skip-cleanup] [--create-concurrency=<create-concurrency>]"
echo "Usage: $script_name --name <name> --project <project> --num-workspaces <num-workspaces> --scenario <scenario> [--create-concurrency <create-concurrency>] [--destroy] [--dry-run] [--skip-cleanup] [--traffic-bytes-per-tick <number>] [--traffic-tick-interval <duration>]"
exit 1
;;
--name)
@@ -142,7 +147,7 @@ echo "Initializing terraform."
maybedryrun "$DRY_RUN" terraform init

echo "Setting up infrastructure."
maybedryrun "$DRY_RUN" terraform apply --var-file="${SCALETEST_SCENARIO_VARS}" --var-file="${SCALETEST_SECRETS}" --auto-approve
maybedryrun "$DRY_RUN" terraform apply --var-file="${SCALETEST_SCENARIO_VARS}" --var-file="${SCALETEST_SECRETS}" --var state=started --auto-approve

if [[ "${DRY_RUN}" != 1 ]]; then
SCALETEST_CODER_URL=$(<"${CONFIG_DIR}/url")
@@ -151,7 +156,21 @@ else
fi
KUBECONFIG="${PROJECT_ROOT}/scaletest/.coderv2/${SCALETEST_NAME}-cluster.kubeconfig"
echo "Waiting for Coder deployment at ${SCALETEST_CODER_URL} to become ready"
maybedryrun "$DRY_RUN" kubectl --kubeconfig="${KUBECONFIG}" -n "coder-${SCALETEST_NAME}" rollout status deployment/coder
max_attempts=10
for attempt in $(seq 1 $max_attempts); do
maybedryrun "$DRY_RUN" curl --silent --fail --output /dev/null "${SCALETEST_CODER_URL}/api/v2/buildinfo"
curl_status=$?
if [[ $curl_status -eq 0 ]]; then
break
fi
if [[ $attempt -eq $max_attempts ]]; then
echo
echo "Coder deployment failed to become ready in time!"
exit 1
fi
echo "Coder deployment not ready yet (${attempt}/${max_attempts}), sleeping 3 seconds"
maybedryrun "$DRY_RUN" sleep 3
done
Comment on lines +159 to +173 (Member, Author):
review: there is a race condition between the rollout status returning true and the service actually becoming ready; so I'm just going back to curl :-)
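
For context, a minimal side-by-side of the two readiness checks (both commands are taken from the diff above; the comments are editorial):

# Old check: returns once the Deployment reports Ready replicas, which can
# happen before the service in front of it actually answers HTTP requests.
kubectl --kubeconfig="${KUBECONFIG}" -n "coder-${SCALETEST_NAME}" rollout status deployment/coder

# New check: poll the buildinfo endpoint over HTTP until it responds.
curl --silent --fail --output /dev/null "${SCALETEST_CODER_URL}/api/v2/buildinfo"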


echo "Initializing Coder deployment."
DRY_RUN="$DRY_RUN" "${PROJECT_ROOT}/scaletest/lib/coder_init.sh" "${SCALETEST_CODER_URL}"
@@ -212,5 +231,10 @@ if [[ "${SCALETEST_SKIP_CLEANUP}" == 1 ]]; then
exit 0
fi

echo "Cleaning up"
maybedryrun "$DRY_RUN" terraform destroy --var-file="${SCALETEST_SCENARIO_VARS}" --var-file="${SCALETEST_SECRETS}" --auto-approve
if [[ "${SCALETEST_DESTROY}" == 1 ]]; then
echo "Destroying infrastructure"
maybedryrun "$DRY_RUN" terraform destroy --var-file="${SCALETEST_SCENARIO_VARS}" --var-file="${SCALETEST_SECRETS}" --auto-approve
else
echo "Scaling down infrastructure"
maybedryrun "$DRY_RUN" terraform apply --var-file="${SCALETEST_SCENARIO_VARS}" --var-file="${SCALETEST_SECRETS}" --var state=stopped --auto-approve
fi
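
To illustrate the new default behavior, a usage sketch of the updated script (the flags come from the diff above; the name, project, workspace count, and scenario values are placeholders):

# Default: cleanup applies state=stopped, keeping the cluster but scaling every
# node pool down to zero nodes so it can be reused cheaply later.
./scaletest/scaletest.sh --name mytest --project my-gcp-project --num-workspaces 10 --scenario small

# Pass --destroy to tear down all infrastructure instead of scaling it down.
./scaletest/scaletest.sh --name mytest --project my-gcp-project --num-workspaces 10 --scenario small --destroy
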
25 changes: 19 additions & 6 deletions scaletest/terraform/coder.tf
@@ -25,13 +25,24 @@ provider "helm" {
}
}

resource "kubernetes_namespace" "coder_namespace" {
metadata {
name = local.coder_namespace
resource "null_resource" "coder_namespace" {
triggers = {
namespace = local.coder_namespace
kubeconfig_path = local.cluster_kubeconfig_path
}
depends_on = [
google_container_node_pool.coder
]
provisioner "local-exec" {
when = create
command = <<EOF
KUBECONFIG=${self.triggers.kubeconfig_path} kubectl create namespace ${self.triggers.namespace}
EOF
}
provisioner "local-exec" {
when = destroy
command = "true"
}
}

resource "random_password" "coder-postgres-password" {
@@ -46,8 +57,9 @@ resource "kubernetes_secret" "coder-db" {
type = "" # Opaque
metadata {
name = "coder-db-url"
namespace = kubernetes_namespace.coder_namespace.metadata.0.name
namespace = local.coder_namespace
}
depends_on = [null_resource.coder_namespace]
data = {
url = "postgres://${google_sql_user.coder.name}:${urlencode(random_password.coder-postgres-password.result)}@${google_sql_database_instance.db.private_ip_address}/${google_sql_database.coder.name}?sslmode=disable"
}
@@ -58,9 +70,10 @@ resource "helm_release" "coder-chart" {
chart = local.coder_helm_chart
name = local.coder_release_name
version = var.coder_chart_version
namespace = kubernetes_namespace.coder_namespace.metadata.0.name
namespace = local.coder_namespace
depends_on = [
google_container_node_pool.coder,
null_resource.coder_namespace
]
values = [<<EOF
coder:
@@ -176,7 +189,7 @@ resource "local_file" "kubernetes_template" {
count = data.coder_workspace.me.start_count
metadata {
name = "coder-$${lower(data.coder_workspace.me.owner)}-$${lower(data.coder_workspace.me.name)}"
namespace = "${kubernetes_namespace.coder_namespace.metadata.0.name}"
namespace = "${local.coder_namespace}"
labels = {
"app.kubernetes.io/name" = "coder-workspace"
"app.kubernetes.io/instance" = "coder-workspace-$${lower(data.coder_workspace.me.owner)}-$${lower(data.coder_workspace.me.name)}"
32 changes: 29 additions & 3 deletions scaletest/terraform/gcp_cluster.tf
@@ -2,6 +2,10 @@ data "google_compute_default_service_account" "default" {
project = var.project_id
}

locals {
cluster_kubeconfig_path = "${abspath(path.module)}/../.coderv2/${var.name}-cluster.kubeconfig"
}

resource "google_container_cluster" "primary" {
name = var.name
location = var.zone
@@ -40,7 +44,7 @@ resource "google_container_node_pool" "coder" {
location = var.zone
project = var.project_id
cluster = google_container_cluster.primary.name
node_count = var.nodepool_size_coder
node_count = var.state == "stopped" ? 0 : var.nodepool_size_coder
node_config {
oauth_scopes = [
"https://www.googleapis.com/auth/logging.write",
@@ -70,7 +74,7 @@ resource "google_container_node_pool" "workspaces" {
location = var.zone
project = var.project_id
cluster = google_container_cluster.primary.name
node_count = var.nodepool_size_workspaces
node_count = var.state == "stopped" ? 0 : var.nodepool_size_workspaces
node_config {
oauth_scopes = [
"https://www.googleapis.com/auth/logging.write",
@@ -100,7 +104,7 @@ resource "google_container_node_pool" "misc" {
location = var.zone
project = var.project_id
cluster = google_container_cluster.primary.name
node_count = var.nodepool_size_misc
node_count = var.state == "stopped" ? 0 : var.nodepool_size_misc
node_config {
oauth_scopes = [
"https://www.googleapis.com/auth/logging.write",
@@ -124,3 +128,25 @@ resource "google_container_node_pool" "misc" {
}
}
}

resource "null_resource" "cluster_kubeconfig" {
depends_on = [google_container_cluster.primary]
triggers = {
path = local.cluster_kubeconfig_path
name = google_container_cluster.primary.name
project_id = var.project_id
zone = var.zone
}
provisioner "local-exec" {
command = <<EOF
KUBECONFIG=${self.triggers.path} gcloud container clusters get-credentials ${self.triggers.name} --project=${self.triggers.project_id} --zone=${self.triggers.zone}
EOF
}

provisioner "local-exec" {
when = destroy
command = <<EOF
rm -f ${self.triggers.path}
EOF
}
}
Comment on lines +132 to +152 (Member, Author):
review: this was previously being created when applying prom monitoring manifests; moved it to its own resource declaration here.
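
Roughly, the resource above boils down to the following shell steps (a sketch based on its provisioners, with the Terraform references spelled out as plain shell variables):

# On create: write a kubeconfig for the new cluster into the scaletest state directory.
KUBECONFIG="scaletest/.coderv2/${NAME}-cluster.kubeconfig" \
  gcloud container clusters get-credentials "${NAME}" --project="${PROJECT_ID}" --zone="${ZONE}"

# On destroy: remove that kubeconfig again.
rm -f "scaletest/.coderv2/${NAME}-cluster.kubeconfig"

The namespace resources in coder.tf and prometheus.tf reuse the same kubeconfig path for their own kubectl calls.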

30 changes: 21 additions & 9 deletions scaletest/terraform/prometheus.tf
@@ -10,13 +10,24 @@ locals {
}

# Create a namespace to hold our Prometheus deployment.
resource "kubernetes_namespace" "prometheus_namespace" {
metadata {
name = local.prometheus_namespace
resource "null_resource" "prometheus_namespace" {
triggers = {
namespace = local.prometheus_namespace
kubeconfig_path = local.cluster_kubeconfig_path
}
depends_on = [
google_container_node_pool.misc
]
provisioner "local-exec" {
when = create
command = <<EOF
KUBECONFIG=${self.triggers.kubeconfig_path} kubectl create namespace ${self.triggers.namespace}
EOF
}
provisioner "local-exec" {
when = destroy
command = "true"
}
}

# Create a secret to store the remote write key
@@ -25,7 +36,7 @@ resource "kubernetes_secret" "prometheus-credentials" {
type = "kubernetes.io/basic-auth"
metadata {
name = "prometheus-credentials"
namespace = kubernetes_namespace.prometheus_namespace.metadata.0.name
namespace = local.prometheus_namespace
}

data = {
@@ -39,7 +50,8 @@ resource "helm_release" "prometheus-chart" {
repository = local.prometheus_helm_repo
chart = local.prometheus_helm_chart
name = local.prometheus_release_name
namespace = kubernetes_namespace.prometheus_namespace.metadata.0.name
namespace = local.prometheus_namespace
depends_on = [null_resource.prometheus_namespace]
values = [<<EOF
alertmanager:
enabled: false
@@ -102,8 +114,9 @@ resource "kubernetes_secret" "prometheus-postgres-password" {
type = "kubernetes.io/basic-auth"
metadata {
name = "prometheus-postgres"
namespace = kubernetes_namespace.prometheus_namespace.metadata.0.name
namespace = local.prometheus_namespace
}
depends_on = [null_resource.prometheus_namespace]
data = {
username = google_sql_user.prometheus.name
password = google_sql_user.prometheus.password
@@ -152,7 +165,7 @@ resource "local_file" "coder-monitoring-manifest" {
apiVersion: monitoring.coreos.com/v1
kind: PodMonitor
metadata:
namespace: ${kubernetes_namespace.coder_namespace.metadata.0.name}
namespace: ${local.coder_namespace}
name: coder-monitoring
spec:
selector:
@@ -168,8 +181,7 @@ resource "null_resource" "coder-monitoring-manifest_apply" {
provisioner "local-exec" {
working_dir = "${abspath(path.module)}/../.coderv2"
command = <<EOF
KUBECONFIG=${var.name}-cluster.kubeconfig gcloud container clusters get-credentials ${google_container_cluster.primary.name} --project=${var.project_id} --zone=${var.zone} && \
KUBECONFIG=${var.name}-cluster.kubeconfig kubectl apply -f ${abspath(local_file.coder-monitoring-manifest.filename)}
KUBECONFIG=${local.cluster_kubeconfig_path} kubectl apply -f ${abspath(local_file.coder-monitoring-manifest.filename)}
EOF
}
depends_on = [helm_release.prometheus-chart]
9 changes: 9 additions & 0 deletions scaletest/terraform/vars.tf
@@ -1,3 +1,12 @@
variable "state" {
description = "The state of the cluster. Valid values are 'started', and 'stopped'."
validation {
condition = contains(["started", "stopped"], var.state)
error_message = "value must be one of 'started' or 'stopped'"
}
default = "started"
}

variable "project_id" {
description = "The project in which to provision resources"
}
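
Putting it together, the wrapper script drives the new state variable with three Terraform invocations (commands taken from scaletest.sh above, minus the dry-run wrapper):

# Bring node pools up for a test run.
terraform apply --var-file="${SCALETEST_SCENARIO_VARS}" --var-file="${SCALETEST_SECRETS}" --var state=started --auto-approve

# Default cleanup: keep the cluster, scale every node pool down to zero nodes.
terraform apply --var-file="${SCALETEST_SCENARIO_VARS}" --var-file="${SCALETEST_SECRETS}" --var state=stopped --auto-approve

# Full teardown, used when --destroy is passed.
terraform destroy --var-file="${SCALETEST_SCENARIO_VARS}" --var-file="${SCALETEST_SECRETS}" --auto-approve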