From 8ee96f82419d06b40600d51a1d647d0a09bd9652 Mon Sep 17 00:00:00 2001 From: Cian Johnston Date: Wed, 28 Jun 2023 15:16:38 +0100 Subject: [PATCH 1/4] chore: update scaletest terraform with latest findings --- scaletest/terraform/coder.tf | 6 +++++- scaletest/terraform/gcp_cluster.tf | 13 +++++++------ scaletest/terraform/scenario-large.tfvars | 1 + scaletest/terraform/scenario-medium.tfvars | 1 + 4 files changed, 14 insertions(+), 7 deletions(-) diff --git a/scaletest/terraform/coder.tf b/scaletest/terraform/coder.tf index 3463188a580fc..144132bdc8785 100644 --- a/scaletest/terraform/coder.tf +++ b/scaletest/terraform/coder.tf @@ -83,6 +83,8 @@ coder: operator: "In" values: ["${local.coder_release_name}"] env: + - name: "CODER_ACCESS_URL" + value: "${local.coder_url}" - name: "CODER_CACHE_DIRECTORY" value: "/tmp/coder" - name: "CODER_ENABLE_TELEMETRY" @@ -108,6 +110,8 @@ coder: value: "true" - name: "CODER_EXPERIMENTS" value: "${var.coder_experiments}" + - name: "CODER_DANGEROUS_DISABLE_RATE_LIMITS" + value: "true" image: repo: ${var.coder_image_repo} tag: ${var.coder_image_tag} @@ -197,7 +201,7 @@ resource "local_file" "kubernetes_template" { } resources { requests = { - "cpu" = "0.1" + "cpu" = "0.01" "memory" = "128Mi" } limits = { diff --git a/scaletest/terraform/gcp_cluster.tf b/scaletest/terraform/gcp_cluster.tf index 3dc1049d718f9..70223c8251e30 100644 --- a/scaletest/terraform/gcp_cluster.tf +++ b/scaletest/terraform/gcp_cluster.tf @@ -3,12 +3,13 @@ data "google_compute_default_service_account" "default" { } resource "google_container_cluster" "primary" { - name = var.name - location = var.zone - project = var.project_id - network = google_compute_network.vpc.name - subnetwork = google_compute_subnetwork.subnet.name - networking_mode = "VPC_NATIVE" + name = var.name + location = var.zone + project = var.project_id + network = google_compute_network.vpc.name + subnetwork = google_compute_subnetwork.subnet.name + networking_mode = "VPC_NATIVE" + default_max_pods_per_node = 256 ip_allocation_policy { # Required with networking_mode=VPC_NATIVE } diff --git a/scaletest/terraform/scenario-large.tfvars b/scaletest/terraform/scenario-large.tfvars index ba2f5ccc96a26..1393142f7ce56 100644 --- a/scaletest/terraform/scenario-large.tfvars +++ b/scaletest/terraform/scenario-large.tfvars @@ -1,6 +1,7 @@ nodepool_machine_type_coder = "t2d-standard-8" nodepool_size_coder = 3 nodepool_machine_type_workspaces = "t2d-standard-8" +cloudsql_tier = "db-custom-4-7680" coder_cpu = "6000m" # Leaving 2 CPUs for system workloads coder_mem = "24Gi" # Leaving 8 GB for system workloads coder_replicas = 3 diff --git a/scaletest/terraform/scenario-medium.tfvars b/scaletest/terraform/scenario-medium.tfvars index f55d7f7bdd06f..e8e55b14d028b 100644 --- a/scaletest/terraform/scenario-medium.tfvars +++ b/scaletest/terraform/scenario-medium.tfvars @@ -1,4 +1,5 @@ nodepool_machine_type_coder = "t2d-standard-8" nodepool_machine_type_workspaces = "t2d-standard-8" +cloudsql_tier = "db-custom-2-3840" coder_cpu = "6000m" # Leaving 2 CPUs for system workloads coder_mem = "24Gi" # Leaving 8 GB for system workloads From 13ab81802972af81e871304851b5ebb710e82a20 Mon Sep 17 00:00:00 2001 From: Cian Johnston Date: Wed, 28 Jun 2023 15:21:42 +0100 Subject: [PATCH 2/4] fixup! chore: update scaletest terraform with latest findings --- scaletest/terraform/scenario-large.tfvars | 2 +- scaletest/terraform/scenario-medium.tfvars | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/scaletest/terraform/scenario-large.tfvars b/scaletest/terraform/scenario-large.tfvars index 1393142f7ce56..348df85a70021 100644 --- a/scaletest/terraform/scenario-large.tfvars +++ b/scaletest/terraform/scenario-large.tfvars @@ -1,7 +1,7 @@ nodepool_machine_type_coder = "t2d-standard-8" nodepool_size_coder = 3 nodepool_machine_type_workspaces = "t2d-standard-8" -cloudsql_tier = "db-custom-4-7680" +cloudsql_tier = "db-custom-2-7680" coder_cpu = "6000m" # Leaving 2 CPUs for system workloads coder_mem = "24Gi" # Leaving 8 GB for system workloads coder_replicas = 3 diff --git a/scaletest/terraform/scenario-medium.tfvars b/scaletest/terraform/scenario-medium.tfvars index e8e55b14d028b..57ddd3b157715 100644 --- a/scaletest/terraform/scenario-medium.tfvars +++ b/scaletest/terraform/scenario-medium.tfvars @@ -1,5 +1,5 @@ nodepool_machine_type_coder = "t2d-standard-8" nodepool_machine_type_workspaces = "t2d-standard-8" -cloudsql_tier = "db-custom-2-3840" +cloudsql_tier = "db-custom-1-3840" coder_cpu = "6000m" # Leaving 2 CPUs for system workloads coder_mem = "24Gi" # Leaving 8 GB for system workloads From 3d348cb66f91d1b465191b7c3d557ad3eac058b1 Mon Sep 17 00:00:00 2001 From: Cian Johnston Date: Wed, 28 Jun 2023 17:35:26 +0100 Subject: [PATCH 3/4] mount cache dir under /tmp/coder directly This will ensure that the only place writeable in /tmp is /tmp/coder - we want to ensure that we're not trying to download the terraform binary and that it's already in the image. This will make it super-apparent. --- scaletest/terraform/coder.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scaletest/terraform/coder.tf b/scaletest/terraform/coder.tf index 144132bdc8785..7688b44c02b1c 100644 --- a/scaletest/terraform/coder.tf +++ b/scaletest/terraform/coder.tf @@ -130,7 +130,7 @@ coder: sessionAffinity: None loadBalancerIP: "${local.coder_address}" volumeMounts: - - mountPath: "/tmp" + - mountPath: "/tmp/coder" name: cache readOnly: false volumes: From 874529e3b6a16326d9727f6c9667a5c36b75e427 Mon Sep 17 00:00:00 2001 From: Cian Johnston Date: Thu, 29 Jun 2023 14:58:00 +0100 Subject: [PATCH 4/4] plumb through requests/limits for workspaces, add requests for coder --- scaletest/terraform/coder.tf | 16 +++--- scaletest/terraform/scenario-large.tfvars | 6 ++- scaletest/terraform/scenario-medium.tfvars | 6 ++- scaletest/terraform/scenario-small.tfvars | 6 ++- scaletest/terraform/vars.tf | 57 +++++++++++++++++----- 5 files changed, 66 insertions(+), 25 deletions(-) diff --git a/scaletest/terraform/coder.tf b/scaletest/terraform/coder.tf index 7688b44c02b1c..e216766a5a3da 100644 --- a/scaletest/terraform/coder.tf +++ b/scaletest/terraform/coder.tf @@ -118,11 +118,11 @@ coder: replicaCount: "${var.coder_replicas}" resources: requests: - cpu: "${var.coder_cpu}" - memory: "${var.coder_mem}" + cpu: "${var.coder_cpu_request}" + memory: "${var.coder_mem_request}" limits: - cpu: "${var.coder_cpu}" - memory: "${var.coder_mem}" + cpu: "${var.coder_cpu_limit}" + memory: "${var.coder_mem_limit}" securityContext: readOnlyRootFilesystem: true service: @@ -201,12 +201,12 @@ resource "local_file" "kubernetes_template" { } resources { requests = { - "cpu" = "0.01" - "memory" = "128Mi" + "cpu" = "${var.workspace_cpu_request}" + "memory" = "${var.workspace_mem_request}" } limits = { - "cpu" = "1" - "memory" = "1Gi" + "cpu" = "${var.workspace_cpu_limit}" + "memory" = "${var.workspace_mem_limit}" } } } diff --git a/scaletest/terraform/scenario-large.tfvars b/scaletest/terraform/scenario-large.tfvars index 348df85a70021..9bd4aa1e454fb 100644 --- a/scaletest/terraform/scenario-large.tfvars +++ b/scaletest/terraform/scenario-large.tfvars @@ -2,6 +2,8 @@ nodepool_machine_type_coder = "t2d-standard-8" nodepool_size_coder = 3 nodepool_machine_type_workspaces = "t2d-standard-8" cloudsql_tier = "db-custom-2-7680" -coder_cpu = "6000m" # Leaving 2 CPUs for system workloads -coder_mem = "24Gi" # Leaving 8 GB for system workloads +coder_cpu_request = "3000m" +coder_mem_request = "12Gi" +coder_cpu_limit = "6000m" # Leaving 2 CPUs for system workloads +coder_mem_limit = "24Gi" # Leaving 8 GB for system workloads coder_replicas = 3 diff --git a/scaletest/terraform/scenario-medium.tfvars b/scaletest/terraform/scenario-medium.tfvars index 57ddd3b157715..2c5f9c99407fa 100644 --- a/scaletest/terraform/scenario-medium.tfvars +++ b/scaletest/terraform/scenario-medium.tfvars @@ -1,5 +1,7 @@ nodepool_machine_type_coder = "t2d-standard-8" nodepool_machine_type_workspaces = "t2d-standard-8" cloudsql_tier = "db-custom-1-3840" -coder_cpu = "6000m" # Leaving 2 CPUs for system workloads -coder_mem = "24Gi" # Leaving 8 GB for system workloads +coder_cpu_request = "3000m" +coder_mem_request = "12Gi" +coder_cpu_limit = "6000m" # Leaving 2 CPUs for system workloads +coder_mem_limit = "24Gi" # Leaving 8 GB for system workloads diff --git a/scaletest/terraform/scenario-small.tfvars b/scaletest/terraform/scenario-small.tfvars index 8c97aea4095ad..0387701c3b94e 100644 --- a/scaletest/terraform/scenario-small.tfvars +++ b/scaletest/terraform/scenario-small.tfvars @@ -1,4 +1,6 @@ nodepool_machine_type_coder = "t2d-standard-4" nodepool_machine_type_workspaces = "t2d-standard-4" -coder_cpu = "2000m" # Leaving 2 CPUs for system workloads -coder_mem = "12Gi" # Leaving 4GB for system workloads +coder_cpu_request = "1000m" +coder_mem_request = "6Gi" +coder_cpu_limit = "2000m" # Leaving 2 CPUs for system workloads +coder_mem_limit = "12Gi" # Leaving 4GB for system workloads diff --git a/scaletest/terraform/vars.tf b/scaletest/terraform/vars.tf index 6b51425b92af0..ad534f2f99b65 100644 --- a/scaletest/terraform/vars.tf +++ b/scaletest/terraform/vars.tf @@ -94,17 +94,30 @@ variable "cloudsql_max_connections" { // These variables control the Coder deployment. variable "coder_replicas" { - description = "Number of Coder replicas to provision" + description = "Number of Coder replicas to provision." default = 1 } -variable "coder_cpu" { - description = "CPU to allocate to Coder" +// Ensure that requests allow for at least two replicas to be scheduled +// on a single node temporarily, otherwise deployments may fail due to +// lack of resources. +variable "coder_cpu_request" { + description = "CPU request to allocate to Coder." + default = "500m" +} + +variable "coder_mem_request" { + description = "Memory request to allocate to Coder." + default = "512Mi" +} + +variable "coder_cpu_limit" { + description = "CPU limit to allocate to Coder." default = "1000m" } -variable "coder_mem" { - description = "Memory to allocate to Coder" +variable "coder_mem_limit" { + description = "Memory limit to allocate to Coder." default = "1024Mi" } @@ -123,11 +136,38 @@ variable "coder_image_tag" { default = "latest" } +variable "coder_experiments" { + description = "Coder Experiments to enable." + default = "" +} + +// These variables control the default workspace template. variable "workspace_image" { description = "Image and tag to use for workspaces." default = "docker.io/codercom/enterprise-minimal:ubuntu" } +variable "workspace_cpu_request" { + description = "CPU request to allocate to workspaces." + default = "100m" +} + +variable "workspace_cpu_limit" { + description = "CPU limit to allocate to workspaces." + default = "100m" +} + +variable "workspace_mem_request" { + description = "Memory request to allocate to workspaces." + default = "128Mi" +} + +variable "workspace_mem_limit" { + description = "Memory limit to allocate to workspaces." + default = "128Mi" +} + +// These variables control the Prometheus deployment. variable "prometheus_remote_write_user" { description = "Username for Prometheus remote write." default = "" @@ -139,7 +179,7 @@ variable "prometheus_remote_write_password" { } variable "prometheus_remote_write_url" { - description = "URL for Prometheus remote write. Defaults to stats.dev.c8s.io" + description = "URL for Prometheus remote write. Defaults to stats.dev.c8s.io." default = "https://stats.dev.c8s.io:9443/api/v1/write" } @@ -157,8 +197,3 @@ variable "prometheus_remote_write_send_interval" { description = "Prometheus remote write interval." default = "15s" } - -variable "coder_experiments" { - description = "Coder Experiments to enable" - default = "" -}