feat(scaletest/templates): add support for concurrent scenarios #11753

Merged
222 changes: 193 additions & 29 deletions scaletest/templates/scaletest-runner/main.tf
@@ -12,11 +12,12 @@ terraform {
}

resource "time_static" "start_time" {
# We con't set `count = data.coder_workspace.me.start_count` here because then
# we can't use this value in `locals`. The permission check is recreated on
# start, which will update the timestamp.
# We don't set `count = data.coder_workspace.me.start_count` here because then
# we can't use this value in `locals`, but we want to trigger recreation when
# the scaletest is restarted.
triggers = {
count : length(null_resource.permission_check)
count : data.coder_workspace.me.start_count
token : data.coder_workspace.me.owner_session_token # Rely on this being re-generated every start.
}
}

@@ -39,8 +40,6 @@ locals {
workspace_pod_instance = "coder-workspace-${lower(data.coder_workspace.me.owner)}-${lower(data.coder_workspace.me.name)}"
workspace_pod_termination_grace_period_seconds = 5 * 60 * 60 # 5 hours (cleanup timeout).
service_account_name = "scaletest-sa"
cpu = 16
memory = 64
home_disk_size = 10
scaletest_run_id = "scaletest-${replace(time_static.start_time.rfc3339, ":", "-")}"
scaletest_run_dir = "/home/coder/${local.scaletest_run_id}"
@@ -171,6 +170,16 @@ data "coder_parameter" "cleanup_strategy" {
}
}

data "coder_parameter" "cleanup_prepare" {
order = 14
type = "bool"
name = "Cleanup before scaletest"
default = true
description = "Cleanup existing scaletest users and workspaces before the scaletest starts (prepare phase)."
mutable = true
ephemeral = true
}


data "coder_parameter" "workspace_template" {
order = 20
Expand Down Expand Up @@ -226,9 +235,18 @@ data "coder_parameter" "num_workspaces" {
}
}

data "coder_parameter" "skip_create_workspaces" {
order = 22
type = "bool"
name = "DEBUG: Skip creating workspaces"
default = false
description = "Skip creating workspaces (for resuming failed scaletests or debugging)"
mutable = true
}


data "coder_parameter" "load_scenarios" {
order = 22
order = 23
name = "Load Scenarios"
type = "list(string)"
description = "The load scenarios to run."
@@ -237,12 +255,31 @@ data "coder_parameter" "load_scenarios" {
default = jsonencode([
"SSH Traffic",
"Web Terminal Traffic",
"App Traffic",
"Dashboard Traffic",
])
}

data "coder_parameter" "load_scenario_run_concurrently" {
order = 24
name = "Run Load Scenarios Concurrently"
type = "bool"
default = false
description = "Run all load scenarios concurrently, this setting enables the load scenario percentages so that they can be assigned a percentage of 1-100%."
mutable = true
}
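
How a percentage maps onto the fleet is handled by the runner scripts rather than by this template; as a rough illustration only (hypothetical example_* locals, not part of this diff), a percentage translates into a target count like so:

locals {
  # Hypothetical: 25% SSH traffic against 100 workspaces targets 25 of them.
  example_num_workspaces   = 100
  example_ssh_percentage   = 25
  example_ssh_target_count = floor(local.example_num_workspaces * local.example_ssh_percentage / 100)
}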

data "coder_parameter" "load_scenario_concurrency_stagger_delay_mins" {
order = 25
name = "Load Scenario Concurrency Stagger Delay"
type = "number"
default = 3
description = "The number of minutes to wait between starting each load scenario when run concurrently."
mutable = true
}
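
To illustrate the stagger (hypothetical example_* locals, not part of this diff): with the default 3-minute delay, concurrently run scenarios start at offsets 0, 3, 6, ... minutes in list order.

locals {
  # Hypothetical: start offset in minutes for each concurrently run scenario.
  example_stagger_mins = 3
  example_scenarios    = ["SSH Traffic", "Web Terminal Traffic", "Dashboard Traffic"]
  example_offsets      = { for i, s in local.example_scenarios : s => i * local.example_stagger_mins }
  # => { "SSH Traffic" = 0, "Web Terminal Traffic" = 3, "Dashboard Traffic" = 6 }
}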

data "coder_parameter" "load_scenario_ssh_traffic_duration" {
order = 23
order = 30
name = "SSH Traffic Duration"
type = "number"
description = "The duration of the SSH traffic load scenario in minutes."
@@ -255,7 +292,7 @@ data "coder_parameter" "load_scenario_ssh_traffic_duration" {
}

data "coder_parameter" "load_scenario_ssh_bytes_per_tick" {
order = 24
order = 31
name = "SSH Bytes Per Tick"
type = "number"
description = "The number of bytes to send per tick in the SSH traffic load scenario."
@@ -267,7 +304,7 @@ data "coder_parameter" "load_scenario_ssh_bytes_per_tick" {
}

data "coder_parameter" "load_scenario_ssh_tick_interval" {
order = 25
order = 32
name = "SSH Tick Interval"
type = "number"
description = "The number of milliseconds between each tick in the SSH traffic load scenario."
@@ -278,8 +315,21 @@ data "coder_parameter" "load_scenario_ssh_tick_interval" {
}
}

data "coder_parameter" "load_scenario_ssh_traffic_percentage" {
order = 33
name = "SSH Traffic Percentage"
type = "number"
description = "The percentage of workspaces that should be targeted for SSH traffic."
mutable = true
default = 100
validation {
min = 1
max = 100
}
}

data "coder_parameter" "load_scenario_web_terminal_traffic_duration" {
order = 26
order = 40
name = "Web Terminal Traffic Duration"
type = "number"
description = "The duration of the web terminal traffic load scenario in minutes."
@@ -292,7 +342,7 @@ data "coder_parameter" "load_scenario_web_terminal_traffic_duration" {
}

data "coder_parameter" "load_scenario_web_terminal_bytes_per_tick" {
order = 27
order = 41
name = "Web Terminal Bytes Per Tick"
type = "number"
description = "The number of bytes to send per tick in the web terminal traffic load scenario."
@@ -304,7 +354,7 @@ data "coder_parameter" "load_scenario_web_terminal_bytes_per_tick" {
}

data "coder_parameter" "load_scenario_web_terminal_tick_interval" {
order = 28
order = 42
name = "Web Terminal Tick Interval"
type = "number"
description = "The number of milliseconds between each tick in the web terminal traffic load scenario."
@@ -315,8 +365,94 @@ data "coder_parameter" "load_scenario_web_terminal_tick_interval" {
}
}

data "coder_parameter" "load_scenario_web_terminal_traffic_percentage" {
order = 43
name = "Web Terminal Traffic Percentage"
type = "number"
description = "The percentage of workspaces that should be targeted for web terminal traffic."
mutable = true
default = 100
validation {
min = 1
max = 100
}
}

data "coder_parameter" "load_scenario_app_traffic_duration" {
order = 50
name = "App Traffic Duration"
type = "number"
description = "The duration of the app traffic load scenario in minutes."
mutable = true
default = 30
validation {
min = 1
max = 1440 // 24 hours.
}
}

data "coder_parameter" "load_scenario_app_bytes_per_tick" {
order = 51
name = "App Bytes Per Tick"
type = "number"
description = "The number of bytes to send per tick in the app traffic load scenario."
mutable = true
default = 1024
validation {
min = 1
}
}

data "coder_parameter" "load_scenario_app_tick_interval" {
order = 52
name = "App Tick Interval"
type = "number"
description = "The number of milliseconds between each tick in the app traffic load scenario."
mutable = true
default = 100
validation {
min = 1
}
}

data "coder_parameter" "load_scenario_app_traffic_percentage" {
order = 53
name = "App Traffic Percentage"
type = "number"
description = "The percentage of workspaces that should be targeted for app traffic."
mutable = true
default = 100
validation {
min = 1
max = 100
}
}

data "coder_parameter" "load_scenario_app_traffic_mode" {
order = 54
name = "App Traffic Mode"
default = "wsec"
description = "The mode of the app traffic load scenario."
mutable = true
option {
name = "WebSocket Echo"
value = "wsec"
description = "Send traffic to the workspace via the app websocket and read it back."
}
option {
name = "WebSocket Read (Random)"
value = "wsra"
description = "Read traffic from the workspace via the app websocket."
}
option {
name = "WebSocket Write (Discard)"
value = "wsdi"
description = "Send traffic to the workspace via the app websocket."
}
}

data "coder_parameter" "load_scenario_dashboard_traffic_duration" {
order = 29
order = 60
name = "Dashboard Traffic Duration"
type = "number"
description = "The duration of the dashboard traffic load scenario in minutes."
@@ -328,8 +464,21 @@ data "coder_parameter" "load_scenario_dashboard_traffic_duration" {
}
}

data "coder_parameter" "load_scenario_dashboard_traffic_percentage" {
order = 61
name = "Dashboard Traffic Percentage"
type = "number"
description = "The percentage of users that should be targeted for dashboard traffic."
mutable = true
default = 100
validation {
min = 1
max = 100
}
}

data "coder_parameter" "load_scenario_baseline_duration" {
order = 26
order = 100
name = "Baseline Wait Duration"
type = "number"
description = "The duration to wait before starting a load scenario in minutes."
@@ -342,7 +491,7 @@ data "coder_parameter" "load_scenario_baseline_duration" {
}

data "coder_parameter" "greedy_agent" {
order = 30
order = 200
type = "bool"
name = "Greedy Agent"
default = false
@@ -352,7 +501,7 @@ data "coder_parameter" "greedy_agent" {
}

data "coder_parameter" "greedy_agent_template" {
order = 31
order = 201
name = "Greedy Agent Template"
display_name = "Greedy Agent Template"
description = "The template used for the greedy agent workspace (must not be same as workspace template)."
@@ -432,6 +581,7 @@ resource "coder_agent" "main" {
SCALETEST_RUN_ID : local.scaletest_run_id,
SCALETEST_RUN_DIR : local.scaletest_run_dir,
SCALETEST_RUN_START_TIME : local.scaletest_run_start_time,
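# Each concurrent load scenario is assigned its own Prometheus port, counting
# up from this start port (see the container port blocks in the pod spec below).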
SCALETEST_PROMETHEUS_START_PORT : "21112",

# Comment is a scaletest param, but we want to surface it separately from
# the rest, so we use a different name.
@@ -440,16 +590,28 @@ resource "coder_agent" "main" {
SCALETEST_PARAM_TEMPLATE : data.coder_parameter.workspace_template.value,
SCALETEST_PARAM_REPO_BRANCH : data.coder_parameter.repo_branch.value,
SCALETEST_PARAM_NUM_WORKSPACES : data.coder_parameter.num_workspaces.value,
SCALETEST_PARAM_SKIP_CREATE_WORKSPACES : data.coder_parameter.skip_create_workspaces.value ? "1" : "0",
SCALETEST_PARAM_CREATE_CONCURRENCY : "${data.coder_parameter.create_concurrency.value}",
SCALETEST_PARAM_CLEANUP_STRATEGY : data.coder_parameter.cleanup_strategy.value,
SCALETEST_PARAM_CLEANUP_PREPARE : data.coder_parameter.cleanup_prepare.value ? "1" : "0",
SCALETEST_PARAM_LOAD_SCENARIOS : data.coder_parameter.load_scenarios.value,
SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY : data.coder_parameter.load_scenario_run_concurrently.value ? "1" : "0",
SCALETEST_PARAM_LOAD_SCENARIO_CONCURRENCY_STAGGER_DELAY_MINS : "${data.coder_parameter.load_scenario_concurrency_stagger_delay_mins.value}",
SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_DURATION : "${data.coder_parameter.load_scenario_ssh_traffic_duration.value}",
SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_BYTES_PER_TICK : "${data.coder_parameter.load_scenario_ssh_bytes_per_tick.value}",
SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_TICK_INTERVAL : "${data.coder_parameter.load_scenario_ssh_tick_interval.value}",
SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_PERCENTAGE : "${data.coder_parameter.load_scenario_ssh_traffic_percentage.value}",
SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_DURATION : "${data.coder_parameter.load_scenario_web_terminal_traffic_duration.value}",
SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_BYTES_PER_TICK : "${data.coder_parameter.load_scenario_web_terminal_bytes_per_tick.value}",
SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_TICK_INTERVAL : "${data.coder_parameter.load_scenario_web_terminal_tick_interval.value}",
SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_PERCENTAGE : "${data.coder_parameter.load_scenario_web_terminal_traffic_percentage.value}",
SCALETEST_PARAM_LOAD_SCENARIO_APP_TRAFFIC_DURATION : "${data.coder_parameter.load_scenario_app_traffic_duration.value}",
SCALETEST_PARAM_LOAD_SCENARIO_APP_TRAFFIC_BYTES_PER_TICK : "${data.coder_parameter.load_scenario_app_bytes_per_tick.value}",
SCALETEST_PARAM_LOAD_SCENARIO_APP_TRAFFIC_TICK_INTERVAL : "${data.coder_parameter.load_scenario_app_tick_interval.value}",
SCALETEST_PARAM_LOAD_SCENARIO_APP_TRAFFIC_PERCENTAGE : "${data.coder_parameter.load_scenario_app_traffic_percentage.value}",
SCALETEST_PARAM_LOAD_SCENARIO_APP_TRAFFIC_MODE : data.coder_parameter.load_scenario_app_traffic_mode.value,
SCALETEST_PARAM_LOAD_SCENARIO_DASHBOARD_TRAFFIC_DURATION : "${data.coder_parameter.load_scenario_dashboard_traffic_duration.value}",
SCALETEST_PARAM_LOAD_SCENARIO_DASHBOARD_TRAFFIC_PERCENTAGE : "${data.coder_parameter.load_scenario_dashboard_traffic_percentage.value}",
SCALETEST_PARAM_LOAD_SCENARIO_BASELINE_DURATION : "${data.coder_parameter.load_scenario_baseline_duration.value}",
SCALETEST_PARAM_GREEDY_AGENT : data.coder_parameter.greedy_agent.value ? "1" : "0",
SCALETEST_PARAM_GREEDY_AGENT_TEMPLATE : data.coder_parameter.greedy_agent_template.value,
@@ -693,26 +855,24 @@ resource "kubernetes_pod" "main" {
}
}
resources {
# Set requests and limits values such that we can do performant
# execution of `coder scaletest` commands.
requests = {
"cpu" = "250m"
"memory" = "512Mi"
}
limits = {
"cpu" = "${local.cpu}"
"memory" = "${local.memory}Gi"
}
}
volume_mount {
mount_path = "/home/coder"
name = "home"
read_only = false
}
port {
container_port = 21112
name = "prometheus-http"
protocol = "TCP"
dynamic "port" {
for_each = data.coder_parameter.load_scenario_run_concurrently.value ? jsondecode(data.coder_parameter.load_scenarios.value) : [""]
iterator = it
content {
container_port = 21112 + it.key
name = "prom-http${it.key}"
protocol = "TCP"
}
}
}
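
For illustration (a sketch of the expanded form, not part of the diff): with concurrency enabled and the four default scenarios, the dynamic block above renders one port per scenario index, e.g.

port {
  container_port = 21112 # 21112 + index 0
  name           = "prom-http0"
  protocol       = "TCP"
}
# ...through container_port 21115 / name "prom-http3". With concurrency
# disabled, the [""] fallback yields a single port, 21112 ("prom-http0").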

@@ -787,8 +947,12 @@ resource "kubernetes_manifest" "pod_monitor" {
}
}
podMetricsEndpoints = [
{
port = "prometheus-http"
# NOTE(mafredri): We could add more information here by including the
# scenario name in the port name (although it's limited to 15 chars so
# it needs to be short). That said, someone looking at the stats can
# assume that there's a 1-to-1 mapping between scenario# and port.
for i, _ in data.coder_parameter.load_scenario_run_concurrently.value ? jsondecode(data.coder_parameter.load_scenarios.value) : [""] : {
port = "prom-http${i}"
interval = "15s"
}
]
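
As an illustration (not part of the diff), with concurrency enabled and four scenarios the for expression above evaluates to:

podMetricsEndpoints = [
  { port = "prom-http0", interval = "15s" },
  { port = "prom-http1", interval = "15s" },
  { port = "prom-http2", interval = "15s" },
  { port = "prom-http3", interval = "15s" },
]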