From 036aa8d7476314af35c8c629f908c4c75f56e18d Mon Sep 17 00:00:00 2001
From: Mathias Fredriksson <mafredri@gmail.com>
Date: Mon, 22 Jan 2024 17:08:56 +0200
Subject: [PATCH 01/18] feat(scaletest/templates): add support for concurrent
 scenarios

---
Review notes (commentary between "---" and the diffstat; ignored when the
patch is applied, not part of the commit):

 * run.sh, concurrent mode: `wait "${pids[@]}"` followed by `status=$?` only
   observes the exit status of the last PID in the list (bash `wait` with
   several IDs returns the status of the last one waited for), so a failing
   scenario that is not last in `pids` is reported as success.
 * run.sh, "debug:greedy_agent_traffic": the call is now backgrounded, but the
   unchanged `status=$?` after it reads the status of launching the job (0),
   and the PID is never appended to `pids`, so its result is never collected.
 * run.sh, range wrap-around: the warning says "using
   ${SCALETEST_PARAM_NUM_WORKSPACES} instead", while the code resets the range
   to 0:${target_count}; it also labels ${target_end} as "Target count".
 * run.sh: `target_end` starts at -1 and is copied into `target_start` after
   every scenario; the debug:* scenarios never set it, so a debug scenario
   listed first leaves target_start=-1 for the next one.
   NOTE(review): confirm how --target-workspaces treats a negative start.
 * run.sh, "Dashboard Traffic": the --target-users range is derived from
   SCALETEST_PARAM_NUM_WORKSPACES; presumably one user per workspace - confirm.

 scaletest/templates/scaletest-runner/main.tf  | 168 ++++++++++++++--
 .../templates/scaletest-runner/scripts/run.sh | 189 +++++++++++++++---
 .../templates/scaletest-runner/startup.sh     |   5 +
 3 files changed, 317 insertions(+), 45 deletions(-)

diff --git a/scaletest/templates/scaletest-runner/main.tf b/scaletest/templates/scaletest-runner/main.tf
index b536fc51afbb3..cd1d95cbd27b4 100644
--- a/scaletest/templates/scaletest-runner/main.tf
+++ b/scaletest/templates/scaletest-runner/main.tf
@@ -237,12 +237,22 @@ data "coder_parameter" "load_scenarios" {
   default = jsonencode([
     "SSH Traffic",
     "Web Terminal Traffic",
+    "App Traffic",
     "Dashboard Traffic",
   ])
 }
 
-data "coder_parameter" "load_scenario_ssh_traffic_duration" {
+data "coder_parameter" "load_scenario_run_concurrently" {
   order       = 23
+  name        = "Run Load Scenarios Concurrently"
+  type        = "bool"
+  default     = false
+  description = "Run all load scenarios concurrently, this setting enables the load scenario percentages so that they can be assigned a percentage of 1-100%."
+  mutable     = true
+}
+
+data "coder_parameter" "load_scenario_ssh_traffic_duration" {
+  order       = 30
   name        = "SSH Traffic Duration"
   type        = "number"
   description = "The duration of the SSH traffic load scenario in minutes."
@@ -255,7 +265,7 @@ data "coder_parameter" "load_scenario_ssh_traffic_duration" {
 }
 
 data "coder_parameter" "load_scenario_ssh_bytes_per_tick" {
-  order       = 24
+  order       = 31
   name        = "SSH Bytes Per Tick"
   type        = "number"
   description = "The number of bytes to send per tick in the SSH traffic load scenario."
@@ -267,7 +277,7 @@ data "coder_parameter" "load_scenario_ssh_bytes_per_tick" {
 }
 
 data "coder_parameter" "load_scenario_ssh_tick_interval" {
-  order       = 25
+  order       = 32
   name        = "SSH Tick Interval"
   type        = "number"
   description = "The number of milliseconds between each tick in the SSH traffic load scenario."
@@ -278,8 +288,21 @@ data "coder_parameter" "load_scenario_ssh_tick_interval" {
   }
 }
 
+data "coder_parameter" "load_scenario_ssh_traffic_percentage" {
+  order       = 33
+  name        = "SSH Traffic Percentage"
+  type        = "number"
+  description = "The percentage of workspaces that should be targeted for SSH traffic."
+  mutable     = true
+  default     = 100
+  validation {
+    min = 1
+    max = 100
+  }
+}
+
 data "coder_parameter" "load_scenario_web_terminal_traffic_duration" {
-  order       = 26
+  order       = 40
   name        = "Web Terminal Traffic Duration"
   type        = "number"
   description = "The duration of the web terminal traffic load scenario in minutes."
@@ -292,7 +315,7 @@ data "coder_parameter" "load_scenario_web_terminal_traffic_duration" {
 }
 
 data "coder_parameter" "load_scenario_web_terminal_bytes_per_tick" {
-  order       = 27
+  order       = 41
   name        = "Web Terminal Bytes Per Tick"
   type        = "number"
   description = "The number of bytes to send per tick in the web terminal traffic load scenario."
@@ -304,7 +327,7 @@ data "coder_parameter" "load_scenario_web_terminal_bytes_per_tick" {
 }
 
 data "coder_parameter" "load_scenario_web_terminal_tick_interval" {
-  order       = 28
+  order       = 42
   name        = "Web Terminal Tick Interval"
   type        = "number"
   description = "The number of milliseconds between each tick in the web terminal traffic load scenario."
@@ -315,8 +338,94 @@ data "coder_parameter" "load_scenario_web_terminal_tick_interval" {
   }
 }
 
+data "coder_parameter" "load_scenario_web_terminal_traffic_percentage" {
+  order       = 43
+  name        = "Web Terminal Traffic Percentage"
+  type        = "number"
+  description = "The percentage of workspaces that should be targeted for web terminal traffic."
+  mutable     = true
+  default     = 100
+  validation {
+    min = 1
+    max = 100
+  }
+}
+
+data "coder_parameter" "load_scenario_app_traffic_duration" {
+  order       = 50
+  name        = "App Traffic Duration"
+  type        = "number"
+  description = "The duration of the app traffic load scenario in minutes."
+  mutable     = true
+  default     = 30
+  validation {
+    min = 1
+    max = 1440 // 24 hours.
+  }
+}
+
+data "coder_parameter" "load_scenario_app_bytes_per_tick" {
+  order       = 51
+  name        = "App Bytes Per Tick"
+  type        = "number"
+  description = "The number of bytes to send per tick in the app traffic load scenario."
+  mutable     = true
+  default     = 1024
+  validation {
+    min = 1
+  }
+}
+
+data "coder_parameter" "load_scenario_app_tick_interval" {
+  order       = 52
+  name        = "App Tick Interval"
+  type        = "number"
+  description = "The number of milliseconds between each tick in the app traffic load scenario."
+  mutable     = true
+  default     = 100
+  validation {
+    min = 1
+  }
+}
+
+data "coder_parameter" "load_scenario_app_traffic_percentage" {
+  order       = 53
+  name        = "App Traffic Percentage"
+  type        = "number"
+  description = "The percentage of workspaces that should be targeted for app traffic."
+  mutable     = true
+  default     = 100
+  validation {
+    min = 1
+    max = 100
+  }
+}
+
+data "coder_parameter" "load_scenario_app_traffic_mode" {
+  order       = 54
+  name        = "App Traffic Mode"
+  default     = "wsec"
+  description = "The mode of the app traffic load scenario."
+  mutable     = true
+  option {
+    name        = "WebSocket Echo"
+    value       = "wsec"
+    description = "Send traffic to the workspace via the app websocket and read it back."
+  }
+  option {
+    name        = "WebSocket Read (Random)"
+    value       = "wsra"
+    description = "Read traffic from the workspace via the app websocket."
+  }
+  option {
+    name        = "WebSocket Write (Discard)"
+    value       = "wsdi"
+    description = "Send traffic to the workspace via the app websocket."
+  }
+}
+
 data "coder_parameter" "load_scenario_dashboard_traffic_duration" {
-  order       = 29
+  order       = 60
   name        = "Dashboard Traffic Duration"
   type        = "number"
   description = "The duration of the dashboard traffic load scenario in minutes."
@@ -328,8 +437,21 @@ data "coder_parameter" "load_scenario_dashboard_traffic_duration" {
   }
 }
 
+data "coder_parameter" "load_scenario_dashboard_traffic_percentage" {
+  order       = 61
+  name        = "Dashboard Traffic Percentage"
+  type        = "number"
+  description = "The percentage of users that should be targeted for dashboard traffic."
+  mutable     = true
+  default     = 100
+  validation {
+    min = 1
+    max = 100
+  }
+}
+
 data "coder_parameter" "load_scenario_baseline_duration" {
-  order       = 26
+  order       = 100
   name        = "Baseline Wait Duration"
   type        = "number"
   description = "The duration to wait before starting a load scenario in minutes."
@@ -342,7 +464,7 @@ data "coder_parameter" "load_scenario_baseline_duration" {
 }
 
 data "coder_parameter" "greedy_agent" {
-  order       = 30
+  order       = 200
   type        = "bool"
   name        = "Greedy Agent"
   default     = false
@@ -352,7 +474,7 @@ data "coder_parameter" "greedy_agent" {
 }
 
 data "coder_parameter" "greedy_agent_template" {
-  order        = 31
+  order        = 201
   name         = "Greedy Agent Template"
   display_name = "Greedy Agent Template"
   description  = "The template used for the greedy agent workspace (must not be same as workspace template)."
@@ -432,6 +554,7 @@ resource "coder_agent" "main" {
     SCALETEST_RUN_ID : local.scaletest_run_id,
     SCALETEST_RUN_DIR : local.scaletest_run_dir,
     SCALETEST_RUN_START_TIME : local.scaletest_run_start_time,
+    SCALETEST_PROMETHEUS_START_PORT : "21112",
 
     # Comment is a scaletest param, but we want to surface it separately from
     # the rest, so we use a different name.
@@ -443,13 +566,22 @@ resource "coder_agent" "main" {
     SCALETEST_PARAM_CREATE_CONCURRENCY : "${data.coder_parameter.create_concurrency.value}",
     SCALETEST_PARAM_CLEANUP_STRATEGY : data.coder_parameter.cleanup_strategy.value,
     SCALETEST_PARAM_LOAD_SCENARIOS : data.coder_parameter.load_scenarios.value,
+    SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY : data.coder_parameter.load_scenario_run_concurrently.value ? "1" : "0",
     SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_DURATION : "${data.coder_parameter.load_scenario_ssh_traffic_duration.value}",
     SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_BYTES_PER_TICK : "${data.coder_parameter.load_scenario_ssh_bytes_per_tick.value}",
     SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_TICK_INTERVAL : "${data.coder_parameter.load_scenario_ssh_tick_interval.value}",
+    SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_PERCENTAGE : "${data.coder_parameter.load_scenario_ssh_traffic_percentage.value}",
     SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_DURATION : "${data.coder_parameter.load_scenario_web_terminal_traffic_duration.value}",
     SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_BYTES_PER_TICK : "${data.coder_parameter.load_scenario_web_terminal_bytes_per_tick.value}",
     SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_TICK_INTERVAL : "${data.coder_parameter.load_scenario_web_terminal_tick_interval.value}",
+    SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_PERCENTAGE : "${data.coder_parameter.load_scenario_web_terminal_traffic_percentage.value}",
+    SCALETEST_PARAM_LOAD_SCENARIO_APP_TRAFFIC_DURATION : "${data.coder_parameter.load_scenario_app_traffic_duration.value}",
+    SCALETEST_PARAM_LOAD_SCENARIO_APP_TRAFFIC_BYTES_PER_TICK : "${data.coder_parameter.load_scenario_app_bytes_per_tick.value}",
+    SCALETEST_PARAM_LOAD_SCENARIO_APP_TRAFFIC_TICK_INTERVAL : "${data.coder_parameter.load_scenario_app_tick_interval.value}",
+    SCALETEST_PARAM_LOAD_SCENARIO_APP_TRAFFIC_PERCENTAGE : "${data.coder_parameter.load_scenario_app_traffic_percentage.value}",
+    SCALETEST_PARAM_LOAD_SCENARIO_APP_TRAFFIC_MODE : data.coder_parameter.load_scenario_app_traffic_mode.value,
     SCALETEST_PARAM_LOAD_SCENARIO_DASHBOARD_TRAFFIC_DURATION : "${data.coder_parameter.load_scenario_dashboard_traffic_duration.value}",
+    SCALETEST_PARAM_LOAD_SCENARIO_DASHBOARD_TRAFFIC_PERCENTAGE : "${data.coder_parameter.load_scenario_dashboard_traffic_percentage.value}",
     SCALETEST_PARAM_LOAD_SCENARIO_BASELINE_DURATION : "${data.coder_parameter.load_scenario_baseline_duration.value}",
     SCALETEST_PARAM_GREEDY_AGENT : data.coder_parameter.greedy_agent.value ? "1" : "0",
     SCALETEST_PARAM_GREEDY_AGENT_TEMPLATE : data.coder_parameter.greedy_agent_template.value,
@@ -709,10 +841,14 @@ resource "kubernetes_pod" "main" {
         name       = "home"
         read_only  = false
       }
-      port {
-        container_port = 21112
-        name           = "prometheus-http"
-        protocol       = "TCP"
+      dynamic "port" {
+        for_each = data.coder_parameter.load_scenario_run_concurrently.value ? jsondecode(data.coder_parameter.load_scenarios.value) : [""]
+        iterator = it
+        content {
+          container_port = 21112 + it.key
+          name           = "prom-http${it.key}"
+          protocol       = "TCP"
+        }
       }
     }
 
@@ -787,8 +923,8 @@ resource "kubernetes_manifest" "pod_monitor" {
         }
       }
       podMetricsEndpoints = [
-        {
-          port     = "prometheus-http"
+        for i, _ in data.coder_parameter.load_scenario_run_concurrently.value ? jsondecode(data.coder_parameter.load_scenarios.value) : [""] : {
+          port     = "prom-http${i}"
           interval = "15s"
         }
       ]
diff --git a/scaletest/templates/scaletest-runner/scripts/run.sh b/scaletest/templates/scaletest-runner/scripts/run.sh
index 03bafc7cf6a84..c117ca36cf185 100755
--- a/scaletest/templates/scaletest-runner/scripts/run.sh
+++ b/scaletest/templates/scaletest-runner/scripts/run.sh
@@ -90,15 +90,33 @@ else
 	}
 fi
 
+if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 1 ]]; then
+	start_phase "Load scenarios: ${SCALETEST_PARAM_LOAD_SCENARIOS[*]}"
+fi
+
+declare -a pids=()
 declare -A failed=()
+target_start=0
+target_end=-1
 for scenario in "${SCALETEST_PARAM_LOAD_SCENARIOS[@]}"; do
-	start_phase "Load scenario: ${scenario}"
+	if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 0 ]]; then
+		start_phase "Load scenario: ${scenario}"
+	fi
 
 	set +e
 	status=0
 	case "${scenario}" in
 	"SSH Traffic")
 		greedy_agent_traffic "${SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_DURATION}" "${scenario}" &
+		greedy_agent_traffic_pid=$!
+
+		target_count=$(jq -n --argjson percentage "${SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_PERCENTAGE}" --argjson num_workspaces "${SCALETEST_PARAM_NUM_WORKSPACES}" '$percentage / 100 * $num_workspaces | floor')
+		target_end=$((target_start + target_count))
+		if [[ ${target_end} -gt ${SCALETEST_PARAM_NUM_WORKSPACES} ]]; then
+			log "WARNING: Target count ${target_end} exceeds number of workspaces ${SCALETEST_PARAM_NUM_WORKSPACES}, using ${SCALETEST_PARAM_NUM_WORKSPACES} instead."
+			target_start=0
+			target_end=${target_count}
+		fi
 		coder exp scaletest workspace-traffic \
 			--template "${SCALETEST_PARAM_TEMPLATE}" \
 			--ssh \
@@ -107,17 +125,34 @@ for scenario in "${SCALETEST_PARAM_LOAD_SCENARIOS[@]}"; do
 			--timeout "${SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_DURATION}m" \
 			--job-timeout "${SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_DURATION}m30s" \
 			--output json:"${SCALETEST_RESULTS_DIR}/traffic-ssh.json" \
-			"${non_greedy_agent_traffic_args[@]}"
-		status=$?
-		wait
+			--scaletest-prometheus-address "0.0.0.0:${SCALETEST_PROMETHEUS_START_PORT}" \
+			--target-workspaces "${target_start}:${target_end}" \
+			"${non_greedy_agent_traffic_args[@]}" &
+		pids+=($!)
+		if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 0 ]]; then
+			wait "${pids[-1]}"
+			status=$?
+			show_json "${SCALETEST_RESULTS_DIR}/traffic-ssh.json"
+		else
+			SCALETEST_PROMETHEUS_START_PORT=$((SCALETEST_PROMETHEUS_START_PORT + 1))
+		fi
+		wait "${greedy_agent_traffic_pid}"
 		status2=$?
 		if [[ ${status} == 0 ]]; then
 			status=${status2}
 		fi
-		show_json "${SCALETEST_RESULTS_DIR}/traffic-ssh.json"
 		;;
 	"Web Terminal Traffic")
 		greedy_agent_traffic "${SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_DURATION}" "${scenario}" &
+		greedy_agent_traffic_pid=$!
+
+		target_count=$(jq -n --argjson percentage "${SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_PERCENTAGE}" --argjson num_workspaces "${SCALETEST_PARAM_NUM_WORKSPACES}" '$percentage / 100 * $num_workspaces | floor')
+		target_end=$((target_start + target_count))
+		if [[ ${target_end} -gt ${SCALETEST_PARAM_NUM_WORKSPACES} ]]; then
+			log "WARNING: Target count ${target_end} exceeds number of workspaces ${SCALETEST_PARAM_NUM_WORKSPACES}, using ${SCALETEST_PARAM_NUM_WORKSPACES} instead."
+			target_start=0
+			target_end=${target_count}
+		fi
 		coder exp scaletest workspace-traffic \
 			--template "${SCALETEST_PARAM_TEMPLATE}" \
 			--bytes-per-tick "${SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_BYTES_PER_TICK}" \
@@ -125,37 +160,114 @@ for scenario in "${SCALETEST_PARAM_LOAD_SCENARIOS[@]}"; do
 			--timeout "${SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_DURATION}m" \
 			--job-timeout "${SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_DURATION}m30s" \
 			--output json:"${SCALETEST_RESULTS_DIR}/traffic-web-terminal.json" \
-			"${non_greedy_agent_traffic_args[@]}"
-		status=$?
-		wait
+			--scaletest-prometheus-address "0.0.0.0:${SCALETEST_PROMETHEUS_START_PORT}" \
+			--target-workspaces "${target_start}:${target_end}" \
+			"${non_greedy_agent_traffic_args[@]}" &
+		pids+=($!)
+		if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 0 ]]; then
+			wait "${pids[-1]}"
+			status=$?
+			show_json "${SCALETEST_RESULTS_DIR}/traffic-web-terminal.json"
+		else
+			SCALETEST_PROMETHEUS_START_PORT=$((SCALETEST_PROMETHEUS_START_PORT + 1))
+		fi
+		wait "${greedy_agent_traffic_pid}"
+		status2=$?
+		if [[ ${status} == 0 ]]; then
+			status=${status2}
+		fi
+		;;
+	"App Traffic")
+		greedy_agent_traffic "${SCALETEST_PARAM_LOAD_SCENARIO_APP_TRAFFIC_DURATION}" "${scenario}" &
+		greedy_agent_traffic_pid=$!
+
+		target_count=$(jq -n --argjson percentage "${SCALETEST_PARAM_LOAD_SCENARIO_APP_TRAFFIC_PERCENTAGE}" --argjson num_workspaces "${SCALETEST_PARAM_NUM_WORKSPACES}" '$percentage / 100 * $num_workspaces | floor')
+		target_end=$((target_start + target_count))
+		if [[ ${target_end} -gt ${SCALETEST_PARAM_NUM_WORKSPACES} ]]; then
+			log "WARNING: Target count ${target_end} exceeds number of workspaces ${SCALETEST_PARAM_NUM_WORKSPACES}, using ${SCALETEST_PARAM_NUM_WORKSPACES} instead."
+			target_start=0
+			target_end=${target_count}
+		fi
+		coder exp scaletest workspace-traffic \
+			--template "${SCALETEST_PARAM_TEMPLATE}" \
+			--bytes-per-tick "${SCALETEST_PARAM_LOAD_SCENARIO_APP_TRAFFIC_BYTES_PER_TICK}" \
+			--tick-interval "${SCALETEST_PARAM_LOAD_SCENARIO_APP_TRAFFIC_TICK_INTERVAL}ms" \
+			--timeout "${SCALETEST_PARAM_LOAD_SCENARIO_APP_TRAFFIC_DURATION}m" \
+			--job-timeout "${SCALETEST_PARAM_LOAD_SCENARIO_APP_TRAFFIC_DURATION}m30s" \
+			--output json:"${SCALETEST_RESULTS_DIR}/traffic-app.json" \
+			--scaletest-prometheus-address "0.0.0.0:${SCALETEST_PROMETHEUS_START_PORT}" \
+			--app "${SCALETEST_PARAM_LOAD_SCENARIO_APP_TRAFFIC_MODE}" \
+			--target-workspaces "${target_start}:${target_end}" \
+			"${non_greedy_agent_traffic_args[@]}" &
+		pids+=($!)
+		if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 0 ]]; then
+			wait "${pids[-1]}"
+			status=$?
+			show_json "${SCALETEST_RESULTS_DIR}/traffic-app.json"
+		else
+			SCALETEST_PROMETHEUS_START_PORT=$((SCALETEST_PROMETHEUS_START_PORT + 1))
+		fi
+		wait "${greedy_agent_traffic_pid}"
 		status2=$?
 		if [[ ${status} == 0 ]]; then
 			status=${status2}
 		fi
-		show_json "${SCALETEST_RESULTS_DIR}/traffic-web-terminal.json"
 		;;
 	"Dashboard Traffic")
+		target_count=$(jq -n --argjson percentage "${SCALETEST_PARAM_LOAD_SCENARIO_DASHBOARD_TRAFFIC_PERCENTAGE}" --argjson num_workspaces "${SCALETEST_PARAM_NUM_WORKSPACES}" '$percentage / 100 * $num_workspaces | floor')
+		target_end=$((target_start + target_count))
+		if [[ ${target_end} -gt ${SCALETEST_PARAM_NUM_WORKSPACES} ]]; then
+			log "WARNING: Target count ${target_end} exceeds number of workspaces ${SCALETEST_PARAM_NUM_WORKSPACES}, using ${SCALETEST_PARAM_NUM_WORKSPACES} instead."
+			target_start=0
+			target_end=${target_count}
+		fi
 		coder exp scaletest dashboard \
 			--timeout "${SCALETEST_PARAM_LOAD_SCENARIO_DASHBOARD_TRAFFIC_DURATION}m" \
 			--job-timeout "${SCALETEST_PARAM_LOAD_SCENARIO_DASHBOARD_TRAFFIC_DURATION}m30s" \
 			--output json:"${SCALETEST_RESULTS_DIR}/traffic-dashboard.json" \
-			>"${SCALETEST_RESULTS_DIR}/traffic-dashboard-output.log"
-		status=$?
-		show_json "${SCALETEST_RESULTS_DIR}/traffic-dashboard.json"
+			--scaletest-prometheus-address "0.0.0.0:${SCALETEST_PROMETHEUS_START_PORT}" \
+			--target-users "${target_start}:${target_end}" \
+			>"${SCALETEST_RESULTS_DIR}/traffic-dashboard-output.log" &
+		pids+=($!)
+		if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 0 ]]; then
+			wait "${pids[-1]}"
+			status=$?
+			show_json "${SCALETEST_RESULTS_DIR}/traffic-dashboard.json"
+		else
+			SCALETEST_PROMETHEUS_START_PORT=$((SCALETEST_PROMETHEUS_START_PORT + 1))
+		fi
 		;;
 
 	# Debug scenarios, for testing the runner.
 	"debug:greedy_agent_traffic")
-		greedy_agent_traffic 10 "${scenario}"
+		greedy_agent_traffic 10 "${scenario}" &
 		status=$?
 		;;
 	"debug:success")
-		maybedryrun "$DRY_RUN" sleep 10
-		status=0
+		{
+			maybedryrun "$DRY_RUN" sleep 10
+			true
+		} &
+		pids+=($!)
+		if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 0 ]]; then
+			wait "${pids[-1]}"
+			status=$?
+		else
+			SCALETEST_PROMETHEUS_START_PORT=$((SCALETEST_PROMETHEUS_START_PORT + 1))
+		fi
 		;;
 	"debug:error")
-		maybedryrun "$DRY_RUN" sleep 10
-		status=1
+		{
+			maybedryrun "$DRY_RUN" sleep 10
+			false
+		} &
+		pids+=($!)
+		if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 0 ]]; then
+			wait "${pids[-1]}"
+			status=$?
+		else
+			SCALETEST_PROMETHEUS_START_PORT=$((SCALETEST_PROMETHEUS_START_PORT + 1))
+		fi
 		;;
 
 	*)
@@ -163,23 +275,42 @@ for scenario in "${SCALETEST_PARAM_LOAD_SCENARIOS[@]}"; do
 		;;
 	esac
 	set -e
+
+	# Allow targeting to be distributed evenly across workspaces when each
+	# scenario is run concurrently and all percentages add up to 100.
+	target_start=${target_end}
+
+	if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 0 ]]; then
+		if ((status > 0)); then
+			log "Load scenario failed: ${scenario} (exit=${status})"
+			failed+=(["${scenario}"]="$status")
+			PHASE_ADD_TAGS=error end_phase
+		else
+			end_phase
+		fi
+
+		wait_baseline "${SCALETEST_PARAM_LOAD_SCENARIO_BASELINE_DURATION}"
+	fi
+done
+
+if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 1 ]]; then
+	wait "${pids[@]}"
+	status=$?
 	if ((status > 0)); then
-		log "Load scenario failed: ${scenario} (exit=${status})"
-		failed+=(["${scenario}"]="$status")
+		log "One or more load scenarios failed: ${SCALETEST_PARAM_LOAD_SCENARIOS[*]} (exit=${status})"
 		PHASE_ADD_TAGS=error end_phase
+		exit 1
 	else
 		end_phase
 	fi
-
-	wait_baseline "${SCALETEST_PARAM_LOAD_SCENARIO_BASELINE_DURATION}"
-done
-
-if ((${#failed[@]} > 0)); then
-	log "Load scenarios failed: ${!failed[*]}"
-	for scenario in "${!failed[@]}"; do
-		log "  ${scenario}: exit=${failed[$scenario]}"
-	done
-	exit 1
+else
+	if ((${#failed[@]} > 0)); then
+		log "Load scenarios failed: ${!failed[*]}"
+		for scenario in "${!failed[@]}"; do
+			log "  ${scenario}: exit=${failed[$scenario]}"
+		done
+		exit 1
+	fi
 fi
 
 log "Scaletest complete!"
diff --git a/scaletest/templates/scaletest-runner/startup.sh b/scaletest/templates/scaletest-runner/startup.sh
index 45bf4fb9ebd5c..3e4eb94f41810 100755
--- a/scaletest/templates/scaletest-runner/startup.sh
+++ b/scaletest/templates/scaletest-runner/startup.sh
@@ -8,6 +8,11 @@ if [[ ${SCALETEST_PARAM_GREEDY_AGENT_TEMPLATE} == "${SCALETEST_PARAM_TEMPLATE}"
 	exit 1
 fi
 
+if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 1 ]] && [[ ${SCALETEST_PARAM_GREEDY_AGENT} == 1 ]]; then
+	echo "ERROR: Load scenario concurrency and greedy agent test cannot be enabled at the same time." >&2
+	exit 1
+fi
+
 # Unzip scripts and add to path.
 # shellcheck disable=SC2153
 echo "Extracting scaletest scripts into ${SCRIPTS_DIR}..."

From 17605a776d7bf03e9fa00fa90278a6d4f988a17a Mon Sep 17 00:00:00 2001
From: Mathias Fredriksson <mafredri@gmail.com>
Date: Mon, 22 Jan 2024 17:15:06 +0200
Subject: [PATCH 02/18] chore(scaletest/templates): bump runner cpu to 32 and memory to 256

---
Note (commentary, not part of the commit): `local.cpu` and `local.memory` feed
the runner container's `resources.limits` as "${local.cpu}" and
"${local.memory}Gi"; see the `limits` lines removed by PATCH 03/18.

 scaletest/templates/scaletest-runner/main.tf | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/scaletest/templates/scaletest-runner/main.tf b/scaletest/templates/scaletest-runner/main.tf
index cd1d95cbd27b4..7bb155312084d 100644
--- a/scaletest/templates/scaletest-runner/main.tf
+++ b/scaletest/templates/scaletest-runner/main.tf
@@ -39,8 +39,8 @@ locals {
   workspace_pod_instance                         = "coder-workspace-${lower(data.coder_workspace.me.owner)}-${lower(data.coder_workspace.me.name)}"
   workspace_pod_termination_grace_period_seconds = 5 * 60 * 60 # 5 hours (cleanup timeout).
   service_account_name                           = "scaletest-sa"
-  cpu                                            = 16
-  memory                                         = 64
+  cpu                                            = 32
+  memory                                         = 256
   home_disk_size                                 = 10
   scaletest_run_id                               = "scaletest-${replace(time_static.start_time.rfc3339, ":", "-")}"
   scaletest_run_dir                              = "/home/coder/${local.scaletest_run_id}"

From 806c73f9dee79601fbe5517b770402bbe2d2fc5e Mon Sep 17 00:00:00 2001
From: Mathias Fredriksson <mafredri@gmail.com>
Date: Mon, 22 Jan 2024 17:16:32 +0200
Subject: [PATCH 03/18] chore(scaletest/templates): remove runner pod cpu/memory limits

---
Note (commentary, not part of the commit): after this patch the container
keeps only its resource requests (cpu 250m, memory 512Mi); the `limits` map
and the `cpu` / `memory` locals are both removed.
NOTE(review): any reference to local.cpu or local.memory outside the hunks
shown here would now fail to resolve - confirm none remain in main.tf.

 scaletest/templates/scaletest-runner/main.tf | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/scaletest/templates/scaletest-runner/main.tf b/scaletest/templates/scaletest-runner/main.tf
index 7bb155312084d..82b4e8e15f955 100644
--- a/scaletest/templates/scaletest-runner/main.tf
+++ b/scaletest/templates/scaletest-runner/main.tf
@@ -39,8 +39,6 @@ locals {
   workspace_pod_instance                         = "coder-workspace-${lower(data.coder_workspace.me.owner)}-${lower(data.coder_workspace.me.name)}"
   workspace_pod_termination_grace_period_seconds = 5 * 60 * 60 # 5 hours (cleanup timeout).
   service_account_name                           = "scaletest-sa"
-  cpu                                            = 32
-  memory                                         = 256
   home_disk_size                                 = 10
   scaletest_run_id                               = "scaletest-${replace(time_static.start_time.rfc3339, ":", "-")}"
   scaletest_run_dir                              = "/home/coder/${local.scaletest_run_id}"
@@ -825,16 +823,10 @@ resource "kubernetes_pod" "main" {
         }
       }
       resources {
-        # Set requests and limits values such that we can do performant
-        # execution of `coder scaletest` commands.
         requests = {
           "cpu"    = "250m"
           "memory" = "512Mi"
         }
-        limits = {
-          "cpu"    = "${local.cpu}"
-          "memory" = "${local.memory}Gi"
-        }
       }
       volume_mount {
         mount_path = "/home/coder"

From 96100c0b2497bbf52526893176cb3905bf0c5b69 Mon Sep 17 00:00:00 2001
From: Mathias Fredriksson <mafredri@gmail.com>
Date: Tue, 23 Jan 2024 16:42:46 +0200
Subject: [PATCH 04/18] feat(scaletest/templates): allow skipping cleanup in prepare and on shutdown

---
Notes (commentary, not part of the commit):

 * main.tf: new ephemeral bool parameter "Cleanup before scaletest" (default
   true), exported to the agent as SCALETEST_PARAM_CLEANUP_PREPARE ("1"/"0").
 * prepare.sh: only calls `cleanup.sh prepare` when that variable is 1.
 * cleanup.sh: the `coder exp scaletest cleanup` phase is skipped for the new
   event `shutdown_scale_down_only`; the provisioner scale-down still runs for
   every event except `prepare` and now closes its phase with `end_phase`.
 * shutdown.sh: passes `shutdown` only when SCALETEST_PARAM_CLEANUP_STRATEGY
   is `on_stop`, otherwise `shutdown_scale_down_only`.
 * NOTE(review): cleanup output is piped through `tee`; whether a failing
   cleanup fails the script depends on `pipefail`, which is not visible in
   these hunks - confirm.

 scaletest/templates/scaletest-runner/main.tf    | 11 +++++++++++
 .../scaletest-runner/scripts/cleanup.sh         | 17 ++++++++++-------
 .../scaletest-runner/scripts/prepare.sh         |  6 ++++--
 .../templates/scaletest-runner/shutdown.sh      |  6 +++++-
 4 files changed, 30 insertions(+), 10 deletions(-)

diff --git a/scaletest/templates/scaletest-runner/main.tf b/scaletest/templates/scaletest-runner/main.tf
index 82b4e8e15f955..5a97465dcb102 100644
--- a/scaletest/templates/scaletest-runner/main.tf
+++ b/scaletest/templates/scaletest-runner/main.tf
@@ -169,6 +169,16 @@ data "coder_parameter" "cleanup_strategy" {
   }
 }
 
+data "coder_parameter" "cleanup_prepare" {
+  order       = 14
+  type        = "bool"
+  name        = "Cleanup before scaletest"
+  default     = true
+  description = "Cleanup existing scaletest users and workspaces before the scaletest starts (prepare phase)."
+  mutable     = true
+  ephemeral   = true
+}
+
 
 data "coder_parameter" "workspace_template" {
   order        = 20
@@ -563,6 +573,7 @@ resource "coder_agent" "main" {
     SCALETEST_PARAM_NUM_WORKSPACES : data.coder_parameter.num_workspaces.value,
     SCALETEST_PARAM_CREATE_CONCURRENCY : "${data.coder_parameter.create_concurrency.value}",
     SCALETEST_PARAM_CLEANUP_STRATEGY : data.coder_parameter.cleanup_strategy.value,
+    SCALETEST_PARAM_CLEANUP_PREPARE : data.coder_parameter.cleanup_prepare.value ? "1" : "0",
     SCALETEST_PARAM_LOAD_SCENARIOS : data.coder_parameter.load_scenarios.value,
     SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY : data.coder_parameter.load_scenario_run_concurrently.value ? "1" : "0",
     SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_DURATION : "${data.coder_parameter.load_scenario_ssh_traffic_duration.value}",
diff --git a/scaletest/templates/scaletest-runner/scripts/cleanup.sh b/scaletest/templates/scaletest-runner/scripts/cleanup.sh
index ecdd086d9a4e0..b65d261308788 100755
--- a/scaletest/templates/scaletest-runner/scripts/cleanup.sh
+++ b/scaletest/templates/scaletest-runner/scripts/cleanup.sh
@@ -21,17 +21,20 @@ if [[ $event = manual ]]; then
 	fi
 fi
 
-start_phase "Cleanup (${event})"
-coder exp scaletest cleanup \
-	--cleanup-job-timeout 2h \
-	--cleanup-timeout 5h |
-	tee "${SCALETEST_RESULTS_DIR}/cleanup-${event}.txt"
-end_phase
+if [[ $event != shutdown_scale_down_only ]]; then
+	start_phase "Cleanup (${event})"
+	coder exp scaletest cleanup \
+		--cleanup-job-timeout 2h \
+		--cleanup-timeout 5h \
+		| tee "${SCALETEST_RESULTS_DIR}/cleanup-${event}.txt"
+	end_phase
+fi
 
 if [[ $event != prepare ]]; then
-	start_phase "Scaling down provisioners..."
+	start_phase "Scale down provisioners"
 	maybedryrun "$DRY_RUN" kubectl scale deployment/coder-provisioner --replicas 1
 	maybedryrun "$DRY_RUN" kubectl rollout status deployment/coder-provisioner
+	end_phase
 fi
 
 if [[ $event = manual ]]; then
diff --git a/scaletest/templates/scaletest-runner/scripts/prepare.sh b/scaletest/templates/scaletest-runner/scripts/prepare.sh
index e7e6c4d2a292a..90b2dd05f945f 100755
--- a/scaletest/templates/scaletest-runner/scripts/prepare.sh
+++ b/scaletest/templates/scaletest-runner/scripts/prepare.sh
@@ -47,8 +47,10 @@ unset CODER_SESSION_TOKEN
 echo -n "${token}" >"${CODER_CONFIG_DIR}/session"
 [[ $VERBOSE == 1 ]] && set -x # Restore logging (if enabled).
 
-log "Cleaning up from previous runs (if applicable)..."
-"${SCRIPTS_DIR}/cleanup.sh" "prepare"
+if [[ ${SCALETEST_PARAM_CLEANUP_PREPARE} == 1 ]]; then
+	log "Cleaning up from previous runs (if applicable)..."
+	"${SCRIPTS_DIR}/cleanup.sh" prepare
+fi
 
 log "Preparation complete!"
 
diff --git a/scaletest/templates/scaletest-runner/shutdown.sh b/scaletest/templates/scaletest-runner/shutdown.sh
index d5c81366b1217..9e75864d73120 100755
--- a/scaletest/templates/scaletest-runner/shutdown.sh
+++ b/scaletest/templates/scaletest-runner/shutdown.sh
@@ -14,7 +14,11 @@ trap cleanup EXIT
 
 annotate_grafana "workspace" "Agent stopping..."
 
-"${SCRIPTS_DIR}/cleanup.sh" shutdown
+shutdown_event=shutdown_scale_down_only
+if [[ ${SCALETEST_PARAM_CLEANUP_STRATEGY} == on_stop ]]; then
+	shutdown_event=shutdown
+fi
+"${SCRIPTS_DIR}/cleanup.sh" "${shutdown_event}"
 
 annotate_grafana_end "workspace" "Agent running"
 

From 569cfc174055ebfbc42ed4fc080e5abf2c56fdf1 Mon Sep 17 00:00:00 2001
From: Mathias Fredriksson <mafredri@gmail.com>
Date: Wed, 24 Jan 2024 13:05:30 +0200
Subject: [PATCH 05/18] feat(scaletest/templates): allow staggered start of concurrent scenarios

---
 scaletest/templates/scaletest-runner/main.tf  | 10 ++++++
 .../templates/scaletest-runner/scripts/run.sh | 34 ++++++++++++++++---
 2 files changed, 40 insertions(+), 4 deletions(-)

diff --git a/scaletest/templates/scaletest-runner/main.tf b/scaletest/templates/scaletest-runner/main.tf
index 5a97465dcb102..4ecf73b77616c 100644
--- a/scaletest/templates/scaletest-runner/main.tf
+++ b/scaletest/templates/scaletest-runner/main.tf
@@ -259,6 +259,15 @@ data "coder_parameter" "load_scenario_run_concurrently" {
   mutable     = true
 }
 
+data "coder_parameter" "load_scenario_concurrency_staggering" {
+  order       = 23
+  name        = "Load Scenario Concurrency Staggering"
+  type        = "number"
+  default     = 3
+  description = "The number of minutes to wait between starting each load scenario when run concurrently."
+  mutable     = true
+}
+
 data "coder_parameter" "load_scenario_ssh_traffic_duration" {
   order       = 30
   name        = "SSH Traffic Duration"
@@ -576,6 +585,7 @@ resource "coder_agent" "main" {
     SCALETEST_PARAM_CLEANUP_PREPARE : data.coder_parameter.cleanup_prepare.value ? "1" : "0",
     SCALETEST_PARAM_LOAD_SCENARIOS : data.coder_parameter.load_scenarios.value,
     SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY : data.coder_parameter.load_scenario_run_concurrently.value ? "1" : "0",
+    SCALETEST_PARAM_LOAD_SCENARIO_CONCURRENCY_STAGGERING : "${data.coder_parameter.load_scenario_concurrency_staggering.value}",
     SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_DURATION : "${data.coder_parameter.load_scenario_ssh_traffic_duration.value}",
     SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_BYTES_PER_TICK : "${data.coder_parameter.load_scenario_ssh_bytes_per_tick.value}",
     SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_TICK_INTERVAL : "${data.coder_parameter.load_scenario_ssh_tick_interval.value}",
diff --git a/scaletest/templates/scaletest-runner/scripts/run.sh b/scaletest/templates/scaletest-runner/scripts/run.sh
index c117ca36cf185..fd9dbcd84e9e3 100755
--- a/scaletest/templates/scaletest-runner/scripts/run.sh
+++ b/scaletest/templates/scaletest-runner/scripts/run.sh
@@ -94,10 +94,32 @@ if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 1 ]]; then
 	start_phase "Load scenarios: ${SCALETEST_PARAM_LOAD_SCENARIOS[*]}"
 fi
 
+run_scenario_cmd() {
+	local scenario=${1}
+	shift
+	local command=("$@")
+
+	set +e
+	if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 1 ]]; then
+		annotate_grafana scenario "Load scenario: ${scenario}"
+	fi
+	"${command[@]}"
+	status=${?}
+	if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 1 ]]; then
+		export GRAFANA_ADD_TAGS=
+		if [[ ${status} != 0 ]]; then
+			GRAFANA_ADD_TAGS=error
+		fi
+		annotate_grafana_end scenario "Load scenario: ${scenario}"
+	fi
+	exit "${status}"
+}
+
 declare -a pids=()
 declare -A failed=()
 target_start=0
 target_end=-1
+
 for scenario in "${SCALETEST_PARAM_LOAD_SCENARIOS[@]}"; do
 	if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 0 ]]; then
 		start_phase "Load scenario: ${scenario}"
@@ -117,7 +139,7 @@ for scenario in "${SCALETEST_PARAM_LOAD_SCENARIOS[@]}"; do
 			target_start=0
 			target_end=${target_count}
 		fi
-		coder exp scaletest workspace-traffic \
+		run_scenario_cmd "${scenario}" coder exp scaletest workspace-traffic \
 			--template "${SCALETEST_PARAM_TEMPLATE}" \
 			--ssh \
 			--bytes-per-tick "${SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_BYTES_PER_TICK}" \
@@ -153,7 +175,7 @@ for scenario in "${SCALETEST_PARAM_LOAD_SCENARIOS[@]}"; do
 			target_start=0
 			target_end=${target_count}
 		fi
-		coder exp scaletest workspace-traffic \
+		run_scenario_cmd "${scenario}" coder exp scaletest workspace-traffic \
 			--template "${SCALETEST_PARAM_TEMPLATE}" \
 			--bytes-per-tick "${SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_BYTES_PER_TICK}" \
 			--tick-interval "${SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_TICK_INTERVAL}ms" \
@@ -188,7 +210,7 @@ for scenario in "${SCALETEST_PARAM_LOAD_SCENARIOS[@]}"; do
 			target_start=0
 			target_end=${target_count}
 		fi
-		coder exp scaletest workspace-traffic \
+		run_scenario_cmd "${scenario}" coder exp scaletest workspace-traffic \
 			--template "${SCALETEST_PARAM_TEMPLATE}" \
 			--bytes-per-tick "${SCALETEST_PARAM_LOAD_SCENARIO_APP_TRAFFIC_BYTES_PER_TICK}" \
 			--tick-interval "${SCALETEST_PARAM_LOAD_SCENARIO_APP_TRAFFIC_TICK_INTERVAL}ms" \
@@ -221,7 +243,7 @@ for scenario in "${SCALETEST_PARAM_LOAD_SCENARIOS[@]}"; do
 			target_start=0
 			target_end=${target_count}
 		fi
-		coder exp scaletest dashboard \
+		run_scenario_cmd "${scenario}" coder exp scaletest dashboard \
 			--timeout "${SCALETEST_PARAM_LOAD_SCENARIO_DASHBOARD_TRAFFIC_DURATION}m" \
 			--job-timeout "${SCALETEST_PARAM_LOAD_SCENARIO_DASHBOARD_TRAFFIC_DURATION}m30s" \
 			--output json:"${SCALETEST_RESULTS_DIR}/traffic-dashboard.json" \
@@ -290,6 +312,10 @@ for scenario in "${SCALETEST_PARAM_LOAD_SCENARIOS[@]}"; do
 		fi
 
 		wait_baseline "${SCALETEST_PARAM_LOAD_SCENARIO_BASELINE_DURATION}"
+	else
+		# Stagger the start of each scenario to avoid a burst of load and deted
+		# problematic scenarios.
+		sleep $((SCALETEST_PARAM_LOAD_SCENARIO_CONCURRENCY_STAGGERING * 60))
 	fi
 done
 

From d0fbd2ce98b631104c5d6801cc5c8a7d31f7c701 Mon Sep 17 00:00:00 2001
From: Mathias Fredriksson <mafredri@gmail.com>
Date: Wed, 24 Jan 2024 15:25:28 +0200
Subject: [PATCH 06/18] improve cleanup

---
 .../scaletest-runner/scripts/cleanup.sh       | 53 ++++++++++++-------
 1 file changed, 35 insertions(+), 18 deletions(-)

diff --git a/scaletest/templates/scaletest-runner/scripts/cleanup.sh b/scaletest/templates/scaletest-runner/scripts/cleanup.sh
index b65d261308788..1dba743a0e516 100755
--- a/scaletest/templates/scaletest-runner/scripts/cleanup.sh
+++ b/scaletest/templates/scaletest-runner/scripts/cleanup.sh
@@ -12,32 +12,49 @@ if [[ -z $event ]]; then
 	event=manual
 fi
 
-if [[ $event = manual ]]; then
-	echo -n 'WARNING: This will clean up all scaletest resources, continue? (y/n) '
-	read -r -n 1
-	if [[ $REPLY != [yY] ]]; then
-		echo $'\nAborting...'
-		exit 1
-	fi
-fi
-
-if [[ $event != shutdown_scale_down_only ]]; then
+do_cleanup() {
 	start_phase "Cleanup (${event})"
 	coder exp scaletest cleanup \
 		--cleanup-job-timeout 2h \
-		--cleanup-timeout 5h \
-		| tee "${SCALETEST_RESULTS_DIR}/cleanup-${event}.txt"
+		--cleanup-timeout 5h |
+		tee "${SCALETEST_RESULTS_DIR}/cleanup-${event}.txt"
 	end_phase
-fi
+}
 
-if [[ $event != prepare ]]; then
-	start_phase "Scale down provisioners"
+do_scaledown() {
+	start_phase "Scale down provisioners (${event})"
 	maybedryrun "$DRY_RUN" kubectl scale deployment/coder-provisioner --replicas 1
 	maybedryrun "$DRY_RUN" kubectl rollout status deployment/coder-provisioner
 	end_phase
-fi
+}
+
+case "${event}" in
+manual)
+	echo -n 'WARNING: This will clean up all scaletest resources, continue? (y/n) '
+	read -r -n 1
+	if [[ $REPLY != [yY] ]]; then
+		echo $'\nAborting...'
+		exit 1
+	fi
+
+	do_cleanup
+	do_scaledown
 
-if [[ $event = manual ]]; then
 	echo 'Press any key to continue...'
 	read -s -r -n 1
-fi
+	;;
+prepare)
+	do_cleanup
+	;;
+shutdown)
+	do_cleanup
+	do_scaledown
+	;;
+shutdown_scale_down_only)
+	do_scaledown
+	;;
+*)
+	echo "Unknown event: ${event}" >&2
+	exit 1
+	;;
+esac

From 3141efe89475a88d77717a78887e0234aba35819 Mon Sep 17 00:00:00 2001
From: Mathias Fredriksson <mafredri@gmail.com>
Date: Wed, 24 Jan 2024 16:05:50 +0200
Subject: [PATCH 07/18] fix dashboard screengrabs location

---
 scaletest/templates/scaletest-runner/scripts/run.sh | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/scaletest/templates/scaletest-runner/scripts/run.sh b/scaletest/templates/scaletest-runner/scripts/run.sh
index fd9dbcd84e9e3..23ad5c500343c 100755
--- a/scaletest/templates/scaletest-runner/scripts/run.sh
+++ b/scaletest/templates/scaletest-runner/scripts/run.sh
@@ -243,6 +243,10 @@ for scenario in "${SCALETEST_PARAM_LOAD_SCENARIOS[@]}"; do
 			target_start=0
 			target_end=${target_count}
 		fi
+		# TODO: Remove this pushd/popd workaround once the dashboard traffic
+		# command is fixed (i.e. once images are no longer dumped into PWD).
+		mkdir -p dashboard
+		pushd dashboard
 		run_scenario_cmd "${scenario}" coder exp scaletest dashboard \
 			--timeout "${SCALETEST_PARAM_LOAD_SCENARIO_DASHBOARD_TRAFFIC_DURATION}m" \
 			--job-timeout "${SCALETEST_PARAM_LOAD_SCENARIO_DASHBOARD_TRAFFIC_DURATION}m30s" \
@@ -251,6 +255,7 @@ for scenario in "${SCALETEST_PARAM_LOAD_SCENARIOS[@]}"; do
 			--target-users "${target_start}:${target_end}" \
 			>"${SCALETEST_RESULTS_DIR}/traffic-dashboard-output.log" &
 		pids+=($!)
+		popd
 		if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 0 ]]; then
 			wait "${pids[-1]}"
 			status=$?

From f58bf411b0b16b5d15c16801487c69705848528d Mon Sep 17 00:00:00 2001
From: Mathias Fredriksson <mafredri@gmail.com>
Date: Wed, 24 Jan 2024 16:59:47 +0200
Subject: [PATCH 08/18] add more cleanup cases

---
 scaletest/templates/scaletest-runner/scripts/cleanup.sh | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/scaletest/templates/scaletest-runner/scripts/cleanup.sh b/scaletest/templates/scaletest-runner/scripts/cleanup.sh
index 1dba743a0e516..343f360189969 100755
--- a/scaletest/templates/scaletest-runner/scripts/cleanup.sh
+++ b/scaletest/templates/scaletest-runner/scripts/cleanup.sh
@@ -46,7 +46,8 @@ manual)
 prepare)
 	do_cleanup
 	;;
-shutdown)
+on_stop) ;; # Do nothing, handled by "shutdown".
+always | on_success | on_error | shutdown)
 	do_cleanup
 	do_scaledown
 	;;

From 4754f2f0959efeb5e76e1b728401ea9860cac555 Mon Sep 17 00:00:00 2001
From: Mathias Fredriksson <mafredri@gmail.com>
Date: Wed, 24 Jan 2024 17:02:59 +0200
Subject: [PATCH 09/18] fix updating of static time

---
 scaletest/templates/scaletest-runner/main.tf | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/scaletest/templates/scaletest-runner/main.tf b/scaletest/templates/scaletest-runner/main.tf
index 4ecf73b77616c..6b374d86f8b38 100644
--- a/scaletest/templates/scaletest-runner/main.tf
+++ b/scaletest/templates/scaletest-runner/main.tf
@@ -13,10 +13,11 @@ terraform {
 
 resource "time_static" "start_time" {
   # We con't set `count = data.coder_workspace.me.start_count` here because then
-  # we can't use this value in `locals`. The permission check is recreated on
-  # start, which will update the timestamp.
+  # we can't use this value in `locals`, but we want to trigger recreation when
+  # the scaletest is restarted.
   triggers = {
-    count : length(null_resource.permission_check)
+    count : data.coder_workspace.me.start_count
+    id : data.coder_workspace.me.start_count > 0 ? data.coder_workspace.me.id : ""
   }
 }
 

From 1236b9390998247cc7d70863307a0cfa786bd75b Mon Sep 17 00:00:00 2001
From: Mathias Fredriksson <mafredri@gmail.com>
Date: Wed, 24 Jan 2024 19:18:25 +0200
Subject: [PATCH 10/18] echo

---
 scaletest/templates/scaletest-runner/scripts/cleanup.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/scaletest/templates/scaletest-runner/scripts/cleanup.sh b/scaletest/templates/scaletest-runner/scripts/cleanup.sh
index 343f360189969..c80982497b5e9 100755
--- a/scaletest/templates/scaletest-runner/scripts/cleanup.sh
+++ b/scaletest/templates/scaletest-runner/scripts/cleanup.sh
@@ -36,6 +36,7 @@ manual)
 		echo $'\nAborting...'
 		exit 1
 	fi
+	echo
 
 	do_cleanup
 	do_scaledown

From 724e1c5354c8b0c2e2a9c3b9e5c3f99d236c917f Mon Sep 17 00:00:00 2001
From: Mathias Fredriksson <mafredri@gmail.com>
Date: Thu, 25 Jan 2024 14:47:33 +0200
Subject: [PATCH 11/18] add skip create workspaces option

---
 scaletest/templates/scaletest-runner/main.tf  | 16 ++++++++++++---
 .../templates/scaletest-runner/scripts/run.sh | 20 ++++++++++---------
 2 files changed, 24 insertions(+), 12 deletions(-)

diff --git a/scaletest/templates/scaletest-runner/main.tf b/scaletest/templates/scaletest-runner/main.tf
index 6b374d86f8b38..a143c4499fdca 100644
--- a/scaletest/templates/scaletest-runner/main.tf
+++ b/scaletest/templates/scaletest-runner/main.tf
@@ -235,9 +235,18 @@ data "coder_parameter" "num_workspaces" {
   }
 }
 
+data "coder_parameter" "skip_create_workspaces" {
+  order       = 22
+  type        = "bool"
+  name        = "DEBUG: Skip creating workspaces"
+  default     = false
+  description = "Skip creating workspaces (for resuming failed scaletests or debugging)"
+  mutable     = true
+}
+
 
 data "coder_parameter" "load_scenarios" {
-  order       = 22
+  order       = 23
   name        = "Load Scenarios"
   type        = "list(string)"
   description = "The load scenarios to run."
@@ -252,7 +261,7 @@ data "coder_parameter" "load_scenarios" {
 }
 
 data "coder_parameter" "load_scenario_run_concurrently" {
-  order       = 23
+  order       = 24
   name        = "Run Load Scenarios Concurrently"
   type        = "bool"
   default     = false
@@ -261,7 +270,7 @@ data "coder_parameter" "load_scenario_run_concurrently" {
 }
 
 data "coder_parameter" "load_scenario_concurrency_staggering" {
-  order       = 23
+  order       = 25
   name        = "Load Scenario Concurrency Staggering"
   type        = "number"
   default     = 3
@@ -581,6 +590,7 @@ resource "coder_agent" "main" {
     SCALETEST_PARAM_TEMPLATE : data.coder_parameter.workspace_template.value,
     SCALETEST_PARAM_REPO_BRANCH : data.coder_parameter.repo_branch.value,
     SCALETEST_PARAM_NUM_WORKSPACES : data.coder_parameter.num_workspaces.value,
+    SCALETEST_PARAM_SKIP_CREATE_WORKSPACES : data.coder_parameter.skip_create_workspaces.value ? "1" : "0",
     SCALETEST_PARAM_CREATE_CONCURRENCY : "${data.coder_parameter.create_concurrency.value}",
     SCALETEST_PARAM_CLEANUP_STRATEGY : data.coder_parameter.cleanup_strategy.value,
     SCALETEST_PARAM_CLEANUP_PREPARE : data.coder_parameter.cleanup_prepare.value ? "1" : "0",
diff --git a/scaletest/templates/scaletest-runner/scripts/run.sh b/scaletest/templates/scaletest-runner/scripts/run.sh
index 23ad5c500343c..50cab189fae62 100755
--- a/scaletest/templates/scaletest-runner/scripts/run.sh
+++ b/scaletest/templates/scaletest-runner/scripts/run.sh
@@ -13,15 +13,17 @@ log "Running scaletest..."
 set_status Running
 
 start_phase "Creating workspaces"
-coder exp scaletest create-workspaces \
-	--count "${SCALETEST_PARAM_NUM_WORKSPACES}" \
-	--template "${SCALETEST_PARAM_TEMPLATE}" \
-	--concurrency "${SCALETEST_PARAM_CREATE_CONCURRENCY}" \
-	--timeout 5h \
-	--job-timeout 5h \
-	--no-cleanup \
-	--output json:"${SCALETEST_RESULTS_DIR}/create-workspaces.json"
-show_json "${SCALETEST_RESULTS_DIR}/create-workspaces.json"
+if [[ ${SCALETEST_PARAM_SKIP_CREATE_WORKSPACES} == 0 ]]; then
+	coder exp scaletest create-workspaces \
+		--count "${SCALETEST_PARAM_NUM_WORKSPACES}" \
+		--template "${SCALETEST_PARAM_TEMPLATE}" \
+		--concurrency "${SCALETEST_PARAM_CREATE_CONCURRENCY}" \
+		--timeout 5h \
+		--job-timeout 5h \
+		--no-cleanup \
+		--output json:"${SCALETEST_RESULTS_DIR}/create-workspaces.json"
+	show_json "${SCALETEST_RESULTS_DIR}/create-workspaces.json"
+fi
 end_phase
 
 wait_baseline "${SCALETEST_PARAM_LOAD_SCENARIO_BASELINE_DURATION}"

From 916c74d1076a744a81eb1600c2d78edd6e572b42 Mon Sep 17 00:00:00 2001
From: Mathias Fredriksson <mafredri@gmail.com>
Date: Thu, 25 Jan 2024 16:24:55 +0200
Subject: [PATCH 12/18] fix start time regen

---
 scaletest/templates/scaletest-runner/main.tf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scaletest/templates/scaletest-runner/main.tf b/scaletest/templates/scaletest-runner/main.tf
index a143c4499fdca..094168694336c 100644
--- a/scaletest/templates/scaletest-runner/main.tf
+++ b/scaletest/templates/scaletest-runner/main.tf
@@ -17,7 +17,7 @@ resource "time_static" "start_time" {
   # the scaletest is restarted.
   triggers = {
     count : data.coder_workspace.me.start_count
-    id : data.coder_workspace.me.start_count > 0 ? data.coder_workspace.me.id : ""
+    token : data.coder_workspace.me.owner_session_token # Rely on this being re-generated every start.
   }
 }
 

From aeb9c529107271292dbe836ff84fb64084149456 Mon Sep 17 00:00:00 2001
From: Mathias Fredriksson <mafredri@gmail.com>
Date: Mon, 29 Jan 2024 19:13:33 +0200
Subject: [PATCH 13/18] add retry

---
 scaletest/templates/scaletest-runner/scripts/run.sh | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/scaletest/templates/scaletest-runner/scripts/run.sh b/scaletest/templates/scaletest-runner/scripts/run.sh
index 50cab189fae62..49004a7ff4e58 100755
--- a/scaletest/templates/scaletest-runner/scripts/run.sh
+++ b/scaletest/templates/scaletest-runner/scripts/run.sh
@@ -14,7 +14,11 @@ set_status Running
 
 start_phase "Creating workspaces"
 if [[ ${SCALETEST_PARAM_SKIP_CREATE_WORKSPACES} == 0 ]]; then
+	# Note that we allow up to 5 failures to bring up the workspace, since
+	# we're creating a lot of workspaces at once and some of them may fail
+	# due to network issues or other transient errors.
 	coder exp scaletest create-workspaces \
+		--retry 5 \
 		--count "${SCALETEST_PARAM_NUM_WORKSPACES}" \
 		--template "${SCALETEST_PARAM_TEMPLATE}" \
 		--concurrency "${SCALETEST_PARAM_CREATE_CONCURRENCY}" \

From 60df607aef32360c74df434c5f575026091974ae Mon Sep 17 00:00:00 2001
From: Mathias Fredriksson <mafredri@gmail.com>
Date: Mon, 29 Jan 2024 19:28:37 +0200
Subject: [PATCH 14/18] fix undetected error exit code

---
 .../templates/scaletest-runner/scripts/run.sh | 42 +++++++++++++------
 1 file changed, 29 insertions(+), 13 deletions(-)

diff --git a/scaletest/templates/scaletest-runner/scripts/run.sh b/scaletest/templates/scaletest-runner/scripts/run.sh
index 49004a7ff4e58..d61cea04b87d3 100755
--- a/scaletest/templates/scaletest-runner/scripts/run.sh
+++ b/scaletest/templates/scaletest-runner/scripts/run.sh
@@ -122,6 +122,7 @@ run_scenario_cmd() {
 }
 
 declare -a pids=()
+declare -A pid_to_scenario=()
 declare -A failed=()
 target_start=0
 target_end=-1
@@ -274,7 +275,13 @@ for scenario in "${SCALETEST_PARAM_LOAD_SCENARIOS[@]}"; do
 	# Debug scenarios, for testing the runner.
 	"debug:greedy_agent_traffic")
 		greedy_agent_traffic 10 "${scenario}" &
-		status=$?
+		pids+=($!)
+		if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 0 ]]; then
+			wait "${pids[-1]}"
+			status=$?
+		else
+			SCALETEST_PROMETHEUS_START_PORT=$((SCALETEST_PROMETHEUS_START_PORT + 1))
+		fi
 		;;
 	"debug:success")
 		{
@@ -324,6 +331,7 @@ for scenario in "${SCALETEST_PARAM_LOAD_SCENARIOS[@]}"; do
 
 		wait_baseline "${SCALETEST_PARAM_LOAD_SCENARIO_BASELINE_DURATION}"
 	else
+		pid_to_scenario+=(["${pids[-1]}"]="${scenario}")
 		# Stagger the start of each scenario to avoid a burst of load and deted
 		# problematic scenarios.
 		sleep $((SCALETEST_PARAM_LOAD_SCENARIO_CONCURRENCY_STAGGERING * 60))
@@ -332,22 +340,30 @@ done
 
 if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 1 ]]; then
 	wait "${pids[@]}"
-	status=$?
-	if ((status > 0)); then
-		log "One or more load scenarios failed: ${SCALETEST_PARAM_LOAD_SCENARIOS[*]} (exit=${status})"
+	# Wait on all pids will wait until all have exited, but we need to
+	# check their individual exit codes.
+	for pid in "${pids[@]}"; do
+		wait "${pid}"
+		status=${?}
+		scenario=${pid_to_scenario[${pid}]}
+		if ((status > 0)); then
+			log "Load scenario failed: ${scenario} (exit=${status})"
+			failed+=(["${scenario}"]="$status")
+		fi
+	done
+	if ((${#failed[@]} > 0)); then
 		PHASE_ADD_TAGS=error end_phase
-		exit 1
 	else
 		end_phase
 	fi
-else
-	if ((${#failed[@]} > 0)); then
-		log "Load scenarios failed: ${!failed[*]}"
-		for scenario in "${!failed[@]}"; do
-			log "  ${scenario}: exit=${failed[$scenario]}"
-		done
-		exit 1
-	fi
+fi
+
+if ((${#failed[@]} > 0)); then
+	log "Load scenarios failed: ${!failed[*]}"
+	for scenario in "${!failed[@]}"; do
+		log "  ${scenario}: exit=${failed[$scenario]}"
+	done
+	exit 1
 fi
 
 log "Scaletest complete!"

From 23d936e2927e371e65fcebb3f0ddb52c6cbdf35d Mon Sep 17 00:00:00 2001
From: Mathias Fredriksson <mafredri@gmail.com>
Date: Tue, 30 Jan 2024 12:14:14 +0200
Subject: [PATCH 15/18] minor cleanup

---
 .../templates/scaletest-runner/scripts/run.sh | 35 +++++++++----------
 1 file changed, 17 insertions(+), 18 deletions(-)

diff --git a/scaletest/templates/scaletest-runner/scripts/run.sh b/scaletest/templates/scaletest-runner/scripts/run.sh
index d61cea04b87d3..6ca913abca9cf 100755
--- a/scaletest/templates/scaletest-runner/scripts/run.sh
+++ b/scaletest/templates/scaletest-runner/scripts/run.sh
@@ -92,14 +92,10 @@ else
 		fi
 		annotate_grafana_end greedy_agent "${scenario}: Greedy agent traffic"
 
-		return ${status}
+		return "${status}"
 	}
 fi
 
-if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 1 ]]; then
-	start_phase "Load scenarios: ${SCALETEST_PARAM_LOAD_SCENARIOS[*]}"
-fi
-
 run_scenario_cmd() {
 	local scenario=${1}
 	shift
@@ -127,6 +123,9 @@ declare -A failed=()
 target_start=0
 target_end=-1
 
+if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 1 ]]; then
+	start_phase "Load scenarios: ${SCALETEST_PARAM_LOAD_SCENARIOS[*]}"
+fi
 for scenario in "${SCALETEST_PARAM_LOAD_SCENARIOS[@]}"; do
 	if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 0 ]]; then
 		start_phase "Load scenario: ${scenario}"
@@ -320,24 +319,24 @@ for scenario in "${SCALETEST_PARAM_LOAD_SCENARIOS[@]}"; do
 	# scenario is run concurrently and all percentages add up to 100.
 	target_start=${target_end}
 
-	if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 0 ]]; then
-		if ((status > 0)); then
-			log "Load scenario failed: ${scenario} (exit=${status})"
-			failed+=(["${scenario}"]="$status")
-			PHASE_ADD_TAGS=error end_phase
-		else
-			end_phase
-		fi
-
-		wait_baseline "${SCALETEST_PARAM_LOAD_SCENARIO_BASELINE_DURATION}"
-	else
+	if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 1 ]]; then
 		pid_to_scenario+=(["${pids[-1]}"]="${scenario}")
 		# Stagger the start of each scenario to avoid a burst of load and deted
 		# problematic scenarios.
 		sleep $((SCALETEST_PARAM_LOAD_SCENARIO_CONCURRENCY_STAGGERING * 60))
+		continue
 	fi
-done
 
+	if ((status > 0)); then
+		log "Load scenario failed: ${scenario} (exit=${status})"
+		failed+=(["${scenario}"]="${status}")
+		PHASE_ADD_TAGS=error end_phase
+	else
+		end_phase
+	fi
+
+	wait_baseline "${SCALETEST_PARAM_LOAD_SCENARIO_BASELINE_DURATION}"
+done
 if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 1 ]]; then
 	wait "${pids[@]}"
 	# Wait on all pids will wait until all have exited, but we need to
@@ -348,7 +347,7 @@ if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 1 ]]; then
 		scenario=${pid_to_scenario[${pid}]}
 		if ((status > 0)); then
 			log "Load scenario failed: ${scenario} (exit=${status})"
-			failed+=(["${scenario}"]="$status")
+			failed+=(["${scenario}"]="${status}")
 		fi
 	done
 	if ((${#failed[@]} > 0)); then

From 82b95d4dd4c5177fd91f92209661c6e91395d82e Mon Sep 17 00:00:00 2001
From: Mathias Fredriksson <mafredri@gmail.com>
Date: Tue, 30 Jan 2024 12:17:58 +0200
Subject: [PATCH 16/18] fix typo

---
 scaletest/templates/scaletest-runner/main.tf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scaletest/templates/scaletest-runner/main.tf b/scaletest/templates/scaletest-runner/main.tf
index 094168694336c..558f44a8d9e6a 100644
--- a/scaletest/templates/scaletest-runner/main.tf
+++ b/scaletest/templates/scaletest-runner/main.tf
@@ -12,7 +12,7 @@ terraform {
 }
 
 resource "time_static" "start_time" {
-  # We con't set `count = data.coder_workspace.me.start_count` here because then
+  # We don't set `count = data.coder_workspace.me.start_count` here because then
   # we can't use this value in `locals`, but we want to trigger recreation when
   # the scaletest is restarted.
   triggers = {

From 74812f68e73384d643f3433e2d4439701d3f280c Mon Sep 17 00:00:00 2001
From: Mathias Fredriksson <mafredri@gmail.com>
Date: Tue, 30 Jan 2024 12:52:11 +0200
Subject: [PATCH 17/18] stagger delay mins

---
 scaletest/templates/scaletest-runner/main.tf        | 6 +++---
 scaletest/templates/scaletest-runner/scripts/run.sh | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/scaletest/templates/scaletest-runner/main.tf b/scaletest/templates/scaletest-runner/main.tf
index 558f44a8d9e6a..ef1c7ba814b4b 100644
--- a/scaletest/templates/scaletest-runner/main.tf
+++ b/scaletest/templates/scaletest-runner/main.tf
@@ -269,9 +269,9 @@ data "coder_parameter" "load_scenario_run_concurrently" {
   mutable     = true
 }
 
-data "coder_parameter" "load_scenario_concurrency_staggering" {
+data "coder_parameter" "load_scenario_concurrency_stagger_delay_mins" {
   order       = 25
-  name        = "Load Scenario Concurrency Staggering"
+  name        = "Load Scenario Concurrency Stagger Delay"
   type        = "number"
   default     = 3
   description = "The number of minutes to wait between starting each load scenario when run concurrently."
@@ -596,7 +596,7 @@ resource "coder_agent" "main" {
     SCALETEST_PARAM_CLEANUP_PREPARE : data.coder_parameter.cleanup_prepare.value ? "1" : "0",
     SCALETEST_PARAM_LOAD_SCENARIOS : data.coder_parameter.load_scenarios.value,
     SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY : data.coder_parameter.load_scenario_run_concurrently.value ? "1" : "0",
-    SCALETEST_PARAM_LOAD_SCENARIO_CONCURRENCY_STAGGERING : "${data.coder_parameter.load_scenario_concurrency_staggering.value}",
+    SCALETEST_PARAM_LOAD_SCENARIO_CONCURRENCY_STAGGER_DELAY_MINS : "${data.coder_parameter.load_scenario_concurrency_stagger_delay_mins.value}",
     SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_DURATION : "${data.coder_parameter.load_scenario_ssh_traffic_duration.value}",
     SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_BYTES_PER_TICK : "${data.coder_parameter.load_scenario_ssh_bytes_per_tick.value}",
     SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_TICK_INTERVAL : "${data.coder_parameter.load_scenario_ssh_tick_interval.value}",
diff --git a/scaletest/templates/scaletest-runner/scripts/run.sh b/scaletest/templates/scaletest-runner/scripts/run.sh
index 6ca913abca9cf..47a6042a18598 100755
--- a/scaletest/templates/scaletest-runner/scripts/run.sh
+++ b/scaletest/templates/scaletest-runner/scripts/run.sh
@@ -323,7 +323,7 @@ for scenario in "${SCALETEST_PARAM_LOAD_SCENARIOS[@]}"; do
 		pid_to_scenario+=(["${pids[-1]}"]="${scenario}")
 		# Stagger the start of each scenario to avoid a burst of load and deted
 		# problematic scenarios.
-		sleep $((SCALETEST_PARAM_LOAD_SCENARIO_CONCURRENCY_STAGGERING * 60))
+		sleep $((SCALETEST_PARAM_LOAD_SCENARIO_CONCURRENCY_STAGGER_DELAY_MINS * 60))
 		continue
 	fi
 

From 21bc64b7ab8302f51a94df8ed1df4c96cc8b6c06 Mon Sep 17 00:00:00 2001
From: Mathias Fredriksson <mafredri@gmail.com>
Date: Tue, 30 Jan 2024 13:35:16 +0200
Subject: [PATCH 18/18] add note about prom http port name

---
 scaletest/templates/scaletest-runner/main.tf | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/scaletest/templates/scaletest-runner/main.tf b/scaletest/templates/scaletest-runner/main.tf
index ef1c7ba814b4b..2a6eb8ca21ed5 100644
--- a/scaletest/templates/scaletest-runner/main.tf
+++ b/scaletest/templates/scaletest-runner/main.tf
@@ -947,6 +947,10 @@ resource "kubernetes_manifest" "pod_monitor" {
         }
       }
       podMetricsEndpoints = [
+        # NOTE(mafredri): We could add more information here by including the
+        # scenario name in the port name (although it's limited to 15 chars so
+        # it needs to be short). That said, someone looking at the stats can
+        # assume that there's a 1-to-1 mapping between scenario# and port.
         for i, _ in data.coder_parameter.load_scenario_run_concurrently.value ? jsondecode(data.coder_parameter.load_scenarios.value) : [""] : {
           port     = "prom-http${i}"
           interval = "15s"