From 036aa8d7476314af35c8c629f908c4c75f56e18d Mon Sep 17 00:00:00 2001 From: Mathias Fredriksson Date: Mon, 22 Jan 2024 17:08:56 +0200 Subject: [PATCH 01/18] feat(scaletest/templates): add support for concurrent scenarios --- scaletest/templates/scaletest-runner/main.tf | 168 ++++++++++++++-- .../templates/scaletest-runner/scripts/run.sh | 189 +++++++++++++++--- .../templates/scaletest-runner/startup.sh | 5 + 3 files changed, 317 insertions(+), 45 deletions(-) diff --git a/scaletest/templates/scaletest-runner/main.tf b/scaletest/templates/scaletest-runner/main.tf index b536fc51afbb3..cd1d95cbd27b4 100644 --- a/scaletest/templates/scaletest-runner/main.tf +++ b/scaletest/templates/scaletest-runner/main.tf @@ -237,12 +237,22 @@ data "coder_parameter" "load_scenarios" { default = jsonencode([ "SSH Traffic", "Web Terminal Traffic", + "App Traffic", "Dashboard Traffic", ]) } -data "coder_parameter" "load_scenario_ssh_traffic_duration" { +data "coder_parameter" "load_scenario_run_concurrently" { order = 23 + name = "Run Load Scenarios Concurrently" + type = "bool" + default = false + description = "Run all load scenarios concurrently, this setting enables the load scenario percentages so that they can be assigned a percentage of 1-100%." + mutable = true +} + +data "coder_parameter" "load_scenario_ssh_traffic_duration" { + order = 30 name = "SSH Traffic Duration" type = "number" description = "The duration of the SSH traffic load scenario in minutes." @@ -255,7 +265,7 @@ data "coder_parameter" "load_scenario_ssh_traffic_duration" { } data "coder_parameter" "load_scenario_ssh_bytes_per_tick" { - order = 24 + order = 31 name = "SSH Bytes Per Tick" type = "number" description = "The number of bytes to send per tick in the SSH traffic load scenario." 
@@ -267,7 +277,7 @@ data "coder_parameter" "load_scenario_ssh_bytes_per_tick" { } data "coder_parameter" "load_scenario_ssh_tick_interval" { - order = 25 + order = 32 name = "SSH Tick Interval" type = "number" description = "The number of milliseconds between each tick in the SSH traffic load scenario." @@ -278,8 +288,21 @@ data "coder_parameter" "load_scenario_ssh_tick_interval" { } } +data "coder_parameter" "load_scenario_ssh_traffic_percentage" { + order = 33 + name = "SSH Traffic Percentage" + type = "number" + description = "The percentage of workspaces that should be targeted for SSH traffic." + mutable = true + default = 100 + validation { + min = 1 + max = 100 + } +} + data "coder_parameter" "load_scenario_web_terminal_traffic_duration" { - order = 26 + order = 40 name = "Web Terminal Traffic Duration" type = "number" description = "The duration of the web terminal traffic load scenario in minutes." @@ -292,7 +315,7 @@ data "coder_parameter" "load_scenario_web_terminal_traffic_duration" { } data "coder_parameter" "load_scenario_web_terminal_bytes_per_tick" { - order = 27 + order = 41 name = "Web Terminal Bytes Per Tick" type = "number" description = "The number of bytes to send per tick in the web terminal traffic load scenario." @@ -304,7 +327,7 @@ data "coder_parameter" "load_scenario_web_terminal_bytes_per_tick" { } data "coder_parameter" "load_scenario_web_terminal_tick_interval" { - order = 28 + order = 42 name = "Web Terminal Tick Interval" type = "number" description = "The number of milliseconds between each tick in the web terminal traffic load scenario." @@ -315,8 +338,94 @@ data "coder_parameter" "load_scenario_web_terminal_tick_interval" { } } +data "coder_parameter" "load_scenario_web_terminal_traffic_percentage" { + order = 43 + name = "Web Terminal Traffic Percentage" + type = "number" + description = "The percentage of workspaces that should be targeted for web terminal traffic." 
+ mutable = true + default = 100 + validation { + min = 1 + max = 100 + } +} + +data "coder_parameter" "load_scenario_app_traffic_duration" { + order = 50 + name = "App Traffic Duration" + type = "number" + description = "The duration of the app traffic load scenario in minutes." + mutable = true + default = 30 + validation { + min = 1 + max = 1440 // 24 hours. + } +} + +data "coder_parameter" "load_scenario_app_bytes_per_tick" { + order = 51 + name = "App Bytes Per Tick" + type = "number" + description = "The number of bytes to send per tick in the app traffic load scenario." + mutable = true + default = 1024 + validation { + min = 1 + } +} + +data "coder_parameter" "load_scenario_app_tick_interval" { + order = 52 + name = "App Tick Interval" + type = "number" + description = "The number of milliseconds between each tick in the app traffic load scenario." + mutable = true + default = 100 + validation { + min = 1 + } +} + +data "coder_parameter" "load_scenario_app_traffic_percentage" { + order = 53 + name = "App Traffic Percentage" + type = "number" + description = "The percentage of workspaces that should be targeted for app traffic." + mutable = true + default = 100 + validation { + min = 1 + max = 100 + } +} + +data "coder_parameter" "load_scenario_app_traffic_mode" { + order = 54 + name = "App Traffic Mode" + default = "wsec" + description = "The mode of the app traffic load scenario." + mutable = true + option { + name = "WebSocket Echo" + value = "wsec" + description = "Send traffic to the workspace via the app websocket and read it back." + } + option { + name = "WebSocket Read (Random)" + value = "wsra" + description = "Read traffic from the workspace via the app websocket." + } + option { + name = "WebSocket Write (Discard)" + value = "wsdi" + description = "Send traffic to the workspace via the app websocket." 
+ } +} + data "coder_parameter" "load_scenario_dashboard_traffic_duration" { - order = 29 + order = 60 name = "Dashboard Traffic Duration" type = "number" description = "The duration of the dashboard traffic load scenario in minutes." @@ -328,8 +437,21 @@ data "coder_parameter" "load_scenario_dashboard_traffic_duration" { } } +data "coder_parameter" "load_scenario_dashboard_traffic_percentage" { + order = 61 + name = "Dashboard Traffic Percentage" + type = "number" + description = "The percentage of users that should be targeted for dashboard traffic." + mutable = true + default = 100 + validation { + min = 1 + max = 100 + } +} + data "coder_parameter" "load_scenario_baseline_duration" { - order = 26 + order = 100 name = "Baseline Wait Duration" type = "number" description = "The duration to wait before starting a load scenario in minutes." @@ -342,7 +464,7 @@ data "coder_parameter" "load_scenario_baseline_duration" { } data "coder_parameter" "greedy_agent" { - order = 30 + order = 200 type = "bool" name = "Greedy Agent" default = false @@ -352,7 +474,7 @@ data "coder_parameter" "greedy_agent" { } data "coder_parameter" "greedy_agent_template" { - order = 31 + order = 201 name = "Greedy Agent Template" display_name = "Greedy Agent Template" description = "The template used for the greedy agent workspace (must not be same as workspace template)." @@ -432,6 +554,7 @@ resource "coder_agent" "main" { SCALETEST_RUN_ID : local.scaletest_run_id, SCALETEST_RUN_DIR : local.scaletest_run_dir, SCALETEST_RUN_START_TIME : local.scaletest_run_start_time, + SCALETEST_PROMETHEUS_START_PORT : "21112", # Comment is a scaletest param, but we want to surface it separately from # the rest, so we use a different name. 
@@ -443,13 +566,22 @@ resource "coder_agent" "main" { SCALETEST_PARAM_CREATE_CONCURRENCY : "${data.coder_parameter.create_concurrency.value}", SCALETEST_PARAM_CLEANUP_STRATEGY : data.coder_parameter.cleanup_strategy.value, SCALETEST_PARAM_LOAD_SCENARIOS : data.coder_parameter.load_scenarios.value, + SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY : data.coder_parameter.load_scenario_run_concurrently.value ? "1" : "0", SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_DURATION : "${data.coder_parameter.load_scenario_ssh_traffic_duration.value}", SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_BYTES_PER_TICK : "${data.coder_parameter.load_scenario_ssh_bytes_per_tick.value}", SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_TICK_INTERVAL : "${data.coder_parameter.load_scenario_ssh_tick_interval.value}", + SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_PERCENTAGE : "${data.coder_parameter.load_scenario_ssh_traffic_percentage.value}", SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_DURATION : "${data.coder_parameter.load_scenario_web_terminal_traffic_duration.value}", SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_BYTES_PER_TICK : "${data.coder_parameter.load_scenario_web_terminal_bytes_per_tick.value}", SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_TICK_INTERVAL : "${data.coder_parameter.load_scenario_web_terminal_tick_interval.value}", + SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_PERCENTAGE : "${data.coder_parameter.load_scenario_web_terminal_traffic_percentage.value}", + SCALETEST_PARAM_LOAD_SCENARIO_APP_TRAFFIC_DURATION : "${data.coder_parameter.load_scenario_app_traffic_duration.value}", + SCALETEST_PARAM_LOAD_SCENARIO_APP_TRAFFIC_BYTES_PER_TICK : "${data.coder_parameter.load_scenario_app_bytes_per_tick.value}", + SCALETEST_PARAM_LOAD_SCENARIO_APP_TRAFFIC_TICK_INTERVAL : "${data.coder_parameter.load_scenario_app_tick_interval.value}", + SCALETEST_PARAM_LOAD_SCENARIO_APP_TRAFFIC_PERCENTAGE : "${data.coder_parameter.load_scenario_app_traffic_percentage.value}", + 
SCALETEST_PARAM_LOAD_SCENARIO_APP_TRAFFIC_MODE : data.coder_parameter.load_scenario_app_traffic_mode.value, SCALETEST_PARAM_LOAD_SCENARIO_DASHBOARD_TRAFFIC_DURATION : "${data.coder_parameter.load_scenario_dashboard_traffic_duration.value}", + SCALETEST_PARAM_LOAD_SCENARIO_DASHBOARD_TRAFFIC_PERCENTAGE : "${data.coder_parameter.load_scenario_dashboard_traffic_percentage.value}", SCALETEST_PARAM_LOAD_SCENARIO_BASELINE_DURATION : "${data.coder_parameter.load_scenario_baseline_duration.value}", SCALETEST_PARAM_GREEDY_AGENT : data.coder_parameter.greedy_agent.value ? "1" : "0", SCALETEST_PARAM_GREEDY_AGENT_TEMPLATE : data.coder_parameter.greedy_agent_template.value, @@ -709,10 +841,14 @@ resource "kubernetes_pod" "main" { name = "home" read_only = false } - port { - container_port = 21112 - name = "prometheus-http" - protocol = "TCP" + dynamic "port" { + for_each = data.coder_parameter.load_scenario_run_concurrently.value ? jsondecode(data.coder_parameter.load_scenarios.value) : [""] + iterator = it + content { + container_port = 21112 + it.key + name = "prom-http${it.key}" + protocol = "TCP" + } } } @@ -787,8 +923,8 @@ resource "kubernetes_manifest" "pod_monitor" { } } podMetricsEndpoints = [ - { - port = "prometheus-http" + for i, _ in data.coder_parameter.load_scenario_run_concurrently.value ? 
jsondecode(data.coder_parameter.load_scenarios.value) : [""] : { + port = "prom-http${i}" interval = "15s" } ] diff --git a/scaletest/templates/scaletest-runner/scripts/run.sh b/scaletest/templates/scaletest-runner/scripts/run.sh index 03bafc7cf6a84..c117ca36cf185 100755 --- a/scaletest/templates/scaletest-runner/scripts/run.sh +++ b/scaletest/templates/scaletest-runner/scripts/run.sh @@ -90,15 +90,33 @@ else } fi +if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 1 ]]; then + start_phase "Load scenarios: ${SCALETEST_PARAM_LOAD_SCENARIOS[*]}" +fi + +declare -a pids=() declare -A failed=() +target_start=0 +target_end=-1 for scenario in "${SCALETEST_PARAM_LOAD_SCENARIOS[@]}"; do - start_phase "Load scenario: ${scenario}" + if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 0 ]]; then + start_phase "Load scenario: ${scenario}" + fi set +e status=0 case "${scenario}" in "SSH Traffic") greedy_agent_traffic "${SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_DURATION}" "${scenario}" & + greedy_agent_traffic_pid=$! + + target_count=$(jq -n --argjson percentage "${SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_PERCENTAGE}" --argjson num_workspaces "${SCALETEST_PARAM_NUM_WORKSPACES}" '$percentage / 100 * $num_workspaces | floor') + target_end=$((target_start + target_count)) + if [[ ${target_end} -gt ${SCALETEST_PARAM_NUM_WORKSPACES} ]]; then + log "WARNING: Target count ${target_end} exceeds number of workspaces ${SCALETEST_PARAM_NUM_WORKSPACES}, using ${SCALETEST_PARAM_NUM_WORKSPACES} instead." + target_start=0 + target_end=${target_count} + fi coder exp scaletest workspace-traffic \ --template "${SCALETEST_PARAM_TEMPLATE}" \ --ssh \ @@ -107,17 +125,34 @@ for scenario in "${SCALETEST_PARAM_LOAD_SCENARIOS[@]}"; do --timeout "${SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_DURATION}m" \ --job-timeout "${SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_DURATION}m30s" \ --output json:"${SCALETEST_RESULTS_DIR}/traffic-ssh.json" \ - "${non_greedy_agent_traffic_args[@]}" - status=$? 
- wait + --scaletest-prometheus-address "0.0.0.0:${SCALETEST_PROMETHEUS_START_PORT}" \ + --target-workspaces "${target_start}:${target_end}" \ + "${non_greedy_agent_traffic_args[@]}" & + pids+=($!) + if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 0 ]]; then + wait "${pids[-1]}" + status=$? + show_json "${SCALETEST_RESULTS_DIR}/traffic-ssh.json" + else + SCALETEST_PROMETHEUS_START_PORT=$((SCALETEST_PROMETHEUS_START_PORT + 1)) + fi + wait "${greedy_agent_traffic_pid}" status2=$? if [[ ${status} == 0 ]]; then status=${status2} fi - show_json "${SCALETEST_RESULTS_DIR}/traffic-ssh.json" ;; "Web Terminal Traffic") greedy_agent_traffic "${SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_DURATION}" "${scenario}" & + greedy_agent_traffic_pid=$! + + target_count=$(jq -n --argjson percentage "${SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_PERCENTAGE}" --argjson num_workspaces "${SCALETEST_PARAM_NUM_WORKSPACES}" '$percentage / 100 * $num_workspaces | floor') + target_end=$((target_start + target_count)) + if [[ ${target_end} -gt ${SCALETEST_PARAM_NUM_WORKSPACES} ]]; then + log "WARNING: Target count ${target_end} exceeds number of workspaces ${SCALETEST_PARAM_NUM_WORKSPACES}, using ${SCALETEST_PARAM_NUM_WORKSPACES} instead." + target_start=0 + target_end=${target_count} + fi coder exp scaletest workspace-traffic \ --template "${SCALETEST_PARAM_TEMPLATE}" \ --bytes-per-tick "${SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_BYTES_PER_TICK}" \ @@ -125,37 +160,114 @@ for scenario in "${SCALETEST_PARAM_LOAD_SCENARIOS[@]}"; do --timeout "${SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_DURATION}m" \ --job-timeout "${SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_DURATION}m30s" \ --output json:"${SCALETEST_RESULTS_DIR}/traffic-web-terminal.json" \ - "${non_greedy_agent_traffic_args[@]}" - status=$? 
- wait + --scaletest-prometheus-address "0.0.0.0:${SCALETEST_PROMETHEUS_START_PORT}" \ + --target-workspaces "${target_start}:${target_end}" \ + "${non_greedy_agent_traffic_args[@]}" & + pids+=($!) + if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 0 ]]; then + wait "${pids[-1]}" + status=$? + show_json "${SCALETEST_RESULTS_DIR}/traffic-web-terminal.json" + else + SCALETEST_PROMETHEUS_START_PORT=$((SCALETEST_PROMETHEUS_START_PORT + 1)) + fi + wait "${greedy_agent_traffic_pid}" + status2=$? + if [[ ${status} == 0 ]]; then + status=${status2} + fi + ;; + "App Traffic") + greedy_agent_traffic "${SCALETEST_PARAM_LOAD_SCENARIO_APP_TRAFFIC_DURATION}" "${scenario}" & + greedy_agent_traffic_pid=$! + + target_count=$(jq -n --argjson percentage "${SCALETEST_PARAM_LOAD_SCENARIO_APP_TRAFFIC_PERCENTAGE}" --argjson num_workspaces "${SCALETEST_PARAM_NUM_WORKSPACES}" '$percentage / 100 * $num_workspaces | floor') + target_end=$((target_start + target_count)) + if [[ ${target_end} -gt ${SCALETEST_PARAM_NUM_WORKSPACES} ]]; then + log "WARNING: Target count ${target_end} exceeds number of workspaces ${SCALETEST_PARAM_NUM_WORKSPACES}, using ${SCALETEST_PARAM_NUM_WORKSPACES} instead." + target_start=0 + target_end=${target_count} + fi + coder exp scaletest workspace-traffic \ + --template "${SCALETEST_PARAM_TEMPLATE}" \ + --bytes-per-tick "${SCALETEST_PARAM_LOAD_SCENARIO_APP_TRAFFIC_BYTES_PER_TICK}" \ + --tick-interval "${SCALETEST_PARAM_LOAD_SCENARIO_APP_TRAFFIC_TICK_INTERVAL}ms" \ + --timeout "${SCALETEST_PARAM_LOAD_SCENARIO_APP_TRAFFIC_DURATION}m" \ + --job-timeout "${SCALETEST_PARAM_LOAD_SCENARIO_APP_TRAFFIC_DURATION}m30s" \ + --output json:"${SCALETEST_RESULTS_DIR}/traffic-app.json" \ + --scaletest-prometheus-address "0.0.0.0:${SCALETEST_PROMETHEUS_START_PORT}" \ + --app "${SCALETEST_PARAM_LOAD_SCENARIO_APP_TRAFFIC_MODE}" \ + --target-workspaces "${target_start}:${target_end}" \ + "${non_greedy_agent_traffic_args[@]}" & + pids+=($!) 
+ if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 0 ]]; then + wait "${pids[-1]}" + status=$? + show_json "${SCALETEST_RESULTS_DIR}/traffic-app.json" + else + SCALETEST_PROMETHEUS_START_PORT=$((SCALETEST_PROMETHEUS_START_PORT + 1)) + fi + wait "${greedy_agent_traffic_pid}" status2=$? if [[ ${status} == 0 ]]; then status=${status2} fi - show_json "${SCALETEST_RESULTS_DIR}/traffic-web-terminal.json" ;; "Dashboard Traffic") + target_count=$(jq -n --argjson percentage "${SCALETEST_PARAM_LOAD_SCENARIO_DASHBOARD_TRAFFIC_PERCENTAGE}" --argjson num_workspaces "${SCALETEST_PARAM_NUM_WORKSPACES}" '$percentage / 100 * $num_workspaces | floor') + target_end=$((target_start + target_count)) + if [[ ${target_end} -gt ${SCALETEST_PARAM_NUM_WORKSPACES} ]]; then + log "WARNING: Target count ${target_end} exceeds number of workspaces ${SCALETEST_PARAM_NUM_WORKSPACES}, using ${SCALETEST_PARAM_NUM_WORKSPACES} instead." + target_start=0 + target_end=${target_count} + fi coder exp scaletest dashboard \ --timeout "${SCALETEST_PARAM_LOAD_SCENARIO_DASHBOARD_TRAFFIC_DURATION}m" \ --job-timeout "${SCALETEST_PARAM_LOAD_SCENARIO_DASHBOARD_TRAFFIC_DURATION}m30s" \ --output json:"${SCALETEST_RESULTS_DIR}/traffic-dashboard.json" \ - >"${SCALETEST_RESULTS_DIR}/traffic-dashboard-output.log" - status=$? - show_json "${SCALETEST_RESULTS_DIR}/traffic-dashboard.json" + --scaletest-prometheus-address "0.0.0.0:${SCALETEST_PROMETHEUS_START_PORT}" \ + --target-users "${target_start}:${target_end}" \ + >"${SCALETEST_RESULTS_DIR}/traffic-dashboard-output.log" & + pids+=($!) + if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 0 ]]; then + wait "${pids[-1]}" + status=$? + show_json "${SCALETEST_RESULTS_DIR}/traffic-dashboard.json" + else + SCALETEST_PROMETHEUS_START_PORT=$((SCALETEST_PROMETHEUS_START_PORT + 1)) + fi ;; # Debug scenarios, for testing the runner. "debug:greedy_agent_traffic") - greedy_agent_traffic 10 "${scenario}" + greedy_agent_traffic 10 "${scenario}" & status=$? 
;; "debug:success") - maybedryrun "$DRY_RUN" sleep 10 - status=0 + { + maybedryrun "$DRY_RUN" sleep 10 + true + } & + pids+=($!) + if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 0 ]]; then + wait "${pids[-1]}" + status=$? + else + SCALETEST_PROMETHEUS_START_PORT=$((SCALETEST_PROMETHEUS_START_PORT + 1)) + fi ;; "debug:error") - maybedryrun "$DRY_RUN" sleep 10 - status=1 + { + maybedryrun "$DRY_RUN" sleep 10 + false + } & + pids+=($!) + if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 0 ]]; then + wait "${pids[-1]}" + status=$? + else + SCALETEST_PROMETHEUS_START_PORT=$((SCALETEST_PROMETHEUS_START_PORT + 1)) + fi ;; *) @@ -163,23 +275,42 @@ for scenario in "${SCALETEST_PARAM_LOAD_SCENARIOS[@]}"; do ;; esac set -e + + # Allow targeting to be distributed evenly across workspaces when each + # scenario is run concurrently and all percentages add up to 100. + target_start=${target_end} + + if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 0 ]]; then + if ((status > 0)); then + log "Load scenario failed: ${scenario} (exit=${status})" + failed+=(["${scenario}"]="$status") + PHASE_ADD_TAGS=error end_phase + else + end_phase + fi + + wait_baseline "${SCALETEST_PARAM_LOAD_SCENARIO_BASELINE_DURATION}" + fi +done + +if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 1 ]]; then + wait "${pids[@]}" + status=$? 
if ((status > 0)); then - log "Load scenario failed: ${scenario} (exit=${status})" - failed+=(["${scenario}"]="$status") + log "One or more load scenarios failed: ${SCALETEST_PARAM_LOAD_SCENARIOS[*]} (exit=${status})" PHASE_ADD_TAGS=error end_phase + exit 1 else end_phase fi - - wait_baseline "${SCALETEST_PARAM_LOAD_SCENARIO_BASELINE_DURATION}" -done - -if ((${#failed[@]} > 0)); then - log "Load scenarios failed: ${!failed[*]}" - for scenario in "${!failed[@]}"; do - log " ${scenario}: exit=${failed[$scenario]}" - done - exit 1 +else + if ((${#failed[@]} > 0)); then + log "Load scenarios failed: ${!failed[*]}" + for scenario in "${!failed[@]}"; do + log " ${scenario}: exit=${failed[$scenario]}" + done + exit 1 + fi fi log "Scaletest complete!" diff --git a/scaletest/templates/scaletest-runner/startup.sh b/scaletest/templates/scaletest-runner/startup.sh index 45bf4fb9ebd5c..3e4eb94f41810 100755 --- a/scaletest/templates/scaletest-runner/startup.sh +++ b/scaletest/templates/scaletest-runner/startup.sh @@ -8,6 +8,11 @@ if [[ ${SCALETEST_PARAM_GREEDY_AGENT_TEMPLATE} == "${SCALETEST_PARAM_TEMPLATE}" exit 1 fi +if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 1 ]] && [[ ${SCALETEST_PARAM_GREEDY_AGENT} == 1 ]]; then + echo "ERROR: Load scenario concurrency and greedy agent test cannot be enabled at the same time." >&2 + exit 1 +fi + # Unzip scripts and add to path. # shellcheck disable=SC2153 echo "Extracting scaletest scripts into ${SCRIPTS_DIR}..." 
From 17605a776d7bf03e9fa00fa90278a6d4f988a17a Mon Sep 17 00:00:00 2001 From: Mathias Fredriksson Date: Mon, 22 Jan 2024 17:15:06 +0200 Subject: [PATCH 02/18] bump resources to the max --- scaletest/templates/scaletest-runner/main.tf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scaletest/templates/scaletest-runner/main.tf b/scaletest/templates/scaletest-runner/main.tf index cd1d95cbd27b4..7bb155312084d 100644 --- a/scaletest/templates/scaletest-runner/main.tf +++ b/scaletest/templates/scaletest-runner/main.tf @@ -39,8 +39,8 @@ locals { workspace_pod_instance = "coder-workspace-${lower(data.coder_workspace.me.owner)}-${lower(data.coder_workspace.me.name)}" workspace_pod_termination_grace_period_seconds = 5 * 60 * 60 # 5 hours (cleanup timeout). service_account_name = "scaletest-sa" - cpu = 16 - memory = 64 + cpu = 32 + memory = 256 home_disk_size = 10 scaletest_run_id = "scaletest-${replace(time_static.start_time.rfc3339, ":", "-")}" scaletest_run_dir = "/home/coder/${local.scaletest_run_id}" From 806c73f9dee79601fbe5517b770402bbe2d2fc5e Mon Sep 17 00:00:00 2001 From: Mathias Fredriksson Date: Mon, 22 Jan 2024 17:16:32 +0200 Subject: [PATCH 03/18] remove limits entirely --- scaletest/templates/scaletest-runner/main.tf | 8 -------- 1 file changed, 8 deletions(-) diff --git a/scaletest/templates/scaletest-runner/main.tf b/scaletest/templates/scaletest-runner/main.tf index 7bb155312084d..82b4e8e15f955 100644 --- a/scaletest/templates/scaletest-runner/main.tf +++ b/scaletest/templates/scaletest-runner/main.tf @@ -39,8 +39,6 @@ locals { workspace_pod_instance = "coder-workspace-${lower(data.coder_workspace.me.owner)}-${lower(data.coder_workspace.me.name)}" workspace_pod_termination_grace_period_seconds = 5 * 60 * 60 # 5 hours (cleanup timeout). 
service_account_name = "scaletest-sa" - cpu = 32 - memory = 256 home_disk_size = 10 scaletest_run_id = "scaletest-${replace(time_static.start_time.rfc3339, ":", "-")}" scaletest_run_dir = "/home/coder/${local.scaletest_run_id}" @@ -825,16 +823,10 @@ resource "kubernetes_pod" "main" { } } resources { - # Set requests and limits values such that we can do performant - # execution of `coder scaletest` commands. requests = { "cpu" = "250m" "memory" = "512Mi" } - limits = { - "cpu" = "${local.cpu}" - "memory" = "${local.memory}Gi" - } } volume_mount { mount_path = "/home/coder" From 96100c0b2497bbf52526893176cb3905bf0c5b69 Mon Sep 17 00:00:00 2001 From: Mathias Fredriksson Date: Tue, 23 Jan 2024 16:42:46 +0200 Subject: [PATCH 04/18] allow more flexibility in cleanup --- scaletest/templates/scaletest-runner/main.tf | 11 +++++++++++ .../scaletest-runner/scripts/cleanup.sh | 17 ++++++++++------- .../scaletest-runner/scripts/prepare.sh | 6 ++++-- .../templates/scaletest-runner/shutdown.sh | 6 +++++- 4 files changed, 30 insertions(+), 10 deletions(-) diff --git a/scaletest/templates/scaletest-runner/main.tf b/scaletest/templates/scaletest-runner/main.tf index 82b4e8e15f955..5a97465dcb102 100644 --- a/scaletest/templates/scaletest-runner/main.tf +++ b/scaletest/templates/scaletest-runner/main.tf @@ -169,6 +169,16 @@ data "coder_parameter" "cleanup_strategy" { } } +data "coder_parameter" "cleanup_prepare" { + order = 14 + type = "bool" + name = "Cleanup before scaletest" + default = true + description = "Cleanup existing scaletest users and workspaces before the scaletest starts (prepare phase)." 
+ mutable = true + ephemeral = true +} + data "coder_parameter" "workspace_template" { order = 20 @@ -563,6 +573,7 @@ resource "coder_agent" "main" { SCALETEST_PARAM_NUM_WORKSPACES : data.coder_parameter.num_workspaces.value, SCALETEST_PARAM_CREATE_CONCURRENCY : "${data.coder_parameter.create_concurrency.value}", SCALETEST_PARAM_CLEANUP_STRATEGY : data.coder_parameter.cleanup_strategy.value, + SCALETEST_PARAM_CLEANUP_PREPARE : data.coder_parameter.cleanup_prepare.value ? "1" : "0", SCALETEST_PARAM_LOAD_SCENARIOS : data.coder_parameter.load_scenarios.value, SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY : data.coder_parameter.load_scenario_run_concurrently.value ? "1" : "0", SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_DURATION : "${data.coder_parameter.load_scenario_ssh_traffic_duration.value}", diff --git a/scaletest/templates/scaletest-runner/scripts/cleanup.sh b/scaletest/templates/scaletest-runner/scripts/cleanup.sh index ecdd086d9a4e0..b65d261308788 100755 --- a/scaletest/templates/scaletest-runner/scripts/cleanup.sh +++ b/scaletest/templates/scaletest-runner/scripts/cleanup.sh @@ -21,17 +21,20 @@ if [[ $event = manual ]]; then fi fi -start_phase "Cleanup (${event})" -coder exp scaletest cleanup \ - --cleanup-job-timeout 2h \ - --cleanup-timeout 5h | - tee "${SCALETEST_RESULTS_DIR}/cleanup-${event}.txt" -end_phase +if [[ $event != shutdown_scale_down_only ]]; then + start_phase "Cleanup (${event})" + coder exp scaletest cleanup \ + --cleanup-job-timeout 2h \ + --cleanup-timeout 5h \ + | tee "${SCALETEST_RESULTS_DIR}/cleanup-${event}.txt" + end_phase +fi if [[ $event != prepare ]]; then - start_phase "Scaling down provisioners..." 
+ start_phase "Scale down provisioners" maybedryrun "$DRY_RUN" kubectl scale deployment/coder-provisioner --replicas 1 maybedryrun "$DRY_RUN" kubectl rollout status deployment/coder-provisioner + end_phase fi if [[ $event = manual ]]; then diff --git a/scaletest/templates/scaletest-runner/scripts/prepare.sh b/scaletest/templates/scaletest-runner/scripts/prepare.sh index e7e6c4d2a292a..90b2dd05f945f 100755 --- a/scaletest/templates/scaletest-runner/scripts/prepare.sh +++ b/scaletest/templates/scaletest-runner/scripts/prepare.sh @@ -47,8 +47,10 @@ unset CODER_SESSION_TOKEN echo -n "${token}" >"${CODER_CONFIG_DIR}/session" [[ $VERBOSE == 1 ]] && set -x # Restore logging (if enabled). -log "Cleaning up from previous runs (if applicable)..." -"${SCRIPTS_DIR}/cleanup.sh" "prepare" +if [[ ${SCALETEST_PARAM_CLEANUP_PREPARE} == 1 ]]; then + log "Cleaning up from previous runs (if applicable)..." + "${SCRIPTS_DIR}/cleanup.sh" prepare +fi log "Preparation complete!" diff --git a/scaletest/templates/scaletest-runner/shutdown.sh b/scaletest/templates/scaletest-runner/shutdown.sh index d5c81366b1217..9e75864d73120 100755 --- a/scaletest/templates/scaletest-runner/shutdown.sh +++ b/scaletest/templates/scaletest-runner/shutdown.sh @@ -14,7 +14,11 @@ trap cleanup EXIT annotate_grafana "workspace" "Agent stopping..." 
-"${SCRIPTS_DIR}/cleanup.sh" shutdown +shutdown_event=shutdown_scale_down_only +if [[ ${SCALETEST_PARAM_CLEANUP_STRATEGY} == on_stop ]]; then + shutdown_event=shutdown +fi +"${SCRIPTS_DIR}/cleanup.sh" "${shutdown_event}" annotate_grafana_end "workspace" "Agent running" From 569cfc174055ebfbc42ed4fc080e5abf2c56fdf1 Mon Sep 17 00:00:00 2001 From: Mathias Fredriksson Date: Wed, 24 Jan 2024 13:05:30 +0200 Subject: [PATCH 05/18] allow staggered start --- scaletest/templates/scaletest-runner/main.tf | 10 ++++++ .../templates/scaletest-runner/scripts/run.sh | 34 ++++++++++++++++--- 2 files changed, 40 insertions(+), 4 deletions(-) diff --git a/scaletest/templates/scaletest-runner/main.tf b/scaletest/templates/scaletest-runner/main.tf index 5a97465dcb102..4ecf73b77616c 100644 --- a/scaletest/templates/scaletest-runner/main.tf +++ b/scaletest/templates/scaletest-runner/main.tf @@ -259,6 +259,15 @@ data "coder_parameter" "load_scenario_run_concurrently" { mutable = true } +data "coder_parameter" "load_scenario_concurrency_staggering" { + order = 23 + name = "Load Scenario Concurrency Staggering" + type = "number" + default = 3 + description = "The number of minutes to wait between starting each load scenario when run concurrently." + mutable = true +} + data "coder_parameter" "load_scenario_ssh_traffic_duration" { order = 30 name = "SSH Traffic Duration" @@ -576,6 +585,7 @@ resource "coder_agent" "main" { SCALETEST_PARAM_CLEANUP_PREPARE : data.coder_parameter.cleanup_prepare.value ? "1" : "0", SCALETEST_PARAM_LOAD_SCENARIOS : data.coder_parameter.load_scenarios.value, SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY : data.coder_parameter.load_scenario_run_concurrently.value ? 
"1" : "0", + SCALETEST_PARAM_LOAD_SCENARIO_CONCURRENCY_STAGGERING : "${data.coder_parameter.load_scenario_concurrency_staggering.value}", SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_DURATION : "${data.coder_parameter.load_scenario_ssh_traffic_duration.value}", SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_BYTES_PER_TICK : "${data.coder_parameter.load_scenario_ssh_bytes_per_tick.value}", SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_TICK_INTERVAL : "${data.coder_parameter.load_scenario_ssh_tick_interval.value}", diff --git a/scaletest/templates/scaletest-runner/scripts/run.sh b/scaletest/templates/scaletest-runner/scripts/run.sh index c117ca36cf185..fd9dbcd84e9e3 100755 --- a/scaletest/templates/scaletest-runner/scripts/run.sh +++ b/scaletest/templates/scaletest-runner/scripts/run.sh @@ -94,10 +94,32 @@ if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 1 ]]; then start_phase "Load scenarios: ${SCALETEST_PARAM_LOAD_SCENARIOS[*]}" fi +run_scenario_cmd() { + local scenario=${1} + shift + local command=("$@") + + set +e + if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 1 ]]; then + annotate_grafana scenario "Load scenario: ${scenario}" + fi + "${command[@]}" + status=${?} + if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 1 ]]; then + export GRAFANA_ADD_TAGS= + if [[ ${status} != 0 ]]; then + GRAFANA_ADD_TAGS=error + fi + annotate_grafana_end scenario "Load scenario: ${scenario}" + fi + exit "${status}" +} + declare -a pids=() declare -A failed=() target_start=0 target_end=-1 + for scenario in "${SCALETEST_PARAM_LOAD_SCENARIOS[@]}"; do if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 0 ]]; then start_phase "Load scenario: ${scenario}" @@ -117,7 +139,7 @@ for scenario in "${SCALETEST_PARAM_LOAD_SCENARIOS[@]}"; do target_start=0 target_end=${target_count} fi - coder exp scaletest workspace-traffic \ + run_scenario_cmd "${scenario}" coder exp scaletest workspace-traffic \ --template "${SCALETEST_PARAM_TEMPLATE}" \ --ssh \ --bytes-per-tick 
"${SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_BYTES_PER_TICK}" \ @@ -153,7 +175,7 @@ for scenario in "${SCALETEST_PARAM_LOAD_SCENARIOS[@]}"; do target_start=0 target_end=${target_count} fi - coder exp scaletest workspace-traffic \ + run_scenario_cmd "${scenario}" coder exp scaletest workspace-traffic \ --template "${SCALETEST_PARAM_TEMPLATE}" \ --bytes-per-tick "${SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_BYTES_PER_TICK}" \ --tick-interval "${SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_TICK_INTERVAL}ms" \ @@ -188,7 +210,7 @@ for scenario in "${SCALETEST_PARAM_LOAD_SCENARIOS[@]}"; do target_start=0 target_end=${target_count} fi - coder exp scaletest workspace-traffic \ + run_scenario_cmd "${scenario}" coder exp scaletest workspace-traffic \ --template "${SCALETEST_PARAM_TEMPLATE}" \ --bytes-per-tick "${SCALETEST_PARAM_LOAD_SCENARIO_APP_TRAFFIC_BYTES_PER_TICK}" \ --tick-interval "${SCALETEST_PARAM_LOAD_SCENARIO_APP_TRAFFIC_TICK_INTERVAL}ms" \ @@ -221,7 +243,7 @@ for scenario in "${SCALETEST_PARAM_LOAD_SCENARIOS[@]}"; do target_start=0 target_end=${target_count} fi - coder exp scaletest dashboard \ + run_scenario_cmd "${scenario}" coder exp scaletest dashboard \ --timeout "${SCALETEST_PARAM_LOAD_SCENARIO_DASHBOARD_TRAFFIC_DURATION}m" \ --job-timeout "${SCALETEST_PARAM_LOAD_SCENARIO_DASHBOARD_TRAFFIC_DURATION}m30s" \ --output json:"${SCALETEST_RESULTS_DIR}/traffic-dashboard.json" \ @@ -290,6 +312,10 @@ for scenario in "${SCALETEST_PARAM_LOAD_SCENARIOS[@]}"; do fi wait_baseline "${SCALETEST_PARAM_LOAD_SCENARIO_BASELINE_DURATION}" + else + # Stagger the start of each scenario to avoid a burst of load and detect + # problematic scenarios. 
+ sleep $((SCALETEST_PARAM_LOAD_SCENARIO_CONCURRENCY_STAGGERING * 60)) fi done From d0fbd2ce98b631104c5d6801cc5c8a7d31f7c701 Mon Sep 17 00:00:00 2001 From: Mathias Fredriksson Date: Wed, 24 Jan 2024 15:25:28 +0200 Subject: [PATCH 06/18] improve cleanup --- .../scaletest-runner/scripts/cleanup.sh | 53 ++++++++++++------- 1 file changed, 35 insertions(+), 18 deletions(-) diff --git a/scaletest/templates/scaletest-runner/scripts/cleanup.sh b/scaletest/templates/scaletest-runner/scripts/cleanup.sh index b65d261308788..1dba743a0e516 100755 --- a/scaletest/templates/scaletest-runner/scripts/cleanup.sh +++ b/scaletest/templates/scaletest-runner/scripts/cleanup.sh @@ -12,32 +12,49 @@ if [[ -z $event ]]; then event=manual fi -if [[ $event = manual ]]; then - echo -n 'WARNING: This will clean up all scaletest resources, continue? (y/n) ' - read -r -n 1 - if [[ $REPLY != [yY] ]]; then - echo $'\nAborting...' - exit 1 - fi -fi - -if [[ $event != shutdown_scale_down_only ]]; then +do_cleanup() { start_phase "Cleanup (${event})" coder exp scaletest cleanup \ --cleanup-job-timeout 2h \ - --cleanup-timeout 5h \ - | tee "${SCALETEST_RESULTS_DIR}/cleanup-${event}.txt" + --cleanup-timeout 5h | + tee "${SCALETEST_RESULTS_DIR}/cleanup-${event}.txt" end_phase -fi +} -if [[ $event != prepare ]]; then - start_phase "Scale down provisioners" +do_scaledown() { + start_phase "Scale down provisioners (${event})" maybedryrun "$DRY_RUN" kubectl scale deployment/coder-provisioner --replicas 1 maybedryrun "$DRY_RUN" kubectl rollout status deployment/coder-provisioner end_phase -fi +} + +case "${event}" in +manual) + echo -n 'WARNING: This will clean up all scaletest resources, continue? (y/n) ' + read -r -n 1 + if [[ $REPLY != [yY] ]]; then + echo $'\nAborting...' + exit 1 + fi + + do_cleanup + do_scaledown -if [[ $event = manual ]]; then echo 'Press any key to continue...' 
read -s -r -n 1 -fi + ;; +prepare) + do_cleanup + ;; +shutdown) + do_cleanup + do_scaledown + ;; +shutdown_scale_down_only) + do_scaledown + ;; +*) + echo "Unknown event: ${event}" >&2 + exit 1 + ;; +esac From 3141efe89475a88d77717a78887e0234aba35819 Mon Sep 17 00:00:00 2001 From: Mathias Fredriksson Date: Wed, 24 Jan 2024 16:05:50 +0200 Subject: [PATCH 07/18] fix dashboard screengrabs location --- scaletest/templates/scaletest-runner/scripts/run.sh | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/scaletest/templates/scaletest-runner/scripts/run.sh b/scaletest/templates/scaletest-runner/scripts/run.sh index fd9dbcd84e9e3..23ad5c500343c 100755 --- a/scaletest/templates/scaletest-runner/scripts/run.sh +++ b/scaletest/templates/scaletest-runner/scripts/run.sh @@ -243,6 +243,10 @@ for scenario in "${SCALETEST_PARAM_LOAD_SCENARIOS[@]}"; do target_start=0 target_end=${target_count} fi + # TODO: Remove this once the dashboard traffic command is fixed, + # (i.e. once images are no longer dumped into PWD). + mkdir -p dashboard + pushd dashboard run_scenario_cmd "${scenario}" coder exp scaletest dashboard \ --timeout "${SCALETEST_PARAM_LOAD_SCENARIO_DASHBOARD_TRAFFIC_DURATION}m" \ --job-timeout "${SCALETEST_PARAM_LOAD_SCENARIO_DASHBOARD_TRAFFIC_DURATION}m30s" \ @@ -251,6 +255,7 @@ for scenario in "${SCALETEST_PARAM_LOAD_SCENARIOS[@]}"; do --target-users "${target_start}:${target_end}" \ >"${SCALETEST_RESULTS_DIR}/traffic-dashboard-output.log" & pids+=($!) + popd if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 0 ]]; then wait "${pids[-1]}" status=$? 
From f58bf411b0b16b5d15c16801487c69705848528d Mon Sep 17 00:00:00 2001 From: Mathias Fredriksson Date: Wed, 24 Jan 2024 16:59:47 +0200 Subject: [PATCH 08/18] add more cleanup cases --- scaletest/templates/scaletest-runner/scripts/cleanup.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scaletest/templates/scaletest-runner/scripts/cleanup.sh b/scaletest/templates/scaletest-runner/scripts/cleanup.sh index 1dba743a0e516..343f360189969 100755 --- a/scaletest/templates/scaletest-runner/scripts/cleanup.sh +++ b/scaletest/templates/scaletest-runner/scripts/cleanup.sh @@ -46,7 +46,8 @@ manual) prepare) do_cleanup ;; -shutdown) +on_stop) ;; # Do nothing, handled by "shutdown". +always | on_success | on_error | shutdown) do_cleanup do_scaledown ;; From 4754f2f0959efeb5e76e1b728401ea9860cac555 Mon Sep 17 00:00:00 2001 From: Mathias Fredriksson Date: Wed, 24 Jan 2024 17:02:59 +0200 Subject: [PATCH 09/18] fix updating of static time --- scaletest/templates/scaletest-runner/main.tf | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/scaletest/templates/scaletest-runner/main.tf b/scaletest/templates/scaletest-runner/main.tf index 4ecf73b77616c..6b374d86f8b38 100644 --- a/scaletest/templates/scaletest-runner/main.tf +++ b/scaletest/templates/scaletest-runner/main.tf @@ -13,10 +13,11 @@ terraform { resource "time_static" "start_time" { # We con't set `count = data.coder_workspace.me.start_count` here because then - # we can't use this value in `locals`. The permission check is recreated on - # start, which will update the timestamp. + # we can't use this value in `locals`, but we want to trigger recreation when + # the scaletest is restarted. triggers = { - count : length(null_resource.permission_check) + count : data.coder_workspace.me.start_count + id : data.coder_workspace.me.start_count > 0 ? 
data.coder_workspace.me.id : "" } } From 1236b9390998247cc7d70863307a0cfa786bd75b Mon Sep 17 00:00:00 2001 From: Mathias Fredriksson Date: Wed, 24 Jan 2024 19:18:25 +0200 Subject: [PATCH 10/18] echo --- scaletest/templates/scaletest-runner/scripts/cleanup.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/scaletest/templates/scaletest-runner/scripts/cleanup.sh b/scaletest/templates/scaletest-runner/scripts/cleanup.sh index 343f360189969..c80982497b5e9 100755 --- a/scaletest/templates/scaletest-runner/scripts/cleanup.sh +++ b/scaletest/templates/scaletest-runner/scripts/cleanup.sh @@ -36,6 +36,7 @@ manual) echo $'\nAborting...' exit 1 fi + echo do_cleanup do_scaledown From 724e1c5354c8b0c2e2a9c3b9e5c3f99d236c917f Mon Sep 17 00:00:00 2001 From: Mathias Fredriksson Date: Thu, 25 Jan 2024 14:47:33 +0200 Subject: [PATCH 11/18] add skip create workspaces option --- scaletest/templates/scaletest-runner/main.tf | 16 ++++++++++++--- .../templates/scaletest-runner/scripts/run.sh | 20 ++++++++++--------- 2 files changed, 24 insertions(+), 12 deletions(-) diff --git a/scaletest/templates/scaletest-runner/main.tf b/scaletest/templates/scaletest-runner/main.tf index 6b374d86f8b38..a143c4499fdca 100644 --- a/scaletest/templates/scaletest-runner/main.tf +++ b/scaletest/templates/scaletest-runner/main.tf @@ -235,9 +235,18 @@ data "coder_parameter" "num_workspaces" { } } +data "coder_parameter" "skip_create_workspaces" { + order = 22 + type = "bool" + name = "DEBUG: Skip creating workspaces" + default = false + description = "Skip creating workspaces (for resuming failed scaletests or debugging)" + mutable = true +} + data "coder_parameter" "load_scenarios" { - order = 22 + order = 23 name = "Load Scenarios" type = "list(string)" description = "The load scenarios to run." 
@@ -252,7 +261,7 @@ data "coder_parameter" "load_scenarios" { } data "coder_parameter" "load_scenario_run_concurrently" { - order = 23 + order = 24 name = "Run Load Scenarios Concurrently" type = "bool" default = false @@ -261,7 +270,7 @@ data "coder_parameter" "load_scenario_run_concurrently" { } data "coder_parameter" "load_scenario_concurrency_staggering" { - order = 23 + order = 25 name = "Load Scenario Concurrency Staggering" type = "number" default = 3 @@ -581,6 +590,7 @@ resource "coder_agent" "main" { SCALETEST_PARAM_TEMPLATE : data.coder_parameter.workspace_template.value, SCALETEST_PARAM_REPO_BRANCH : data.coder_parameter.repo_branch.value, SCALETEST_PARAM_NUM_WORKSPACES : data.coder_parameter.num_workspaces.value, + SCALETEST_PARAM_SKIP_CREATE_WORKSPACES : data.coder_parameter.skip_create_workspaces.value ? "1" : "0", SCALETEST_PARAM_CREATE_CONCURRENCY : "${data.coder_parameter.create_concurrency.value}", SCALETEST_PARAM_CLEANUP_STRATEGY : data.coder_parameter.cleanup_strategy.value, SCALETEST_PARAM_CLEANUP_PREPARE : data.coder_parameter.cleanup_prepare.value ? "1" : "0", diff --git a/scaletest/templates/scaletest-runner/scripts/run.sh b/scaletest/templates/scaletest-runner/scripts/run.sh index 23ad5c500343c..50cab189fae62 100755 --- a/scaletest/templates/scaletest-runner/scripts/run.sh +++ b/scaletest/templates/scaletest-runner/scripts/run.sh @@ -13,15 +13,17 @@ log "Running scaletest..." 
set_status Running start_phase "Creating workspaces" -coder exp scaletest create-workspaces \ - --count "${SCALETEST_PARAM_NUM_WORKSPACES}" \ - --template "${SCALETEST_PARAM_TEMPLATE}" \ - --concurrency "${SCALETEST_PARAM_CREATE_CONCURRENCY}" \ - --timeout 5h \ - --job-timeout 5h \ - --no-cleanup \ - --output json:"${SCALETEST_RESULTS_DIR}/create-workspaces.json" -show_json "${SCALETEST_RESULTS_DIR}/create-workspaces.json" +if [[ ${SCALETEST_PARAM_SKIP_CREATE_WORKSPACES} == 0 ]]; then + coder exp scaletest create-workspaces \ + --count "${SCALETEST_PARAM_NUM_WORKSPACES}" \ + --template "${SCALETEST_PARAM_TEMPLATE}" \ + --concurrency "${SCALETEST_PARAM_CREATE_CONCURRENCY}" \ + --timeout 5h \ + --job-timeout 5h \ + --no-cleanup \ + --output json:"${SCALETEST_RESULTS_DIR}/create-workspaces.json" + show_json "${SCALETEST_RESULTS_DIR}/create-workspaces.json" +fi end_phase wait_baseline "${SCALETEST_PARAM_LOAD_SCENARIO_BASELINE_DURATION}" From 916c74d1076a744a81eb1600c2d78edd6e572b42 Mon Sep 17 00:00:00 2001 From: Mathias Fredriksson Date: Thu, 25 Jan 2024 16:24:55 +0200 Subject: [PATCH 12/18] fix start time regen --- scaletest/templates/scaletest-runner/main.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scaletest/templates/scaletest-runner/main.tf b/scaletest/templates/scaletest-runner/main.tf index a143c4499fdca..094168694336c 100644 --- a/scaletest/templates/scaletest-runner/main.tf +++ b/scaletest/templates/scaletest-runner/main.tf @@ -17,7 +17,7 @@ resource "time_static" "start_time" { # the scaletest is restarted. triggers = { count : data.coder_workspace.me.start_count - id : data.coder_workspace.me.start_count > 0 ? data.coder_workspace.me.id : "" + token : data.coder_workspace.me.owner_session_token # Rely on this being re-generated every start. 
} } From aeb9c529107271292dbe836ff84fb64084149456 Mon Sep 17 00:00:00 2001 From: Mathias Fredriksson Date: Mon, 29 Jan 2024 19:13:33 +0200 Subject: [PATCH 13/18] add retry --- scaletest/templates/scaletest-runner/scripts/run.sh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/scaletest/templates/scaletest-runner/scripts/run.sh b/scaletest/templates/scaletest-runner/scripts/run.sh index 50cab189fae62..49004a7ff4e58 100755 --- a/scaletest/templates/scaletest-runner/scripts/run.sh +++ b/scaletest/templates/scaletest-runner/scripts/run.sh @@ -14,7 +14,11 @@ set_status Running start_phase "Creating workspaces" if [[ ${SCALETEST_PARAM_SKIP_CREATE_WORKSPACES} == 0 ]]; then + # Note that we allow up to 5 failures to bring up the workspace, since + # we're creating a lot of workspaces at once and some of them may fail + # due to network issues or other transient errors. coder exp scaletest create-workspaces \ + --retry 5 \ --count "${SCALETEST_PARAM_NUM_WORKSPACES}" \ --template "${SCALETEST_PARAM_TEMPLATE}" \ --concurrency "${SCALETEST_PARAM_CREATE_CONCURRENCY}" \ From 60df607aef32360c74df434c5f575026091974ae Mon Sep 17 00:00:00 2001 From: Mathias Fredriksson Date: Mon, 29 Jan 2024 19:28:37 +0200 Subject: [PATCH 14/18] fix undetected error exit code --- .../templates/scaletest-runner/scripts/run.sh | 42 +++++++++++++------ 1 file changed, 29 insertions(+), 13 deletions(-) diff --git a/scaletest/templates/scaletest-runner/scripts/run.sh b/scaletest/templates/scaletest-runner/scripts/run.sh index 49004a7ff4e58..d61cea04b87d3 100755 --- a/scaletest/templates/scaletest-runner/scripts/run.sh +++ b/scaletest/templates/scaletest-runner/scripts/run.sh @@ -122,6 +122,7 @@ run_scenario_cmd() { } declare -a pids=() +declare -A pid_to_scenario=() declare -A failed=() target_start=0 target_end=-1 @@ -274,7 +275,13 @@ for scenario in "${SCALETEST_PARAM_LOAD_SCENARIOS[@]}"; do # Debug scenarios, for testing the runner. 
"debug:greedy_agent_traffic") greedy_agent_traffic 10 "${scenario}" & - status=$? + pids+=($!) + if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 0 ]]; then + wait "${pids[-1]}" + status=$? + else + SCALETEST_PROMETHEUS_START_PORT=$((SCALETEST_PROMETHEUS_START_PORT + 1)) + fi ;; "debug:success") { @@ -324,6 +331,7 @@ for scenario in "${SCALETEST_PARAM_LOAD_SCENARIOS[@]}"; do wait_baseline "${SCALETEST_PARAM_LOAD_SCENARIO_BASELINE_DURATION}" else + pid_to_scenario+=(["${pids[-1]}"]="${scenario}") # Stagger the start of each scenario to avoid a burst of load and deted # problematic scenarios. sleep $((SCALETEST_PARAM_LOAD_SCENARIO_CONCURRENCY_STAGGERING * 60)) @@ -332,22 +340,30 @@ done if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 1 ]]; then wait "${pids[@]}" - status=$? - if ((status > 0)); then - log "One or more load scenarios failed: ${SCALETEST_PARAM_LOAD_SCENARIOS[*]} (exit=${status})" + # Wait on all pids will wait until all have exited, but we need to + # check their individual exit codes. + for pid in "${pids[@]}"; do + wait "${pid}" + status=${?} + scenario=${pid_to_scenario[${pid}]} + if ((status > 0)); then + log "Load scenario failed: ${scenario} (exit=${status})" + failed+=(["${scenario}"]="$status") + fi + done + if ((${#failed[@]} > 0)); then PHASE_ADD_TAGS=error end_phase - exit 1 else end_phase fi -else - if ((${#failed[@]} > 0)); then - log "Load scenarios failed: ${!failed[*]}" - for scenario in "${!failed[@]}"; do - log " ${scenario}: exit=${failed[$scenario]}" - done - exit 1 - fi +fi + +if ((${#failed[@]} > 0)); then + log "Load scenarios failed: ${!failed[*]}" + for scenario in "${!failed[@]}"; do + log " ${scenario}: exit=${failed[$scenario]}" + done + exit 1 fi log "Scaletest complete!" 
From 23d936e2927e371e65fcebb3f0ddb52c6cbdf35d Mon Sep 17 00:00:00 2001 From: Mathias Fredriksson Date: Tue, 30 Jan 2024 12:14:14 +0200 Subject: [PATCH 15/18] minor cleanup --- .../templates/scaletest-runner/scripts/run.sh | 35 +++++++++---------- 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/scaletest/templates/scaletest-runner/scripts/run.sh b/scaletest/templates/scaletest-runner/scripts/run.sh index d61cea04b87d3..6ca913abca9cf 100755 --- a/scaletest/templates/scaletest-runner/scripts/run.sh +++ b/scaletest/templates/scaletest-runner/scripts/run.sh @@ -92,14 +92,10 @@ else fi annotate_grafana_end greedy_agent "${scenario}: Greedy agent traffic" - return ${status} + return "${status}" } fi -if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 1 ]]; then - start_phase "Load scenarios: ${SCALETEST_PARAM_LOAD_SCENARIOS[*]}" -fi - run_scenario_cmd() { local scenario=${1} shift @@ -127,6 +123,9 @@ declare -A failed=() target_start=0 target_end=-1 +if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 1 ]]; then + start_phase "Load scenarios: ${SCALETEST_PARAM_LOAD_SCENARIOS[*]}" +fi for scenario in "${SCALETEST_PARAM_LOAD_SCENARIOS[@]}"; do if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 0 ]]; then start_phase "Load scenario: ${scenario}" @@ -320,24 +319,24 @@ for scenario in "${SCALETEST_PARAM_LOAD_SCENARIOS[@]}"; do # scenario is run concurrently and all percentages add up to 100. 
target_start=${target_end} - if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 0 ]]; then - if ((status > 0)); then - log "Load scenario failed: ${scenario} (exit=${status})" - failed+=(["${scenario}"]="$status") - PHASE_ADD_TAGS=error end_phase - else - end_phase - fi - - wait_baseline "${SCALETEST_PARAM_LOAD_SCENARIO_BASELINE_DURATION}" - else + if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 1 ]]; then pid_to_scenario+=(["${pids[-1]}"]="${scenario}") # Stagger the start of each scenario to avoid a burst of load and deted # problematic scenarios. sleep $((SCALETEST_PARAM_LOAD_SCENARIO_CONCURRENCY_STAGGERING * 60)) + continue fi -done + if ((status > 0)); then + log "Load scenario failed: ${scenario} (exit=${status})" + failed+=(["${scenario}"]="${status}") + PHASE_ADD_TAGS=error end_phase + else + end_phase + fi + + wait_baseline "${SCALETEST_PARAM_LOAD_SCENARIO_BASELINE_DURATION}" +done if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 1 ]]; then wait "${pids[@]}" # Wait on all pids will wait until all have exited, but we need to @@ -348,7 +347,7 @@ if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 1 ]]; then scenario=${pid_to_scenario[${pid}]} if ((status > 0)); then log "Load scenario failed: ${scenario} (exit=${status})" - failed+=(["${scenario}"]="$status") + failed+=(["${scenario}"]="${status}") fi done if ((${#failed[@]} > 0)); then From 82b95d4dd4c5177fd91f92209661c6e91395d82e Mon Sep 17 00:00:00 2001 From: Mathias Fredriksson Date: Tue, 30 Jan 2024 12:17:58 +0200 Subject: [PATCH 16/18] fix typo --- scaletest/templates/scaletest-runner/main.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scaletest/templates/scaletest-runner/main.tf b/scaletest/templates/scaletest-runner/main.tf index 094168694336c..558f44a8d9e6a 100644 --- a/scaletest/templates/scaletest-runner/main.tf +++ b/scaletest/templates/scaletest-runner/main.tf @@ -12,7 +12,7 @@ terraform { } resource "time_static" "start_time" { - # We 
con't set `count = data.coder_workspace.me.start_count` here because then + # We don't set `count = data.coder_workspace.me.start_count` here because then # we can't use this value in `locals`, but we want to trigger recreation when # the scaletest is restarted. triggers = { From 74812f68e73384d643f3433e2d4439701d3f280c Mon Sep 17 00:00:00 2001 From: Mathias Fredriksson Date: Tue, 30 Jan 2024 12:52:11 +0200 Subject: [PATCH 17/18] stagger delay mins --- scaletest/templates/scaletest-runner/main.tf | 6 +++--- scaletest/templates/scaletest-runner/scripts/run.sh | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/scaletest/templates/scaletest-runner/main.tf b/scaletest/templates/scaletest-runner/main.tf index 558f44a8d9e6a..ef1c7ba814b4b 100644 --- a/scaletest/templates/scaletest-runner/main.tf +++ b/scaletest/templates/scaletest-runner/main.tf @@ -269,9 +269,9 @@ data "coder_parameter" "load_scenario_run_concurrently" { mutable = true } -data "coder_parameter" "load_scenario_concurrency_staggering" { +data "coder_parameter" "load_scenario_concurrency_stagger_delay_mins" { order = 25 - name = "Load Scenario Concurrency Staggering" + name = "Load Scenario Concurrency Stagger Delay" type = "number" default = 3 description = "The number of minutes to wait between starting each load scenario when run concurrently." @@ -596,7 +596,7 @@ resource "coder_agent" "main" { SCALETEST_PARAM_CLEANUP_PREPARE : data.coder_parameter.cleanup_prepare.value ? "1" : "0", SCALETEST_PARAM_LOAD_SCENARIOS : data.coder_parameter.load_scenarios.value, SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY : data.coder_parameter.load_scenario_run_concurrently.value ? 
"1" : "0", - SCALETEST_PARAM_LOAD_SCENARIO_CONCURRENCY_STAGGERING : "${data.coder_parameter.load_scenario_concurrency_staggering.value}", + SCALETEST_PARAM_LOAD_SCENARIO_CONCURRENCY_STAGGER_DELAY_MINS : "${data.coder_parameter.load_scenario_concurrency_stagger_delay_mins.value}", SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_DURATION : "${data.coder_parameter.load_scenario_ssh_traffic_duration.value}", SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_BYTES_PER_TICK : "${data.coder_parameter.load_scenario_ssh_bytes_per_tick.value}", SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_TICK_INTERVAL : "${data.coder_parameter.load_scenario_ssh_tick_interval.value}", diff --git a/scaletest/templates/scaletest-runner/scripts/run.sh b/scaletest/templates/scaletest-runner/scripts/run.sh index 6ca913abca9cf..47a6042a18598 100755 --- a/scaletest/templates/scaletest-runner/scripts/run.sh +++ b/scaletest/templates/scaletest-runner/scripts/run.sh @@ -323,7 +323,7 @@ for scenario in "${SCALETEST_PARAM_LOAD_SCENARIOS[@]}"; do pid_to_scenario+=(["${pids[-1]}"]="${scenario}") # Stagger the start of each scenario to avoid a burst of load and deted # problematic scenarios. 
- sleep $((SCALETEST_PARAM_LOAD_SCENARIO_CONCURRENCY_STAGGERING * 60)) + sleep $((SCALETEST_PARAM_LOAD_SCENARIO_CONCURRENCY_STAGGER_DELAY_MINS * 60)) continue fi From 21bc64b7ab8302f51a94df8ed1df4c96cc8b6c06 Mon Sep 17 00:00:00 2001 From: Mathias Fredriksson Date: Tue, 30 Jan 2024 13:35:16 +0200 Subject: [PATCH 18/18] add note about prom http port name --- scaletest/templates/scaletest-runner/main.tf | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/scaletest/templates/scaletest-runner/main.tf b/scaletest/templates/scaletest-runner/main.tf index ef1c7ba814b4b..2a6eb8ca21ed5 100644 --- a/scaletest/templates/scaletest-runner/main.tf +++ b/scaletest/templates/scaletest-runner/main.tf @@ -947,6 +947,10 @@ resource "kubernetes_manifest" "pod_monitor" { } } podMetricsEndpoints = [ + # NOTE(mafredri): We could add more information here by including the + # scenario name in the port name (although it's limited to 15 chars so + # it needs to be short). That said, someone looking at the stats can + # assume that there's a 1-to-1 mapping between scenario# and port. for i, _ in data.coder_parameter.load_scenario_run_concurrently.value ? jsondecode(data.coder_parameter.load_scenarios.value) : [""] : { port = "prom-http${i}" interval = "15s"