diff --git a/scaletest/templates/scaletest-runner/main.tf b/scaletest/templates/scaletest-runner/main.tf index b536fc51afbb3..2a6eb8ca21ed5 100644 --- a/scaletest/templates/scaletest-runner/main.tf +++ b/scaletest/templates/scaletest-runner/main.tf @@ -12,11 +12,12 @@ terraform { } resource "time_static" "start_time" { - # We con't set `count = data.coder_workspace.me.start_count` here because then - # we can't use this value in `locals`. The permission check is recreated on - # start, which will update the timestamp. + # We don't set `count = data.coder_workspace.me.start_count` here because then + # we can't use this value in `locals`, but we want to trigger recreation when + # the scaletest is restarted. triggers = { - count : length(null_resource.permission_check) + count : data.coder_workspace.me.start_count + token : data.coder_workspace.me.owner_session_token # Rely on this being re-generated every start. } } @@ -39,8 +40,6 @@ locals { workspace_pod_instance = "coder-workspace-${lower(data.coder_workspace.me.owner)}-${lower(data.coder_workspace.me.name)}" workspace_pod_termination_grace_period_seconds = 5 * 60 * 60 # 5 hours (cleanup timeout). service_account_name = "scaletest-sa" - cpu = 16 - memory = 64 home_disk_size = 10 scaletest_run_id = "scaletest-${replace(time_static.start_time.rfc3339, ":", "-")}" scaletest_run_dir = "/home/coder/${local.scaletest_run_id}" @@ -171,6 +170,16 @@ data "coder_parameter" "cleanup_strategy" { } } +data "coder_parameter" "cleanup_prepare" { + order = 14 + type = "bool" + name = "Cleanup before scaletest" + default = true + description = "Clean up existing scaletest users and workspaces before the scaletest starts (prepare phase)." + mutable = true + ephemeral = true +} + data "coder_parameter" "workspace_template" { order = 20 @@ -226,9 +235,18 @@ data "coder_parameter" "num_workspaces" { } } +data "coder_parameter" "skip_create_workspaces" { + order = 22 + type = "bool" + name = "DEBUG: Skip creating workspaces" + default = false + description = "Skip creating workspaces (for resuming failed scaletests or debugging)." + mutable = true +} + data "coder_parameter" "load_scenarios" { - order = 22 + order = 23 name = "Load Scenarios" type = "list(string)" description = "The load scenarios to run." @@ -237,12 +255,31 @@ data "coder_parameter" "load_scenarios" { default = jsonencode([ "SSH Traffic", "Web Terminal Traffic", + "App Traffic", "Dashboard Traffic", ]) } +data "coder_parameter" "load_scenario_run_concurrently" { + order = 24 + name = "Run Load Scenarios Concurrently" + type = "bool" + default = false + description = "Run all load scenarios concurrently. This enables the load scenario percentages so that each scenario can be assigned a percentage of 1-100%." + mutable = true +} + +data "coder_parameter" "load_scenario_concurrency_stagger_delay_mins" { + order = 25 + name = "Load Scenario Concurrency Stagger Delay" + type = "number" + default = 3 + description = "The number of minutes to wait between starting each load scenario when run concurrently." + mutable = true +} + data "coder_parameter" "load_scenario_ssh_traffic_duration" { - order = 23 + order = 30 name = "SSH Traffic Duration" type = "number" description = "The duration of the SSH traffic load scenario in minutes."
@@ -255,7 +292,7 @@ data "coder_parameter" "load_scenario_ssh_traffic_duration" { } data "coder_parameter" "load_scenario_ssh_bytes_per_tick" { - order = 24 + order = 31 name = "SSH Bytes Per Tick" type = "number" description = "The number of bytes to send per tick in the SSH traffic load scenario." @@ -267,7 +304,7 @@ data "coder_parameter" "load_scenario_ssh_bytes_per_tick" { } data "coder_parameter" "load_scenario_ssh_tick_interval" { - order = 25 + order = 32 name = "SSH Tick Interval" type = "number" description = "The number of milliseconds between each tick in the SSH traffic load scenario." @@ -278,8 +315,21 @@ data "coder_parameter" "load_scenario_ssh_tick_interval" { } } +data "coder_parameter" "load_scenario_ssh_traffic_percentage" { + order = 33 + name = "SSH Traffic Percentage" + type = "number" + description = "The percentage of workspaces that should be targeted for SSH traffic." + mutable = true + default = 100 + validation { + min = 1 + max = 100 + } +} + data "coder_parameter" "load_scenario_web_terminal_traffic_duration" { - order = 26 + order = 40 name = "Web Terminal Traffic Duration" type = "number" description = "The duration of the web terminal traffic load scenario in minutes." @@ -292,7 +342,7 @@ data "coder_parameter" "load_scenario_web_terminal_traffic_duration" { } data "coder_parameter" "load_scenario_web_terminal_bytes_per_tick" { - order = 27 + order = 41 name = "Web Terminal Bytes Per Tick" type = "number" description = "The number of bytes to send per tick in the web terminal traffic load scenario." @@ -304,7 +354,7 @@ data "coder_parameter" "load_scenario_web_terminal_bytes_per_tick" { } data "coder_parameter" "load_scenario_web_terminal_tick_interval" { - order = 28 + order = 42 name = "Web Terminal Tick Interval" type = "number" description = "The number of milliseconds between each tick in the web terminal traffic load scenario." @@ -315,8 +365,94 @@ data "coder_parameter" "load_scenario_web_terminal_tick_interval" { } } +data "coder_parameter" "load_scenario_web_terminal_traffic_percentage" { + order = 43 + name = "Web Terminal Traffic Percentage" + type = "number" + description = "The percentage of workspaces that should be targeted for web terminal traffic." + mutable = true + default = 100 + validation { + min = 1 + max = 100 + } +} + +data "coder_parameter" "load_scenario_app_traffic_duration" { + order = 50 + name = "App Traffic Duration" + type = "number" + description = "The duration of the app traffic load scenario in minutes." + mutable = true + default = 30 + validation { + min = 1 + max = 1440 // 24 hours. + } +} + +data "coder_parameter" "load_scenario_app_bytes_per_tick" { + order = 51 + name = "App Bytes Per Tick" + type = "number" + description = "The number of bytes to send per tick in the app traffic load scenario." + mutable = true + default = 1024 + validation { + min = 1 + } +} + +data "coder_parameter" "load_scenario_app_tick_interval" { + order = 52 + name = "App Tick Interval" + type = "number" + description = "The number of milliseconds between each tick in the app traffic load scenario." + mutable = true + default = 100 + validation { + min = 1 + } +} + +data "coder_parameter" "load_scenario_app_traffic_percentage" { + order = 53 + name = "App Traffic Percentage" + type = "number" + description = "The percentage of workspaces that should be targeted for app traffic." 
+ mutable = true + default = 100 + validation { + min = 1 + max = 100 + } +} + +data "coder_parameter" "load_scenario_app_traffic_mode" { + order = 54 + name = "App Traffic Mode" + default = "wsec" + description = "The mode of the app traffic load scenario." + mutable = true + option { + name = "WebSocket Echo" + value = "wsec" + description = "Send traffic to the workspace via the app websocket and read it back." + } + option { + name = "WebSocket Read (Random)" + value = "wsra" + description = "Read traffic from the workspace via the app websocket." + } + option { + name = "WebSocket Write (Discard)" + value = "wsdi" + description = "Send traffic to the workspace via the app websocket." + } +} + data "coder_parameter" "load_scenario_dashboard_traffic_duration" { - order = 29 + order = 60 name = "Dashboard Traffic Duration" type = "number" description = "The duration of the dashboard traffic load scenario in minutes." @@ -328,8 +464,21 @@ data "coder_parameter" "load_scenario_dashboard_traffic_duration" { } } +data "coder_parameter" "load_scenario_dashboard_traffic_percentage" { + order = 61 + name = "Dashboard Traffic Percentage" + type = "number" + description = "The percentage of users that should be targeted for dashboard traffic." + mutable = true + default = 100 + validation { + min = 1 + max = 100 + } +} + data "coder_parameter" "load_scenario_baseline_duration" { - order = 26 + order = 100 name = "Baseline Wait Duration" type = "number" description = "The duration to wait before starting a load scenario in minutes." @@ -342,7 +491,7 @@ data "coder_parameter" "load_scenario_baseline_duration" { } data "coder_parameter" "greedy_agent" { - order = 30 + order = 200 type = "bool" name = "Greedy Agent" default = false @@ -352,7 +501,7 @@ data "coder_parameter" "greedy_agent" { } data "coder_parameter" "greedy_agent_template" { - order = 31 + order = 201 name = "Greedy Agent Template" display_name = "Greedy Agent Template" description = "The template used for the greedy agent workspace (must not be same as workspace template)." @@ -432,6 +581,7 @@ resource "coder_agent" "main" { SCALETEST_RUN_ID : local.scaletest_run_id, SCALETEST_RUN_DIR : local.scaletest_run_dir, SCALETEST_RUN_START_TIME : local.scaletest_run_start_time, + SCALETEST_PROMETHEUS_START_PORT : "21112", # Comment is a scaletest param, but we want to surface it separately from # the rest, so we use a different name. @@ -440,16 +590,28 @@ resource "coder_agent" "main" { SCALETEST_PARAM_TEMPLATE : data.coder_parameter.workspace_template.value, SCALETEST_PARAM_REPO_BRANCH : data.coder_parameter.repo_branch.value, SCALETEST_PARAM_NUM_WORKSPACES : data.coder_parameter.num_workspaces.value, + SCALETEST_PARAM_SKIP_CREATE_WORKSPACES : data.coder_parameter.skip_create_workspaces.value ? "1" : "0", SCALETEST_PARAM_CREATE_CONCURRENCY : "${data.coder_parameter.create_concurrency.value}", SCALETEST_PARAM_CLEANUP_STRATEGY : data.coder_parameter.cleanup_strategy.value, + SCALETEST_PARAM_CLEANUP_PREPARE : data.coder_parameter.cleanup_prepare.value ? "1" : "0", SCALETEST_PARAM_LOAD_SCENARIOS : data.coder_parameter.load_scenarios.value, + SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY : data.coder_parameter.load_scenario_run_concurrently.value ? 
"1" : "0", + SCALETEST_PARAM_LOAD_SCENARIO_CONCURRENCY_STAGGER_DELAY_MINS : "${data.coder_parameter.load_scenario_concurrency_stagger_delay_mins.value}", SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_DURATION : "${data.coder_parameter.load_scenario_ssh_traffic_duration.value}", SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_BYTES_PER_TICK : "${data.coder_parameter.load_scenario_ssh_bytes_per_tick.value}", SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_TICK_INTERVAL : "${data.coder_parameter.load_scenario_ssh_tick_interval.value}", + SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_PERCENTAGE : "${data.coder_parameter.load_scenario_ssh_traffic_percentage.value}", SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_DURATION : "${data.coder_parameter.load_scenario_web_terminal_traffic_duration.value}", SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_BYTES_PER_TICK : "${data.coder_parameter.load_scenario_web_terminal_bytes_per_tick.value}", SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_TICK_INTERVAL : "${data.coder_parameter.load_scenario_web_terminal_tick_interval.value}", + SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_PERCENTAGE : "${data.coder_parameter.load_scenario_web_terminal_traffic_percentage.value}", + SCALETEST_PARAM_LOAD_SCENARIO_APP_TRAFFIC_DURATION : "${data.coder_parameter.load_scenario_app_traffic_duration.value}", + SCALETEST_PARAM_LOAD_SCENARIO_APP_TRAFFIC_BYTES_PER_TICK : "${data.coder_parameter.load_scenario_app_bytes_per_tick.value}", + SCALETEST_PARAM_LOAD_SCENARIO_APP_TRAFFIC_TICK_INTERVAL : "${data.coder_parameter.load_scenario_app_tick_interval.value}", + SCALETEST_PARAM_LOAD_SCENARIO_APP_TRAFFIC_PERCENTAGE : "${data.coder_parameter.load_scenario_app_traffic_percentage.value}", + SCALETEST_PARAM_LOAD_SCENARIO_APP_TRAFFIC_MODE : data.coder_parameter.load_scenario_app_traffic_mode.value, SCALETEST_PARAM_LOAD_SCENARIO_DASHBOARD_TRAFFIC_DURATION : "${data.coder_parameter.load_scenario_dashboard_traffic_duration.value}", + SCALETEST_PARAM_LOAD_SCENARIO_DASHBOARD_TRAFFIC_PERCENTAGE : "${data.coder_parameter.load_scenario_dashboard_traffic_percentage.value}", SCALETEST_PARAM_LOAD_SCENARIO_BASELINE_DURATION : "${data.coder_parameter.load_scenario_baseline_duration.value}", SCALETEST_PARAM_GREEDY_AGENT : data.coder_parameter.greedy_agent.value ? "1" : "0", SCALETEST_PARAM_GREEDY_AGENT_TEMPLATE : data.coder_parameter.greedy_agent_template.value, @@ -693,26 +855,24 @@ resource "kubernetes_pod" "main" { } } resources { - # Set requests and limits values such that we can do performant - # execution of `coder scaletest` commands. requests = { "cpu" = "250m" "memory" = "512Mi" } - limits = { - "cpu" = "${local.cpu}" - "memory" = "${local.memory}Gi" - } } volume_mount { mount_path = "/home/coder" name = "home" read_only = false } - port { - container_port = 21112 - name = "prometheus-http" - protocol = "TCP" + dynamic "port" { + for_each = data.coder_parameter.load_scenario_run_concurrently.value ? jsondecode(data.coder_parameter.load_scenarios.value) : [""] + iterator = it + content { + container_port = 21112 + it.key + name = "prom-http${it.key}" + protocol = "TCP" + } } } @@ -787,8 +947,12 @@ resource "kubernetes_manifest" "pod_monitor" { } } podMetricsEndpoints = [ - { - port = "prometheus-http" + # NOTE(mafredri): We could add more information here by including the + # scenario name in the port name (although it's limited to 15 chars so + # it needs to be short). That said, someone looking at the stats can + # assume that there's a 1-to-1 mapping between scenario# and port. 
+ for i, _ in data.coder_parameter.load_scenario_run_concurrently.value ? jsondecode(data.coder_parameter.load_scenarios.value) : [""] : { + port = "prom-http${i}" interval = "15s" } ] diff --git a/scaletest/templates/scaletest-runner/scripts/cleanup.sh b/scaletest/templates/scaletest-runner/scripts/cleanup.sh index ecdd086d9a4e0..c80982497b5e9 100755 --- a/scaletest/templates/scaletest-runner/scripts/cleanup.sh +++ b/scaletest/templates/scaletest-runner/scripts/cleanup.sh @@ -12,29 +12,51 @@ if [[ -z $event ]]; then event=manual fi -if [[ $event = manual ]]; then +do_cleanup() { + start_phase "Cleanup (${event})" + coder exp scaletest cleanup \ + --cleanup-job-timeout 2h \ + --cleanup-timeout 5h | + tee "${SCALETEST_RESULTS_DIR}/cleanup-${event}.txt" + end_phase +} + +do_scaledown() { + start_phase "Scale down provisioners (${event})" + maybedryrun "$DRY_RUN" kubectl scale deployment/coder-provisioner --replicas 1 + maybedryrun "$DRY_RUN" kubectl rollout status deployment/coder-provisioner + end_phase +} + +case "${event}" in +manual) echo -n 'WARNING: This will clean up all scaletest resources, continue? (y/n) ' read -r -n 1 if [[ $REPLY != [yY] ]]; then echo $'\nAborting...' exit 1 fi -fi + echo -start_phase "Cleanup (${event})" -coder exp scaletest cleanup \ - --cleanup-job-timeout 2h \ - --cleanup-timeout 5h | - tee "${SCALETEST_RESULTS_DIR}/cleanup-${event}.txt" -end_phase + do_cleanup + do_scaledown -if [[ $event != prepare ]]; then - start_phase "Scaling down provisioners..." - maybedryrun "$DRY_RUN" kubectl scale deployment/coder-provisioner --replicas 1 - maybedryrun "$DRY_RUN" kubectl rollout status deployment/coder-provisioner -fi - -if [[ $event = manual ]]; then echo 'Press any key to continue...' read -s -r -n 1 -fi + ;; +prepare) + do_cleanup + ;; +on_stop) ;; # Do nothing, handled by "shutdown". +always | on_success | on_error | shutdown) + do_cleanup + do_scaledown + ;; +shutdown_scale_down_only) + do_scaledown + ;; +*) + echo "Unknown event: ${event}" >&2 + exit 1 + ;; +esac diff --git a/scaletest/templates/scaletest-runner/scripts/prepare.sh b/scaletest/templates/scaletest-runner/scripts/prepare.sh index e7e6c4d2a292a..90b2dd05f945f 100755 --- a/scaletest/templates/scaletest-runner/scripts/prepare.sh +++ b/scaletest/templates/scaletest-runner/scripts/prepare.sh @@ -47,8 +47,10 @@ unset CODER_SESSION_TOKEN echo -n "${token}" >"${CODER_CONFIG_DIR}/session" [[ $VERBOSE == 1 ]] && set -x # Restore logging (if enabled). -log "Cleaning up from previous runs (if applicable)..." -"${SCRIPTS_DIR}/cleanup.sh" "prepare" +if [[ ${SCALETEST_PARAM_CLEANUP_PREPARE} == 1 ]]; then + log "Cleaning up from previous runs (if applicable)..." + "${SCRIPTS_DIR}/cleanup.sh" prepare +fi log "Preparation complete!" diff --git a/scaletest/templates/scaletest-runner/scripts/run.sh b/scaletest/templates/scaletest-runner/scripts/run.sh index 03bafc7cf6a84..47a6042a18598 100755 --- a/scaletest/templates/scaletest-runner/scripts/run.sh +++ b/scaletest/templates/scaletest-runner/scripts/run.sh @@ -13,15 +13,21 @@ log "Running scaletest..." 
set_status Running start_phase "Creating workspaces" -coder exp scaletest create-workspaces \ - --count "${SCALETEST_PARAM_NUM_WORKSPACES}" \ - --template "${SCALETEST_PARAM_TEMPLATE}" \ - --concurrency "${SCALETEST_PARAM_CREATE_CONCURRENCY}" \ - --timeout 5h \ - --job-timeout 5h \ - --no-cleanup \ - --output json:"${SCALETEST_RESULTS_DIR}/create-workspaces.json" -show_json "${SCALETEST_RESULTS_DIR}/create-workspaces.json" +if [[ ${SCALETEST_PARAM_SKIP_CREATE_WORKSPACES} == 0 ]]; then + # Note that we allow up to 5 failures to bring up the workspace, since + # we're creating a lot of workspaces at once and some of them may fail + # due to network issues or other transient errors. + coder exp scaletest create-workspaces \ + --retry 5 \ + --count "${SCALETEST_PARAM_NUM_WORKSPACES}" \ + --template "${SCALETEST_PARAM_TEMPLATE}" \ + --concurrency "${SCALETEST_PARAM_CREATE_CONCURRENCY}" \ + --timeout 5h \ + --job-timeout 5h \ + --no-cleanup \ + --output json:"${SCALETEST_RESULTS_DIR}/create-workspaces.json" + show_json "${SCALETEST_RESULTS_DIR}/create-workspaces.json" +fi end_phase wait_baseline "${SCALETEST_PARAM_LOAD_SCENARIO_BASELINE_DURATION}" @@ -86,20 +92,60 @@ else fi annotate_grafana_end greedy_agent "${scenario}: Greedy agent traffic" - return ${status} + return "${status}" } fi +run_scenario_cmd() { + local scenario=${1} + shift + local command=("$@") + + set +e + if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 1 ]]; then + annotate_grafana scenario "Load scenario: ${scenario}" + fi + "${command[@]}" + status=${?} + if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 1 ]]; then + export GRAFANA_ADD_TAGS= + if [[ ${status} != 0 ]]; then + GRAFANA_ADD_TAGS=error + fi + annotate_grafana_end scenario "Load scenario: ${scenario}" + fi + exit "${status}" +} + +declare -a pids=() +declare -A pid_to_scenario=() declare -A failed=() +target_start=0 +target_end=-1 + +if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 1 ]]; then + start_phase "Load scenarios: ${SCALETEST_PARAM_LOAD_SCENARIOS[*]}" +fi for scenario in "${SCALETEST_PARAM_LOAD_SCENARIOS[@]}"; do - start_phase "Load scenario: ${scenario}" + if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 0 ]]; then + start_phase "Load scenario: ${scenario}" + fi set +e status=0 case "${scenario}" in "SSH Traffic") greedy_agent_traffic "${SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_DURATION}" "${scenario}" & - coder exp scaletest workspace-traffic \ + greedy_agent_traffic_pid=$! + + target_count=$(jq -n --argjson percentage "${SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_PERCENTAGE}" --argjson num_workspaces "${SCALETEST_PARAM_NUM_WORKSPACES}" '$percentage / 100 * $num_workspaces | floor') + target_end=$((target_start + target_count)) + if [[ ${target_end} -gt ${SCALETEST_PARAM_NUM_WORKSPACES} ]]; then + log "WARNING: Target count ${target_end} exceeds number of workspaces ${SCALETEST_PARAM_NUM_WORKSPACES}, using ${SCALETEST_PARAM_NUM_WORKSPACES} instead." 
+ target_start=0 + target_end=${target_count} + fi + run_scenario_cmd "${scenario}" coder exp scaletest workspace-traffic \ --template "${SCALETEST_PARAM_TEMPLATE}" \ --ssh \ --bytes-per-tick "${SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_BYTES_PER_TICK}" \ @@ -107,55 +153,160 @@ for scenario in "${SCALETEST_PARAM_LOAD_SCENARIOS[@]}"; do --timeout "${SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_DURATION}m" \ --job-timeout "${SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_DURATION}m30s" \ --output json:"${SCALETEST_RESULTS_DIR}/traffic-ssh.json" \ - "${non_greedy_agent_traffic_args[@]}" - status=$? - wait + --scaletest-prometheus-address "0.0.0.0:${SCALETEST_PROMETHEUS_START_PORT}" \ + --target-workspaces "${target_start}:${target_end}" \ + "${non_greedy_agent_traffic_args[@]}" & + pids+=($!) + if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 0 ]]; then + wait "${pids[-1]}" + status=$? + show_json "${SCALETEST_RESULTS_DIR}/traffic-ssh.json" + else + SCALETEST_PROMETHEUS_START_PORT=$((SCALETEST_PROMETHEUS_START_PORT + 1)) + fi + wait "${greedy_agent_traffic_pid}" status2=$? if [[ ${status} == 0 ]]; then status=${status2} fi - show_json "${SCALETEST_RESULTS_DIR}/traffic-ssh.json" ;; "Web Terminal Traffic") greedy_agent_traffic "${SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_DURATION}" "${scenario}" & - coder exp scaletest workspace-traffic \ + greedy_agent_traffic_pid=$! + + target_count=$(jq -n --argjson percentage "${SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_PERCENTAGE}" --argjson num_workspaces "${SCALETEST_PARAM_NUM_WORKSPACES}" '$percentage / 100 * $num_workspaces | floor') + target_end=$((target_start + target_count)) + if [[ ${target_end} -gt ${SCALETEST_PARAM_NUM_WORKSPACES} ]]; then + log "WARNING: Target count ${target_end} exceeds number of workspaces ${SCALETEST_PARAM_NUM_WORKSPACES}, using ${SCALETEST_PARAM_NUM_WORKSPACES} instead." + target_start=0 + target_end=${target_count} + fi + run_scenario_cmd "${scenario}" coder exp scaletest workspace-traffic \ --template "${SCALETEST_PARAM_TEMPLATE}" \ --bytes-per-tick "${SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_BYTES_PER_TICK}" \ --tick-interval "${SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_TICK_INTERVAL}ms" \ --timeout "${SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_DURATION}m" \ --job-timeout "${SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_DURATION}m30s" \ --output json:"${SCALETEST_RESULTS_DIR}/traffic-web-terminal.json" \ - "${non_greedy_agent_traffic_args[@]}" - status=$? - wait + --scaletest-prometheus-address "0.0.0.0:${SCALETEST_PROMETHEUS_START_PORT}" \ + --target-workspaces "${target_start}:${target_end}" \ + "${non_greedy_agent_traffic_args[@]}" & + pids+=($!) + if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 0 ]]; then + wait "${pids[-1]}" + status=$? + show_json "${SCALETEST_RESULTS_DIR}/traffic-web-terminal.json" + else + SCALETEST_PROMETHEUS_START_PORT=$((SCALETEST_PROMETHEUS_START_PORT + 1)) + fi + wait "${greedy_agent_traffic_pid}" + status2=$? + if [[ ${status} == 0 ]]; then + status=${status2} + fi + ;; + "App Traffic") + greedy_agent_traffic "${SCALETEST_PARAM_LOAD_SCENARIO_APP_TRAFFIC_DURATION}" "${scenario}" & + greedy_agent_traffic_pid=$! 
+ + target_count=$(jq -n --argjson percentage "${SCALETEST_PARAM_LOAD_SCENARIO_APP_TRAFFIC_PERCENTAGE}" --argjson num_workspaces "${SCALETEST_PARAM_NUM_WORKSPACES}" '$percentage / 100 * $num_workspaces | floor') + target_end=$((target_start + target_count)) + if [[ ${target_end} -gt ${SCALETEST_PARAM_NUM_WORKSPACES} ]]; then + log "WARNING: Target count ${target_end} exceeds number of workspaces ${SCALETEST_PARAM_NUM_WORKSPACES}, using ${SCALETEST_PARAM_NUM_WORKSPACES} instead." + target_start=0 + target_end=${target_count} + fi + run_scenario_cmd "${scenario}" coder exp scaletest workspace-traffic \ + --template "${SCALETEST_PARAM_TEMPLATE}" \ + --bytes-per-tick "${SCALETEST_PARAM_LOAD_SCENARIO_APP_TRAFFIC_BYTES_PER_TICK}" \ + --tick-interval "${SCALETEST_PARAM_LOAD_SCENARIO_APP_TRAFFIC_TICK_INTERVAL}ms" \ + --timeout "${SCALETEST_PARAM_LOAD_SCENARIO_APP_TRAFFIC_DURATION}m" \ + --job-timeout "${SCALETEST_PARAM_LOAD_SCENARIO_APP_TRAFFIC_DURATION}m30s" \ + --output json:"${SCALETEST_RESULTS_DIR}/traffic-app.json" \ + --scaletest-prometheus-address "0.0.0.0:${SCALETEST_PROMETHEUS_START_PORT}" \ + --app "${SCALETEST_PARAM_LOAD_SCENARIO_APP_TRAFFIC_MODE}" \ + --target-workspaces "${target_start}:${target_end}" \ + "${non_greedy_agent_traffic_args[@]}" & + pids+=($!) + if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 0 ]]; then + wait "${pids[-1]}" + status=$? + show_json "${SCALETEST_RESULTS_DIR}/traffic-app.json" + else + SCALETEST_PROMETHEUS_START_PORT=$((SCALETEST_PROMETHEUS_START_PORT + 1)) + fi + wait "${greedy_agent_traffic_pid}" status2=$? if [[ ${status} == 0 ]]; then status=${status2} fi - show_json "${SCALETEST_RESULTS_DIR}/traffic-web-terminal.json" ;; "Dashboard Traffic") - coder exp scaletest dashboard \ + target_count=$(jq -n --argjson percentage "${SCALETEST_PARAM_LOAD_SCENARIO_DASHBOARD_TRAFFIC_PERCENTAGE}" --argjson num_workspaces "${SCALETEST_PARAM_NUM_WORKSPACES}" '$percentage / 100 * $num_workspaces | floor') + target_end=$((target_start + target_count)) + if [[ ${target_end} -gt ${SCALETEST_PARAM_NUM_WORKSPACES} ]]; then + log "WARNING: Target count ${target_end} exceeds number of workspaces ${SCALETEST_PARAM_NUM_WORKSPACES}, using ${SCALETEST_PARAM_NUM_WORKSPACES} instead." + target_start=0 + target_end=${target_count} + fi + # TODO: Remove this once the dashboard traffic command is fixed, + # (i.e. once images are no longer dumped into PWD). + mkdir -p dashboard + pushd dashboard + run_scenario_cmd "${scenario}" coder exp scaletest dashboard \ --timeout "${SCALETEST_PARAM_LOAD_SCENARIO_DASHBOARD_TRAFFIC_DURATION}m" \ --job-timeout "${SCALETEST_PARAM_LOAD_SCENARIO_DASHBOARD_TRAFFIC_DURATION}m30s" \ --output json:"${SCALETEST_RESULTS_DIR}/traffic-dashboard.json" \ - >"${SCALETEST_RESULTS_DIR}/traffic-dashboard-output.log" - status=$? - show_json "${SCALETEST_RESULTS_DIR}/traffic-dashboard.json" + --scaletest-prometheus-address "0.0.0.0:${SCALETEST_PROMETHEUS_START_PORT}" \ + --target-users "${target_start}:${target_end}" \ + >"${SCALETEST_RESULTS_DIR}/traffic-dashboard-output.log" & + pids+=($!) + popd + if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 0 ]]; then + wait "${pids[-1]}" + status=$? + show_json "${SCALETEST_RESULTS_DIR}/traffic-dashboard.json" + else + SCALETEST_PROMETHEUS_START_PORT=$((SCALETEST_PROMETHEUS_START_PORT + 1)) + fi ;; # Debug scenarios, for testing the runner. "debug:greedy_agent_traffic") - greedy_agent_traffic 10 "${scenario}" - status=$? + greedy_agent_traffic 10 "${scenario}" & + pids+=($!) 
+ if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 0 ]]; then + wait "${pids[-1]}" + status=$? + else + SCALETEST_PROMETHEUS_START_PORT=$((SCALETEST_PROMETHEUS_START_PORT + 1)) + fi ;; "debug:success") - maybedryrun "$DRY_RUN" sleep 10 - status=0 + { + maybedryrun "$DRY_RUN" sleep 10 + true + } & + pids+=($!) + if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 0 ]]; then + wait "${pids[-1]}" + status=$? + else + SCALETEST_PROMETHEUS_START_PORT=$((SCALETEST_PROMETHEUS_START_PORT + 1)) + fi ;; "debug:error") - maybedryrun "$DRY_RUN" sleep 10 - status=1 + { + maybedryrun "$DRY_RUN" sleep 10 + false + } & + pids+=($!) + if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 0 ]]; then + wait "${pids[-1]}" + status=$? + else + SCALETEST_PROMETHEUS_START_PORT=$((SCALETEST_PROMETHEUS_START_PORT + 1)) + fi ;; *) @@ -163,9 +314,22 @@ for scenario in "${SCALETEST_PARAM_LOAD_SCENARIOS[@]}"; do ;; esac set -e + + # Allow targeting to be distributed evenly across workspaces when each + # scenario is run concurrently and all percentages add up to 100. + target_start=${target_end} + + if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 1 ]]; then + pid_to_scenario+=(["${pids[-1]}"]="${scenario}") + # Stagger the start of each scenario to avoid a burst of load and detect + # problematic scenarios. + sleep $((SCALETEST_PARAM_LOAD_SCENARIO_CONCURRENCY_STAGGER_DELAY_MINS * 60)) + continue + fi + if ((status > 0)); then log "Load scenario failed: ${scenario} (exit=${status})" - failed+=(["${scenario}"]="$status") + failed+=(["${scenario}"]="${status}") PHASE_ADD_TAGS=error end_phase else end_phase @@ -173,6 +337,25 @@ for scenario in "${SCALETEST_PARAM_LOAD_SCENARIOS[@]}"; do wait_baseline "${SCALETEST_PARAM_LOAD_SCENARIO_BASELINE_DURATION}" done +if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 1 ]]; then + wait "${pids[@]}" + # Waiting on all pids blocks until they have all exited, but we still + # need to check their individual exit codes. + for pid in "${pids[@]}"; do + wait "${pid}" + status=${?} + scenario=${pid_to_scenario[${pid}]} + if ((status > 0)); then + log "Load scenario failed: ${scenario} (exit=${status})" + failed+=(["${scenario}"]="${status}") + fi + done + if ((${#failed[@]} > 0)); then + PHASE_ADD_TAGS=error end_phase + else + end_phase + fi +fi if ((${#failed[@]} > 0)); then log "Load scenarios failed: ${!failed[*]}" diff --git a/scaletest/templates/scaletest-runner/shutdown.sh b/scaletest/templates/scaletest-runner/shutdown.sh index d5c81366b1217..9e75864d73120 100755 --- a/scaletest/templates/scaletest-runner/shutdown.sh +++ b/scaletest/templates/scaletest-runner/shutdown.sh @@ -14,7 +14,11 @@ trap cleanup EXIT annotate_grafana "workspace" "Agent stopping..."
-"${SCRIPTS_DIR}/cleanup.sh" shutdown +shutdown_event=shutdown_scale_down_only +if [[ ${SCALETEST_PARAM_CLEANUP_STRATEGY} == on_stop ]]; then + shutdown_event=shutdown +fi +"${SCRIPTS_DIR}/cleanup.sh" "${shutdown_event}" annotate_grafana_end "workspace" "Agent running" diff --git a/scaletest/templates/scaletest-runner/startup.sh b/scaletest/templates/scaletest-runner/startup.sh index 45bf4fb9ebd5c..3e4eb94f41810 100755 --- a/scaletest/templates/scaletest-runner/startup.sh +++ b/scaletest/templates/scaletest-runner/startup.sh @@ -8,6 +8,11 @@ if [[ ${SCALETEST_PARAM_GREEDY_AGENT_TEMPLATE} == "${SCALETEST_PARAM_TEMPLATE}" exit 1 fi +if [[ ${SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY} == 1 ]] && [[ ${SCALETEST_PARAM_GREEDY_AGENT} == 1 ]]; then + echo "ERROR: Load scenario concurrency and greedy agent test cannot be enabled at the same time." >&2 + exit 1 +fi + # Unzip scripts and add to path. # shellcheck disable=SC2153 echo "Extracting scaletest scripts into ${SCRIPTS_DIR}..."