feat(scaletest/templates): add support for concurrent scenarios #11753

Merged
222 changes: 193 additions & 29 deletions scaletest/templates/scaletest-runner/main.tf
@@ -12,11 +12,12 @@ terraform {
}

resource "time_static" "start_time" {
# We con't set `count = data.coder_workspace.me.start_count` here because then
# we can't use this value in `locals`. The permission check is recreated on
# start, which will update the timestamp.
# We don't set `count = data.coder_workspace.me.start_count` here because then
# we can't use this value in `locals`, but we want to trigger recreation when
# the scaletest is restarted.
triggers = {
count : length(null_resource.permission_check)
count : data.coder_workspace.me.start_count
token : data.coder_workspace.me.owner_session_token # Rely on this being re-generated every start.
}
}

@@ -39,8 +40,6 @@ locals {
workspace_pod_instance = "coder-workspace-${lower(data.coder_workspace.me.owner)}-${lower(data.coder_workspace.me.name)}"
workspace_pod_termination_grace_period_seconds = 5 * 60 * 60 # 5 hours (cleanup timeout).
service_account_name = "scaletest-sa"
cpu = 16
memory = 64
home_disk_size = 10
scaletest_run_id = "scaletest-${replace(time_static.start_time.rfc3339, ":", "-")}"
scaletest_run_dir = "/home/coder/${local.scaletest_run_id}"
@@ -171,6 +170,16 @@ data "coder_parameter" "cleanup_strategy" {
}
}

data "coder_parameter" "cleanup_prepare" {
order = 14
type = "bool"
name = "Cleanup before scaletest"
default = true
description = "Cleanup existing scaletest users and workspaces before the scaletest starts (prepare phase)."
mutable = true
ephemeral = true
}


data "coder_parameter" "workspace_template" {
order = 20
Expand Down Expand Up @@ -226,9 +235,18 @@ data "coder_parameter" "num_workspaces" {
}
}

data "coder_parameter" "skip_create_workspaces" {
order = 22
type = "bool"
name = "DEBUG: Skip creating workspaces"
default = false
description = "Skip creating workspaces (for resuming failed scaletests or debugging)"
mutable = true
}


data "coder_parameter" "load_scenarios" {
order = 22
order = 23
name = "Load Scenarios"
type = "list(string)"
description = "The load scenarios to run."
@@ -237,12 +255,31 @@ data "coder_parameter" "load_scenarios" {
default = jsonencode([
"SSH Traffic",
"Web Terminal Traffic",
"App Traffic",
"Dashboard Traffic",
])
}

data "coder_parameter" "load_scenario_run_concurrently" {
order = 24
name = "Run Load Scenarios Concurrently"
type = "bool"
default = false
description = "Run all load scenarios concurrently, this setting enables the load scenario percentages so that they can be assigned a percentage of 1-100%."
mutable = true
}
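
How a percentage maps onto the fleet is handled by the runner scripts rather than by this template; as a rough illustration only (hypothetical example_* locals, not part of this diff), a percentage translates into a target count like so:

locals {
  # Hypothetical: 25% SSH traffic against 100 workspaces targets 25 of them.
  example_num_workspaces   = 100
  example_ssh_percentage   = 25
  example_ssh_target_count = floor(local.example_num_workspaces * local.example_ssh_percentage / 100)
}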

data "coder_parameter" "load_scenario_concurrency_stagger_delay_mins" {
order = 25
name = "Load Scenario Concurrency Stagger Delay"
type = "number"
default = 3
description = "The number of minutes to wait between starting each load scenario when run concurrently."
mutable = true
}
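
To illustrate the stagger (hypothetical example_* locals, not part of this diff): with the default 3-minute delay, concurrently run scenarios start at offsets 0, 3, 6, ... minutes in list order.

locals {
  # Hypothetical: start offset in minutes for each concurrently run scenario.
  example_stagger_mins = 3
  example_scenarios    = ["SSH Traffic", "Web Terminal Traffic", "Dashboard Traffic"]
  example_offsets      = { for i, s in local.example_scenarios : s => i * local.example_stagger_mins }
  # => { "SSH Traffic" = 0, "Web Terminal Traffic" = 3, "Dashboard Traffic" = 6 }
}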

data "coder_parameter" "load_scenario_ssh_traffic_duration" {
order = 23
order = 30
name = "SSH Traffic Duration"
type = "number"
description = "The duration of the SSH traffic load scenario in minutes."
@@ -255,7 +292,7 @@ data "coder_parameter" "load_scenario_ssh_traffic_duration" {
}

data "coder_parameter" "load_scenario_ssh_bytes_per_tick" {
order = 24
order = 31
name = "SSH Bytes Per Tick"
type = "number"
description = "The number of bytes to send per tick in the SSH traffic load scenario."
@@ -267,7 +304,7 @@ data "coder_parameter" "load_scenario_ssh_bytes_per_tick" {
}

data "coder_parameter" "load_scenario_ssh_tick_interval" {
order = 25
order = 32
name = "SSH Tick Interval"
type = "number"
description = "The number of milliseconds between each tick in the SSH traffic load scenario."
@@ -278,8 +315,21 @@ data "coder_parameter" "load_scenario_ssh_tick_interval" {
}
}

data "coder_parameter" "load_scenario_ssh_traffic_percentage" {
order = 33
name = "SSH Traffic Percentage"
type = "number"
description = "The percentage of workspaces that should be targeted for SSH traffic."
mutable = true
default = 100
validation {
min = 1
max = 100
}
}

data "coder_parameter" "load_scenario_web_terminal_traffic_duration" {
order = 26
order = 40
name = "Web Terminal Traffic Duration"
type = "number"
description = "The duration of the web terminal traffic load scenario in minutes."
@@ -292,7 +342,7 @@ data "coder_parameter" "load_scenario_web_terminal_traffic_duration" {
}

data "coder_parameter" "load_scenario_web_terminal_bytes_per_tick" {
order = 27
order = 41
name = "Web Terminal Bytes Per Tick"
type = "number"
description = "The number of bytes to send per tick in the web terminal traffic load scenario."
@@ -304,7 +354,7 @@ data "coder_parameter" "load_scenario_web_terminal_bytes_per_tick" {
}

data "coder_parameter" "load_scenario_web_terminal_tick_interval" {
order = 28
order = 42
name = "Web Terminal Tick Interval"
type = "number"
description = "The number of milliseconds between each tick in the web terminal traffic load scenario."
@@ -315,8 +365,94 @@ data "coder_parameter" "load_scenario_web_terminal_tick_interval" {
}
}

data "coder_parameter" "load_scenario_web_terminal_traffic_percentage" {
order = 43
name = "Web Terminal Traffic Percentage"
type = "number"
description = "The percentage of workspaces that should be targeted for web terminal traffic."
mutable = true
default = 100
validation {
min = 1
max = 100
}
}

data "coder_parameter" "load_scenario_app_traffic_duration" {
order = 50
name = "App Traffic Duration"
type = "number"
description = "The duration of the app traffic load scenario in minutes."
mutable = true
default = 30
validation {
min = 1
max = 1440 // 24 hours.
}
}

data "coder_parameter" "load_scenario_app_bytes_per_tick" {
order = 51
name = "App Bytes Per Tick"
type = "number"
description = "The number of bytes to send per tick in the app traffic load scenario."
mutable = true
default = 1024
validation {
min = 1
}
}

data "coder_parameter" "load_scenario_app_tick_interval" {
order = 52
name = "App Tick Interval"
type = "number"
description = "The number of milliseconds between each tick in the app traffic load scenario."
mutable = true
default = 100
validation {
min = 1
}
}

data "coder_parameter" "load_scenario_app_traffic_percentage" {
order = 53
name = "App Traffic Percentage"
type = "number"
description = "The percentage of workspaces that should be targeted for app traffic."
mutable = true
default = 100
validation {
min = 1
max = 100
}
}

data "coder_parameter" "load_scenario_app_traffic_mode" {
order = 54
name = "App Traffic Mode"
default = "wsec"
description = "The mode of the app traffic load scenario."
mutable = true
option {
name = "WebSocket Echo"
value = "wsec"
description = "Send traffic to the workspace via the app websocket and read it back."
}
option {
name = "WebSocket Read (Random)"
value = "wsra"
description = "Read traffic from the workspace via the app websocket."
}
option {
name = "WebSocket Write (Discard)"
value = "wsdi"
description = "Send traffic to the workspace via the app websocket."
}
}

data "coder_parameter" "load_scenario_dashboard_traffic_duration" {
order = 29
order = 60
name = "Dashboard Traffic Duration"
type = "number"
description = "The duration of the dashboard traffic load scenario in minutes."
@@ -328,8 +464,21 @@ data "coder_parameter" "load_scenario_dashboard_traffic_duration" {
}
}

data "coder_parameter" "load_scenario_dashboard_traffic_percentage" {
order = 61
name = "Dashboard Traffic Percentage"
type = "number"
description = "The percentage of users that should be targeted for dashboard traffic."
mutable = true
default = 100
validation {
min = 1
max = 100
}
}

data "coder_parameter" "load_scenario_baseline_duration" {
order = 26
order = 100
name = "Baseline Wait Duration"
type = "number"
description = "The duration to wait before starting a load scenario in minutes."
@@ -342,7 +491,7 @@ data "coder_parameter" "load_scenario_baseline_duration" {
}

data "coder_parameter" "greedy_agent" {
order = 30
order = 200
type = "bool"
name = "Greedy Agent"
default = false
@@ -352,7 +501,7 @@ data "coder_parameter" "greedy_agent" {
}

data "coder_parameter" "greedy_agent_template" {
order = 31
order = 201
name = "Greedy Agent Template"
display_name = "Greedy Agent Template"
description = "The template used for the greedy agent workspace (must not be same as workspace template)."
@@ -432,6 +581,7 @@ resource "coder_agent" "main" {
SCALETEST_RUN_ID : local.scaletest_run_id,
SCALETEST_RUN_DIR : local.scaletest_run_dir,
SCALETEST_RUN_START_TIME : local.scaletest_run_start_time,
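# Each concurrent load scenario is assigned its own Prometheus port, counting
# up from this start port (see the container port blocks in the pod spec below).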
SCALETEST_PROMETHEUS_START_PORT : "21112",

# Comment is a scaletest param, but we want to surface it separately from
# the rest, so we use a different name.
@@ -440,16 +590,28 @@ resource "coder_agent" "main" {
SCALETEST_PARAM_TEMPLATE : data.coder_parameter.workspace_template.value,
SCALETEST_PARAM_REPO_BRANCH : data.coder_parameter.repo_branch.value,
SCALETEST_PARAM_NUM_WORKSPACES : data.coder_parameter.num_workspaces.value,
SCALETEST_PARAM_SKIP_CREATE_WORKSPACES : data.coder_parameter.skip_create_workspaces.value ? "1" : "0",
SCALETEST_PARAM_CREATE_CONCURRENCY : "${data.coder_parameter.create_concurrency.value}",
SCALETEST_PARAM_CLEANUP_STRATEGY : data.coder_parameter.cleanup_strategy.value,
SCALETEST_PARAM_CLEANUP_PREPARE : data.coder_parameter.cleanup_prepare.value ? "1" : "0",
SCALETEST_PARAM_LOAD_SCENARIOS : data.coder_parameter.load_scenarios.value,
SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY : data.coder_parameter.load_scenario_run_concurrently.value ? "1" : "0",
SCALETEST_PARAM_LOAD_SCENARIO_CONCURRENCY_STAGGER_DELAY_MINS : "${data.coder_parameter.load_scenario_concurrency_stagger_delay_mins.value}",
SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_DURATION : "${data.coder_parameter.load_scenario_ssh_traffic_duration.value}",
SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_BYTES_PER_TICK : "${data.coder_parameter.load_scenario_ssh_bytes_per_tick.value}",
SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_TICK_INTERVAL : "${data.coder_parameter.load_scenario_ssh_tick_interval.value}",
SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_PERCENTAGE : "${data.coder_parameter.load_scenario_ssh_traffic_percentage.value}",
SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_DURATION : "${data.coder_parameter.load_scenario_web_terminal_traffic_duration.value}",
SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_BYTES_PER_TICK : "${data.coder_parameter.load_scenario_web_terminal_bytes_per_tick.value}",
SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_TICK_INTERVAL : "${data.coder_parameter.load_scenario_web_terminal_tick_interval.value}",
SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_PERCENTAGE : "${data.coder_parameter.load_scenario_web_terminal_traffic_percentage.value}",
SCALETEST_PARAM_LOAD_SCENARIO_APP_TRAFFIC_DURATION : "${data.coder_parameter.load_scenario_app_traffic_duration.value}",
SCALETEST_PARAM_LOAD_SCENARIO_APP_TRAFFIC_BYTES_PER_TICK : "${data.coder_parameter.load_scenario_app_bytes_per_tick.value}",
SCALETEST_PARAM_LOAD_SCENARIO_APP_TRAFFIC_TICK_INTERVAL : "${data.coder_parameter.load_scenario_app_tick_interval.value}",
SCALETEST_PARAM_LOAD_SCENARIO_APP_TRAFFIC_PERCENTAGE : "${data.coder_parameter.load_scenario_app_traffic_percentage.value}",
SCALETEST_PARAM_LOAD_SCENARIO_APP_TRAFFIC_MODE : data.coder_parameter.load_scenario_app_traffic_mode.value,
SCALETEST_PARAM_LOAD_SCENARIO_DASHBOARD_TRAFFIC_DURATION : "${data.coder_parameter.load_scenario_dashboard_traffic_duration.value}",
SCALETEST_PARAM_LOAD_SCENARIO_DASHBOARD_TRAFFIC_PERCENTAGE : "${data.coder_parameter.load_scenario_dashboard_traffic_percentage.value}",
SCALETEST_PARAM_LOAD_SCENARIO_BASELINE_DURATION : "${data.coder_parameter.load_scenario_baseline_duration.value}",
SCALETEST_PARAM_GREEDY_AGENT : data.coder_parameter.greedy_agent.value ? "1" : "0",
SCALETEST_PARAM_GREEDY_AGENT_TEMPLATE : data.coder_parameter.greedy_agent_template.value,
@@ -693,26 +855,24 @@ resource "kubernetes_pod" "main" {
}
}
resources {
# Set requests and limits values such that we can do performant
# execution of `coder scaletest` commands.
requests = {
"cpu" = "250m"
"memory" = "512Mi"
}
limits = {
"cpu" = "${local.cpu}"
"memory" = "${local.memory}Gi"
}
}
volume_mount {
mount_path = "/home/coder"
name = "home"
read_only = false
}
port {
container_port = 21112
name = "prometheus-http"
protocol = "TCP"
dynamic "port" {
for_each = data.coder_parameter.load_scenario_run_concurrently.value ? jsondecode(data.coder_parameter.load_scenarios.value) : [""]
iterator = it
content {
container_port = 21112 + it.key
name = "prom-http${it.key}"
protocol = "TCP"
}
}
}
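
For illustration (a sketch of the expanded form, not part of the diff): with concurrency enabled and the four default scenarios, the dynamic block above renders one port per scenario index, e.g.

port {
  container_port = 21112 # 21112 + index 0
  name           = "prom-http0"
  protocol       = "TCP"
}
# ...through container_port 21115 / name "prom-http3". With concurrency
# disabled, the [""] fallback yields a single port, 21112 ("prom-http0").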

@@ -787,8 +947,12 @@ resource "kubernetes_manifest" "pod_monitor" {
}
}
podMetricsEndpoints = [
{
port = "prometheus-http"
# NOTE(mafredri): We could add more information here by including the
# scenario name in the port name (although it's limited to 15 chars so
# it needs to be short). That said, someone looking at the stats can
# assume that there's a 1-to-1 mapping between scenario# and port.
for i, _ in data.coder_parameter.load_scenario_run_concurrently.value ? jsondecode(data.coder_parameter.load_scenarios.value) : [""] : {
port = "prom-http${i}"
interval = "15s"
}
]
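
As an illustration (not part of the diff), with concurrency enabled and four scenarios the for expression above evaluates to:

podMetricsEndpoints = [
  { port = "prom-http0", interval = "15s" },
  { port = "prom-http1", interval = "15s" },
  { port = "prom-http2", interval = "15s" },
  { port = "prom-http3", interval = "15s" },
]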