
Commit 83eea2d

feat(scaletest/templates): add support for concurrent scenarios (#11753)
1 parent 4b27c77 commit 83eea2d

6 files changed (+460 -80 lines)


scaletest/templates/scaletest-runner/main.tf

+193 -29
@@ -12,11 +12,12 @@ terraform {
 }
 
 resource "time_static" "start_time" {
-  # We con't set `count = data.coder_workspace.me.start_count` here because then
-  # we can't use this value in `locals`. The permission check is recreated on
-  # start, which will update the timestamp.
+  # We don't set `count = data.coder_workspace.me.start_count` here because then
+  # we can't use this value in `locals`, but we want to trigger recreation when
+  # the scaletest is restarted.
   triggers = {
-    count : length(null_resource.permission_check)
+    count : data.coder_workspace.me.start_count
+    token : data.coder_workspace.me.owner_session_token # Rely on this being re-generated every start.
   }
 }
 
@@ -39,8 +40,6 @@ locals {
   workspace_pod_instance = "coder-workspace-${lower(data.coder_workspace.me.owner)}-${lower(data.coder_workspace.me.name)}"
   workspace_pod_termination_grace_period_seconds = 5 * 60 * 60 # 5 hours (cleanup timeout).
   service_account_name = "scaletest-sa"
-  cpu = 16
-  memory = 64
   home_disk_size = 10
   scaletest_run_id = "scaletest-${replace(time_static.start_time.rfc3339, ":", "-")}"
   scaletest_run_dir = "/home/coder/${local.scaletest_run_id}"
@@ -171,6 +170,16 @@ data "coder_parameter" "cleanup_strategy" {
   }
 }
 
+data "coder_parameter" "cleanup_prepare" {
+  order       = 14
+  type        = "bool"
+  name        = "Cleanup before scaletest"
+  default     = true
+  description = "Cleanup existing scaletest users and workspaces before the scaletest starts (prepare phase)."
+  mutable     = true
+  ephemeral   = true
+}
+
 
 data "coder_parameter" "workspace_template" {
   order = 20
@@ -226,9 +235,18 @@ data "coder_parameter" "num_workspaces" {
   }
 }
 
+data "coder_parameter" "skip_create_workspaces" {
+  order       = 22
+  type        = "bool"
+  name        = "DEBUG: Skip creating workspaces"
+  default     = false
+  description = "Skip creating workspaces (for resuming failed scaletests or debugging)"
+  mutable     = true
+}
+
 
 data "coder_parameter" "load_scenarios" {
-  order       = 22
+  order       = 23
   name        = "Load Scenarios"
   type        = "list(string)"
   description = "The load scenarios to run."
@@ -237,12 +255,31 @@ data "coder_parameter" "load_scenarios" {
   default = jsonencode([
     "SSH Traffic",
     "Web Terminal Traffic",
+    "App Traffic",
     "Dashboard Traffic",
   ])
 }
 
+data "coder_parameter" "load_scenario_run_concurrently" {
+  order       = 24
+  name        = "Run Load Scenarios Concurrently"
+  type        = "bool"
+  default     = false
+  description = "Run all load scenarios concurrently, this setting enables the load scenario percentages so that they can be assigned a percentage of 1-100%."
+  mutable     = true
+}
+
+data "coder_parameter" "load_scenario_concurrency_stagger_delay_mins" {
+  order       = 25
+  name        = "Load Scenario Concurrency Stagger Delay"
+  type        = "number"
+  default     = 3
+  description = "The number of minutes to wait between starting each load scenario when run concurrently."
+  mutable     = true
+}
+
 data "coder_parameter" "load_scenario_ssh_traffic_duration" {
-  order       = 23
+  order       = 30
   name        = "SSH Traffic Duration"
   type        = "number"
   description = "The duration of the SSH traffic load scenario in minutes."
@@ -255,7 +292,7 @@ data "coder_parameter" "load_scenario_ssh_traffic_duration" {
 }
 
 data "coder_parameter" "load_scenario_ssh_bytes_per_tick" {
-  order       = 24
+  order       = 31
   name        = "SSH Bytes Per Tick"
   type        = "number"
   description = "The number of bytes to send per tick in the SSH traffic load scenario."
@@ -267,7 +304,7 @@ data "coder_parameter" "load_scenario_ssh_bytes_per_tick" {
 }
 
 data "coder_parameter" "load_scenario_ssh_tick_interval" {
-  order       = 25
+  order       = 32
   name        = "SSH Tick Interval"
   type        = "number"
   description = "The number of milliseconds between each tick in the SSH traffic load scenario."
@@ -278,8 +315,21 @@ data "coder_parameter" "load_scenario_ssh_tick_interval" {
   }
 }
 
+data "coder_parameter" "load_scenario_ssh_traffic_percentage" {
+  order       = 33
+  name        = "SSH Traffic Percentage"
+  type        = "number"
+  description = "The percentage of workspaces that should be targeted for SSH traffic."
+  mutable     = true
+  default     = 100
+  validation {
+    min = 1
+    max = 100
+  }
+}
+
 data "coder_parameter" "load_scenario_web_terminal_traffic_duration" {
-  order       = 26
+  order       = 40
   name        = "Web Terminal Traffic Duration"
   type        = "number"
   description = "The duration of the web terminal traffic load scenario in minutes."
@@ -292,7 +342,7 @@ data "coder_parameter" "load_scenario_web_terminal_traffic_duration" {
 }
 
 data "coder_parameter" "load_scenario_web_terminal_bytes_per_tick" {
-  order       = 27
+  order       = 41
   name        = "Web Terminal Bytes Per Tick"
   type        = "number"
   description = "The number of bytes to send per tick in the web terminal traffic load scenario."
@@ -304,7 +354,7 @@ data "coder_parameter" "load_scenario_web_terminal_bytes_per_tick" {
 }
 
 data "coder_parameter" "load_scenario_web_terminal_tick_interval" {
-  order       = 28
+  order       = 42
   name        = "Web Terminal Tick Interval"
   type        = "number"
   description = "The number of milliseconds between each tick in the web terminal traffic load scenario."
@@ -315,8 +365,94 @@ data "coder_parameter" "load_scenario_web_terminal_tick_interval" {
   }
 }
 
+data "coder_parameter" "load_scenario_web_terminal_traffic_percentage" {
+  order       = 43
+  name        = "Web Terminal Traffic Percentage"
+  type        = "number"
+  description = "The percentage of workspaces that should be targeted for web terminal traffic."
+  mutable     = true
+  default     = 100
+  validation {
+    min = 1
+    max = 100
+  }
+}
+
+data "coder_parameter" "load_scenario_app_traffic_duration" {
+  order       = 50
+  name        = "App Traffic Duration"
+  type        = "number"
+  description = "The duration of the app traffic load scenario in minutes."
+  mutable     = true
+  default     = 30
+  validation {
+    min = 1
+    max = 1440 // 24 hours.
+  }
+}
+
+data "coder_parameter" "load_scenario_app_bytes_per_tick" {
+  order       = 51
+  name        = "App Bytes Per Tick"
+  type        = "number"
+  description = "The number of bytes to send per tick in the app traffic load scenario."
+  mutable     = true
+  default     = 1024
+  validation {
+    min = 1
+  }
+}
+
+data "coder_parameter" "load_scenario_app_tick_interval" {
+  order       = 52
+  name        = "App Tick Interval"
+  type        = "number"
+  description = "The number of milliseconds between each tick in the app traffic load scenario."
+  mutable     = true
+  default     = 100
+  validation {
+    min = 1
+  }
+}
+
+data "coder_parameter" "load_scenario_app_traffic_percentage" {
+  order       = 53
+  name        = "App Traffic Percentage"
+  type        = "number"
+  description = "The percentage of workspaces that should be targeted for app traffic."
+  mutable     = true
+  default     = 100
+  validation {
+    min = 1
+    max = 100
+  }
+}
+
+data "coder_parameter" "load_scenario_app_traffic_mode" {
+  order       = 54
+  name        = "App Traffic Mode"
+  default     = "wsec"
+  description = "The mode of the app traffic load scenario."
+  mutable     = true
+  option {
+    name        = "WebSocket Echo"
+    value       = "wsec"
+    description = "Send traffic to the workspace via the app websocket and read it back."
+  }
+  option {
+    name        = "WebSocket Read (Random)"
+    value       = "wsra"
+    description = "Read traffic from the workspace via the app websocket."
+  }
+  option {
+    name        = "WebSocket Write (Discard)"
+    value       = "wsdi"
+    description = "Send traffic to the workspace via the app websocket."
+  }
+}
+
 data "coder_parameter" "load_scenario_dashboard_traffic_duration" {
-  order       = 29
+  order       = 60
   name        = "Dashboard Traffic Duration"
   type        = "number"
   description = "The duration of the dashboard traffic load scenario in minutes."
@@ -328,8 +464,21 @@ data "coder_parameter" "load_scenario_dashboard_traffic_duration" {
   }
 }
 
+data "coder_parameter" "load_scenario_dashboard_traffic_percentage" {
+  order       = 61
+  name        = "Dashboard Traffic Percentage"
+  type        = "number"
+  description = "The percentage of users that should be targeted for dashboard traffic."
+  mutable     = true
+  default     = 100
+  validation {
+    min = 1
+    max = 100
+  }
+}
+
 data "coder_parameter" "load_scenario_baseline_duration" {
-  order       = 26
+  order       = 100
   name        = "Baseline Wait Duration"
   type        = "number"
   description = "The duration to wait before starting a load scenario in minutes."
@@ -342,7 +491,7 @@ data "coder_parameter" "load_scenario_baseline_duration" {
 }
 
 data "coder_parameter" "greedy_agent" {
-  order   = 30
+  order   = 200
   type    = "bool"
   name    = "Greedy Agent"
   default = false
@@ -352,7 +501,7 @@ data "coder_parameter" "greedy_agent" {
 }
 
 data "coder_parameter" "greedy_agent_template" {
-  order        = 31
+  order        = 201
  name         = "Greedy Agent Template"
  display_name = "Greedy Agent Template"
  description  = "The template used for the greedy agent workspace (must not be same as workspace template)."
@@ -432,6 +581,7 @@ resource "coder_agent" "main" {
     SCALETEST_RUN_ID : local.scaletest_run_id,
     SCALETEST_RUN_DIR : local.scaletest_run_dir,
     SCALETEST_RUN_START_TIME : local.scaletest_run_start_time,
+    SCALETEST_PROMETHEUS_START_PORT : "21112",
 
     # Comment is a scaletest param, but we want to surface it separately from
     # the rest, so we use a different name.
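The new SCALETEST_PROMETHEUS_START_PORT ties into the per-scenario container ports added further down: with concurrent scenarios, scenario i presumably serves its metrics on start port + i (the runner scripts that consume this variable live in other files of this commit). A minimal sketch of that mapping, with an assumed scenario list:

locals {
  prometheus_start_port = 21112
  scenarios             = ["SSH Traffic", "Web Terminal Traffic", "App Traffic", "Dashboard Traffic"] # assumed

  # Mirrors `container_port = 21112 + it.key` in the pod spec below.
  scenario_prometheus_port = {
    for i, s in local.scenarios : s => local.prometheus_start_port + i
  }
  # => { "SSH Traffic" = 21112, "Web Terminal Traffic" = 21113, "App Traffic" = 21114, "Dashboard Traffic" = 21115 }
}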
@@ -440,16 +590,28 @@ resource "coder_agent" "main" {
     SCALETEST_PARAM_TEMPLATE : data.coder_parameter.workspace_template.value,
     SCALETEST_PARAM_REPO_BRANCH : data.coder_parameter.repo_branch.value,
     SCALETEST_PARAM_NUM_WORKSPACES : data.coder_parameter.num_workspaces.value,
+    SCALETEST_PARAM_SKIP_CREATE_WORKSPACES : data.coder_parameter.skip_create_workspaces.value ? "1" : "0",
     SCALETEST_PARAM_CREATE_CONCURRENCY : "${data.coder_parameter.create_concurrency.value}",
     SCALETEST_PARAM_CLEANUP_STRATEGY : data.coder_parameter.cleanup_strategy.value,
+    SCALETEST_PARAM_CLEANUP_PREPARE : data.coder_parameter.cleanup_prepare.value ? "1" : "0",
     SCALETEST_PARAM_LOAD_SCENARIOS : data.coder_parameter.load_scenarios.value,
+    SCALETEST_PARAM_LOAD_SCENARIO_RUN_CONCURRENTLY : data.coder_parameter.load_scenario_run_concurrently.value ? "1" : "0",
+    SCALETEST_PARAM_LOAD_SCENARIO_CONCURRENCY_STAGGER_DELAY_MINS : "${data.coder_parameter.load_scenario_concurrency_stagger_delay_mins.value}",
     SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_DURATION : "${data.coder_parameter.load_scenario_ssh_traffic_duration.value}",
     SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_BYTES_PER_TICK : "${data.coder_parameter.load_scenario_ssh_bytes_per_tick.value}",
     SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_TICK_INTERVAL : "${data.coder_parameter.load_scenario_ssh_tick_interval.value}",
+    SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_PERCENTAGE : "${data.coder_parameter.load_scenario_ssh_traffic_percentage.value}",
     SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_DURATION : "${data.coder_parameter.load_scenario_web_terminal_traffic_duration.value}",
     SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_BYTES_PER_TICK : "${data.coder_parameter.load_scenario_web_terminal_bytes_per_tick.value}",
     SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_TICK_INTERVAL : "${data.coder_parameter.load_scenario_web_terminal_tick_interval.value}",
+    SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_PERCENTAGE : "${data.coder_parameter.load_scenario_web_terminal_traffic_percentage.value}",
+    SCALETEST_PARAM_LOAD_SCENARIO_APP_TRAFFIC_DURATION : "${data.coder_parameter.load_scenario_app_traffic_duration.value}",
+    SCALETEST_PARAM_LOAD_SCENARIO_APP_TRAFFIC_BYTES_PER_TICK : "${data.coder_parameter.load_scenario_app_bytes_per_tick.value}",
+    SCALETEST_PARAM_LOAD_SCENARIO_APP_TRAFFIC_TICK_INTERVAL : "${data.coder_parameter.load_scenario_app_tick_interval.value}",
+    SCALETEST_PARAM_LOAD_SCENARIO_APP_TRAFFIC_PERCENTAGE : "${data.coder_parameter.load_scenario_app_traffic_percentage.value}",
+    SCALETEST_PARAM_LOAD_SCENARIO_APP_TRAFFIC_MODE : data.coder_parameter.load_scenario_app_traffic_mode.value,
     SCALETEST_PARAM_LOAD_SCENARIO_DASHBOARD_TRAFFIC_DURATION : "${data.coder_parameter.load_scenario_dashboard_traffic_duration.value}",
+    SCALETEST_PARAM_LOAD_SCENARIO_DASHBOARD_TRAFFIC_PERCENTAGE : "${data.coder_parameter.load_scenario_dashboard_traffic_percentage.value}",
     SCALETEST_PARAM_LOAD_SCENARIO_BASELINE_DURATION : "${data.coder_parameter.load_scenario_baseline_duration.value}",
     SCALETEST_PARAM_GREEDY_AGENT : data.coder_parameter.greedy_agent.value ? "1" : "0",
     SCALETEST_PARAM_GREEDY_AGENT_TEMPLATE : data.coder_parameter.greedy_agent_template.value,
@@ -693,26 +855,24 @@ resource "kubernetes_pod" "main" {
         }
       }
       resources {
-        # Set requests and limits values such that we can do performant
-        # execution of `coder scaletest` commands.
         requests = {
           "cpu"    = "250m"
           "memory" = "512Mi"
         }
-        limits = {
-          "cpu"    = "${local.cpu}"
-          "memory" = "${local.memory}Gi"
-        }
       }
       volume_mount {
         mount_path = "/home/coder"
         name       = "home"
         read_only  = false
       }
-      port {
-        container_port = 21112
-        name           = "prometheus-http"
-        protocol       = "TCP"
+      dynamic "port" {
+        for_each = data.coder_parameter.load_scenario_run_concurrently.value ? jsondecode(data.coder_parameter.load_scenarios.value) : [""]
+        iterator = it
+        content {
+          container_port = 21112 + it.key
+          name           = "prom-http${it.key}"
+          protocol       = "TCP"
+        }
       }
     }
 
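Worth noting about the dynamic block above: when for_each is given a list, it.key is the element index and it.value the element, so scenario i gets container port 21112 + i named prom-http<i>; with concurrency disabled, the single-element [""] fallback still yields one prom-http0 port on 21112. The same expansion as a standalone sketch, with an assumed scenario list:

locals {
  run_concurrently = true
  load_scenarios   = ["SSH Traffic", "Web Terminal Traffic", "App Traffic", "Dashboard Traffic"] # assumed

  # One (name => port) pair per scenario when concurrent, a single pair otherwise.
  ports = {
    for i, s in (local.run_concurrently ? local.load_scenarios : [""]) :
    "prom-http${i}" => 21112 + i
  }
  # => { "prom-http0" = 21112, "prom-http1" = 21113, "prom-http2" = 21114, "prom-http3" = 21115 }
}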
@@ -787,8 +947,12 @@ resource "kubernetes_manifest" "pod_monitor" {
       }
     }
     podMetricsEndpoints = [
-      {
-        port     = "prometheus-http"
+      # NOTE(mafredri): We could add more information here by including the
+      # scenario name in the port name (although it's limited to 15 chars so
+      # it needs to be short). That said, someone looking at the stats can
+      # assume that there's a 1-to-1 mapping between scenario# and port.
+      for i, _ in data.coder_parameter.load_scenario_run_concurrently.value ? jsondecode(data.coder_parameter.load_scenarios.value) : [""] : {
+        port     = "prom-http${i}"
         interval = "15s"
       }
     ]
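The for expression keeps the PodMonitor in lockstep with the container ports: the same conditional yields one scrape endpoint per scenario, or a single endpoint when concurrency is off. With an assumed four-scenario list, it renders to roughly:

locals {
  pod_metrics_endpoints = [
    for i, _ in ["SSH Traffic", "Web Terminal Traffic", "App Traffic", "Dashboard Traffic"] : {
      port     = "prom-http${i}"
      interval = "15s"
    }
  ]
  # => [
  #   { port = "prom-http0", interval = "15s" },
  #   { port = "prom-http1", interval = "15s" },
  #   { port = "prom-http2", interval = "15s" },
  #   { port = "prom-http3", interval = "15s" },
  # ]
}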
