Skip to content

Commit 6873877

Browse files
authored
feat(scaletest): create automated pprof dumps during scaletest (#9887)
1 parent fad0208 commit 6873877

File tree

7 files changed

+57
-12
lines changed

7 files changed

+57
-12
lines changed

scaletest/templates/scaletest-runner/main.tf

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -37,7 +37,7 @@ resource "null_resource" "permission_check" {
3737
locals {
3838
workspace_pod_name = "coder-scaletest-runner-${lower(data.coder_workspace.me.owner)}-${lower(data.coder_workspace.me.name)}"
3939
workspace_pod_instance = "coder-workspace-${lower(data.coder_workspace.me.owner)}-${lower(data.coder_workspace.me.name)}"
40-
workspace_pod_termination_grace_period_seconds = 7200 # 2 hours (cleanup timeout).
40+
workspace_pod_termination_grace_period_seconds = 5 * 60 * 60 # 5 hours (cleanup timeout).
4141
service_account_name = "scaletest-sa"
4242
cpu = 16
4343
memory = 64

scaletest/templates/scaletest-runner/scripts/cleanup.sh

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -23,8 +23,8 @@ fi
2323

2424
start_phase "Cleanup (${event})"
2525
coder exp scaletest cleanup \
26-
--cleanup-job-timeout 15m \
27-
--cleanup-timeout 2h |
26+
--cleanup-job-timeout 2h \
27+
--cleanup-timeout 5h |
2828
tee "${SCALETEST_RESULTS_DIR}/cleanup-${event}.txt"
2929
end_phase
3030

scaletest/templates/scaletest-runner/scripts/lib.sh

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -19,6 +19,9 @@ SCALETEST_STATE_DIR="${SCALETEST_RUN_DIR}/state"
1919
SCALETEST_PHASE_FILE="${SCALETEST_STATE_DIR}/phase"
2020
# shellcheck disable=SC2034
2121
SCALETEST_RESULTS_DIR="${SCALETEST_RUN_DIR}/results"
22+
SCALETEST_PPROF_DIR="${SCALETEST_RUN_DIR}/pprof"
23+
24+
mkdir -p "${SCALETEST_STATE_DIR}" "${SCALETEST_RESULTS_DIR}" "${SCALETEST_PPROF_DIR}"
2225

2326
coder() {
2427
maybedryrun "${DRY_RUN}" command coder "${@}"
@@ -142,9 +145,6 @@ annotate_grafana() {
142145

143146
log "Grafana annotation added!"
144147

145-
if [[ ! -f "${SCALETEST_STATE_DIR}" ]]; then
146-
mkdir -p "${SCALETEST_STATE_DIR}"
147-
fi
148148
id="$(jq -r '.id' <<<"${resp}")"
149149
echo "${id}:${tags}:${text}:${start}" >>"${SCALETEST_STATE_DIR}/grafana-annotations"
150150
}

scaletest/templates/scaletest-runner/scripts/prepare.sh

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -36,10 +36,13 @@ echo -n "${CODER_URL}" >"${CODER_CONFIG_DIR}/url"
3636
set +x # Avoid logging the token.
3737
# Persist configuration for shutdown script too since the
3838
# owner token is invalidated immediately on workspace stop.
39-
export CODER_SESSION_TOKEN=$CODER_USER_TOKEN
39+
export CODER_SESSION_TOKEN=${CODER_USER_TOKEN}
4040
coder tokens delete scaletest_runner >/dev/null 2>&1 || true
4141
# TODO(mafredri): Set TTL? This could interfere with delayed stop though.
4242
token=$(coder tokens create --name scaletest_runner)
43+
if [[ $DRY_RUN == 1 ]]; then
44+
token=${CODER_SESSION_TOKEN}
45+
fi
4346
unset CODER_SESSION_TOKEN
4447
echo -n "${token}" >"${CODER_CONFIG_DIR}/session"
4548
[[ $VERBOSE == 1 ]] && set -x # Restore logging (if enabled).

scaletest/templates/scaletest-runner/scripts/report.sh

Lines changed: 6 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -27,7 +27,11 @@ server_version="$(jq -r '.version' <<<"${buildinfo}")"
2727
server_version_commit="$(jq -r '.external_url' <<<"${buildinfo}")"
2828

2929
# Since `coder show` doesn't support JSON output, we list the workspaces instead.
30-
workspace_json="$(DRYRUN=0 coder list --all --output json | jq --arg workspace "${CODER_WORKSPACE}" --arg user "${CODER_USER}" 'map(select(.name == $workspace) | select(.owner_name == $user)) | .[0]')"
30+
# Use `command` here to bypass dry run.
31+
workspace_json="$(
32+
command coder list --all --output json |
33+
jq --arg workspace "${CODER_WORKSPACE}" --arg user "${CODER_USER}" 'map(select(.name == $workspace) | select(.owner_name == $user)) | .[0]'
34+
)"
3135
owner_name="$(jq -r '.latest_build.workspace_owner_name' <<<"${workspace_json}")"
3236
workspace_name="$(jq -r '.latest_build.workspace_name' <<<"${workspace_json}")"
3337
initiator_name="$(jq -r '.latest_build.initiator_name' <<<"${workspace_json}")"
@@ -43,7 +47,7 @@ while read -r app_name; do
4347
app_url="${app_url//to=now/to=$(($(date +%s) * 1000))}"
4448
bold='*'
4549
fi
46-
app_urls+=("${bullet} ${bold}${app_name}: ${app_url}${bold}")
50+
app_urls+=("${bullet} ${bold}${app_name}${bold}: ${app_url}")
4751
done <<<"${app_urls_raw}"
4852

4953
params=()

scaletest/templates/scaletest-runner/scripts/run.sh

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -17,7 +17,7 @@ coder exp scaletest create-workspaces \
1717
--count "${SCALETEST_PARAM_NUM_WORKSPACES}" \
1818
--template "${SCALETEST_PARAM_TEMPLATE}" \
1919
--concurrency "${SCALETEST_PARAM_CREATE_CONCURRENCY}" \
20-
--job-timeout 2h \
20+
--job-timeout 5h \
2121
--no-cleanup \
2222
--output json:"${SCALETEST_RESULTS_DIR}/create-workspaces.json"
2323
show_json "${SCALETEST_RESULTS_DIR}/create-workspaces.json"

scaletest/templates/scaletest-runner/startup.sh

Lines changed: 40 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -23,22 +23,58 @@ fi
2323

2424
annotate_grafana "workspace" "Agent running" # Ended in shutdown.sh.
2525

26+
{
27+
pids=()
28+
ports=()
29+
declare -A pods=()
30+
next_port=6061
31+
for pod in $(kubectl get pods -l app.kubernetes.io/name=coder -o jsonpath='{.items[*].metadata.name}'); do
32+
maybedryrun "${DRY_RUN}" kubectl -n coder-big port-forward "${pod}" "${next_port}:6060" &
33+
pids+=($!)
34+
ports+=("${next_port}")
35+
pods[${next_port}]="${pod}"
36+
next_port=$((next_port + 1))
37+
done
38+
39+
trap 'trap - EXIT; kill -INT "${pids[@]}"; exit 1' INT EXIT
40+
41+
while :; do
42+
sleep 285 # ~300 when accounting for profile and trace.
43+
log "Grabbing pprof dumps"
44+
start="$(date +%s)"
45+
annotate_grafana "pprof" "Grab pprof dumps (start=${start})"
46+
for type in allocs block heap goroutine mutex 'profile?seconds=10' 'trace?seconds=5'; do
47+
for port in "${ports[@]}"; do
48+
tidy_type="${type//\?/_}"
49+
tidy_type="${tidy_type//=/_}"
50+
maybedryrun "${DRY_RUN}" curl -sSL --output "${SCALETEST_PPROF_DIR}/pprof-${tidy_type}-${pods[${port}]}-${start}.gz" "http://localhost:${port}/debug/pprof/${type}"
51+
done
52+
done
53+
annotate_grafana_end "pprof" "Grab pprof dumps (start=${start})"
54+
done
55+
} &
56+
pprof_pid=$!
57+
2658
# Show failure in the UI if script exits with error.
2759
failed_status=Failed
2860
on_exit() {
61+
code=${?}
2962
trap - ERR EXIT
63+
set +e
64+
65+
kill -INT "${pprof_pid}"
3066

3167
case "${SCALETEST_PARAM_CLEANUP_STRATEGY}" in
3268
on_stop)
3369
# Handled by shutdown script.
3470
;;
3571
on_success)
36-
if [[ $(get_status) != "${failed_status}" ]]; then
72+
if ((code == 0)); then
3773
"${SCRIPTS_DIR}/cleanup.sh" "${SCALETEST_PARAM_CLEANUP_STRATEGY}"
3874
fi
3975
;;
4076
on_error)
41-
if [[ $(get_status) = "${failed_status}" ]]; then
77+
if ((code > 0)); then
4278
"${SCRIPTS_DIR}/cleanup.sh" "${SCALETEST_PARAM_CLEANUP_STRATEGY}"
4379
fi
4480
;;
@@ -60,6 +96,8 @@ on_err() {
6096
GRAFANA_EXTRA_TAGS=error set_status "${failed_status} (exit=${code})"
6197
"${SCRIPTS_DIR}/report.sh" failed
6298
lock_status # Ensure we never rewrite the status after a failure.
99+
100+
exit "${code}"
63101
}
64102
trap on_err ERR
65103

0 commit comments

Comments
 (0)