feat(scaletest): create automated pprof dumps during scaletest #9887

Merged: 3 commits, Sep 27, 2023
2 changes: 1 addition & 1 deletion scaletest/templates/scaletest-runner/main.tf
@@ -37,7 +37,7 @@ resource "null_resource" "permission_check" {
locals {
workspace_pod_name = "coder-scaletest-runner-${lower(data.coder_workspace.me.owner)}-${lower(data.coder_workspace.me.name)}"
workspace_pod_instance = "coder-workspace-${lower(data.coder_workspace.me.owner)}-${lower(data.coder_workspace.me.name)}"
- workspace_pod_termination_grace_period_seconds = 7200 # 2 hours (cleanup timeout).
+ workspace_pod_termination_grace_period_seconds = 5 * 60 * 60 # 5 hours (cleanup timeout).
service_account_name = "scaletest-sa"
cpu = 16
memory = 64
4 changes: 2 additions & 2 deletions scaletest/templates/scaletest-runner/scripts/cleanup.sh
@@ -23,8 +23,8 @@ fi

start_phase "Cleanup (${event})"
coder exp scaletest cleanup \
- --cleanup-job-timeout 15m \
- --cleanup-timeout 2h |
+ --cleanup-job-timeout 2h \
+ --cleanup-timeout 5h |
Member

nit: I'm curious, should it be a little shorter than workspace_pod_termination_grace_period_seconds?

Member Author

We do it the other way around: we add 1-2m to workspace_pod_termination_grace_period_seconds in the template instead. 😅

But we should probably parameterize these in the future.
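
A minimal sketch of what that parameterization might look like in cleanup.sh; the SCALETEST_PARAM_CLEANUP_JOB_TIMEOUT and SCALETEST_PARAM_CLEANUP_TIMEOUT names are hypothetical and not part of this PR:

# Hypothetical parameters, defaulting to the values hard-coded in this PR.
: "${SCALETEST_PARAM_CLEANUP_JOB_TIMEOUT:=2h}"
: "${SCALETEST_PARAM_CLEANUP_TIMEOUT:=5h}"

coder exp scaletest cleanup \
	--cleanup-job-timeout "${SCALETEST_PARAM_CLEANUP_JOB_TIMEOUT}" \
	--cleanup-timeout "${SCALETEST_PARAM_CLEANUP_TIMEOUT}" |
	tee "${SCALETEST_RESULTS_DIR}/cleanup-${event}.txt"

The pod termination grace period in main.tf could then be derived from the same cleanup timeout plus a small margin, rather than keeping the two values in sync by hand.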

tee "${SCALETEST_RESULTS_DIR}/cleanup-${event}.txt"
end_phase

6 changes: 3 additions & 3 deletions scaletest/templates/scaletest-runner/scripts/lib.sh
@@ -19,6 +19,9 @@ SCALETEST_STATE_DIR="${SCALETEST_RUN_DIR}/state"
SCALETEST_PHASE_FILE="${SCALETEST_STATE_DIR}/phase"
# shellcheck disable=SC2034
SCALETEST_RESULTS_DIR="${SCALETEST_RUN_DIR}/results"
+ SCALETEST_PPROF_DIR="${SCALETEST_RUN_DIR}/pprof"
+
+ mkdir -p "${SCALETEST_STATE_DIR}" "${SCALETEST_RESULTS_DIR}" "${SCALETEST_PPROF_DIR}"
Member

Follow-up idea: would it make sense to push these to a cloud bucket or something for easier perusal?

Member Author

I've been thinking about it, and we could definitely do that. Or we could push them to a separate workspace that can serve them in some nifty way. For instance, if we did take Prometheus snapshots, this workspace could run a Prometheus instance with that data, etc.
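
A minimal sketch of the bucket idea, assuming a GCS bucket and that gsutil is available in the runner image; the bucket name and path layout are placeholders, not something this PR creates:

# Hypothetical follow-up: sync the collected dumps to a bucket at the end of the run.
if [[ ${DRY_RUN} != 1 ]] && command -v gsutil >/dev/null 2>&1; then
	gsutil -m cp -r "${SCALETEST_PPROF_DIR}" \
		"gs://scaletest-artifacts/${CODER_USER}/${CODER_WORKSPACE}/$(date +%Y%m%d-%H%M%S)/"
fi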


coder() {
maybedryrun "${DRY_RUN}" command coder "${@}"
@@ -142,9 +145,6 @@ annotate_grafana() {

log "Grafana annotation added!"

- if [[ ! -f "${SCALETEST_STATE_DIR}" ]]; then
- mkdir -p "${SCALETEST_STATE_DIR}"
- fi
id="$(jq -r '.id' <<<"${resp}")"
echo "${id}:${tags}:${text}:${start}" >>"${SCALETEST_STATE_DIR}/grafana-annotations"
}
5 changes: 4 additions & 1 deletion scaletest/templates/scaletest-runner/scripts/prepare.sh
@@ -36,10 +36,13 @@ echo -n "${CODER_URL}" >"${CODER_CONFIG_DIR}/url"
set +x # Avoid logging the token.
# Persist configuration for shutdown script too since the
# owner token is invalidated immediately on workspace stop.
- export CODER_SESSION_TOKEN=$CODER_USER_TOKEN
+ export CODER_SESSION_TOKEN=${CODER_USER_TOKEN}
coder tokens delete scaletest_runner >/dev/null 2>&1 || true
# TODO(mafredri): Set TTL? This could interfere with delayed stop though.
token=$(coder tokens create --name scaletest_runner)
+ if [[ $DRY_RUN == 1 ]]; then
+ token=${CODER_SESSION_TOKEN}
+ fi
unset CODER_SESSION_TOKEN
echo -n "${token}" >"${CODER_CONFIG_DIR}/session"
[[ $VERBOSE == 1 ]] && set -x # Restore logging (if enabled).
8 changes: 6 additions & 2 deletions scaletest/templates/scaletest-runner/scripts/report.sh
@@ -27,7 +27,11 @@ server_version="$(jq -r '.version' <<<"${buildinfo}")"
server_version_commit="$(jq -r '.external_url' <<<"${buildinfo}")"

# Since `coder show` doesn't support JSON output, we list the workspaces instead.
- workspace_json="$(DRYRUN=0 coder list --all --output json | jq --arg workspace "${CODER_WORKSPACE}" --arg user "${CODER_USER}" 'map(select(.name == $workspace) | select(.owner_name == $user)) | .[0]')"
+ # Use `command` here to bypass dry run.
+ workspace_json="$(
+ command coder list --all --output json |
+ jq --arg workspace "${CODER_WORKSPACE}" --arg user "${CODER_USER}" 'map(select(.name == $workspace) | select(.owner_name == $user)) | .[0]'
+ )"
owner_name="$(jq -r '.latest_build.workspace_owner_name' <<<"${workspace_json}")"
workspace_name="$(jq -r '.latest_build.workspace_name' <<<"${workspace_json}")"
initiator_name="$(jq -r '.latest_build.initiator_name' <<<"${workspace_json}")"
@@ -43,7 +47,7 @@ while read -r app_name; do
app_url="${app_url//to=now/to=$(($(date +%s) * 1000))}"
bold='*'
fi
- app_urls+=("${bullet} ${bold}${app_name}: ${app_url}${bold}")
+ app_urls+=("${bullet} ${bold}${app_name}${bold}: ${app_url}")
done <<<"${app_urls_raw}"

params=()
2 changes: 1 addition & 1 deletion scaletest/templates/scaletest-runner/scripts/run.sh
@@ -17,7 +17,7 @@ coder exp scaletest create-workspaces \
--count "${SCALETEST_PARAM_NUM_WORKSPACES}" \
--template "${SCALETEST_PARAM_TEMPLATE}" \
--concurrency "${SCALETEST_PARAM_CREATE_CONCURRENCY}" \
- --job-timeout 2h \
+ --job-timeout 5h \
--no-cleanup \
--output json:"${SCALETEST_RESULTS_DIR}/create-workspaces.json"
show_json "${SCALETEST_RESULTS_DIR}/create-workspaces.json"
42 changes: 40 additions & 2 deletions scaletest/templates/scaletest-runner/startup.sh
@@ -23,22 +23,58 @@ fi

annotate_grafana "workspace" "Agent running" # Ended in shutdown.sh.

{
pids=()
Member

potential follow-up: this would be insanely cool to have as a standalone script
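
A rough sketch of what a standalone, one-shot version of the loop below could look like, with the namespace, label selector, and output directory as arguments; the script name, argument handling, and defaults are made up for illustration and are not part of this PR:

#!/usr/bin/env bash
# Hypothetical pprof-dump.sh: grab one round of dumps from every matching pod.
set -euo pipefail

namespace=${1:-coder-big}
selector=${2:-app.kubernetes.io/name=coder}
out_dir=${3:-.}
start=$(date +%s)

for pod in $(kubectl -n "${namespace}" get pods -l "${selector}" -o jsonpath='{.items[*].metadata.name}'); do
	kubectl -n "${namespace}" port-forward "${pod}" 6061:6060 &
	pf_pid=$!
	sleep 3 # Give the port-forward a moment to establish.
	for type in allocs block heap goroutine mutex 'profile?seconds=10' 'trace?seconds=5'; do
		tidy_type="${type//\?/_}"
		tidy_type="${tidy_type//=/_}"
		curl -sSL --output "${out_dir}/pprof-${tidy_type}-${pod}-${start}.gz" "http://localhost:6061/debug/pprof/${type}"
	done
	kill "${pf_pid}" || true
	wait "${pf_pid}" 2>/dev/null || true
done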

ports=()
declare -A pods=()
next_port=6061
for pod in $(kubectl get pods -l app.kubernetes.io/name=coder -o jsonpath='{.items[*].metadata.name}'); do
maybedryrun "${DRY_RUN}" kubectl -n coder-big port-forward "${pod}" "${next_port}:6060" &
pids+=($!)
ports+=("${next_port}")
pods[${next_port}]="${pod}"
next_port=$((next_port + 1))
done

trap 'trap - EXIT; kill -INT "${pids[@]}"; exit 1' INT EXIT

while :; do
sleep 285 # ~300 when accounting for profile and trace.
log "Grabbing pprof dumps"
start="$(date +%s)"
annotate_grafana "pprof" "Grab pprof dumps (start=${start})"
for type in allocs block heap goroutine mutex 'profile?seconds=10' 'trace?seconds=5'; do
for port in "${ports[@]}"; do
tidy_type="${type//\?/_}"
tidy_type="${tidy_type//=/_}"
maybedryrun "${DRY_RUN}" curl -sSL --output "${SCALETEST_PPROF_DIR}/pprof-${tidy_type}-${pods[${port}]}-${start}.gz" "http://localhost:${port}/debug/pprof/${type}"
done
done
annotate_grafana_end "pprof" "Grab pprof dumps (start=${start})"
done
} &
pprof_pid=$!

# Show failure in the UI if script exits with error.
failed_status=Failed
on_exit() {
code=${?}
trap - ERR EXIT
set +e

kill -INT "${pprof_pid}"

case "${SCALETEST_PARAM_CLEANUP_STRATEGY}" in
on_stop)
# Handled by shutdown script.
;;
on_success)
- if [[ $(get_status) != "${failed_status}" ]]; then
+ if ((code == 0)); then
"${SCRIPTS_DIR}/cleanup.sh" "${SCALETEST_PARAM_CLEANUP_STRATEGY}"
fi
;;
on_error)
- if [[ $(get_status) = "${failed_status}" ]]; then
+ if ((code > 0)); then
"${SCRIPTS_DIR}/cleanup.sh" "${SCALETEST_PARAM_CLEANUP_STRATEGY}"
fi
;;
@@ -60,6 +96,8 @@ on_err() {
GRAFANA_EXTRA_TAGS=error set_status "${failed_status} (exit=${code})"
"${SCRIPTS_DIR}/report.sh" failed
lock_status # Ensure we never rewrite the status after a failure.

exit "${code}"
}
trap on_err ERR
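
For reference, the collected dumps can be inspected with the standard Go tooling once copied out of the workspace; the pod name and timestamp in the file names below are examples of what the loop above would produce, not real artifacts:

# Interactive web UI for a heap (or allocs/goroutine/block/mutex) dump:
go tool pprof -http=:8080 pprof-heap-coder-abc123-1695800000.gz
# Top consumers from the 10-second CPU profile:
go tool pprof -top pprof-profile_seconds_10-coder-abc123-1695800000.gz
# Browse the 5-second execution trace:
go tool trace pprof-trace_seconds_5-coder-abc123-1695800000.gz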
