-
Notifications
You must be signed in to change notification settings - Fork 888
feat(scaletest): create automated pprof dumps during scaletest #9887
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -19,6 +19,9 @@ SCALETEST_STATE_DIR="${SCALETEST_RUN_DIR}/state" | |
SCALETEST_PHASE_FILE="${SCALETEST_STATE_DIR}/phase" | ||
# shellcheck disable=SC2034 | ||
SCALETEST_RESULTS_DIR="${SCALETEST_RUN_DIR}/results" | ||
SCALETEST_PPROF_DIR="${SCALETEST_RUN_DIR}/pprof" | ||
|
||
mkdir -p "${SCALETEST_STATE_DIR}" "${SCALETEST_RESULTS_DIR}" "${SCALETEST_PPROF_DIR}" | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Follow-up idea: would it make sense to push these to a cloud bucket or something for easier perusal? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I've been thinking about it, and we could definitely do that. Or we could push them to a separate workspace that can serve them in some nifty way. For instance, if we did take prometheus snapshots, this workspace could run a prom instance with that data, etc. |
||
|
||
coder() { | ||
maybedryrun "${DRY_RUN}" command coder "${@}" | ||
|
@@ -142,9 +145,6 @@ annotate_grafana() { | |
|
||
log "Grafana annotation added!" | ||
|
||
if [[ ! -f "${SCALETEST_STATE_DIR}" ]]; then | ||
mkdir -p "${SCALETEST_STATE_DIR}" | ||
fi | ||
id="$(jq -r '.id' <<<"${resp}")" | ||
echo "${id}:${tags}:${text}:${start}" >>"${SCALETEST_STATE_DIR}/grafana-annotations" | ||
} | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -23,22 +23,58 @@ fi | |
|
||
annotate_grafana "workspace" "Agent running" # Ended in shutdown.sh. | ||
|
||
# Background collector: port-forward every coder pod's pprof endpoint and
# periodically snapshot heap/goroutine/CPU/trace profiles into
# ${SCALETEST_PPROF_DIR}. Runs until killed via ${pprof_pid} (see on_exit).
{
	pids=()
	ports=()
	declare -A pods=()
	next_port=6061
	# NOTE(review): pin the namespace here too — the port-forward below uses
	# "-n coder-big", so discovering pods from the kubeconfig's current
	# namespace would silently find nothing (or the wrong pods).
	# Word-splitting of the jsonpath output is safe: pod names cannot
	# contain whitespace.
	for pod in $(kubectl -n coder-big get pods -l app.kubernetes.io/name=coder -o jsonpath='{.items[*].metadata.name}'); do
		# One local port per pod, starting at 6061, all forwarding to the
		# in-pod pprof listener on 6060.
		maybedryrun "${DRY_RUN}" kubectl -n coder-big port-forward "${pod}" "${next_port}:6060" &
		pids+=("$!")
		ports+=("${next_port}")
		pods[${next_port}]="${pod}"
		next_port=$((next_port + 1))
	done

	# Tear down the port-forwards whether we are interrupted or exit;
	# clear the EXIT trap first so it does not fire recursively.
	trap 'trap - EXIT; kill -INT "${pids[@]}"; exit 1' INT EXIT

	while :; do
		sleep 285 # ~300 when accounting for profile and trace.
		log "Grabbing pprof dumps"
		start="$(date +%s)"
		annotate_grafana "pprof" "Grab pprof dumps (start=${start})"
		for type in allocs block heap goroutine mutex 'profile?seconds=10' 'trace?seconds=5'; do
			for port in "${ports[@]}"; do
				# Sanitize '?' and '=' so the profile type is usable in a filename.
				tidy_type="${type//\?/_}"
				tidy_type="${tidy_type//=/_}"
				maybedryrun "${DRY_RUN}" curl -sSL --output "${SCALETEST_PPROF_DIR}/pprof-${tidy_type}-${pods[${port}]}-${start}.gz" "http://localhost:${port}/debug/pprof/${type}"
			done
		done
		annotate_grafana_end "pprof" "Grab pprof dumps (start=${start})"
	done
} &
pprof_pid=$!
|
||
# Show failure in the UI if script exits with error. | ||
failed_status=Failed | ||
on_exit() { | ||
code=${?} | ||
trap - ERR EXIT | ||
set +e | ||
|
||
kill -INT "${pprof_pid}" | ||
|
||
case "${SCALETEST_PARAM_CLEANUP_STRATEGY}" in | ||
on_stop) | ||
# Handled by shutdown script. | ||
;; | ||
on_success) | ||
if [[ $(get_status) != "${failed_status}" ]]; then | ||
if ((code == 0)); then | ||
"${SCRIPTS_DIR}/cleanup.sh" "${SCALETEST_PARAM_CLEANUP_STRATEGY}" | ||
fi | ||
;; | ||
on_error) | ||
if [[ $(get_status) = "${failed_status}" ]]; then | ||
if ((code > 0)); then | ||
"${SCRIPTS_DIR}/cleanup.sh" "${SCALETEST_PARAM_CLEANUP_STRATEGY}" | ||
fi | ||
;; | ||
|
@@ -60,6 +96,8 @@ on_err() { | |
GRAFANA_EXTRA_TAGS=error set_status "${failed_status} (exit=${code})" | ||
"${SCRIPTS_DIR}/report.sh" failed | ||
lock_status # Ensure we never rewrite the status after a failure. | ||
|
||
exit "${code}" | ||
} | ||
trap on_err ERR | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: I'm curious, should it be a little shorter than
workspace_pod_termination_grace_period_seconds
? There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We do it the other way, we add 1-2m to
workspace_pod_termination_grace_period_seconds
in the template instead. 😅 But we should probably parameterize these in the future.