Skip to content

feat(scaletest): add grafana annotations and slack reporting #9852

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
Sep 27, 2023
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next Next commit
feat(scaletest): add grafana annotations
  • Loading branch information
mafredri committed Sep 25, 2023
commit 86ff71d1fa137eef6fbed4c78c2bb4be478e1f16
24 changes: 23 additions & 1 deletion scaletest/templates/scaletest-runner/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,9 @@ locals {
home_disk_size = 10
scaletest_run_id = "scaletest-${time_static.start_time.rfc3339}"
scaletest_run_dir = "/home/coder/${local.scaletest_run_id}"
grafana_url = "https://stats.dev.c8s.io"
grafana_dashboard_uid = "qLVSTR-Vz"
grafana_dashboard_name = "coderv2-loadtest-dashboard"
}

data "coder_provisioner" "me" {
Expand Down Expand Up @@ -237,6 +240,9 @@ resource "coder_agent" "main" {
SCALETEST_CREATE_CONCURRENCY : "${data.coder_parameter.create_concurrency.value}",
SCALETEST_CLEANUP_STRATEGY : data.coder_parameter.cleanup_strategy.value,

GRAFANA_URL : local.grafana_url,
# GRAFANA_DASHBOARD_UID : local.grafana_dashboard_uid,

SCRIPTS_ZIP : filebase64(data.archive_file.scripts_zip.output_path),
SCRIPTS_DIR : "/tmp/scripts",
}
Expand Down Expand Up @@ -332,7 +338,7 @@ resource "coder_app" "grafana" {
agent_id = coder_agent.main.id
slug = "00-grafana"
display_name = "Grafana"
url = "https://stats.dev.c8s.io/d/qLVSTR-Vz/coderv2-loadtest-dashboard?orgId=1&from=${time_static.start_time.unix * 1000}&to=now"
url = "${local.grafana_url}/d/${local.grafana_dashboard_uid}/${local.grafana_dashboard_name}?orgId=1&from=${time_static.start_time.unix * 1000}&to=now"
icon = "https://grafana.com/static/assets/img/fav32.png"
external = true
}
Expand Down Expand Up @@ -440,6 +446,15 @@ resource "kubernetes_pod" "main" {
name = "CODER_AGENT_LOG_DIR"
value = "${local.scaletest_run_dir}/logs"
}
env {
name = "GRAFANA_API_TOKEN"
value_from {
secret_key_ref {
name = data.kubernetes_secret.grafana_editor_api_token.metadata[0].name
key = "token"
}
}
}
resources {
# Set requests and limits values such that we can do performant
# execution of `coder scaletest` commands.
Expand Down Expand Up @@ -505,6 +520,13 @@ resource "kubernetes_pod" "main" {
}
}

# Looks up the Kubernetes secret named "grafana-editor-api-token" in the
# namespace chosen through the "namespace" template parameter. The runner pod
# references this secret (key "token") to populate GRAFANA_API_TOKEN.
# NOTE(review): as a data source this presumably requires the secret to exist
# before the template is applied — confirm.
data "kubernetes_secret" "grafana_editor_api_token" {
metadata {
name = "grafana-editor-api-token"
namespace = data.coder_parameter.namespace.value
}
}

resource "kubernetes_manifest" "pod_monitor" {
count = data.coder_workspace.me.start_count
manifest = {
Expand Down
130 changes: 124 additions & 6 deletions scaletest/templates/scaletest-runner/scripts/lib.sh
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,13 @@ set_status() {
if [[ ${DRY_RUN} == 1 ]]; then
dry_run=" (dry-ryn)"
fi
prev_status=$(get_status)
if [[ ${prev_status} != *"Not started"* ]]; then
annotate_grafana_end "status" "Status: ${prev_status}"
fi
echo "$(date -Ins) ${*}${dry_run}" >>"${SCALETEST_STATE_DIR}/status"

annotate_grafana "status" "Status: ${*}"
}
lock_status() {
chmod 0440 "${SCALETEST_STATE_DIR}/status"
Expand All @@ -51,25 +57,29 @@ phase_num=0
# Begin a new numbered scaletest phase.
# Logs the phase, appends a "<timestamp> START:<n>: <description>" record to
# SCALETEST_PHASE_FILE and opens a Grafana annotation tagged "phase".
# Globals:   SCALETEST_PHASE_FILE (read, appended), phase_num (written),
#            PHASE_TYPE (read, optional)
# Arguments: the phase description (all arguments, joined via ${*})
start_phase() {
# This may be incremented from another script, so we read it every time.
if [[ -f "${SCALETEST_PHASE_FILE}" ]]; then
# NOTE(review): the next two lines perform the same assignment (quoted and
# unquoted form); this looks like both sides of a diff hunk — confirm which
# single line belongs in the file.
phase_num="$(grep -c START: "${SCALETEST_PHASE_FILE}")"
phase_num=$(grep -c START: "${SCALETEST_PHASE_FILE}")
fi
# The new phase number is the count of START records so far plus one.
phase_num=$((phase_num + 1))
log "Start phase ${phase_num}: ${*}"
echo "$(date -Ins) START:${phase_num}: ${*}" >>"${SCALETEST_PHASE_FILE}"

# PHASE_TYPE, when set by the caller, is passed on as an extra annotation tag;
# otherwise "phase-default" is used.
GRAFANA_EXTRA_TAGS="${PHASE_TYPE:-phase-default}" annotate_grafana "phase" "Phase ${phase_num}: ${*}"
}
# End the phase that was most recently started.
# Expects the last line of SCALETEST_PHASE_FILE to be the START record for the
# current phase_num; appends the matching END record and ends the Grafana
# "phase" annotation. Exits the script with status 1 if the START record is
# not the last line of the file.
# Globals:   SCALETEST_PHASE_FILE (read, appended), phase_num (read),
#            PHASE_TYPE (read, optional)
end_phase() {
# Fields 3 and onward of the record are the phase description (field 1 is the
# timestamp, field 2 the "START:<n>:" marker).
# NOTE(review): the next two lines perform the same assignment (quoted and
# unquoted form); this looks like both sides of a diff hunk — confirm which
# single line belongs in the file.
phase="$(tail -n 1 "${SCALETEST_PHASE_FILE}" | grep "START:${phase_num}:" | cut -d' ' -f3-)"
phase=$(tail -n 1 "${SCALETEST_PHASE_FILE}" | grep "START:${phase_num}:" | cut -d' ' -f3-)
if [[ -z ${phase} ]]; then
log "BUG: Could not find start phase ${phase_num} in ${SCALETEST_PHASE_FILE}"
exit 1
fi
log "End phase ${phase_num}: ${phase}"
echo "$(date -Ins) END:${phase_num}: ${phase}" >>"${SCALETEST_PHASE_FILE}"

# Tags and text must match what start_phase passed to annotate_grafana so the
# stored annotation can be looked up again.
GRAFANA_EXTRA_TAGS="${PHASE_TYPE:-phase-default}" annotate_grafana_end "phase" "Phase ${phase_num}: ${phase}"
}
get_phase() {
if [[ -f "${SCALETEST_PHASE_FILE}" ]]; then
phase_raw="$(tail -n1 "${SCALETEST_PHASE_FILE}")"
phase="$(echo "${phase_raw}" | cut -d' ' -f3-)"
phase_raw=$(tail -n1 "${SCALETEST_PHASE_FILE}")
phase=$(echo "${phase_raw}" | cut -d' ' -f3-)
if [[ ${phase_raw} == *"END:"* ]]; then
phase+=" [done]"
fi
Expand All @@ -86,9 +96,117 @@ get_previous_phase() {
fi
}

#######################################
# Create a Grafana annotation and record it locally so that it can be ended
# later by looking it up via its tags and text.
#
# Best-effort: every failure is logged and the function still returns 0, so a
# Grafana problem never aborts the scaletest.
#
# Globals:
#   DRY_RUN              (read) when 1, only log what would have been sent
#   GRAFANA_URL          (read) base URL of the Grafana instance
#   GRAFANA_API_TOKEN    (read) bearer token for the annotations API
#   GRAFANA_EXTRA_TAGS   (read, optional) comma-separated tags to append
#   SCALETEST_STATE_DIR  (read) directory holding the grafana-annotations file
# Arguments:
#   $1 - comma-separated tags, appended to "scaletest,runner" (may be empty)
#   $2 - annotation text
#   $3 - start time in milliseconds since the epoch (default: now)
# Outputs:
#   Appends "<id>:<tags>:<text>:<start>" to
#   ${SCALETEST_STATE_DIR}/grafana-annotations on success.
# Returns:
#   Always 0.
#######################################
annotate_grafana() {
  local tags=${1:-} text=${2:-} start=${3:-$(($(date +%s) * 1000))}
  local json resp id

  if [[ -z ${tags} ]]; then
    tags="scaletest,runner"
  else
    tags="scaletest,runner,${tags}"
  fi
  if [[ -n ${GRAFANA_EXTRA_TAGS:-} ]]; then
    tags="${tags},${GRAFANA_EXTRA_TAGS}"
  fi

  log "Annotating Grafana (start=${start}): ${text} [${tags}]"

  # Build the request body with jq so text and tags are JSON-escaped. The
  # build is guarded: a jq failure (e.g. a non-numeric start) must not trip
  # `set -e` in the calling script.
  if ! json=$(
    jq -n \
      --argjson time "${start}" \
      --arg text "${text}" \
      --arg tags "${tags}" \
      '{time: $time, tags: $tags | split(","), text: $text}'
  ); then
    log "Failed to annotate Grafana: could not build request body"
    return 0
  fi

  if [[ ${DRY_RUN:-0} == 1 ]]; then
    log "Would have annotated Grafana, data=${json}"
    return 0
  fi

  # NOTE(review): --insecure disables TLS certificate verification while a
  # bearer token is being sent; confirm this is still required.
  if ! resp=$(
    curl -sSL \
      --insecure \
      -H "Authorization: Bearer ${GRAFANA_API_TOKEN}" \
      -H "Content-Type: application/json" \
      -d "${json}" \
      "${GRAFANA_URL}/api/annotations"
  ); then
    # Don't abort scaletest just because we couldn't annotate Grafana.
    log "Failed to annotate Grafana: ${resp}"
    return 0
  fi

  if [[ $(jq -r '.message' <<<"${resp}") != "Annotation added" ]]; then
    log "Failed to annotate Grafana: ${resp}"
    return 0
  fi

  log "Grafana annotation added!"

  # The state location is a directory, so test with -d (not -f).
  if [[ ! -d "${SCALETEST_STATE_DIR}" ]]; then
    mkdir -p "${SCALETEST_STATE_DIR}"
  fi
  id=$(jq -r '.id' <<<"${resp}")
  printf '%s\n' "${id}:${tags}:${text}:${start}" >>"${SCALETEST_STATE_DIR}/grafana-annotations"
}
#######################################
# End (set timeEnd on) a Grafana annotation that was previously recorded in
# ${SCALETEST_STATE_DIR}/grafana-annotations.
#
# The annotation is found by a literal match on ":<tags>:<text>:<start>";
# when several records match, the one with the highest id is used. An empty
# start matches any start time.
#
# Best-effort: every failure is logged and the function still returns 0, so a
# Grafana problem never aborts the scaletest.
#
# Globals:
#   DRY_RUN              (read) when 1, only log what would have been sent
#   GRAFANA_URL          (read) base URL of the Grafana instance
#   GRAFANA_API_TOKEN    (read) bearer token for the annotations API
#   GRAFANA_EXTRA_TAGS   (read, optional) comma-separated tags to append
#   SCALETEST_STATE_DIR  (read) directory holding the grafana-annotations file
# Arguments:
#   $1 - comma-separated tags, appended to "scaletest,runner" (may be empty)
#   $2 - annotation text, exactly as given when the annotation was created
#   $3 - start time used at creation (optional)
#   $4 - end time in milliseconds since the epoch (default: now)
# Returns:
#   Always 0.
#######################################
annotate_grafana_end() {
  local tags=${1:-} text=${2:-} start=${3:-} end=${4:-$(($(date +%s) * 1000))}
  local id="" json resp annotations_file

  if [[ -z ${tags} ]]; then
    tags="scaletest,runner"
  else
    tags="scaletest,runner,${tags}"
  fi
  if [[ -n ${GRAFANA_EXTRA_TAGS:-} ]]; then
    tags="${tags},${GRAFANA_EXTRA_TAGS}"
  fi

  # Dry runs never record annotations, so there is nothing to look up.
  if [[ ${DRY_RUN:-0} == 1 ]]; then
    log "Would have updated Grafana annotation (end=${end}): ${text} [${tags}]"
    return 0
  fi

  annotations_file="${SCALETEST_STATE_DIR}/grafana-annotations"
  if [[ -f "${annotations_file}" ]]; then
    # grep -F: tags/text are data, not a regex (text may contain "[", ".", ...).
    # "|| true": no match is reported via the empty id below, not via the
    # pipeline status (which differs with and without pipefail).
    id=$(
      grep -F -- ":${tags}:${text}:${start}" "${annotations_file}" \
        | sort -n \
        | tail -n1 \
        | cut -d: -f1
    ) || true
  fi
  if [[ -z ${id} ]]; then
    log "NOTICE: Could not find Grafana annotation to end: '${tags}:${text}:${start}', skipping..."
    return 0
  fi

  log "Annotating Grafana (end=${end}): ${text} [${tags}]"

  # Guarded so a jq failure (e.g. a non-numeric end) cannot trip `set -e`.
  if ! json=$(
    jq -n \
      --argjson timeEnd "${end}" \
      '{timeEnd: $timeEnd}'
  ); then
    log "Failed to annotate Grafana end: could not build request body"
    return 0
  fi

  # NOTE(review): --insecure disables TLS certificate verification while a
  # bearer token is being sent; confirm this is still required.
  if ! resp=$(
    curl -sSL \
      --insecure \
      -H "Authorization: Bearer ${GRAFANA_API_TOKEN}" \
      -H "Content-Type: application/json" \
      -X PATCH \
      -d "${json}" \
      "${GRAFANA_URL}/api/annotations/${id}"
  ); then
    # Don't abort scaletest just because we couldn't annotate Grafana.
    log "Failed to annotate Grafana end: ${resp}"
    return 0
  fi

  if [[ $(jq -r '.message' <<<"${resp}") != "Annotation patched" ]]; then
    log "Failed to annotate Grafana end: ${resp}"
    return 0
  fi

  log "Grafana annotation patched!"
}

# Wait for a number of minutes inside its own phase so a baseline can be
# established before further load is applied.
# Arguments: $1 - minutes to wait (default: 2)
# NOTE(review): start_phase and end_phase each appear twice below (with and
# without PHASE_TYPE="phase-wait"); this looks like both sides of a diff hunk —
# confirm which single call of each belongs in the file.
wait_baseline() {
s=${1:-2}
start_phase "Waiting ${s}m to establish baseline"
PHASE_TYPE="phase-wait" start_phase "Waiting ${s}m to establish baseline"
# maybedryrun is defined elsewhere; presumably it skips the sleep when
# DRY_RUN is 1 — verify against its definition.
maybedryrun "$DRY_RUN" sleep $((s * 60))
end_phase
PHASE_TYPE="phase-wait" end_phase
}
7 changes: 0 additions & 7 deletions scaletest/templates/scaletest-runner/scripts/prepare.sh
Original file line number Diff line number Diff line change
Expand Up @@ -28,13 +28,6 @@ for dir in "${HOME}/scaletest-"*; do
fi
done

log "Cloning coder/coder repo..."

if [[ ! -d "${HOME}/coder" ]]; then
git clone https://github.com/coder/coder.git "${HOME}/coder"
fi
(cd "${HOME}/coder" && git pull)

log "Creating coder CLI token (needed for cleanup during shutdown)..."

mkdir -p "${CODER_CONFIG_DIR}"
Expand Down
4 changes: 4 additions & 0 deletions scaletest/templates/scaletest-runner/shutdown.sh
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,8 @@ cleanup() {
}
trap cleanup EXIT

annotate_grafana "workspace" "Agent stopping..."

"${SCRIPTS_DIR}/cleanup.sh" shutdown

annotate_grafana_end "workspace" "Agent running"
18 changes: 17 additions & 1 deletion scaletest/templates/scaletest-runner/startup.sh
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,17 @@ mkdir -p "${SCRIPTS_DIR}"
unzip -o /tmp/scripts.zip -d "${SCRIPTS_DIR}"
rm /tmp/scripts.zip

echo "Cloning coder/coder repo..."
if [[ ! -d "${HOME}/coder" ]]; then
git clone https://github.com/coder/coder.git "${HOME}/coder"
fi
(cd "${HOME}/coder" && git pull)

# shellcheck disable=SC2153 source=scaletest/templates/scaletest-runner/scripts/lib.sh
. "${SCRIPTS_DIR}/lib.sh"

annotate_grafana "workspace" "Agent running" # Ended in shutdown.sh.

# Show failure in the UI if script exits with error.
failed_status=Failed
on_exit() {
Expand All @@ -38,15 +46,23 @@ on_exit() {
"${SCRIPTS_DIR}/cleanup.sh" "${SCALETEST_CLEANUP_STRATEGY}"
;;
esac

annotate_grafana_end "" "Start scaletest"
}
trap on_exit EXIT

on_err() {
code=${?}
trap - ERR

log "Scaletest failed!"
set_status "${failed_status}"
GRAFANA_EXTRA_TAGS=error set_status "${failed_status} (exit=${code})"
lock_status # Ensure we never rewrite the status after a failure.
}
trap on_err ERR

annotate_grafana "" "Start scaletest"

"${SCRIPTS_DIR}/prepare.sh"

"${SCRIPTS_DIR}/run.sh"