From b51555b7dfc0d40b7b1afb07f953d939ae9847a9 Mon Sep 17 00:00:00 2001
From: Mathias Fredriksson <mafredri@gmail.com>
Date: Mon, 16 Oct 2023 13:13:34 +0000
Subject: [PATCH] feat(scaletest/templates): gather pod logs at the end of a
 scale test

---
 scaletest/templates/scaletest-runner/main.tf  |  4 ++-
 .../templates/scaletest-runner/scripts/lib.sh |  5 ++--
 .../templates/scaletest-runner/startup.sh     | 29 +++++++++++++++++++
 3 files changed, 35 insertions(+), 3 deletions(-)

diff --git a/scaletest/templates/scaletest-runner/main.tf b/scaletest/templates/scaletest-runner/main.tf
index ca9d8b7b6bfb0..f07f723d649d4 100644
--- a/scaletest/templates/scaletest-runner/main.tf
+++ b/scaletest/templates/scaletest-runner/main.tf
@@ -42,8 +42,9 @@ locals {
   cpu                                            = 16
   memory                                         = 64
   home_disk_size                                 = 10
-  scaletest_run_id                               = "scaletest-${time_static.start_time.rfc3339}"
+  scaletest_run_id                               = "scaletest-${replace(time_static.start_time.rfc3339, ":", "-")}"
   scaletest_run_dir                              = "/home/coder/${local.scaletest_run_id}"
+  scaletest_run_start_time                       = time_static.start_time.rfc3339
   grafana_url                                    = "https://stats.dev.c8s.io"
   grafana_dashboard_uid                          = "qLVSTR-Vz"
   grafana_dashboard_name                         = "coderv2-loadtest-dashboard"
@@ -364,6 +365,7 @@ resource "coder_agent" "main" {
     # Local envs passed as arguments to `coder exp scaletest` invocations.
     SCALETEST_RUN_ID : local.scaletest_run_id,
     SCALETEST_RUN_DIR : local.scaletest_run_dir,
+    SCALETEST_RUN_START_TIME : local.scaletest_run_start_time,
 
     SCALETEST_PARAM_TEMPLATE : data.coder_parameter.workspace_template.value,
     SCALETEST_PARAM_REPO_BRANCH : data.coder_parameter.repo_branch.value,
diff --git a/scaletest/templates/scaletest-runner/scripts/lib.sh b/scaletest/templates/scaletest-runner/scripts/lib.sh
index f70b92fcdd6b1..07398bc58e577 100644
--- a/scaletest/templates/scaletest-runner/scripts/lib.sh
+++ b/scaletest/templates/scaletest-runner/scripts/lib.sh
@@ -19,11 +19,12 @@ SCALETEST_STATE_DIR="${SCALETEST_RUN_DIR}/state"
 SCALETEST_PHASE_FILE="${SCALETEST_STATE_DIR}/phase"
 # shellcheck disable=SC2034
 SCALETEST_RESULTS_DIR="${SCALETEST_RUN_DIR}/results"
+SCALETEST_LOGS_DIR="${SCALETEST_RUN_DIR}/logs"
 SCALETEST_PPROF_DIR="${SCALETEST_RUN_DIR}/pprof"
 # https://github.com/kubernetes/kubernetes/issues/72501 :-(
-SCALETEST_CODER_BINARY="/tmp/coder-full-${SCALETEST_RUN_ID//:/-}"
+SCALETEST_CODER_BINARY="/tmp/coder-full-${SCALETEST_RUN_ID}"
 
-mkdir -p "${SCALETEST_STATE_DIR}" "${SCALETEST_RESULTS_DIR}" "${SCALETEST_PPROF_DIR}"
+mkdir -p "${SCALETEST_STATE_DIR}" "${SCALETEST_RESULTS_DIR}" "${SCALETEST_LOGS_DIR}" "${SCALETEST_PPROF_DIR}"
 
 coder() {
 	if [[ ! -x "${SCALETEST_CODER_BINARY}" ]]; then
diff --git a/scaletest/templates/scaletest-runner/startup.sh b/scaletest/templates/scaletest-runner/startup.sh
index 7d0ef2d593f18..57b8b1091efa8 100755
--- a/scaletest/templates/scaletest-runner/startup.sh
+++ b/scaletest/templates/scaletest-runner/startup.sh
@@ -60,6 +60,28 @@ annotate_grafana "workspace" "Agent running" # Ended in shutdown.sh.
 } &
 pprof_pid=$!
 
+logs_gathered=0
+# Save logs (since run start) of coderd, provisioner and scaletest workspace pods to SCALETEST_LOGS_DIR; only the first call does work.
+gather_logs() {
+	if ((logs_gathered == 1)); then
+		return
+	fi
+	logs_gathered=1
+	local podsraw pods pod
+	annotate_grafana "logs" "Gather logs"
+	podsraw="$(
+		kubectl -n coder-big get pods -l app.kubernetes.io/name=coder -o name
+		kubectl -n coder-big get pods -l app.kubernetes.io/name=coder-provisioner -o name
+		kubectl -n coder-big get pods -l app.kubernetes.io/name=coder-workspace -o name | grep "^pod/scaletest-"
+	)"
+	mapfile -t pods <<<"${podsraw}"
+	for pod in "${pods[@]}"; do
+		# SCALETEST_RUN_START_TIME is RFC3339, which needs --since-time; --since only accepts a relative duration (e.g. 5m).
+		kubectl -n coder-big logs "${pod}" --since-time="${SCALETEST_RUN_START_TIME}" >"${SCALETEST_LOGS_DIR}/${pod#pod/}.txt"
+	done
+	annotate_grafana_end "logs" "Gather logs"
+}
+
 set_appearance "${appearance_json}" "${service_banner_color}" "${service_banner_message} | Scaletest running: [${CODER_USER}/${CODER_WORKSPACE}](${CODER_URL}/@${CODER_USER}/${CODER_WORKSPACE})!"
 
 # Show failure in the UI if script exits with error.
@@ -77,6 +99,10 @@ on_exit() {
 		message_status=FAILED
 	fi
 
+	# In case the test failed before gathering logs, gather them before
+	# cleaning up, whilst the workspaces are still present.
+	gather_logs
+
 	case "${SCALETEST_PARAM_CLEANUP_STRATEGY}" in
 	on_stop)
 		# Handled by shutdown script.
@@ -127,4 +153,7 @@ annotate_grafana "" "Start scaletest"
 
 "${SCRIPTS_DIR}/run.sh"
 
+# Gather logs before ending the test.
+gather_logs
+
 "${SCRIPTS_DIR}/report.sh" completed