From ca8aac50965a4d33ffb3cd09f07717d6987775b4 Mon Sep 17 00:00:00 2001 From: johnstcn Date: Mon, 11 Jul 2022 12:55:15 +0000 Subject: [PATCH 1/8] add sleeps to agent startup script --- provisionersdk/scripts/bootstrap_linux.sh | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/provisionersdk/scripts/bootstrap_linux.sh b/provisionersdk/scripts/bootstrap_linux.sh index 3bf6065164a68..42dbe6e7d85ca 100644 --- a/provisionersdk/scripts/bootstrap_linux.sh +++ b/provisionersdk/scripts/bootstrap_linux.sh @@ -6,16 +6,26 @@ BINARY_NAME=coder BINARY_URL=${ACCESS_URL}bin/coder-linux-${ARCH} cd "$BINARY_DIR" if command -v curl >/dev/null 2>&1; then - curl -fsSL --compressed "${BINARY_URL}" -o "${BINARY_NAME}" + curl -fsSL --compressed "${BINARY_URL}" -o "${BINARY_NAME}" || ( + echo "error: failed to download coder agent" && sleep 600 + ) elif command -v wget >/dev/null 2>&1; then - wget -q "${BINARY_URL}" -O "${BINARY_NAME}" + wget -q "${BINARY_URL}" -O "${BINARY_NAME}" || ( + echo "error: failed to download coder agent" && sleep 600 + ) elif command -v busybox >/dev/null 2>&1; then - busybox wget -q "${BINARY_URL}" -O "${BINARY_NAME}" + busybox wget -q "${BINARY_URL}" -O "${BINARY_NAME}" || ( + echo "error: failed to download coder agent" && sleep 600 + ) else echo "error: no download tool found, please install curl, wget or busybox wget" exit 1 fi -chmod +x $BINARY_NAME +chmod +x $BINARY_NAME || ( + echo "Failed to make $BINARY_NAME executable" && sleep 600 +) export CODER_AGENT_AUTH="${AUTH_TYPE}" export CODER_AGENT_URL="${ACCESS_URL}" -exec ./$BINARY_NAME agent +exec ./$BINARY_NAME agent || ( + echo "Failed to exec ${BINARY_NAME}" && sleep 600 +) From 51235b955a18c44f600cec277913aab3868a93c7 Mon Sep 17 00:00:00 2001 From: johnstcn Date: Mon, 11 Jul 2022 12:56:08 +0000 Subject: [PATCH 2/8] exit 1 --- provisionersdk/scripts/bootstrap_linux.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/provisionersdk/scripts/bootstrap_linux.sh b/provisionersdk/scripts/bootstrap_linux.sh index 42dbe6e7d85ca..90569d3d8b6ef 100644 --- a/provisionersdk/scripts/bootstrap_linux.sh +++ b/provisionersdk/scripts/bootstrap_linux.sh @@ -7,22 +7,22 @@ BINARY_URL=${ACCESS_URL}bin/coder-linux-${ARCH} cd "$BINARY_DIR" if command -v curl >/dev/null 2>&1; then curl -fsSL --compressed "${BINARY_URL}" -o "${BINARY_NAME}" || ( - echo "error: failed to download coder agent" && sleep 600 + echo "error: failed to download coder agent" && sleep 600 && exit 1 ) elif command -v wget >/dev/null 2>&1; then wget -q "${BINARY_URL}" -O "${BINARY_NAME}" || ( - echo "error: failed to download coder agent" && sleep 600 + echo "error: failed to download coder agent" && sleep 600 && exit 1 ) elif command -v busybox >/dev/null 2>&1; then busybox wget -q "${BINARY_URL}" -O "${BINARY_NAME}" || ( - echo "error: failed to download coder agent" && sleep 600 + echo "error: failed to download coder agent" && sleep 600 && exit 1 ) else echo "error: no download tool found, please install curl, wget or busybox wget" exit 1 fi chmod +x $BINARY_NAME || ( - echo "Failed to make $BINARY_NAME executable" && sleep 600 + echo "Failed to make $BINARY_NAME executable" && sleep 600 && exit 1 ) export CODER_AGENT_AUTH="${AUTH_TYPE}" export CODER_AGENT_URL="${ACCESS_URL}" From 1b9368fb065d6aded68e4a06849bc0588bf84f39 Mon Sep 17 00:00:00 2001 From: johnstcn Date: Mon, 11 Jul 2022 15:02:15 +0000 Subject: [PATCH 3/8] add more detailed exit statuses --- provisionersdk/scripts/bootstrap_linux.sh | 26 ++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/provisionersdk/scripts/bootstrap_linux.sh b/provisionersdk/scripts/bootstrap_linux.sh index 90569d3d8b6ef..2aaf997e698af 100644 --- a/provisionersdk/scripts/bootstrap_linux.sh +++ b/provisionersdk/scripts/bootstrap_linux.sh @@ -5,27 +5,43 @@ BINARY_DIR=$(mktemp -d -t coder.XXXXXX) BINARY_NAME=coder BINARY_URL=${ACCESS_URL}bin/coder-linux-${ARCH} cd "$BINARY_DIR" +# In the below invocations, we sleep for 30 seconds before exiting. +# This is because some providers (e.g. kreuzwerker/docker) will +# automatically remove a Docker container that exits within 15 +# seconds, making troubleshooting a failed workspace build +# extremely difficult. if command -v curl >/dev/null 2>&1; then curl -fsSL --compressed "${BINARY_URL}" -o "${BINARY_NAME}" || ( - echo "error: failed to download coder agent" && sleep 600 && exit 1 + status=$? + echo "error: failed to download coder agent using curl" + sleep 30 + exit $status ) elif command -v wget >/dev/null 2>&1; then wget -q "${BINARY_URL}" -O "${BINARY_NAME}" || ( - echo "error: failed to download coder agent" && sleep 600 && exit 1 + status=$? + echo "error: failed to download coder agent using wget" + sleep 30 + exit $status ) elif command -v busybox >/dev/null 2>&1; then busybox wget -q "${BINARY_URL}" -O "${BINARY_NAME}" || ( - echo "error: failed to download coder agent" && sleep 600 && exit 1 + status=$? + echo "error: failed to download coder agent using busybox wget" + sleep 30 + exit $status ) else echo "error: no download tool found, please install curl, wget or busybox wget" exit 1 fi chmod +x $BINARY_NAME || ( - echo "Failed to make $BINARY_NAME executable" && sleep 600 && exit 1 + echo "Failed to make $BINARY_NAME executable" && sleep 30 && exit 1 ) export CODER_AGENT_AUTH="${AUTH_TYPE}" export CODER_AGENT_URL="${ACCESS_URL}" exec ./$BINARY_NAME agent || ( - echo "Failed to exec ${BINARY_NAME}" && sleep 600 + echo "Failed to exec ${BINARY_NAME}" + sleep 30 + exit 126 ) From 9586595a2720fd011c707a1d94dd096578507809 Mon Sep 17 00:00:00 2001 From: johnstcn Date: Mon, 11 Jul 2022 16:45:19 +0000 Subject: [PATCH 4/8] quick --- provisionersdk/scripts/bootstrap_linux.sh | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/provisionersdk/scripts/bootstrap_linux.sh b/provisionersdk/scripts/bootstrap_linux.sh index 2aaf997e698af..e03b1d309e348 100644 --- a/provisionersdk/scripts/bootstrap_linux.sh +++ b/provisionersdk/scripts/bootstrap_linux.sh @@ -1,6 +1,10 @@ #!/usr/bin/env sh set -eux pipefail -trap 'echo === Agent script exited with non-zero code. Sleeping 24h to preserve logs... && sleep 86400' EXIT +waitonexit() { + echo '=== Agent script exited with non-zero code. Sleeping 24h to preserve logs...' + sleep 86400 +} +trap waitonexit EXIT BINARY_DIR=$(mktemp -d -t coder.XXXXXX) BINARY_NAME=coder BINARY_URL=${ACCESS_URL}bin/coder-linux-${ARCH} From 49423350420b53e16cf3b46fbc88907438fe9e44 Mon Sep 17 00:00:00 2001 From: johnstcn Date: Tue, 12 Jul 2022 11:20:40 +0000 Subject: [PATCH 5/8] make scripts loop forever --- provisionersdk/scripts/bootstrap_darwin.sh | 29 ++++++++- provisionersdk/scripts/bootstrap_linux.sh | 63 ++++++++++---------- provisionersdk/scripts/bootstrap_windows.ps1 | 32 +++++++--- 3 files changed, 81 insertions(+), 43 deletions(-) diff --git a/provisionersdk/scripts/bootstrap_darwin.sh b/provisionersdk/scripts/bootstrap_darwin.sh index 1e5d549b1d627..64d5745951a62 100644 --- a/provisionersdk/scripts/bootstrap_darwin.sh +++ b/provisionersdk/scripts/bootstrap_darwin.sh @@ -1,11 +1,34 @@ #!/usr/bin/env sh set -eux pipefail -trap 'echo === Agent script exited with non-zero code. Sleeping 24h to preserve logs... && sleep 86400' EXIT +# Sleep for a good long while before exiting. +# This is to allow folks to exec into a failed workspace and poke around to +# troubleshoot. +waitonexit() { + echo '=== Agent script exited with non-zero code. Sleeping 24h to preserve logs...' + sleep 86400 +} +trap waitonexit EXIT BINARY_DIR=$(mktemp -d -t coder.XXXXXX) BINARY_NAME=coder +BINARY_URL=${ACCESS_URL}bin/coder-darwin-${ARCH} cd "$BINARY_DIR" -curl -fsSL --compressed "${ACCESS_URL}bin/coder-darwin-${ARCH}" -o "${BINARY_NAME}" -chmod +x $BINARY_NAME +# Attempt to download the coder agent. +# This could fail for a number of reasons, many of which are likely transient. +# So just keep trying! +while true; do + curl -fsSL --compressed "${BINARY_URL}" -o "${BINARY_NAME}" && break + status=$? + echo "error: failed to download coder agent using curl" + echo "curl exit code: ${status}" + echo "trying again in 30 seconds..." + sleep 30 +done + +if ! chmod +x $BINARY_NAME; then + echo "Failed to make $BINARY_NAME executable" + exit 1 +fi + export CODER_AGENT_AUTH="${AUTH_TYPE}" export CODER_AGENT_URL="${ACCESS_URL}" exec ./$BINARY_NAME agent diff --git a/provisionersdk/scripts/bootstrap_linux.sh b/provisionersdk/scripts/bootstrap_linux.sh index e03b1d309e348..5507a97a07803 100644 --- a/provisionersdk/scripts/bootstrap_linux.sh +++ b/provisionersdk/scripts/bootstrap_linux.sh @@ -1,5 +1,8 @@ #!/usr/bin/env sh set -eux pipefail +# Sleep for a good long while before exiting. +# This is to allow folks to exec into a failed workspace and poke around to +# troubleshoot. waitonexit() { echo '=== Agent script exited with non-zero code. Sleeping 24h to preserve logs...' sleep 86400 @@ -9,43 +12,41 @@ BINARY_DIR=$(mktemp -d -t coder.XXXXXX) BINARY_NAME=coder BINARY_URL=${ACCESS_URL}bin/coder-linux-${ARCH} cd "$BINARY_DIR" -# In the below invocations, we sleep for 30 seconds before exiting. -# This is because some providers (e.g. kreuzwerker/docker) will -# automatically remove a Docker container that exits within 15 -# seconds, making troubleshooting a failed workspace build -# extremely difficult. -if command -v curl >/dev/null 2>&1; then - curl -fsSL --compressed "${BINARY_URL}" -o "${BINARY_NAME}" || ( +# Attempt to download the coder agent. +# This could fail for a number of reasons, many of which are likely transient. +# So just keep trying! +while true; do + # Try a number of different download tools, as we don't know what we'll + # have available + if command -v curl >/dev/null 2>&1; then + curl -fsSL --compressed "${BINARY_URL}" -o "${BINARY_NAME}" && break status=$? echo "error: failed to download coder agent using curl" - sleep 30 - exit $status - ) -elif command -v wget >/dev/null 2>&1; then - wget -q "${BINARY_URL}" -O "${BINARY_NAME}" || ( + echo "curl exit code: ${status}" + elif command -v wget >/dev/null 2>&1; then + wget -q "${BINARY_URL}" -O "${BINARY_NAME}" && break status=$? + test "${status}" -eq 0 && break echo "error: failed to download coder agent using wget" - sleep 30 - exit $status - ) -elif command -v busybox >/dev/null 2>&1; then - busybox wget -q "${BINARY_URL}" -O "${BINARY_NAME}" || ( - status=$? + echo "wget exit code: ${status}" + elif command -v busybox >/dev/null 2>&1; then + busybox wget -q "${BINARY_URL}" -O "${BINARY_NAME}" && break + test "${status}" -eq 0 && break echo "error: failed to download coder agent using busybox wget" - sleep 30 - exit $status - ) -else - echo "error: no download tool found, please install curl, wget or busybox wget" + echo "busybox wget exit code: ${status}" + else + echo "error: no download tool found, please install curl, wget or busybox wget" + exit 127 + fi + echo "trying again in 30 seconds..." + sleep 30 +done + +if ! chmod +x $BINARY_NAME; then + echo "Failed to make $BINARY_NAME executable" exit 1 fi -chmod +x $BINARY_NAME || ( - echo "Failed to make $BINARY_NAME executable" && sleep 30 && exit 1 -) + export CODER_AGENT_AUTH="${AUTH_TYPE}" export CODER_AGENT_URL="${ACCESS_URL}" -exec ./$BINARY_NAME agent || ( - echo "Failed to exec ${BINARY_NAME}" - sleep 30 - exit 126 -) +exec ./$BINARY_NAME agent diff --git a/provisionersdk/scripts/bootstrap_windows.ps1 b/provisionersdk/scripts/bootstrap_windows.ps1 index 810012155bbc4..13bfe41522d98 100644 --- a/provisionersdk/scripts/bootstrap_windows.ps1 +++ b/provisionersdk/scripts/bootstrap_windows.ps1 @@ -1,9 +1,23 @@ -# On Windows, VS Code Remote requires a parent process of the -# executing shell to be named "sshd", otherwise it fails. See: -# https://github.com/microsoft/vscode-remote-release/issues/5699 -$ProgressPreference = "SilentlyContinue" -Invoke-WebRequest -Uri ${ACCESS_URL}bin/coder-windows-${ARCH}.exe -OutFile $env:TEMP\sshd.exe -Set-MpPreference -DisableRealtimeMonitoring $true -ExclusionPath $env:TEMP\sshd.exe -$env:CODER_AGENT_AUTH = "${AUTH_TYPE}" -$env:CODER_AGENT_URL = "${ACCESS_URL}" -Start-Process -FilePath $env:TEMP\sshd.exe -ArgumentList "agent" -PassThru +while ($true) { + try { + $ProgressPreference = "SilentlyContinue" + # On Windows, VS Code Remote requires a parent process of the + # executing shell to be named "sshd", otherwise it fails. See: + # https://github.com/microsoft/vscode-remote-release/issues/5699 + $BINARY_URL="${ACCESS_URL}/bin/coder-windows-${ARCH}.exe" + Invoke-WebRequest -Uri "${BINARY_URL}" -OutFile $env:TEMP\sshd.exe + Set-MpPreference -DisableRealtimeMonitoring $true -ExclusionPath $env:TEMP\sshd.exe + $env:CODER_AGENT_AUTH = "${AUTH_TYPE}" + $env:CODER_AGENT_URL = "${ACCESS_URL}" + Start-Process -FilePath $env:TEMP\sshd.exe -ArgumentList "agent" -PassThru + } catch [System.Net.WebException],[System.IO.IOException] { + Write-Error "error: failed to download coder agent from ${ACCESS_URL}" + Write-Error $_.ScriptStackTrace + } catch { + Write-Error "error: unhandled exception fetching and starting coder agent:" + Write-Error $_.ScriptStackTrace + } finally { + Write-Output "trying again in 30 seconds..." + Start-Sleep -Seconds 30 + } +} From 38bb5f54df3be2469c56cb838357082cf303a7cb Mon Sep 17 00:00:00 2001 From: johnstcn Date: Tue, 12 Jul 2022 11:21:00 +0000 Subject: [PATCH 6/8] more shellcheck --- scripts/coder-dev.sh | 2 +- scripts/develop.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/coder-dev.sh b/scripts/coder-dev.sh index 138db49734596..bf2c423cb90c1 100755 --- a/scripts/coder-dev.sh +++ b/scripts/coder-dev.sh @@ -5,7 +5,7 @@ set -euo pipefail SCRIPT_DIR=$(dirname "${BASH_SOURCE[0]}") -# shellcheck disable=SC1091 +# shellcheck disable=SC1091,SC1090 source "${SCRIPT_DIR}/lib.sh" PROJECT_ROOT=$(cd "$SCRIPT_DIR" && git rev-parse --show-toplevel) diff --git a/scripts/develop.sh b/scripts/develop.sh index 24c8e9e427571..0c91e30cb65d3 100755 --- a/scripts/develop.sh +++ b/scripts/develop.sh @@ -5,7 +5,7 @@ set -euo pipefail SCRIPT_DIR=$(dirname "${BASH_SOURCE[0]}") -# shellcheck disable=SC1091 +# shellcheck disable=SC1091,SC1090 source "${SCRIPT_DIR}/lib.sh" PROJECT_ROOT=$(cd "$SCRIPT_DIR" && git rev-parse --show-toplevel) CODER_DEV_BIN="${PROJECT_ROOT}/.coderv2/coder" From 89f2f7ce89023de1f332a05bfd2cce6b34bfa082 Mon Sep 17 00:00:00 2001 From: johnstcn Date: Tue, 12 Jul 2022 13:20:59 +0000 Subject: [PATCH 7/8] address PR comments --- provisionersdk/scripts/bootstrap_darwin.sh | 4 +-- provisionersdk/scripts/bootstrap_linux.sh | 16 ++++------- provisionersdk/scripts/bootstrap_windows.ps1 | 29 +++++++++++++------- 3 files changed, 27 insertions(+), 22 deletions(-) diff --git a/provisionersdk/scripts/bootstrap_darwin.sh b/provisionersdk/scripts/bootstrap_darwin.sh index 64d5745951a62..6d381fa7d9593 100644 --- a/provisionersdk/scripts/bootstrap_darwin.sh +++ b/provisionersdk/scripts/bootstrap_darwin.sh @@ -15,12 +15,12 @@ cd "$BINARY_DIR" # Attempt to download the coder agent. # This could fail for a number of reasons, many of which are likely transient. # So just keep trying! -while true; do +while :; do curl -fsSL --compressed "${BINARY_URL}" -o "${BINARY_NAME}" && break status=$? echo "error: failed to download coder agent using curl" echo "curl exit code: ${status}" - echo "trying again in 30 seconds..." + echo "Trying again in 30 seconds..." sleep 30 done diff --git a/provisionersdk/scripts/bootstrap_linux.sh b/provisionersdk/scripts/bootstrap_linux.sh index 5507a97a07803..e29d277df575f 100644 --- a/provisionersdk/scripts/bootstrap_linux.sh +++ b/provisionersdk/scripts/bootstrap_linux.sh @@ -15,30 +15,26 @@ cd "$BINARY_DIR" # Attempt to download the coder agent. # This could fail for a number of reasons, many of which are likely transient. # So just keep trying! -while true; do +while :; do # Try a number of different download tools, as we don't know what we'll # have available + status="" if command -v curl >/dev/null 2>&1; then curl -fsSL --compressed "${BINARY_URL}" -o "${BINARY_NAME}" && break status=$? - echo "error: failed to download coder agent using curl" - echo "curl exit code: ${status}" elif command -v wget >/dev/null 2>&1; then wget -q "${BINARY_URL}" -O "${BINARY_NAME}" && break status=$? - test "${status}" -eq 0 && break - echo "error: failed to download coder agent using wget" - echo "wget exit code: ${status}" elif command -v busybox >/dev/null 2>&1; then busybox wget -q "${BINARY_URL}" -O "${BINARY_NAME}" && break - test "${status}" -eq 0 && break - echo "error: failed to download coder agent using busybox wget" - echo "busybox wget exit code: ${status}" + status=$? else echo "error: no download tool found, please install curl, wget or busybox wget" exit 127 fi - echo "trying again in 30 seconds..." + echo "error: failed to download coder agent" + echo " command returned: ${status}" + echo "Trying again in 30 seconds..." sleep 30 done diff --git a/provisionersdk/scripts/bootstrap_windows.ps1 b/provisionersdk/scripts/bootstrap_windows.ps1 index 13bfe41522d98..c64a8c6bf7c75 100644 --- a/provisionersdk/scripts/bootstrap_windows.ps1 +++ b/provisionersdk/scripts/bootstrap_windows.ps1 @@ -1,3 +1,12 @@ +# Sleep for a while in case the underlyine provider deletes the resource on error. +trap { + Write-Error '=== Agent script exited with non-zero code. Sleeping 24h to preserve logs...' + Start-Sleep -Seconds 86400 +} + +# Attempt to download the coder agent. +# This could fail for a number of reasons, many of which are likely transient. +# So just keep trying! while ($true) { try { $ProgressPreference = "SilentlyContinue" @@ -5,19 +14,19 @@ while ($true) { # executing shell to be named "sshd", otherwise it fails. See: # https://github.com/microsoft/vscode-remote-release/issues/5699 $BINARY_URL="${ACCESS_URL}/bin/coder-windows-${ARCH}.exe" + Write-Output "Fetching coder agent from ${BINARY_URL}" Invoke-WebRequest -Uri "${BINARY_URL}" -OutFile $env:TEMP\sshd.exe - Set-MpPreference -DisableRealtimeMonitoring $true -ExclusionPath $env:TEMP\sshd.exe - $env:CODER_AGENT_AUTH = "${AUTH_TYPE}" - $env:CODER_AGENT_URL = "${ACCESS_URL}" - Start-Process -FilePath $env:TEMP\sshd.exe -ArgumentList "agent" -PassThru - } catch [System.Net.WebException],[System.IO.IOException] { - Write-Error "error: failed to download coder agent from ${ACCESS_URL}" - Write-Error $_.ScriptStackTrace + break } catch { - Write-Error "error: unhandled exception fetching and starting coder agent:" - Write-Error $_.ScriptStackTrace - } finally { + Write-Output "error: unhandled exception fetching coder agent:" + Write-Output $_ Write-Output "trying again in 30 seconds..." Start-Sleep -Seconds 30 } } + +# If the below fails, retrying probably won't help. +Set-MpPreference -DisableRealtimeMonitoring $true -ExclusionPath $env:TEMP\sshd.exe +$env:CODER_AGENT_AUTH = "${AUTH_TYPE}" +$env:CODER_AGENT_URL = "${ACCESS_URL}" +Start-Process -FilePath $env:TEMP\sshd.exe -ArgumentList "agent" -PassThru From 5e315efece14f5c216acdeadea2e18a544f09460 Mon Sep 17 00:00:00 2001 From: Cian Johnston Date: Tue, 12 Jul 2022 14:22:15 +0100 Subject: [PATCH 8/8] Update provisionersdk/scripts/bootstrap_windows.ps1 --- provisionersdk/scripts/bootstrap_windows.ps1 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/provisionersdk/scripts/bootstrap_windows.ps1 b/provisionersdk/scripts/bootstrap_windows.ps1 index c64a8c6bf7c75..882df4a8cebaf 100644 --- a/provisionersdk/scripts/bootstrap_windows.ps1 +++ b/provisionersdk/scripts/bootstrap_windows.ps1 @@ -1,4 +1,4 @@ -# Sleep for a while in case the underlyine provider deletes the resource on error. +# Sleep for a while in case the underlying provider deletes the resource on error. trap { Write-Error '=== Agent script exited with non-zero code. Sleeping 24h to preserve logs...' Start-Sleep -Seconds 86400