Skip to content

chore: improve CI reliability #16169

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Jan 20, 2025
Merged
88 changes: 56 additions & 32 deletions .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -378,8 +378,62 @@ jobs:
with:
api-key: ${{ secrets.DATADOG_API_KEY }}

# We don't run the full test-suite for Windows & MacOS, so we just run the CLI tests on every PR.
# We run the test suite in test-go-pg, including CLI.
test-cli:
runs-on: ${{ matrix.os == 'macos-latest' && github.repository_owner == 'coder' && 'depot-macos-latest' || matrix.os == 'windows-2022' && github.repository_owner == 'coder' && 'windows-latest-16-cores' || matrix.os }}
needs: changes
if: needs.changes.outputs.go == 'true' || needs.changes.outputs.ci == 'true' || github.ref == 'refs/heads/main'
strategy:
matrix:
os:
- macos-latest
- windows-2022
steps:
- name: Harden Runner
uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f # v2.10.2
with:
egress-policy: audit

- name: Checkout
uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
with:
fetch-depth: 1

- name: Setup Go
uses: ./.github/actions/setup-go

- name: Setup Terraform
uses: ./.github/actions/setup-tf

# Sets up the ImDisk toolkit for Windows and creates a RAM disk on drive R:.
- name: Setup ImDisk
if: runner.os == 'Windows'
uses: ./.github/actions/setup-imdisk

- name: Test CLI
env:
TS_DEBUG_DISCO: "true"
LC_CTYPE: "en_US.UTF-8"
LC_ALL: "en_US.UTF-8"
shell: bash
run: |
# By default Go will use the number of logical CPUs, which
# is a fine default.
PARALLEL_FLAG=""

make test-cli

- name: Upload test stats to Datadog
timeout-minutes: 1
continue-on-error: true
uses: ./.github/actions/upload-datadog
if: success() || failure()
with:
api-key: ${{ secrets.DATADOG_API_KEY }}

test-go-pg:
runs-on: ${{ matrix.os == 'ubuntu-latest' && github.repository_owner == 'coder' && 'depot-ubuntu-22.04-4' || matrix.os == 'macos-latest' && github.repository_owner == 'coder' && 'depot-macos-latest' || matrix.os == 'windows-2022' && github.repository_owner == 'coder' && 'windows-latest-16-cores' || matrix.os }}
runs-on: ${{ matrix.os == 'ubuntu-latest' && github.repository_owner == 'coder' && 'depot-ubuntu-22.04-4' || matrix.os }}
needs: changes
if: needs.changes.outputs.go == 'true' || needs.changes.outputs.ci == 'true' || github.ref == 'refs/heads/main'
# This timeout must be greater than the timeout set by `go test` in
Expand All @@ -391,8 +445,6 @@ jobs:
matrix:
os:
- ubuntu-latest
- macos-latest
- windows-2022
steps:
- name: Harden Runner
uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f # v2.10.2
Expand Down Expand Up @@ -423,39 +475,11 @@ jobs:
LC_ALL: "en_US.UTF-8"
shell: bash
run: |
# if macOS, install google-chrome for scaletests
# As another concern, should we really have this kind of external dependency
# requirement on standard CI?
if [ "${{ matrix.os }}" == "macos-latest" ]; then
brew install google-chrome
fi

# By default Go will use the number of logical CPUs, which
# is a fine default.
PARALLEL_FLAG=""

# macOS will output "The default interactive shell is now zsh"
# intermittently in CI...
if [ "${{ matrix.os }}" == "macos-latest" ]; then
touch ~/.bash_profile && echo "export BASH_SILENCE_DEPRECATION_WARNING=1" >> ~/.bash_profile
fi

if [ "${{ runner.os }}" == "Linux" ]; then
make test-postgres
elif [ "${{ runner.os }}" == "Windows" ]; then
# Create a temp dir on the R: ramdisk drive for Windows. The default
# C: drive is extremely slow: https://github.com/actions/runner-images/issues/8755
mkdir -p "R:/temp/embedded-pg"
go run scripts/embedded-pg/main.go -path "R:/temp/embedded-pg"
# Reduce test parallelism, mirroring what we do for race tests.
# We'd been encountering issues with timing related flakes, and
# this seems to help.
DB=ci gotestsum --format standard-quiet -- -v -short -count=1 -parallel 4 -p 4 ./...
else
go run scripts/embedded-pg/main.go
# Reduce test parallelism, like for Windows above.
DB=ci gotestsum --format standard-quiet -- -v -short -count=1 -parallel 4 -p 4 ./...
fi
make test-postgres

- name: Upload test stats to Datadog
timeout-minutes: 1
Expand Down
103 changes: 61 additions & 42 deletions .github/workflows/nightly-gauntlet.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,22 +3,27 @@
name: nightly-gauntlet
on:
schedule:
# Every day at midnight
- cron: "0 0 * * *"
# Every day at 4AM
- cron: "0 4 * * 1-5"
workflow_dispatch:

permissions:
contents: read

jobs:
go-race:
# While GitHub's toaster runners are likelier to flake, we want consistency
# between this environment and the regular test environment for DataDog
# statistics and to only show real workflow threats.
runs-on: ${{ github.repository_owner == 'coder' && 'depot-ubuntu-22.04-8' || 'ubuntu-latest' }}
# This runner costs 0.016 USD per minute,
# so 0.016 * 240 = 3.84 USD per run.
timeout-minutes: 240
test-go-pg:
runs-on: ${{ matrix.os == 'macos-latest' && github.repository_owner == 'coder' && 'depot-macos-latest' || matrix.os == 'windows-2022' && github.repository_owner == 'coder' && 'windows-latest-16-cores' || matrix.os }}
if: github.ref == 'refs/heads/main'
# This timeout must be greater than the timeout set by `go test` in
# `make test-postgres` to ensure we receive a trace of running
# goroutines. Setting this to the timeout +5m should work quite well
# even if some of the preceding steps are slow.
timeout-minutes: 25
strategy:
matrix:
os:
- macos-latest
- windows-2022
steps:
- name: Harden Runner
uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f # v2.10.2
Expand All @@ -27,58 +32,72 @@ jobs:

- name: Checkout
uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
with:
fetch-depth: 1

- name: Setup Go
uses: ./.github/actions/setup-go

- name: Setup Terraform
uses: ./.github/actions/setup-tf

- name: Run Tests
run: |
# -race is likeliest to catch flaky tests
# due to correctness detection and its performance
# impact.
gotestsum --junitfile="gotests.xml" -- -timeout=240m -count=10 -race ./...
# Sets up the ImDisk toolkit for Windows and creates a RAM disk on drive R:.
- name: Setup ImDisk
if: runner.os == 'Windows'
uses: ./.github/actions/setup-imdisk

- name: Upload test results to DataDog
uses: ./.github/actions/upload-datadog
if: always()
with:
api-key: ${{ secrets.DATADOG_API_KEY }}
- name: Test with PostgreSQL Database
env:
POSTGRES_VERSION: "13"
TS_DEBUG_DISCO: "true"
LC_CTYPE: "en_US.UTF-8"
LC_ALL: "en_US.UTF-8"
shell: bash
run: |
# if macOS, install google-chrome for scaletests
# As another concern, should we really have this kind of external dependency
# requirement on standard CI?
if [ "${{ matrix.os }}" == "macos-latest" ]; then
brew install google-chrome
fi

go-timing:
# We run these tests with p=1 so we don't need a lot of compute.
runs-on: ${{ github.repository_owner == 'coder' && 'depot-ubuntu-22.04' || 'ubuntu-latest' }}
timeout-minutes: 10
steps:
- name: Harden Runner
uses: step-security/harden-runner@0080882f6c36860b6ba35c610c98ce87d4e2f26f # v2.10.2
with:
egress-policy: audit
# By default Go will use the number of logical CPUs, which
# is a fine default.
PARALLEL_FLAG=""

- name: Checkout
uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
# macOS will output "The default interactive shell is now zsh"
# intermittently in CI...
if [ "${{ matrix.os }}" == "macos-latest" ]; then
touch ~/.bash_profile && echo "export BASH_SILENCE_DEPRECATION_WARNING=1" >> ~/.bash_profile
fi

- name: Setup Go
uses: ./.github/actions/setup-go
if [ "${{ runner.os }}" == "Windows" ]; then
# Create a temp dir on the R: ramdisk drive for Windows. The default
# C: drive is extremely slow: https://github.com/actions/runner-images/issues/8755
mkdir -p "R:/temp/embedded-pg"
go run scripts/embedded-pg/main.go -path "R:/temp/embedded-pg"
else
go run scripts/embedded-pg/main.go
fi

- name: Run Tests
run: |
gotestsum --junitfile="gotests.xml" -- --tags="timing" -p=1 -run='_Timing/' ./...
# Reduce test parallelism, mirroring what we do for race tests.
# We'd been encountering issues with timing related flakes, and
# this seems to help.
DB=ci gotestsum --format standard-quiet -- -v -short -count=1 -parallel 4 -p 4 ./...

- name: Upload test results to DataDog
- name: Upload test stats to Datadog
timeout-minutes: 1
continue-on-error: true
uses: ./.github/actions/upload-datadog
if: always()
if: success() || failure()
with:
api-key: ${{ secrets.DATADOG_API_KEY }}

notify-slack-on-failure:
needs:
- go-race
- go-timing
- test-go-pg
runs-on: ubuntu-latest
if: failure()
if: failure() && github.ref == 'refs/heads/main'

steps:
- name: Send Slack notification
Expand Down
4 changes: 4 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -807,6 +807,10 @@ test:
$(GIT_FLAGS) gotestsum --format standard-quiet -- -v -short -count=1 ./...
.PHONY: test

test-cli:
$(GIT_FLAGS) gotestsum --format standard-quiet -- -v -short -count=1 ./cli/...
.PHONY: test-cli

# sqlc-cloud-is-setup will fail if no SQLc auth token is set. Use this as a
# dependency for any sqlc-cloud related targets.
sqlc-cloud-is-setup:
Expand Down
Loading