Skip to content

nightly-gauntlet

nightly-gauntlet #749

# The nightly-gauntlet runs tests that are either too flaky or too slow to block
# every PR.
name: nightly-gauntlet
on:
schedule:
# Every day at 4AM
- cron: "0 4 * * 1-5"
workflow_dispatch:
permissions:
contents: read
jobs:
test-go-pg:
# make sure to adjust NUM_PARALLEL_PACKAGES and NUM_PARALLEL_TESTS below
# when changing runner sizes
runs-on: ${{ matrix.os == 'macos-latest' && github.repository_owner == 'coder' && 'depot-macos-latest' || matrix.os == 'windows-2022' && github.repository_owner == 'coder' && 'depot-windows-2022-16' || matrix.os }}
# This timeout must be greater than the timeout set by `go test` in
# `make test-postgres` to ensure we receive a trace of running
# goroutines. Setting this to the timeout +5m should work quite well
# even if some of the preceding steps are slow.
timeout-minutes: 25
strategy:
matrix:
os:
- macos-latest
- windows-2022
steps:
- name: Harden Runner
uses: step-security/harden-runner@6c439dc8bdf85cadbbce9ed30d1c7b959517bc49 # v2.12.2
with:
egress-policy: audit
# macOS indexes all new files in the background. Our Postgres tests
# create and destroy thousands of databases on disk, and Spotlight
# tries to index all of them, seriously slowing down the tests.
- name: Disable Spotlight Indexing
if: runner.os == 'macOS'
run: |
sudo mdutil -a -i off
sudo mdutil -X /
sudo launchctl bootout system /System/Library/LaunchDaemons/com.apple.metadata.mds.plist
# Set up RAM disks to speed up the rest of the job. This action is in
# a separate repository to allow its use before actions/checkout.
- name: Setup RAM Disks
if: runner.os == 'Windows'
uses: coder/setup-ramdisk-action@e1100847ab2d7bcd9d14bcda8f2d1b0f07b36f1b
- name: Checkout
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
fetch-depth: 1
- name: Setup Go
uses: ./.github/actions/setup-go
with:
# Runners have Go baked-in and Go will automatically
# download the toolchain configured in go.mod, so we don't
# need to reinstall it. It's faster on Windows runners.
use-preinstalled-go: ${{ runner.os == 'Windows' }}
- name: Setup Terraform
uses: ./.github/actions/setup-tf
- name: Setup Embedded Postgres Cache Paths
id: embedded-pg-cache
uses: ./.github/actions/setup-embedded-pg-cache-paths
- name: Download Embedded Postgres Cache
id: download-embedded-pg-cache
uses: ./.github/actions/embedded-pg-cache/download
with:
key-prefix: embedded-pg-${{ runner.os }}-${{ runner.arch }}
cache-path: ${{ steps.embedded-pg-cache.outputs.cached-dirs }}
- name: Test with PostgreSQL Database
env:
POSTGRES_VERSION: "13"
TS_DEBUG_DISCO: "true"
LC_CTYPE: "en_US.UTF-8"
LC_ALL: "en_US.UTF-8"
shell: bash
run: |
set -o errexit
set -o pipefail
if [ "${{ runner.os }}" == "Windows" ]; then
# Create a temp dir on the R: ramdisk drive for Windows. The default
# C: drive is extremely slow: https://github.com/actions/runner-images/issues/8755
mkdir -p "R:/temp/embedded-pg"
go run scripts/embedded-pg/main.go -path "R:/temp/embedded-pg" -cache "${EMBEDDED_PG_CACHE_DIR}"
elif [ "${{ runner.os }}" == "macOS" ]; then
# Postgres runs faster on a ramdisk on macOS too
mkdir -p /tmp/tmpfs
sudo mount_tmpfs -o noowners -s 8g /tmp/tmpfs
go run scripts/embedded-pg/main.go -path /tmp/tmpfs/embedded-pg -cache "${EMBEDDED_PG_CACHE_DIR}"
elif [ "${{ runner.os }}" == "Linux" ]; then
make test-postgres-docker
fi
# if macOS, install google-chrome for scaletests
# As another concern, should we really have this kind of external dependency
# requirement on standard CI?
if [ "${{ matrix.os }}" == "macos-latest" ]; then
brew install google-chrome
fi
# macOS will output "The default interactive shell is now zsh"
# intermittently in CI...
if [ "${{ matrix.os }}" == "macos-latest" ]; then
touch ~/.bash_profile && echo "export BASH_SILENCE_DEPRECATION_WARNING=1" >> ~/.bash_profile
fi
if [ "${{ runner.os }}" == "Windows" ]; then
# Our Windows runners have 16 cores.
# On Windows Postgres chokes up when we have 16x16=256 tests
# running in parallel, and dbtestutil.NewDB starts to take more than
# 10s to complete sometimes causing test timeouts. With 16x8=128 tests
# Postgres tends not to choke.
NUM_PARALLEL_PACKAGES=8
NUM_PARALLEL_TESTS=16
elif [ "${{ runner.os }}" == "macOS" ]; then
# Our macOS runners have 8 cores. We set NUM_PARALLEL_TESTS to 16
# because the tests complete faster and Postgres doesn't choke. It seems
# that macOS's tmpfs is faster than the one on Windows.
NUM_PARALLEL_PACKAGES=8
NUM_PARALLEL_TESTS=16
elif [ "${{ runner.os }}" == "Linux" ]; then
# Our Linux runners have 8 cores.
NUM_PARALLEL_PACKAGES=8
NUM_PARALLEL_TESTS=8
fi
# run tests without cache
TESTCOUNT="-count=1"
DB=ci gotestsum \
--format standard-quiet --packages "./..." \
-- -timeout=20m -v -p $NUM_PARALLEL_PACKAGES -parallel=$NUM_PARALLEL_TESTS $TESTCOUNT
- name: Upload Embedded Postgres Cache
uses: ./.github/actions/embedded-pg-cache/upload
# We only use the embedded Postgres cache on macOS and Windows runners.
if: runner.OS == 'macOS' || runner.OS == 'Windows'
with:
cache-key: ${{ steps.download-embedded-pg-cache.outputs.cache-key }}
cache-path: "${{ steps.embedded-pg-cache.outputs.embedded-pg-cache }}"
- name: Upload test stats to Datadog
timeout-minutes: 1
continue-on-error: true
uses: ./.github/actions/upload-datadog
if: success() || failure()
with:
api-key: ${{ secrets.DATADOG_API_KEY }}
notify-slack-on-failure:
needs:
- test-go-pg
runs-on: ubuntu-latest
if: failure() && github.ref == 'refs/heads/main'
steps:
- name: Send Slack notification
run: |
curl -X POST -H 'Content-type: application/json' \
--data '{
"blocks": [
{
"type": "header",
"text": {
"type": "plain_text",
"text": "❌ Nightly gauntlet failed",
"emoji": true
}
},
{
"type": "section",
"fields": [
{
"type": "mrkdwn",
"text": "*Workflow:*\n${{ github.workflow }}"
},
{
"type": "mrkdwn",
"text": "*Committer:*\n${{ github.actor }}"
},
{
"type": "mrkdwn",
"text": "*Commit:*\n${{ github.sha }}"
}
]
},
{
"type": "section",
"text": {
"type": "mrkdwn",
"text": "*View failure:* <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|Click here>"
}
}
]
}' ${{ secrets.CI_FAILURE_SLACK_WEBHOOK }}