Skip to content
Merged
79 changes: 79 additions & 0 deletions .github/actions/test-go-pg/action.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
name: "Test Go with PostgreSQL"
description: "Run Go tests with PostgreSQL database"

inputs:
postgres-version:
description: "PostgreSQL version to use"
required: false
default: "13"
test-parallelism-packages:
description: "Number of packages to test in parallel (-p flag)"
required: false
default: "8"
test-parallelism-tests:
description: "Number of tests to run in parallel within each package (-parallel flag)"
required: false
default: "8"
race-detection:
description: "Enable race detection"
required: false
default: "false"
test-count:
description: "Number of times to run each test (empty for cached results)"
required: false
default: ""
test-packages:
description: "Packages to test (default: ./...)"
required: false
default: "./..."
embedded-pg-path:
description: "Path for embedded postgres data (Windows/macOS only)"
required: false
default: ""
embedded-pg-cache:
description: "Path for embedded postgres cache (Windows/macOS only)"
required: false
default: ""

runs:
using: "composite"
steps:
- name: Start PostgreSQL Docker container (Linux)
if: runner.os == 'Linux'
shell: bash
env:
POSTGRES_VERSION: ${{ inputs.postgres-version }}
run: make test-postgres-docker

- name: Setup Embedded Postgres (Windows/macOS)
if: runner.os != 'Linux'
shell: bash
env:
POSTGRES_VERSION: ${{ inputs.postgres-version }}
EMBEDDED_PG_PATH: ${{ inputs.embedded-pg-path }}
EMBEDDED_PG_CACHE_DIR: ${{ inputs.embedded-pg-cache }}
run: |
go run scripts/embedded-pg/main.go -path "${EMBEDDED_PG_PATH}" -cache "${EMBEDDED_PG_CACHE_DIR}"

- name: Run tests
shell: bash
env:
TEST_NUM_PARALLEL_PACKAGES: ${{ inputs.test-parallelism-packages }}
TEST_NUM_PARALLEL_TESTS: ${{ inputs.test-parallelism-tests }}
TEST_COUNT: ${{ inputs.test-count }}
TEST_PACKAGES: ${{ inputs.test-packages }}
RACE_DETECTION: ${{ inputs.race-detection }}
TS_DEBUG_DISCO: "true"
LC_CTYPE: "en_US.UTF-8"
LC_ALL: "en_US.UTF-8"
run: |
set -euo pipefail

if [[ ${RACE_DETECTION} == true ]]; then
gotestsum --junitfile="gotests.xml" --packages="${TEST_PACKAGES}" -- \
-race \
-parallel "${TEST_NUM_PARALLEL_TESTS}" \
-p "${TEST_NUM_PARALLEL_PACKAGES}"
else
make test
fi
189 changes: 108 additions & 81 deletions .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -327,7 +327,7 @@ jobs:
test-go-pg:
# make sure to adjust NUM_PARALLEL_PACKAGES and NUM_PARALLEL_TESTS below
# when changing runner sizes
runs-on: ${{ matrix.os == 'ubuntu-latest' && github.repository_owner == 'coder' && 'depot-ubuntu-22.04-8' || matrix.os && matrix.os == 'macos-latest' && github.repository_owner == 'coder' && 'depot-macos-latest' || matrix.os == 'windows-2022' && github.repository_owner == 'coder' && 'depot-windows-2022-16' || matrix.os }}
runs-on: ${{ matrix.os == 'ubuntu-latest' && github.repository_owner == 'coder' && 'depot-ubuntu-22.04-16' || matrix.os && matrix.os == 'macos-latest' && github.repository_owner == 'coder' && 'depot-macos-latest' || matrix.os == 'windows-2022' && github.repository_owner == 'coder' && 'depot-windows-2022-32' || matrix.os }}
needs: changes
if: needs.changes.outputs.go == 'true' || needs.changes.outputs.ci == 'true' || github.ref == 'refs/heads/main'
# This timeout must be greater than the timeout set by `go test` in
Expand All @@ -336,6 +336,7 @@ jobs:
# even if some of the preceding steps are slow.
timeout-minutes: 25
strategy:
fail-fast: false
matrix:
os:
- ubuntu-latest
Expand Down Expand Up @@ -416,82 +417,83 @@ jobs:
find . -type f ! -path ./.git/\*\* | mtimehash
find . -type d ! -path ./.git/\*\* -exec touch -t 200601010000 {} +

- name: Test with PostgreSQL Database
env:
POSTGRES_VERSION: "13"
TS_DEBUG_DISCO: "true"
LC_CTYPE: "en_US.UTF-8"
LC_ALL: "en_US.UTF-8"
- name: Normalize Terraform Path for Caching
shell: bash
# Terraform gets installed in a random directory, so we need to normalize
# the path or many cached tests will be invalidated.
run: |
set -o errexit
set -o pipefail

if [ "$RUNNER_OS" == "Windows" ]; then
# Create a temp dir on the R: ramdisk drive for Windows. The default
# C: drive is extremely slow: https://github.com/actions/runner-images/issues/8755
mkdir -p "R:/temp/embedded-pg"
go run scripts/embedded-pg/main.go -path "R:/temp/embedded-pg" -cache "${EMBEDDED_PG_CACHE_DIR}"
elif [ "$RUNNER_OS" == "macOS" ]; then
# Postgres runs faster on a ramdisk on macOS too
mkdir -p /tmp/tmpfs
sudo mount_tmpfs -o noowners -s 8g /tmp/tmpfs
go run scripts/embedded-pg/main.go -path /tmp/tmpfs/embedded-pg -cache "${EMBEDDED_PG_CACHE_DIR}"
elif [ "$RUNNER_OS" == "Linux" ]; then
make test-postgres-docker
fi
mkdir -p "$RUNNER_TEMP/sym"
source scripts/normalize_path.sh
normalize_path_with_symlinks "$RUNNER_TEMP/sym" "$(dirname "$(which terraform)")"

# if macOS, install google-chrome for scaletests
# As another concern, should we really have this kind of external dependency
# requirement on standard CI?
if [ "${RUNNER_OS}" == "macOS" ]; then
brew install google-chrome
fi
- name: Setup RAM disk for Embedded Postgres (Windows)
if: runner.os == 'Windows'
shell: bash
# The default C: drive is extremely slow:
# https://github.com/actions/runner-images/issues/8755
run: mkdir -p "R:/temp/embedded-pg"

# macOS will output "The default interactive shell is now zsh"
# intermittently in CI...
if [ "${RUNNER_OS}" == "macOS" ]; then
touch ~/.bash_profile && echo "export BASH_SILENCE_DEPRECATION_WARNING=1" >> ~/.bash_profile
fi
- name: Setup RAM disk for Embedded Postgres (macOS)
if: runner.os == 'macOS'
shell: bash
run: |
# Postgres runs faster on a ramdisk on macOS.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Have we verified this recently? I'd especially be interested in adjusting PostgreSQL settings to see if we can alleviate it rather than using ramdisk. We simply need to increase RAM retention for PG on macOS and it should be more efficient than placing both storage and cache in RAM.

Guessing this applies to Windows as well.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not changing anything about this right now. We can follow this PR with some PG changes.

mkdir -p /tmp/tmpfs
sudo mount_tmpfs -o noowners -s 8g /tmp/tmpfs

if [ "${RUNNER_OS}" == "Windows" ]; then
# Our Windows runners have 16 cores.
# On Windows Postgres chokes up when we have 16x16=256 tests
# running in parallel, and dbtestutil.NewDB starts to take more than
# 10s to complete sometimes causing test timeouts. With 16x8=128 tests
# Postgres tends not to choke.
export TEST_NUM_PARALLEL_PACKAGES=8
export TEST_NUM_PARALLEL_TESTS=16
# Only the CLI and Agent are officially supported on Windows and the rest are too flaky
export TEST_PACKAGES="./cli/... ./enterprise/cli/... ./agent/..."
elif [ "${RUNNER_OS}" == "macOS" ]; then
# Our macOS runners have 8 cores. We set NUM_PARALLEL_TESTS to 16
# because the tests complete faster and Postgres doesn't choke. It seems
# that macOS's tmpfs is faster than the one on Windows.
export TEST_NUM_PARALLEL_PACKAGES=8
export TEST_NUM_PARALLEL_TESTS=16
# Only the CLI and Agent are officially supported on macOS and the rest are too flaky
export TEST_PACKAGES="./cli/... ./enterprise/cli/... ./agent/..."
elif [ "${RUNNER_OS}" == "Linux" ]; then
# Our Linux runners have 8 cores.
export TEST_NUM_PARALLEL_PACKAGES=8
export TEST_NUM_PARALLEL_TESTS=8
fi
# Install google-chrome for scaletests.
# As another concern, should we really have this kind of external dependency
# requirement on standard CI?
brew install google-chrome

# by default, run tests with cache
if [ "${GITHUB_REF}" == "refs/heads/main" ]; then
# on main, run tests without cache
export TEST_COUNT="1"
fi
# macOS will output "The default interactive shell is now zsh" intermittently in CI.
touch ~/.bash_profile && echo "export BASH_SILENCE_DEPRECATION_WARNING=1" >> ~/.bash_profile

mkdir -p "$RUNNER_TEMP/sym"
source scripts/normalize_path.sh
# terraform gets installed in a random directory, so we need to normalize
# the path to the terraform binary or a bunch of cached tests will be
# invalidated. See scripts/normalize_path.sh for more details.
normalize_path_with_symlinks "$RUNNER_TEMP/sym" "$(dirname "$(which terraform)")"
- name: Test with PostgreSQL Database (Linux)
if: runner.os == 'Linux'
uses: ./.github/actions/test-go-pg
with:
postgres-version: "13"
# Our Linux runners have 16 cores.
test-parallelism-packages: "16"
test-parallelism-tests: "8"
# By default, run tests with cache for improved speed (possibly at the expense of correctness).
# On main, run tests without cache for the inverse.
test-count: ${{ github.ref == 'refs/heads/main' && '1' || '' }}

make test
- name: Test with PostgreSQL Database (macOS)
if: runner.os == 'macOS'
uses: ./.github/actions/test-go-pg
with:
postgres-version: "13"
# Our macOS runners have 8 cores.
# Even though this parallelism seems high, we've observed relatively low flakiness in the past.
# See https://github.com/coder/coder/pull/21091#discussion_r2609891540.
test-parallelism-packages: "8"
test-parallelism-tests: "16"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Linux has 16 cores and 16x8 parallelism, but macOS has 8 cores and 8x16 parallelism --- seems wrong, since in both cases you can have 128 tests running concurrently.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We can't scale MacOS any further, and for Linux I just naïvely doubled the package parallelism since we now have double the CPUs.

It was like this before:

          elif [ "${RUNNER_OS}" == "macOS" ]; then
            # Our macOS runners have 8 cores. We set NUM_PARALLEL_TESTS to 16
            # because the tests complete faster and Postgres doesn't choke. It seems
            # that macOS's tmpfs is faster than the one on Windows.
            export TEST_NUM_PARALLEL_PACKAGES=8
            export TEST_NUM_PARALLEL_TESTS=16
            # Only the CLI and Agent are officially supported on macOS and the rest are too flaky
            export TEST_PACKAGES="./cli/... ./enterprise/cli/... ./agent/..."
          elif [ "${RUNNER_OS}" == "Linux" ]; then
            # Our Linux runners have 8 cores.
            export TEST_NUM_PARALLEL_PACKAGES=8
            export TEST_NUM_PARALLEL_TESTS=8
          fi

Are you suggesting I bump test-parallelism-tests to 16 for Linux as well? i.e. 256 parallelism.
That would be quadruple what we had before (8*8), where I was attempting to keep the resources to parallelism scaling linear.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If anything I'd cut the macOS parallelism. If you leave it as is, then maybe a comment explaining that the numbers were kinda determined empirically where things don't break horribly.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

https://github.com/coder/coder/blob/main/.github/workflows/ci.yaml#L467-L472

I haven't change the parallelism (sorry, it hard to track changes because of the reorganisation); if you're aware that it's the same, why do you want to cut parallelism?

Image

According to this, it seems kinda OK?

Copy link
Contributor

@spikecurtis spikecurtis Dec 11, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ideally we'd have some theoretical consistency --- a model of parallelism that maps to CPU cores.

Cutting macOS parallelism would align with that consistent model and be easier to reason about. I guess upping Linux parallelism would also be consistent, but I'm gun shy about increasing things and potentially causing more flakes.

In the absence of consistency, we can just document what we've observed and 🤷

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Cool, documented in 983515f 👍

Ideally we'd have some theoretical consistency --- a model of parallelism that maps to CPU cores.

Let's link up next week to try reason through this and develop a heuristic which will set the parallelism automatically and consistently across all these different platforms & jobs?

# By default, run tests with cache for improved speed (possibly at the expense of correctness).
# On main, run tests without cache for the inverse.
test-count: ${{ github.ref == 'refs/heads/main' && '1' || '' }}
# Only the CLI and Agent are officially supported on macOS; the rest are too flaky.
test-packages: "./cli/... ./enterprise/cli/... ./agent/..."
embedded-pg-path: "/tmp/tmpfs/embedded-pg"
embedded-pg-cache: ${{ steps.embedded-pg-cache.outputs.embedded-pg-cache }}

- name: Test with PostgreSQL Database (Windows)
if: runner.os == 'Windows'
uses: ./.github/actions/test-go-pg
with:
postgres-version: "13"
# Our Windows runners have 32 cores.
test-parallelism-packages: "32"
test-parallelism-tests: "16"
# By default, run tests with cache for improved speed (possibly at the expense of correctness).
# On main, run tests without cache for the inverse.
test-count: ${{ github.ref == 'refs/heads/main' && '1' || '' }}
# Only the CLI and Agent are officially supported on Windows; the rest are too flaky.
test-packages: "./cli/... ./enterprise/cli/... ./agent/..."
embedded-pg-path: "R:/temp/embedded-pg"
embedded-pg-cache: ${{ steps.embedded-pg-cache.outputs.embedded-pg-cache }}

- name: Upload failed test db dumps
uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0
Expand Down Expand Up @@ -521,7 +523,7 @@ jobs:
api-key: ${{ secrets.DATADOG_API_KEY }}

test-go-pg-17:
runs-on: ${{ github.repository_owner == 'coder' && 'depot-ubuntu-22.04-8' || 'ubuntu-latest' }}
runs-on: ${{ github.repository_owner == 'coder' && 'depot-ubuntu-22.04-16' || 'ubuntu-latest' }}
needs:
- changes
if: needs.changes.outputs.go == 'true' || needs.changes.outputs.ci == 'true' || github.ref == 'refs/heads/main'
Expand Down Expand Up @@ -554,12 +556,25 @@ jobs:
with:
key-prefix: test-go-pg-17-${{ runner.os }}-${{ runner.arch }}

- name: Test with PostgreSQL Database
env:
POSTGRES_VERSION: "17"
TS_DEBUG_DISCO: "true"
- name: Normalize Terraform Path for Caching
shell: bash
# Terraform gets installed in a random directory, so we need to normalize
# the path or many cached tests will be invalidated.
run: |
make test-postgres
mkdir -p "$RUNNER_TEMP/sym"
source scripts/normalize_path.sh
normalize_path_with_symlinks "$RUNNER_TEMP/sym" "$(dirname "$(which terraform)")"

- name: Test with PostgreSQL Database
uses: ./.github/actions/test-go-pg
with:
postgres-version: "17"
# Our Linux runners have 16 cores.
test-parallelism-packages: "16"
test-parallelism-tests: "8"
# By default, run tests with cache for improved speed (possibly at the expense of correctness).
# On main, run tests without cache for the inverse.
test-count: ${{ github.ref == 'refs/heads/main' && '1' || '' }}

- name: Upload Test Cache
uses: ./.github/actions/test-cache/upload
Expand All @@ -575,7 +590,7 @@ jobs:
api-key: ${{ secrets.DATADOG_API_KEY }}

test-go-race-pg:
runs-on: ${{ github.repository_owner == 'coder' && 'depot-ubuntu-22.04-16' || 'ubuntu-latest' }}
runs-on: ${{ github.repository_owner == 'coder' && 'depot-ubuntu-22.04-32' || 'ubuntu-latest' }}
needs: changes
if: needs.changes.outputs.go == 'true' || needs.changes.outputs.ci == 'true' || github.ref == 'refs/heads/main'
timeout-minutes: 25
Expand Down Expand Up @@ -603,16 +618,28 @@ jobs:
with:
key-prefix: test-go-race-pg-${{ runner.os }}-${{ runner.arch }}

- name: Normalize Terraform Path for Caching
shell: bash
# Terraform gets installed in a random directory, so we need to normalize
# the path or many cached tests will be invalidated.
run: |
mkdir -p "$RUNNER_TEMP/sym"
source scripts/normalize_path.sh
normalize_path_with_symlinks "$RUNNER_TEMP/sym" "$(dirname "$(which terraform)")"

# We run race tests with reduced parallelism because they use more CPU and we were finding
# instances where tests appear to hang for multiple seconds, resulting in flaky tests when
# short timeouts are used.
# c.f. discussion on https://github.com/coder/coder/pull/15106
# Our Linux runners have 32 cores, but we reduce parallelism since race detection adds a lot of overhead.
# We aim to have parallelism match CPU count (8*4=32) to avoid making flakes worse.
- name: Run Tests
env:
POSTGRES_VERSION: "17"
run: |
make test-postgres-docker
gotestsum --junitfile="gotests.xml" --packages="./..." -- -race -parallel 4 -p 4
uses: ./.github/actions/test-go-pg
with:
postgres-version: "17"
test-parallelism-packages: "8"
test-parallelism-tests: "4"
race-detection: "true"

- name: Upload Test Cache
uses: ./.github/actions/test-cache/upload
Expand Down
Loading
Loading