Skip to content

fix: don't create autostart workspace builds with no available provisioners #19067

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 56 additions & 0 deletions coderd/autobuild/lifecycle_executor.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import (
"strings"
"sync"
"sync/atomic"
"testing"
"time"

"github.com/dustin/go-humanize"
Expand All @@ -28,11 +29,14 @@ import (
"github.com/coder/coder/v2/coderd/database/provisionerjobs"
"github.com/coder/coder/v2/coderd/database/pubsub"
"github.com/coder/coder/v2/coderd/notifications"
"github.com/coder/coder/v2/coderd/provisionerdserver"
"github.com/coder/coder/v2/coderd/schedule"
"github.com/coder/coder/v2/coderd/wsbuilder"
"github.com/coder/coder/v2/codersdk"
)

const TestingStaleInterval = time.Second * 5

// Executor automatically starts or stops workspaces.
type Executor struct {
ctx context.Context
Expand All @@ -52,6 +56,9 @@ type Executor struct {
experiments codersdk.Experiments

metrics executorMetrics

// Skip provisioner availability check (should only be true for *some* tests)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It says it should only be true for some tests, but it seems it's true for almost every test using coderdtest. How many tests are actually broken by this change? If it's only a few, I'd rather we just fix those tests (e.g. by adding provisioners) rather than add a skip flag.

SkipProvisionerCheck bool
}

type executorMetrics struct {
Expand Down Expand Up @@ -131,6 +138,39 @@ func (e *Executor) Run() {
}()
}

func (e *Executor) hasAvailableProvisioners(ctx context.Context, tx database.Store, ws database.Workspace, templateVersionJob database.ProvisionerJob) (bool, error) {
if e.SkipProvisionerCheck {
return true, nil
}

// Use a shorter stale interval for tests
staleInterval := provisionerdserver.StaleInterval
if testing.Testing() {
staleInterval = TestingStaleInterval
}
Comment on lines +148 to +150
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If you use the passed in time from runOnce I don't think changing the interval would be necessary, you could write the test in a way that the default interval works fine


// Get eligible provisioner daemons for this workspace's template
provisionerDaemons, err := tx.GetProvisionerDaemonsByOrganization(ctx, database.GetProvisionerDaemonsByOrganizationParams{
OrganizationID: ws.OrganizationID,
WantTags: templateVersionJob.Tags,
})
if err != nil {
return false, xerrors.Errorf("get provisioner daemons: %w", err)
}

// Check if any provisioners are active (not stale)
now := dbtime.Now()
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should be using the passed in time from runOnce rather than getting it's own time

for _, pd := range provisionerDaemons {
if pd.LastSeenAt.Valid {
age := now.Sub(pd.LastSeenAt.Time)
if age <= staleInterval {
return true, nil
}
}
}
return false, nil
}

func (e *Executor) runOnce(t time.Time) Stats {
stats := Stats{
Transitions: make(map[uuid.UUID]database.WorkspaceTransition),
Expand Down Expand Up @@ -280,6 +320,22 @@ func (e *Executor) runOnce(t time.Time) Stats {
return nil
}

// Get the template version job to access tags
templateVersionJob, err := tx.GetProvisionerJobByID(e.ctx, activeTemplateVersion.JobID)
if err != nil {
return xerrors.Errorf("get template version job: %w", err)
}

// Before creating the workspace build, check for available provisioners
hasProvisioners, err := e.hasAvailableProvisioners(e.ctx, tx, ws, templateVersionJob)
if err != nil {
return xerrors.Errorf("check provisioner availability: %w", err)
}
if !hasProvisioners {
log.Warn(e.ctx, "skipping autostart - no available provisioners")
return nil // Skip this workspace
}

if nextTransition != "" {
builder := wsbuilder.New(ws, nextTransition, *e.buildUsageChecker.Load()).
SetLastWorkspaceBuildInTx(&latestBuild).
Expand Down
77 changes: 77 additions & 0 deletions coderd/autobuild/lifecycle_executor_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1583,3 +1583,80 @@ func mustWorkspaceParameters(t *testing.T, client *codersdk.Client, workspaceID
func TestMain(m *testing.M) {
goleak.VerifyTestMain(m, testutil.GoleakOptions...)
}

func TestExecutorAutostartSkipsWhenNoProvisionersAvailable(t *testing.T) {
t.Parallel()

db, ps := dbtestutil.NewDB(t)
var (
sched = mustSchedule(t, "CRON_TZ=UTC 0 * * * *")
tickCh = make(chan time.Time)
statsCh = make(chan autobuild.Stats)
)

// Create client with provisioner closer
client, provisionerCloser := coderdtest.NewWithProvisionerCloser(t, &coderdtest.Options{
AutobuildTicker: tickCh,
IncludeProvisionerDaemon: true,
AutobuildStats: statsCh,
Database: db,
Pubsub: ps,
SkipProvisionerCheck: ptr.Ref(false),
})

// Create workspace with autostart enabled
workspace := mustProvisionWorkspace(t, client, func(cwr *codersdk.CreateWorkspaceRequest) {
cwr.AutostartSchedule = ptr.Ref(sched.String())
})

// Stop the workspace while provisioner is available
workspace = coderdtest.MustTransitionWorkspace(t, client, workspace.ID,
codersdk.WorkspaceTransitionStart, codersdk.WorkspaceTransitionStop)

// Wait for provisioner to be registered
ctx := testutil.Context(t, testutil.WaitShort)
require.Eventually(t, func() bool {
daemons, err := db.GetProvisionerDaemons(ctx)
return err == nil && len(daemons) > 0
}, testutil.WaitShort, testutil.IntervalFast)

// Now shut down the provisioner daemon
err := provisionerCloser.Close()
require.NoError(t, err)

// Debug: check what's in the database
daemons, err := db.GetProvisionerDaemons(ctx)
require.NoError(t, err)
t.Logf("After close: found %d daemons", len(daemons))
for i, daemon := range daemons {
t.Logf("Daemon %d: ID=%s, Name=%s, LastSeen=%v", i, daemon.ID, daemon.Name, daemon.LastSeenAt)
}

// Wait for provisioner to become stale (LastSeenAt > StaleInterval ago)
require.Eventually(t, func() bool {
daemons, err := db.GetProvisionerDaemons(ctx)
if err != nil || len(daemons) == 0 {
return false
}

now := time.Now()
for _, daemon := range daemons {
if daemon.LastSeenAt.Valid {
age := now.Sub(daemon.LastSeenAt.Time)
if age > autobuild.TestingStaleInterval {
return true // Provisioner is now stale
}
}
}
return false
}, testutil.WaitLong, testutil.IntervalMedium) // Use longer timeout since we need to wait for staleness

// Trigger autobuild
go func() {
tickCh <- sched.Next(workspace.LatestBuild.CreatedAt)
close(tickCh)
}()

stats := <-statsCh
assert.Len(t, stats.Transitions, 0, "should not create builds when no provisioners available")
}
8 changes: 8 additions & 0 deletions coderd/coderdtest/coderdtest.go
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,7 @@ type Options struct {
OIDCConvertKeyCache cryptokeys.SigningKeycache
Clock quartz.Clock
TelemetryReporter telemetry.Reporter
SkipProvisionerCheck *bool // Pointer so we can detect if it's set
}

// New constructs a codersdk client connected to an in-memory API instance.
Expand Down Expand Up @@ -386,6 +387,13 @@ func NewOptions(t testing.TB, options *Options) (func(http.Handler), context.Can
options.NotificationsEnqueuer,
experiments,
).WithStatsChannel(options.AutobuildStats)

skipProvisionerCheck := true
if options.SkipProvisionerCheck != nil {
skipProvisionerCheck = *options.SkipProvisionerCheck
}
lifecycleExecutor.SkipProvisionerCheck = skipProvisionerCheck

lifecycleExecutor.Run()

jobReaperTicker := time.NewTicker(options.DeploymentValues.JobReaperDetectorInterval.Value())
Expand Down