Skip to content

fix: don't create autostart workspace builds with no available provisioners #19067

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 5 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 49 additions & 0 deletions coderd/autobuild/lifecycle_executor.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,11 +28,14 @@ import (
"github.com/coder/coder/v2/coderd/database/provisionerjobs"
"github.com/coder/coder/v2/coderd/database/pubsub"
"github.com/coder/coder/v2/coderd/notifications"
"github.com/coder/coder/v2/coderd/provisionerdserver"
"github.com/coder/coder/v2/coderd/schedule"
"github.com/coder/coder/v2/coderd/wsbuilder"
"github.com/coder/coder/v2/codersdk"
)

const TestingStaleInterval = time.Second * 5

// Executor automatically starts or stops workspaces.
type Executor struct {
ctx context.Context
Expand Down Expand Up @@ -131,6 +134,36 @@ func (e *Executor) Run() {
}()
}

func (e *Executor) hasAvailableProvisioners(ctx context.Context, tx database.Store, t time.Time, ws database.Workspace, templateVersionJob database.ProvisionerJob) (bool, error) {
queryParams := database.GetProvisionerDaemonsByOrganizationParams{
OrganizationID: ws.OrganizationID,
WantTags: templateVersionJob.Tags,
}

// nolint: gocritic // The user (in this case, the user/context for autostart builds) may not have the full
// permissions to read provisioner daemons, but we need to check if there's any for the job prior to the
// execution of the job via autostart to fix: https://github.com/coder/coder/issues/17941
provisionerDaemons, err := tx.GetProvisionerDaemonsByOrganization(dbauthz.AsSystemReadProvisionerDaemons(ctx), queryParams)
if err != nil {
return false, xerrors.Errorf("get provisioner daemons: %w", err)
}

// Check if any provisioners are active (not stale)
for _, pd := range provisionerDaemons {
if pd.LastSeenAt.Valid {
age := t.Sub(pd.LastSeenAt.Time)
if age <= provisionerdserver.StaleInterval {
e.log.Debug(ctx, "hasAvailableProvisioners: found active provisioner",
slog.F("daemon_id", pd.ID),
)
return true, nil
}
}
}
e.log.Debug(ctx, "hasAvailableProvisioners: no active provisioners found")
return false, nil
}

func (e *Executor) runOnce(t time.Time) Stats {
stats := Stats{
Transitions: make(map[uuid.UUID]database.WorkspaceTransition),
Expand Down Expand Up @@ -280,6 +313,22 @@ func (e *Executor) runOnce(t time.Time) Stats {
return nil
}

// Get the template version job to access tags
templateVersionJob, err := tx.GetProvisionerJobByID(e.ctx, activeTemplateVersion.JobID)
if err != nil {
return xerrors.Errorf("get template version job: %w", err)
}

// Before creating the workspace build, check for available provisioners
hasProvisioners, err := e.hasAvailableProvisioners(e.ctx, tx, t, ws, templateVersionJob)
if err != nil {
return xerrors.Errorf("check provisioner availability: %w", err)
}
if !hasProvisioners {
log.Warn(e.ctx, "skipping autostart - no available provisioners")
return nil // Skip this workspace
}

if nextTransition != "" {
builder := wsbuilder.New(ws, nextTransition, *e.buildUsageChecker.Load()).
SetLastWorkspaceBuildInTx(&latestBuild).
Expand Down
262 changes: 262 additions & 0 deletions coderd/autobuild/lifecycle_executor_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -676,6 +676,8 @@ func TestExecuteAutostopSuspendedUser(t *testing.T) {
)

admin := coderdtest.CreateFirstUser(t, client)
// Wait for provisioner to be available
mustWaitForProvisionersWithClient(t, client)
version := coderdtest.CreateTemplateVersion(t, client, admin.OrganizationID, nil)
coderdtest.AwaitTemplateVersionJobCompleted(t, client, version.ID)
template := coderdtest.CreateTemplate(t, client, admin.OrganizationID, version.ID)
Expand Down Expand Up @@ -976,6 +978,8 @@ func TestExecutorRequireActiveVersion(t *testing.T) {
TemplateScheduleStore: schedule.NewAGPLTemplateScheduleStore(),
})
)
// Wait for provisioner to be available
mustWaitForProvisionersWithClient(t, ownerClient)
ctx := testutil.Context(t, testutil.WaitShort)
owner := coderdtest.CreateFirstUser(t, ownerClient)
me, err := ownerClient.User(ctx, codersdk.Me)
Expand Down Expand Up @@ -1543,6 +1547,25 @@ func mustProvisionWorkspace(t *testing.T, client *codersdk.Client, mut ...func(*
return coderdtest.MustWorkspace(t, client, ws.ID)
}

// mustProvisionWorkspaceWithProvisionerTags creates a workspace with a template version that has specific provisioner tags
func mustProvisionWorkspaceWithProvisionerTags(t *testing.T, client *codersdk.Client, provisionerTags map[string]string, mut ...func(*codersdk.CreateWorkspaceRequest)) codersdk.Workspace {
t.Helper()
user := coderdtest.CreateFirstUser(t, client)

// Create template version with specific provisioner tags
version := coderdtest.CreateTemplateVersion(t, client, user.OrganizationID, nil, func(request *codersdk.CreateTemplateVersionRequest) {
request.ProvisionerTags = provisionerTags
})
coderdtest.AwaitTemplateVersionJobCompleted(t, client, version.ID)
t.Logf("template version %s job has completed with provisioner tags %v", version.ID, provisionerTags)

template := coderdtest.CreateTemplate(t, client, user.OrganizationID, version.ID)

ws := coderdtest.CreateWorkspace(t, client, template.ID, mut...)
coderdtest.AwaitWorkspaceBuildJobCompleted(t, client, ws.LatestBuild.ID)
return coderdtest.MustWorkspace(t, client, ws.ID)
}

func mustProvisionWorkspaceWithParameters(t *testing.T, client *codersdk.Client, richParameters []*proto.RichParameter, mut ...func(*codersdk.CreateWorkspaceRequest)) codersdk.Workspace {
t.Helper()
user := coderdtest.CreateFirstUser(t, client)
Expand Down Expand Up @@ -1580,6 +1603,245 @@ func mustWorkspaceParameters(t *testing.T, client *codersdk.Client, workspaceID
require.NotEmpty(t, buildParameters)
}

func mustWaitForProvisioners(t *testing.T, db database.Store) {
t.Helper()
ctx := testutil.Context(t, testutil.WaitShort)
require.Eventually(t, func() bool {
daemons, err := db.GetProvisionerDaemons(ctx)
return err == nil && len(daemons) > 0
}, testutil.WaitShort, testutil.IntervalFast)
}

func mustWaitForProvisionersWithClient(t *testing.T, client *codersdk.Client) {
t.Helper()
ctx := testutil.Context(t, testutil.WaitShort)
require.Eventually(t, func() bool {
daemons, err := client.ProvisionerDaemons(ctx)
return err == nil && len(daemons) > 0
}, testutil.WaitShort, testutil.IntervalFast)
}

// mustWaitForProvisionersAvailable waits for provisioners to be available for a specific workspace
func mustWaitForProvisionersAvailable(t *testing.T, db database.Store, workspace codersdk.Workspace) {
t.Helper()
ctx := testutil.Context(t, testutil.WaitShort)

// Get the workspace from the database
require.Eventually(t, func() bool {
ws, err := db.GetWorkspaceByID(ctx, workspace.ID)
if err != nil {
return false
}

// Get the latest build
latestBuild, err := db.GetWorkspaceBuildByID(ctx, workspace.LatestBuild.ID)
if err != nil {
return false
}

// Get the template version job
templateVersionJob, err := db.GetProvisionerJobByID(ctx, latestBuild.JobID)
if err != nil {
return false
}

// Check if provisioners are available using the same logic as hasAvailableProvisioners
provisionerDaemons, err := db.GetProvisionerDaemonsByOrganization(ctx, database.GetProvisionerDaemonsByOrganizationParams{
OrganizationID: ws.OrganizationID,
WantTags: templateVersionJob.Tags,
})
if err != nil {
return false
}

// Check if any provisioners are active (not stale)
now := time.Now()
for _, pd := range provisionerDaemons {
if pd.LastSeenAt.Valid {
age := now.Sub(pd.LastSeenAt.Time)
if age <= autobuild.TestingStaleInterval {
return true // Found an active provisioner
}
}
}
return false // No active provisioners found
}, testutil.WaitShort, testutil.IntervalFast)
}

func TestMain(m *testing.M) {
goleak.VerifyTestMain(m, testutil.GoleakOptions...)
}

func TestExecutorAutostartSkipsWhenNoProvisionersAvailable(t *testing.T) {
t.Parallel()

db, ps := dbtestutil.NewDB(t)
var (
sched = mustSchedule(t, "CRON_TZ=UTC 0 * * * *")
tickCh = make(chan time.Time)
statsCh = make(chan autobuild.Stats)
)

// Create client with provisioner closer
provisionerDaemonTags := map[string]string{"owner": "testowner", "scope": "organization"}
t.Logf("Setting provisioner daemon tags: %v", provisionerDaemonTags)
client, provisionerCloser := coderdtest.NewWithProvisionerCloser(t, &coderdtest.Options{
AutobuildTicker: tickCh,
IncludeProvisionerDaemon: true,
AutobuildStats: statsCh,
Database: db,
Pubsub: ps,
ProvisionerDaemonTags: provisionerDaemonTags,
})

// Create workspace with autostart enabled and matching provisioner tags
provisionerTags := map[string]string{"owner": "testowner", "scope": "organization"}
workspace := mustProvisionWorkspaceWithProvisionerTags(t, client, provisionerTags, func(cwr *codersdk.CreateWorkspaceRequest) {
cwr.AutostartSchedule = ptr.Ref(sched.String())
})

// Stop the workspace while provisioner is available
workspace = coderdtest.MustTransitionWorkspace(t, client, workspace.ID,
codersdk.WorkspaceTransitionStart, codersdk.WorkspaceTransitionStop)

// Wait for provisioner to be registered
mustWaitForProvisioners(t, db)

// Wait for provisioner to be available for this specific workspace
mustWaitForProvisionersAvailable(t, db, workspace)

// Now shut down the provisioner daemon
ctx := testutil.Context(t, testutil.WaitShort)
err := provisionerCloser.Close()
require.NoError(t, err)

// Wait for the provisioner to become stale (heartbeat stops)
// The stale interval is 5 seconds, so we need to wait a bit longer
time.Sleep(6 * time.Second)

// Debug: check what's in the database
daemons, err := db.GetProvisionerDaemons(ctx)
require.NoError(t, err)
t.Logf("After close: found %d daemons", len(daemons))
for i, daemon := range daemons {
t.Logf("Daemon %d: ID=%s, Name=%s, LastSeen=%v", i, daemon.ID, daemon.Name, daemon.LastSeenAt)
}

// Since we commented out the close call, the provisioner should NOT become stale
// Let's wait a bit but NOT longer than the stale interval
time.Sleep(2 * time.Second) // Wait less than the 5-second stale interval

daemons, err = db.GetProvisionerDaemons(ctx)
require.NoError(t, err)
require.NotEmpty(t, daemons, "should have provisioner daemons")

now := time.Now()
for _, daemon := range daemons {
if daemon.LastSeenAt.Valid {
age := now.Sub(daemon.LastSeenAt.Time)
t.Logf("Daemon %s: age=%v, staleInterval=%v, isStale=%v", daemon.Name, age, autobuild.TestingStaleInterval, age > autobuild.TestingStaleInterval)
// Since we closed the provisioner, it should be stale
require.True(t, age > autobuild.TestingStaleInterval, "provisioner should be stale since we closed it")
}
}

// Debug: Check the template version job tags and organization
templateVersion, err := client.TemplateVersion(context.Background(), workspace.LatestBuild.TemplateVersionID)
require.NoError(t, err)
t.Logf("Template version job ID: %s", templateVersion.Job.ID)
t.Logf("Template version job tags: %v", templateVersion.Job.Tags)
t.Logf("Workspace organization ID: %s", workspace.OrganizationID)
t.Logf("Template version organization ID: %s", templateVersion.OrganizationID)
t.Logf("Workspace LatestBuild.TemplateVersionID: %s", workspace.LatestBuild.TemplateVersionID)

// Debug: Get the template version job directly from database to compare
templateVersionJobFromDB, err := db.GetProvisionerJobByID(context.Background(), templateVersion.Job.ID)
require.NoError(t, err)
t.Logf("Template version job from DB - ID: %s, Tags: %v", templateVersionJobFromDB.ID, templateVersionJobFromDB.Tags)

// Debug: Query database directly to see what provisioner daemons exist
allDaemons, err := db.GetProvisionerDaemons(context.Background())
require.NoError(t, err)
t.Logf("Total provisioner daemons in database: %d", len(allDaemons))
for i, daemon := range allDaemons {
t.Logf("Daemon %d: ID=%s, Name=%s, OrgID=%s, Tags=%v", i, daemon.ID, daemon.Name, daemon.OrganizationID, daemon.Tags)
}

// Debug: Test if we can query using the ACTUAL provisioner daemon tags
if len(allDaemons) > 0 {
actualDaemonTags := allDaemons[0].Tags
t.Logf("Testing query with ACTUAL daemon tags: %v", actualDaemonTags)
queryWithActualTags := database.GetProvisionerDaemonsByOrganizationParams{
OrganizationID: workspace.OrganizationID,
WantTags: actualDaemonTags,
}
matchingWithActualTags, err := db.GetProvisionerDaemonsByOrganization(context.Background(), queryWithActualTags)
require.NoError(t, err)
t.Logf("Query with actual daemon tags returns: %d daemons", len(matchingWithActualTags))
}

// Debug: Test the exact query that hasAvailableProvisioners uses
queryParams := database.GetProvisionerDaemonsByOrganizationParams{
OrganizationID: workspace.OrganizationID,
WantTags: templateVersion.Job.Tags,
}
t.Logf("Query params: OrgID=%s, WantTags=%v", queryParams.OrganizationID, queryParams.WantTags)
t.Logf("Test query detailed params: org_id_type=%T, org_id_value=%s, want_tags_type=%T, want_tags_value=%v",
queryParams.OrganizationID, queryParams.OrganizationID, queryParams.WantTags, queryParams.WantTags)

// Test 1: Transaction isolation - try with different transaction contexts
t.Logf("=== Testing transaction isolation ===")

// Query with context.Background() (what we used above)
matchingDaemons1, err := db.GetProvisionerDaemonsByOrganization(context.Background(), queryParams)
require.NoError(t, err)
t.Logf("With context.Background(): %d daemons", len(matchingDaemons1))

// Query with a fresh context
ctx2 := context.Background()
matchingDaemons2, err2 := db.GetProvisionerDaemonsByOrganization(ctx2, queryParams)
require.NoError(t, err2)
t.Logf("With fresh context: %d daemons", len(matchingDaemons2))

// Query within a transaction (like hasAvailableProvisioners might use)
err = db.InTx(func(tx database.Store) error {
matchingDaemons3, err := tx.GetProvisionerDaemonsByOrganization(ctx2, queryParams)
if err != nil {
return err
}
t.Logf("Within transaction: %d daemons", len(matchingDaemons3))
return nil
}, nil)
require.NoError(t, err)

// Test 2: Timing issue - query right before and after hasAvailableProvisioners
t.Logf("=== Testing timing issue ===")

// Query right before the autostart trigger
beforeDaemons, err := db.GetProvisionerDaemonsByOrganization(context.Background(), queryParams)
require.NoError(t, err)
t.Logf("Right before autostart: %d daemons", len(beforeDaemons))

// Since we commented out the close call, the provisioner is still active
// This means autobuild should proceed (not be skipped) and create transitions
// The test expects 0 transitions (skipped autostart), but with an active provisioner,
// it should get >0 transitions, causing the test to FAIL as intended

// Trigger autobuild
go func() {
tickCh <- sched.Next(workspace.LatestBuild.CreatedAt)
close(tickCh)
}()

stats := <-statsCh

// Query right after hasAvailableProvisioners was called
afterDaemons, err := db.GetProvisionerDaemonsByOrganization(context.Background(), queryParams)
require.NoError(t, err)
t.Logf("Right after autostart: %d daemons", len(afterDaemons))

// This assertion should FAIL when provisioner is available (demonstrating the fix works)
// When provisioner close is commented out: provisioner available → autostart proceeds → transitions > 0 → test fails ✓
// When provisioner close is active: provisioner unavailable → autostart skipped → transitions = 0 → test passes ✓
assert.Len(t, stats.Transitions, 0, "should not create builds when no provisioners available")
}
1 change: 1 addition & 0 deletions coderd/coderdtest/coderdtest.go
Original file line number Diff line number Diff line change
Expand Up @@ -386,6 +386,7 @@ func NewOptions(t testing.TB, options *Options) (func(http.Handler), context.Can
options.NotificationsEnqueuer,
experiments,
).WithStatsChannel(options.AutobuildStats)

lifecycleExecutor.Run()

jobReaperTicker := time.NewTicker(options.DeploymentValues.JobReaperDetectorInterval.Value())
Expand Down
Loading