Skip to content

Commit 532a1f3

Browse files
authored
fix(coderd): exclude sub-agents from workspace health calculation (#21098)
1 parent 6aeb144 commit 532a1f3

File tree

2 files changed

+82
-0
lines changed

2 files changed

+82
-0
lines changed

coderd/workspaces.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2598,6 +2598,13 @@ func convertWorkspace(
25982598
failingAgents := []uuid.UUID{}
25992599
for _, resource := range workspaceBuild.Resources {
26002600
for _, agent := range resource.Agents {
2601+
// Sub-agents (e.g., devcontainer agents) are excluded from the
2602+
// workspace health calculation. Their health is managed by
2603+
// their parent agent, and temporary disconnections during
2604+
// devcontainer rebuilds should not affect workspace health.
2605+
if agent.ParentID.Valid {
2606+
continue
2607+
}
26012608
if !agent.Health.Healthy {
26022609
failingAgents = append(failingAgents, agent.ID)
26032610
}

coderd/workspaces_test.go

Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -346,6 +346,81 @@ func TestWorkspace(t *testing.T) {
346346
assert.False(t, agent2.Health.Healthy)
347347
assert.NotEmpty(t, agent2.Health.Reason)
348348
})
349+
350+
t.Run("Sub-agent excluded", func(t *testing.T) {
351+
t.Parallel()
352+
// This test verifies that sub-agents (e.g., devcontainer agents)
353+
// are excluded from the workspace health calculation. When a
354+
// devcontainer is rebuilding, the sub-agent may be temporarily
355+
// disconnected, but this should not make the workspace unhealthy.
356+
client, db := coderdtest.NewWithDatabase(t, &coderdtest.Options{IncludeProvisionerDaemon: true})
357+
user := coderdtest.CreateFirstUser(t, client)
358+
version := coderdtest.CreateTemplateVersion(t, client, user.OrganizationID, &echo.Responses{
359+
Parse: echo.ParseComplete,
360+
ProvisionApply: []*proto.Response{{
361+
Type: &proto.Response_Apply{
362+
Apply: &proto.ApplyComplete{
363+
Resources: []*proto.Resource{{
364+
Name: "some",
365+
Type: "example",
366+
Agents: []*proto.Agent{{
367+
Id: uuid.NewString(),
368+
Name: "parent",
369+
Auth: &proto.Agent_Token{},
370+
}},
371+
}},
372+
},
373+
},
374+
}},
375+
})
376+
coderdtest.AwaitTemplateVersionJobCompleted(t, client, version.ID)
377+
template := coderdtest.CreateTemplate(t, client, user.OrganizationID, version.ID)
378+
workspace := coderdtest.CreateWorkspace(t, client, template.ID)
379+
coderdtest.AwaitWorkspaceBuildJobCompleted(t, client, workspace.LatestBuild.ID)
380+
381+
ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitLong)
382+
defer cancel()
383+
384+
// Get the workspace and parent agent.
385+
workspace, err := client.Workspace(ctx, workspace.ID)
386+
require.NoError(t, err)
387+
parentAgent := workspace.LatestBuild.Resources[0].Agents[0]
388+
require.True(t, parentAgent.Health.Healthy, "parent agent should be healthy initially")
389+
390+
// Create a sub-agent with a short connection timeout so it becomes
391+
// unhealthy quickly (simulating a devcontainer rebuild scenario).
392+
subAgent := dbgen.WorkspaceAgent(t, db, database.WorkspaceAgent{
393+
ParentID: uuid.NullUUID{Valid: true, UUID: parentAgent.ID},
394+
ResourceID: parentAgent.ResourceID,
395+
Name: "subagent",
396+
ConnectionTimeoutSeconds: 1,
397+
})
398+
399+
// Wait for the sub-agent to become unhealthy due to timeout.
400+
var subAgentUnhealthy bool
401+
require.Eventually(t, func() bool {
402+
workspace, err = client.Workspace(ctx, workspace.ID)
403+
if err != nil {
404+
return false
405+
}
406+
for _, res := range workspace.LatestBuild.Resources {
407+
for _, agent := range res.Agents {
408+
if agent.ID == subAgent.ID && !agent.Health.Healthy {
409+
subAgentUnhealthy = true
410+
return true
411+
}
412+
}
413+
}
414+
return false
415+
}, testutil.WaitShort, testutil.IntervalFast, "sub-agent should become unhealthy")
416+
417+
require.True(t, subAgentUnhealthy, "sub-agent should be unhealthy")
418+
419+
// Verify that the workspace is still healthy because sub-agents
420+
// are excluded from the health calculation.
421+
assert.True(t, workspace.Health.Healthy, "workspace should be healthy despite unhealthy sub-agent")
422+
assert.Empty(t, workspace.Health.FailingAgents, "failing agents should not include sub-agent")
423+
})
349424
})
350425

351426
t.Run("Archived", func(t *testing.T) {

0 commit comments

Comments
 (0)