Skip to content

Commit 3e94111

Browse files
committed
fix(coderd): exclude sub-agents from workspace health calculation
Previously, when a devcontainer was rebuilding, its sub-agent would become temporarily disconnected, causing the workspace to show as unhealthy. This was confusing because the workspace itself was fine. Sub-agents (agents with a ParentID) are now excluded from the workspace health calculation. Their health is managed by their parent agent, and temporary disconnections during devcontainer rebuilds should not affect the overall workspace health status.
1 parent 6aeb144 commit 3e94111

File tree

2 files changed

+83
-0
lines changed

2 files changed

+83
-0
lines changed

coderd/workspaces.go

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -2598,6 +2598,13 @@ func convertWorkspace(
25982598
failingAgents := []uuid.UUID{}
25992599
for _, resource := range workspaceBuild.Resources {
26002600
for _, agent := range resource.Agents {
2601+
// Sub-agents (e.g., devcontainer agents) are excluded from the
2602+
// workspace health calculation. Their health is managed by
2603+
// their parent agent, and temporary disconnections during
2604+
// devcontainer rebuilds should not affect workspace health.
2605+
if agent.ParentID.Valid {
2606+
continue
2607+
}
26012608
if !agent.Health.Healthy {
26022609
failingAgents = append(failingAgents, agent.ID)
26032610
}

coderd/workspaces_test.go

Lines changed: 76 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -346,6 +346,82 @@ func TestWorkspace(t *testing.T) {
346346
assert.False(t, agent2.Health.Healthy)
347347
assert.NotEmpty(t, agent2.Health.Reason)
348348
})
349+
350+
t.Run("Sub-agent excluded", func(t *testing.T) {
351+
t.Parallel()
352+
// This test verifies that sub-agents (e.g., devcontainer agents)
353+
// are excluded from the workspace health calculation. When a
354+
// devcontainer is rebuilding, the sub-agent may be temporarily
355+
// disconnected, but this should not make the workspace unhealthy.
356+
client, db := coderdtest.NewWithDatabase(t, &coderdtest.Options{IncludeProvisionerDaemon: true})
357+
user := coderdtest.CreateFirstUser(t, client)
358+
version := coderdtest.CreateTemplateVersion(t, client, user.OrganizationID, &echo.Responses{
359+
Parse: echo.ParseComplete,
360+
ProvisionApply: []*proto.Response{{
361+
Type: &proto.Response_Apply{
362+
Apply: &proto.ApplyComplete{
363+
Resources: []*proto.Resource{{
364+
Name: "some",
365+
Type: "example",
366+
Agents: []*proto.Agent{{
367+
Id: uuid.NewString(),
368+
Name: "parent",
369+
Auth: &proto.Agent_Token{},
370+
}},
371+
}},
372+
},
373+
},
374+
}},
375+
})
376+
coderdtest.AwaitTemplateVersionJobCompleted(t, client, version.ID)
377+
template := coderdtest.CreateTemplate(t, client, user.OrganizationID, version.ID)
378+
workspace := coderdtest.CreateWorkspace(t, client, template.ID)
379+
coderdtest.AwaitWorkspaceBuildJobCompleted(t, client, workspace.LatestBuild.ID)
380+
381+
ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitLong)
382+
defer cancel()
383+
384+
// Get the workspace and parent agent.
385+
workspace, err := client.Workspace(ctx, workspace.ID)
386+
require.NoError(t, err)
387+
parentAgent := workspace.LatestBuild.Resources[0].Agents[0]
388+
require.True(t, parentAgent.Health.Healthy, "parent agent should be healthy initially")
389+
390+
// Create a sub-agent with a short connection timeout so it becomes
391+
// unhealthy quickly (simulating a devcontainer rebuild scenario).
392+
//nolint:gocritic // This is a test, we need to insert a sub-agent directly.
393+
subAgent := dbgen.WorkspaceAgent(t, db, database.WorkspaceAgent{
394+
ParentID: uuid.NullUUID{Valid: true, UUID: parentAgent.ID},
395+
ResourceID: parentAgent.ResourceID,
396+
Name: "subagent",
397+
ConnectionTimeoutSeconds: 1,
398+
})
399+
400+
// Wait for the sub-agent to become unhealthy due to timeout.
401+
var subAgentUnhealthy bool
402+
require.Eventually(t, func() bool {
403+
workspace, err = client.Workspace(ctx, workspace.ID)
404+
if err != nil {
405+
return false
406+
}
407+
for _, res := range workspace.LatestBuild.Resources {
408+
for _, agent := range res.Agents {
409+
if agent.ID == subAgent.ID && !agent.Health.Healthy {
410+
subAgentUnhealthy = true
411+
return true
412+
}
413+
}
414+
}
415+
return false
416+
}, testutil.WaitShort, testutil.IntervalFast, "sub-agent should become unhealthy")
417+
418+
require.True(t, subAgentUnhealthy, "sub-agent should be unhealthy")
419+
420+
// Verify that the workspace is still healthy because sub-agents
421+
// are excluded from the health calculation.
422+
assert.True(t, workspace.Health.Healthy, "workspace should be healthy despite unhealthy sub-agent")
423+
assert.Empty(t, workspace.Health.FailingAgents, "failing agents should not include sub-agent")
424+
})
349425
})
350426

351427
t.Run("Archived", func(t *testing.T) {

0 commit comments

Comments
 (0)