Skip to content

Commit aff2080

Browse files
authored
feat: fix build timeline to include entire stage timings (#20805)
Measure entire stage durations for each terraform cmd execution
1 parent a10c5ff commit aff2080

File tree

9 files changed

+133
-65
lines changed

9 files changed

+133
-65
lines changed

provisioner/terraform/executor.go

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -325,7 +325,9 @@ func (e *executor) plan(ctx, killCtx context.Context, env, vars []string, logr l
325325
<-doneErr
326326
}()
327327

328+
endStage := e.timings.startStage(database.ProvisionerJobTimingStagePlan)
328329
err := e.execWriteOutput(ctx, killCtx, args, env, outWriter, errWriter)
330+
endStage(err)
329331
if err != nil {
330332
return nil, xerrors.Errorf("terraform plan: %w", err)
331333
}
@@ -596,10 +598,15 @@ func (e *executor) apply(
596598
<-doneErr
597599
}()
598600

601+
// `terraform apply`
602+
endStage := e.timings.startStage(database.ProvisionerJobTimingStageApply)
599603
err := e.execWriteOutput(ctx, killCtx, args, env, outWriter, errWriter)
604+
endStage(err)
600605
if err != nil {
601606
return nil, xerrors.Errorf("terraform apply: %w", err)
602607
}
608+
609+
// `terraform show` & `terraform graph`
603610
state, err := e.stateResources(ctx, killCtx)
604611
if err != nil {
605612
return nil, err
@@ -610,12 +617,13 @@ func (e *executor) apply(
610617
return nil, xerrors.Errorf("read statefile %q: %w", statefilePath, err)
611618
}
612619

620+
agg := e.timings.aggregate()
613621
return &proto.ApplyComplete{
614622
Parameters: state.Parameters,
615623
Resources: state.Resources,
616624
ExternalAuthProviders: state.ExternalAuthProviders,
617625
State: stateContent,
618-
Timings: e.timings.aggregate(),
626+
Timings: agg,
619627
AiTasks: state.AITasks,
620628
}, nil
621629
}

provisioner/terraform/inittimings.go

Lines changed: 0 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -137,30 +137,3 @@ func (t *timingAggregator) finishPrevious(ts time.Time, s *timingSpan) {
137137

138138
t.lookupMu.Unlock()
139139
}
140-
141-
// mergeInitTimings merges manual init timings with existing timings that are
142-
// sourced by the logs. This is done because prior to Terraform v1.9, init logs
143-
// did not have a `-json` formatting option.
144-
// So before v1.9, the init stage is manually timed outside the `terraform init`.
145-
// After v1.9, the init stage is timed via logs.
146-
func mergeInitTimings(manualInit []*proto.Timing, existing []*proto.Timing) []*proto.Timing {
147-
initFailed := slices.ContainsFunc(existing, func(timing *proto.Timing) bool {
148-
return timing.State == proto.TimingState_FAILED
149-
})
150-
151-
if initFailed {
152-
// The init logs do not provide enough information for failed init timings.
153-
// So use the manual timings in this case.
154-
return append(manualInit, existing...)
155-
}
156-
157-
hasInitStage := slices.ContainsFunc(existing, func(timing *proto.Timing) bool {
158-
return timing.Stage == string(database.ProvisionerJobTimingStageInit)
159-
})
160-
161-
if hasInitStage {
162-
return existing
163-
}
164-
165-
return append(manualInit, existing...)
166-
}

provisioner/terraform/provision.go

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -110,12 +110,11 @@ func (s *server) Plan(
110110
// The JSON output of `terraform init` doesn't include discrete fields for capturing timings of each plugin,
111111
// so we capture the whole init process.
112112
initTimings := newTimingAggregator(database.ProvisionerJobTimingStageInit)
113-
initTimings.ingest(createInitTimingsEvent(timingInitStart))
113+
endStage := initTimings.startStage(database.ProvisionerJobTimingStageInit)
114114

115115
err = e.init(ctx, killCtx, sess)
116+
endStage(err)
116117
if err != nil {
117-
initTimings.ingest(createInitTimingsEvent(timingInitErrored))
118-
119118
s.logger.Debug(ctx, "init failed", slog.Error(err))
120119

121120
// Special handling for "text file busy" c.f. https://github.com/coder/coder/issues/14726
@@ -148,8 +147,6 @@ func (s *server) Plan(
148147
s.logger.Error(ctx, "failed to get modules from disk", slog.Error(err))
149148
}
150149

151-
initTimings.ingest(createInitTimingsEvent(timingInitComplete))
152-
153150
s.logger.Debug(ctx, "ran initialization")
154151

155152
env, err := provisionEnv(sess.Config, request.Metadata, request.PreviousParameterValues, request.RichParameterValues, request.ExternalAuthProviders)
@@ -170,7 +167,7 @@ func (s *server) Plan(
170167

171168
// Prepend init timings since they occur prior to plan timings.
172169
// Order is irrelevant; this is merely indicative.
173-
resp.Timings = mergeInitTimings(initTimings.aggregate(), resp.Timings)
170+
resp.Timings = append(initTimings.aggregate(), resp.Timings...) // mergeInitTimings(initTimings.aggregate(), resp.Timings)
174171
resp.Modules = modules
175172
return resp
176173
}

provisioner/terraform/timings.go

Lines changed: 40 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,13 @@ type timingKind string
1919
// Copied from https://github.com/hashicorp/terraform/blob/01c0480e77263933b2b086dc8d600a69f80fad2d/internal/command/jsonformat/renderer.go
2020
// We cannot reference these because they're in an internal package.
2121
const (
22+
// Stage markers are used to denote the beginning and end of stages. Without
23+
// these, only discrete events (i.e. resource changes) within stages can be
24+
// measured, which may omit setup/teardown time or other unmeasured overhead.
25+
timingStageStart timingKind = "stage_start"
26+
timingStageEnd timingKind = "stage_end"
27+
timingStageError timingKind = "stage_error"
28+
2229
timingApplyStart timingKind = "apply_start"
2330
timingApplyProgress timingKind = "apply_progress"
2431
timingApplyComplete timingKind = "apply_complete"
@@ -37,9 +44,6 @@ const (
3744
timingResourceDrift timingKind = "resource_drift"
3845
timingVersion timingKind = "version"
3946
// These are not part of message_types, but we want to track init/graph timings as well.
40-
timingInitStart timingKind = "init_start"
41-
timingInitComplete timingKind = "init_complete"
42-
timingInitErrored timingKind = "init_errored"
4347
timingGraphStart timingKind = "graph_start"
4448
timingGraphComplete timingKind = "graph_complete"
4549
timingGraphErrored timingKind = "graph_errored"
@@ -109,13 +113,13 @@ func (t *timingAggregator) ingest(ts time.Time, s *timingSpan) {
109113
ts = dbtime.Time(ts.UTC())
110114

111115
switch s.kind {
112-
case timingApplyStart, timingProvisionStart, timingRefreshStart, timingInitStart, timingGraphStart:
116+
case timingApplyStart, timingProvisionStart, timingRefreshStart, timingGraphStart, timingStageStart:
113117
s.start = ts
114118
s.state = proto.TimingState_STARTED
115-
case timingApplyComplete, timingProvisionComplete, timingRefreshComplete, timingInitComplete, timingGraphComplete:
119+
case timingApplyComplete, timingProvisionComplete, timingRefreshComplete, timingGraphComplete, timingStageEnd:
116120
s.end = ts
117121
s.state = proto.TimingState_COMPLETED
118-
case timingApplyErrored, timingProvisionErrored, timingInitErrored, timingGraphErrored:
122+
case timingApplyErrored, timingProvisionErrored, timingGraphErrored, timingStageError:
119123
s.end = ts
120124
s.state = proto.TimingState_FAILED
121125
case timingInitOutput:
@@ -176,8 +180,35 @@ func (t *timingAggregator) aggregate() []*proto.Timing {
176180
return out
177181
}
178182

183+
// startStage denotes the beginning of a stage and returns a function which
184+
// should be called to mark the end of the stage. This is used to measure a
185+
// stage's total duration across all it's discrete events and unmeasured
186+
// overhead/events.
187+
func (t *timingAggregator) startStage(stage database.ProvisionerJobTimingStage) (end func(err error)) {
188+
ts := timingSpan{
189+
kind: timingStageStart,
190+
stage: stage,
191+
resource: "coder_stage_" + string(stage),
192+
action: "terraform",
193+
provider: "coder",
194+
}
195+
endTs := ts
196+
t.ingest(dbtime.Now(), &ts)
197+
198+
return func(err error) {
199+
endTs.kind = timingStageEnd
200+
if err != nil {
201+
endTs.kind = timingStageError
202+
}
203+
t.ingest(dbtime.Now(), &endTs)
204+
}
205+
}
206+
179207
func (l timingKind) Valid() bool {
180208
return slices.Contains([]timingKind{
209+
timingStageStart,
210+
timingStageEnd,
211+
timingStageError,
181212
timingApplyStart,
182213
timingApplyProgress,
183214
timingApplyComplete,
@@ -194,9 +225,6 @@ func (l timingKind) Valid() bool {
194225
timingOutputs,
195226
timingResourceDrift,
196227
timingVersion,
197-
timingInitStart,
198-
timingInitComplete,
199-
timingInitErrored,
200228
timingGraphStart,
201229
timingGraphComplete,
202230
timingGraphErrored,
@@ -210,7 +238,9 @@ func (l timingKind) Valid() bool {
210238
// if all other attributes are identical.
211239
func (l timingKind) Category() string {
212240
switch l {
213-
case timingInitStart, timingInitComplete, timingInitErrored, timingInitOutput:
241+
case timingStageStart, timingStageEnd, timingStageError:
242+
return "stage"
243+
case timingInitOutput:
214244
return "init"
215245
case timingGraphStart, timingGraphComplete, timingGraphErrored:
216246
return "graph"
@@ -252,15 +282,6 @@ func (e *timingSpan) toProto() *proto.Timing {
252282
}
253283
}
254284

255-
func createInitTimingsEvent(event timingKind) (time.Time, *timingSpan) {
256-
return dbtime.Now(), &timingSpan{
257-
kind: event,
258-
action: "initializing terraform",
259-
provider: "terraform",
260-
resource: "init",
261-
}
262-
}
263-
264285
func createGraphTimingsEvent(event timingKind) (time.Time, *timingSpan) {
265286
return dbtime.Now(), &timingSpan{
266287
kind: event,

provisioner/terraform/timings_internal_test.go

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -155,3 +155,18 @@ func printTimings(t *testing.T, timings []*proto.Timing) {
155155
terraform_internal.PrintTiming(t, a)
156156
}
157157
}
158+
159+
func TestTimingStages(t *testing.T) {
160+
t.Parallel()
161+
162+
agg := &timingAggregator{
163+
stage: database.ProvisionerJobTimingStageApply,
164+
stateLookup: make(map[uint64]*timingSpan),
165+
}
166+
167+
end := agg.startStage(database.ProvisionerJobTimingStageApply)
168+
end(nil)
169+
170+
evts := agg.aggregate()
171+
require.Len(t, evts, 1)
172+
}

provisioner/terraform/timings_test.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,11 +6,13 @@ import (
66
"context"
77
"os"
88
"path/filepath"
9+
"strings"
910
"testing"
1011

1112
"github.com/stretchr/testify/require"
1213

1314
"github.com/coder/coder/v2/coderd/database"
15+
"github.com/coder/coder/v2/coderd/util/slice"
1416
terraform_internal "github.com/coder/coder/v2/provisioner/terraform/internal"
1517
"github.com/coder/coder/v2/provisionersdk/proto"
1618
"github.com/coder/coder/v2/testutil"
@@ -95,6 +97,12 @@ func TestTimingsFromProvision(t *testing.T) {
9597

9698
// Sort the timings stably to keep reduce flakiness.
9799
terraform_internal.StableSortTimings(t, timings)
100+
// `coder_stage_` timings use `dbtime.Now()`, which makes them hard to compare to
101+
// a static set of expected timings. Filter them out. This test is good for
102+
// testing timings sourced from terraform logs, not internal coder timings.
103+
timings = slice.Filter(timings, func(tim *proto.Timing) bool {
104+
return !strings.HasPrefix(tim.Resource, "coder_stage_")
105+
})
98106

99107
// Then: the received timings should match the expected values below.
100108
// NOTE: These timings have been encoded to JSON format to make the tests more readable.

site/src/modules/workspaces/WorkspaceTiming/ResourcesChart.tsx

Lines changed: 25 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,10 @@ export const ResourcesChart: FC<ResourcesChartProps> = ({
5252
const [ticks, scale] = makeTicks(totalTime);
5353
const [filter, setFilter] = useState("");
5454
const visibleTimings = timings.filter(
55-
(t) => !isCoderResource(t.name) && t.name.includes(filter),
55+
// Stage boundaries are also included
56+
(t) =>
57+
(!isCoderResource(t.name) || isStageBoundary(t.name)) &&
58+
t.name.includes(filter),
5659
);
5760
const theme = useTheme();
5861
const legendsByAction = getLegendsByAction(theme);
@@ -86,20 +89,27 @@ export const ResourcesChart: FC<ResourcesChartProps> = ({
8689
<YAxisSection>
8790
<YAxisHeader>{stage.name} stage</YAxisHeader>
8891
<YAxisLabels>
89-
{visibleTimings.map((t) => (
90-
<YAxisLabel key={t.name} id={encodeURIComponent(t.name)}>
91-
{t.name}
92-
</YAxisLabel>
93-
))}
92+
{visibleTimings.map((t) => {
93+
const label = isStageBoundary(t.name)
94+
? "total stage duration"
95+
: t.name;
96+
return (
97+
<YAxisLabel key={label} id={encodeURIComponent(t.name)}>
98+
{label}
99+
</YAxisLabel>
100+
);
101+
})}
94102
</YAxisLabels>
95103
</YAxisSection>
96104
</YAxis>
97105

98106
<XAxis ticks={ticks} scale={scale}>
99107
<XAxisSection>
100108
{visibleTimings.map((t) => {
109+
const stageBoundary = isStageBoundary(t.name);
101110
const duration = calcDuration(t.range);
102111
const legend = legendsByAction[t.action] ?? { label: t.action };
112+
const label = stageBoundary ? "total stage duration" : t.name;
103113

104114
return (
105115
<XAxisRow
@@ -109,8 +119,11 @@ export const ResourcesChart: FC<ResourcesChartProps> = ({
109119
<Tooltip
110120
title={
111121
<>
112-
<TooltipTitle>{t.name}</TooltipTitle>
113-
<TooltipLink to="">view template</TooltipLink>
122+
<TooltipTitle>{label}</TooltipTitle>
123+
{/* Stage boundaries should not have these links */}
124+
{!stageBoundary && (
125+
<TooltipLink to="">view template</TooltipLink>
126+
)}
114127
</>
115128
}
116129
>
@@ -132,6 +145,10 @@ export const ResourcesChart: FC<ResourcesChartProps> = ({
132145
);
133146
};
134147

148+
export const isStageBoundary = (resource: string) => {
149+
return resource.startsWith("coder_stage_");
150+
};
151+
135152
export const isCoderResource = (resource: string) => {
136153
return (
137154
resource.startsWith("data.coder") ||

site/src/modules/workspaces/WorkspaceTiming/WorkspaceTimings.tsx

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,11 @@ import {
1717
mergeTimeRanges,
1818
type TimeRange,
1919
} from "./Chart/utils";
20-
import { isCoderResource, ResourcesChart } from "./ResourcesChart";
20+
import {
21+
isCoderResource,
22+
isStageBoundary,
23+
ResourcesChart,
24+
} from "./ResourcesChart";
2125
import { ScriptsChart } from "./ScriptsChart";
2226
import {
2327
agentStages,
@@ -140,6 +144,13 @@ export const WorkspaceTimings: FC<WorkspaceTimingsProps> = ({
140144
// user and would add noise.
141145
const visibleResources = stageTimings.filter((t) => {
142146
const isProvisionerTiming = "resource" in t;
147+
148+
// StageBoundaries are being drawn on the total timeline.
149+
// Do not show them as discrete resources inside the stage view.
150+
if (isProvisionerTiming && isStageBoundary(t.resource)) {
151+
return false;
152+
}
153+
143154
return isProvisionerTiming
144155
? !isCoderResource(t.resource)
145156
: true;

site/src/modules/workspaces/WorkspaceTiming/storybookData.ts

Lines changed: 21 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,18 @@ export const WorkspaceTimingsResponse: WorkspaceBuildTimings = {
77
started_at: "2024-10-14T11:30:38.582305Z",
88
ended_at: "2024-10-14T11:30:47.707708Z",
99
stage: "init",
10-
source: "terraform",
11-
action: "initializing terraform",
12-
resource: "state file",
10+
source: "coder",
11+
action: "terraform",
12+
resource: "coder_stage_init",
13+
},
14+
{
15+
job_id: "86fd4143-d95f-4602-b464-1149ede62269",
16+
started_at: "2024-10-14T11:30:48.105148Z",
17+
ended_at: "2024-10-14T11:30:49.911366Z",
18+
stage: "plan",
19+
source: "coder",
20+
action: "terraform",
21+
resource: "coder_stage_plan",
1322
},
1423
{
1524
job_id: "86fd4143-d95f-4602-b464-1149ede62269",
@@ -310,6 +319,15 @@ export const WorkspaceTimingsResponse: WorkspaceBuildTimings = {
310319
action: "building terraform dependency graph",
311320
resource: "state file",
312321
},
322+
{
323+
job_id: "86fd4143-d95f-4602-b464-1149ede62269",
324+
started_at: "2024-10-14T11:30:50.161398Z",
325+
ended_at: "2024-10-14T11:30:53.993767Z",
326+
stage: "apply",
327+
source: "coder",
328+
action: "terraform",
329+
resource: "coder_stage_apply",
330+
},
313331
{
314332
job_id: "86fd4143-d95f-4602-b464-1149ede62269",
315333
started_at: "2024-10-14T11:30:50.861398Z",

0 commit comments

Comments
 (0)