Skip to content

Commit 658e8c3

Browse files
authored
perf: improve performance of metricsAggregator path by reducing memory allocations (#20724)
Signed-off-by: Callum Styan <callumstyan@gmail.com>
1 parent ca560d3 commit 658e8c3

File tree

3 files changed

+240
-37
lines changed

3 files changed

+240
-37
lines changed

coderd/prometheusmetrics/aggregator.go

Lines changed: 100 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,11 @@ const (
3737

3838
var MetricLabelValueEncoder = strings.NewReplacer("\\", "\\\\", "|", "\\|", ",", "\\,", "=", "\\=")
3939

40+
type descCacheEntry struct {
41+
desc *prometheus.Desc
42+
lastUsed time.Time
43+
}
44+
4045
type MetricsAggregator struct {
4146
store map[metricKey]annotatedMetric
4247

@@ -50,6 +55,8 @@ type MetricsAggregator struct {
5055
updateHistogram prometheus.Histogram
5156
cleanupHistogram prometheus.Histogram
5257
aggregateByLabels []string
58+
// per-aggregator cache of descriptors
59+
descCache map[string]descCacheEntry
5360
}
5461

5562
type updateRequest struct {
@@ -107,42 +114,6 @@ func hashKey(req *updateRequest, m *agentproto.Stats_Metric) metricKey {
107114

108115
var _ prometheus.Collector = new(MetricsAggregator)
109116

110-
func (am *annotatedMetric) asPrometheus() (prometheus.Metric, error) {
111-
var (
112-
baseLabelNames = am.aggregateByLabels
113-
baseLabelValues []string
114-
extraLabels = am.Labels
115-
)
116-
117-
for _, label := range baseLabelNames {
118-
val, err := am.getFieldByLabel(label)
119-
if err != nil {
120-
return nil, err
121-
}
122-
123-
baseLabelValues = append(baseLabelValues, val)
124-
}
125-
126-
labels := make([]string, 0, len(baseLabelNames)+len(extraLabels))
127-
labelValues := make([]string, 0, len(baseLabelNames)+len(extraLabels))
128-
129-
labels = append(labels, baseLabelNames...)
130-
labelValues = append(labelValues, baseLabelValues...)
131-
132-
for _, l := range extraLabels {
133-
labels = append(labels, l.Name)
134-
labelValues = append(labelValues, l.Value)
135-
}
136-
137-
desc := prometheus.NewDesc(am.Name, metricHelpForAgent, labels, nil)
138-
valueType, err := asPrometheusValueType(am.Type)
139-
if err != nil {
140-
return nil, err
141-
}
142-
143-
return prometheus.MustNewConstMetric(desc, valueType, am.Value, labelValues...), nil
144-
}
145-
146117
// getFieldByLabel returns the related field value for a given label
147118
func (am *annotatedMetric) getFieldByLabel(label string) (string, error) {
148119
var labelVal string
@@ -364,7 +335,7 @@ func (ma *MetricsAggregator) Run(ctx context.Context) func() {
364335
}
365336

366337
for _, m := range input {
367-
promMetric, err := m.asPrometheus()
338+
promMetric, err := ma.asPrometheus(&m)
368339
if err != nil {
369340
ma.log.Error(ctx, "can't convert Prometheus value type", slog.F("name", m.Name), slog.F("type", m.Type), slog.F("value", m.Value), slog.Error(err))
370341
continue
@@ -386,6 +357,8 @@ func (ma *MetricsAggregator) Run(ctx context.Context) func() {
386357
}
387358
}
388359

360+
ma.cleanupDescCache()
361+
389362
timer.ObserveDuration()
390363
cleanupTicker.Reset(ma.metricsCleanupInterval)
391364
ma.storeSizeGauge.Set(float64(len(ma.store)))
@@ -407,6 +380,86 @@ func (ma *MetricsAggregator) Run(ctx context.Context) func() {
407380
func (*MetricsAggregator) Describe(_ chan<- *prometheus.Desc) {
408381
}
409382

383+
// cacheKeyForDesc is used to determine the cache key for a set of labels/extra labels. Used with the aggregators description cache.
384+
// for strings.Builder returned errors from these functions are always nil.
385+
// nolint:revive
386+
func cacheKeyForDesc(name string, baseLabelNames []string, extraLabels []*agentproto.Stats_Metric_Label) string {
387+
var b strings.Builder
388+
hint := len(name) + (len(baseLabelNames)+len(extraLabels))*8
389+
b.Grow(hint)
390+
b.WriteString(name)
391+
for _, ln := range baseLabelNames {
392+
b.WriteByte('|')
393+
b.WriteString(ln)
394+
}
395+
for _, l := range extraLabels {
396+
b.WriteByte('|')
397+
b.WriteString(l.Name)
398+
}
399+
return b.String()
400+
}
401+
402+
// getOrCreateDec checks if we already have a metric description in the aggregators cache for a given combination of base
403+
// labels and extra labels. If we do not, we create a new description and cache it.
404+
func (ma *MetricsAggregator) getOrCreateDesc(name string, help string, baseLabelNames []string, extraLabels []*agentproto.Stats_Metric_Label) *prometheus.Desc {
405+
if ma.descCache == nil {
406+
ma.descCache = make(map[string]descCacheEntry)
407+
}
408+
key := cacheKeyForDesc(name, baseLabelNames, extraLabels)
409+
if d, ok := ma.descCache[key]; ok {
410+
d.lastUsed = time.Now()
411+
ma.descCache[key] = d
412+
return d.desc
413+
}
414+
nBase := len(baseLabelNames)
415+
nExtra := len(extraLabels)
416+
labels := make([]string, nBase+nExtra)
417+
copy(labels, baseLabelNames)
418+
for i, l := range extraLabels {
419+
labels[nBase+i] = l.Name
420+
}
421+
d := prometheus.NewDesc(name, help, labels, nil)
422+
ma.descCache[key] = descCacheEntry{d, time.Now()}
423+
return d
424+
}
425+
426+
// asPrometheus returns the annotatedMetric as a prometheus.Metric, it preallocates/fills by index, uses the aggregators
427+
// metric description cache, and a small stack buffer for values in order to reduce memory allocations.
428+
func (ma *MetricsAggregator) asPrometheus(am *annotatedMetric) (prometheus.Metric, error) {
429+
baseLabelNames := am.aggregateByLabels
430+
extraLabels := am.Labels
431+
432+
nBase := len(baseLabelNames)
433+
nExtra := len(extraLabels)
434+
nTotal := nBase + nExtra
435+
436+
var scratch [16]string
437+
var labelValues []string
438+
if nTotal <= len(scratch) {
439+
labelValues = scratch[:nTotal]
440+
} else {
441+
labelValues = make([]string, nTotal)
442+
}
443+
444+
for i, label := range baseLabelNames {
445+
val, err := am.getFieldByLabel(label)
446+
if err != nil {
447+
return nil, err
448+
}
449+
labelValues[i] = val
450+
}
451+
for i, l := range extraLabels {
452+
labelValues[nBase+i] = l.Value
453+
}
454+
455+
desc := ma.getOrCreateDesc(am.Name, metricHelpForAgent, baseLabelNames, extraLabels)
456+
valueType, err := asPrometheusValueType(am.Type)
457+
if err != nil {
458+
return nil, err
459+
}
460+
return prometheus.MustNewConstMetric(desc, valueType, am.Value, labelValues...), nil
461+
}
462+
410463
var defaultAgentMetricsLabels = []string{agentmetrics.LabelUsername, agentmetrics.LabelWorkspaceName, agentmetrics.LabelAgentName, agentmetrics.LabelTemplateName}
411464

412465
// AgentMetricLabels are the labels used to decorate an agent's metrics.
@@ -453,6 +506,16 @@ func (ma *MetricsAggregator) Update(ctx context.Context, labels AgentMetricLabel
453506
}
454507
}
455508

509+
// Move to a function for testability
510+
func (ma *MetricsAggregator) cleanupDescCache() {
511+
now := time.Now()
512+
for key, entry := range ma.descCache {
513+
if now.Sub(entry.lastUsed) > ma.metricsCleanupInterval {
514+
delete(ma.descCache, key)
515+
}
516+
}
517+
}
518+
456519
func asPrometheusValueType(metricType agentproto.Stats_Metric_Type) (prometheus.ValueType, error) {
457520
switch metricType {
458521
case agentproto.Stats_Metric_GAUGE:
Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
package prometheusmetrics
2+
3+
import (
4+
"testing"
5+
"time"
6+
7+
"github.com/prometheus/client_golang/prometheus"
8+
"github.com/stretchr/testify/require"
9+
10+
"cdr.dev/slog/sloggers/slogtest"
11+
agentproto "github.com/coder/coder/v2/agent/proto"
12+
"github.com/coder/coder/v2/coderd/agentmetrics"
13+
"github.com/coder/coder/v2/testutil"
14+
)
15+
16+
func TestDescCache_DescExpire(t *testing.T) {
17+
const (
18+
testWorkspaceName = "yogi-workspace"
19+
testUsername = "yogi-bear"
20+
testAgentName = "main-agent"
21+
testTemplateName = "main-template"
22+
)
23+
24+
testLabels := AgentMetricLabels{
25+
Username: testUsername,
26+
WorkspaceName: testWorkspaceName,
27+
AgentName: testAgentName,
28+
TemplateName: testTemplateName,
29+
}
30+
31+
t.Parallel()
32+
33+
// given
34+
registry := prometheus.NewRegistry()
35+
ma, err := NewMetricsAggregator(slogtest.Make(t, &slogtest.Options{IgnoreErrors: true}), registry, time.Millisecond, agentmetrics.LabelAll)
36+
require.NoError(t, err)
37+
38+
given := []*agentproto.Stats_Metric{
39+
{Name: "a_counter_one", Type: agentproto.Stats_Metric_COUNTER, Value: 1},
40+
}
41+
42+
_, err = ma.asPrometheus(&annotatedMetric{
43+
given[0],
44+
testLabels.Username,
45+
testLabels.WorkspaceName,
46+
testLabels.AgentName,
47+
testLabels.TemplateName,
48+
// the rest doesn't matter for this test
49+
time.Now(),
50+
[]string{},
51+
})
52+
require.NoError(t, err)
53+
54+
require.Eventually(t, func() bool {
55+
ma.cleanupDescCache()
56+
return len(ma.descCache) == 0
57+
}, testutil.WaitShort, testutil.IntervalFast)
58+
}
59+
60+
// TestDescCacheTimestampUpdate ensures that the timestamp update in getOrCreateDesc
61+
// updates the map entry because d is a copy, not a pointer.
62+
func TestDescCacheTimestampUpdate(t *testing.T) {
63+
t.Parallel()
64+
65+
registry := prometheus.NewRegistry()
66+
ma, err := NewMetricsAggregator(slogtest.Make(t, nil), registry, time.Hour, nil)
67+
require.NoError(t, err)
68+
69+
baseLabelNames := []string{"label1", "label2"}
70+
extraLabels := []*agentproto.Stats_Metric_Label{
71+
{Name: "extra1", Value: "value1"},
72+
}
73+
74+
desc1 := ma.getOrCreateDesc("test_metric", "help text", baseLabelNames, extraLabels)
75+
require.NotNil(t, desc1)
76+
77+
key := cacheKeyForDesc("test_metric", baseLabelNames, extraLabels)
78+
initialEntry := ma.descCache[key]
79+
initialTime := initialEntry.lastUsed
80+
81+
desc2 := ma.getOrCreateDesc("test_metric", "help text", baseLabelNames, extraLabels)
82+
require.NotNil(t, desc2)
83+
84+
updatedEntry := ma.descCache[key]
85+
updatedTime := updatedEntry.lastUsed
86+
87+
require.NotEqual(t, initialTime, updatedTime,
88+
"Timestamp was NOT updated in map when accessing a metric description that should be cached")
89+
}

coderd/prometheusmetrics/prometheusmetrics_internal_test.go

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,12 @@
11
package prometheusmetrics
22

33
import (
4+
"fmt"
45
"testing"
56

67
"github.com/stretchr/testify/require"
78

9+
agentproto "github.com/coder/coder/v2/agent/proto"
810
"github.com/coder/coder/v2/coderd/agentmetrics"
911
)
1012

@@ -36,3 +38,52 @@ func TestFilterAcceptableAgentLabels(t *testing.T) {
3638
})
3739
}
3840
}
41+
42+
func benchAsPrometheus(b *testing.B, base []string, extraN int) {
43+
am := annotatedMetric{
44+
Stats_Metric: &agentproto.Stats_Metric{
45+
Name: "blink_test_metric",
46+
Type: agentproto.Stats_Metric_GAUGE,
47+
Value: 1,
48+
Labels: make([]*agentproto.Stats_Metric_Label, extraN),
49+
},
50+
username: "user",
51+
workspaceName: "ws",
52+
agentName: "agent",
53+
templateName: "tmpl",
54+
aggregateByLabels: base,
55+
}
56+
for i := 0; i < extraN; i++ {
57+
am.Labels[i] = &agentproto.Stats_Metric_Label{Name: fmt.Sprintf("l%d", i), Value: "v"}
58+
}
59+
60+
ma := &MetricsAggregator{}
61+
62+
b.ReportAllocs()
63+
b.ResetTimer()
64+
for i := 0; i < b.N; i++ {
65+
_, err := ma.asPrometheus(&am)
66+
if err != nil {
67+
b.Fatal(err)
68+
}
69+
}
70+
}
71+
72+
func Benchmark_asPrometheus(b *testing.B) {
73+
cases := []struct {
74+
name string
75+
base []string
76+
extraN int
77+
}{
78+
{"base4_extra0", defaultAgentMetricsLabels, 0},
79+
{"base4_extra2", defaultAgentMetricsLabels, 2},
80+
{"base4_extra5", defaultAgentMetricsLabels, 5},
81+
{"base4_extra10", defaultAgentMetricsLabels, 10},
82+
{"base2_extra5", []string{agentmetrics.LabelUsername, agentmetrics.LabelWorkspaceName}, 5},
83+
}
84+
for _, tc := range cases {
85+
b.Run(tc.name, func(b *testing.B) {
86+
benchAsPrometheus(b, tc.base, tc.extraN)
87+
})
88+
}
89+
}

0 commit comments

Comments
 (0)