Skip to content

Commit c6631e1

Browse files
authored
feat: expose aibridged metrics (#20865)
Upgrades `coder/aibridge` to v0.2.0, which includes coder/aibridge#62. Creates a `prometheus.Registerer` with the prefix `coder_aibridged_` and passes it along to coder/aibridge, which actually exposes the metrics. Also includes a side effect of a change described in coder/aibridge#62 (comment). --------- Signed-off-by: Danny Kopping <danny@coder.com>
1 parent 6882c43 commit c6631e1

File tree

12 files changed

+295
-124
lines changed

12 files changed

+295
-124
lines changed

docs/admin/integrations/prometheus.md

Lines changed: 91 additions & 84 deletions
Large diffs are not rendered by default.

enterprise/aibridged/aibridged.go

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -55,13 +55,14 @@ func New(ctx context.Context, pool Pooler, rpcDialer Dialer, logger slog.Logger)
5555

5656
ctx, cancel := context.WithCancel(ctx)
5757
daemon := &Server{
58-
logger: logger,
59-
clientDialer: rpcDialer,
58+
logger: logger,
59+
clientDialer: rpcDialer,
60+
clientCh: make(chan DRPCClient),
61+
lifecycleCtx: ctx,
62+
cancelFn: cancel,
63+
initConnectionCh: make(chan struct{}),
64+
6065
requestBridgePool: pool,
61-
clientCh: make(chan DRPCClient),
62-
lifecycleCtx: ctx,
63-
cancelFn: cancel,
64-
initConnectionCh: make(chan struct{}),
6566
}
6667

6768
daemon.wg.Add(1)

enterprise/aibridged/aibridged_integration_test.go

Lines changed: 109 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@ import (
99
"testing"
1010
"time"
1111

12+
"github.com/prometheus/client_golang/prometheus"
13+
promtest "github.com/prometheus/client_golang/prometheus/testutil"
1214
"github.com/stretchr/testify/require"
1315

1416
"github.com/coder/aibridge"
@@ -166,7 +168,7 @@ func TestIntegration(t *testing.T) {
166168

167169
logger := testutil.Logger(t)
168170
providers := []aibridge.Provider{aibridge.NewOpenAIProvider(aibridge.OpenAIConfig{BaseURL: mockOpenAI.URL})}
169-
pool, err := aibridged.NewCachedBridgePool(aibridged.DefaultPoolOptions, providers, logger)
171+
pool, err := aibridged.NewCachedBridgePool(aibridged.DefaultPoolOptions, providers, nil, logger)
170172
require.NoError(t, err)
171173

172174
// Given: aibridged is started.
@@ -253,3 +255,109 @@ func TestIntegration(t *testing.T) {
253255
// Then: the MCP server was initialized.
254256
require.Contains(t, mcpTokenReceived, authLink.OAuthAccessToken, "mock MCP server not requested")
255257
}
258+
259+
// TestIntegrationWithMetrics validates that Prometheus metrics are correctly incremented
260+
// when requests are processed through aibridged.
261+
func TestIntegrationWithMetrics(t *testing.T) {
262+
t.Parallel()
263+
264+
ctx := testutil.Context(t, testutil.WaitLong)
265+
266+
// Create prometheus registry and metrics.
267+
registry := prometheus.NewRegistry()
268+
metrics := aibridge.NewMetrics(registry)
269+
270+
// Set up mock OpenAI server.
271+
mockOpenAI := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
272+
w.Header().Set("Content-Type", "application/json")
273+
w.WriteHeader(http.StatusOK)
274+
_, _ = w.Write([]byte(`{
275+
"id": "chatcmpl-test",
276+
"object": "chat.completion",
277+
"created": 1753343279,
278+
"model": "gpt-4.1",
279+
"choices": [
280+
{
281+
"index": 0,
282+
"message": {
283+
"role": "assistant",
284+
"content": "test response"
285+
},
286+
"finish_reason": "stop"
287+
}
288+
],
289+
"usage": {
290+
"prompt_tokens": 10,
291+
"completion_tokens": 5,
292+
"total_tokens": 15
293+
}
294+
}`))
295+
}))
296+
t.Cleanup(mockOpenAI.Close)
297+
298+
// Database and coderd setup.
299+
db, ps := dbtestutil.NewDB(t)
300+
client, _, api, firstUser := coderdenttest.NewWithAPI(t, &coderdenttest.Options{
301+
Options: &coderdtest.Options{
302+
Database: db,
303+
Pubsub: ps,
304+
},
305+
})
306+
307+
userClient, _ := coderdtest.CreateAnotherUser(t, client, firstUser.OrganizationID)
308+
309+
// Create an API token for the user.
310+
apiKey, err := userClient.CreateToken(ctx, "me", codersdk.CreateTokenRequest{
311+
TokenName: fmt.Sprintf("test-key-%d", time.Now().UnixNano()),
312+
Lifetime: time.Hour,
313+
Scope: codersdk.APIKeyScopeCoderAll,
314+
})
315+
require.NoError(t, err)
316+
317+
// Create aibridge client.
318+
aiBridgeClient, err := api.CreateInMemoryAIBridgeServer(ctx)
319+
require.NoError(t, err)
320+
321+
logger := testutil.Logger(t)
322+
providers := []aibridge.Provider{aibridge.NewOpenAIProvider(aibridge.OpenAIConfig{BaseURL: mockOpenAI.URL})}
323+
324+
// Create pool with metrics.
325+
pool, err := aibridged.NewCachedBridgePool(aibridged.DefaultPoolOptions, providers, metrics, logger)
326+
require.NoError(t, err)
327+
328+
// Given: aibridged is started.
329+
srv, err := aibridged.New(ctx, pool, func(ctx context.Context) (aibridged.DRPCClient, error) {
330+
return aiBridgeClient, nil
331+
}, logger)
332+
require.NoError(t, err, "create new aibridged")
333+
t.Cleanup(func() {
334+
_ = srv.Shutdown(ctx)
335+
})
336+
337+
// When: a request is made to aibridged.
338+
req, err := http.NewRequestWithContext(ctx, http.MethodPost, "/openai/v1/chat/completions", bytes.NewBufferString(`{
339+
"messages": [
340+
{
341+
"role": "user",
342+
"content": "test message"
343+
}
344+
],
345+
"model": "gpt-4.1"
346+
}`))
347+
require.NoError(t, err, "make request to test server")
348+
req.Header.Add("Authorization", "Bearer "+apiKey.Key)
349+
req.Header.Add("Accept", "application/json")
350+
351+
// When: aibridged handles the request.
352+
rec := httptest.NewRecorder()
353+
srv.ServeHTTP(rec, req)
354+
require.Equal(t, http.StatusOK, rec.Code)
355+
356+
// Then: the interceptions metric should increase to 1.
357+
// This is not exhaustively checking the available metrics; just an indicative one to prove
358+
// the plumbing is working.
359+
require.Eventually(t, func() bool {
360+
count := promtest.ToFloat64(metrics.InterceptionCount)
361+
return count == 1
362+
}, testutil.WaitShort, testutil.IntervalFast, "interceptions_total metric should be 1")
363+
}

enterprise/aibridged/aibridged_test.go

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -41,8 +41,7 @@ func newTestServer(t *testing.T) (*aibridged.Server, *mock.MockDRPCClient, *mock
4141
pool,
4242
func(ctx context.Context) (aibridged.DRPCClient, error) {
4343
return client, nil
44-
},
45-
logger)
44+
}, logger)
4645
require.NoError(t, err, "create new aibridged")
4746
t.Cleanup(func() {
4847
srv.Shutdown(context.Background())
@@ -291,7 +290,7 @@ func TestRouting(t *testing.T) {
291290
aibridge.NewOpenAIProvider(aibridge.OpenAIConfig{BaseURL: openaiSrv.URL}),
292291
aibridge.NewAnthropicProvider(aibridge.AnthropicConfig{BaseURL: antSrv.URL}, nil),
293292
}
294-
pool, err := aibridged.NewCachedBridgePool(aibridged.DefaultPoolOptions, providers, logger)
293+
pool, err := aibridged.NewCachedBridgePool(aibridged.DefaultPoolOptions, providers, nil, logger)
295294
require.NoError(t, err)
296295
conn := &mockDRPCConn{}
297296
client.EXPECT().DRPCConn().AnyTimes().Return(conn)

enterprise/aibridged/pool.go

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -51,11 +51,13 @@ type CachedBridgePool struct {
5151

5252
singleflight *singleflight.Group[string, *aibridge.RequestBridge]
5353

54+
metrics *aibridge.Metrics
55+
5456
shutDownOnce sync.Once
5557
shuttingDownCh chan struct{}
5658
}
5759

58-
func NewCachedBridgePool(options PoolOptions, providers []aibridge.Provider, logger slog.Logger) (*CachedBridgePool, error) {
60+
func NewCachedBridgePool(options PoolOptions, providers []aibridge.Provider, metrics *aibridge.Metrics, logger slog.Logger) (*CachedBridgePool, error) {
5961
cache, err := ristretto.NewCache(&ristretto.Config[string, *aibridge.RequestBridge]{
6062
NumCounters: options.MaxItems * 10, // Docs suggest setting this 10x number of keys.
6163
MaxCost: options.MaxItems * cacheCost, // Up to n instances.
@@ -88,6 +90,8 @@ func NewCachedBridgePool(options PoolOptions, providers []aibridge.Provider, log
8890

8991
singleflight: &singleflight.Group[string, *aibridge.RequestBridge]{},
9092

93+
metrics: metrics,
94+
9195
shuttingDownCh: make(chan struct{}),
9296
}, nil
9397
}
@@ -154,7 +158,7 @@ func (p *CachedBridgePool) Acquire(ctx context.Context, req Request, clientFn Cl
154158
}
155159
}
156160

157-
bridge, err := aibridge.NewRequestBridge(ctx, p.providers, p.logger, recorder, mcpServers)
161+
bridge, err := aibridge.NewRequestBridge(ctx, p.providers, recorder, mcpServers, p.metrics, p.logger)
158162
if err != nil {
159163
return nil, xerrors.Errorf("create new request bridge: %w", err)
160164
}
@@ -167,7 +171,7 @@ func (p *CachedBridgePool) Acquire(ctx context.Context, req Request, clientFn Cl
167171
return instance, err
168172
}
169173

170-
func (p *CachedBridgePool) Metrics() PoolMetrics {
174+
func (p *CachedBridgePool) CacheMetrics() PoolMetrics {
171175
if p.cache == nil {
172176
return nil
173177
}

enterprise/aibridged/pool_test.go

Lines changed: 16 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ func TestPool(t *testing.T) {
3030
mcpProxy := mcpmock.NewMockServerProxier(ctrl)
3131

3232
opts := aibridged.PoolOptions{MaxItems: 1, TTL: time.Second}
33-
pool, err := aibridged.NewCachedBridgePool(opts, nil, logger)
33+
pool, err := aibridged.NewCachedBridgePool(opts, nil, nil, logger)
3434
require.NoError(t, err)
3535
t.Cleanup(func() { pool.Shutdown(context.Background()) })
3636

@@ -63,11 +63,11 @@ func TestPool(t *testing.T) {
6363
require.NoError(t, err, "acquire pool instance")
6464
require.Same(t, inst, instB)
6565

66-
metrics := pool.Metrics()
67-
require.EqualValues(t, 1, metrics.KeysAdded())
68-
require.EqualValues(t, 0, metrics.KeysEvicted())
69-
require.EqualValues(t, 1, metrics.Hits())
70-
require.EqualValues(t, 1, metrics.Misses())
66+
cacheMetrics := pool.CacheMetrics()
67+
require.EqualValues(t, 1, cacheMetrics.KeysAdded())
68+
require.EqualValues(t, 0, cacheMetrics.KeysEvicted())
69+
require.EqualValues(t, 1, cacheMetrics.Hits())
70+
require.EqualValues(t, 1, cacheMetrics.Misses())
7171

7272
// This will get called again because a new instance will be created.
7373
mcpProxy.EXPECT().Init(gomock.Any()).Times(1).Return(nil)
@@ -81,11 +81,11 @@ func TestPool(t *testing.T) {
8181
require.NoError(t, err, "acquire pool instance")
8282
require.NotSame(t, inst, inst2)
8383

84-
metrics = pool.Metrics()
85-
require.EqualValues(t, 2, metrics.KeysAdded())
86-
require.EqualValues(t, 1, metrics.KeysEvicted())
87-
require.EqualValues(t, 1, metrics.Hits())
88-
require.EqualValues(t, 2, metrics.Misses())
84+
cacheMetrics = pool.CacheMetrics()
85+
require.EqualValues(t, 2, cacheMetrics.KeysAdded())
86+
require.EqualValues(t, 1, cacheMetrics.KeysEvicted())
87+
require.EqualValues(t, 1, cacheMetrics.Hits())
88+
require.EqualValues(t, 2, cacheMetrics.Misses())
8989

9090
// This will get called again because a new instance will be created.
9191
mcpProxy.EXPECT().Init(gomock.Any()).Times(1).Return(nil)
@@ -99,11 +99,11 @@ func TestPool(t *testing.T) {
9999
require.NoError(t, err, "acquire pool instance 2B")
100100
require.NotSame(t, inst2, inst2B)
101101

102-
metrics = pool.Metrics()
103-
require.EqualValues(t, 3, metrics.KeysAdded())
104-
require.EqualValues(t, 2, metrics.KeysEvicted())
105-
require.EqualValues(t, 1, metrics.Hits())
106-
require.EqualValues(t, 3, metrics.Misses())
102+
cacheMetrics = pool.CacheMetrics()
103+
require.EqualValues(t, 3, cacheMetrics.KeysAdded())
104+
require.EqualValues(t, 2, cacheMetrics.KeysEvicted())
105+
require.EqualValues(t, 1, cacheMetrics.Hits())
106+
require.EqualValues(t, 3, cacheMetrics.Misses())
107107

108108
// TODO: add test for expiry.
109109
// This requires Go 1.25's [synctest](https://pkg.go.dev/testing/synctest) since the

enterprise/aibridged/translator.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,16 @@ func (t *recorderTranslation) RecordPromptUsage(ctx context.Context, req *aibrid
5757
}
5858

5959
func (t *recorderTranslation) RecordTokenUsage(ctx context.Context, req *aibridge.TokenUsageRecord) error {
60+
merged := req.Metadata
61+
if merged == nil {
62+
merged = aibridge.Metadata{}
63+
}
64+
65+
// Merge the token usage values into metadata; later we might want to store some of these in their own fields.
66+
for k, v := range req.ExtraTokenTypes {
67+
merged[k] = v
68+
}
69+
6070
_, err := t.client.RecordTokenUsage(ctx, &proto.RecordTokenUsageRequest{
6171
InterceptionId: req.InterceptionID,
6272
MsgId: req.MsgID,

enterprise/cli/aibridged.go

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@ import (
77

88
"golang.org/x/xerrors"
99

10+
"github.com/prometheus/client_golang/prometheus"
11+
1012
"github.com/coder/aibridge"
1113
"github.com/coder/coder/v2/codersdk"
1214
"github.com/coder/coder/v2/enterprise/aibridged"
@@ -31,8 +33,11 @@ func newAIBridgeDaemon(coderAPI *coderd.API) (*aibridged.Server, error) {
3133
}, getBedrockConfig(coderAPI.DeploymentValues.AI.BridgeConfig.Bedrock)),
3234
}
3335

36+
reg := prometheus.WrapRegistererWithPrefix("coder_aibridged_", coderAPI.PrometheusRegistry)
37+
metrics := aibridge.NewMetrics(reg)
38+
3439
// Create pool for reusable stateful [aibridge.RequestBridge] instances (one per user).
35-
pool, err := aibridged.NewCachedBridgePool(aibridged.DefaultPoolOptions, providers, logger.Named("pool")) // TODO: configurable.
40+
pool, err := aibridged.NewCachedBridgePool(aibridged.DefaultPoolOptions, providers, metrics, logger.Named("pool")) // TODO: configurable size.
3641
if err != nil {
3742
return nil, xerrors.Errorf("create request pool: %w", err)
3843
}

go.mod

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -165,9 +165,9 @@ require (
165165
github.com/pkg/diff v0.0.0-20210226163009-20ebb0f2a09e
166166
github.com/pkg/sftp v1.13.7
167167
github.com/prometheus-community/pro-bing v0.7.0
168-
github.com/prometheus/client_golang v1.23.0
168+
github.com/prometheus/client_golang v1.23.2
169169
github.com/prometheus/client_model v0.6.2
170-
github.com/prometheus/common v0.65.0
170+
github.com/prometheus/common v0.66.1
171171
github.com/quasilyte/go-ruleguard/dsl v0.3.22
172172
github.com/robfig/cron/v3 v3.0.1
173173
github.com/shirou/gopsutil/v4 v4.25.5
@@ -476,7 +476,7 @@ require (
476476
github.com/anthropics/anthropic-sdk-go v1.18.0
477477
github.com/brianvoe/gofakeit/v7 v7.9.0
478478
github.com/coder/agentapi-sdk-go v0.0.0-20250505131810-560d1d88d225
479-
github.com/coder/aibridge v0.1.7
479+
github.com/coder/aibridge v0.2.0
480480
github.com/coder/aisdk-go v0.0.9
481481
github.com/coder/boundary v1.0.1-0.20250925154134-55a44f2a7945
482482
github.com/coder/preview v1.0.4

go.sum

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -919,8 +919,8 @@ github.com/cncf/xds/go v0.0.0-20250501225837-2ac532fd4443 h1:aQ3y1lwWyqYPiWZThqv
919919
github.com/cncf/xds/go v0.0.0-20250501225837-2ac532fd4443/go.mod h1:W+zGtBO5Y1IgJhy4+A9GOqVhqLpfZi+vwmdNXUehLA8=
920920
github.com/coder/agentapi-sdk-go v0.0.0-20250505131810-560d1d88d225 h1:tRIViZ5JRmzdOEo5wUWngaGEFBG8OaE1o2GIHN5ujJ8=
921921
github.com/coder/agentapi-sdk-go v0.0.0-20250505131810-560d1d88d225/go.mod h1:rNLVpYgEVeu1Zk29K64z6Od8RBP9DwqCu9OfCzh8MR4=
922-
github.com/coder/aibridge v0.1.7 h1:GTAM8nHawXMeb/pxAIwvzr76dyVGu9hw9qV6Gvpc7nw=
923-
github.com/coder/aibridge v0.1.7/go.mod h1:7GhrLbzf6uM3sCA7OPaDzvq9QNrCjNuzMy+WgipYwfQ=
922+
github.com/coder/aibridge v0.2.0 h1:kAWhHD6fsmDLH1WxIwXPu9Ineijj+lVniko45C003Vo=
923+
github.com/coder/aibridge v0.2.0/go.mod h1:2T0RSnIX1WTqFajzXsaNsoNe6mmNsNeCTxiHBWEsFnE=
924924
github.com/coder/aisdk-go v0.0.9 h1:Vzo/k2qwVGLTR10ESDeP2Ecek1SdPfZlEjtTfMveiVo=
925925
github.com/coder/aisdk-go v0.0.9/go.mod h1:KF6/Vkono0FJJOtWtveh5j7yfNrSctVTpwgweYWSp5M=
926926
github.com/coder/boundary v1.0.1-0.20250925154134-55a44f2a7945 h1:hDUf02kTX8EGR3+5B+v5KdYvORs4YNfDPci0zCs+pC0=
@@ -1718,15 +1718,15 @@ github.com/power-devops/perfstat v0.0.0-20240221224432-82ca36839d55 h1:o4JXh1EVt
17181718
github.com/power-devops/perfstat v0.0.0-20240221224432-82ca36839d55/go.mod h1:OmDBASR4679mdNQnz2pUhc2G8CO2JrUAVFDRBDP/hJE=
17191719
github.com/prometheus-community/pro-bing v0.7.0 h1:KFYFbxC2f2Fp6c+TyxbCOEarf7rbnzr9Gw8eIb0RfZA=
17201720
github.com/prometheus-community/pro-bing v0.7.0/go.mod h1:Moob9dvlY50Bfq6i88xIwfyw7xLFHH69LUgx9n5zqCE=
1721-
github.com/prometheus/client_golang v1.23.0 h1:ust4zpdl9r4trLY/gSjlm07PuiBq2ynaXXlptpfy8Uc=
1722-
github.com/prometheus/client_golang v1.23.0/go.mod h1:i/o0R9ByOnHX0McrTMTyhYvKE4haaf2mW08I+jGAjEE=
1721+
github.com/prometheus/client_golang v1.23.2 h1:Je96obch5RDVy3FDMndoUsjAhG5Edi49h0RJWRi/o0o=
1722+
github.com/prometheus/client_golang v1.23.2/go.mod h1:Tb1a6LWHB3/SPIzCoaDXI4I8UHKeFTEQ1YCr+0Gyqmg=
17231723
github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA=
17241724
github.com/prometheus/client_model v0.2.0/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA=
17251725
github.com/prometheus/client_model v0.3.0/go.mod h1:LDGWKZIo7rky3hgvBe+caln+Dr3dPggB5dvjtD7w9+w=
17261726
github.com/prometheus/client_model v0.6.2 h1:oBsgwpGs7iVziMvrGhE53c/GrLUsZdHnqNwqPLxwZyk=
17271727
github.com/prometheus/client_model v0.6.2/go.mod h1:y3m2F6Gdpfy6Ut/GBsUqTWZqCUvMVzSfMLjcu6wAwpE=
1728-
github.com/prometheus/common v0.65.0 h1:QDwzd+G1twt//Kwj/Ww6E9FQq1iVMmODnILtW1t2VzE=
1729-
github.com/prometheus/common v0.65.0/go.mod h1:0gZns+BLRQ3V6NdaerOhMbwwRbNh9hkGINtQAsP5GS8=
1728+
github.com/prometheus/common v0.66.1 h1:h5E0h5/Y8niHc5DlaLlWLArTQI7tMrsfQjHV+d9ZoGs=
1729+
github.com/prometheus/common v0.66.1/go.mod h1:gcaUsgf3KfRSwHY4dIMXLPV0K/Wg1oZ8+SbZk/HH/dA=
17301730
github.com/prometheus/procfs v0.16.1 h1:hZ15bTNuirocR6u0JZ6BAHHmwS1p8B4P6MRqxtzMyRg=
17311731
github.com/prometheus/procfs v0.16.1/go.mod h1:teAbpZRB1iIAJYREa1LsoWUXykVXA1KlTmWl8x/U+Is=
17321732
github.com/puzpuzpuz/xsync/v3 v3.5.1 h1:GJYJZwO6IdxN/IKbneznS6yPkVC+c3zyY/j19c++5Fg=

0 commit comments

Comments
 (0)