Skip to content

Commit e24cc5e

Browse files
feat: add tracing to aibridge (#21106)
Adds tracing for AIBridge. Updates github.com/coder/aibridge version from `v0.2.2` to `v0.3.0` Depends on: coder/aibridge#63 Fixes: coder/aibridge#26 --------- Co-authored-by: Danny Kopping <danny@coder.com>
1 parent 259dee2 commit e24cc5e

File tree

12 files changed

+169
-31
lines changed

12 files changed

+169
-31
lines changed

docs/ai-coder/ai-bridge/monitoring.md

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,3 +60,65 @@ needs.
6060

6161
For configuration options and details, see [Data Retention](./setup.md#data-retention)
6262
in the AI Bridge setup guide.
63+
64+
## Tracing
65+
66+
AI Bridge supports tracing via [OpenTelemetry](https://opentelemetry.io/),
67+
providing visibility into request processing, upstream API calls, and MCP server
68+
interactions.
69+
70+
### Enabling Tracing
71+
72+
AI Bridge tracing is enabled when tracing is enabled for the Coder server.
73+
To enable tracing set `CODER_TRACE_ENABLE` environment variable or
74+
[--trace](https://coder.com/docs/reference/cli/server#--trace) CLI flag:
75+
76+
```sh
77+
export CODER_TRACE_ENABLE=true
78+
```
79+
80+
```sh
81+
coder server --trace
82+
```
83+
84+
### What is Traced
85+
86+
AI Bridge creates spans for the following operations:
87+
88+
| Span Name | Description |
89+
|---------------------------------------------|------------------------------------------------------|
90+
| `CachedBridgePool.Acquire` | Acquiring a request bridge instance from the pool |
91+
| `Intercept` | Top-level span for processing an intercepted request |
92+
| `Intercept.CreateInterceptor` | Creating the request interceptor |
93+
| `Intercept.ProcessRequest` | Processing the request through the bridge |
94+
| `Intercept.ProcessRequest.Upstream` | Forwarding the request to the upstream AI provider |
95+
| `Intercept.ProcessRequest.ToolCall` | Executing a tool call requested by the AI model |
96+
| `Intercept.RecordInterception` | Recording creating interception record |
97+
| `Intercept.RecordPromptUsage` | Recording prompt/message data |
98+
| `Intercept.RecordTokenUsage` | Recording token consumption |
99+
| `Intercept.RecordToolUsage` | Recording tool/function calls |
100+
| `Intercept.RecordInterceptionEnded` | Recording the interception as completed |
101+
| `ServerProxyManager.Init` | Initializing MCP server proxy connections |
102+
| `StreamableHTTPServerProxy.Init` | Setting up HTTP-based MCP server proxies |
103+
| `StreamableHTTPServerProxy.Init.fetchTools` | Fetching available tools from MCP servers |
104+
105+
Example trace of an interception using Jaeger backend:
106+
107+
![Trace of interception](../../images/aibridge/jaeger_interception_trace.png)
108+
109+
### Capturing Logs in Traces
110+
111+
> **Note:** Enabling log capture may generate a large volume of trace events.
112+
113+
To include log messages as trace events, enable trace log capture
114+
by setting `CODER_TRACE_LOGS` environment variable or using
115+
[--trace-logs](https://coder.com/docs/reference/cli/server#--trace-logs) flag:
116+
117+
```sh
118+
export CODER_TRACE_ENABLE=true
119+
export CODER_TRACE_LOGS=true
120+
```
121+
122+
```sh
123+
coder server --trace --trace-logs
124+
```
412 KB
Loading

enterprise/aibridged/aibridged.go

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ import (
88
"sync"
99
"time"
1010

11+
"go.opentelemetry.io/otel/trace"
1112
"golang.org/x/xerrors"
1213

1314
"cdr.dev/slog"
@@ -33,6 +34,7 @@ type Server struct {
3334
requestBridgePool Pooler
3435

3536
logger slog.Logger
37+
tracer trace.Tracer
3638
wg sync.WaitGroup
3739

3840
// initConnectionCh will receive when the daemon connects to coderd for the
@@ -48,14 +50,15 @@ type Server struct {
4850
shutdownOnce sync.Once
4951
}
5052

51-
func New(ctx context.Context, pool Pooler, rpcDialer Dialer, logger slog.Logger) (*Server, error) {
53+
func New(ctx context.Context, pool Pooler, rpcDialer Dialer, logger slog.Logger, tracer trace.Tracer) (*Server, error) {
5254
if rpcDialer == nil {
5355
return nil, xerrors.Errorf("nil rpcDialer given")
5456
}
5557

5658
ctx, cancel := context.WithCancel(ctx)
5759
daemon := &Server{
5860
logger: logger,
61+
tracer: tracer,
5962
clientDialer: rpcDialer,
6063
clientCh: make(chan DRPCClient),
6164
lifecycleCtx: ctx,
@@ -143,7 +146,7 @@ func (s *Server) GetRequestHandler(ctx context.Context, req Request) (http.Handl
143146
return nil, xerrors.New("nil requestBridgePool")
144147
}
145148

146-
reqBridge, err := s.requestBridgePool.Acquire(ctx, req, s.Client, NewMCPProxyFactory(s.logger, s.Client))
149+
reqBridge, err := s.requestBridgePool.Acquire(ctx, req, s.Client, NewMCPProxyFactory(s.logger, s.tracer, s.Client))
147150
if err != nil {
148151
return nil, xerrors.Errorf("acquire request bridge: %w", err)
149152
}

enterprise/aibridged/aibridged_integration_test.go

Lines changed: 56 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,15 +6,21 @@ import (
66
"fmt"
77
"net/http"
88
"net/http/httptest"
9+
"slices"
910
"testing"
1011
"time"
1112

1213
"github.com/prometheus/client_golang/prometheus"
1314
promtest "github.com/prometheus/client_golang/prometheus/testutil"
1415
"github.com/stretchr/testify/require"
1516
"github.com/tidwall/gjson"
17+
"go.opentelemetry.io/otel"
18+
"go.opentelemetry.io/otel/attribute"
19+
sdktrace "go.opentelemetry.io/otel/sdk/trace"
20+
"go.opentelemetry.io/otel/sdk/trace/tracetest"
1621

1722
"github.com/coder/aibridge"
23+
aibtracing "github.com/coder/aibridge/tracing"
1824
"github.com/coder/coder/v2/coderd/coderdtest"
1925
"github.com/coder/coder/v2/coderd/database"
2026
"github.com/coder/coder/v2/coderd/database/dbauthz"
@@ -28,18 +34,26 @@ import (
2834
"github.com/coder/coder/v2/testutil"
2935
)
3036

37+
var testTracer = otel.Tracer("aibridged_test")
38+
3139
// TestIntegration is not an exhaustive test against the upstream AI providers' SDKs (see coder/aibridge for those).
3240
// This test validates that:
3341
// - intercepted requests can be authenticated/authorized
3442
// - requests can be routed to an appropriate handler
3543
// - responses can be returned as expected
3644
// - interceptions are logged, as well as their related prompt, token, and tool calls
3745
// - MCP server configurations are returned as expected
46+
// - tracing spans are properly recorded
3847
func TestIntegration(t *testing.T) {
3948
t.Parallel()
4049

4150
ctx := testutil.Context(t, testutil.WaitLong)
4251

52+
sr := tracetest.NewSpanRecorder()
53+
tp := sdktrace.NewTracerProvider(sdktrace.WithSpanProcessor(sr))
54+
tracer := tp.Tracer(t.Name())
55+
defer func() { _ = tp.Shutdown(t.Context()) }()
56+
4357
// Create mock MCP server.
4458
var mcpTokenReceived string
4559
mockMCPServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
@@ -169,13 +183,13 @@ func TestIntegration(t *testing.T) {
169183

170184
logger := testutil.Logger(t)
171185
providers := []aibridge.Provider{aibridge.NewOpenAIProvider(aibridge.OpenAIConfig{BaseURL: mockOpenAI.URL})}
172-
pool, err := aibridged.NewCachedBridgePool(aibridged.DefaultPoolOptions, providers, nil, logger)
186+
pool, err := aibridged.NewCachedBridgePool(aibridged.DefaultPoolOptions, providers, logger, nil, tracer)
173187
require.NoError(t, err)
174188

175189
// Given: aibridged is started.
176190
srv, err := aibridged.New(t.Context(), pool, func(ctx context.Context) (aibridged.DRPCClient, error) {
177191
return aiBridgeClient, nil
178-
}, logger)
192+
}, logger, tracer)
179193
require.NoError(t, err, "create new aibridged")
180194
t.Cleanup(func() {
181195
_ = srv.Shutdown(ctx)
@@ -256,6 +270,44 @@ func TestIntegration(t *testing.T) {
256270

257271
// Then: the MCP server was initialized.
258272
require.Contains(t, mcpTokenReceived, authLink.OAuthAccessToken, "mock MCP server not requested")
273+
274+
// Then: verify tracing spans were recorded.
275+
spans := sr.Ended()
276+
require.NotEmpty(t, spans)
277+
i := slices.IndexFunc(spans, func(s sdktrace.ReadOnlySpan) bool { return s.Name() == "CachedBridgePool.Acquire" })
278+
require.NotEqual(t, -1, i, "span named 'CachedBridgePool.Acquire' not found")
279+
280+
expectAttrs := []attribute.KeyValue{
281+
attribute.String(aibtracing.InitiatorID, user.ID.String()),
282+
attribute.String(aibtracing.APIKeyID, keyID),
283+
}
284+
require.Equal(t, spans[i].Attributes(), expectAttrs)
285+
286+
// Check for aibridge spans.
287+
spanNames := make(map[string]bool)
288+
for _, span := range spans {
289+
spanNames[span.Name()] = true
290+
}
291+
292+
expectedAibridgeSpans := []string{
293+
"CachedBridgePool.Acquire",
294+
"ServerProxyManager.Init",
295+
"StreamableHTTPServerProxy.Init",
296+
"StreamableHTTPServerProxy.Init.fetchTools",
297+
"Intercept",
298+
"Intercept.CreateInterceptor",
299+
"Intercept.RecordInterception",
300+
"Intercept.ProcessRequest",
301+
"Intercept.ProcessRequest.Upstream",
302+
"Intercept.RecordPromptUsage",
303+
"Intercept.RecordTokenUsage",
304+
"Intercept.RecordToolUsage",
305+
"Intercept.RecordInterceptionEnded",
306+
}
307+
308+
for _, expectedSpan := range expectedAibridgeSpans {
309+
require.Contains(t, spanNames, expectedSpan)
310+
}
259311
}
260312

261313
// TestIntegrationWithMetrics validates that Prometheus metrics are correctly incremented
@@ -324,13 +376,13 @@ func TestIntegrationWithMetrics(t *testing.T) {
324376
providers := []aibridge.Provider{aibridge.NewOpenAIProvider(aibridge.OpenAIConfig{BaseURL: mockOpenAI.URL})}
325377

326378
// Create pool with metrics.
327-
pool, err := aibridged.NewCachedBridgePool(aibridged.DefaultPoolOptions, providers, metrics, logger)
379+
pool, err := aibridged.NewCachedBridgePool(aibridged.DefaultPoolOptions, providers, logger, metrics, testTracer)
328380
require.NoError(t, err)
329381

330382
// Given: aibridged is started.
331383
srv, err := aibridged.New(ctx, pool, func(ctx context.Context) (aibridged.DRPCClient, error) {
332384
return aiBridgeClient, nil
333-
}, logger)
385+
}, logger, testTracer)
334386
require.NoError(t, err, "create new aibridged")
335387
t.Cleanup(func() {
336388
_ = srv.Shutdown(ctx)

enterprise/aibridged/aibridged_test.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ func newTestServer(t *testing.T) (*aibridged.Server, *mock.MockDRPCClient, *mock
4141
pool,
4242
func(ctx context.Context) (aibridged.DRPCClient, error) {
4343
return client, nil
44-
}, logger)
44+
}, logger, testTracer)
4545
require.NoError(t, err, "create new aibridged")
4646
t.Cleanup(func() {
4747
srv.Shutdown(context.Background())
@@ -290,7 +290,7 @@ func TestRouting(t *testing.T) {
290290
aibridge.NewOpenAIProvider(aibridge.OpenAIConfig{BaseURL: openaiSrv.URL}),
291291
aibridge.NewAnthropicProvider(aibridge.AnthropicConfig{BaseURL: antSrv.URL}, nil),
292292
}
293-
pool, err := aibridged.NewCachedBridgePool(aibridged.DefaultPoolOptions, providers, nil, logger)
293+
pool, err := aibridged.NewCachedBridgePool(aibridged.DefaultPoolOptions, providers, logger, nil, testTracer)
294294
require.NoError(t, err)
295295
conn := &mockDRPCConn{}
296296
client.EXPECT().DRPCConn().AnyTimes().Return(conn)
@@ -309,7 +309,7 @@ func TestRouting(t *testing.T) {
309309
// Given: aibridged is started.
310310
srv, err := aibridged.New(t.Context(), pool, func(ctx context.Context) (aibridged.DRPCClient, error) {
311311
return client, nil
312-
}, logger)
312+
}, logger, testTracer)
313313
require.NoError(t, err, "create new aibridged")
314314
t.Cleanup(func() {
315315
_ = srv.Shutdown(testutil.Context(t, testutil.WaitShort))

enterprise/aibridged/mcp.go

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ import (
66
"regexp"
77
"time"
88

9+
"go.opentelemetry.io/otel/trace"
910
"golang.org/x/xerrors"
1011

1112
"cdr.dev/slog"
@@ -28,30 +29,32 @@ type MCPProxyBuilder interface {
2829
// The SessionKey from [Request] is used to authenticate against the Coder MCP server.
2930
//
3031
// NOTE: the [mcp.ServerProxier] instance may be proxying one or more MCP servers.
31-
Build(ctx context.Context, req Request) (mcp.ServerProxier, error)
32+
Build(ctx context.Context, req Request, tracer trace.Tracer) (mcp.ServerProxier, error)
3233
}
3334

3435
var _ MCPProxyBuilder = &MCPProxyFactory{}
3536

3637
type MCPProxyFactory struct {
3738
logger slog.Logger
39+
tracer trace.Tracer
3840
clientFn ClientFunc
3941
}
4042

41-
func NewMCPProxyFactory(logger slog.Logger, clientFn ClientFunc) *MCPProxyFactory {
43+
func NewMCPProxyFactory(logger slog.Logger, tracer trace.Tracer, clientFn ClientFunc) *MCPProxyFactory {
4244
return &MCPProxyFactory{
4345
logger: logger,
46+
tracer: tracer,
4447
clientFn: clientFn,
4548
}
4649
}
4750

48-
func (m *MCPProxyFactory) Build(ctx context.Context, req Request) (mcp.ServerProxier, error) {
51+
func (m *MCPProxyFactory) Build(ctx context.Context, req Request, tracer trace.Tracer) (mcp.ServerProxier, error) {
4952
proxiers, err := m.retrieveMCPServerConfigs(ctx, req)
5053
if err != nil {
5154
return nil, xerrors.Errorf("resolve configs: %w", err)
5255
}
5356

54-
return mcp.NewServerProxyManager(proxiers), nil
57+
return mcp.NewServerProxyManager(proxiers, tracer), nil
5558
}
5659

5760
func (m *MCPProxyFactory) retrieveMCPServerConfigs(ctx context.Context, req Request) (map[string]mcp.ServerProxier, error) {
@@ -173,7 +176,6 @@ func (m *MCPProxyFactory) newStreamableHTTPServerProxy(cfg *proto.MCPServerConfi
173176
// The proxy could then use its interface to retrieve a new access token and re-establish a connection.
174177
// For now though, the short TTL of this cache should mostly mask this problem.
175178
srv, err := mcp.NewStreamableHTTPServerProxy(
176-
m.logger.Named(fmt.Sprintf("mcp-server-proxy-%s", cfg.GetId())),
177179
cfg.GetId(),
178180
cfg.GetUrl(),
179181
// See https://modelcontextprotocol.io/specification/2025-06-18/basic/authorization#token-requirements.
@@ -182,6 +184,8 @@ func (m *MCPProxyFactory) newStreamableHTTPServerProxy(cfg *proto.MCPServerConfi
182184
},
183185
allowlist,
184186
denylist,
187+
m.logger.Named(fmt.Sprintf("mcp-server-proxy-%s", cfg.GetId())),
188+
m.tracer,
185189
)
186190
if err != nil {
187191
return nil, xerrors.Errorf("create streamable HTTP MCP server proxy: %w", err)

enterprise/aibridged/mcp_internal_test.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ import (
44
"testing"
55

66
"github.com/stretchr/testify/require"
7+
"go.opentelemetry.io/otel"
78

89
"github.com/coder/coder/v2/enterprise/aibridged/proto"
910
"github.com/coder/coder/v2/testutil"
@@ -42,7 +43,7 @@ func TestMCPRegex(t *testing.T) {
4243
t.Parallel()
4344

4445
logger := testutil.Logger(t)
45-
f := NewMCPProxyFactory(logger, nil)
46+
f := NewMCPProxyFactory(logger, otel.Tracer("aibridged_test"), nil)
4647

4748
_, err := f.newStreamableHTTPServerProxy(&proto.MCPServerConfig{
4849
Id: "mock",

0 commit comments

Comments
 (0)