Skip to content

Commit 0d27d6b

Browse files
committed
feat(agent): add boundary log forwarding to coderd
Add a feature that transmits boundary audit logs from workspaces to coderd via the agent API, then re-emits them to stderr in a structured format. The implementation includes: - BoundaryLog proto messages and ReportBoundaryLogs RPC (API v2.7) - BoundaryLogProxy server that accepts connections from boundary processes on a Unix socket and forwards logs to coderd - Server-side handler that formats logs to stderr - Environment variables CODER_BOUNDARY_LOG_SOCKET and CODER_WORKSPACE_ID automatically set for all commands in the workspace Architecture: - Boundary process connects to Unix socket at $CODER_BOUNDARY_LOG_SOCKET - Sends length-prefixed protobuf ReportBoundaryLogsRequest messages - Agent proxies messages to coderd via DRPC - coderd re-emits to stderr Log format: [API] 2025-12-08 20:58:46.093 [warn] boundary: workspace.id=... decision=deny http.method="GET" http.url="..." time="..."
1 parent 8ed1c1d commit 0d27d6b

File tree

14 files changed

+1768
-2226
lines changed

14 files changed

+1768
-2226
lines changed

agent/agent.go

Lines changed: 70 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ import (
4343
"github.com/coder/coder/v2/agent/agentscripts"
4444
"github.com/coder/coder/v2/agent/agentsocket"
4545
"github.com/coder/coder/v2/agent/agentssh"
46+
"github.com/coder/coder/v2/agent/boundarylogproxy"
4647
"github.com/coder/coder/v2/agent/proto"
4748
"github.com/coder/coder/v2/agent/proto/resourcesmonitor"
4849
"github.com/coder/coder/v2/agent/reconnectingpty"
@@ -103,8 +104,8 @@ type Options struct {
103104
}
104105

105106
type Client interface {
106-
ConnectRPC26(ctx context.Context) (
107-
proto.DRPCAgentClient26, tailnetproto.DRPCTailnetClient26, error,
107+
ConnectRPC27(ctx context.Context) (
108+
proto.DRPCAgentClient27, tailnetproto.DRPCTailnetClient26, error,
108109
)
109110
tailnet.DERPMapRewriter
110111
agentsdk.RefreshableSessionTokenProvider
@@ -275,6 +276,9 @@ type agent struct {
275276

276277
logSender *agentsdk.LogSender
277278

279+
boundaryLogProxy *boundarylogproxy.Server
280+
boundaryLogProxyMu sync.Mutex
281+
278282
prometheusRegistry *prometheus.Registry
279283
// metrics are prometheus registered metrics that will be collected and
280284
// labeled in Coder with the agent + workspace.
@@ -369,6 +373,7 @@ func (a *agent) init() {
369373
)
370374

371375
a.initSocketServer()
376+
a.startBoundaryLogSocketServer()
372377

373378
go a.runLoop()
374379
}
@@ -393,6 +398,38 @@ func (a *agent) initSocketServer() {
393398
a.logger.Debug(a.hardCtx, "socket server started", slog.F("path", a.socketPath))
394399
}
395400

401+
// startBoundaryLogSocketServer starts the boundary log proxy socket server.
402+
// This creates the Unix socket and begins accepting connections from boundary
403+
// processes. Logs are buffered until the forwarder is started via startAgentAPI.
404+
func (a *agent) startBoundaryLogSocketServer() {
405+
a.boundaryLogProxyMu.Lock()
406+
defer a.boundaryLogProxyMu.Unlock()
407+
408+
if a.boundaryLogProxy != nil {
409+
// Already started.
410+
return
411+
}
412+
413+
// Create socket path in temp directory.
414+
socketPath := filepath.Join(a.tempDir, "boundary-logs.sock")
415+
416+
proxy := boundarylogproxy.NewServer(a.logger, socketPath)
417+
if err := proxy.Start(a.hardCtx); err != nil {
418+
a.logger.Warn(a.hardCtx, "failed to start boundary log proxy", slog.Error(err))
419+
return
420+
}
421+
422+
a.boundaryLogProxy = proxy
423+
a.logger.Info(a.hardCtx, "boundary log socket server started",
424+
slog.F("socket_path", socketPath))
425+
}
426+
427+
// forwardBoundaryLogs forwards buffered boundary audit logs to coderd.
428+
// This is called via startAgentAPI to ensure the API client is always current.
429+
func (a *agent) forwardBoundaryLogs(ctx context.Context, aAPI proto.DRPCAgentClient27) error {
430+
return a.boundaryLogProxy.RunForwarder(ctx, aAPI)
431+
}
432+
396433
// runLoop attempts to start the agent in a retry loop.
397434
// Coder may be offline temporarily, a connection issue
398435
// may be happening, but regardless after the intermittent
@@ -503,7 +540,7 @@ func (t *trySingleflight) Do(key string, fn func()) {
503540
fn()
504541
}
505542

506-
func (a *agent) reportMetadata(ctx context.Context, aAPI proto.DRPCAgentClient26) error {
543+
func (a *agent) reportMetadata(ctx context.Context, aAPI proto.DRPCAgentClient27) error {
507544
tickerDone := make(chan struct{})
508545
collectDone := make(chan struct{})
509546
ctx, cancel := context.WithCancel(ctx)
@@ -718,7 +755,7 @@ func (a *agent) reportMetadata(ctx context.Context, aAPI proto.DRPCAgentClient26
718755

719756
// reportLifecycle reports the current lifecycle state once. All state
720757
// changes are reported in order.
721-
func (a *agent) reportLifecycle(ctx context.Context, aAPI proto.DRPCAgentClient26) error {
758+
func (a *agent) reportLifecycle(ctx context.Context, aAPI proto.DRPCAgentClient27) error {
722759
for {
723760
select {
724761
case <-a.lifecycleUpdate:
@@ -798,7 +835,7 @@ func (a *agent) setLifecycle(state codersdk.WorkspaceAgentLifecycle) {
798835
}
799836

800837
// reportConnectionsLoop reports connections to the agent for auditing.
801-
func (a *agent) reportConnectionsLoop(ctx context.Context, aAPI proto.DRPCAgentClient26) error {
838+
func (a *agent) reportConnectionsLoop(ctx context.Context, aAPI proto.DRPCAgentClient27) error {
802839
for {
803840
select {
804841
case <-a.reportConnectionsUpdate:
@@ -929,7 +966,7 @@ func (a *agent) reportConnection(id uuid.UUID, connectionType proto.Connection_T
929966
// fetchServiceBannerLoop fetches the service banner on an interval. It will
930967
// not be fetched immediately; the expectation is that it is primed elsewhere
931968
// (and must be done before the session actually starts).
932-
func (a *agent) fetchServiceBannerLoop(ctx context.Context, aAPI proto.DRPCAgentClient26) error {
969+
func (a *agent) fetchServiceBannerLoop(ctx context.Context, aAPI proto.DRPCAgentClient27) error {
933970
ticker := time.NewTicker(a.announcementBannersRefreshInterval)
934971
defer ticker.Stop()
935972
for {
@@ -964,7 +1001,7 @@ func (a *agent) run() (retErr error) {
9641001
}
9651002

9661003
// ConnectRPC returns the dRPC connection we use for the Agent and Tailnet v2+ APIs
967-
aAPI, tAPI, err := a.client.ConnectRPC26(a.hardCtx)
1004+
aAPI, tAPI, err := a.client.ConnectRPC27(a.hardCtx)
9681005
if err != nil {
9691006
return err
9701007
}
@@ -981,7 +1018,7 @@ func (a *agent) run() (retErr error) {
9811018
connMan := newAPIConnRoutineManager(a.gracefulCtx, a.hardCtx, a.logger, aAPI, tAPI)
9821019

9831020
connMan.startAgentAPI("init notification banners", gracefulShutdownBehaviorStop,
984-
func(ctx context.Context, aAPI proto.DRPCAgentClient26) error {
1021+
func(ctx context.Context, aAPI proto.DRPCAgentClient27) error {
9851022
bannersProto, err := aAPI.GetAnnouncementBanners(ctx, &proto.GetAnnouncementBannersRequest{})
9861023
if err != nil {
9871024
return xerrors.Errorf("fetch service banner: %w", err)
@@ -998,7 +1035,7 @@ func (a *agent) run() (retErr error) {
9981035
// sending logs gets gracefulShutdownBehaviorRemain because we want to send logs generated by
9991036
// shutdown scripts.
10001037
connMan.startAgentAPI("send logs", gracefulShutdownBehaviorRemain,
1001-
func(ctx context.Context, aAPI proto.DRPCAgentClient26) error {
1038+
func(ctx context.Context, aAPI proto.DRPCAgentClient27) error {
10021039
err := a.logSender.SendLoop(ctx, aAPI)
10031040
if xerrors.Is(err, agentsdk.ErrLogLimitExceeded) {
10041041
// we don't want this error to tear down the API connection and propagate to the
@@ -1009,6 +1046,11 @@ func (a *agent) run() (retErr error) {
10091046
return err
10101047
})
10111048

1049+
1050+
// Forward boundary audit logs to coderd. These are audit logs so they should
1051+
// be forwarded during graceful shutdown (gracefulShutdownBehaviorRemain).
1052+
connMan.startAgentAPI("boundary log forwarder", gracefulShutdownBehaviorRemain, a.forwardBoundaryLogs)
1053+
10121054
// part of graceful shut down is reporting the final lifecycle states, e.g "ShuttingDown" so the
10131055
// lifecycle reporting has to be via gracefulShutdownBehaviorRemain
10141056
connMan.startAgentAPI("report lifecycle", gracefulShutdownBehaviorRemain, a.reportLifecycle)
@@ -1017,7 +1059,7 @@ func (a *agent) run() (retErr error) {
10171059
connMan.startAgentAPI("report metadata", gracefulShutdownBehaviorStop, a.reportMetadata)
10181060

10191061
// resources monitor can cease as soon as we start gracefully shutting down.
1020-
connMan.startAgentAPI("resources monitor", gracefulShutdownBehaviorStop, func(ctx context.Context, aAPI proto.DRPCAgentClient26) error {
1062+
connMan.startAgentAPI("resources monitor", gracefulShutdownBehaviorStop, func(ctx context.Context, aAPI proto.DRPCAgentClient27) error {
10211063
logger := a.logger.Named("resources_monitor")
10221064
clk := quartz.NewReal()
10231065
config, err := aAPI.GetResourcesMonitoringConfiguration(ctx, &proto.GetResourcesMonitoringConfigurationRequest{})
@@ -1064,7 +1106,7 @@ func (a *agent) run() (retErr error) {
10641106
connMan.startAgentAPI("handle manifest", gracefulShutdownBehaviorStop, a.handleManifest(manifestOK))
10651107

10661108
connMan.startAgentAPI("app health reporter", gracefulShutdownBehaviorStop,
1067-
func(ctx context.Context, aAPI proto.DRPCAgentClient26) error {
1109+
func(ctx context.Context, aAPI proto.DRPCAgentClient27) error {
10681110
if err := manifestOK.wait(ctx); err != nil {
10691111
return xerrors.Errorf("no manifest: %w", err)
10701112
}
@@ -1097,7 +1139,7 @@ func (a *agent) run() (retErr error) {
10971139

10981140
connMan.startAgentAPI("fetch service banner loop", gracefulShutdownBehaviorStop, a.fetchServiceBannerLoop)
10991141

1100-
connMan.startAgentAPI("stats report loop", gracefulShutdownBehaviorStop, func(ctx context.Context, aAPI proto.DRPCAgentClient26) error {
1142+
connMan.startAgentAPI("stats report loop", gracefulShutdownBehaviorStop, func(ctx context.Context, aAPI proto.DRPCAgentClient27) error {
11011143
if err := networkOK.wait(ctx); err != nil {
11021144
return xerrors.Errorf("no network: %w", err)
11031145
}
@@ -1112,8 +1154,8 @@ func (a *agent) run() (retErr error) {
11121154
}
11131155

11141156
// handleManifest returns a function that fetches and processes the manifest
1115-
func (a *agent) handleManifest(manifestOK *checkpoint) func(ctx context.Context, aAPI proto.DRPCAgentClient26) error {
1116-
return func(ctx context.Context, aAPI proto.DRPCAgentClient26) error {
1157+
func (a *agent) handleManifest(manifestOK *checkpoint) func(ctx context.Context, aAPI proto.DRPCAgentClient27) error {
1158+
return func(ctx context.Context, aAPI proto.DRPCAgentClient27) error {
11171159
var (
11181160
sentResult = false
11191161
err error
@@ -1276,7 +1318,7 @@ func (a *agent) handleManifest(manifestOK *checkpoint) func(ctx context.Context,
12761318

12771319
func (a *agent) createDevcontainer(
12781320
ctx context.Context,
1279-
aAPI proto.DRPCAgentClient26,
1321+
aAPI proto.DRPCAgentClient27,
12801322
dc codersdk.WorkspaceAgentDevcontainer,
12811323
script codersdk.WorkspaceAgentScript,
12821324
) (err error) {
@@ -1308,8 +1350,8 @@ func (a *agent) createDevcontainer(
13081350

13091351
// createOrUpdateNetwork waits for the manifest to be set using manifestOK, then creates or updates
13101352
// the tailnet using the information in the manifest
1311-
func (a *agent) createOrUpdateNetwork(manifestOK, networkOK *checkpoint) func(context.Context, proto.DRPCAgentClient26) error {
1312-
return func(ctx context.Context, aAPI proto.DRPCAgentClient26) (retErr error) {
1353+
func (a *agent) createOrUpdateNetwork(manifestOK, networkOK *checkpoint) func(context.Context, proto.DRPCAgentClient27) error {
1354+
return func(ctx context.Context, aAPI proto.DRPCAgentClient27) (retErr error) {
13131355
if err := manifestOK.wait(ctx); err != nil {
13141356
return xerrors.Errorf("no manifest: %w", err)
13151357
}
@@ -1398,6 +1440,7 @@ func (a *agent) updateCommandEnv(current []string) (updated []string, err error)
13981440
"CODER_WORKSPACE_NAME": manifest.WorkspaceName,
13991441
"CODER_WORKSPACE_AGENT_NAME": manifest.AgentName,
14001442
"CODER_WORKSPACE_OWNER_NAME": manifest.OwnerName,
1443+
"CODER_WORKSPACE_ID": manifest.WorkspaceID.String(),
14011444

14021445
// Specific Coder subcommands require the agent token exposed!
14031446
"CODER_AGENT_TOKEN": a.client.GetSessionToken(),
@@ -1409,6 +1452,13 @@ func (a *agent) updateCommandEnv(current []string) (updated []string, err error)
14091452
"CS_DISABLE_GETTING_STARTED_OVERRIDE": "true",
14101453
}
14111454

1455+
// Add boundary log socket path if the proxy is running.
1456+
a.boundaryLogProxyMu.Lock()
1457+
if a.boundaryLogProxy != nil {
1458+
envs["CODER_BOUNDARY_LOG_SOCKET"] = a.boundaryLogProxy.SocketPath()
1459+
}
1460+
a.boundaryLogProxyMu.Unlock()
1461+
14121462
// This adds the ports dialog to code-server that enables
14131463
// proxying a port dynamically.
14141464
// If this is empty string, do not set anything. Code-server auto defaults
@@ -2095,7 +2145,7 @@ const (
20952145

20962146
type apiConnRoutineManager struct {
20972147
logger slog.Logger
2098-
aAPI proto.DRPCAgentClient26
2148+
aAPI proto.DRPCAgentClient27
20992149
tAPI tailnetproto.DRPCTailnetClient24
21002150
eg *errgroup.Group
21012151
stopCtx context.Context
@@ -2104,7 +2154,7 @@ type apiConnRoutineManager struct {
21042154

21052155
func newAPIConnRoutineManager(
21062156
gracefulCtx, hardCtx context.Context, logger slog.Logger,
2107-
aAPI proto.DRPCAgentClient26, tAPI tailnetproto.DRPCTailnetClient24,
2157+
aAPI proto.DRPCAgentClient27, tAPI tailnetproto.DRPCTailnetClient24,
21082158
) *apiConnRoutineManager {
21092159
// routines that remain in operation during graceful shutdown use the remainCtx. They'll still
21102160
// exit if the errgroup hits an error, which usually means a problem with the conn.
@@ -2137,7 +2187,7 @@ func newAPIConnRoutineManager(
21372187
// but for Tailnet.
21382188
func (a *apiConnRoutineManager) startAgentAPI(
21392189
name string, behavior gracefulShutdownBehavior,
2140-
f func(context.Context, proto.DRPCAgentClient26) error,
2190+
f func(context.Context, proto.DRPCAgentClient27) error,
21412191
) {
21422192
logger := a.logger.With(slog.F("name", name))
21432193
var ctx context.Context

agent/agentcontainers/subagent.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -147,12 +147,12 @@ type SubAgentClient interface {
147147
// agent API client.
148148
type subAgentAPIClient struct {
149149
logger slog.Logger
150-
api agentproto.DRPCAgentClient26
150+
api agentproto.DRPCAgentClient27
151151
}
152152

153153
var _ SubAgentClient = (*subAgentAPIClient)(nil)
154154

155-
func NewSubAgentClientFromAPI(logger slog.Logger, agentAPI agentproto.DRPCAgentClient26) SubAgentClient {
155+
func NewSubAgentClientFromAPI(logger slog.Logger, agentAPI agentproto.DRPCAgentClient27) SubAgentClient {
156156
if agentAPI == nil {
157157
panic("developer error: agentAPI cannot be nil")
158158
}

agent/agentcontainers/subagent_test.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,7 @@ func TestSubAgentClient_CreateWithDisplayApps(t *testing.T) {
8181

8282
agentAPI := agenttest.NewClient(t, logger, uuid.New(), agentsdk.Manifest{}, statsCh, tailnet.NewCoordinator(logger))
8383

84-
agentClient, _, err := agentAPI.ConnectRPC26(ctx)
84+
agentClient, _, err := agentAPI.ConnectRPC27(ctx)
8585
require.NoError(t, err)
8686

8787
subAgentClient := agentcontainers.NewSubAgentClientFromAPI(logger, agentClient)
@@ -245,7 +245,7 @@ func TestSubAgentClient_CreateWithDisplayApps(t *testing.T) {
245245

246246
agentAPI := agenttest.NewClient(t, logger, uuid.New(), agentsdk.Manifest{}, statsCh, tailnet.NewCoordinator(logger))
247247

248-
agentClient, _, err := agentAPI.ConnectRPC26(ctx)
248+
agentClient, _, err := agentAPI.ConnectRPC27(ctx)
249249
require.NoError(t, err)
250250

251251
subAgentClient := agentcontainers.NewSubAgentClientFromAPI(logger, agentClient)

agent/agenttest/client.go

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -124,8 +124,8 @@ func (c *Client) Close() {
124124
c.derpMapOnce.Do(func() { close(c.derpMapUpdates) })
125125
}
126126

127-
func (c *Client) ConnectRPC26(ctx context.Context) (
128-
agentproto.DRPCAgentClient26, proto.DRPCTailnetClient26, error,
127+
func (c *Client) ConnectRPC27(ctx context.Context) (
128+
agentproto.DRPCAgentClient27, proto.DRPCTailnetClient26, error,
129129
) {
130130
conn, lis := drpcsdk.MemTransportPipe()
131131
c.LastWorkspaceAgent = func() {
@@ -405,6 +405,10 @@ func (f *FakeAgentAPI) ReportConnection(_ context.Context, req *agentproto.Repor
405405
return &emptypb.Empty{}, nil
406406
}
407407

408+
409+
func (f *FakeAgentAPI) ReportBoundaryLogs(_ context.Context, _ *agentproto.ReportBoundaryLogsRequest) (*agentproto.ReportBoundaryLogsResponse, error) {
410+
return &agentproto.ReportBoundaryLogsResponse{}, nil
411+
}
408412
func (f *FakeAgentAPI) GetConnectionReports() []*agentproto.ReportConnectionRequest {
409413
f.Lock()
410414
defer f.Unlock()

0 commit comments

Comments
 (0)