Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
77f9c6a
remove unnecessary calls to GetWorkspaceAgentByID for stats api calls
cstyan Oct 29, 2025
0bf08af
modify workspace caching, significantly reduce calls to
cstyan Oct 31, 2025
f447927
Add an IsEmpty check for rbac object.
cstyan Nov 14, 2025
9561447
don't need to wait on ctx after tickerfunc, plus log when we've exited
cstyan Nov 17, 2025
c5f0c60
only wrap context for the one database call
cstyan Nov 17, 2025
f77df04
formatting
cstyan Nov 17, 2025
c39830b
guard against the rbac object being invalid when adding it to the
cstyan Nov 17, 2025
8ae5993
lint
cstyan Nov 17, 2025
e99a8c5
fix tests never finishing; we need to wait on the tickerFunc finishing
cstyan Nov 17, 2025
b8d3604
don't cache prebuilds
cstyan Nov 18, 2025
31432ed
use the opts workspaceID in refreshCachedWorkspace now that we don't
cstyan Nov 18, 2025
40e423a
these shouldn't have been removed
cstyan Nov 18, 2025
dac1117
lint and fmt + move workspace rbac context stuff to a separate file
cstyan Nov 18, 2025
bdf076a
store pointer to cached workspace instead of functions
cstyan Nov 18, 2025
a77b310
we need to cache the AutostartSchedule for the stats api
cstyan Nov 18, 2025
02d84bd
move cached workspace struct to its own file, include mutex on the
cstyan Nov 18, 2025
afe37c9
make fmt and linter happy
cstyan Nov 18, 2025
15081e8
make cached workspace a pointer on the API struct
cstyan Nov 21, 2025
089a5ef
clear the cached workspace on DB error
cstyan Nov 21, 2025
a2ee19d
address more review feedback; attempt to simplify cached workspace is
cstyan Nov 21, 2025
1d48500
remove IsEmpty from rbac object, we'd need to handle too many rbac
cstyan Nov 21, 2025
96eb4c0
typo in function name
cstyan Nov 24, 2025
6811090
refactor so that functions which will now make use of the cached
cstyan Nov 24, 2025
81bc27a
fmt and lint again
cstyan Nov 25, 2025
8304864
cleanup/refactor tests, we no longer cache prebuilds but still want to
cstyan Nov 25, 2025
740348c
reduce logging for expected paths
cstyan Nov 25, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
77 changes: 72 additions & 5 deletions coderd/agentapi/api.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,8 @@ import (
"github.com/coder/quartz"
)

const workspaceCacheRefreshInterval = 5 * time.Minute

// API implements the DRPC agent API interface from agent/proto. This struct is
// instantiated once per agent connection and kept alive for the duration of the
// session.
Expand All @@ -54,6 +56,8 @@ type API struct {
*SubAgentAPI
*tailnet.DRPCService

cachedWorkspaceFields *CachedWorkspaceFields

mu sync.Mutex
}

Expand Down Expand Up @@ -92,7 +96,7 @@ type Options struct {
UpdateAgentMetricsFn func(ctx context.Context, labels prometheusmetrics.AgentMetricLabels, metrics []*agentproto.Stats_Metric)
}

func New(opts Options) *API {
func New(opts Options, workspace database.Workspace) *API {
if opts.Clock == nil {
opts.Clock = quartz.NewReal()
}
Expand All @@ -114,6 +118,13 @@ func New(opts Options) *API {
WorkspaceID: opts.WorkspaceID,
}

// Don't cache details for prebuilds, though the cached fields will eventually be updated
// by the refresh routine once the prebuild workspace is claimed.
api.cachedWorkspaceFields = &CachedWorkspaceFields{}
if !workspace.IsPrebuild() {
api.cachedWorkspaceFields.UpdateValues(workspace)
}

api.AnnouncementBannerAPI = &AnnouncementBannerAPI{
appearanceFetcher: opts.AppearanceFetcher,
}
Expand All @@ -139,6 +150,7 @@ func New(opts Options) *API {

api.StatsAPI = &StatsAPI{
AgentFn: api.agent,
Workspace: api.cachedWorkspaceFields,
Database: opts.Database,
Log: opts.Log,
StatsReporter: opts.StatsReporter,
Expand All @@ -162,10 +174,11 @@ func New(opts Options) *API {
}

api.MetadataAPI = &MetadataAPI{
AgentFn: api.agent,
Database: opts.Database,
Pubsub: opts.Pubsub,
Log: opts.Log,
AgentFn: api.agent,
Workspace: api.cachedWorkspaceFields,
Database: opts.Database,
Pubsub: opts.Pubsub,
Log: opts.Log,
}

api.LogsAPI = &LogsAPI{
Expand Down Expand Up @@ -205,6 +218,10 @@ func New(opts Options) *API {
Database: opts.Database,
}

// Start background cache refresh loop to handle workspace changes
// like prebuild claims where owner_id and other fields may be modified in the DB.
go api.startCacheRefreshLoop(opts.Ctx)

return api
}

Expand Down Expand Up @@ -254,6 +271,56 @@ func (a *API) agent(ctx context.Context) (database.WorkspaceAgent, error) {
return agent, nil
}

// refreshCachedWorkspace periodically updates the cached workspace fields.
// This ensures that changes like prebuild claims (which modify owner_id, name, etc.)
// are eventually reflected in the cache without requiring agent reconnection.
func (a *API) refreshCachedWorkspace(ctx context.Context) {
ws, err := a.opts.Database.GetWorkspaceByID(ctx, a.opts.WorkspaceID)
if err != nil {
a.opts.Log.Warn(ctx, "failed to refresh cached workspace fields", slog.Error(err))
a.cachedWorkspaceFields.Clear()
return
}

if ws.IsPrebuild() {
return
}

// If we still have the same values, skip the update and logging calls.
if a.cachedWorkspaceFields.identity.Equal(database.WorkspaceIdentityFromWorkspace(ws)) {
return
}
// Update fields that can change during workspace lifecycle (e.g., AutostartSchedule)
a.cachedWorkspaceFields.UpdateValues(ws)

a.opts.Log.Debug(ctx, "refreshed cached workspace fields",
slog.F("workspace_id", ws.ID),
slog.F("owner_id", ws.OwnerID),
slog.F("name", ws.Name))
Comment on lines +296 to +299
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we only log if the update yields a different workspace then what we had?

Not a huge deal as the logs are infrequent. Just trying to keep the logs down.

}

// startCacheRefreshLoop runs a background goroutine that periodically refreshes
// the cached workspace fields. This is primarily needed to handle prebuild claims
// where the owner_id and other fields change while the agent connection persists.
func (a *API) startCacheRefreshLoop(ctx context.Context) {
// Refresh every 5 minutes. This provides a reasonable balance between:
// - Keeping cache fresh for prebuild claims and other workspace updates
// - Minimizing unnecessary database queries
ticker := a.opts.Clock.TickerFunc(ctx, workspaceCacheRefreshInterval, func() error {
a.refreshCachedWorkspace(ctx)
return nil
}, "cache_refresh")

// We need to wait on the ticker exiting.
_ = ticker.Wait()

a.opts.Log.Debug(ctx, "cache refresh loop exited, invalidating the workspace cache on agent API",
slog.F("workspace_id", a.cachedWorkspaceFields.identity.ID),
slog.F("owner_id", a.cachedWorkspaceFields.identity.OwnerUsername),
slog.F("name", a.cachedWorkspaceFields.identity.Name))
a.cachedWorkspaceFields.Clear()
}

func (a *API) publishWorkspaceUpdate(ctx context.Context, agent *database.WorkspaceAgent, kind wspubsub.WorkspaceEventKind) error {
a.opts.PublishWorkspaceUpdateFn(ctx, a.opts.OwnerID, wspubsub.WorkspaceEvent{
Kind: kind,
Expand Down
52 changes: 52 additions & 0 deletions coderd/agentapi/cached_workspace.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
package agentapi

import (
"sync"

"github.com/coder/coder/v2/coderd/database"
)

// CachedWorkspaceFields contains workspace data that is safe to cache for the
// duration of an agent connection. These fields are used to reduce database calls
// in high-frequency operations like stats reporting and metadata updates.
// Prebuild workspaces should not be cached using this struct within the API struct,
// however some of these fields for a workspace can be updated live so there is a
// routine in the API for refreshing the workspace on a timed interval.
//
// IMPORTANT: ACL fields (GroupACL, UserACL) are NOT cached because they can be
// modified in the database and we must use fresh data for authorization checks.
type CachedWorkspaceFields struct {
lock sync.RWMutex

identity database.WorkspaceIdentity
}

func (cws *CachedWorkspaceFields) Clear() {
cws.lock.Lock()
defer cws.lock.Unlock()
cws.identity = database.WorkspaceIdentity{}
}

func (cws *CachedWorkspaceFields) UpdateValues(ws database.Workspace) {
cws.lock.Lock()
defer cws.lock.Unlock()
cws.identity.ID = ws.ID
cws.identity.OwnerID = ws.OwnerID
cws.identity.OrganizationID = ws.OrganizationID
cws.identity.TemplateID = ws.TemplateID
cws.identity.Name = ws.Name
cws.identity.OwnerUsername = ws.OwnerUsername
cws.identity.TemplateName = ws.TemplateName
cws.identity.AutostartSchedule = ws.AutostartSchedule
}

// Returns the Workspace, true, unless the workspace has not been cached (nuked or was a prebuild).
func (cws *CachedWorkspaceFields) AsWorkspaceIdentity() (database.WorkspaceIdentity, bool) {
cws.lock.RLock()
defer cws.lock.RUnlock()
// Should we be more explicit about all fields being set to be valid?
if cws.identity.Equal(database.WorkspaceIdentity{}) {
return database.WorkspaceIdentity{}, false
}
return cws.identity, true
}
97 changes: 97 additions & 0 deletions coderd/agentapi/cached_workspace_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
package agentapi_test

import (
"testing"

"github.com/google/uuid"
"github.com/stretchr/testify/require"

"github.com/coder/coder/v2/coderd/agentapi"
"github.com/coder/coder/v2/coderd/database"
)

func TestCacheClear(t *testing.T) {
t.Parallel()

var (
user = database.User{
ID: uuid.New(),
Username: "bill",
}
template = database.Template{
ID: uuid.New(),
Name: "tpl",
}
workspace = database.Workspace{
ID: uuid.New(),
OwnerID: user.ID,
OwnerUsername: user.Username,
TemplateID: template.ID,
Name: "xyz",
TemplateName: template.Name,
}
workspaceAsCacheFields = agentapi.CachedWorkspaceFields{}
)

workspaceAsCacheFields.UpdateValues(database.Workspace{
ID: workspace.ID,
OwnerID: workspace.OwnerID,
OwnerUsername: workspace.OwnerUsername,
TemplateID: workspace.TemplateID,
Name: workspace.Name,
TemplateName: workspace.TemplateName,
AutostartSchedule: workspace.AutostartSchedule,
},
)

emptyCws := agentapi.CachedWorkspaceFields{}
workspaceAsCacheFields.Clear()
wsi, ok := workspaceAsCacheFields.AsWorkspaceIdentity()
require.False(t, ok)
ecwsi, ok := emptyCws.AsWorkspaceIdentity()
require.False(t, ok)
require.True(t, ecwsi.Equal(wsi))
}

func TestCacheUpdate(t *testing.T) {
t.Parallel()

var (
user = database.User{
ID: uuid.New(),
Username: "bill",
}
template = database.Template{
ID: uuid.New(),
Name: "tpl",
}
workspace = database.Workspace{
ID: uuid.New(),
OwnerID: user.ID,
OwnerUsername: user.Username,
TemplateID: template.ID,
Name: "xyz",
TemplateName: template.Name,
}
workspaceAsCacheFields = agentapi.CachedWorkspaceFields{}
)

workspaceAsCacheFields.UpdateValues(database.Workspace{
ID: workspace.ID,
OwnerID: workspace.OwnerID,
OwnerUsername: workspace.OwnerUsername,
TemplateID: workspace.TemplateID,
Name: workspace.Name,
TemplateName: workspace.TemplateName,
AutostartSchedule: workspace.AutostartSchedule,
},
)

cws := agentapi.CachedWorkspaceFields{}
cws.UpdateValues(workspace)
wsi, ok := workspaceAsCacheFields.AsWorkspaceIdentity()
require.True(t, ok)
cwsi, ok := cws.AsWorkspaceIdentity()
require.True(t, ok)
require.True(t, wsi.Equal(cwsi))
}
24 changes: 19 additions & 5 deletions coderd/agentapi/metadata.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,15 +12,17 @@ import (
"cdr.dev/slog"
agentproto "github.com/coder/coder/v2/agent/proto"
"github.com/coder/coder/v2/coderd/database"
"github.com/coder/coder/v2/coderd/database/dbauthz"
"github.com/coder/coder/v2/coderd/database/dbtime"
"github.com/coder/coder/v2/coderd/database/pubsub"
)

type MetadataAPI struct {
AgentFn func(context.Context) (database.WorkspaceAgent, error)
Database database.Store
Pubsub pubsub.Pubsub
Log slog.Logger
AgentFn func(context.Context) (database.WorkspaceAgent, error)
Workspace *CachedWorkspaceFields
Database database.Store
Pubsub pubsub.Pubsub
Log slog.Logger

TimeNowFn func() time.Time // defaults to dbtime.Now()
}
Expand Down Expand Up @@ -107,7 +109,19 @@ func (a *MetadataAPI) BatchUpdateMetadata(ctx context.Context, req *agentproto.B
)
}

err = a.Database.UpdateWorkspaceAgentMetadata(ctx, dbUpdate)
// Inject RBAC object into context for dbauthz fast path, avoid having to
// call GetWorkspaceByAgentID on every metadata update.
rbacCtx := ctx
if dbws, ok := a.Workspace.AsWorkspaceIdentity(); ok {
rbacCtx, err = dbauthz.WithWorkspaceRBAC(ctx, dbws.RBACObject())
if err != nil {
// Don't error level log here, will exit the function. We want to fall back to GetWorkspaceByAgentID.
//nolint:gocritic
a.Log.Debug(ctx, "Cached workspace was present but RBAC object was invalid", slog.F("err", err))
}
}

err = a.Database.UpdateWorkspaceAgentMetadata(rbacCtx, dbUpdate)
if err != nil {
return nil, xerrors.Errorf("update workspace agent metadata in database: %w", err)
}
Expand Down
Loading
Loading