From 9c3484a8322dabce02f23c3d7c9ddeb86eee44ac Mon Sep 17 00:00:00 2001 From: evgeniy-scherbina Date: Tue, 27 May 2025 13:13:58 +0000 Subject: [PATCH 1/2] fix: reimplement reporting of preset-hard-limited metric --- coderd/prebuilds/global_snapshot.go | 35 +++++++----- .../coderd/prebuilds/metricscollector.go | 11 +--- enterprise/coderd/prebuilds/reconcile.go | 56 +++++++++++++------ enterprise/coderd/prebuilds/reconcile_test.go | 7 +-- 4 files changed, 64 insertions(+), 45 deletions(-) diff --git a/coderd/prebuilds/global_snapshot.go b/coderd/prebuilds/global_snapshot.go index f4c094289b54e..976461780fd07 100644 --- a/coderd/prebuilds/global_snapshot.go +++ b/coderd/prebuilds/global_snapshot.go @@ -12,11 +12,11 @@ import ( // GlobalSnapshot represents a full point-in-time snapshot of state relating to prebuilds across all templates. type GlobalSnapshot struct { - Presets []database.GetTemplatePresetsWithPrebuildsRow - RunningPrebuilds []database.GetRunningPrebuiltWorkspacesRow - PrebuildsInProgress []database.CountInProgressPrebuildsRow - Backoffs []database.GetPresetsBackoffRow - HardLimitedPresets []database.GetPresetsAtFailureLimitRow + Presets []database.GetTemplatePresetsWithPrebuildsRow + RunningPrebuilds []database.GetRunningPrebuiltWorkspacesRow + PrebuildsInProgress []database.CountInProgressPrebuildsRow + Backoffs []database.GetPresetsBackoffRow + HardLimitedPresetsMap map[uuid.UUID]database.GetPresetsAtFailureLimitRow } func NewGlobalSnapshot( @@ -26,12 +26,17 @@ func NewGlobalSnapshot( backoffs []database.GetPresetsBackoffRow, hardLimitedPresets []database.GetPresetsAtFailureLimitRow, ) GlobalSnapshot { + hardLimitedPresetsMap := make(map[uuid.UUID]database.GetPresetsAtFailureLimitRow, len(hardLimitedPresets)) + for _, preset := range hardLimitedPresets { + hardLimitedPresetsMap[preset.PresetID] = preset + } + return GlobalSnapshot{ - Presets: presets, - RunningPrebuilds: runningPrebuilds, - PrebuildsInProgress: prebuildsInProgress, - Backoffs: backoffs, - HardLimitedPresets: hardLimitedPresets, + Presets: presets, + RunningPrebuilds: runningPrebuilds, + PrebuildsInProgress: prebuildsInProgress, + Backoffs: backoffs, + HardLimitedPresetsMap: hardLimitedPresetsMap, } } @@ -66,9 +71,7 @@ func (s GlobalSnapshot) FilterByPreset(presetID uuid.UUID) (*PresetSnapshot, err backoffPtr = &backoff } - _, isHardLimited := slice.Find(s.HardLimitedPresets, func(row database.GetPresetsAtFailureLimitRow) bool { - return row.PresetID == preset.ID - }) + _, isHardLimited := s.HardLimitedPresetsMap[preset.ID] return &PresetSnapshot{ Preset: preset, @@ -80,6 +83,12 @@ func (s GlobalSnapshot) FilterByPreset(presetID uuid.UUID) (*PresetSnapshot, err }, nil } +func (s GlobalSnapshot) IsHardLimited(presetID uuid.UUID) bool { + _, isHardLimited := s.HardLimitedPresetsMap[presetID] + + return isHardLimited +} + // filterExpiredWorkspaces splits running workspaces into expired and non-expired // based on the preset's TTL. // If TTL is missing or zero, all workspaces are considered non-expired. diff --git a/enterprise/coderd/prebuilds/metricscollector.go b/enterprise/coderd/prebuilds/metricscollector.go index 90257c26dd580..4499849ffde0a 100644 --- a/enterprise/coderd/prebuilds/metricscollector.go +++ b/enterprise/coderd/prebuilds/metricscollector.go @@ -280,16 +280,9 @@ func (k hardLimitedPresetKey) String() string { return fmt.Sprintf("%s:%s:%s", k.orgName, k.templateName, k.presetName) } -// nolint:revive // isHardLimited determines if the preset should be reported as hard-limited in Prometheus. -func (mc *MetricsCollector) trackHardLimitedStatus(orgName, templateName, presetName string, isHardLimited bool) { +func (mc *MetricsCollector) registerHardLimitedPresets(isPresetHardLimited map[hardLimitedPresetKey]bool) { mc.isPresetHardLimitedMu.Lock() defer mc.isPresetHardLimitedMu.Unlock() - key := hardLimitedPresetKey{orgName: orgName, templateName: templateName, presetName: presetName} - - if isHardLimited { - mc.isPresetHardLimited[key] = true - } else { - delete(mc.isPresetHardLimited, key) - } + mc.isPresetHardLimited = isPresetHardLimited } diff --git a/enterprise/coderd/prebuilds/reconcile.go b/enterprise/coderd/prebuilds/reconcile.go index 90c97afa26d69..374b65cb3695d 100644 --- a/enterprise/coderd/prebuilds/reconcile.go +++ b/enterprise/coderd/prebuilds/reconcile.go @@ -256,6 +256,9 @@ func (c *StoreReconciler) ReconcileAll(ctx context.Context) error { if err != nil { return xerrors.Errorf("determine current snapshot: %w", err) } + + c.reportHardLimitedPresets(snapshot) + if len(snapshot.Presets) == 0 { logger.Debug(ctx, "no templates found with prebuilds configured") return nil @@ -296,6 +299,41 @@ func (c *StoreReconciler) ReconcileAll(ctx context.Context) error { return err } +func (c *StoreReconciler) reportHardLimitedPresets(snapshot *prebuilds.GlobalSnapshot) { + // presetsMap is a map from key (orgName:templateName:presetName) to list of corresponding presets. + // Multiple versions of a preset can exist with the same orgName, templateName, and presetName, + // because templates can have multiple versions — or deleted templates can share the same name. + presetsMap := make(map[hardLimitedPresetKey][]database.GetTemplatePresetsWithPrebuildsRow) + for _, preset := range snapshot.Presets { + key := hardLimitedPresetKey{ + orgName: preset.OrganizationName, + templateName: preset.TemplateName, + presetName: preset.Name, + } + + presetsMap[key] = append(presetsMap[key], preset) + } + + // Report a preset as hard-limited only if all the following conditions are met: + // - The preset is marked as hard-limited + // - The preset is using the active version of its template, and the template has not been deleted + // + // The second condition is important because a hard-limited preset that has become outdated is no longer relevant. + // Its associated prebuilt workspaces were likely deleted, and it's not meaningful to continue reporting it + // as hard-limited to the admin. + isPresetHardLimited := make(map[hardLimitedPresetKey]bool) + for key, presets := range presetsMap { + for _, preset := range presets { + if preset.UsingActiveVersion && !preset.Deleted && snapshot.IsHardLimited(preset.ID) { + isPresetHardLimited[key] = true + break + } + } + } + + c.metrics.registerHardLimitedPresets(isPresetHardLimited) +} + // SnapshotState captures the current state of all prebuilds across templates. func (c *StoreReconciler) SnapshotState(ctx context.Context, store database.Store) (*prebuilds.GlobalSnapshot, error) { if err := ctx.Err(); err != nil { @@ -361,24 +399,6 @@ func (c *StoreReconciler) ReconcilePreset(ctx context.Context, ps prebuilds.Pres slog.F("preset_name", ps.Preset.Name), ) - // Report a metric only if the preset uses the latest version of the template and the template is not deleted. - // This avoids conflicts between metrics from old and new template versions. - // - // NOTE: Multiple versions of a preset can exist with the same orgName, templateName, and presetName, - // because templates can have multiple versions — or deleted templates can share the same name. - // - // The safest approach is to report the metric only for the latest version of the preset. - // When a new template version is released, the metric for the new preset should overwrite - // the old value in Prometheus. - // - // However, there’s one edge case: if an admin creates a template, it becomes hard-limited, - // then deletes the template and never creates another with the same name, - // the old preset will continue to be reported as hard-limited — - // even though it’s deleted. This will persist until `coderd` is restarted. - if ps.Preset.UsingActiveVersion && !ps.Preset.Deleted { - c.metrics.trackHardLimitedStatus(ps.Preset.OrganizationName, ps.Preset.TemplateName, ps.Preset.Name, ps.IsHardLimited) - } - // If the preset reached the hard failure limit for the first time during this iteration: // - Mark it as hard-limited in the database // - Send notifications to template admins diff --git a/enterprise/coderd/prebuilds/reconcile_test.go b/enterprise/coderd/prebuilds/reconcile_test.go index 7de22db64c8be..a0e1f9726d7d5 100644 --- a/enterprise/coderd/prebuilds/reconcile_test.go +++ b/enterprise/coderd/prebuilds/reconcile_test.go @@ -1034,8 +1034,7 @@ func TestHardLimitedPresetShouldNotBlockDeletion(t *testing.T) { require.Equal(t, database.WorkspaceTransitionDelete, workspaceBuilds[0].Transition) require.Equal(t, database.WorkspaceTransitionStart, workspaceBuilds[1].Transition) - // The metric is still set to 1, even though the preset has become outdated. - // This happens because the old value hasn't been overwritten by a newer preset yet. + // Metric is deleted after preset became outdated. mf, err = registry.Gather() require.NoError(t, err) metric = findMetric(mf, prebuilds.MetricPresetHardLimitedGauge, map[string]string{ @@ -1043,9 +1042,7 @@ func TestHardLimitedPresetShouldNotBlockDeletion(t *testing.T) { "preset_name": preset.Name, "org_name": org.Name, }) - require.NotNil(t, metric) - require.NotNil(t, metric.GetGauge()) - require.EqualValues(t, 1, metric.GetGauge().GetValue()) + require.Nil(t, metric) }) } } From 6505913b1579d9570091df2ab1df50077224219a Mon Sep 17 00:00:00 2001 From: evgeniy-scherbina Date: Tue, 27 May 2025 15:44:17 +0000 Subject: [PATCH 2/2] refactor: improve docs --- enterprise/coderd/prebuilds/reconcile.go | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/enterprise/coderd/prebuilds/reconcile.go b/enterprise/coderd/prebuilds/reconcile.go index 374b65cb3695d..ebfcfaf2b3182 100644 --- a/enterprise/coderd/prebuilds/reconcile.go +++ b/enterprise/coderd/prebuilds/reconcile.go @@ -321,6 +321,14 @@ func (c *StoreReconciler) reportHardLimitedPresets(snapshot *prebuilds.GlobalSna // The second condition is important because a hard-limited preset that has become outdated is no longer relevant. // Its associated prebuilt workspaces were likely deleted, and it's not meaningful to continue reporting it // as hard-limited to the admin. + // + // This approach accounts for all relevant scenarios: + // Scenario #1: The admin created a new template version with the same preset names. + // Scenario #2: The admin created a new template version and renamed the presets. + // Scenario #3: The admin deleted a template version that contained hard-limited presets. + // + // In all of these cases, only the latest and non-deleted presets will be reported. + // All other presets will be ignored and eventually removed from Prometheus. isPresetHardLimited := make(map[hardLimitedPresetKey]bool) for key, presets := range presetsMap { for _, preset := range presets {