Skip to content

Commit 2a15aa8

Browse files
feat: add hard-limited presets metric (#18008)
Closes #17988

Define the `preset_hard_limited` metric, which indicates for every preset whether it has reached the hard failure limit (1 for hard-limited, 0 otherwise).

CLI example:

```
curl -X GET localhost:2118/metrics | grep preset_hard_limited
# HELP coderd_prebuilt_workspaces_preset_hard_limited Indicates whether a given preset has reached the hard failure limit (1 for hard-limited, 0 otherwise).
# TYPE coderd_prebuilt_workspaces_preset_hard_limited gauge
coderd_prebuilt_workspaces_preset_hard_limited{organization_name="coder",preset_name="GoLand: Large",template_name="Test7"} 1
coderd_prebuilt_workspaces_preset_hard_limited{organization_name="coder",preset_name="GoLand: Large",template_name="ValidTemplate"} 0
coderd_prebuilt_workspaces_preset_hard_limited{organization_name="coder",preset_name="IU: Medium",template_name="Test7"} 1
coderd_prebuilt_workspaces_preset_hard_limited{organization_name="coder",preset_name="IU: Medium",template_name="ValidTemplate"} 0
coderd_prebuilt_workspaces_preset_hard_limited{organization_name="coder",preset_name="WS: Small",template_name="Test7"} 1
```

NOTE:

```go
if !ps.Preset.Deleted && ps.Preset.UsingActiveVersion {
	c.metrics.trackHardLimitedStatus(ps.Preset.OrganizationName, ps.Preset.TemplateName, ps.Preset.Name, ps.IsHardLimited)
}
```

Only the active template version is tracked. If an admin creates a new template version, the old value of the metric (for the previous template version) will be overwritten with the new value of the metric (for the active template version), because `template_version` is not part of the metric's labels:

```go
labels = []string{"template_name", "preset_name", "organization_name"}
```

The implementation is similar to the implementation of the `MetricResourceReplacementsCount` metric.

---------

Co-authored-by: Susana Ferreira <ssncferreira@gmail.com>
1 parent 0731304 commit 2a15aa8

File tree

3 files changed

+334
-11
lines changed

3 files changed

+334
-11
lines changed

enterprise/coderd/prebuilds/metricscollector.go

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ const (
2727
MetricDesiredGauge = namespace + "desired"
2828
MetricRunningGauge = namespace + "running"
2929
MetricEligibleGauge = namespace + "eligible"
30+
MetricPresetHardLimitedGauge = namespace + "preset_hard_limited"
3031
MetricLastUpdatedGauge = namespace + "metrics_last_updated"
3132
)
3233

@@ -82,6 +83,12 @@ var (
8283
labels,
8384
nil,
8485
)
86+
presetHardLimitedDesc = prometheus.NewDesc(
87+
MetricPresetHardLimitedGauge,
88+
"Indicates whether a given preset has reached the hard failure limit (1 = hard-limited). Metric is omitted otherwise.",
89+
labels,
90+
nil,
91+
)
8592
lastUpdateDesc = prometheus.NewDesc(
8693
MetricLastUpdatedGauge,
8794
"The unix timestamp when the metrics related to prebuilt workspaces were last updated; these metrics are cached.",
@@ -104,17 +111,22 @@ type MetricsCollector struct {
104111

105112
replacementsCounter map[replacementKey]float64
106113
replacementsCounterMu sync.Mutex
114+
115+
isPresetHardLimited map[hardLimitedPresetKey]bool
116+
isPresetHardLimitedMu sync.Mutex
107117
}
108118

109119
var _ prometheus.Collector = new(MetricsCollector)
110120

111121
// NewMetricsCollector constructs a MetricsCollector that uses db and snapshotter
// and logs under the "prebuilds_metrics_collector" name derived from logger.
// Both tracking maps (replacementsCounter and isPresetHardLimited) are
// initialized empty so they can be written to without a nil-map panic.
func NewMetricsCollector(db database.Store, logger slog.Logger, snapshotter prebuilds.StateSnapshotter) *MetricsCollector {
112122
log := logger.Named("prebuilds_metrics_collector")
123+
113124
return &MetricsCollector{
114125
database: db,
115126
logger: log,
116127
snapshotter: snapshotter,
117128
replacementsCounter: make(map[replacementKey]float64),
129+
isPresetHardLimited: make(map[hardLimitedPresetKey]bool),
118130
}
119131
}
120132

@@ -126,6 +138,7 @@ func (*MetricsCollector) Describe(descCh chan<- *prometheus.Desc) {
126138
descCh <- desiredPrebuildsDesc
127139
descCh <- runningPrebuildsDesc
128140
descCh <- eligiblePrebuildsDesc
141+
descCh <- presetHardLimitedDesc
129142
descCh <- lastUpdateDesc
130143
}
131144

@@ -173,6 +186,17 @@ func (mc *MetricsCollector) Collect(metricsCh chan<- prometheus.Metric) {
173186
metricsCh <- prometheus.MustNewConstMetric(eligiblePrebuildsDesc, prometheus.GaugeValue, float64(state.Eligible), preset.TemplateName, preset.Name, preset.OrganizationName)
174187
}
175188

189+
mc.isPresetHardLimitedMu.Lock()
190+
for key, isHardLimited := range mc.isPresetHardLimited {
191+
var val float64
192+
if isHardLimited {
193+
val = 1
194+
}
195+
196+
metricsCh <- prometheus.MustNewConstMetric(presetHardLimitedDesc, prometheus.GaugeValue, val, key.templateName, key.presetName, key.orgName)
197+
}
198+
mc.isPresetHardLimitedMu.Unlock()
199+
176200
metricsCh <- prometheus.MustNewConstMetric(lastUpdateDesc, prometheus.GaugeValue, float64(currentState.createdAt.Unix()))
177201
}
178202

@@ -247,3 +271,25 @@ func (mc *MetricsCollector) trackResourceReplacement(orgName, templateName, pres
247271
// cause an issue (or indeed if either would), so we just track the replacement.
248272
mc.replacementsCounter[key]++
249273
}
274+
275+
// hardLimitedPresetKey identifies a preset by organization name, template name,
// and preset name. It is the key type of MetricsCollector.isPresetHardLimited.
type hardLimitedPresetKey struct {
276+
orgName, templateName, presetName string
277+
}
278+
279+
// String renders the key as "<orgName>:<templateName>:<presetName>".
func (k hardLimitedPresetKey) String() string {
280+
return fmt.Sprintf("%s:%s:%s", k.orgName, k.templateName, k.presetName)
281+
}
282+
283+
// trackHardLimitedStatus records whether the preset identified by orgName,
// templateName and presetName is currently hard-limited. A hard-limited preset
// is stored in mc.isPresetHardLimited with the value true; otherwise any
// existing entry for the preset is removed, so the map only ever holds presets
// that are hard-limited. The map is guarded by mc.isPresetHardLimitedMu for the
// whole call.
//
// nolint:revive // isHardLimited determines if the preset should be reported as hard-limited in Prometheus.
284+
func (mc *MetricsCollector) trackHardLimitedStatus(orgName, templateName, presetName string, isHardLimited bool) {
285+
mc.isPresetHardLimitedMu.Lock()
286+
defer mc.isPresetHardLimitedMu.Unlock()
287+
288+
key := hardLimitedPresetKey{orgName: orgName, templateName: templateName, presetName: presetName}
289+
290+
if isHardLimited {
291+
mc.isPresetHardLimited[key] = true
292+
} else {
293+
delete(mc.isPresetHardLimited, key) // deleting a missing key is a no-op, so this is safe for presets never tracked
294+
}
295+
}

enterprise/coderd/prebuilds/reconcile.go

Lines changed: 19 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -361,17 +361,22 @@ func (c *StoreReconciler) ReconcilePreset(ctx context.Context, ps prebuilds.Pres
361361
slog.F("preset_name", ps.Preset.Name),
362362
)
363363

364-
// If the preset was previously hard-limited, log it and exit early.
365-
if ps.Preset.PrebuildStatus == database.PrebuildStatusHardLimited {
366-
logger.Warn(ctx, "skipping hard limited preset")
367-
return nil
368-
}
364+
// Report a preset as hard-limited only if all the following conditions are met:
365+
// - The preset is marked as hard-limited
366+
// - The preset is using the active version of its template, and the template has not been deleted
367+
//
368+
// The second condition is important because a hard-limited preset that has become outdated is no longer relevant.
369+
// Its associated prebuilt workspaces were likely deleted, and it's not meaningful to continue reporting it
370+
// as hard-limited to the admin.
371+
reportAsHardLimited := ps.IsHardLimited && ps.Preset.UsingActiveVersion && !ps.Preset.Deleted
372+
c.metrics.trackHardLimitedStatus(ps.Preset.OrganizationName, ps.Preset.TemplateName, ps.Preset.Name, reportAsHardLimited)
369373

370374
// If the preset reached the hard failure limit for the first time during this iteration:
371375
// - Mark it as hard-limited in the database
372376
// - Send notifications to template admins
373-
if ps.IsHardLimited {
374-
logger.Warn(ctx, "skipping hard limited preset")
377+
// - Continue execution, we disallow only creation operation for hard-limited presets. Deletion is allowed.
378+
if ps.Preset.PrebuildStatus != database.PrebuildStatusHardLimited && ps.IsHardLimited {
379+
logger.Warn(ctx, "preset is hard limited, notifying template admins")
375380

376381
err := c.store.UpdatePresetPrebuildStatus(ctx, database.UpdatePresetPrebuildStatusParams{
377382
Status: database.PrebuildStatusHardLimited,
@@ -384,10 +389,7 @@ func (c *StoreReconciler) ReconcilePreset(ctx context.Context, ps prebuilds.Pres
384389
err = c.notifyPrebuildFailureLimitReached(ctx, ps)
385390
if err != nil {
386391
logger.Error(ctx, "failed to notify that number of prebuild failures reached the limit", slog.Error(err))
387-
return nil
388392
}
389-
390-
return nil
391393
}
392394

393395
state := ps.CalculateState()
@@ -452,6 +454,13 @@ func (c *StoreReconciler) ReconcilePreset(ctx context.Context, ps prebuilds.Pres
452454
actions.Create = desired
453455
}
454456

457+
// If preset is hard-limited, and it's a create operation, log it and exit early.
458+
// Creation operation is disallowed for hard-limited preset.
459+
if ps.IsHardLimited && actions.Create > 0 {
460+
logger.Warn(ctx, "skipping hard limited preset for create operation")
461+
return nil
462+
}
463+
455464
var multiErr multierror.Error
456465

457466
for range actions.Create {

0 commit comments

Comments
 (0)