Skip to content

Commit 09f7305

Browse files
committed
Review feedback
Signed-off-by: Danny Kopping <danny@coder.com>
1 parent 88451a1 commit 09f7305

File tree

10 files changed

+194
-72
lines changed

10 files changed

+194
-72
lines changed

coderd/database/dbmock/dbmock.go

Lines changed: 2 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

coderd/database/dump.sql

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
11
ALTER TABLE notification_messages
2-
ADD COLUMN queued_seconds INTEGER NULL;
2+
ADD COLUMN queued_seconds FLOAT NULL;

coderd/database/models.go

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

coderd/database/queries.sql.go

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

coderd/database/queries/notifications.sql

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ RETURNING id;
3636
WITH acquired AS (
3737
UPDATE
3838
notification_messages
39-
SET queued_seconds = CEIL(GREATEST(0, EXTRACT(EPOCH FROM (NOW() - updated_at))))::INTEGER,
39+
SET queued_seconds = GREATEST(0, EXTRACT(EPOCH FROM (NOW() - updated_at)))::FLOAT,
4040
updated_at = NOW(),
4141
status = 'leased'::notification_message_status,
4242
status_reason = 'Leased by notifier ' || sqlc.arg('notifier_id')::uuid,

coderd/notifications/manager.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -222,6 +222,8 @@ func (m *Manager) syncUpdates(ctx context.Context) {
222222
nSuccess := len(m.success)
223223
nFailure := len(m.failure)
224224

225+
m.metrics.PendingUpdates.Set(float64(len(m.success) + len(m.failure)))
226+
225227
// Nothing to do.
226228
if nSuccess+nFailure == 0 {
227229
return
@@ -283,6 +285,7 @@ func (m *Manager) syncUpdates(ctx context.Context) {
283285
logger.Error(ctx, "bulk update failed", slog.Error(err))
284286
return
285287
}
288+
m.metrics.SyncedUpdates.Add(float64(n))
286289

287290
logger.Debug(ctx, "bulk update completed", slog.F("updated", n))
288291
}()
@@ -306,6 +309,7 @@ func (m *Manager) syncUpdates(ctx context.Context) {
306309
logger.Error(ctx, "bulk update failed", slog.Error(err))
307310
return
308311
}
312+
m.metrics.SyncedUpdates.Add(float64(n))
309313

310314
logger.Debug(ctx, "bulk update completed", slog.F("updated", n))
311315
}()

coderd/notifications/metrics.go

Lines changed: 28 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,45 +1,46 @@
11
package notifications
22

33
import (
4+
"fmt"
5+
"strings"
6+
47
"github.com/prometheus/client_golang/prometheus"
58
"github.com/prometheus/client_golang/prometheus/promauto"
69
)
710

811
type Metrics struct {
9-
DispatchedCount *prometheus.CounterVec
10-
TempFailureCount *prometheus.CounterVec
11-
PermFailureCount *prometheus.CounterVec
12+
DispatchAttempts *prometheus.CounterVec
1213
RetryCount *prometheus.CounterVec
1314

1415
QueuedSeconds *prometheus.HistogramVec
1516

17+
InflightDispatches *prometheus.GaugeVec
1618
DispatcherSendSeconds *prometheus.HistogramVec
1719

1820
PendingUpdates prometheus.Gauge
21+
SyncedUpdates prometheus.Counter
1922
}
2023

2124
const (
2225
ns = "coderd"
2326
subsystem = "notifications"
2427

2528
LabelMethod = "method"
26-
LabelTemplateID = "template_id"
29+
LabelTemplateID = "notification_template_id"
30+
LabelResult = "result"
31+
32+
ResultSuccess = "success"
33+
ResultTempFail = "temp_fail"
34+
ResultPermFail = "perm_fail"
2735
)
2836

2937
func NewMetrics(reg prometheus.Registerer) *Metrics {
3038
return &Metrics{
31-
DispatchedCount: promauto.With(reg).NewCounterVec(prometheus.CounterOpts{
32-
Name: "dispatched_count", Namespace: ns, Subsystem: subsystem,
33-
Help: "The count of notifications successfully dispatched.",
34-
}, []string{LabelMethod, LabelTemplateID}),
35-
TempFailureCount: promauto.With(reg).NewCounterVec(prometheus.CounterOpts{
36-
Name: "temporary_failures_count", Namespace: ns, Subsystem: subsystem,
37-
Help: "The count of notifications which failed but have retry attempts remaining.",
38-
}, []string{LabelMethod, LabelTemplateID}),
39-
PermFailureCount: promauto.With(reg).NewCounterVec(prometheus.CounterOpts{
40-
Name: "permanent_failures_count", Namespace: ns, Subsystem: subsystem,
41-
Help: "The count of notifications which failed and have exceeded their retry attempts.",
42-
}, []string{LabelMethod, LabelTemplateID}),
39+
DispatchAttempts: promauto.With(reg).NewCounterVec(prometheus.CounterOpts{
40+
Name: "dispatch_attempts_total", Namespace: ns, Subsystem: subsystem,
41+
Help: fmt.Sprintf("The number of dispatch attempts, aggregated by the result type (%s)",
42+
strings.Join([]string{ResultSuccess, ResultTempFail, ResultPermFail}, ", ")),
43+
}, []string{LabelMethod, LabelTemplateID, LabelResult}),
4344
RetryCount: promauto.With(reg).NewCounterVec(prometheus.CounterOpts{
4445
Name: "retry_count", Namespace: ns, Subsystem: subsystem,
4546
Help: "The count of notification dispatch retry attempts.",
@@ -48,13 +49,17 @@ func NewMetrics(reg prometheus.Registerer) *Metrics {
4849
// Aggregating on LabelTemplateID as well would cause a cardinality explosion.
4950
QueuedSeconds: promauto.With(reg).NewHistogramVec(prometheus.HistogramOpts{
5051
Name: "queued_seconds", Namespace: ns, Subsystem: subsystem,
51-
Buckets: []float64{1, 5, 15, 30, 60, 120, 300, 600, 3600, 86400},
52-
Help: "The time elapsed between a notification being enqueued in the store and retrieved for processing " +
52+
Buckets: []float64{1, 2.5, 5, 7.5, 10, 15, 20, 30, 60, 120, 300, 600, 3600},
53+
Help: "The time elapsed between a notification being enqueued in the store and retrieved for dispatching " +
5354
"(measures the latency of the notifications system). This should generally be within CODER_NOTIFICATIONS_FETCH_INTERVAL " +
5455
"seconds; higher values for a sustained period indicates delayed processing and CODER_NOTIFICATIONS_LEASE_COUNT " +
5556
"can be increased to accommodate this.",
5657
}, []string{LabelMethod}),
5758

59+
InflightDispatches: promauto.With(reg).NewGaugeVec(prometheus.GaugeOpts{
60+
Name: "inflight_dispatches", Namespace: ns, Subsystem: subsystem,
61+
Help: "The number of dispatch attempts which are currently in progress.",
62+
}, []string{LabelMethod, LabelTemplateID}),
5863
// Aggregating on LabelTemplateID as well would cause a cardinality explosion.
5964
DispatcherSendSeconds: promauto.With(reg).NewHistogramVec(prometheus.HistogramOpts{
6065
Name: "dispatcher_send_seconds", Namespace: ns, Subsystem: subsystem,
@@ -65,7 +70,11 @@ func NewMetrics(reg prometheus.Registerer) *Metrics {
6570
// Currently no requirement to discriminate between success and failure updates which are pending.
6671
PendingUpdates: promauto.With(reg).NewGauge(prometheus.GaugeOpts{
6772
Name: "pending_updates", Namespace: ns, Subsystem: subsystem,
68-
Help: "The number of updates waiting to be flushed to the store.",
73+
Help: "The number of dispatch attempt results waiting to be flushed to the store.",
74+
}),
75+
SyncedUpdates: promauto.With(reg).NewCounter(prometheus.CounterOpts{
76+
Name: "synced_updates_total", Namespace: ns, Subsystem: subsystem,
77+
Help: "The number of dispatch attempt results flushed to the store.",
6978
}),
7079
}
7180
}

0 commit comments

Comments
 (0)