
chore: add tx metrics and logs for serialization errors #15215

Merged · 12 commits · Oct 25, 2024
4 changes: 3 additions & 1 deletion cli/server.go
@@ -718,7 +718,9 @@ func (r *RootCmd) Server(newAPI func(context.Context, *coderd.Options) (*coderd.
}

if options.DeploymentValues.Prometheus.Enable && options.DeploymentValues.Prometheus.CollectDBMetrics {
options.Database = dbmetrics.New(options.Database, options.PrometheusRegistry)
options.Database = dbmetrics.NewQueryMetrics(options.Database, options.Logger, options.PrometheusRegistry)
} else {
options.Database = dbmetrics.NewDBMetrics(options.Database, options.Logger, options.PrometheusRegistry)
}

var deploymentID string
4 changes: 3 additions & 1 deletion cli/testdata/coder_server_--help.golden
@@ -145,7 +145,9 @@ INTROSPECTION / PROMETHEUS OPTIONS:
Collect agent stats (may increase charges for metrics storage).

--prometheus-collect-db-metrics bool, $CODER_PROMETHEUS_COLLECT_DB_METRICS (default: false)
Collect database metrics (may increase charges for metrics storage).
Collect database query metrics (may increase charges for metrics
storage). If set to false, a reduced set of database metrics is still
collected.

--prometheus-enable bool, $CODER_PROMETHEUS_ENABLE
Serve prometheus metrics on the address defined by prometheus address.
3 changes: 2 additions & 1 deletion cli/testdata/server-config.yaml.golden
@@ -197,7 +197,8 @@ introspection:
- template_name
- username
- workspace_name
# Collect database metrics (may increase charges for metrics storage).
# Collect database query metrics (may increase charges for metrics storage). If
# set to false, a reduced set of database metrics is still collected.
# (default: false, type: bool)
collect_db_metrics: false
pprof:
5 changes: 4 additions & 1 deletion coderd/autobuild/lifecycle_executor.go
@@ -285,7 +285,10 @@ func (e *Executor) runOnce(t time.Time) Stats {

// Run with RepeatableRead isolation so that the build process sees the same data
// as our calculation that determines whether an autobuild is necessary.
}, &sql.TxOptions{Isolation: sql.LevelRepeatableRead})
}, &database.TxOptions{
Isolation: sql.LevelRepeatableRead,
TxIdentifier: "lifecycle",
})
if auditLog != nil {
// If the transition didn't succeed then updating the workspace
// to indicate dormant didn't either.
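For reference, this is the general call pattern the new options type enables at call sites like the one above. The sketch below is illustrative only; the callback body and error handling are placeholders, not code from this PR:

```go
// Illustrative sketch: tagging a transaction for the new tx metrics.
err := db.InTx(func(tx database.Store) error {
	// Transactional work that must observe a consistent snapshot.
	return nil
}, &database.TxOptions{
	Isolation:    sql.LevelRepeatableRead,
	TxIdentifier: "lifecycle", // identifies this transaction in the tx metrics and logs
})
if err != nil {
	// Handle the transaction error as before; the options only add metadata.
}
```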
3 changes: 3 additions & 0 deletions coderd/coderdtest/promhelp/doc.go
@@ -0,0 +1,3 @@
// Package promhelp provides helper functions for asserting Prometheus
// metric values in unit tests.
package promhelp
87 changes: 87 additions & 0 deletions coderd/coderdtest/promhelp/metrics.go
@@ -0,0 +1,87 @@
package promhelp

import (
"context"
"io"
"maps"
"net/http"
"net/http/httptest"
"strings"
"testing"

"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promhttp"
ptestutil "github.com/prometheus/client_golang/prometheus/testutil"
io_prometheus_client "github.com/prometheus/client_model/go"
"github.com/stretchr/testify/require"
)

// RegistryDump returns the http page for a given registry's metrics.
// Very useful for visual debugging.
func RegistryDump(reg *prometheus.Registry) string {
h := promhttp.HandlerFor(reg, promhttp.HandlerOpts{})
rec := httptest.NewRecorder()
req, _ := http.NewRequestWithContext(context.Background(), http.MethodGet, "/", nil)
h.ServeHTTP(rec, req)
resp := rec.Result()
data, _ := io.ReadAll(resp.Body)
_ = resp.Body.Close()
return string(data)
}

// Compare can be used to compare a registry to some prometheus formatted
// text. If any values differ, an error is returned.
// If metric names are passed in, only those metrics will be compared.
// Usage: `Compare(reg, RegistryDump(reg))`
func Compare(reg prometheus.Gatherer, compare string, metricNames ...string) error {
return ptestutil.GatherAndCompare(reg, strings.NewReader(compare), metricNames...)
}

// HistogramValue returns the value of a histogram metric with the given name and labels.
func HistogramValue(t testing.TB, reg prometheus.Gatherer, metricName string, labels prometheus.Labels) *io_prometheus_client.Histogram {
t.Helper()

labeled := MetricValue(t, reg, metricName, labels)
require.NotNilf(t, labeled, "metric %q with labels %v not found", metricName, labels)
return labeled.GetHistogram()
}

// GaugeValue returns the value of a gauge metric with the given name and labels.
func GaugeValue(t testing.TB, reg prometheus.Gatherer, metricName string, labels prometheus.Labels) int {
t.Helper()

labeled := MetricValue(t, reg, metricName, labels)
require.NotNilf(t, labeled, "metric %q with labels %v not found", metricName, labels)
return int(labeled.GetGauge().GetValue())
}

// CounterValue returns the value of a counter metric with the given name and labels.
func CounterValue(t testing.TB, reg prometheus.Gatherer, metricName string, labels prometheus.Labels) int {
t.Helper()

labeled := MetricValue(t, reg, metricName, labels)
require.NotNilf(t, labeled, "metric %q with labels %v not found", metricName, labels)
return int(labeled.GetCounter().GetValue())
}

func MetricValue(t testing.TB, reg prometheus.Gatherer, metricName string, labels prometheus.Labels) *io_prometheus_client.Metric {
t.Helper()

metrics, err := reg.Gather()
require.NoError(t, err)

for _, m := range metrics {
if m.GetName() == metricName {
for _, labeled := range m.GetMetric() {
mLabels := make(prometheus.Labels)
for _, v := range labeled.GetLabel() {
mLabels[v.GetName()] = v.GetValue()
}
if maps.Equal(mLabels, labels) {
return labeled
}
}
}
}
return nil
}
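A brief usage sketch for these helpers, written as a hypothetical test: the metric name and label are made up for illustration, and the import path is assumed from the repository layout.

```go
package promhelp_test

import (
	"testing"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/stretchr/testify/require"

	"github.com/coder/coder/v2/coderd/coderdtest/promhelp"
)

func TestCounterValueExample(t *testing.T) {
	reg := prometheus.NewRegistry()
	c := prometheus.NewCounterVec(prometheus.CounterOpts{
		Name: "example_tx_total", // hypothetical metric
	}, []string{"tx_id"})
	reg.MustRegister(c)

	c.WithLabelValues("lifecycle").Inc()

	// Assert on the labeled series directly.
	require.Equal(t, 1, promhelp.CounterValue(t, reg, "example_tx_total",
		prometheus.Labels{"tx_id": "lifecycle"}))

	// RegistryDump is handy when an assertion fails and you want to see
	// everything the registry currently exposes.
	t.Log(promhelp.RegistryDump(reg))
}
```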
5 changes: 3 additions & 2 deletions coderd/cryptokeys/rotate.go
@@ -161,8 +161,9 @@ func (k *rotator) rotateKeys(ctx context.Context) error {
}
}
return nil
}, &sql.TxOptions{
Isolation: sql.LevelRepeatableRead,
}, &database.TxOptions{
Isolation: sql.LevelRepeatableRead,
TxIdentifier: "rotate_keys",
})
}

69 changes: 61 additions & 8 deletions coderd/database/db.go
@@ -28,7 +28,7 @@ type Store interface {
wrapper

Ping(ctx context.Context) (time.Duration, error)
InTx(func(Store) error, *sql.TxOptions) error
InTx(func(Store) error, *TxOptions) error
}

type wrapper interface {
@@ -57,6 +57,43 @@ func New(sdb *sql.DB) Store {
}
}

// TxOptions is used to pass some execution metadata to the callers.
// Ideally we could throw this into a context, but no context is used for
// transactions, so the returned metadata is attached to the options passed
// in instead.
// This metadata should not be returned in the method signature, because it
// is only used for metric tracking. It should never be used by business logic.
type TxOptions struct {
// Isolation is the transaction isolation level.
// If zero, the driver or database's default level is used.
Isolation sql.IsolationLevel
ReadOnly bool

// -- Coder specific metadata --
// TxIdentifier is a unique identifier for the transaction to be used
// in metrics. Can be any string.
TxIdentifier string

// Set by InTx
executionCount int
}

// IncrementExecutionCount is a helper function for external packages
// to increment the unexported count.
// Mainly for `dbmem`.
func IncrementExecutionCount(opts *TxOptions) {
opts.executionCount++
}

func (o TxOptions) ExecutionCount() int {
return o.executionCount
}

func (o *TxOptions) WithID(id string) *TxOptions {
o.TxIdentifier = id
return o
}

// queries encompasses both our sqlc generated
// queries and our custom queries.
type querier interface {
@@ -80,25 +117,39 @@ func (q *sqlQuerier) Ping(ctx context.Context) (time.Duration, error) {
return time.Since(start), err
}

func (q *sqlQuerier) InTx(function func(Store) error, txOpts *sql.TxOptions) error {
func DefaultTXOptions() *TxOptions {
return &TxOptions{
Isolation: sql.LevelDefault,
ReadOnly: false,
}
}

func (q *sqlQuerier) InTx(function func(Store) error, txOpts *TxOptions) error {
_, inTx := q.db.(*sqlx.Tx)
isolation := sql.LevelDefault
if txOpts != nil {
isolation = txOpts.Isolation

if txOpts == nil {
// Create a default txOpts if nil was passed in.
txOpts = DefaultTXOptions()
}

sqlOpts := &sql.TxOptions{
Isolation: txOpts.Isolation,
ReadOnly: txOpts.ReadOnly,
}

// If we are not already in a transaction, and we are running in serializable
// mode, we need to run the transaction in a retry loop. The caller should be
// prepared to allow retries if using serializable mode.
// If we are in a transaction already, the parent InTx call will handle the retry.
// We do not want to duplicate those retries.
if !inTx && isolation == sql.LevelSerializable {
if !inTx && sqlOpts.Isolation == sql.LevelSerializable {
// This is an arbitrarily chosen number.
const retryAmount = 3
var err error
attempts := 0
for attempts = 0; attempts < retryAmount; attempts++ {
err = q.runTx(function, txOpts)
txOpts.executionCount++
err = q.runTx(function, sqlOpts)
if err == nil {
// Transaction succeeded.
return nil
Expand All @@ -111,7 +162,9 @@ func (q *sqlQuerier) InTx(function func(Store) error, txOpts *sql.TxOptions) err
// Transaction kept failing in serializable mode.
return xerrors.Errorf("transaction failed after %d attempts: %w", attempts, err)
}
return q.runTx(function, txOpts)

txOpts.executionCount++
return q.runTx(function, sqlOpts)
}

// InTx performs database operations inside a transaction.
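A sketch of how the retry loop and execution count interact, assuming a `database.Store` named `db` obtained elsewhere; the identifier and callback are placeholders:

```go
// Illustrative sketch: serializable transactions are retried up to 3 times,
// and each attempt bumps the (metrics-only) execution count.
txOpts := (&database.TxOptions{
	Isolation: sql.LevelSerializable,
}).WithID("example_tx") // hypothetical identifier
err := db.InTx(func(tx database.Store) error {
	// On a serialization failure this function runs again in a fresh
	// transaction, up to 3 attempts in total.
	return nil
}, txOpts)
// ExecutionCount() is 1 on first-try success and up to 3 if the transaction
// kept hitting serialization failures.
_ = txOpts.ExecutionCount()
_ = err
```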
2 changes: 1 addition & 1 deletion coderd/database/db_test.go
@@ -27,7 +27,7 @@ func TestSerializedRetry(t *testing.T) {
db := database.New(sqlDB)

called := 0
txOpts := &sql.TxOptions{Isolation: sql.LevelSerializable}
txOpts := &database.TxOptions{Isolation: sql.LevelSerializable}
err := db.InTx(func(tx database.Store) error {
// Test nested error
return tx.InTx(func(tx database.Store) error {
2 changes: 1 addition & 1 deletion coderd/database/dbauthz/dbauthz.go
@@ -558,7 +558,7 @@ func (q *querier) Ping(ctx context.Context) (time.Duration, error) {
}

// InTx runs the given function in a transaction.
func (q *querier) InTx(function func(querier database.Store) error, txOpts *sql.TxOptions) error {
func (q *querier) InTx(function func(querier database.Store) error, txOpts *database.TxOptions) error {
return q.db.InTx(func(tx database.Store) error {
// Wrap the transaction store in a querier.
wrapped := New(tx, q.auth, q.log, q.acs)
5 changes: 4 additions & 1 deletion coderd/database/dbmem/dbmem.go
@@ -365,7 +365,7 @@ func (tx *fakeTx) releaseLocks() {
}

// InTx doesn't rollback data properly for in-memory yet.
func (q *FakeQuerier) InTx(fn func(database.Store) error, _ *sql.TxOptions) error {
func (q *FakeQuerier) InTx(fn func(database.Store) error, opts *database.TxOptions) error {
q.mutex.Lock()
defer q.mutex.Unlock()
tx := &fakeTx{
@@ -374,6 +374,9 @@ func (q *FakeQuerier) InTx(fn func(database.Store) error, _ *sql.TxOptions) erro
}
defer tx.releaseLocks()

if opts != nil {
database.IncrementExecutionCount(opts)
}
return fn(tx)
}

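Because the in-memory store also bumps the count, tests that run against dbmem still observe it. A minimal hypothetical sketch, assuming `dbmem.New()` returns a `database.Store` as it does elsewhere in the codebase, with the usual `testing` and `require` imports:

```go
func TestInMemoryExecutionCount(t *testing.T) {
	// The fake store runs no real transaction, but the execution count is
	// still incremented once per InTx call.
	db := dbmem.New()
	opts := database.DefaultTXOptions().WithID("example") // hypothetical identifier
	err := db.InTx(func(tx database.Store) error { return nil }, opts)
	require.NoError(t, err)
	require.Equal(t, 1, opts.ExecutionCount())
}
```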