Skip to content

Commit 4f92928

Browse files
johnstcn and matifali authored
feat(coderd/healthcheck): add access URL error codes and healthcheck doc (#10915)
Relates to #8965

- Added error codes for separate code paths in health checks
- Prefixed errors and warnings with error code prefixes
- Added a docs page with details on each code, cause, and solution

Co-authored-by: Muhammad Atif Ali <atif@coder.com>
1 parent 5b2f436 commit 4f92928

File tree

16 files changed

+479
-72
lines changed

16 files changed

+479
-72
lines changed

coderd/healthcheck/accessurl.go

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,6 @@ import (
77
"net/url"
88
"time"
99

10-
"golang.org/x/xerrors"
11-
1210
"github.com/coder/coder/v2/coderd/healthcheck/health"
1311
"github.com/coder/coder/v2/coderd/util/ptr"
1412
)
@@ -44,7 +42,7 @@ func (r *AccessURLReport) Run(ctx context.Context, opts *AccessURLReportOptions)
4442
r.Dismissed = opts.Dismissed
4543

4644
if opts.AccessURL == nil {
47-
r.Error = ptr.Ref("access URL is nil")
45+
r.Error = ptr.Ref(health.Messagef(health.CodeAccessURLNotSet, "Access URL not set"))
4846
r.Severity = health.SeverityError
4947
return
5048
}
@@ -56,29 +54,29 @@ func (r *AccessURLReport) Run(ctx context.Context, opts *AccessURLReportOptions)
5654

5755
accessURL, err := opts.AccessURL.Parse("/healthz")
5856
if err != nil {
59-
r.Error = convertError(xerrors.Errorf("parse healthz endpoint: %w", err))
57+
r.Error = ptr.Ref(health.Messagef(health.CodeAccessURLInvalid, "parse healthz endpoint: %s", err))
6058
r.Severity = health.SeverityError
6159
return
6260
}
6361

6462
req, err := http.NewRequestWithContext(ctx, "GET", accessURL.String(), nil)
6563
if err != nil {
66-
r.Error = convertError(xerrors.Errorf("create healthz request: %w", err))
64+
r.Error = ptr.Ref(health.Messagef(health.CodeAccessURLFetch, "create healthz request: %s", err))
6765
r.Severity = health.SeverityError
6866
return
6967
}
7068

7169
res, err := opts.Client.Do(req)
7270
if err != nil {
73-
r.Error = convertError(xerrors.Errorf("get healthz endpoint: %w", err))
71+
r.Error = ptr.Ref(health.Messagef(health.CodeAccessURLFetch, "get healthz endpoint: %s", err))
7472
r.Severity = health.SeverityError
7573
return
7674
}
7775
defer res.Body.Close()
7876

7977
body, err := io.ReadAll(res.Body)
8078
if err != nil {
81-
r.Error = convertError(xerrors.Errorf("read healthz response: %w", err))
79+
r.Error = ptr.Ref(health.Messagef(health.CodeAccessURLFetch, "read healthz response: %s", err))
8280
r.Severity = health.SeverityError
8381
return
8482
}
@@ -88,6 +86,7 @@ func (r *AccessURLReport) Run(ctx context.Context, opts *AccessURLReportOptions)
8886
r.StatusCode = res.StatusCode
8987
if res.StatusCode != http.StatusOK {
9088
r.Severity = health.SeverityWarning
89+
r.Warnings = append(r.Warnings, health.Messagef(health.CodeAccessURLNotOK, "/healthz did not return 200 OK"))
9190
}
9291
r.HealthzResponse = string(body)
9392
}

coderd/healthcheck/accessurl_test.go

Lines changed: 57 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@ import (
1111
"github.com/stretchr/testify/require"
1212
"golang.org/x/xerrors"
1313

14-
"github.com/coder/coder/v2/coderd/coderdtest"
1514
"github.com/coder/coder/v2/coderd/healthcheck"
1615
"github.com/coder/coder/v2/coderd/healthcheck/health"
1716
)
@@ -25,12 +24,17 @@ func TestAccessURL(t *testing.T) {
2524
var (
2625
ctx, cancel = context.WithCancel(context.Background())
2726
report healthcheck.AccessURLReport
28-
client = coderdtest.New(t, nil)
27+
resp = []byte("OK")
28+
srv = httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
29+
w.WriteHeader(http.StatusOK)
30+
_, _ = w.Write(resp)
31+
}))
2932
)
3033
defer cancel()
3134

3235
report.Run(ctx, &healthcheck.AccessURLReportOptions{
33-
AccessURL: client.URL,
36+
Client: srv.Client(),
37+
AccessURL: mustURL(t, srv.URL),
3438
})
3539

3640
assert.True(t, report.Healthy)
@@ -41,35 +45,27 @@ func TestAccessURL(t *testing.T) {
4145
assert.Nil(t, report.Error)
4246
})
4347

44-
t.Run("404", func(t *testing.T) {
48+
t.Run("NotSet", func(t *testing.T) {
4549
t.Parallel()
4650

4751
var (
4852
ctx, cancel = context.WithCancel(context.Background())
4953
report healthcheck.AccessURLReport
50-
resp = []byte("NOT OK")
51-
srv = httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
52-
w.WriteHeader(http.StatusNotFound)
53-
w.Write(resp)
54-
}))
5554
)
5655
defer cancel()
57-
defer srv.Close()
58-
59-
u, err := url.Parse(srv.URL)
60-
require.NoError(t, err)
6156

6257
report.Run(ctx, &healthcheck.AccessURLReportOptions{
63-
Client: srv.Client(),
64-
AccessURL: u,
58+
Client: nil, // defaults to http.DefaultClient
59+
AccessURL: nil,
6560
})
6661

6762
assert.False(t, report.Healthy)
68-
assert.True(t, report.Reachable)
69-
assert.Equal(t, health.SeverityWarning, report.Severity)
70-
assert.Equal(t, http.StatusNotFound, report.StatusCode)
71-
assert.Equal(t, string(resp), report.HealthzResponse)
72-
assert.Nil(t, report.Error)
63+
assert.False(t, report.Reachable)
64+
assert.Equal(t, health.SeverityError, report.Severity)
65+
assert.Equal(t, 0, report.StatusCode)
66+
assert.Equal(t, "", report.HealthzResponse)
67+
require.NotNil(t, report.Error)
68+
assert.Contains(t, *report.Error, health.CodeAccessURLNotSet)
7369
})
7470

7571
t.Run("ClientErr", func(t *testing.T) {
@@ -81,7 +77,7 @@ func TestAccessURL(t *testing.T) {
8177
resp = []byte("OK")
8278
srv = httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
8379
w.WriteHeader(http.StatusOK)
84-
w.Write(resp)
80+
_, _ = w.Write(resp)
8581
}))
8682
client = srv.Client()
8783
)
@@ -93,12 +89,9 @@ func TestAccessURL(t *testing.T) {
9389
return nil, expErr
9490
})
9591

96-
u, err := url.Parse(srv.URL)
97-
require.NoError(t, err)
98-
9992
report.Run(ctx, &healthcheck.AccessURLReportOptions{
10093
Client: client,
101-
AccessURL: u,
94+
AccessURL: mustURL(t, srv.URL),
10295
})
10396

10497
assert.False(t, report.Healthy)
@@ -108,6 +101,38 @@ func TestAccessURL(t *testing.T) {
108101
assert.Equal(t, "", report.HealthzResponse)
109102
require.NotNil(t, report.Error)
110103
assert.Contains(t, *report.Error, expErr.Error())
104+
assert.Contains(t, *report.Error, health.CodeAccessURLFetch)
105+
})
106+
107+
t.Run("404", func(t *testing.T) {
108+
t.Parallel()
109+
110+
var (
111+
ctx, cancel = context.WithCancel(context.Background())
112+
report healthcheck.AccessURLReport
113+
resp = []byte("NOT OK")
114+
srv = httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
115+
w.WriteHeader(http.StatusNotFound)
116+
_, _ = w.Write(resp)
117+
}))
118+
)
119+
defer cancel()
120+
defer srv.Close()
121+
122+
report.Run(ctx, &healthcheck.AccessURLReportOptions{
123+
Client: srv.Client(),
124+
AccessURL: mustURL(t, srv.URL),
125+
})
126+
127+
assert.False(t, report.Healthy)
128+
assert.True(t, report.Reachable)
129+
assert.Equal(t, health.SeverityWarning, report.Severity)
130+
assert.Equal(t, http.StatusNotFound, report.StatusCode)
131+
assert.Equal(t, string(resp), report.HealthzResponse)
132+
assert.Nil(t, report.Error)
133+
if assert.NotEmpty(t, report.Warnings) {
134+
assert.Contains(t, report.Warnings[0], health.CodeAccessURLNotOK)
135+
}
111136
})
112137

113138
t.Run("DismissedError", func(t *testing.T) {
@@ -133,3 +158,10 @@ type roundTripFunc func(r *http.Request) (*http.Response, error)
133158
// RoundTrip calls the underlying function with the given request and returns
// its response and error unchanged, so a plain function can be used wherever a
// value with a RoundTrip method is expected.
func (fn roundTripFunc) RoundTrip(req *http.Request) (*http.Response, error) {
	resp, err := fn(req)
	return resp, err
}
161+
162+
func mustURL(t testing.TB, s string) *url.URL {
163+
t.Helper()
164+
u, err := url.Parse(s)
165+
require.NoError(t, err)
166+
return u
167+
}

coderd/healthcheck/database.go

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,11 @@ import (
44
"context"
55
"time"
66

7-
"golang.org/x/exp/slices"
8-
"golang.org/x/xerrors"
9-
107
"github.com/coder/coder/v2/coderd/database"
118
"github.com/coder/coder/v2/coderd/healthcheck/health"
9+
"github.com/coder/coder/v2/coderd/util/ptr"
10+
11+
"golang.org/x/exp/slices"
1212
)
1313

1414
const (
@@ -55,8 +55,9 @@ func (r *DatabaseReport) Run(ctx context.Context, opts *DatabaseReportOptions) {
5555
for i := 0; i < pingCount; i++ {
5656
pong, err := opts.DB.Ping(ctx)
5757
if err != nil {
58-
r.Error = convertError(xerrors.Errorf("ping: %w", err))
58+
r.Error = ptr.Ref(health.Messagef(health.CodeDatabasePingFailed, "ping database: %s", err))
5959
r.Severity = health.SeverityError
60+
6061
return
6162
}
6263
pings = append(pings, pong)
@@ -69,6 +70,7 @@ func (r *DatabaseReport) Run(ctx context.Context, opts *DatabaseReportOptions) {
6970
r.LatencyMS = latency.Milliseconds()
7071
if r.LatencyMS >= r.ThresholdMS {
7172
r.Severity = health.SeverityWarning
73+
r.Warnings = append(r.Warnings, health.Messagef(health.CodeDatabasePingSlow, "median database ping above threshold"))
7274
}
7375
r.Healthy = true
7476
r.Reachable = true

coderd/healthcheck/database_test.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,7 @@ func TestDatabase(t *testing.T) {
6565
require.NotNil(t, report.Error)
6666
assert.Equal(t, healthcheck.DatabaseDefaultThreshold.Milliseconds(), report.ThresholdMS)
6767
assert.Contains(t, *report.Error, err.Error())
68+
assert.Contains(t, *report.Error, health.CodeDatabasePingFailed)
6869
})
6970

7071
t.Run("DismissedError", func(t *testing.T) {
@@ -85,6 +86,7 @@ func TestDatabase(t *testing.T) {
8586
assert.Equal(t, health.SeverityError, report.Severity)
8687
assert.True(t, report.Dismissed)
8788
require.NotNil(t, report.Error)
89+
assert.Contains(t, *report.Error, health.CodeDatabasePingFailed)
8890
})
8991

9092
t.Run("Median", func(t *testing.T) {
@@ -112,6 +114,7 @@ func TestDatabase(t *testing.T) {
112114
assert.EqualValues(t, 1, report.LatencyMS)
113115
assert.Equal(t, healthcheck.DatabaseDefaultThreshold.Milliseconds(), report.ThresholdMS)
114116
assert.Nil(t, report.Error)
117+
assert.Empty(t, report.Warnings)
115118
})
116119

117120
t.Run("Threshold", func(t *testing.T) {
@@ -139,5 +142,8 @@ func TestDatabase(t *testing.T) {
139142
assert.EqualValues(t, 1000, report.LatencyMS)
140143
assert.Equal(t, time.Second.Milliseconds(), report.ThresholdMS)
141144
assert.Nil(t, report.Error)
145+
if assert.NotEmpty(t, report.Warnings) {
146+
assert.Contains(t, report.Warnings[0], health.CodeDatabasePingSlow)
147+
}
142148
})
143149
}

coderd/healthcheck/derphealth/derp.go

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -136,9 +136,7 @@ func (r *Report) Run(ctx context.Context, opts *ReportOptions) {
136136
r.Healthy = false
137137
}
138138

139-
for _, w := range regionReport.Warnings {
140-
r.Warnings = append(r.Warnings, fmt.Sprintf("[%s] %s", regionReport.Region.RegionName, w))
141-
}
139+
r.Warnings = append(r.Warnings, regionReport.Warnings...)
142140
mu.Unlock()
143141
}()
144142
}
@@ -202,9 +200,7 @@ func (r *RegionReport) Run(ctx context.Context) {
202200
unhealthyNodes++
203201
}
204202

205-
for _, w := range nodeReport.Warnings {
206-
r.Warnings = append(r.Warnings, fmt.Sprintf("[%s] %s", nodeReport.Node.Name, w))
207-
}
203+
r.Warnings = append(r.Warnings, nodeReport.Warnings...)
208204
r.mu.Unlock()
209205
}()
210206
}
@@ -228,7 +224,7 @@ func (r *RegionReport) Run(ctx context.Context) {
228224
} else if unhealthyNodes == 1 {
229225
// r.Healthy = true (by default)
230226
r.Severity = health.SeverityWarning
231-
r.Warnings = append(r.Warnings, oneNodeUnhealthy)
227+
r.Warnings = append(r.Warnings, health.Messagef(health.CodeDERPOneNodeUnhealthy, oneNodeUnhealthy))
232228
} else if unhealthyNodes > 1 {
233229
r.Healthy = false
234230

@@ -292,7 +288,7 @@ func (r *NodeReport) Run(ctx context.Context) {
292288
}
293289

294290
if r.UsesWebsocket {
295-
r.Warnings = append(r.Warnings, warningNodeUsesWebsocket)
291+
r.Warnings = append(r.Warnings, health.Messagef(health.CodeDERPNodeUsesWebsocket, warningNodeUsesWebsocket))
296292
r.Severity = health.SeverityWarning
297293
}
298294
}

coderd/healthcheck/derphealth/derp_test.go

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -129,6 +129,9 @@ func TestDERP(t *testing.T) {
129129
assert.True(t, report.Healthy)
130130
assert.Equal(t, health.SeverityWarning, report.Severity)
131131
assert.True(t, report.Dismissed)
132+
if assert.NotEmpty(t, report.Warnings) {
133+
assert.Contains(t, report.Warnings[0], health.CodeDERPOneNodeUnhealthy)
134+
}
132135
for _, region := range report.Regions {
133136
assert.True(t, region.Healthy)
134137
assert.True(t, region.NodeReports[0].Healthy)
@@ -232,7 +235,9 @@ func TestDERP(t *testing.T) {
232235

233236
assert.True(t, report.Healthy)
234237
assert.Equal(t, health.SeverityWarning, report.Severity)
235-
assert.NotEmpty(t, report.Warnings)
238+
if assert.NotEmpty(t, report.Warnings) {
239+
assert.Contains(t, report.Warnings[0], health.CodeDERPNodeUsesWebsocket)
240+
}
236241
for _, region := range report.Regions {
237242
assert.True(t, region.Healthy)
238243
assert.Equal(t, health.SeverityWarning, region.Severity)

coderd/healthcheck/health/model.go

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,37 @@
11
package health
22

3+
import (
4+
"fmt"
5+
"strings"
6+
)
7+
38
const (
49
SeverityOK Severity = "ok"
510
SeverityWarning Severity = "warning"
611
SeverityError Severity = "error"
12+
13+
// CodeUnknown is a catch-all health code when something unexpected goes wrong (for example, a panic).
14+
CodeUnknown Code = "EUNKNOWN"
15+
16+
CodeProxyUpdate Code = "EWP01"
17+
CodeProxyFetch Code = "EWP02"
18+
CodeProxyVersionMismatch Code = "EWP03"
19+
CodeProxyUnhealthy Code = "EWP04"
20+
21+
CodeDatabasePingFailed Code = "EDB01"
22+
CodeDatabasePingSlow Code = "EDB02"
23+
24+
CodeWebsocketDial Code = "EWS01"
25+
CodeWebsocketEcho Code = "EWS02"
26+
CodeWebsocketMsg Code = "EWS03"
27+
28+
CodeAccessURLNotSet Code = "EACS01"
29+
CodeAccessURLInvalid Code = "EACS02"
30+
CodeAccessURLFetch Code = "EACS03"
31+
CodeAccessURLNotOK Code = "EACS04"
32+
33+
CodeDERPNodeUsesWebsocket Code = `EDERP01`
34+
CodeDERPOneNodeUnhealthy Code = `EDERP02`
735
)
836

937
// @typescript-generate Severity
@@ -18,3 +46,17 @@ var severityRank = map[Severity]int{
1846
func (s Severity) Value() int {
1947
return severityRank[s]
2048
}
49+
50+
// Code is a stable identifier used to link to documentation.
51+
// @typescript-generate Code
52+
type Code string
53+
54+
// Messagef is a convenience function for formatting a healthcheck error message.
55+
func Messagef(code Code, msg string, args ...any) string {
56+
var sb strings.Builder
57+
_, _ = sb.WriteString(string(code))
58+
_, _ = sb.WriteRune(':')
59+
_, _ = sb.WriteRune(' ')
60+
_, _ = sb.WriteString(fmt.Sprintf(msg, args...))
61+
return sb.String()
62+
}

0 commit comments

Comments
 (0)