Skip to content

chore: optimize GetPrebuiltWorkspaces query #18717

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 15 commits into from
Jul 9, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions coderd/database/dbauthz/dbauthz.go
Original file line number Diff line number Diff line change
Expand Up @@ -2596,6 +2596,14 @@ func (q *querier) GetRunningPrebuiltWorkspaces(ctx context.Context) ([]database.
return q.db.GetRunningPrebuiltWorkspaces(ctx)
}

func (q *querier) GetRunningPrebuiltWorkspacesOptimized(ctx context.Context) ([]database.GetRunningPrebuiltWorkspacesOptimizedRow, error) {
// This query returns only prebuilt workspaces, but we decided to require permissions for all workspaces.
if err := q.authorizeContext(ctx, policy.ActionRead, rbac.ResourceWorkspace.All()); err != nil {
return nil, err
}
return q.db.GetRunningPrebuiltWorkspacesOptimized(ctx)
}

func (q *querier) GetRuntimeConfig(ctx context.Context, key string) (string, error) {
if err := q.authorizeContext(ctx, policy.ActionRead, rbac.ResourceSystem); err != nil {
return "", err
Expand Down
3 changes: 2 additions & 1 deletion coderd/database/dbauthz/dbauthz_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,8 @@ func TestDBAuthzRecursive(t *testing.T) {
if method.Name == "InTx" ||
method.Name == "Ping" ||
method.Name == "Wrappers" ||
method.Name == "PGLocks" {
method.Name == "PGLocks" ||
method.Name == "GetRunningPrebuiltWorkspacesOptimized" {
continue
}
// easy to know which method failed.
Expand Down
2 changes: 2 additions & 0 deletions coderd/database/dbauthz/setup_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,8 @@ var skipMethods = map[string]string{
"Wrappers": "Not relevant",
"AcquireLock": "Not relevant",
"TryAcquireLock": "Not relevant",
// This method will be removed once we know this works correctly.
"GetRunningPrebuiltWorkspacesOptimized": "Not relevant",
}

// TestMethodTestSuite runs MethodTestSuite.
Expand Down
8 changes: 8 additions & 0 deletions coderd/database/dbgen/dbgen.go
Original file line number Diff line number Diff line change
Expand Up @@ -359,6 +359,14 @@ func Workspace(t testing.TB, db database.Store, orig database.WorkspaceTable) da
NextStartAt: orig.NextStartAt,
})
require.NoError(t, err, "insert workspace")
if orig.Deleted {
err = db.UpdateWorkspaceDeletedByID(genCtx, database.UpdateWorkspaceDeletedByIDParams{
ID: workspace.ID,
Deleted: true,
})
require.NoError(t, err, "set workspace as deleted")
workspace.Deleted = true
}
return workspace
}

Expand Down
7 changes: 7 additions & 0 deletions coderd/database/dbmetrics/querymetrics.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

15 changes: 15 additions & 0 deletions coderd/database/dbmock/dbmock.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions coderd/database/querier.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

99 changes: 99 additions & 0 deletions coderd/database/querier_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -5020,3 +5020,102 @@ func requireUsersMatch(t testing.TB, expected []database.User, found []database.
t.Helper()
require.ElementsMatch(t, expected, database.ConvertUserRows(found), msg)
}

// TestGetRunningPrebuiltWorkspaces ensures the correct behavior of the
// GetRunningPrebuiltWorkspaces query.
func TestGetRunningPrebuiltWorkspaces(t *testing.T) {
t.Parallel()

if !dbtestutil.WillUsePostgres() {
t.Skip("Test requires PostgreSQL for complex queries")
}

ctx := testutil.Context(t, testutil.WaitLong)
db, _ := dbtestutil.NewDB(t)
now := dbtime.Now()

// Given: a prebuilt workspace with a successful start build and a stop build.
org := dbgen.Organization(t, db, database.Organization{})
user := dbgen.User(t, db, database.User{})
template := dbgen.Template(t, db, database.Template{
CreatedBy: user.ID,
OrganizationID: org.ID,
})
templateVersion := dbgen.TemplateVersion(t, db, database.TemplateVersion{
TemplateID: uuid.NullUUID{UUID: template.ID, Valid: true},
OrganizationID: org.ID,
CreatedBy: user.ID,
})
preset := dbgen.Preset(t, db, database.InsertPresetParams{
TemplateVersionID: templateVersion.ID,
DesiredInstances: sql.NullInt32{Int32: 1, Valid: true},
})

setupFixture := func(t *testing.T, db database.Store, name string, deleted bool, transition database.WorkspaceTransition, jobStatus database.ProvisionerJobStatus) database.WorkspaceTable {
t.Helper()
ws := dbgen.Workspace(t, db, database.WorkspaceTable{
OwnerID: database.PrebuildsSystemUserID,
TemplateID: template.ID,
Name: name,
Deleted: deleted,
})
var canceledAt sql.NullTime
var jobError sql.NullString
switch jobStatus {
case database.ProvisionerJobStatusFailed:
jobError = sql.NullString{String: assert.AnError.Error(), Valid: true}
case database.ProvisionerJobStatusCanceled:
canceledAt = sql.NullTime{Time: now, Valid: true}
}
pj := dbgen.ProvisionerJob(t, db, nil, database.ProvisionerJob{
OrganizationID: org.ID,
InitiatorID: database.PrebuildsSystemUserID,
Provisioner: database.ProvisionerTypeEcho,
Type: database.ProvisionerJobTypeWorkspaceBuild,
StartedAt: sql.NullTime{Time: now.Add(-time.Minute), Valid: true},
CanceledAt: canceledAt,
CompletedAt: sql.NullTime{Time: now, Valid: true},
Error: jobError,
})
wb := dbgen.WorkspaceBuild(t, db, database.WorkspaceBuild{
WorkspaceID: ws.ID,
TemplateVersionID: templateVersion.ID,
TemplateVersionPresetID: uuid.NullUUID{UUID: preset.ID, Valid: true},
JobID: pj.ID,
BuildNumber: 1,
Transition: transition,
InitiatorID: database.PrebuildsSystemUserID,
Reason: database.BuildReasonInitiator,
})
// Ensure things are set up as expectd
require.Equal(t, transition, wb.Transition)
require.Equal(t, int32(1), wb.BuildNumber)
require.Equal(t, jobStatus, pj.JobStatus)
require.Equal(t, deleted, ws.Deleted)

return ws
}

// Given: a number of prebuild workspaces with different states exist.
runningPrebuild := setupFixture(t, db, "running-prebuild", false, database.WorkspaceTransitionStart, database.ProvisionerJobStatusSucceeded)
_ = setupFixture(t, db, "stopped-prebuild", false, database.WorkspaceTransitionStop, database.ProvisionerJobStatusSucceeded)
_ = setupFixture(t, db, "failed-prebuild", false, database.WorkspaceTransitionStart, database.ProvisionerJobStatusFailed)
_ = setupFixture(t, db, "canceled-prebuild", false, database.WorkspaceTransitionStart, database.ProvisionerJobStatusCanceled)
_ = setupFixture(t, db, "deleted-prebuild", true, database.WorkspaceTransitionStart, database.ProvisionerJobStatusSucceeded)
Comment on lines +5101 to +5104
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nice 🌟


// Given: a regular workspace also exists.
_ = dbfake.WorkspaceBuild(t, db, database.WorkspaceTable{
OwnerID: user.ID,
TemplateID: template.ID,
Name: "test-running-regular-workspace",
Deleted: false,
})

// When: we query for running prebuild workspaces
runningPrebuilds, err := db.GetRunningPrebuiltWorkspaces(ctx)
require.NoError(t, err)

// Then: only the running prebuild workspace should be returned.
require.Len(t, runningPrebuilds, 1, "expected only one running prebuilt workspace")
require.Equal(t, runningPrebuild.ID, runningPrebuilds[0].ID, "expected the running prebuilt workspace to be returned")
}
101 changes: 101 additions & 0 deletions coderd/database/queries.sql.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

61 changes: 60 additions & 1 deletion coderd/database/queries/prebuilds.sql
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,64 @@ WHERE tvp.desired_instances IS NOT NULL -- Consider only presets that have a pre
-- AND NOT t.deleted -- We don't exclude deleted templates because there's no constraint in the DB preventing a soft deletion on a template while workspaces are running.
AND (t.id = sqlc.narg('template_id')::uuid OR sqlc.narg('template_id') IS NULL);

-- name: GetRunningPrebuiltWorkspacesOptimized :many
WITH latest_prebuilds AS (
-- All workspaces that match the following criteria:
-- 1. Owned by prebuilds user
-- 2. Not deleted
-- 3. Latest build is a 'start' transition
-- 4. Latest build was successful
SELECT
workspaces.id,
workspaces.name,
workspaces.template_id,
workspace_latest_builds.template_version_id,
workspace_latest_builds.job_id,
workspaces.created_at
FROM workspace_latest_builds
JOIN workspaces ON workspaces.id = workspace_latest_builds.workspace_id
WHERE workspace_latest_builds.transition = 'start'::workspace_transition
AND workspace_latest_builds.job_status = 'succeeded'::provisioner_job_status
AND workspaces.owner_id = 'c42fdf75-3097-471c-8c33-fb52454d81c0'::UUID
AND NOT workspaces.deleted
),
workspace_latest_presets AS (
-- For each of the above workspaces, the preset_id of the most recent
-- successful start transition.
SELECT DISTINCT ON (latest_prebuilds.id)
latest_prebuilds.id AS workspace_id,
workspace_builds.template_version_preset_id AS current_preset_id
FROM latest_prebuilds
JOIN workspace_builds ON workspace_builds.workspace_id = latest_prebuilds.id
WHERE workspace_builds.transition = 'start'::workspace_transition
AND workspace_builds.template_version_preset_id IS NOT NULL
ORDER BY latest_prebuilds.id, workspace_builds.build_number DESC
),
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I can't quite grasp the purpose of this CTE, it seems to pretty much do the same as latest_prebuilds? It's selecting distinct on workspace id from latest_prebuilds and joining workspace_builds on workspace id (we already have this data in the previous CTE, no?), and ordering by build number DESC, combined with distinct on that's == selecting directly from workspace_latest_builds, right? If we can't select the template_version_preset_id directly from the workspace_latest_builds then it'd still be more performant to join on build ID vs workspace ID and avoid t he distinct/order/where transition clause.

Might make more sense to just join workspace_builds in latest_prebuilds instead to get template_version_preset_id if required and remove this CTE.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When I omitted this CTE it caused different results for one specific prebuild instance, but I'm now not seeing the same discrepancy when I run the queries side-by-side. I'll try the modifications you suggest and see if it still yields the same results.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The more I look at that CTE, the more I question it too. On the dogfood database, all of the workspace builds owned by the prebuild user have a non-null template_version_preset_id, whereas almost all subsequent ones have a null preset ID.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

AFAIU, whenever a template version is added, new presets are inserted into the database. I don't think we ever delete presets from the database. Except maybe when a template is deleted?
A prebuild should always correlate to a non-null preset. Could this be an issue when the active template version changes? 🤔

Copy link
Member Author

@johnstcn johnstcn Jul 8, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

After spending some more time looking at this, it appears that the current correct approach is to query the most recent non-null preset ID for a successful start transition. Simply selecting from workspace_latest_builds may not result in the correct preset_id. I discussed with Sas and a future change may result in preset_id not being a thing any more.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Prebuilt workspaces should ideally never change their preset. This means that the first build should contain a preset id and all subsequent builds should have a null preset ID. We could optimise this by dropping the workspace_latest_presets CTE from the query and instead rely on the preset ID for build 0 in all cases.

This would probably be much more performant.

The issue with this optimisation is the potentially negligent edge case where a human being restarts a prebuilt workspace with a new preset. In the optimized case such a restarted prebuilt workspace would be claimed with the wrong preset.

If we're willing to tolerate this edge case, we can optimize by dropping the CTE. Otherwise, the CTE remains necessary in order to find the last set preset id for a prebuilt workspace.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'll leave the CTE for now. There is an argument to be made that once a human touches a prebuild that it's no longer valid to be claimed and as such only prebuilds with build_number = 1 should be claimable.

But what do we say to the God of Arguments? Not today.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Prebuilt workspaces should ideally never change their preset. This means that the first build should contain a preset id and all subsequent builds should have a null preset ID.

Why do we nullify the preset_id in subsequent builds? Conceptually, a prebuilt workspace is always associated with a preset, and clearing the field might imply the association no longer exists, which isn't quite accurate. Wouldn’t it make more sense to retain the preset_id to reflect this persistent relationship?

The issue with this optimisation is the potentially negligent edge case where a human being restarts a prebuilt workspace with a new preset.

How would this actually happen? Is it really possible for a prebuilt workspace to be restarted with a different preset? If we consider the association to the preset immutable once the prebuilt workspace is created, this edge case shouldn’t occur, right? Or is there a specific workflow that allows this to happen?

Either way, this is not a blocker for the PR, we can definitely move forward and revisit this discussion later if needed.

Copy link
Contributor

@SasSwart SasSwart Jul 9, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nullify the preset_id in subsequent builds

We don't explicitly nullify them. The fact that they are null on subsequent build is rather because there's nothing to tell a subsequent build what the preset_id should be. We could transfer the preset id from one build to the next, but that would roughly look like running the CTE that we're discussing here on every new build. Presets are set initially and when they change by specification of the user. For normal workspaces this user is a human being who chooses a preset. For prebuilt workspaces this user is the prebuilds system user and they specify the preset in the reconciliation loop.

How would this actually happen? Is it really possible for a prebuilt workspace to be restarted with a different preset?

Any workspace, prebuilt or not can be restarted with new parameters as long as those parameters are mutable. The same goes for presets. It is only possible via API right now. Neither the UI nor the CLI support changing a preset.

This means it is theoretically possible for a user with sufficient privileges to send an API call to restart with a new preset, but it is definitely not supported.

ready_agents AS (
-- For each of the above workspaces, check if all agents are ready.
SELECT
latest_prebuilds.job_id,
BOOL_AND(workspace_agents.lifecycle_state = 'ready'::workspace_agent_lifecycle_state)::boolean AS ready
FROM latest_prebuilds
JOIN workspace_resources ON workspace_resources.job_id = latest_prebuilds.job_id
JOIN workspace_agents ON workspace_agents.resource_id = workspace_resources.id
WHERE workspace_agents.deleted = false
AND workspace_agents.parent_id IS NULL
GROUP BY latest_prebuilds.job_id
)
SELECT
latest_prebuilds.id,
latest_prebuilds.name,
latest_prebuilds.template_id,
latest_prebuilds.template_version_id,
workspace_latest_presets.current_preset_id,
COALESCE(ready_agents.ready, false)::boolean AS ready,
latest_prebuilds.created_at
FROM latest_prebuilds
LEFT JOIN ready_agents ON ready_agents.job_id = latest_prebuilds.job_id
LEFT JOIN workspace_latest_presets ON workspace_latest_presets.workspace_id = latest_prebuilds.id
ORDER BY latest_prebuilds.id;

-- name: GetRunningPrebuiltWorkspaces :many
SELECT
p.id,
Expand All @@ -60,7 +118,8 @@ SELECT
FROM workspace_prebuilds p
INNER JOIN workspace_latest_builds b ON b.workspace_id = p.id
WHERE (b.transition = 'start'::workspace_transition
AND b.job_status = 'succeeded'::provisioner_job_status);
AND b.job_status = 'succeeded'::provisioner_job_status)
ORDER BY p.id;
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

review: adding stable ordering for diffing


-- name: CountInProgressPrebuilds :many
-- CountInProgressPrebuilds returns the number of in-progress prebuilds, grouped by preset ID and transition.
Expand Down
Loading
Loading