Skip to content

Commit 9cc4756

Browse files
authored
fix: Remove race condition with acquiredJobDone channel (#148)
Found another data race while running the tests: https://github.com/coder/coder/runs/5044320845?check_suite_focus=true#step:7:83 __Issue:__ There is a race in the p.acquiredJobDone chan - in particular, there can be a case where we're waiting on the channel to finish (in close) with <-p.acquiredJobDone, but in parallel, an acquireJob could've been started, which would create a new channel for p.acquiredJobDone. There is a similar race in `close(..)`ing the channel, which also came up in test runs. __Fix:__ Instead of recreating the channel everytime, we can use `sync.WaitGroup` to accomplish the same functionality - a semaphore to make close wait for the current job to wrap up.
1 parent 03ed951 commit 9cc4756

File tree

1 file changed

+4
-4
lines changed

1 file changed

+4
-4
lines changed

provisionerd/provisionerd.go

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,7 @@ type provisionerDaemon struct {
9090
acquiredJobCancel context.CancelFunc
9191
acquiredJobCancelled atomic.Bool
9292
acquiredJobRunning atomic.Bool
93-
acquiredJobDone chan struct{}
93+
acquiredJobGroup sync.WaitGroup
9494
}
9595

9696
// Connect establishes a connection to coderd.
@@ -184,7 +184,7 @@ func (p *provisionerDaemon) acquireJob(ctx context.Context) {
184184
ctx, p.acquiredJobCancel = context.WithCancel(ctx)
185185
p.acquiredJobCancelled.Store(false)
186186
p.acquiredJobRunning.Store(true)
187-
p.acquiredJobDone = make(chan struct{})
187+
p.acquiredJobGroup.Add(1)
188188

189189
p.opts.Logger.Info(context.Background(), "acquired job",
190190
slog.F("organization_name", p.acquiredJob.OrganizationName),
@@ -235,7 +235,7 @@ func (p *provisionerDaemon) runJob(ctx context.Context) {
235235
p.acquiredJobMutex.Lock()
236236
defer p.acquiredJobMutex.Unlock()
237237
p.acquiredJobRunning.Store(false)
238-
close(p.acquiredJobDone)
238+
p.acquiredJobGroup.Done()
239239
}()
240240
// It's safe to cast this ProvisionerType. This data is coming directly from coderd.
241241
provisioner, hasProvisioner := p.opts.Provisioners[p.acquiredJob.Provisioner]
@@ -520,7 +520,7 @@ func (p *provisionerDaemon) closeWithError(err error) error {
520520
if !p.acquiredJobCancelled.Load() {
521521
p.cancelActiveJob(errMsg)
522522
}
523-
<-p.acquiredJobDone
523+
p.acquiredJobGroup.Wait()
524524
}
525525

526526
p.opts.Logger.Debug(context.Background(), "closing server with error", slog.Error(err))

0 commit comments

Comments
 (0)