libcontainerd/supervisor: fix data race

The monitorDaemon() goroutine calls startContainerd(), then blocks on
<-daemonWaitCh to wait for the managed containerd process to exit. The
startContainerd() function would (re)initialize daemonWaitCh so that a
restarted containerd could be waited on. This implementation was
race-free because startContainerd() synchronously initialized
daemonWaitCh before returning. When the call to start the managed
containerd process was moved into the waiter goroutine, the code that
initializes the daemonWaitCh struct field was moved into the goroutine
along with it. This introduced a race condition: monitorDaemon() could
read the field before the waiter goroutine had assigned it.

Move the daemonWaitCh initialization to guarantee that it happens before
the startContainerd() call returns.

Signed-off-by: Cory Snider <csnider@mirantis.com>
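
For illustration, the following is a minimal, self-contained Go sketch of the ordering the fix relies on. It is not the actual libcontainerd supervisor code: the remote type, the start() helper, and the use of exec.Command("true") are stand-ins. The point it demonstrates is that the waiter goroutine assigns daemonWaitCh before signalling on the started channel, and the caller only returns after receiving that signal, so the assignment happens before any later read of the field.

// race_fix_sketch.go (illustrative only, not moby code)
package main

import (
	"fmt"
	"os/exec"
)

type remote struct {
	daemonWaitCh chan struct{}
}

// start launches cmd and returns only after daemonWaitCh has been
// initialized, mirroring the ordering guarantee described above.
func (r *remote) start(cmd *exec.Cmd) error {
	startedCh := make(chan error)
	go func() {
		if err := cmd.Start(); err != nil {
			startedCh <- err
			return
		}
		// Assign the field before signalling success: the send/receive
		// pair on startedCh orders this write before start() returns.
		r.daemonWaitCh = make(chan struct{})
		startedCh <- nil
		_ = cmd.Wait()
		close(r.daemonWaitCh)
	}()
	return <-startedCh
}

func main() {
	r := &remote{}
	if err := r.start(exec.Command("true")); err != nil {
		fmt.Println("start failed:", err)
		return
	}
	<-r.daemonWaitCh // safe: the field is guaranteed to be non-nil here
	fmt.Println("process exited")
}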
Cory Snider 2024-02-01 15:37:09 -05:00
parent f5cf22ca99
commit dd20bf4862


@@ -190,12 +190,13 @@ func (r *remote) startContainerd() error {
 		runtime.LockOSThread()
 		defer runtime.UnlockOSThread()
 		err := cmd.Start()
-		startedCh <- err
 		if err != nil {
+			startedCh <- err
 			return
 		}
 		r.daemonWaitCh = make(chan struct{})
+		startedCh <- nil
 		// Reap our child when needed
 		if err := cmd.Wait(); err != nil {
 			r.logger.WithError(err).Errorf("containerd did not exit successfully")