Преглед на файлове

Merge pull request #28462 from tonistiigi/health-deadlock

Fix deadlock on cancelling healthcheck
Alexander Morozov преди 8 години
родител
ревизия
c2fbf0871d
променени са 3 файла, в които са добавени 12 реда и са изтрити 8 реда
  1. 1 4
      container/health.go
  2. 10 3
      daemon/health.go
  3. 1 1
      daemon/health_test.go

+ 1 - 4
container/health.go

@@ -42,10 +42,7 @@ func (s *Health) OpenMonitorChannel() chan struct{} {
 func (s *Health) CloseMonitorChannel() {
 	if s.stop != nil {
 		logrus.Debug("CloseMonitorChannel: waiting for probe to stop")
-		// This channel does not buffer. Once the write succeeds, the monitor
-		// has read the stop request and will not make any further updates
-		// to c.State.Health.
-		s.stop <- struct{}{}
+		close(s.stop)
 		s.stop = nil
 		logrus.Debug("CloseMonitorChannel done")
 	}

+ 10 - 3
daemon/health.go

@@ -107,10 +107,17 @@ func (p *cmdProbe) run(ctx context.Context, d *Daemon, container *container.Cont
 }
 
 // Update the container's Status.Health struct based on the latest probe's result.
-func handleProbeResult(d *Daemon, c *container.Container, result *types.HealthcheckResult) {
+func handleProbeResult(d *Daemon, c *container.Container, result *types.HealthcheckResult, done chan struct{}) {
 	c.Lock()
 	defer c.Unlock()
 
+	// probe may have been cancelled while waiting on lock. Ignore result then
+	select {
+	case <-done:
+		return
+	default:
+	}
+
 	retries := c.Config.Healthcheck.Retries
 	if retries <= 0 {
 		retries = defaultProbeRetries
@@ -183,7 +190,7 @@ func monitor(d *Daemon, c *container.Container, stop chan struct{}, probe probe)
 				cancelProbe()
 				return
 			case result := <-results:
-				handleProbeResult(d, c, result)
+				handleProbeResult(d, c, result, stop)
 				// Stop timeout
 				cancelProbe()
 			case <-ctx.Done():
@@ -193,7 +200,7 @@ func monitor(d *Daemon, c *container.Container, stop chan struct{}, probe probe)
 					Output:   fmt.Sprintf("Health check exceeded timeout (%v)", probeTimeout),
 					Start:    startTime,
 					End:      time.Now(),
-				})
+				}, stop)
 				cancelProbe()
 				// Wait for probe to exit (it might take a while to respond to the TERM
 				// signal and we don't want dying probes to pile up).

+ 1 - 1
daemon/health_test.go

@@ -80,7 +80,7 @@ func TestHealthStates(t *testing.T) {
 			Start:    startTime,
 			End:      startTime,
 			ExitCode: exitCode,
-		})
+		}, nil)
 	}
 
 	// starting -> failed -> success -> failed