Sfoglia il codice sorgente

Correct CPU usage calculation in presence of offline CPUs and newer Linux

In https://github.com/torvalds/linux/commit/5ca3726 (released in v4.7-rc1) the
content of the `cpuacct.usage_percpu` file in sysfs was changed to include both
online and offline cpus. This broke the arithmetic in the stats helpers used by
`docker stats`, since it was using the length of the PerCPUUsage array as a
proxy for the number of online CPUs.

Add current number of online CPUs to types.StatsJSON and use it in the
calculation.

Keep a fallback to `len(v.CPUStats.CPUUsage.PercpuUsage)` so this code
continues to work when talking to an older daemon. An old client talking to a
new daemon will ignore the new field and behave as before.

Fixes #28941.

Signed-off-by: Ian Campbell <ian.campbell@docker.com>
Ian Campbell 8 anni fa
parent
commit
115f91d757

+ 6 - 0
api/swagger.yaml

@@ -3468,6 +3468,10 @@ paths:
         The `precpu_stats` is the CPU statistic of last read, which is used
         for calculating the CPU usage percentage. It is not the same as the
         `cpu_stats` field.
+
+        If either `precpu_stats.online_cpus` or `cpu_stats.online_cpus` is
+        nil then for compatibility with older daemons the length of the
+        corresponding `cpu_usage.percpu_usage` array should be used.
       operationId: "ContainerStats"
       produces: ["application/json"]
       responses:
@@ -3546,6 +3550,7 @@ paths:
                   total_usage: 100215355
                   usage_in_kernelmode: 30000000
                 system_cpu_usage: 739306590000000
+                online_cpus: 4
                 throttling_data:
                   periods: 0
                   throttled_periods: 0
@@ -3561,6 +3566,7 @@ paths:
                   total_usage: 100093996
                   usage_in_kernelmode: 30000000
                 system_cpu_usage: 9492140000000
+                online_cpus: 4
                 throttling_data:
                   periods: 0
                   throttled_periods: 0

+ 3 - 0
api/types/stats.go

@@ -47,6 +47,9 @@ type CPUStats struct {
 	// System Usage. Linux only.
 	SystemUsage uint64 `json:"system_cpu_usage,omitempty"`
 
+	// Online CPUs. Linux only.
+	OnlineCPUs uint32 `json:"online_cpus,omitempty"`
+
 	// Throttling Data. Linux only.
 	ThrottlingData ThrottlingData `json:"throttling_data,omitempty"`
 }

+ 5 - 1
cli/command/container/stats_helpers.go

@@ -178,10 +178,14 @@ func calculateCPUPercentUnix(previousCPU, previousSystem uint64, v *types.StatsJ
 		cpuDelta = float64(v.CPUStats.CPUUsage.TotalUsage) - float64(previousCPU)
 		// calculate the change for the entire system between readings
 		systemDelta = float64(v.CPUStats.SystemUsage) - float64(previousSystem)
+		onlineCPUs  = float64(v.CPUStats.OnlineCPUs)
 	)
 
+	if onlineCPUs == 0.0 {
+		onlineCPUs = float64(len(v.CPUStats.CPUUsage.PercpuUsage))
+	}
 	if systemDelta > 0.0 && cpuDelta > 0.0 {
-		cpuPercent = (cpuDelta / systemDelta) * float64(len(v.CPUStats.CPUUsage.PercpuUsage)) * 100.0
+		cpuPercent = (cpuDelta / systemDelta) * onlineCPUs * 100.0
 	}
 	return cpuPercent
 }

+ 7 - 0
daemon/stats/collector.go

@@ -80,6 +80,12 @@ func (s *Collector) Run() {
 			continue
 		}
 
+		onlineCPUs, err := s.getNumberOnlineCPUs()
+		if err != nil {
+			logrus.Errorf("collecting system online cpu count: %v", err)
+			continue
+		}
+
 		for _, pair := range pairs {
 			stats, err := s.supervisor.GetContainerStats(pair.container)
 			if err != nil {
@@ -97,6 +103,7 @@ func (s *Collector) Run() {
 			}
 			// FIXME: move to containerd on Linux (not Windows)
 			stats.CPUStats.SystemUsage = systemUsage
+			stats.CPUStats.OnlineCPUs = onlineCPUs
 
 			pair.publisher.Publish(*stats)
 		}

+ 13 - 0
daemon/stats/collector_unix.go

@@ -11,6 +11,11 @@ import (
 	"github.com/opencontainers/runc/libcontainer/system"
 )
 
+/*
+#include <unistd.h>
+*/
+import "C"
+
 // platformNewStatsCollector performs platform specific initialisation of the
 // Collector structure.
 func platformNewStatsCollector(s *Collector) {
@@ -64,3 +69,11 @@ func (s *Collector) getSystemCPUUsage() (uint64, error) {
 	}
 	return 0, fmt.Errorf("invalid stat format. Error trying to parse the '/proc/stat' file")
 }
+
+func (s *Collector) getNumberOnlineCPUs() (uint32, error) {
+	i, err := C.sysconf(C._SC_NPROCESSORS_ONLN)
+	if err != nil {
+		return 0, err
+	}
+	return uint32(i), nil
+}

+ 4 - 0
daemon/stats/collector_windows.go

@@ -13,3 +13,7 @@ func platformNewStatsCollector(s *Collector) {
 func (s *Collector) getSystemCPUUsage() (uint64, error) {
 	return 0, nil
 }
+
+func (s *Collector) getNumberOnlineCPUs() (uint32, error) {
+	return 0, nil
+}

+ 1 - 0
docs/api/version-history.md

@@ -24,6 +24,7 @@ keywords: "API, Docker, rcli, REST, documentation"
 * `POST /build` now accepts `extrahosts` parameter to specify a host to ip mapping to use during the build.
 * `POST /services/create` and `POST /services/(id or name)/update` now accept a `rollback` value for `FailureAction`.
 * `POST /services/create` and `POST /services/(id or name)/update` now accept an optional `RollbackConfig` object which specifies rollback options.
+* `GET /containers/(id or name)/stats` now includes an `online_cpus` field in both `precpu_stats` and `cpu_stats`. If this field is `nil` then for compatibility with older daemons the length of the corresponding `cpu_usage.percpu_usage` array should be used.
 
 ## v1.26 API changes