Browse Source

Correct CPU usage calculation in presence of offline CPUs and newer Linux

In https://github.com/torvalds/linux/commit/5ca3726 (released in v4.7-rc1) the
content of the `cpuacct.usage_percpu` cgroup file was changed to include both
online and offline CPUs. This broke the arithmetic in the stats helpers used by
`docker stats`, since they were using the length of the PercpuUsage array as a
proxy for the number of online CPUs.

Add current number of online CPUs to types.StatsJSON and use it in the
calculation.

Keep a fallback to `len(v.CPUStats.CPUUsage.PercpuUsage)` so this code
continues to work when talking to an older daemon. An old client talking to a
new daemon will ignore the new field and behave as before.

Fixes #28941.

Signed-off-by: Ian Campbell <ian.campbell@docker.com>
Ian Campbell 8 years ago
parent
commit
115f91d757

+ 6 - 0
api/swagger.yaml

@@ -3468,6 +3468,10 @@ paths:
         The `precpu_stats` is the CPU statistic of last read, which is used
         The `precpu_stats` is the CPU statistic of last read, which is used
         for calculating the CPU usage percentage. It is not the same as the
         for calculating the CPU usage percentage. It is not the same as the
         `cpu_stats` field.
         `cpu_stats` field.
+
+        If either `precpu_stats.online_cpus` or `cpu_stats.online_cpus` is
+        missing or zero then, for compatibility with older daemons, the length
+        of the corresponding `cpu_usage.percpu_usage` array should be used.
       operationId: "ContainerStats"
       operationId: "ContainerStats"
       produces: ["application/json"]
       produces: ["application/json"]
       responses:
       responses:
@@ -3546,6 +3550,7 @@ paths:
                   total_usage: 100215355
                   total_usage: 100215355
                   usage_in_kernelmode: 30000000
                   usage_in_kernelmode: 30000000
                 system_cpu_usage: 739306590000000
                 system_cpu_usage: 739306590000000
+                online_cpus: 4
                 throttling_data:
                 throttling_data:
                   periods: 0
                   periods: 0
                   throttled_periods: 0
                   throttled_periods: 0
@@ -3561,6 +3566,7 @@ paths:
                   total_usage: 100093996
                   total_usage: 100093996
                   usage_in_kernelmode: 30000000
                   usage_in_kernelmode: 30000000
                 system_cpu_usage: 9492140000000
                 system_cpu_usage: 9492140000000
+                online_cpus: 4
                 throttling_data:
                 throttling_data:
                   periods: 0
                   periods: 0
                   throttled_periods: 0
                   throttled_periods: 0

+ 3 - 0
api/types/stats.go

@@ -47,6 +47,9 @@ type CPUStats struct {
 	// System Usage. Linux only.
 	// System Usage. Linux only.
 	SystemUsage uint64 `json:"system_cpu_usage,omitempty"`
 	SystemUsage uint64 `json:"system_cpu_usage,omitempty"`
 
 
+	// Online CPUs. Linux only.
+	OnlineCPUs uint32 `json:"online_cpus,omitempty"`
+
 	// Throttling Data. Linux only.
 	// Throttling Data. Linux only.
 	ThrottlingData ThrottlingData `json:"throttling_data,omitempty"`
 	ThrottlingData ThrottlingData `json:"throttling_data,omitempty"`
 }
 }

+ 5 - 1
cli/command/container/stats_helpers.go

@@ -178,10 +178,14 @@ func calculateCPUPercentUnix(previousCPU, previousSystem uint64, v *types.StatsJ
 		cpuDelta = float64(v.CPUStats.CPUUsage.TotalUsage) - float64(previousCPU)
 		cpuDelta = float64(v.CPUStats.CPUUsage.TotalUsage) - float64(previousCPU)
 		// calculate the change for the entire system between readings
 		// calculate the change for the entire system between readings
 		systemDelta = float64(v.CPUStats.SystemUsage) - float64(previousSystem)
 		systemDelta = float64(v.CPUStats.SystemUsage) - float64(previousSystem)
+		onlineCPUs  = float64(v.CPUStats.OnlineCPUs)
 	)
 	)
 
 
+	if onlineCPUs == 0.0 {
+		onlineCPUs = float64(len(v.CPUStats.CPUUsage.PercpuUsage))
+	}
 	if systemDelta > 0.0 && cpuDelta > 0.0 {
 	if systemDelta > 0.0 && cpuDelta > 0.0 {
-		cpuPercent = (cpuDelta / systemDelta) * float64(len(v.CPUStats.CPUUsage.PercpuUsage)) * 100.0
+		cpuPercent = (cpuDelta / systemDelta) * onlineCPUs * 100.0
 	}
 	}
 	return cpuPercent
 	return cpuPercent
 }
 }

+ 7 - 0
daemon/stats/collector.go

@@ -80,6 +80,12 @@ func (s *Collector) Run() {
 			continue
 			continue
 		}
 		}
 
 
+		onlineCPUs, err := s.getNumberOnlineCPUs()
+		if err != nil {
+			logrus.Errorf("collecting system online cpu count: %v", err)
+			continue
+		}
+
 		for _, pair := range pairs {
 		for _, pair := range pairs {
 			stats, err := s.supervisor.GetContainerStats(pair.container)
 			stats, err := s.supervisor.GetContainerStats(pair.container)
 			if err != nil {
 			if err != nil {
@@ -97,6 +103,7 @@ func (s *Collector) Run() {
 			}
 			}
 			// FIXME: move to containerd on Linux (not Windows)
 			// FIXME: move to containerd on Linux (not Windows)
 			stats.CPUStats.SystemUsage = systemUsage
 			stats.CPUStats.SystemUsage = systemUsage
+			stats.CPUStats.OnlineCPUs = onlineCPUs
 
 
 			pair.publisher.Publish(*stats)
 			pair.publisher.Publish(*stats)
 		}
 		}

+ 13 - 0
daemon/stats/collector_unix.go

@@ -11,6 +11,11 @@ import (
 	"github.com/opencontainers/runc/libcontainer/system"
 	"github.com/opencontainers/runc/libcontainer/system"
 )
 )
 
 
+/*
+#include <unistd.h>
+*/
+import "C"
+
 // platformNewStatsCollector performs platform specific initialisation of the
 // platformNewStatsCollector performs platform specific initialisation of the
 // Collector structure.
 // Collector structure.
 func platformNewStatsCollector(s *Collector) {
 func platformNewStatsCollector(s *Collector) {
@@ -64,3 +69,11 @@ func (s *Collector) getSystemCPUUsage() (uint64, error) {
 	}
 	}
 	return 0, fmt.Errorf("invalid stat format. Error trying to parse the '/proc/stat' file")
 	return 0, fmt.Errorf("invalid stat format. Error trying to parse the '/proc/stat' file")
 }
 }
+
+func (s *Collector) getNumberOnlineCPUs() (uint32, error) {
+	i, err := C.sysconf(C._SC_NPROCESSORS_ONLN)
+	if err != nil {
+		return 0, err
+	}
+	return uint32(i), nil
+}

+ 4 - 0
daemon/stats/collector_windows.go

@@ -13,3 +13,7 @@ func platformNewStatsCollector(s *Collector) {
 func (s *Collector) getSystemCPUUsage() (uint64, error) {
 func (s *Collector) getSystemCPUUsage() (uint64, error) {
 	return 0, nil
 	return 0, nil
 }
 }
+
+// getNumberOnlineCPUs is the Windows implementation; it does not query the
+// system and always returns 0 with a nil error.
+func (s *Collector) getNumberOnlineCPUs() (uint32, error) {
+	return 0, nil
+}

+ 1 - 0
docs/api/version-history.md

@@ -24,6 +24,7 @@ keywords: "API, Docker, rcli, REST, documentation"
 * `POST /build` now accepts `extrahosts` parameter to specify a host to ip mapping to use during the build.
 * `POST /build` now accepts `extrahosts` parameter to specify a host to ip mapping to use during the build.
 * `POST /services/create` and `POST /services/(id or name)/update` now accept a `rollback` value for `FailureAction`.
 * `POST /services/create` and `POST /services/(id or name)/update` now accept a `rollback` value for `FailureAction`.
 * `POST /services/create` and `POST /services/(id or name)/update` now accept an optional `RollbackConfig` object which specifies rollback options.
 * `POST /services/create` and `POST /services/(id or name)/update` now accept an optional `RollbackConfig` object which specifies rollback options.
+* `GET /containers/(id or name)/stats` now includes an `online_cpus` field in both `precpu_stats` and `cpu_stats`. If this field is missing or zero then, for compatibility with older daemons, the length of the corresponding `cpu_usage.percpu_usage` array should be used.
 
 
 ## v1.26 API changes
 ## v1.26 API changes