moby/daemon/stats/collector.go
Ian Campbell 115f91d757 Correct CPU usage calculation in presence of offline CPUs and newer Linux
In https://github.com/torvalds/linux/commit/5ca3726 (released in v4.7-rc1) the
content of the `cpuacct.usage_percpu` file in sysfs was changed to include both
online and offline cpus. This broke the arithmetic in the stats helpers used by
`docker stats`, since it was using the length of the PerCPUUsage array as a
proxy for the number of online CPUs.

Add current number of online CPUs to types.StatsJSON and use it in the
calculation.

Keep a fallback to `len(v.CPUStats.CPUUsage.PercpuUsage)` so this code
continues to work when talking to an older daemon. An old client talking to a
new daemon will ignore the new field and behave as before.

Fixes #28941.

Signed-off-by: Ian Campbell <ian.campbell@docker.com>
2017-03-10 10:24:33 +00:00

116 lines
2.9 KiB
Go

// +build !solaris
package stats
import (
"time"
"github.com/Sirupsen/logrus"
"github.com/docker/docker/api/types"
"github.com/docker/docker/container"
"github.com/docker/docker/pkg/pubsub"
)
// Collect registers the container with the collector and adds it to
// the event loop for collection on the specified interval returning
// a channel for the subscriber to receive on.
func (s *Collector) Collect(c *container.Container) chan interface{} {
s.m.Lock()
defer s.m.Unlock()
publisher, exists := s.publishers[c]
if !exists {
publisher = pubsub.NewPublisher(100*time.Millisecond, 1024)
s.publishers[c] = publisher
}
return publisher.Subscribe()
}
// StopCollection closes the channels for all subscribers and removes
// the container from metrics collection.
func (s *Collector) StopCollection(c *container.Container) {
s.m.Lock()
if publisher, exists := s.publishers[c]; exists {
publisher.Close()
delete(s.publishers, c)
}
s.m.Unlock()
}
// Unsubscribe removes a specific subscriber from receiving updates for a container's stats.
func (s *Collector) Unsubscribe(c *container.Container, ch chan interface{}) {
s.m.Lock()
publisher := s.publishers[c]
if publisher != nil {
publisher.Evict(ch)
if publisher.Len() == 0 {
delete(s.publishers, c)
}
}
s.m.Unlock()
}
// Run starts the collectors and will indefinitely collect stats from the supervisor
func (s *Collector) Run() {
type publishersPair struct {
container *container.Container
publisher *pubsub.Publisher
}
// we cannot determine the capacity here.
// it will grow enough in first iteration
var pairs []publishersPair
for range time.Tick(s.interval) {
// it does not make sense in the first iteration,
// but saves allocations in further iterations
pairs = pairs[:0]
s.m.Lock()
for container, publisher := range s.publishers {
// copy pointers here to release the lock ASAP
pairs = append(pairs, publishersPair{container, publisher})
}
s.m.Unlock()
if len(pairs) == 0 {
continue
}
systemUsage, err := s.getSystemCPUUsage()
if err != nil {
logrus.Errorf("collecting system cpu usage: %v", err)
continue
}
onlineCPUs, err := s.getNumberOnlineCPUs()
if err != nil {
logrus.Errorf("collecting system online cpu count: %v", err)
continue
}
for _, pair := range pairs {
stats, err := s.supervisor.GetContainerStats(pair.container)
if err != nil {
if _, ok := err.(notRunningErr); !ok {
logrus.Errorf("collecting stats for %s: %v", pair.container.ID, err)
continue
}
// publish empty stats containing only name and ID if not running
pair.publisher.Publish(types.StatsJSON{
Name: pair.container.Name,
ID: pair.container.ID,
})
continue
}
// FIXME: move to containerd on Linux (not Windows)
stats.CPUStats.SystemUsage = systemUsage
stats.CPUStats.OnlineCPUs = onlineCPUs
pair.publisher.Publish(*stats)
}
}
}
type notRunningErr interface {
error
ContainerIsRunning() bool
}