Merge pull request #46617 from xinfengliu/23.0_backport_improve_stats_collector

[23.0 backport] Make one-shot stats faster
This commit is contained in:
Brian Goff 2023-10-11 13:07:50 -07:00 committed by GitHub
commit 6b3cb4aaed
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
6 changed files with 110 additions and 133 deletions

View file

@ -7,6 +7,7 @@ import (
"runtime"
"time"
"github.com/containerd/containerd/log"
"github.com/docker/docker/api/types"
"github.com/docker/docker/api/types/backend"
"github.com/docker/docker/api/types/versions"
@ -43,6 +44,15 @@ func (daemon *Daemon) ContainerStats(ctx context.Context, prefixOrName string, c
})
}
// Get container stats directly if OneShot is set
if config.OneShot {
stats, err := daemon.GetContainerStats(ctr)
if err != nil {
return err
}
return json.NewEncoder(config.OutStream).Encode(stats)
}
outStream := config.OutStream
if config.Stream {
wf := ioutils.NewWriteFlusher(outStream)
@ -148,15 +158,34 @@ func (daemon *Daemon) unsubscribeToContainerStats(c *container.Container, ch cha
func (daemon *Daemon) GetContainerStats(container *container.Container) (*types.StatsJSON, error) {
stats, err := daemon.stats(container)
if err != nil {
return nil, err
goto done
}
// Sample system CPU usage close to container usage to avoid
// noise in metric calculations.
// FIXME: move to containerd on Linux (not Windows)
stats.CPUStats.SystemUsage, stats.CPUStats.OnlineCPUs, err = getSystemCPUUsage()
if err != nil {
goto done
}
// We already have the network stats on Windows directly from HCS.
if !container.Config.NetworkDisabled && runtime.GOOS != "windows" {
if stats.Networks, err = daemon.getNetworkStats(container); err != nil {
return nil, err
}
stats.Networks, err = daemon.getNetworkStats(container)
}
return stats, nil
done:
switch err.(type) {
case nil:
return stats, nil
case errdefs.ErrConflict, errdefs.ErrNotFound:
// return empty stats containing only name and ID if not running or not found
return &types.StatsJSON{
Name: container.Name,
ID: container.ID,
}, nil
default:
log.G(context.TODO()).Errorf("collecting stats for container %s: %v", container.Name, err)
return nil, err
}
}

View file

@ -1,15 +1,12 @@
package stats // import "github.com/docker/docker/daemon/stats"
import (
"bufio"
"sync"
"time"
"github.com/docker/docker/api/types"
"github.com/docker/docker/container"
"github.com/docker/docker/errdefs"
"github.com/moby/pubsub"
"github.com/sirupsen/logrus"
)
// Collector manages and provides container resource stats
@ -19,7 +16,6 @@ type Collector struct {
supervisor supervisor
interval time.Duration
publishers map[*container.Container]*pubsub.Publisher
bufReader *bufio.Reader
}
// NewCollector creates a stats collector that will poll the supervisor with the specified interval
@ -28,7 +24,6 @@ func NewCollector(supervisor supervisor, interval time.Duration) *Collector {
interval: interval,
supervisor: supervisor,
publishers: make(map[*container.Container]*pubsub.Publisher),
bufReader: bufio.NewReaderSize(nil, 128),
}
s.cond = sync.NewCond(&s.m)
return s
@ -107,45 +102,15 @@ func (s *Collector) Run() {
s.cond.L.Unlock()
onlineCPUs, err := s.getNumberOnlineCPUs()
if err != nil {
logrus.Errorf("collecting system online cpu count: %v", err)
continue
}
for _, pair := range pairs {
stats, err := s.supervisor.GetContainerStats(pair.container)
switch err.(type) {
case nil:
// Sample system CPU usage close to container usage to avoid
// noise in metric calculations.
systemUsage, err := s.getSystemCPUUsage()
if err != nil {
logrus.WithError(err).WithField("container_id", pair.container.ID).Errorf("collecting system cpu usage")
continue
if err != nil {
stats = &types.StatsJSON{
Name: pair.container.Name,
ID: pair.container.ID,
}
// FIXME: move to containerd on Linux (not Windows)
stats.CPUStats.SystemUsage = systemUsage
stats.CPUStats.OnlineCPUs = onlineCPUs
pair.publisher.Publish(*stats)
case errdefs.ErrConflict, errdefs.ErrNotFound:
// publish empty stats containing only name and ID if not running or not found
pair.publisher.Publish(types.StatsJSON{
Name: pair.container.Name,
ID: pair.container.ID,
})
default:
logrus.Errorf("collecting stats for %s: %v", pair.container.ID, err)
pair.publisher.Publish(types.StatsJSON{
Name: pair.container.Name,
ID: pair.container.ID,
})
}
pair.publisher.Publish(*stats)
}
time.Sleep(s.interval)

View file

@ -1,76 +0,0 @@
//go:build !windows
// +build !windows
package stats // import "github.com/docker/docker/daemon/stats"
import (
"fmt"
"os"
"strconv"
"strings"
"golang.org/x/sys/unix"
)
const (
// The value comes from `C.sysconf(C._SC_CLK_TCK)`, and
// on Linux it's a constant which is safe to be hard coded,
// so we can avoid using cgo here. For details, see:
// https://github.com/containerd/cgroups/pull/12
clockTicksPerSecond = 100
nanoSecondsPerSecond = 1e9
)
// getSystemCPUUsage returns the host system's cpu usage in
// nanoseconds. An error is returned if the format of the underlying
// file does not match.
//
// Uses /proc/stat defined by POSIX. Looks for the cpu
// statistics line and then sums up the first seven fields
// provided. See `man 5 proc` for details on specific field
// information.
func (s *Collector) getSystemCPUUsage() (uint64, error) {
f, err := os.Open("/proc/stat")
if err != nil {
return 0, err
}
defer func() {
s.bufReader.Reset(nil)
f.Close()
}()
s.bufReader.Reset(f)
for {
line, err := s.bufReader.ReadString('\n')
if err != nil {
break
}
parts := strings.Fields(line)
switch parts[0] {
case "cpu":
if len(parts) < 8 {
return 0, fmt.Errorf("invalid number of cpu fields")
}
var totalClockTicks uint64
for _, i := range parts[1:8] {
v, err := strconv.ParseUint(i, 10, 64)
if err != nil {
return 0, fmt.Errorf("Unable to convert value %s to int: %s", i, err)
}
totalClockTicks += v
}
return (totalClockTicks * nanoSecondsPerSecond) /
clockTicksPerSecond, nil
}
}
return 0, fmt.Errorf("invalid stat format. Error trying to parse the '/proc/stat' file")
}
func (s *Collector) getNumberOnlineCPUs() (uint32, error) {
var cpuset unix.CPUSet
err := unix.SchedGetaffinity(0, &cpuset)
if err != nil {
return 0, err
}
return uint32(cpuset.Count()), nil
}

View file

@ -1,12 +0,0 @@
package stats // import "github.com/docker/docker/daemon/stats"
// getSystemCPUUsage returns the host system's cpu usage in
// nanoseconds. An error is returned if the format of the underlying
// file does not match. This is a no-op on Windows.
func (s *Collector) getSystemCPUUsage() (uint64, error) {
return 0, nil
}
func (s *Collector) getNumberOnlineCPUs() (uint32, error) {
return 0, nil
}

View file

@ -4,6 +4,12 @@
package daemon // import "github.com/docker/docker/daemon"
import (
"bufio"
"fmt"
"os"
"strconv"
"strings"
"github.com/docker/docker/api/types"
"github.com/docker/docker/container"
"github.com/pkg/errors"
@ -56,3 +62,60 @@ func (daemon *Daemon) getNetworkStats(c *container.Container) (map[string]types.
return stats, nil
}
const (
// The value comes from `C.sysconf(C._SC_CLK_TCK)`, and
// on Linux it's a constant which is safe to be hard coded,
// so we can avoid using cgo here. For details, see:
// https://github.com/containerd/cgroups/pull/12
clockTicksPerSecond = 100
nanoSecondsPerSecond = 1e9
)
// getSystemCPUUsage returns the host system's cpu usage in
// nanoseconds and number of online CPUs. An error is returned
// if the format of the underlying file does not match.
//
// Uses /proc/stat defined by POSIX. Looks for the cpu
// statistics line and then sums up the first seven fields
// provided. See `man 5 proc` for details on specific field
// information.
func getSystemCPUUsage() (cpuUsage uint64, cpuNum uint32, err error) {
f, err := os.Open("/proc/stat")
if err != nil {
return 0, 0, err
}
defer f.Close()
scanner := bufio.NewScanner(f)
for scanner.Scan() {
line := scanner.Text()
if len(line) < 4 || line[:3] != "cpu" {
break // Assume all cpu* records are at the front, like glibc https://github.com/bminor/glibc/blob/5d00c201b9a2da768a79ea8d5311f257871c0b43/sysdeps/unix/sysv/linux/getsysstats.c#L108-L135
}
if line[3] == ' ' {
parts := strings.Fields(line)
if len(parts) < 8 {
return 0, 0, fmt.Errorf("invalid number of cpu fields")
}
var totalClockTicks uint64
for _, i := range parts[1:8] {
v, err := strconv.ParseUint(i, 10, 64)
if err != nil {
return 0, 0, fmt.Errorf("Unable to convert value %s to int: %w", i, err)
}
totalClockTicks += v
}
cpuUsage = (totalClockTicks * nanoSecondsPerSecond) /
clockTicksPerSecond
}
if '0' <= line[3] && line[3] <= '9' {
cpuNum++
}
}
if err := scanner.Err(); err != nil {
return 0, 0, fmt.Errorf("error scanning '/proc/stat' file: %w", err)
}
return
}

View file

@ -9,3 +9,11 @@ import (
func (daemon *Daemon) getNetworkStats(c *container.Container) (map[string]types.NetworkStats, error) {
return make(map[string]types.NetworkStats), nil
}
// getSystemCPUUsage returns the host system's cpu usage in
// nanoseconds and number of online CPUs. An error is returned
// if the format of the underlying file does not match.
// This is a no-op on Windows.
func getSystemCPUUsage() (uint64, uint32, error) {
return 0, 0, nil
}