Merge pull request #46619 from xinfengliu/24.0_backport_improve_stats_collector
[24.0 backport] Make one-shot stats faster
This commit is contained in:
commit
a27bf4611e
6 changed files with 108 additions and 133 deletions
|
@ -7,6 +7,7 @@ import (
|
|||
"runtime"
|
||||
"time"
|
||||
|
||||
"github.com/containerd/containerd/log"
|
||||
"github.com/docker/docker/api/types"
|
||||
"github.com/docker/docker/api/types/backend"
|
||||
"github.com/docker/docker/api/types/versions"
|
||||
|
@ -43,6 +44,15 @@ func (daemon *Daemon) ContainerStats(ctx context.Context, prefixOrName string, c
|
|||
})
|
||||
}
|
||||
|
||||
// Get container stats directly if OneShot is set
|
||||
if config.OneShot {
|
||||
stats, err := daemon.GetContainerStats(ctr)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return json.NewEncoder(config.OutStream).Encode(stats)
|
||||
}
|
||||
|
||||
outStream := config.OutStream
|
||||
if config.Stream {
|
||||
wf := ioutils.NewWriteFlusher(outStream)
|
||||
|
@ -148,15 +158,34 @@ func (daemon *Daemon) unsubscribeToContainerStats(c *container.Container, ch cha
|
|||
func (daemon *Daemon) GetContainerStats(container *container.Container) (*types.StatsJSON, error) {
|
||||
stats, err := daemon.stats(container)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
goto done
|
||||
}
|
||||
|
||||
// Sample system CPU usage close to container usage to avoid
|
||||
// noise in metric calculations.
|
||||
// FIXME: move to containerd on Linux (not Windows)
|
||||
stats.CPUStats.SystemUsage, stats.CPUStats.OnlineCPUs, err = getSystemCPUUsage()
|
||||
if err != nil {
|
||||
goto done
|
||||
}
|
||||
|
||||
// We already have the network stats on Windows directly from HCS.
|
||||
if !container.Config.NetworkDisabled && runtime.GOOS != "windows" {
|
||||
if stats.Networks, err = daemon.getNetworkStats(container); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
stats.Networks, err = daemon.getNetworkStats(container)
|
||||
}
|
||||
|
||||
return stats, nil
|
||||
done:
|
||||
switch err.(type) {
|
||||
case nil:
|
||||
return stats, nil
|
||||
case errdefs.ErrConflict, errdefs.ErrNotFound:
|
||||
// return empty stats containing only name and ID if not running or not found
|
||||
return &types.StatsJSON{
|
||||
Name: container.Name,
|
||||
ID: container.ID,
|
||||
}, nil
|
||||
default:
|
||||
log.G(context.TODO()).Errorf("collecting stats for container %s: %v", container.Name, err)
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,15 +1,12 @@
|
|||
package stats // import "github.com/docker/docker/daemon/stats"
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/docker/docker/api/types"
|
||||
"github.com/docker/docker/container"
|
||||
"github.com/docker/docker/errdefs"
|
||||
"github.com/moby/pubsub"
|
||||
"github.com/sirupsen/logrus"
|
||||
)
|
||||
|
||||
// Collector manages and provides container resource stats
|
||||
|
@ -19,7 +16,6 @@ type Collector struct {
|
|||
supervisor supervisor
|
||||
interval time.Duration
|
||||
publishers map[*container.Container]*pubsub.Publisher
|
||||
bufReader *bufio.Reader
|
||||
}
|
||||
|
||||
// NewCollector creates a stats collector that will poll the supervisor with the specified interval
|
||||
|
@ -28,7 +24,6 @@ func NewCollector(supervisor supervisor, interval time.Duration) *Collector {
|
|||
interval: interval,
|
||||
supervisor: supervisor,
|
||||
publishers: make(map[*container.Container]*pubsub.Publisher),
|
||||
bufReader: bufio.NewReaderSize(nil, 128),
|
||||
}
|
||||
s.cond = sync.NewCond(&s.m)
|
||||
return s
|
||||
|
@ -107,45 +102,15 @@ func (s *Collector) Run() {
|
|||
|
||||
s.cond.L.Unlock()
|
||||
|
||||
onlineCPUs, err := s.getNumberOnlineCPUs()
|
||||
if err != nil {
|
||||
logrus.Errorf("collecting system online cpu count: %v", err)
|
||||
continue
|
||||
}
|
||||
|
||||
for _, pair := range pairs {
|
||||
stats, err := s.supervisor.GetContainerStats(pair.container)
|
||||
|
||||
switch err.(type) {
|
||||
case nil:
|
||||
// Sample system CPU usage close to container usage to avoid
|
||||
// noise in metric calculations.
|
||||
systemUsage, err := s.getSystemCPUUsage()
|
||||
if err != nil {
|
||||
logrus.WithError(err).WithField("container_id", pair.container.ID).Errorf("collecting system cpu usage")
|
||||
continue
|
||||
if err != nil {
|
||||
stats = &types.StatsJSON{
|
||||
Name: pair.container.Name,
|
||||
ID: pair.container.ID,
|
||||
}
|
||||
|
||||
// FIXME: move to containerd on Linux (not Windows)
|
||||
stats.CPUStats.SystemUsage = systemUsage
|
||||
stats.CPUStats.OnlineCPUs = onlineCPUs
|
||||
|
||||
pair.publisher.Publish(*stats)
|
||||
|
||||
case errdefs.ErrConflict, errdefs.ErrNotFound:
|
||||
// publish empty stats containing only name and ID if not running or not found
|
||||
pair.publisher.Publish(types.StatsJSON{
|
||||
Name: pair.container.Name,
|
||||
ID: pair.container.ID,
|
||||
})
|
||||
|
||||
default:
|
||||
logrus.Errorf("collecting stats for %s: %v", pair.container.ID, err)
|
||||
pair.publisher.Publish(types.StatsJSON{
|
||||
Name: pair.container.Name,
|
||||
ID: pair.container.ID,
|
||||
})
|
||||
}
|
||||
pair.publisher.Publish(*stats)
|
||||
}
|
||||
|
||||
time.Sleep(s.interval)
|
||||
|
|
|
@ -1,76 +0,0 @@
|
|||
//go:build !windows
|
||||
// +build !windows
|
||||
|
||||
package stats // import "github.com/docker/docker/daemon/stats"
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
"golang.org/x/sys/unix"
|
||||
)
|
||||
|
||||
const (
|
||||
// The value comes from `C.sysconf(C._SC_CLK_TCK)`, and
|
||||
// on Linux it's a constant which is safe to be hard coded,
|
||||
// so we can avoid using cgo here. For details, see:
|
||||
// https://github.com/containerd/cgroups/pull/12
|
||||
clockTicksPerSecond = 100
|
||||
nanoSecondsPerSecond = 1e9
|
||||
)
|
||||
|
||||
// getSystemCPUUsage returns the host system's cpu usage in
|
||||
// nanoseconds. An error is returned if the format of the underlying
|
||||
// file does not match.
|
||||
//
|
||||
// Uses /proc/stat defined by POSIX. Looks for the cpu
|
||||
// statistics line and then sums up the first seven fields
|
||||
// provided. See `man 5 proc` for details on specific field
|
||||
// information.
|
||||
func (s *Collector) getSystemCPUUsage() (uint64, error) {
|
||||
f, err := os.Open("/proc/stat")
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
defer func() {
|
||||
s.bufReader.Reset(nil)
|
||||
f.Close()
|
||||
}()
|
||||
s.bufReader.Reset(f)
|
||||
|
||||
for {
|
||||
line, err := s.bufReader.ReadString('\n')
|
||||
if err != nil {
|
||||
break
|
||||
}
|
||||
parts := strings.Fields(line)
|
||||
switch parts[0] {
|
||||
case "cpu":
|
||||
if len(parts) < 8 {
|
||||
return 0, fmt.Errorf("invalid number of cpu fields")
|
||||
}
|
||||
var totalClockTicks uint64
|
||||
for _, i := range parts[1:8] {
|
||||
v, err := strconv.ParseUint(i, 10, 64)
|
||||
if err != nil {
|
||||
return 0, fmt.Errorf("Unable to convert value %s to int: %s", i, err)
|
||||
}
|
||||
totalClockTicks += v
|
||||
}
|
||||
return (totalClockTicks * nanoSecondsPerSecond) /
|
||||
clockTicksPerSecond, nil
|
||||
}
|
||||
}
|
||||
return 0, fmt.Errorf("invalid stat format. Error trying to parse the '/proc/stat' file")
|
||||
}
|
||||
|
||||
func (s *Collector) getNumberOnlineCPUs() (uint32, error) {
|
||||
var cpuset unix.CPUSet
|
||||
err := unix.SchedGetaffinity(0, &cpuset)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
return uint32(cpuset.Count()), nil
|
||||
}
|
|
@ -1,12 +0,0 @@
|
|||
package stats // import "github.com/docker/docker/daemon/stats"
|
||||
|
||||
// getSystemCPUUsage returns the host system's cpu usage in
|
||||
// nanoseconds. An error is returned if the format of the underlying
|
||||
// file does not match. This is a no-op on Windows.
|
||||
func (s *Collector) getSystemCPUUsage() (uint64, error) {
|
||||
return 0, nil
|
||||
}
|
||||
|
||||
func (s *Collector) getNumberOnlineCPUs() (uint32, error) {
|
||||
return 0, nil
|
||||
}
|
|
@ -4,7 +4,11 @@
|
|||
package daemon // import "github.com/docker/docker/daemon"
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"context"
|
||||
"fmt"
|
||||
"os"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
statsV1 "github.com/containerd/cgroups/v3/cgroup1/stats"
|
||||
|
@ -297,3 +301,60 @@ func (daemon *Daemon) getNetworkStats(c *container.Container) (map[string]types.
|
|||
|
||||
return stats, nil
|
||||
}
|
||||
|
||||
const (
|
||||
// The value comes from `C.sysconf(C._SC_CLK_TCK)`, and
|
||||
// on Linux it's a constant which is safe to be hard coded,
|
||||
// so we can avoid using cgo here. For details, see:
|
||||
// https://github.com/containerd/cgroups/pull/12
|
||||
clockTicksPerSecond = 100
|
||||
nanoSecondsPerSecond = 1e9
|
||||
)
|
||||
|
||||
// getSystemCPUUsage returns the host system's cpu usage in
|
||||
// nanoseconds and number of online CPUs. An error is returned
|
||||
// if the format of the underlying file does not match.
|
||||
//
|
||||
// Uses /proc/stat defined by POSIX. Looks for the cpu
|
||||
// statistics line and then sums up the first seven fields
|
||||
// provided. See `man 5 proc` for details on specific field
|
||||
// information.
|
||||
func getSystemCPUUsage() (cpuUsage uint64, cpuNum uint32, err error) {
|
||||
f, err := os.Open("/proc/stat")
|
||||
if err != nil {
|
||||
return 0, 0, err
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
scanner := bufio.NewScanner(f)
|
||||
for scanner.Scan() {
|
||||
line := scanner.Text()
|
||||
if len(line) < 4 || line[:3] != "cpu" {
|
||||
break // Assume all cpu* records are at the front, like glibc https://github.com/bminor/glibc/blob/5d00c201b9a2da768a79ea8d5311f257871c0b43/sysdeps/unix/sysv/linux/getsysstats.c#L108-L135
|
||||
}
|
||||
if line[3] == ' ' {
|
||||
parts := strings.Fields(line)
|
||||
if len(parts) < 8 {
|
||||
return 0, 0, fmt.Errorf("invalid number of cpu fields")
|
||||
}
|
||||
var totalClockTicks uint64
|
||||
for _, i := range parts[1:8] {
|
||||
v, err := strconv.ParseUint(i, 10, 64)
|
||||
if err != nil {
|
||||
return 0, 0, fmt.Errorf("Unable to convert value %s to int: %w", i, err)
|
||||
}
|
||||
totalClockTicks += v
|
||||
}
|
||||
cpuUsage = (totalClockTicks * nanoSecondsPerSecond) /
|
||||
clockTicksPerSecond
|
||||
}
|
||||
if '0' <= line[3] && line[3] <= '9' {
|
||||
cpuNum++
|
||||
}
|
||||
}
|
||||
|
||||
if err := scanner.Err(); err != nil {
|
||||
return 0, 0, fmt.Errorf("error scanning '/proc/stat' file: %w", err)
|
||||
}
|
||||
return
|
||||
}
|
||||
|
|
|
@ -77,3 +77,11 @@ func (daemon *Daemon) stats(c *container.Container) (*types.StatsJSON, error) {
|
|||
func (daemon *Daemon) getNetworkStats(c *container.Container) (map[string]types.NetworkStats, error) {
|
||||
return make(map[string]types.NetworkStats), nil
|
||||
}
|
||||
|
||||
// getSystemCPUUsage returns the host system's cpu usage in
|
||||
// nanoseconds and number of online CPUs. An error is returned
|
||||
// if the format of the underlying file does not match.
|
||||
// This is a no-op on Windows.
|
||||
func getSystemCPUUsage() (uint64, uint32, error) {
|
||||
return 0, 0, nil
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue