Merge pull request #21342 from tonistiigi/cleanup-libcontainer

Convert libnetwork stats directly to api types
This commit is contained in:
Arnaud Porterie 2016-03-21 08:54:03 -07:00
commit 2a4c970aeb
73 changed files with 19 additions and 10645 deletions

View file

@ -70,9 +70,7 @@ import (
"github.com/docker/go-connections/nat"
"github.com/docker/libnetwork"
nwconfig "github.com/docker/libnetwork/config"
lntypes "github.com/docker/libnetwork/types"
"github.com/docker/libtrust"
"github.com/opencontainers/runc/libcontainer"
"golang.org/x/net/context"
)
@ -1528,50 +1526,40 @@ func (daemon *Daemon) GetContainerStats(container *container.Container) (*types.
return nil, err
}
// Retrieve the nw statistics from libnetwork and inject them in the Stats
var nwStats []*libcontainer.NetworkInterface
if nwStats, err = daemon.getNetworkStats(container); err != nil {
if stats.Networks, err = daemon.getNetworkStats(container); err != nil {
return nil, err
}
stats.Networks = make(map[string]types.NetworkStats)
for _, iface := range nwStats {
// For API Version >= 1.21, the original data of network will
// be returned.
stats.Networks[iface.Name] = types.NetworkStats{
RxBytes: iface.RxBytes,
RxPackets: iface.RxPackets,
RxErrors: iface.RxErrors,
RxDropped: iface.RxDropped,
TxBytes: iface.TxBytes,
TxPackets: iface.TxPackets,
TxErrors: iface.TxErrors,
TxDropped: iface.TxDropped,
}
}
return stats, nil
}
func (daemon *Daemon) getNetworkStats(c *container.Container) ([]*libcontainer.NetworkInterface, error) {
var list []*libcontainer.NetworkInterface
func (daemon *Daemon) getNetworkStats(c *container.Container) (map[string]types.NetworkStats, error) {
sb, err := daemon.netController.SandboxByID(c.NetworkSettings.SandboxID)
if err != nil {
return list, err
return nil, err
}
stats, err := sb.Statistics()
lnstats, err := sb.Statistics()
if err != nil {
return list, err
return nil, err
}
// Convert libnetwork nw stats into libcontainer nw stats
for ifName, ifStats := range stats {
list = append(list, convertLnNetworkStats(ifName, ifStats))
stats := make(map[string]types.NetworkStats)
// Convert libnetwork nw stats into engine-api stats
for ifName, ifStats := range lnstats {
stats[ifName] = types.NetworkStats{
RxBytes: ifStats.RxBytes,
RxPackets: ifStats.RxPackets,
RxErrors: ifStats.RxErrors,
RxDropped: ifStats.RxDropped,
TxBytes: ifStats.TxBytes,
TxPackets: ifStats.TxPackets,
TxErrors: ifStats.TxErrors,
TxDropped: ifStats.TxDropped,
}
}
return list, nil
return stats, nil
}
// newBaseContainer creates a new container with its initial
@ -1675,19 +1663,6 @@ func (daemon *Daemon) reloadClusterDiscovery(config *Config) error {
return nil
}
func convertLnNetworkStats(name string, stats *lntypes.InterfaceStatistics) *libcontainer.NetworkInterface {
n := &libcontainer.NetworkInterface{Name: name}
n.RxBytes = stats.RxBytes
n.RxPackets = stats.RxPackets
n.RxErrors = stats.RxErrors
n.RxDropped = stats.RxDropped
n.TxBytes = stats.TxBytes
n.TxPackets = stats.TxPackets
n.TxErrors = stats.TxErrors
n.TxDropped = stats.TxDropped
return n
}
func validateID(id string) error {
if id == "" {
return fmt.Errorf("Invalid empty id")

View file

@ -1,14 +0,0 @@
package daemon
import (
"github.com/docker/engine-api/types"
"github.com/opencontainers/runc/libcontainer"
)
// convertStatsToAPITypes converts the libcontainer.Stats to the api specific
// structs. This is done to preserve API compatibility and versioning.
func convertStatsToAPITypes(ls *libcontainer.Stats) *types.StatsJSON {
// TODO FreeBSD. Refactor accordingly to fill in stats.
s := &types.StatsJSON{}
return s
}

View file

@ -1,187 +0,0 @@
// Copyright 2015 CoreOS, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Integration with the systemd D-Bus API. See http://www.freedesktop.org/wiki/Software/systemd/dbus/
package dbus
import (
"fmt"
"os"
"strconv"
"strings"
"sync"
"github.com/godbus/dbus"
)
const (
alpha = `abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ`
num = `0123456789`
alphanum = alpha + num
signalBuffer = 100
)
// needsEscape checks whether a byte in a potential dbus ObjectPath needs to be escaped
func needsEscape(i int, b byte) bool {
// Escape everything that is not a-z-A-Z-0-9
// Also escape 0-9 if it's the first character
return strings.IndexByte(alphanum, b) == -1 ||
(i == 0 && strings.IndexByte(num, b) != -1)
}
// PathBusEscape sanitizes a constituent string of a dbus ObjectPath using the
// rules that systemd uses for serializing special characters.
func PathBusEscape(path string) string {
// Special case the empty string
if len(path) == 0 {
return "_"
}
n := []byte{}
for i := 0; i < len(path); i++ {
c := path[i]
if needsEscape(i, c) {
e := fmt.Sprintf("_%x", c)
n = append(n, []byte(e)...)
} else {
n = append(n, c)
}
}
return string(n)
}
// Conn is a connection to systemd's dbus endpoint.
type Conn struct {
// sysconn/sysobj are only used to call dbus methods
sysconn *dbus.Conn
sysobj dbus.BusObject
// sigconn/sigobj are only used to receive dbus signals
sigconn *dbus.Conn
sigobj dbus.BusObject
jobListener struct {
jobs map[dbus.ObjectPath]chan<- string
sync.Mutex
}
subscriber struct {
updateCh chan<- *SubStateUpdate
errCh chan<- error
sync.Mutex
ignore map[dbus.ObjectPath]int64
cleanIgnore int64
}
}
// New establishes a connection to the system bus and authenticates.
// Callers should call Close() when done with the connection.
func New() (*Conn, error) {
return newConnection(func() (*dbus.Conn, error) {
return dbusAuthHelloConnection(dbus.SystemBusPrivate)
})
}
// NewUserConnection establishes a connection to the session bus and
// authenticates. This can be used to connect to systemd user instances.
// Callers should call Close() when done with the connection.
func NewUserConnection() (*Conn, error) {
return newConnection(func() (*dbus.Conn, error) {
return dbusAuthHelloConnection(dbus.SessionBusPrivate)
})
}
// NewSystemdConnection establishes a private, direct connection to systemd.
// This can be used for communicating with systemd without a dbus daemon.
// Callers should call Close() when done with the connection.
func NewSystemdConnection() (*Conn, error) {
return newConnection(func() (*dbus.Conn, error) {
// We skip Hello when talking directly to systemd.
return dbusAuthConnection(func() (*dbus.Conn, error) {
return dbus.Dial("unix:path=/run/systemd/private")
})
})
}
// Close closes an established connection
func (c *Conn) Close() {
c.sysconn.Close()
c.sigconn.Close()
}
func newConnection(createBus func() (*dbus.Conn, error)) (*Conn, error) {
sysconn, err := createBus()
if err != nil {
return nil, err
}
sigconn, err := createBus()
if err != nil {
sysconn.Close()
return nil, err
}
c := &Conn{
sysconn: sysconn,
sysobj: systemdObject(sysconn),
sigconn: sigconn,
sigobj: systemdObject(sigconn),
}
c.subscriber.ignore = make(map[dbus.ObjectPath]int64)
c.jobListener.jobs = make(map[dbus.ObjectPath]chan<- string)
// Setup the listeners on jobs so that we can get completions
c.sigconn.BusObject().Call("org.freedesktop.DBus.AddMatch", 0,
"type='signal', interface='org.freedesktop.systemd1.Manager', member='JobRemoved'")
c.dispatch()
return c, nil
}
func dbusAuthConnection(createBus func() (*dbus.Conn, error)) (*dbus.Conn, error) {
conn, err := createBus()
if err != nil {
return nil, err
}
// Only use EXTERNAL method, and hardcode the uid (not username)
// to avoid a username lookup (which requires a dynamically linked
// libc)
methods := []dbus.Auth{dbus.AuthExternal(strconv.Itoa(os.Getuid()))}
err = conn.Auth(methods)
if err != nil {
conn.Close()
return nil, err
}
return conn, nil
}
func dbusAuthHelloConnection(createBus func() (*dbus.Conn, error)) (*dbus.Conn, error) {
conn, err := dbusAuthConnection(createBus)
if err != nil {
return nil, err
}
if err = conn.Hello(); err != nil {
conn.Close()
return nil, err
}
return conn, nil
}
func systemdObject(conn *dbus.Conn) dbus.BusObject {
return conn.Object("org.freedesktop.systemd1", dbus.ObjectPath("/org/freedesktop/systemd1"))
}

View file

@ -1,410 +0,0 @@
// Copyright 2015 CoreOS, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package dbus
import (
"errors"
"path"
"strconv"
"github.com/godbus/dbus"
)
func (c *Conn) jobComplete(signal *dbus.Signal) {
var id uint32
var job dbus.ObjectPath
var unit string
var result string
dbus.Store(signal.Body, &id, &job, &unit, &result)
c.jobListener.Lock()
out, ok := c.jobListener.jobs[job]
if ok {
out <- result
delete(c.jobListener.jobs, job)
}
c.jobListener.Unlock()
}
func (c *Conn) startJob(ch chan<- string, job string, args ...interface{}) (int, error) {
if ch != nil {
c.jobListener.Lock()
defer c.jobListener.Unlock()
}
var p dbus.ObjectPath
err := c.sysobj.Call(job, 0, args...).Store(&p)
if err != nil {
return 0, err
}
if ch != nil {
c.jobListener.jobs[p] = ch
}
// ignore error since 0 is fine if conversion fails
jobID, _ := strconv.Atoi(path.Base(string(p)))
return jobID, nil
}
// StartUnit enqueues a start job and depending jobs, if any (unless otherwise
// specified by the mode string).
//
// Takes the unit to activate, plus a mode string. The mode needs to be one of
// replace, fail, isolate, ignore-dependencies, ignore-requirements. If
// "replace" the call will start the unit and its dependencies, possibly
// replacing already queued jobs that conflict with this. If "fail" the call
// will start the unit and its dependencies, but will fail if this would change
// an already queued job. If "isolate" the call will start the unit in question
// and terminate all units that aren't dependencies of it. If
// "ignore-dependencies" it will start a unit but ignore all its dependencies.
// If "ignore-requirements" it will start a unit but only ignore the
// requirement dependencies. It is not recommended to make use of the latter
// two options.
//
// If the provided channel is non-nil, a result string will be sent to it upon
// job completion: one of done, canceled, timeout, failed, dependency, skipped.
// done indicates successful execution of a job. canceled indicates that a job
// has been canceled before it finished execution. timeout indicates that the
// job timeout was reached. failed indicates that the job failed. dependency
// indicates that a job this job has been depending on failed and the job hence
// has been removed too. skipped indicates that a job was skipped because it
// didn't apply to the units current state.
//
// If no error occurs, the ID of the underlying systemd job will be returned. There
// does exist the possibility for no error to be returned, but for the returned job
// ID to be 0. In this case, the actual underlying ID is not 0 and this datapoint
// should not be considered authoritative.
//
// If an error does occur, it will be returned to the user alongside a job ID of 0.
func (c *Conn) StartUnit(name string, mode string, ch chan<- string) (int, error) {
return c.startJob(ch, "org.freedesktop.systemd1.Manager.StartUnit", name, mode)
}
// StopUnit is similar to StartUnit but stops the specified unit rather
// than starting it.
func (c *Conn) StopUnit(name string, mode string, ch chan<- string) (int, error) {
return c.startJob(ch, "org.freedesktop.systemd1.Manager.StopUnit", name, mode)
}
// ReloadUnit reloads a unit. Reloading is done only if the unit is already running and fails otherwise.
func (c *Conn) ReloadUnit(name string, mode string, ch chan<- string) (int, error) {
return c.startJob(ch, "org.freedesktop.systemd1.Manager.ReloadUnit", name, mode)
}
// RestartUnit restarts a service. If a service is restarted that isn't
// running it will be started.
func (c *Conn) RestartUnit(name string, mode string, ch chan<- string) (int, error) {
return c.startJob(ch, "org.freedesktop.systemd1.Manager.RestartUnit", name, mode)
}
// TryRestartUnit is like RestartUnit, except that a service that isn't running
// is not affected by the restart.
func (c *Conn) TryRestartUnit(name string, mode string, ch chan<- string) (int, error) {
return c.startJob(ch, "org.freedesktop.systemd1.Manager.TryRestartUnit", name, mode)
}
// ReloadOrRestart attempts a reload if the unit supports it and use a restart
// otherwise.
func (c *Conn) ReloadOrRestartUnit(name string, mode string, ch chan<- string) (int, error) {
return c.startJob(ch, "org.freedesktop.systemd1.Manager.ReloadOrRestartUnit", name, mode)
}
// ReloadOrTryRestart attempts a reload if the unit supports it and use a "Try"
// flavored restart otherwise.
func (c *Conn) ReloadOrTryRestartUnit(name string, mode string, ch chan<- string) (int, error) {
return c.startJob(ch, "org.freedesktop.systemd1.Manager.ReloadOrTryRestartUnit", name, mode)
}
// StartTransientUnit() may be used to create and start a transient unit, which
// will be released as soon as it is not running or referenced anymore or the
// system is rebooted. name is the unit name including suffix, and must be
// unique. mode is the same as in StartUnit(), properties contains properties
// of the unit.
func (c *Conn) StartTransientUnit(name string, mode string, properties []Property, ch chan<- string) (int, error) {
return c.startJob(ch, "org.freedesktop.systemd1.Manager.StartTransientUnit", name, mode, properties, make([]PropertyCollection, 0))
}
// KillUnit takes the unit name and a UNIX signal number to send. All of the unit's
// processes are killed.
func (c *Conn) KillUnit(name string, signal int32) {
c.sysobj.Call("org.freedesktop.systemd1.Manager.KillUnit", 0, name, "all", signal).Store()
}
// ResetFailedUnit resets the "failed" state of a specific unit.
func (c *Conn) ResetFailedUnit(name string) error {
return c.sysobj.Call("org.freedesktop.systemd1.Manager.ResetFailedUnit", 0, name).Store()
}
// getProperties takes the unit name and returns all of its dbus object properties, for the given dbus interface
func (c *Conn) getProperties(unit string, dbusInterface string) (map[string]interface{}, error) {
var err error
var props map[string]dbus.Variant
path := unitPath(unit)
if !path.IsValid() {
return nil, errors.New("invalid unit name: " + unit)
}
obj := c.sysconn.Object("org.freedesktop.systemd1", path)
err = obj.Call("org.freedesktop.DBus.Properties.GetAll", 0, dbusInterface).Store(&props)
if err != nil {
return nil, err
}
out := make(map[string]interface{}, len(props))
for k, v := range props {
out[k] = v.Value()
}
return out, nil
}
// GetUnitProperties takes the unit name and returns all of its dbus object properties.
func (c *Conn) GetUnitProperties(unit string) (map[string]interface{}, error) {
return c.getProperties(unit, "org.freedesktop.systemd1.Unit")
}
func (c *Conn) getProperty(unit string, dbusInterface string, propertyName string) (*Property, error) {
var err error
var prop dbus.Variant
path := unitPath(unit)
if !path.IsValid() {
return nil, errors.New("invalid unit name: " + unit)
}
obj := c.sysconn.Object("org.freedesktop.systemd1", path)
err = obj.Call("org.freedesktop.DBus.Properties.Get", 0, dbusInterface, propertyName).Store(&prop)
if err != nil {
return nil, err
}
return &Property{Name: propertyName, Value: prop}, nil
}
func (c *Conn) GetUnitProperty(unit string, propertyName string) (*Property, error) {
return c.getProperty(unit, "org.freedesktop.systemd1.Unit", propertyName)
}
// GetUnitTypeProperties returns the extra properties for a unit, specific to the unit type.
// Valid values for unitType: Service, Socket, Target, Device, Mount, Automount, Snapshot, Timer, Swap, Path, Slice, Scope
// return "dbus.Error: Unknown interface" if the unitType is not the correct type of the unit
func (c *Conn) GetUnitTypeProperties(unit string, unitType string) (map[string]interface{}, error) {
return c.getProperties(unit, "org.freedesktop.systemd1."+unitType)
}
// SetUnitProperties() may be used to modify certain unit properties at runtime.
// Not all properties may be changed at runtime, but many resource management
// settings (primarily those in systemd.cgroup(5)) may. The changes are applied
// instantly, and stored on disk for future boots, unless runtime is true, in which
// case the settings only apply until the next reboot. name is the name of the unit
// to modify. properties are the settings to set, encoded as an array of property
// name and value pairs.
func (c *Conn) SetUnitProperties(name string, runtime bool, properties ...Property) error {
return c.sysobj.Call("org.freedesktop.systemd1.Manager.SetUnitProperties", 0, name, runtime, properties).Store()
}
func (c *Conn) GetUnitTypeProperty(unit string, unitType string, propertyName string) (*Property, error) {
return c.getProperty(unit, "org.freedesktop.systemd1."+unitType, propertyName)
}
// ListUnits returns an array with all currently loaded units. Note that
// units may be known by multiple names at the same time, and hence there might
// be more unit names loaded than actual units behind them.
func (c *Conn) ListUnits() ([]UnitStatus, error) {
result := make([][]interface{}, 0)
err := c.sysobj.Call("org.freedesktop.systemd1.Manager.ListUnits", 0).Store(&result)
if err != nil {
return nil, err
}
resultInterface := make([]interface{}, len(result))
for i := range result {
resultInterface[i] = result[i]
}
status := make([]UnitStatus, len(result))
statusInterface := make([]interface{}, len(status))
for i := range status {
statusInterface[i] = &status[i]
}
err = dbus.Store(resultInterface, statusInterface...)
if err != nil {
return nil, err
}
return status, nil
}
type UnitStatus struct {
Name string // The primary unit name as string
Description string // The human readable description string
LoadState string // The load state (i.e. whether the unit file has been loaded successfully)
ActiveState string // The active state (i.e. whether the unit is currently started or not)
SubState string // The sub state (a more fine-grained version of the active state that is specific to the unit type, which the active state is not)
Followed string // A unit that is being followed in its state by this unit, if there is any, otherwise the empty string.
Path dbus.ObjectPath // The unit object path
JobId uint32 // If there is a job queued for the job unit the numeric job id, 0 otherwise
JobType string // The job type as string
JobPath dbus.ObjectPath // The job object path
}
type LinkUnitFileChange EnableUnitFileChange
// LinkUnitFiles() links unit files (that are located outside of the
// usual unit search paths) into the unit search path.
//
// It takes a list of absolute paths to unit files to link and two
// booleans. The first boolean controls whether the unit shall be
// enabled for runtime only (true, /run), or persistently (false,
// /etc).
// The second controls whether symlinks pointing to other units shall
// be replaced if necessary.
//
// This call returns a list of the changes made. The list consists of
// structures with three strings: the type of the change (one of symlink
// or unlink), the file name of the symlink and the destination of the
// symlink.
func (c *Conn) LinkUnitFiles(files []string, runtime bool, force bool) ([]LinkUnitFileChange, error) {
result := make([][]interface{}, 0)
err := c.sysobj.Call("org.freedesktop.systemd1.Manager.LinkUnitFiles", 0, files, runtime, force).Store(&result)
if err != nil {
return nil, err
}
resultInterface := make([]interface{}, len(result))
for i := range result {
resultInterface[i] = result[i]
}
changes := make([]LinkUnitFileChange, len(result))
changesInterface := make([]interface{}, len(changes))
for i := range changes {
changesInterface[i] = &changes[i]
}
err = dbus.Store(resultInterface, changesInterface...)
if err != nil {
return nil, err
}
return changes, nil
}
// EnableUnitFiles() may be used to enable one or more units in the system (by
// creating symlinks to them in /etc or /run).
//
// It takes a list of unit files to enable (either just file names or full
// absolute paths if the unit files are residing outside the usual unit
// search paths), and two booleans: the first controls whether the unit shall
// be enabled for runtime only (true, /run), or persistently (false, /etc).
// The second one controls whether symlinks pointing to other units shall
// be replaced if necessary.
//
// This call returns one boolean and an array with the changes made. The
// boolean signals whether the unit files contained any enablement
// information (i.e. an [Install]) section. The changes list consists of
// structures with three strings: the type of the change (one of symlink
// or unlink), the file name of the symlink and the destination of the
// symlink.
func (c *Conn) EnableUnitFiles(files []string, runtime bool, force bool) (bool, []EnableUnitFileChange, error) {
var carries_install_info bool
result := make([][]interface{}, 0)
err := c.sysobj.Call("org.freedesktop.systemd1.Manager.EnableUnitFiles", 0, files, runtime, force).Store(&carries_install_info, &result)
if err != nil {
return false, nil, err
}
resultInterface := make([]interface{}, len(result))
for i := range result {
resultInterface[i] = result[i]
}
changes := make([]EnableUnitFileChange, len(result))
changesInterface := make([]interface{}, len(changes))
for i := range changes {
changesInterface[i] = &changes[i]
}
err = dbus.Store(resultInterface, changesInterface...)
if err != nil {
return false, nil, err
}
return carries_install_info, changes, nil
}
type EnableUnitFileChange struct {
Type string // Type of the change (one of symlink or unlink)
Filename string // File name of the symlink
Destination string // Destination of the symlink
}
// DisableUnitFiles() may be used to disable one or more units in the system (by
// removing symlinks to them from /etc or /run).
//
// It takes a list of unit files to disable (either just file names or full
// absolute paths if the unit files are residing outside the usual unit
// search paths), and one boolean: whether the unit was enabled for runtime
// only (true, /run), or persistently (false, /etc).
//
// This call returns an array with the changes made. The changes list
// consists of structures with three strings: the type of the change (one of
// symlink or unlink), the file name of the symlink and the destination of the
// symlink.
func (c *Conn) DisableUnitFiles(files []string, runtime bool) ([]DisableUnitFileChange, error) {
result := make([][]interface{}, 0)
err := c.sysobj.Call("org.freedesktop.systemd1.Manager.DisableUnitFiles", 0, files, runtime).Store(&result)
if err != nil {
return nil, err
}
resultInterface := make([]interface{}, len(result))
for i := range result {
resultInterface[i] = result[i]
}
changes := make([]DisableUnitFileChange, len(result))
changesInterface := make([]interface{}, len(changes))
for i := range changes {
changesInterface[i] = &changes[i]
}
err = dbus.Store(resultInterface, changesInterface...)
if err != nil {
return nil, err
}
return changes, nil
}
type DisableUnitFileChange struct {
Type string // Type of the change (one of symlink or unlink)
Filename string // File name of the symlink
Destination string // Destination of the symlink
}
// Reload instructs systemd to scan for and reload unit files. This is
// equivalent to a 'systemctl daemon-reload'.
func (c *Conn) Reload() error {
return c.sysobj.Call("org.freedesktop.systemd1.Manager.Reload", 0).Store()
}
func unitPath(name string) dbus.ObjectPath {
return dbus.ObjectPath("/org/freedesktop/systemd1/unit/" + PathBusEscape(name))
}

View file

@ -1,218 +0,0 @@
// Copyright 2015 CoreOS, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package dbus
import (
"github.com/godbus/dbus"
)
// From the systemd docs:
//
// The properties array of StartTransientUnit() may take many of the settings
// that may also be configured in unit files. Not all parameters are currently
// accepted though, but we plan to cover more properties with future release.
// Currently you may set the Description, Slice and all dependency types of
// units, as well as RemainAfterExit, ExecStart for service units,
// TimeoutStopUSec and PIDs for scope units, and CPUAccounting, CPUShares,
// BlockIOAccounting, BlockIOWeight, BlockIOReadBandwidth,
// BlockIOWriteBandwidth, BlockIODeviceWeight, MemoryAccounting, MemoryLimit,
// DevicePolicy, DeviceAllow for services/scopes/slices. These fields map
// directly to their counterparts in unit files and as normal D-Bus object
// properties. The exception here is the PIDs field of scope units which is
// used for construction of the scope only and specifies the initial PIDs to
// add to the scope object.
type Property struct {
Name string
Value dbus.Variant
}
type PropertyCollection struct {
Name string
Properties []Property
}
type execStart struct {
Path string // the binary path to execute
Args []string // an array with all arguments to pass to the executed command, starting with argument 0
UncleanIsFailure bool // a boolean whether it should be considered a failure if the process exits uncleanly
}
// PropExecStart sets the ExecStart service property. The first argument is a
// slice with the binary path to execute followed by the arguments to pass to
// the executed command. See
// http://www.freedesktop.org/software/systemd/man/systemd.service.html#ExecStart=
func PropExecStart(command []string, uncleanIsFailure bool) Property {
execStarts := []execStart{
execStart{
Path: command[0],
Args: command,
UncleanIsFailure: uncleanIsFailure,
},
}
return Property{
Name: "ExecStart",
Value: dbus.MakeVariant(execStarts),
}
}
// PropRemainAfterExit sets the RemainAfterExit service property. See
// http://www.freedesktop.org/software/systemd/man/systemd.service.html#RemainAfterExit=
func PropRemainAfterExit(b bool) Property {
return Property{
Name: "RemainAfterExit",
Value: dbus.MakeVariant(b),
}
}
// PropDescription sets the Description unit property. See
// http://www.freedesktop.org/software/systemd/man/systemd.unit#Description=
func PropDescription(desc string) Property {
return Property{
Name: "Description",
Value: dbus.MakeVariant(desc),
}
}
func propDependency(name string, units []string) Property {
return Property{
Name: name,
Value: dbus.MakeVariant(units),
}
}
// PropRequires sets the Requires unit property. See
// http://www.freedesktop.org/software/systemd/man/systemd.unit.html#Requires=
func PropRequires(units ...string) Property {
return propDependency("Requires", units)
}
// PropRequiresOverridable sets the RequiresOverridable unit property. See
// http://www.freedesktop.org/software/systemd/man/systemd.unit.html#RequiresOverridable=
func PropRequiresOverridable(units ...string) Property {
return propDependency("RequiresOverridable", units)
}
// PropRequisite sets the Requisite unit property. See
// http://www.freedesktop.org/software/systemd/man/systemd.unit.html#Requisite=
func PropRequisite(units ...string) Property {
return propDependency("Requisite", units)
}
// PropRequisiteOverridable sets the RequisiteOverridable unit property. See
// http://www.freedesktop.org/software/systemd/man/systemd.unit.html#RequisiteOverridable=
func PropRequisiteOverridable(units ...string) Property {
return propDependency("RequisiteOverridable", units)
}
// PropWants sets the Wants unit property. See
// http://www.freedesktop.org/software/systemd/man/systemd.unit.html#Wants=
func PropWants(units ...string) Property {
return propDependency("Wants", units)
}
// PropBindsTo sets the BindsTo unit property. See
// http://www.freedesktop.org/software/systemd/man/systemd.unit.html#BindsTo=
func PropBindsTo(units ...string) Property {
return propDependency("BindsTo", units)
}
// PropRequiredBy sets the RequiredBy unit property. See
// http://www.freedesktop.org/software/systemd/man/systemd.unit.html#RequiredBy=
func PropRequiredBy(units ...string) Property {
return propDependency("RequiredBy", units)
}
// PropRequiredByOverridable sets the RequiredByOverridable unit property. See
// http://www.freedesktop.org/software/systemd/man/systemd.unit.html#RequiredByOverridable=
func PropRequiredByOverridable(units ...string) Property {
return propDependency("RequiredByOverridable", units)
}
// PropWantedBy sets the WantedBy unit property. See
// http://www.freedesktop.org/software/systemd/man/systemd.unit.html#WantedBy=
func PropWantedBy(units ...string) Property {
return propDependency("WantedBy", units)
}
// PropBoundBy sets the BoundBy unit property. See
// http://www.freedesktop.org/software/systemd/main/systemd.unit.html#BoundBy=
func PropBoundBy(units ...string) Property {
return propDependency("BoundBy", units)
}
// PropConflicts sets the Conflicts unit property. See
// http://www.freedesktop.org/software/systemd/man/systemd.unit.html#Conflicts=
func PropConflicts(units ...string) Property {
return propDependency("Conflicts", units)
}
// PropConflictedBy sets the ConflictedBy unit property. See
// http://www.freedesktop.org/software/systemd/man/systemd.unit.html#ConflictedBy=
func PropConflictedBy(units ...string) Property {
return propDependency("ConflictedBy", units)
}
// PropBefore sets the Before unit property. See
// http://www.freedesktop.org/software/systemd/man/systemd.unit.html#Before=
func PropBefore(units ...string) Property {
return propDependency("Before", units)
}
// PropAfter sets the After unit property. See
// http://www.freedesktop.org/software/systemd/man/systemd.unit.html#After=
func PropAfter(units ...string) Property {
return propDependency("After", units)
}
// PropOnFailure sets the OnFailure unit property. See
// http://www.freedesktop.org/software/systemd/man/systemd.unit.html#OnFailure=
func PropOnFailure(units ...string) Property {
return propDependency("OnFailure", units)
}
// PropTriggers sets the Triggers unit property. See
// http://www.freedesktop.org/software/systemd/man/systemd.unit.html#Triggers=
func PropTriggers(units ...string) Property {
return propDependency("Triggers", units)
}
// PropTriggeredBy sets the TriggeredBy unit property. See
// http://www.freedesktop.org/software/systemd/man/systemd.unit.html#TriggeredBy=
func PropTriggeredBy(units ...string) Property {
return propDependency("TriggeredBy", units)
}
// PropPropagatesReloadTo sets the PropagatesReloadTo unit property. See
// http://www.freedesktop.org/software/systemd/man/systemd.unit.html#PropagatesReloadTo=
func PropPropagatesReloadTo(units ...string) Property {
return propDependency("PropagatesReloadTo", units)
}
// PropRequiresMountsFor sets the RequiresMountsFor unit property. See
// http://www.freedesktop.org/software/systemd/man/systemd.unit.html#RequiresMountsFor=
func PropRequiresMountsFor(units ...string) Property {
return propDependency("RequiresMountsFor", units)
}
// PropSlice sets the Slice unit property. See
// http://www.freedesktop.org/software/systemd/man/systemd.resource-control.html#Slice=
func PropSlice(slice string) Property {
return Property{
Name: "Slice",
Value: dbus.MakeVariant(slice),
}
}

View file

@ -1,47 +0,0 @@
// Copyright 2015 CoreOS, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package dbus
type set struct {
data map[string]bool
}
func (s *set) Add(value string) {
s.data[value] = true
}
func (s *set) Remove(value string) {
delete(s.data, value)
}
func (s *set) Contains(value string) (exists bool) {
_, exists = s.data[value]
return
}
func (s *set) Length() int {
return len(s.data)
}
func (s *set) Values() (values []string) {
for val, _ := range s.data {
values = append(values, val)
}
return
}
func newSet() *set {
return &set{make(map[string]bool)}
}

View file

@ -1,250 +0,0 @@
// Copyright 2015 CoreOS, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package dbus
import (
"errors"
"time"
"github.com/godbus/dbus"
)
const (
cleanIgnoreInterval = int64(10 * time.Second)
ignoreInterval = int64(30 * time.Millisecond)
)
// Subscribe sets up this connection to subscribe to all systemd dbus events.
// This is required before calling SubscribeUnits. When the connection closes
// systemd will automatically stop sending signals so there is no need to
// explicitly call Unsubscribe().
func (c *Conn) Subscribe() error {
c.sigconn.BusObject().Call("org.freedesktop.DBus.AddMatch", 0,
"type='signal',interface='org.freedesktop.systemd1.Manager',member='UnitNew'")
c.sigconn.BusObject().Call("org.freedesktop.DBus.AddMatch", 0,
"type='signal',interface='org.freedesktop.DBus.Properties',member='PropertiesChanged'")
err := c.sigobj.Call("org.freedesktop.systemd1.Manager.Subscribe", 0).Store()
if err != nil {
return err
}
return nil
}
// Unsubscribe this connection from systemd dbus events.
func (c *Conn) Unsubscribe() error {
err := c.sigobj.Call("org.freedesktop.systemd1.Manager.Unsubscribe", 0).Store()
if err != nil {
return err
}
return nil
}
func (c *Conn) dispatch() {
ch := make(chan *dbus.Signal, signalBuffer)
c.sigconn.Signal(ch)
go func() {
for {
signal, ok := <-ch
if !ok {
return
}
if signal.Name == "org.freedesktop.systemd1.Manager.JobRemoved" {
c.jobComplete(signal)
}
if c.subscriber.updateCh == nil {
continue
}
var unitPath dbus.ObjectPath
switch signal.Name {
case "org.freedesktop.systemd1.Manager.JobRemoved":
unitName := signal.Body[2].(string)
c.sysobj.Call("org.freedesktop.systemd1.Manager.GetUnit", 0, unitName).Store(&unitPath)
case "org.freedesktop.systemd1.Manager.UnitNew":
unitPath = signal.Body[1].(dbus.ObjectPath)
case "org.freedesktop.DBus.Properties.PropertiesChanged":
if signal.Body[0].(string) == "org.freedesktop.systemd1.Unit" {
unitPath = signal.Path
}
}
if unitPath == dbus.ObjectPath("") {
continue
}
c.sendSubStateUpdate(unitPath)
}
}()
}
// Returns two unbuffered channels which will receive all changed units every
// interval. Deleted units are sent as nil.
func (c *Conn) SubscribeUnits(interval time.Duration) (<-chan map[string]*UnitStatus, <-chan error) {
return c.SubscribeUnitsCustom(interval, 0, func(u1, u2 *UnitStatus) bool { return *u1 != *u2 }, nil)
}
// SubscribeUnitsCustom is like SubscribeUnits but lets you specify the buffer
// size of the channels, the comparison function for detecting changes and a filter
// function for cutting down on the noise that your channel receives.
func (c *Conn) SubscribeUnitsCustom(interval time.Duration, buffer int, isChanged func(*UnitStatus, *UnitStatus) bool, filterUnit func(string) bool) (<-chan map[string]*UnitStatus, <-chan error) {
old := make(map[string]*UnitStatus)
statusChan := make(chan map[string]*UnitStatus, buffer)
errChan := make(chan error, buffer)
go func() {
for {
timerChan := time.After(interval)
units, err := c.ListUnits()
if err == nil {
cur := make(map[string]*UnitStatus)
for i := range units {
if filterUnit != nil && filterUnit(units[i].Name) {
continue
}
cur[units[i].Name] = &units[i]
}
// add all new or changed units
changed := make(map[string]*UnitStatus)
for n, u := range cur {
if oldU, ok := old[n]; !ok || isChanged(oldU, u) {
changed[n] = u
}
delete(old, n)
}
// add all deleted units
for oldN := range old {
changed[oldN] = nil
}
old = cur
if len(changed) != 0 {
statusChan <- changed
}
} else {
errChan <- err
}
<-timerChan
}
}()
return statusChan, errChan
}
type SubStateUpdate struct {
UnitName string
SubState string
}
// SetSubStateSubscriber writes to updateCh when any unit's substate changes.
// Although this writes to updateCh on every state change, the reported state
// may be more recent than the change that generated it (due to an unavoidable
// race in the systemd dbus interface). That is, this method provides a good
// way to keep a current view of all units' states, but is not guaranteed to
// show every state transition they go through. Furthermore, state changes
// will only be written to the channel with non-blocking writes. If updateCh
// is full, it attempts to write an error to errCh; if errCh is full, the error
// passes silently.
func (c *Conn) SetSubStateSubscriber(updateCh chan<- *SubStateUpdate, errCh chan<- error) {
c.subscriber.Lock()
defer c.subscriber.Unlock()
c.subscriber.updateCh = updateCh
c.subscriber.errCh = errCh
}
func (c *Conn) sendSubStateUpdate(path dbus.ObjectPath) {
c.subscriber.Lock()
defer c.subscriber.Unlock()
if c.shouldIgnore(path) {
return
}
info, err := c.GetUnitProperties(string(path))
if err != nil {
select {
case c.subscriber.errCh <- err:
default:
}
}
name := info["Id"].(string)
substate := info["SubState"].(string)
update := &SubStateUpdate{name, substate}
select {
case c.subscriber.updateCh <- update:
default:
select {
case c.subscriber.errCh <- errors.New("update channel full!"):
default:
}
}
c.updateIgnore(path, info)
}
// The ignore functions work around a wart in the systemd dbus interface.
// Requesting the properties of an unloaded unit will cause systemd to send a
// pair of UnitNew/UnitRemoved signals. Because we need to get a unit's
// properties on UnitNew (as that's the only indication of a new unit coming up
// for the first time), we would enter an infinite loop if we did not attempt
// to detect and ignore these spurious signals. The signal themselves are
// indistinguishable from relevant ones, so we (somewhat hackishly) ignore an
// unloaded unit's signals for a short time after requesting its properties.
// This means that we will miss e.g. a transient unit being restarted
// *immediately* upon failure and also a transient unit being started
// immediately after requesting its status (with systemctl status, for example,
// because this causes a UnitNew signal to be sent which then causes us to fetch
// the properties).
func (c *Conn) shouldIgnore(path dbus.ObjectPath) bool {
t, ok := c.subscriber.ignore[path]
return ok && t >= time.Now().UnixNano()
}
func (c *Conn) updateIgnore(path dbus.ObjectPath, info map[string]interface{}) {
c.cleanIgnore()
// unit is unloaded - it will trigger bad systemd dbus behavior
if info["LoadState"].(string) == "not-found" {
c.subscriber.ignore[path] = time.Now().UnixNano() + ignoreInterval
}
}
// without this, ignore would grow unboundedly over time
func (c *Conn) cleanIgnore() {
now := time.Now().UnixNano()
if c.subscriber.cleanIgnore < now {
c.subscriber.cleanIgnore = now + cleanIgnoreInterval
for p, t := range c.subscriber.ignore {
if t < now {
delete(c.subscriber.ignore, p)
}
}
}
}

View file

@ -1,57 +0,0 @@
// Copyright 2015 CoreOS, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package dbus
import (
"time"
)
// SubscriptionSet returns a subscription set which is like conn.Subscribe but
// can filter to only return events for a set of units.
type SubscriptionSet struct {
*set
conn *Conn
}
func (s *SubscriptionSet) filter(unit string) bool {
return !s.Contains(unit)
}
// Subscribe starts listening for dbus events for all of the units in the set.
// Returns channels identical to conn.SubscribeUnits.
func (s *SubscriptionSet) Subscribe() (<-chan map[string]*UnitStatus, <-chan error) {
// TODO: Make fully evented by using systemd 209 with properties changed values
return s.conn.SubscribeUnitsCustom(time.Second, 0,
mismatchUnitStatus,
func(unit string) bool { return s.filter(unit) },
)
}
// NewSubscriptionSet returns a new subscription set.
func (conn *Conn) NewSubscriptionSet() *SubscriptionSet {
return &SubscriptionSet{newSet(), conn}
}
// mismatchUnitStatus returns true if the provided UnitStatus objects
// are not equivalent. false is returned if the objects are equivalent.
// Only the Name, Description and state-related fields are used in
// the comparison.
func mismatchUnitStatus(u1, u2 *UnitStatus) bool {
return u1.Name != u2.Name ||
u1.Description != u2.Description ||
u1.LoadState != u2.LoadState ||
u1.ActiveState != u2.ActiveState ||
u1.SubState != u2.SubState
}

View file

@ -1,33 +0,0 @@
// Copyright 2015 CoreOS, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Package util contains utility functions related to systemd that applications
// can use to check things like whether systemd is running.
package util
import (
"os"
)
// IsRunningSystemd checks whether the host was booted with systemd as its init
// system. This functions similar to systemd's `sd_booted(3)`: internally, it
// checks whether /run/systemd/system/ exists and is a directory.
// http://www.freedesktop.org/software/systemd/man/sd_booted.html
func IsRunningSystemd() bool {
fi, err := os.Lstat("/run/systemd/system")
if err != nil {
return false
}
return fi.IsDir()
}

View file

@ -1,238 +0,0 @@
Libcontainer provides a native Go implementation for creating containers
with namespaces, cgroups, capabilities, and filesystem access controls.
It allows you to manage the lifecycle of the container performing additional operations
after the container is created.
#### Container
A container is a self contained execution environment that shares the kernel of the
host system and which is (optionally) isolated from other containers in the system.
#### Using libcontainer
Because containers are spawned in a two step process you will need a binary that
will be executed as the init process for the container. In libcontainer, we use
the current binary (/proc/self/exe) to be executed as the init process, and use
arg "init", we call the first step process "bootstrap", so you always need a "init"
function as the entry of "bootstrap".
```go
func init() {
if len(os.Args) > 1 && os.Args[1] == "init" {
runtime.GOMAXPROCS(1)
runtime.LockOSThread()
factory, _ := libcontainer.New("")
if err := factory.StartInitialization(); err != nil {
logrus.Fatal(err)
}
panic("--this line should have never been executed, congratulations--")
}
}
```
Then to create a container you first have to initialize an instance of a factory
that will handle the creation and initialization for a container.
```go
factory, err := libcontainer.New("/var/lib/container", libcontainer.Cgroupfs, libcontainer.InitArgs(os.Args[0], "init"))
if err != nil {
logrus.Fatal(err)
return
}
```
Once you have an instance of the factory created we can create a configuration
struct describing how the container is to be created. A sample would look similar to this:
```go
defaultMountFlags := syscall.MS_NOEXEC | syscall.MS_NOSUID | syscall.MS_NODEV
config := &configs.Config{
Rootfs: "/your/path/to/rootfs",
Capabilities: []string{
"CAP_CHOWN",
"CAP_DAC_OVERRIDE",
"CAP_FSETID",
"CAP_FOWNER",
"CAP_MKNOD",
"CAP_NET_RAW",
"CAP_SETGID",
"CAP_SETUID",
"CAP_SETFCAP",
"CAP_SETPCAP",
"CAP_NET_BIND_SERVICE",
"CAP_SYS_CHROOT",
"CAP_KILL",
"CAP_AUDIT_WRITE",
},
Namespaces: configs.Namespaces([]configs.Namespace{
{Type: configs.NEWNS},
{Type: configs.NEWUTS},
{Type: configs.NEWIPC},
{Type: configs.NEWPID},
{Type: configs.NEWUSER},
{Type: configs.NEWNET},
}),
Cgroups: &configs.Cgroup{
Name: "test-container",
Parent: "system",
Resources: &configs.Resources{
MemorySwappiness: nil,
AllowAllDevices: false,
AllowedDevices: configs.DefaultAllowedDevices,
},
},
MaskPaths: []string{
"/proc/kcore",
},
ReadonlyPaths: []string{
"/proc/sys", "/proc/sysrq-trigger", "/proc/irq", "/proc/bus",
},
Devices: configs.DefaultAutoCreatedDevices,
Hostname: "testing",
Mounts: []*configs.Mount{
{
Source: "proc",
Destination: "/proc",
Device: "proc",
Flags: defaultMountFlags,
},
{
Source: "tmpfs",
Destination: "/dev",
Device: "tmpfs",
Flags: syscall.MS_NOSUID | syscall.MS_STRICTATIME,
Data: "mode=755",
},
{
Source: "devpts",
Destination: "/dev/pts",
Device: "devpts",
Flags: syscall.MS_NOSUID | syscall.MS_NOEXEC,
Data: "newinstance,ptmxmode=0666,mode=0620,gid=5",
},
{
Device: "tmpfs",
Source: "shm",
Destination: "/dev/shm",
Data: "mode=1777,size=65536k",
Flags: defaultMountFlags,
},
{
Source: "mqueue",
Destination: "/dev/mqueue",
Device: "mqueue",
Flags: defaultMountFlags,
},
{
Source: "sysfs",
Destination: "/sys",
Device: "sysfs",
Flags: defaultMountFlags | syscall.MS_RDONLY,
},
},
UidMappings: []configs.IDMap{
{
ContainerID: 0,
Host: 1000,
size: 65536,
},
},
GidMappings: []configs.IDMap{
{
ContainerID: 0,
Host: 1000,
size: 65536,
},
},
Networks: []*configs.Network{
{
Type: "loopback",
Address: "127.0.0.1/0",
Gateway: "localhost",
},
},
Rlimits: []configs.Rlimit{
{
Type: syscall.RLIMIT_NOFILE,
Hard: uint64(1025),
Soft: uint64(1025),
},
},
}
```
Once you have the configuration populated you can create a container:
```go
container, err := factory.Create("container-id", config)
if err != nil {
logrus.Fatal(err)
return
}
```
To spawn bash as the initial process inside the container and have the
processes pid returned in order to wait, signal, or kill the process:
```go
process := &libcontainer.Process{
Args: []string{"/bin/bash"},
Env: []string{"PATH=/bin"},
User: "daemon",
Stdin: os.Stdin,
Stdout: os.Stdout,
Stderr: os.Stderr,
}
err := container.Start(process)
if err != nil {
logrus.Fatal(err)
container.Destroy()
return
}
// wait for the process to finish.
_, err := process.Wait()
if err != nil {
logrus.Fatal(err)
}
// destroy the container.
container.Destroy()
```
Additional ways to interact with a running container are:
```go
// return all the pids for all processes running inside the container.
processes, err := container.Processes()
// get detailed cpu, memory, io, and network statistics for the container and
// it's processes.
stats, err := container.Stats()
// pause all processes inside the container.
container.Pause()
// resume all paused processes.
container.Resume()
```
#### Checkpoint & Restore
libcontainer now integrates [CRIU](http://criu.org/) for checkpointing and restoring containers.
This let's you save the state of a process running inside a container to disk, and then restore
that state into a new process, on the same machine or on another machine.
`criu` version 1.5.2 or higher is required to use checkpoint and restore.
If you don't already have `criu` installed, you can build it from source, following the
[online instructions](http://criu.org/Installation). `criu` is also installed in the docker image
generated when building libcontainer with docker.
## Copyright and license
Code and documentation copyright 2014 Docker, inc. Code released under the Apache 2.0 license.
Docs released under Creative commons.

View file

@ -1,335 +0,0 @@
## Container Specification - v1
This is the standard configuration for version 1 containers. It includes
namespaces, standard filesystem setup, a default Linux capability set, and
information about resource reservations. It also has information about any
populated environment settings for the processes running inside a container.
Along with the configuration of how a container is created the standard also
discusses actions that can be performed on a container to manage and inspect
information about the processes running inside.
The v1 profile is meant to be able to accommodate the majority of applications
with a strong security configuration.
### System Requirements and Compatibility
Minimum requirements:
* Kernel version - 3.10 recommended 2.6.2x minimum(with backported patches)
* Mounted cgroups with each subsystem in its own hierarchy
### Namespaces
| Flag | Enabled |
| ------------ | ------- |
| CLONE_NEWPID | 1 |
| CLONE_NEWUTS | 1 |
| CLONE_NEWIPC | 1 |
| CLONE_NEWNET | 1 |
| CLONE_NEWNS | 1 |
| CLONE_NEWUSER | 1 |
Namespaces are created for the container via the `clone` syscall.
### Filesystem
A root filesystem must be provided to a container for execution. The container
will use this root filesystem (rootfs) to jail and spawn processes inside where
the binaries and system libraries are local to that directory. Any binaries
to be executed must be contained within this rootfs.
Mounts that happen inside the container are automatically cleaned up when the
container exits as the mount namespace is destroyed and the kernel will
unmount all the mounts that were setup within that namespace.
For a container to execute properly there are certain filesystems that
are required to be mounted within the rootfs that the runtime will setup.
| Path | Type | Flags | Data |
| ----------- | ------ | -------------------------------------- | ---------------------------------------- |
| /proc | proc | MS_NOEXEC,MS_NOSUID,MS_NODEV | |
| /dev | tmpfs | MS_NOEXEC,MS_STRICTATIME | mode=755 |
| /dev/shm | tmpfs | MS_NOEXEC,MS_NOSUID,MS_NODEV | mode=1777,size=65536k |
| /dev/mqueue | mqueue | MS_NOEXEC,MS_NOSUID,MS_NODEV | |
| /dev/pts | devpts | MS_NOEXEC,MS_NOSUID | newinstance,ptmxmode=0666,mode=620,gid=5 |
| /sys | sysfs | MS_NOEXEC,MS_NOSUID,MS_NODEV,MS_RDONLY | |
After a container's filesystems are mounted within the newly created
mount namespace `/dev` will need to be populated with a set of device nodes.
It is expected that a rootfs does not need to have any device nodes specified
for `/dev` within the rootfs as the container will setup the correct devices
that are required for executing a container's process.
| Path | Mode | Access |
| ------------ | ---- | ---------- |
| /dev/null | 0666 | rwm |
| /dev/zero | 0666 | rwm |
| /dev/full | 0666 | rwm |
| /dev/tty | 0666 | rwm |
| /dev/random | 0666 | rwm |
| /dev/urandom | 0666 | rwm |
| /dev/fuse | 0666 | rwm |
**ptmx**
`/dev/ptmx` will need to be a symlink to the host's `/dev/ptmx` within
the container.
The use of a pseudo TTY is optional within a container and it should support both.
If a pseudo is provided to the container `/dev/console` will need to be
setup by binding the console in `/dev/` after it has been populated and mounted
in tmpfs.
| Source | Destination | UID GID | Mode | Type |
| --------------- | ------------ | ------- | ---- | ---- |
| *pty host path* | /dev/console | 0 0 | 0600 | bind |
After `/dev/null` has been setup we check for any external links between
the container's io, STDIN, STDOUT, STDERR. If the container's io is pointing
to `/dev/null` outside the container we close and `dup2` the the `/dev/null`
that is local to the container's rootfs.
After the container has `/proc` mounted a few standard symlinks are setup
within `/dev/` for the io.
| Source | Destination |
| --------------- | ----------- |
| /proc/self/fd | /dev/fd |
| /proc/self/fd/0 | /dev/stdin |
| /proc/self/fd/1 | /dev/stdout |
| /proc/self/fd/2 | /dev/stderr |
A `pivot_root` is used to change the root for the process, effectively
jailing the process inside the rootfs.
```c
put_old = mkdir(...);
pivot_root(rootfs, put_old);
chdir("/");
unmount(put_old, MS_DETACH);
rmdir(put_old);
```
For container's running with a rootfs inside `ramfs` a `MS_MOVE` combined
with a `chroot` is required as `pivot_root` is not supported in `ramfs`.
```c
mount(rootfs, "/", NULL, MS_MOVE, NULL);
chroot(".");
chdir("/");
```
The `umask` is set back to `0022` after the filesystem setup has been completed.
### Resources
Cgroups are used to handle resource allocation for containers. This includes
system resources like cpu, memory, and device access.
| Subsystem | Enabled |
| ---------- | ------- |
| devices | 1 |
| memory | 1 |
| cpu | 1 |
| cpuacct | 1 |
| cpuset | 1 |
| blkio | 1 |
| perf_event | 1 |
| freezer | 1 |
| hugetlb | 1 |
| pids | 1 |
All cgroup subsystem are joined so that statistics can be collected from
each of the subsystems. Freezer does not expose any stats but is joined
so that containers can be paused and resumed.
The parent process of the container's init must place the init pid inside
the correct cgroups before the initialization begins. This is done so
that no processes or threads escape the cgroups. This sync is
done via a pipe ( specified in the runtime section below ) that the container's
init process will block waiting for the parent to finish setup.
### Security
The standard set of Linux capabilities that are set in a container
provide a good default for security and flexibility for the applications.
| Capability | Enabled |
| -------------------- | ------- |
| CAP_NET_RAW | 1 |
| CAP_NET_BIND_SERVICE | 1 |
| CAP_AUDIT_READ | 1 |
| CAP_AUDIT_WRITE | 1 |
| CAP_DAC_OVERRIDE | 1 |
| CAP_SETFCAP | 1 |
| CAP_SETPCAP | 1 |
| CAP_SETGID | 1 |
| CAP_SETUID | 1 |
| CAP_MKNOD | 1 |
| CAP_CHOWN | 1 |
| CAP_FOWNER | 1 |
| CAP_FSETID | 1 |
| CAP_KILL | 1 |
| CAP_SYS_CHROOT | 1 |
| CAP_NET_BROADCAST | 0 |
| CAP_SYS_MODULE | 0 |
| CAP_SYS_RAWIO | 0 |
| CAP_SYS_PACCT | 0 |
| CAP_SYS_ADMIN | 0 |
| CAP_SYS_NICE | 0 |
| CAP_SYS_RESOURCE | 0 |
| CAP_SYS_TIME | 0 |
| CAP_SYS_TTY_CONFIG | 0 |
| CAP_AUDIT_CONTROL | 0 |
| CAP_MAC_OVERRIDE | 0 |
| CAP_MAC_ADMIN | 0 |
| CAP_NET_ADMIN | 0 |
| CAP_SYSLOG | 0 |
| CAP_DAC_READ_SEARCH | 0 |
| CAP_LINUX_IMMUTABLE | 0 |
| CAP_IPC_LOCK | 0 |
| CAP_IPC_OWNER | 0 |
| CAP_SYS_PTRACE | 0 |
| CAP_SYS_BOOT | 0 |
| CAP_LEASE | 0 |
| CAP_WAKE_ALARM | 0 |
| CAP_BLOCK_SUSPEND | 0 |
Additional security layers like [apparmor](https://wiki.ubuntu.com/AppArmor)
and [selinux](http://selinuxproject.org/page/Main_Page) can be used with
the containers. A container should support setting an apparmor profile or
selinux process and mount labels if provided in the configuration.
Standard apparmor profile:
```c
#include <tunables/global>
profile <profile_name> flags=(attach_disconnected,mediate_deleted) {
#include <abstractions/base>
network,
capability,
file,
umount,
deny @{PROC}/sys/fs/** wklx,
deny @{PROC}/sysrq-trigger rwklx,
deny @{PROC}/mem rwklx,
deny @{PROC}/kmem rwklx,
deny @{PROC}/sys/kernel/[^s][^h][^m]* wklx,
deny @{PROC}/sys/kernel/*/** wklx,
deny mount,
deny /sys/[^f]*/** wklx,
deny /sys/f[^s]*/** wklx,
deny /sys/fs/[^c]*/** wklx,
deny /sys/fs/c[^g]*/** wklx,
deny /sys/fs/cg[^r]*/** wklx,
deny /sys/firmware/efi/efivars/** rwklx,
deny /sys/kernel/security/** rwklx,
}
```
*TODO: seccomp work is being done to find a good default config*
### Runtime and Init Process
During container creation the parent process needs to talk to the container's init
process and have a form of synchronization. This is accomplished by creating
a pipe that is passed to the container's init. When the init process first spawns
it will block on its side of the pipe until the parent closes its side. This
allows the parent to have time to set the new process inside a cgroup hierarchy
and/or write any uid/gid mappings required for user namespaces.
The pipe is passed to the init process via FD 3.
The application consuming libcontainer should be compiled statically. libcontainer
does not define any init process and the arguments provided are used to `exec` the
process inside the application. There should be no long running init within the
container spec.
If a pseudo tty is provided to a container it will open and `dup2` the console
as the container's STDIN, STDOUT, STDERR as well as mounting the console
as `/dev/console`.
An extra set of mounts are provided to a container and setup for use. A container's
rootfs can contain some non portable files inside that can cause side effects during
execution of a process. These files are usually created and populated with the container
specific information via the runtime.
**Extra runtime files:**
* /etc/hosts
* /etc/resolv.conf
* /etc/hostname
* /etc/localtime
#### Defaults
There are a few defaults that can be overridden by users, but in their omission
these apply to processes within a container.
| Type | Value |
| ------------------- | ------------------------------ |
| Parent Death Signal | SIGKILL |
| UID | 0 |
| GID | 0 |
| GROUPS | 0, NULL |
| CWD | "/" |
| $HOME | Current user's home dir or "/" |
| Readonly rootfs | false |
| Pseudo TTY | false |
## Actions
After a container is created there is a standard set of actions that can
be done to the container. These actions are part of the public API for
a container.
| Action | Description |
| -------------- | ------------------------------------------------------------------ |
| Get processes | Return all the pids for processes running inside a container |
| Get Stats | Return resource statistics for the container as a whole |
| Wait | Wait waits on the container's init process ( pid 1 ) |
| Wait Process | Wait on any of the container's processes returning the exit status |
| Destroy | Kill the container's init process and remove any filesystem state |
| Signal | Send a signal to the container's init process |
| Signal Process | Send a signal to any of the container's processes |
| Pause | Pause all processes inside the container |
| Resume | Resume all processes inside the container if paused |
| Exec | Execute a new process inside of the container ( requires setns ) |
| Set | Setup configs of the container after it's created |
### Execute a new process inside of a running container.
User can execute a new process inside of a running container. Any binaries to be
executed must be accessible within the container's rootfs.
The started process will run inside the container's rootfs. Any changes
made by the process to the container's filesystem will persist after the
process finished executing.
The started process will join all the container's existing namespaces. When the
container is paused, the process will also be paused and will resume when
the container is unpaused. The started process will only run when the container's
primary process (PID 1) is running, and will not be restarted when the container
is restarted.
#### Planned additions
The started process will have its own cgroups nested inside the container's
cgroups. This is used for process tracking and optionally resource allocation
handling for the new process. Freezer cgroup is required, the rest of the cgroups
are optional. The process executor must place its pid inside the correct
cgroups before starting the process. This is done so that no child processes or
threads can escape the cgroups.
When the process is stopped, the process executor will try (in a best-effort way)
to stop all its children and remove the sub-cgroups.

View file

@ -1,69 +0,0 @@
// +build linux
package libcontainer
import (
"fmt"
"os"
"strings"
"github.com/syndtr/gocapability/capability"
)
const allCapabilityTypes = capability.CAPS | capability.BOUNDS
var capabilityMap map[string]capability.Cap
func init() {
capabilityMap = make(map[string]capability.Cap)
last := capability.CAP_LAST_CAP
// workaround for RHEL6 which has no /proc/sys/kernel/cap_last_cap
if last == capability.Cap(63) {
last = capability.CAP_BLOCK_SUSPEND
}
for _, cap := range capability.List() {
if cap > last {
continue
}
capKey := fmt.Sprintf("CAP_%s", strings.ToUpper(cap.String()))
capabilityMap[capKey] = cap
}
}
func newCapWhitelist(caps []string) (*whitelist, error) {
l := []capability.Cap{}
for _, c := range caps {
v, ok := capabilityMap[c]
if !ok {
return nil, fmt.Errorf("unknown capability %q", c)
}
l = append(l, v)
}
pid, err := capability.NewPid(os.Getpid())
if err != nil {
return nil, err
}
return &whitelist{
keep: l,
pid: pid,
}, nil
}
type whitelist struct {
pid capability.Capabilities
keep []capability.Cap
}
// dropBoundingSet drops the capability bounding set to those specified in the whitelist.
func (w *whitelist) dropBoundingSet() error {
w.pid.Clear(capability.BOUNDS)
w.pid.Set(capability.BOUNDS, w.keep...)
return w.pid.Apply(capability.BOUNDS)
}
// drop drops all capabilities for the current process except those specified in the whitelist.
func (w *whitelist) drop() error {
w.pid.Clear(allCapabilityTypes)
w.pid.Set(allCapabilityTypes, w.keep...)
return w.pid.Apply(allCapabilityTypes)
}

View file

@ -1,397 +0,0 @@
// +build linux
package fs
import (
"errors"
"fmt"
"io"
"io/ioutil"
"os"
"path/filepath"
"strconv"
"sync"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils"
)
var (
subsystems = subsystemSet{
&CpusetGroup{},
&DevicesGroup{},
&MemoryGroup{},
&CpuGroup{},
&CpuacctGroup{},
&PidsGroup{},
&BlkioGroup{},
&HugetlbGroup{},
&NetClsGroup{},
&NetPrioGroup{},
&PerfEventGroup{},
&FreezerGroup{},
&NameGroup{GroupName: "name=systemd", Join: true},
}
CgroupProcesses = "cgroup.procs"
HugePageSizes, _ = cgroups.GetHugePageSize()
)
var errSubsystemDoesNotExist = errors.New("cgroup: subsystem does not exist")
type subsystemSet []subsystem
func (s subsystemSet) Get(name string) (subsystem, error) {
for _, ss := range s {
if ss.Name() == name {
return ss, nil
}
}
return nil, errSubsystemDoesNotExist
}
type subsystem interface {
// Name returns the name of the subsystem.
Name() string
// Returns the stats, as 'stats', corresponding to the cgroup under 'path'.
GetStats(path string, stats *cgroups.Stats) error
// Removes the cgroup represented by 'cgroupData'.
Remove(*cgroupData) error
// Creates and joins the cgroup represented by 'cgroupData'.
Apply(*cgroupData) error
// Set the cgroup represented by cgroup.
Set(path string, cgroup *configs.Cgroup) error
}
type Manager struct {
mu sync.Mutex
Cgroups *configs.Cgroup
Paths map[string]string
}
// The absolute path to the root of the cgroup hierarchies.
var cgroupRootLock sync.Mutex
var cgroupRoot string
// Gets the cgroupRoot.
func getCgroupRoot() (string, error) {
cgroupRootLock.Lock()
defer cgroupRootLock.Unlock()
if cgroupRoot != "" {
return cgroupRoot, nil
}
root, err := cgroups.FindCgroupMountpointDir()
if err != nil {
return "", err
}
if _, err := os.Stat(root); err != nil {
return "", err
}
cgroupRoot = root
return cgroupRoot, nil
}
type cgroupData struct {
root string
innerPath string
config *configs.Cgroup
pid int
}
func (m *Manager) Apply(pid int) (err error) {
if m.Cgroups == nil {
return nil
}
var c = m.Cgroups
d, err := getCgroupData(m.Cgroups, pid)
if err != nil {
return err
}
if c.Paths != nil {
paths := make(map[string]string)
for name, path := range c.Paths {
_, err := d.path(name)
if err != nil {
if cgroups.IsNotFound(err) {
continue
}
return err
}
paths[name] = path
}
m.Paths = paths
return cgroups.EnterPid(m.Paths, pid)
}
m.mu.Lock()
defer m.mu.Unlock()
paths := make(map[string]string)
for _, sys := range subsystems {
if err := sys.Apply(d); err != nil {
return err
}
// TODO: Apply should, ideally, be reentrant or be broken up into a separate
// create and join phase so that the cgroup hierarchy for a container can be
// created then join consists of writing the process pids to cgroup.procs
p, err := d.path(sys.Name())
if err != nil {
if cgroups.IsNotFound(err) {
continue
}
return err
}
paths[sys.Name()] = p
}
m.Paths = paths
return nil
}
func (m *Manager) Destroy() error {
if m.Cgroups.Paths != nil {
return nil
}
m.mu.Lock()
defer m.mu.Unlock()
if err := cgroups.RemovePaths(m.Paths); err != nil {
return err
}
m.Paths = make(map[string]string)
return nil
}
func (m *Manager) GetPaths() map[string]string {
m.mu.Lock()
paths := m.Paths
m.mu.Unlock()
return paths
}
func (m *Manager) GetStats() (*cgroups.Stats, error) {
m.mu.Lock()
defer m.mu.Unlock()
stats := cgroups.NewStats()
for name, path := range m.Paths {
sys, err := subsystems.Get(name)
if err == errSubsystemDoesNotExist || !cgroups.PathExists(path) {
continue
}
if err := sys.GetStats(path, stats); err != nil {
return nil, err
}
}
return stats, nil
}
func (m *Manager) Set(container *configs.Config) error {
for _, sys := range subsystems {
// Generate fake cgroup data.
d, err := getCgroupData(container.Cgroups, -1)
if err != nil {
return err
}
// Get the path, but don't error out if the cgroup wasn't found.
path, err := d.path(sys.Name())
if err != nil && !cgroups.IsNotFound(err) {
return err
}
if err := sys.Set(path, container.Cgroups); err != nil {
return err
}
}
if m.Paths["cpu"] != "" {
if err := CheckCpushares(m.Paths["cpu"], container.Cgroups.Resources.CpuShares); err != nil {
return err
}
}
return nil
}
// Freeze toggles the container's freezer cgroup depending on the state
// provided
func (m *Manager) Freeze(state configs.FreezerState) error {
d, err := getCgroupData(m.Cgroups, 0)
if err != nil {
return err
}
dir, err := d.path("freezer")
if err != nil {
return err
}
prevState := m.Cgroups.Resources.Freezer
m.Cgroups.Resources.Freezer = state
freezer, err := subsystems.Get("freezer")
if err != nil {
return err
}
err = freezer.Set(dir, m.Cgroups)
if err != nil {
m.Cgroups.Resources.Freezer = prevState
return err
}
return nil
}
func (m *Manager) GetPids() ([]int, error) {
dir, err := getCgroupPath(m.Cgroups)
if err != nil {
return nil, err
}
return cgroups.GetPids(dir)
}
func (m *Manager) GetAllPids() ([]int, error) {
dir, err := getCgroupPath(m.Cgroups)
if err != nil {
return nil, err
}
return cgroups.GetAllPids(dir)
}
func getCgroupPath(c *configs.Cgroup) (string, error) {
d, err := getCgroupData(c, 0)
if err != nil {
return "", err
}
return d.path("devices")
}
func getCgroupData(c *configs.Cgroup, pid int) (*cgroupData, error) {
root, err := getCgroupRoot()
if err != nil {
return nil, err
}
if (c.Name != "" || c.Parent != "") && c.Path != "" {
return nil, fmt.Errorf("cgroup: either Path or Name and Parent should be used")
}
// XXX: Do not remove this code. Path safety is important! -- cyphar
cgPath := libcontainerUtils.CleanPath(c.Path)
cgParent := libcontainerUtils.CleanPath(c.Parent)
cgName := libcontainerUtils.CleanPath(c.Name)
innerPath := cgPath
if innerPath == "" {
innerPath = filepath.Join(cgParent, cgName)
}
return &cgroupData{
root: root,
innerPath: innerPath,
config: c,
pid: pid,
}, nil
}
func (raw *cgroupData) parentPath(subsystem, mountpoint, root string) (string, error) {
// Use GetThisCgroupDir instead of GetInitCgroupDir, because the creating
// process could in container and shared pid namespace with host, and
// /proc/1/cgroup could point to whole other world of cgroups.
initPath, err := cgroups.GetThisCgroupDir(subsystem)
if err != nil {
return "", err
}
// This is needed for nested containers, because in /proc/self/cgroup we
// see pathes from host, which don't exist in container.
relDir, err := filepath.Rel(root, initPath)
if err != nil {
return "", err
}
return filepath.Join(mountpoint, relDir), nil
}
func (raw *cgroupData) path(subsystem string) (string, error) {
mnt, root, err := cgroups.FindCgroupMountpointAndRoot(subsystem)
// If we didn't mount the subsystem, there is no point we make the path.
if err != nil {
return "", err
}
// If the cgroup name/path is absolute do not look relative to the cgroup of the init process.
if filepath.IsAbs(raw.innerPath) {
// Sometimes subsystems can be mounted togethger as 'cpu,cpuacct'.
return filepath.Join(raw.root, filepath.Base(mnt), raw.innerPath), nil
}
parentPath, err := raw.parentPath(subsystem, mnt, root)
if err != nil {
return "", err
}
return filepath.Join(parentPath, raw.innerPath), nil
}
func (raw *cgroupData) join(subsystem string) (string, error) {
path, err := raw.path(subsystem)
if err != nil {
return "", err
}
if err := os.MkdirAll(path, 0755); err != nil {
return "", err
}
if err := writeFile(path, CgroupProcesses, strconv.Itoa(raw.pid)); err != nil {
return "", err
}
return path, nil
}
func writeFile(dir, file, data string) error {
// Normally dir should not be empty, one case is that cgroup subsystem
// is not mounted, we will get empty dir, and we want it fail here.
if dir == "" {
return fmt.Errorf("no such directory for %s.", file)
}
return ioutil.WriteFile(filepath.Join(dir, file), []byte(data), 0700)
}
func readFile(dir, file string) (string, error) {
data, err := ioutil.ReadFile(filepath.Join(dir, file))
return string(data), err
}
func removePath(p string, err error) error {
if err != nil {
return err
}
if p != "" {
return os.RemoveAll(p)
}
return nil
}
func CheckCpushares(path string, c int64) error {
var cpuShares int64
if c == 0 {
return nil
}
fd, err := os.Open(filepath.Join(path, "cpu.shares"))
if err != nil {
return err
}
defer fd.Close()
_, err = fmt.Fscanf(fd, "%d", &cpuShares)
if err != nil && err != io.EOF {
return err
}
if c > cpuShares {
return fmt.Errorf("The maximum allowed cpu-shares is %d", cpuShares)
} else if c < cpuShares {
return fmt.Errorf("The minimum allowed cpu-shares is %d", cpuShares)
}
return nil
}

View file

@ -1,237 +0,0 @@
// +build linux
package fs
import (
"bufio"
"fmt"
"os"
"path/filepath"
"strconv"
"strings"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
)
type BlkioGroup struct {
}
func (s *BlkioGroup) Name() string {
return "blkio"
}
func (s *BlkioGroup) Apply(d *cgroupData) error {
_, err := d.join("blkio")
if err != nil && !cgroups.IsNotFound(err) {
return err
}
return nil
}
func (s *BlkioGroup) Set(path string, cgroup *configs.Cgroup) error {
if cgroup.Resources.BlkioWeight != 0 {
if err := writeFile(path, "blkio.weight", strconv.FormatUint(uint64(cgroup.Resources.BlkioWeight), 10)); err != nil {
return err
}
}
if cgroup.Resources.BlkioLeafWeight != 0 {
if err := writeFile(path, "blkio.leaf_weight", strconv.FormatUint(uint64(cgroup.Resources.BlkioLeafWeight), 10)); err != nil {
return err
}
}
for _, wd := range cgroup.Resources.BlkioWeightDevice {
if err := writeFile(path, "blkio.weight_device", wd.WeightString()); err != nil {
return err
}
if err := writeFile(path, "blkio.leaf_weight_device", wd.LeafWeightString()); err != nil {
return err
}
}
for _, td := range cgroup.Resources.BlkioThrottleReadBpsDevice {
if err := writeFile(path, "blkio.throttle.read_bps_device", td.String()); err != nil {
return err
}
}
for _, td := range cgroup.Resources.BlkioThrottleWriteBpsDevice {
if err := writeFile(path, "blkio.throttle.write_bps_device", td.String()); err != nil {
return err
}
}
for _, td := range cgroup.Resources.BlkioThrottleReadIOPSDevice {
if err := writeFile(path, "blkio.throttle.read_iops_device", td.String()); err != nil {
return err
}
}
for _, td := range cgroup.Resources.BlkioThrottleWriteIOPSDevice {
if err := writeFile(path, "blkio.throttle.write_iops_device", td.String()); err != nil {
return err
}
}
return nil
}
func (s *BlkioGroup) Remove(d *cgroupData) error {
return removePath(d.path("blkio"))
}
/*
examples:
blkio.sectors
8:0 6792
blkio.io_service_bytes
8:0 Read 1282048
8:0 Write 2195456
8:0 Sync 2195456
8:0 Async 1282048
8:0 Total 3477504
Total 3477504
blkio.io_serviced
8:0 Read 124
8:0 Write 104
8:0 Sync 104
8:0 Async 124
8:0 Total 228
Total 228
blkio.io_queued
8:0 Read 0
8:0 Write 0
8:0 Sync 0
8:0 Async 0
8:0 Total 0
Total 0
*/
func splitBlkioStatLine(r rune) bool {
return r == ' ' || r == ':'
}
func getBlkioStat(path string) ([]cgroups.BlkioStatEntry, error) {
var blkioStats []cgroups.BlkioStatEntry
f, err := os.Open(path)
if err != nil {
if os.IsNotExist(err) {
return blkioStats, nil
}
return nil, err
}
defer f.Close()
sc := bufio.NewScanner(f)
for sc.Scan() {
// format: dev type amount
fields := strings.FieldsFunc(sc.Text(), splitBlkioStatLine)
if len(fields) < 3 {
if len(fields) == 2 && fields[0] == "Total" {
// skip total line
continue
} else {
return nil, fmt.Errorf("Invalid line found while parsing %s: %s", path, sc.Text())
}
}
v, err := strconv.ParseUint(fields[0], 10, 64)
if err != nil {
return nil, err
}
major := v
v, err = strconv.ParseUint(fields[1], 10, 64)
if err != nil {
return nil, err
}
minor := v
op := ""
valueField := 2
if len(fields) == 4 {
op = fields[2]
valueField = 3
}
v, err = strconv.ParseUint(fields[valueField], 10, 64)
if err != nil {
return nil, err
}
blkioStats = append(blkioStats, cgroups.BlkioStatEntry{Major: major, Minor: minor, Op: op, Value: v})
}
return blkioStats, nil
}
func (s *BlkioGroup) GetStats(path string, stats *cgroups.Stats) error {
// Try to read CFQ stats available on all CFQ enabled kernels first
if blkioStats, err := getBlkioStat(filepath.Join(path, "blkio.io_serviced_recursive")); err == nil && blkioStats != nil {
return getCFQStats(path, stats)
}
return getStats(path, stats) // Use generic stats as fallback
}
func getCFQStats(path string, stats *cgroups.Stats) error {
var blkioStats []cgroups.BlkioStatEntry
var err error
if blkioStats, err = getBlkioStat(filepath.Join(path, "blkio.sectors_recursive")); err != nil {
return err
}
stats.BlkioStats.SectorsRecursive = blkioStats
if blkioStats, err = getBlkioStat(filepath.Join(path, "blkio.io_service_bytes_recursive")); err != nil {
return err
}
stats.BlkioStats.IoServiceBytesRecursive = blkioStats
if blkioStats, err = getBlkioStat(filepath.Join(path, "blkio.io_serviced_recursive")); err != nil {
return err
}
stats.BlkioStats.IoServicedRecursive = blkioStats
if blkioStats, err = getBlkioStat(filepath.Join(path, "blkio.io_queued_recursive")); err != nil {
return err
}
stats.BlkioStats.IoQueuedRecursive = blkioStats
if blkioStats, err = getBlkioStat(filepath.Join(path, "blkio.io_service_time_recursive")); err != nil {
return err
}
stats.BlkioStats.IoServiceTimeRecursive = blkioStats
if blkioStats, err = getBlkioStat(filepath.Join(path, "blkio.io_wait_time_recursive")); err != nil {
return err
}
stats.BlkioStats.IoWaitTimeRecursive = blkioStats
if blkioStats, err = getBlkioStat(filepath.Join(path, "blkio.io_merged_recursive")); err != nil {
return err
}
stats.BlkioStats.IoMergedRecursive = blkioStats
if blkioStats, err = getBlkioStat(filepath.Join(path, "blkio.time_recursive")); err != nil {
return err
}
stats.BlkioStats.IoTimeRecursive = blkioStats
return nil
}
func getStats(path string, stats *cgroups.Stats) error {
var blkioStats []cgroups.BlkioStatEntry
var err error
if blkioStats, err = getBlkioStat(filepath.Join(path, "blkio.throttle.io_service_bytes")); err != nil {
return err
}
stats.BlkioStats.IoServiceBytesRecursive = blkioStats
if blkioStats, err = getBlkioStat(filepath.Join(path, "blkio.throttle.io_serviced")); err != nil {
return err
}
stats.BlkioStats.IoServicedRecursive = blkioStats
return nil
}

View file

@ -1,94 +0,0 @@
// +build linux
package fs
import (
"bufio"
"os"
"path/filepath"
"strconv"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
)
type CpuGroup struct {
}
func (s *CpuGroup) Name() string {
return "cpu"
}
func (s *CpuGroup) Apply(d *cgroupData) error {
// We always want to join the cpu group, to allow fair cpu scheduling
// on a container basis
_, err := d.join("cpu")
if err != nil && !cgroups.IsNotFound(err) {
return err
}
return nil
}
func (s *CpuGroup) Set(path string, cgroup *configs.Cgroup) error {
if cgroup.Resources.CpuShares != 0 {
if err := writeFile(path, "cpu.shares", strconv.FormatInt(cgroup.Resources.CpuShares, 10)); err != nil {
return err
}
}
if cgroup.Resources.CpuPeriod != 0 {
if err := writeFile(path, "cpu.cfs_period_us", strconv.FormatInt(cgroup.Resources.CpuPeriod, 10)); err != nil {
return err
}
}
if cgroup.Resources.CpuQuota != 0 {
if err := writeFile(path, "cpu.cfs_quota_us", strconv.FormatInt(cgroup.Resources.CpuQuota, 10)); err != nil {
return err
}
}
if cgroup.Resources.CpuRtPeriod != 0 {
if err := writeFile(path, "cpu.rt_period_us", strconv.FormatInt(cgroup.Resources.CpuRtPeriod, 10)); err != nil {
return err
}
}
if cgroup.Resources.CpuRtRuntime != 0 {
if err := writeFile(path, "cpu.rt_runtime_us", strconv.FormatInt(cgroup.Resources.CpuRtRuntime, 10)); err != nil {
return err
}
}
return nil
}
func (s *CpuGroup) Remove(d *cgroupData) error {
return removePath(d.path("cpu"))
}
func (s *CpuGroup) GetStats(path string, stats *cgroups.Stats) error {
f, err := os.Open(filepath.Join(path, "cpu.stat"))
if err != nil {
if os.IsNotExist(err) {
return nil
}
return err
}
defer f.Close()
sc := bufio.NewScanner(f)
for sc.Scan() {
t, v, err := getCgroupParamKeyValue(sc.Text())
if err != nil {
return err
}
switch t {
case "nr_periods":
stats.CpuStats.ThrottlingData.Periods = v
case "nr_throttled":
stats.CpuStats.ThrottlingData.ThrottledPeriods = v
case "throttled_time":
stats.CpuStats.ThrottlingData.ThrottledTime = v
}
}
return nil
}

View file

@ -1,121 +0,0 @@
// +build linux
package fs
import (
"fmt"
"io/ioutil"
"path/filepath"
"strconv"
"strings"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/system"
)
const (
cgroupCpuacctStat = "cpuacct.stat"
nanosecondsInSecond = 1000000000
)
var clockTicks = uint64(system.GetClockTicks())
type CpuacctGroup struct {
}
func (s *CpuacctGroup) Name() string {
return "cpuacct"
}
func (s *CpuacctGroup) Apply(d *cgroupData) error {
// we just want to join this group even though we don't set anything
if _, err := d.join("cpuacct"); err != nil && !cgroups.IsNotFound(err) {
return err
}
return nil
}
func (s *CpuacctGroup) Set(path string, cgroup *configs.Cgroup) error {
return nil
}
func (s *CpuacctGroup) Remove(d *cgroupData) error {
return removePath(d.path("cpuacct"))
}
func (s *CpuacctGroup) GetStats(path string, stats *cgroups.Stats) error {
userModeUsage, kernelModeUsage, err := getCpuUsageBreakdown(path)
if err != nil {
return err
}
totalUsage, err := getCgroupParamUint(path, "cpuacct.usage")
if err != nil {
return err
}
percpuUsage, err := getPercpuUsage(path)
if err != nil {
return err
}
stats.CpuStats.CpuUsage.TotalUsage = totalUsage
stats.CpuStats.CpuUsage.PercpuUsage = percpuUsage
stats.CpuStats.CpuUsage.UsageInUsermode = userModeUsage
stats.CpuStats.CpuUsage.UsageInKernelmode = kernelModeUsage
return nil
}
// Returns user and kernel usage breakdown in nanoseconds.
func getCpuUsageBreakdown(path string) (uint64, uint64, error) {
userModeUsage := uint64(0)
kernelModeUsage := uint64(0)
const (
userField = "user"
systemField = "system"
)
// Expected format:
// user <usage in ticks>
// system <usage in ticks>
data, err := ioutil.ReadFile(filepath.Join(path, cgroupCpuacctStat))
if err != nil {
return 0, 0, err
}
fields := strings.Fields(string(data))
if len(fields) != 4 {
return 0, 0, fmt.Errorf("failure - %s is expected to have 4 fields", filepath.Join(path, cgroupCpuacctStat))
}
if fields[0] != userField {
return 0, 0, fmt.Errorf("unexpected field %q in %q, expected %q", fields[0], cgroupCpuacctStat, userField)
}
if fields[2] != systemField {
return 0, 0, fmt.Errorf("unexpected field %q in %q, expected %q", fields[2], cgroupCpuacctStat, systemField)
}
if userModeUsage, err = strconv.ParseUint(fields[1], 10, 64); err != nil {
return 0, 0, err
}
if kernelModeUsage, err = strconv.ParseUint(fields[3], 10, 64); err != nil {
return 0, 0, err
}
return (userModeUsage * nanosecondsInSecond) / clockTicks, (kernelModeUsage * nanosecondsInSecond) / clockTicks, nil
}
func getPercpuUsage(path string) ([]uint64, error) {
percpuUsage := []uint64{}
data, err := ioutil.ReadFile(filepath.Join(path, "cpuacct.usage_percpu"))
if err != nil {
return percpuUsage, err
}
for _, value := range strings.Fields(string(data)) {
value, err := strconv.ParseUint(value, 10, 64)
if err != nil {
return percpuUsage, fmt.Errorf("Unable to convert param value to uint64: %s", err)
}
percpuUsage = append(percpuUsage, value)
}
return percpuUsage, nil
}

View file

@ -1,139 +0,0 @@
// +build linux
package fs
import (
"bytes"
"fmt"
"io/ioutil"
"os"
"path/filepath"
"strconv"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils"
)
type CpusetGroup struct {
}
func (s *CpusetGroup) Name() string {
return "cpuset"
}
func (s *CpusetGroup) Apply(d *cgroupData) error {
dir, err := d.path("cpuset")
if err != nil && !cgroups.IsNotFound(err) {
return err
}
return s.ApplyDir(dir, d.config, d.pid)
}
func (s *CpusetGroup) Set(path string, cgroup *configs.Cgroup) error {
if cgroup.Resources.CpusetCpus != "" {
if err := writeFile(path, "cpuset.cpus", cgroup.Resources.CpusetCpus); err != nil {
return err
}
}
if cgroup.Resources.CpusetMems != "" {
if err := writeFile(path, "cpuset.mems", cgroup.Resources.CpusetMems); err != nil {
return err
}
}
return nil
}
func (s *CpusetGroup) Remove(d *cgroupData) error {
return removePath(d.path("cpuset"))
}
func (s *CpusetGroup) GetStats(path string, stats *cgroups.Stats) error {
return nil
}
func (s *CpusetGroup) ApplyDir(dir string, cgroup *configs.Cgroup, pid int) error {
// This might happen if we have no cpuset cgroup mounted.
// Just do nothing and don't fail.
if dir == "" {
return nil
}
root, err := getCgroupRoot()
if err != nil {
return err
}
if err := s.ensureParent(dir, root); err != nil {
return err
}
// because we are not using d.join we need to place the pid into the procs file
// unlike the other subsystems
if err := writeFile(dir, "cgroup.procs", strconv.Itoa(pid)); err != nil {
return err
}
return nil
}
func (s *CpusetGroup) getSubsystemSettings(parent string) (cpus []byte, mems []byte, err error) {
if cpus, err = ioutil.ReadFile(filepath.Join(parent, "cpuset.cpus")); err != nil {
return
}
if mems, err = ioutil.ReadFile(filepath.Join(parent, "cpuset.mems")); err != nil {
return
}
return cpus, mems, nil
}
// ensureParent makes sure that the parent directory of current is created
// and populated with the proper cpus and mems files copied from
// it's parent.
func (s *CpusetGroup) ensureParent(current, root string) error {
parent := filepath.Dir(current)
if libcontainerUtils.CleanPath(parent) == root {
return nil
}
// Avoid infinite recursion.
if parent == current {
return fmt.Errorf("cpuset: cgroup parent path outside cgroup root")
}
if err := s.ensureParent(parent, root); err != nil {
return err
}
if err := os.MkdirAll(current, 0755); err != nil {
return err
}
return s.copyIfNeeded(current, parent)
}
// copyIfNeeded copies the cpuset.cpus and cpuset.mems from the parent
// directory to the current directory if the file's contents are 0
func (s *CpusetGroup) copyIfNeeded(current, parent string) error {
var (
err error
currentCpus, currentMems []byte
parentCpus, parentMems []byte
)
if currentCpus, currentMems, err = s.getSubsystemSettings(current); err != nil {
return err
}
if parentCpus, parentMems, err = s.getSubsystemSettings(parent); err != nil {
return err
}
if s.isEmpty(currentCpus) {
if err := writeFile(current, "cpuset.cpus", string(parentCpus)); err != nil {
return err
}
}
if s.isEmpty(currentMems) {
if err := writeFile(current, "cpuset.mems", string(parentMems)); err != nil {
return err
}
}
return nil
}
func (s *CpusetGroup) isEmpty(b []byte) bool {
return len(bytes.Trim(b, "\n")) == 0
}

View file

@ -1,78 +0,0 @@
// +build linux
package fs
import (
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/system"
)
type DevicesGroup struct {
}
func (s *DevicesGroup) Name() string {
return "devices"
}
func (s *DevicesGroup) Apply(d *cgroupData) error {
_, err := d.join("devices")
if err != nil {
// We will return error even it's `not found` error, devices
// cgroup is hard requirement for container's security.
return err
}
return nil
}
func (s *DevicesGroup) Set(path string, cgroup *configs.Cgroup) error {
if system.RunningInUserNS() {
return nil
}
devices := cgroup.Resources.Devices
if len(devices) > 0 {
for _, dev := range devices {
file := "devices.deny"
if dev.Allow {
file = "devices.allow"
}
if err := writeFile(path, file, dev.CgroupString()); err != nil {
return err
}
}
return nil
}
if !cgroup.Resources.AllowAllDevices {
if err := writeFile(path, "devices.deny", "a"); err != nil {
return err
}
for _, dev := range cgroup.Resources.AllowedDevices {
if err := writeFile(path, "devices.allow", dev.CgroupString()); err != nil {
return err
}
}
return nil
}
if err := writeFile(path, "devices.allow", "a"); err != nil {
return err
}
for _, dev := range cgroup.Resources.DeniedDevices {
if err := writeFile(path, "devices.deny", dev.CgroupString()); err != nil {
return err
}
}
return nil
}
func (s *DevicesGroup) Remove(d *cgroupData) error {
return removePath(d.path("devices"))
}
func (s *DevicesGroup) GetStats(path string, stats *cgroups.Stats) error {
return nil
}

View file

@ -1,61 +0,0 @@
// +build linux
package fs
import (
"fmt"
"strings"
"time"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
)
type FreezerGroup struct {
}
func (s *FreezerGroup) Name() string {
return "freezer"
}
func (s *FreezerGroup) Apply(d *cgroupData) error {
_, err := d.join("freezer")
if err != nil && !cgroups.IsNotFound(err) {
return err
}
return nil
}
func (s *FreezerGroup) Set(path string, cgroup *configs.Cgroup) error {
switch cgroup.Resources.Freezer {
case configs.Frozen, configs.Thawed:
if err := writeFile(path, "freezer.state", string(cgroup.Resources.Freezer)); err != nil {
return err
}
for {
state, err := readFile(path, "freezer.state")
if err != nil {
return err
}
if strings.TrimSpace(state) == string(cgroup.Resources.Freezer) {
break
}
time.Sleep(1 * time.Millisecond)
}
case configs.Undefined:
return nil
default:
return fmt.Errorf("Invalid argument '%s' to freezer.state", string(cgroup.Resources.Freezer))
}
return nil
}
func (s *FreezerGroup) Remove(d *cgroupData) error {
return removePath(d.path("freezer"))
}
func (s *FreezerGroup) GetStats(path string, stats *cgroups.Stats) error {
return nil
}

View file

@ -1,3 +0,0 @@
// +build !linux
package fs

View file

@ -1,71 +0,0 @@
// +build linux
package fs
import (
"fmt"
"strconv"
"strings"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
)
type HugetlbGroup struct {
}
func (s *HugetlbGroup) Name() string {
return "hugetlb"
}
func (s *HugetlbGroup) Apply(d *cgroupData) error {
_, err := d.join("hugetlb")
if err != nil && !cgroups.IsNotFound(err) {
return err
}
return nil
}
func (s *HugetlbGroup) Set(path string, cgroup *configs.Cgroup) error {
for _, hugetlb := range cgroup.Resources.HugetlbLimit {
if err := writeFile(path, strings.Join([]string{"hugetlb", hugetlb.Pagesize, "limit_in_bytes"}, "."), strconv.FormatUint(hugetlb.Limit, 10)); err != nil {
return err
}
}
return nil
}
func (s *HugetlbGroup) Remove(d *cgroupData) error {
return removePath(d.path("hugetlb"))
}
func (s *HugetlbGroup) GetStats(path string, stats *cgroups.Stats) error {
hugetlbStats := cgroups.HugetlbStats{}
for _, pageSize := range HugePageSizes {
usage := strings.Join([]string{"hugetlb", pageSize, "usage_in_bytes"}, ".")
value, err := getCgroupParamUint(path, usage)
if err != nil {
return fmt.Errorf("failed to parse %s - %v", usage, err)
}
hugetlbStats.Usage = value
maxUsage := strings.Join([]string{"hugetlb", pageSize, "max_usage_in_bytes"}, ".")
value, err = getCgroupParamUint(path, maxUsage)
if err != nil {
return fmt.Errorf("failed to parse %s - %v", maxUsage, err)
}
hugetlbStats.MaxUsage = value
failcnt := strings.Join([]string{"hugetlb", pageSize, "failcnt"}, ".")
value, err = getCgroupParamUint(path, failcnt)
if err != nil {
return fmt.Errorf("failed to parse %s - %v", failcnt, err)
}
hugetlbStats.Failcnt = value
stats.HugetlbStats[pageSize] = hugetlbStats
}
return nil
}

View file

@ -1,201 +0,0 @@
// +build linux
package fs
import (
"bufio"
"fmt"
"os"
"path/filepath"
"strconv"
"strings"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
)
type MemoryGroup struct {
}
func (s *MemoryGroup) Name() string {
return "memory"
}
func (s *MemoryGroup) Apply(d *cgroupData) (err error) {
path, err := d.path("memory")
if err != nil && !cgroups.IsNotFound(err) {
return err
}
if memoryAssigned(d.config) {
if path != "" {
if err := os.MkdirAll(path, 0755); err != nil {
return err
}
}
// We have to set kernel memory here, as we can't change it once
// processes have been attached.
if err := s.SetKernelMemory(path, d.config); err != nil {
return err
}
}
defer func() {
if err != nil {
os.RemoveAll(path)
}
}()
// We need to join memory cgroup after set memory limits, because
// kmem.limit_in_bytes can only be set when the cgroup is empty.
_, err = d.join("memory")
if err != nil && !cgroups.IsNotFound(err) {
return err
}
return nil
}
func (s *MemoryGroup) SetKernelMemory(path string, cgroup *configs.Cgroup) error {
// This has to be done separately because it has special constraints (it
// can't be done after there are processes attached to the cgroup).
if cgroup.Resources.KernelMemory > 0 {
if err := writeFile(path, "memory.kmem.limit_in_bytes", strconv.FormatInt(cgroup.Resources.KernelMemory, 10)); err != nil {
return err
}
}
return nil
}
func (s *MemoryGroup) Set(path string, cgroup *configs.Cgroup) error {
if cgroup.Resources.Memory != 0 {
if err := writeFile(path, "memory.limit_in_bytes", strconv.FormatInt(cgroup.Resources.Memory, 10)); err != nil {
return err
}
}
if cgroup.Resources.MemoryReservation != 0 {
if err := writeFile(path, "memory.soft_limit_in_bytes", strconv.FormatInt(cgroup.Resources.MemoryReservation, 10)); err != nil {
return err
}
}
if cgroup.Resources.MemorySwap > 0 {
if err := writeFile(path, "memory.memsw.limit_in_bytes", strconv.FormatInt(cgroup.Resources.MemorySwap, 10)); err != nil {
return err
}
}
if cgroup.Resources.OomKillDisable {
if err := writeFile(path, "memory.oom_control", "1"); err != nil {
return err
}
}
if cgroup.Resources.MemorySwappiness == nil || int64(*cgroup.Resources.MemorySwappiness) == -1 {
return nil
} else if int64(*cgroup.Resources.MemorySwappiness) >= 0 && int64(*cgroup.Resources.MemorySwappiness) <= 100 {
if err := writeFile(path, "memory.swappiness", strconv.FormatInt(*cgroup.Resources.MemorySwappiness, 10)); err != nil {
return err
}
} else {
return fmt.Errorf("invalid value:%d. valid memory swappiness range is 0-100", int64(*cgroup.Resources.MemorySwappiness))
}
return nil
}
func (s *MemoryGroup) Remove(d *cgroupData) error {
return removePath(d.path("memory"))
}
func (s *MemoryGroup) GetStats(path string, stats *cgroups.Stats) error {
// Set stats from memory.stat.
statsFile, err := os.Open(filepath.Join(path, "memory.stat"))
if err != nil {
if os.IsNotExist(err) {
return nil
}
return err
}
defer statsFile.Close()
sc := bufio.NewScanner(statsFile)
for sc.Scan() {
t, v, err := getCgroupParamKeyValue(sc.Text())
if err != nil {
return fmt.Errorf("failed to parse memory.stat (%q) - %v", sc.Text(), err)
}
stats.MemoryStats.Stats[t] = v
}
stats.MemoryStats.Cache = stats.MemoryStats.Stats["cache"]
memoryUsage, err := getMemoryData(path, "")
if err != nil {
return err
}
stats.MemoryStats.Usage = memoryUsage
swapUsage, err := getMemoryData(path, "memsw")
if err != nil {
return err
}
stats.MemoryStats.SwapUsage = swapUsage
kernelUsage, err := getMemoryData(path, "kmem")
if err != nil {
return err
}
stats.MemoryStats.KernelUsage = kernelUsage
return nil
}
func memoryAssigned(cgroup *configs.Cgroup) bool {
return cgroup.Resources.Memory != 0 ||
cgroup.Resources.MemoryReservation != 0 ||
cgroup.Resources.MemorySwap > 0 ||
cgroup.Resources.KernelMemory > 0 ||
cgroup.Resources.OomKillDisable ||
(cgroup.Resources.MemorySwappiness != nil && *cgroup.Resources.MemorySwappiness != -1)
}
func getMemoryData(path, name string) (cgroups.MemoryData, error) {
memoryData := cgroups.MemoryData{}
moduleName := "memory"
if name != "" {
moduleName = strings.Join([]string{"memory", name}, ".")
}
usage := strings.Join([]string{moduleName, "usage_in_bytes"}, ".")
maxUsage := strings.Join([]string{moduleName, "max_usage_in_bytes"}, ".")
failcnt := strings.Join([]string{moduleName, "failcnt"}, ".")
limit := strings.Join([]string{moduleName, "limit_in_bytes"}, ".")
value, err := getCgroupParamUint(path, usage)
if err != nil {
if moduleName != "memory" && os.IsNotExist(err) {
return cgroups.MemoryData{}, nil
}
return cgroups.MemoryData{}, fmt.Errorf("failed to parse %s - %v", usage, err)
}
memoryData.Usage = value
value, err = getCgroupParamUint(path, maxUsage)
if err != nil {
if moduleName != "memory" && os.IsNotExist(err) {
return cgroups.MemoryData{}, nil
}
return cgroups.MemoryData{}, fmt.Errorf("failed to parse %s - %v", maxUsage, err)
}
memoryData.MaxUsage = value
value, err = getCgroupParamUint(path, failcnt)
if err != nil {
if moduleName != "memory" && os.IsNotExist(err) {
return cgroups.MemoryData{}, nil
}
return cgroups.MemoryData{}, fmt.Errorf("failed to parse %s - %v", failcnt, err)
}
memoryData.Failcnt = value
value, err = getCgroupParamUint(path, limit)
if err != nil {
if moduleName != "memory" && os.IsNotExist(err) {
return cgroups.MemoryData{}, nil
}
return cgroups.MemoryData{}, fmt.Errorf("failed to parse %s - %v", limit, err)
}
memoryData.Limit = value
return memoryData, nil
}

View file

@ -1,40 +0,0 @@
// +build linux
package fs
import (
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
)
type NameGroup struct {
GroupName string
Join bool
}
func (s *NameGroup) Name() string {
return s.GroupName
}
func (s *NameGroup) Apply(d *cgroupData) error {
if s.Join {
// ignore errors if the named cgroup does not exist
d.join(s.GroupName)
}
return nil
}
func (s *NameGroup) Set(path string, cgroup *configs.Cgroup) error {
return nil
}
func (s *NameGroup) Remove(d *cgroupData) error {
if s.Join {
removePath(d.path(s.GroupName))
}
return nil
}
func (s *NameGroup) GetStats(path string, stats *cgroups.Stats) error {
return nil
}

View file

@ -1,41 +0,0 @@
// +build linux
package fs
import (
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
)
type NetClsGroup struct {
}
func (s *NetClsGroup) Name() string {
return "net_cls"
}
func (s *NetClsGroup) Apply(d *cgroupData) error {
_, err := d.join("net_cls")
if err != nil && !cgroups.IsNotFound(err) {
return err
}
return nil
}
func (s *NetClsGroup) Set(path string, cgroup *configs.Cgroup) error {
if cgroup.Resources.NetClsClassid != "" {
if err := writeFile(path, "net_cls.classid", cgroup.Resources.NetClsClassid); err != nil {
return err
}
}
return nil
}
func (s *NetClsGroup) Remove(d *cgroupData) error {
return removePath(d.path("net_cls"))
}
func (s *NetClsGroup) GetStats(path string, stats *cgroups.Stats) error {
return nil
}

View file

@ -1,41 +0,0 @@
// +build linux
package fs
import (
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
)
type NetPrioGroup struct {
}
func (s *NetPrioGroup) Name() string {
return "net_prio"
}
func (s *NetPrioGroup) Apply(d *cgroupData) error {
_, err := d.join("net_prio")
if err != nil && !cgroups.IsNotFound(err) {
return err
}
return nil
}
func (s *NetPrioGroup) Set(path string, cgroup *configs.Cgroup) error {
for _, prioMap := range cgroup.Resources.NetPrioIfpriomap {
if err := writeFile(path, "net_prio.ifpriomap", prioMap.CgroupString()); err != nil {
return err
}
}
return nil
}
func (s *NetPrioGroup) Remove(d *cgroupData) error {
return removePath(d.path("net_prio"))
}
func (s *NetPrioGroup) GetStats(path string, stats *cgroups.Stats) error {
return nil
}

View file

@ -1,35 +0,0 @@
// +build linux
package fs
import (
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
)
type PerfEventGroup struct {
}
func (s *PerfEventGroup) Name() string {
return "perf_event"
}
func (s *PerfEventGroup) Apply(d *cgroupData) error {
// we just want to join this group even though we don't set anything
if _, err := d.join("perf_event"); err != nil && !cgroups.IsNotFound(err) {
return err
}
return nil
}
func (s *PerfEventGroup) Set(path string, cgroup *configs.Cgroup) error {
return nil
}
func (s *PerfEventGroup) Remove(d *cgroupData) error {
return removePath(d.path("perf_event"))
}
func (s *PerfEventGroup) GetStats(path string, stats *cgroups.Stats) error {
return nil
}

View file

@ -1,57 +0,0 @@
// +build linux
package fs
import (
"fmt"
"strconv"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
)
type PidsGroup struct {
}
func (s *PidsGroup) Name() string {
return "pids"
}
func (s *PidsGroup) Apply(d *cgroupData) error {
_, err := d.join("pids")
if err != nil && !cgroups.IsNotFound(err) {
return err
}
return nil
}
func (s *PidsGroup) Set(path string, cgroup *configs.Cgroup) error {
if cgroup.Resources.PidsLimit != 0 {
// "max" is the fallback value.
limit := "max"
if cgroup.Resources.PidsLimit > 0 {
limit = strconv.FormatInt(cgroup.Resources.PidsLimit, 10)
}
if err := writeFile(path, "pids.max", limit); err != nil {
return err
}
}
return nil
}
func (s *PidsGroup) Remove(d *cgroupData) error {
return removePath(d.path("pids"))
}
func (s *PidsGroup) GetStats(path string, stats *cgroups.Stats) error {
value, err := getCgroupParamUint(path, "pids.current")
if err != nil {
return fmt.Errorf("failed to parse pids.current - %s", err)
}
stats.PidsStats.Current = value
return nil
}

View file

@ -1,79 +0,0 @@
// +build linux
package fs
import (
"errors"
"fmt"
"io/ioutil"
"path/filepath"
"strconv"
"strings"
)
var (
ErrNotSupportStat = errors.New("stats are not supported for subsystem")
ErrNotValidFormat = errors.New("line is not a valid key value format")
)
// Saturates negative values at zero and returns a uint64.
// Due to kernel bugs, some of the memory cgroup stats can be negative.
func parseUint(s string, base, bitSize int) (uint64, error) {
value, err := strconv.ParseUint(s, base, bitSize)
if err != nil {
intValue, intErr := strconv.ParseInt(s, base, bitSize)
// 1. Handle negative values greater than MinInt64 (and)
// 2. Handle negative values lesser than MinInt64
if intErr == nil && intValue < 0 {
return 0, nil
} else if intErr != nil && intErr.(*strconv.NumError).Err == strconv.ErrRange && intValue < 0 {
return 0, nil
}
return value, err
}
return value, nil
}
// Parses a cgroup param and returns as name, value
// i.e. "io_service_bytes 1234" will return as io_service_bytes, 1234
func getCgroupParamKeyValue(t string) (string, uint64, error) {
parts := strings.Fields(t)
switch len(parts) {
case 2:
value, err := parseUint(parts[1], 10, 64)
if err != nil {
return "", 0, fmt.Errorf("unable to convert param value (%q) to uint64: %v", parts[1], err)
}
return parts[0], value, nil
default:
return "", 0, ErrNotValidFormat
}
}
// Gets a single uint64 value from the specified cgroup file.
func getCgroupParamUint(cgroupPath, cgroupFile string) (uint64, error) {
fileName := filepath.Join(cgroupPath, cgroupFile)
contents, err := ioutil.ReadFile(fileName)
if err != nil {
return 0, err
}
res, err := parseUint(strings.TrimSpace(string(contents)), 10, 64)
if err != nil {
return res, fmt.Errorf("unable to parse %q as a uint from Cgroup file %q", string(contents), fileName)
}
return res, nil
}
// Gets a string value from the specified cgroup file
func getCgroupParamString(cgroupPath, cgroupFile string) (string, error) {
contents, err := ioutil.ReadFile(filepath.Join(cgroupPath, cgroupFile))
if err != nil {
return "", err
}
return strings.TrimSpace(string(contents)), nil
}

View file

@ -1,55 +0,0 @@
// +build !linux
package systemd
import (
"fmt"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
)
type Manager struct {
Cgroups *configs.Cgroup
Paths map[string]string
}
func UseSystemd() bool {
return false
}
func (m *Manager) Apply(pid int) error {
return fmt.Errorf("Systemd not supported")
}
func (m *Manager) GetPids() ([]int, error) {
return nil, fmt.Errorf("Systemd not supported")
}
func (m *Manager) GetAllPids() ([]int, error) {
return nil, fmt.Errorf("Systemd not supported")
}
func (m *Manager) Destroy() error {
return fmt.Errorf("Systemd not supported")
}
func (m *Manager) GetPaths() map[string]string {
return nil
}
func (m *Manager) GetStats() (*cgroups.Stats, error) {
return nil, fmt.Errorf("Systemd not supported")
}
func (m *Manager) Set(container *configs.Config) error {
return nil, fmt.Errorf("Systemd not supported")
}
func (m *Manager) Freeze(state configs.FreezerState) error {
return fmt.Errorf("Systemd not supported")
}
func Freeze(c *configs.Cgroup, state configs.FreezerState) error {
return fmt.Errorf("Systemd not supported")
}

View file

@ -1,605 +0,0 @@
// +build linux
package systemd
import (
"errors"
"fmt"
"io/ioutil"
"os"
"path/filepath"
"strconv"
"strings"
"sync"
"time"
systemdDbus "github.com/coreos/go-systemd/dbus"
systemdUtil "github.com/coreos/go-systemd/util"
"github.com/godbus/dbus"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fs"
"github.com/opencontainers/runc/libcontainer/configs"
)
type Manager struct {
mu sync.Mutex
Cgroups *configs.Cgroup
Paths map[string]string
}
type subsystem interface {
// Name returns the name of the subsystem.
Name() string
// Returns the stats, as 'stats', corresponding to the cgroup under 'path'.
GetStats(path string, stats *cgroups.Stats) error
// Set the cgroup represented by cgroup.
Set(path string, cgroup *configs.Cgroup) error
}
var errSubsystemDoesNotExist = errors.New("cgroup: subsystem does not exist")
type subsystemSet []subsystem
func (s subsystemSet) Get(name string) (subsystem, error) {
for _, ss := range s {
if ss.Name() == name {
return ss, nil
}
}
return nil, errSubsystemDoesNotExist
}
var subsystems = subsystemSet{
&fs.CpusetGroup{},
&fs.DevicesGroup{},
&fs.MemoryGroup{},
&fs.CpuGroup{},
&fs.CpuacctGroup{},
&fs.PidsGroup{},
&fs.BlkioGroup{},
&fs.HugetlbGroup{},
&fs.PerfEventGroup{},
&fs.FreezerGroup{},
&fs.NetPrioGroup{},
&fs.NetClsGroup{},
&fs.NameGroup{GroupName: "name=systemd"},
}
const (
testScopeWait = 4
)
var (
connLock sync.Mutex
theConn *systemdDbus.Conn
hasStartTransientUnit bool
hasTransientDefaultDependencies bool
)
func newProp(name string, units interface{}) systemdDbus.Property {
return systemdDbus.Property{
Name: name,
Value: dbus.MakeVariant(units),
}
}
func UseSystemd() bool {
if !systemdUtil.IsRunningSystemd() {
return false
}
connLock.Lock()
defer connLock.Unlock()
if theConn == nil {
var err error
theConn, err = systemdDbus.New()
if err != nil {
return false
}
// Assume we have StartTransientUnit
hasStartTransientUnit = true
// But if we get UnknownMethod error we don't
if _, err := theConn.StartTransientUnit("test.scope", "invalid", nil, nil); err != nil {
if dbusError, ok := err.(dbus.Error); ok {
if dbusError.Name == "org.freedesktop.DBus.Error.UnknownMethod" {
hasStartTransientUnit = false
return hasStartTransientUnit
}
}
}
// Ensure the scope name we use doesn't exist. Use the Pid to
// avoid collisions between multiple libcontainer users on a
// single host.
scope := fmt.Sprintf("libcontainer-%d-systemd-test-default-dependencies.scope", os.Getpid())
testScopeExists := true
for i := 0; i <= testScopeWait; i++ {
if _, err := theConn.StopUnit(scope, "replace", nil); err != nil {
if dbusError, ok := err.(dbus.Error); ok {
if strings.Contains(dbusError.Name, "org.freedesktop.systemd1.NoSuchUnit") {
testScopeExists = false
break
}
}
}
time.Sleep(time.Millisecond)
}
// Bail out if we can't kill this scope without testing for DefaultDependencies
if testScopeExists {
return hasStartTransientUnit
}
// Assume StartTransientUnit on a scope allows DefaultDependencies
hasTransientDefaultDependencies = true
ddf := newProp("DefaultDependencies", false)
if _, err := theConn.StartTransientUnit(scope, "replace", []systemdDbus.Property{ddf}, nil); err != nil {
if dbusError, ok := err.(dbus.Error); ok {
if strings.Contains(dbusError.Name, "org.freedesktop.DBus.Error.PropertyReadOnly") {
hasTransientDefaultDependencies = false
}
}
}
// Not critical because of the stop unit logic above.
theConn.StopUnit(scope, "replace", nil)
}
return hasStartTransientUnit
}
func getIfaceForUnit(unitName string) string {
if strings.HasSuffix(unitName, ".scope") {
return "Scope"
}
if strings.HasSuffix(unitName, ".service") {
return "Service"
}
return "Unit"
}
func (m *Manager) Apply(pid int) error {
var (
c = m.Cgroups
unitName = getUnitName(c)
slice = "system.slice"
properties []systemdDbus.Property
)
if c.Paths != nil {
paths := make(map[string]string)
for name, path := range c.Paths {
_, err := getSubsystemPath(m.Cgroups, name)
if err != nil {
// Don't fail if a cgroup hierarchy was not found, just skip this subsystem
if cgroups.IsNotFound(err) {
continue
}
return err
}
paths[name] = path
}
m.Paths = paths
return cgroups.EnterPid(m.Paths, pid)
}
if c.Parent != "" {
slice = c.Parent
}
properties = append(properties,
systemdDbus.PropSlice(slice),
systemdDbus.PropDescription("docker container "+c.Name),
newProp("PIDs", []uint32{uint32(pid)}),
)
// Always enable accounting, this gets us the same behaviour as the fs implementation,
// plus the kernel has some problems with joining the memory cgroup at a later time.
properties = append(properties,
newProp("MemoryAccounting", true),
newProp("CPUAccounting", true),
newProp("BlockIOAccounting", true))
if hasTransientDefaultDependencies {
properties = append(properties,
newProp("DefaultDependencies", false))
}
if c.Resources.Memory != 0 {
properties = append(properties,
newProp("MemoryLimit", uint64(c.Resources.Memory)))
}
if c.Resources.CpuShares != 0 {
properties = append(properties,
newProp("CPUShares", uint64(c.Resources.CpuShares)))
}
if c.Resources.BlkioWeight != 0 {
properties = append(properties,
newProp("BlockIOWeight", uint64(c.Resources.BlkioWeight)))
}
// We need to set kernel memory before processes join cgroup because
// kmem.limit_in_bytes can only be set when the cgroup is empty.
// And swap memory limit needs to be set after memory limit, only
// memory limit is handled by systemd, so it's kind of ugly here.
if c.Resources.KernelMemory > 0 {
if err := setKernelMemory(c); err != nil {
return err
}
}
if _, err := theConn.StartTransientUnit(unitName, "replace", properties, nil); err != nil {
return err
}
if err := joinDevices(c, pid); err != nil {
return err
}
// TODO: CpuQuota and CpuPeriod not available in systemd
// we need to manually join the cpu.cfs_quota_us and cpu.cfs_period_us
if err := joinCpu(c, pid); err != nil {
return err
}
// TODO: MemoryReservation and MemorySwap not available in systemd
if err := joinMemory(c, pid); err != nil {
return err
}
// we need to manually join the freezer, net_cls, net_prio, pids and cpuset cgroup in systemd
// because it does not currently support it via the dbus api.
if err := joinFreezer(c, pid); err != nil {
return err
}
if err := joinNetPrio(c, pid); err != nil {
return err
}
if err := joinNetCls(c, pid); err != nil {
return err
}
if err := joinPids(c, pid); err != nil {
return err
}
if err := joinCpuset(c, pid); err != nil {
return err
}
if err := joinHugetlb(c, pid); err != nil {
return err
}
if err := joinPerfEvent(c, pid); err != nil {
return err
}
// FIXME: Systemd does have `BlockIODeviceWeight` property, but we got problem
// using that (at least on systemd 208, see https://github.com/opencontainers/runc/libcontainer/pull/354),
// so use fs work around for now.
if err := joinBlkio(c, pid); err != nil {
return err
}
paths := make(map[string]string)
for _, s := range subsystems {
subsystemPath, err := getSubsystemPath(m.Cgroups, s.Name())
if err != nil {
// Don't fail if a cgroup hierarchy was not found, just skip this subsystem
if cgroups.IsNotFound(err) {
continue
}
return err
}
paths[s.Name()] = subsystemPath
}
m.Paths = paths
return nil
}
func (m *Manager) Destroy() error {
if m.Cgroups.Paths != nil {
return nil
}
m.mu.Lock()
defer m.mu.Unlock()
theConn.StopUnit(getUnitName(m.Cgroups), "replace", nil)
if err := cgroups.RemovePaths(m.Paths); err != nil {
return err
}
m.Paths = make(map[string]string)
return nil
}
func (m *Manager) GetPaths() map[string]string {
m.mu.Lock()
paths := m.Paths
m.mu.Unlock()
return paths
}
func writeFile(dir, file, data string) error {
// Normally dir should not be empty, one case is that cgroup subsystem
// is not mounted, we will get empty dir, and we want it fail here.
if dir == "" {
return fmt.Errorf("no such directory for %s.", file)
}
return ioutil.WriteFile(filepath.Join(dir, file), []byte(data), 0700)
}
func join(c *configs.Cgroup, subsystem string, pid int) (string, error) {
path, err := getSubsystemPath(c, subsystem)
if err != nil {
return "", err
}
if err := os.MkdirAll(path, 0755); err != nil {
return "", err
}
if err := writeFile(path, "cgroup.procs", strconv.Itoa(pid)); err != nil {
return "", err
}
return path, nil
}
func joinCpu(c *configs.Cgroup, pid int) error {
_, err := join(c, "cpu", pid)
if err != nil && !cgroups.IsNotFound(err) {
return err
}
return nil
}
func joinFreezer(c *configs.Cgroup, pid int) error {
_, err := join(c, "freezer", pid)
if err != nil && !cgroups.IsNotFound(err) {
return err
}
return nil
}
func joinNetPrio(c *configs.Cgroup, pid int) error {
_, err := join(c, "net_prio", pid)
if err != nil && !cgroups.IsNotFound(err) {
return err
}
return nil
}
func joinNetCls(c *configs.Cgroup, pid int) error {
_, err := join(c, "net_cls", pid)
if err != nil && !cgroups.IsNotFound(err) {
return err
}
return nil
}
func joinPids(c *configs.Cgroup, pid int) error {
_, err := join(c, "pids", pid)
if err != nil && !cgroups.IsNotFound(err) {
return err
}
return nil
}
// systemd represents slice heirarchy using `-`, so we need to follow suit when
// generating the path of slice. Essentially, test-a-b.slice becomes
// test.slice/test-a.slice/test-a-b.slice.
func expandSlice(slice string) (string, error) {
suffix := ".slice"
// Name has to end with ".slice", but can't be just ".slice".
if len(slice) < len(suffix) || !strings.HasSuffix(slice, suffix) {
return "", fmt.Errorf("invalid slice name: %s", slice)
}
// Path-separators are not allowed.
if strings.Contains(slice, "/") {
return "", fmt.Errorf("invalid slice name: %s", slice)
}
var path, prefix string
sliceName := strings.TrimSuffix(slice, suffix)
for _, component := range strings.Split(sliceName, "-") {
// test--a.slice isn't permitted, nor is -test.slice.
if component == "" {
return "", fmt.Errorf("invalid slice name: %s", slice)
}
// Append the component to the path and to the prefix.
path += prefix + component + suffix + "/"
prefix += component + "-"
}
return path, nil
}
func getSubsystemPath(c *configs.Cgroup, subsystem string) (string, error) {
mountpoint, err := cgroups.FindCgroupMountpoint(subsystem)
if err != nil {
return "", err
}
initPath, err := cgroups.GetInitCgroupDir(subsystem)
if err != nil {
return "", err
}
slice := "system.slice"
if c.Parent != "" {
slice = c.Parent
}
slice, err = expandSlice(slice)
if err != nil {
return "", err
}
return filepath.Join(mountpoint, initPath, slice, getUnitName(c)), nil
}
func (m *Manager) Freeze(state configs.FreezerState) error {
path, err := getSubsystemPath(m.Cgroups, "freezer")
if err != nil {
return err
}
prevState := m.Cgroups.Resources.Freezer
m.Cgroups.Resources.Freezer = state
freezer, err := subsystems.Get("freezer")
if err != nil {
return err
}
err = freezer.Set(path, m.Cgroups)
if err != nil {
m.Cgroups.Resources.Freezer = prevState
return err
}
return nil
}
func (m *Manager) GetPids() ([]int, error) {
path, err := getSubsystemPath(m.Cgroups, "devices")
if err != nil {
return nil, err
}
return cgroups.GetPids(path)
}
func (m *Manager) GetAllPids() ([]int, error) {
path, err := getSubsystemPath(m.Cgroups, "devices")
if err != nil {
return nil, err
}
return cgroups.GetAllPids(path)
}
func (m *Manager) GetStats() (*cgroups.Stats, error) {
m.mu.Lock()
defer m.mu.Unlock()
stats := cgroups.NewStats()
for name, path := range m.Paths {
sys, err := subsystems.Get(name)
if err == errSubsystemDoesNotExist || !cgroups.PathExists(path) {
continue
}
if err := sys.GetStats(path, stats); err != nil {
return nil, err
}
}
return stats, nil
}
func (m *Manager) Set(container *configs.Config) error {
for _, sys := range subsystems {
// Get the subsystem path, but don't error out for not found cgroups.
path, err := getSubsystemPath(container.Cgroups, sys.Name())
if err != nil && !cgroups.IsNotFound(err) {
return err
}
if err := sys.Set(path, container.Cgroups); err != nil {
return err
}
}
if m.Paths["cpu"] != "" {
if err := fs.CheckCpushares(m.Paths["cpu"], container.Cgroups.Resources.CpuShares); err != nil {
return err
}
}
return nil
}
func getUnitName(c *configs.Cgroup) string {
return fmt.Sprintf("%s-%s.scope", c.ScopePrefix, c.Name)
}
// Atm we can't use the systemd device support because of two missing things:
// * Support for wildcards to allow mknod on any device
// * Support for wildcards to allow /dev/pts support
//
// The second is available in more recent systemd as "char-pts", but not in e.g. v208 which is
// in wide use. When both these are available we will be able to switch, but need to keep the old
// implementation for backwards compat.
//
// Note: we can't use systemd to set up the initial limits, and then change the cgroup
// because systemd will re-write the device settings if it needs to re-apply the cgroup context.
// This happens at least for v208 when any sibling unit is started.
func joinDevices(c *configs.Cgroup, pid int) error {
_, err := join(c, "devices", pid)
// Even if it's `not found` error, we'll return err because devices cgroup
// is hard requirement for container security.
if err != nil {
return err
}
return nil
}
func setKernelMemory(c *configs.Cgroup) error {
path, err := getSubsystemPath(c, "memory")
if err != nil && !cgroups.IsNotFound(err) {
return err
}
if err := os.MkdirAll(path, 0755); err != nil {
return err
}
// This doesn't get called by manager.Set, so we need to do it here.
s := &fs.MemoryGroup{}
return s.SetKernelMemory(path, c)
}
func joinMemory(c *configs.Cgroup, pid int) error {
_, err := join(c, "memory", pid)
if err != nil && !cgroups.IsNotFound(err) {
return err
}
return nil
}
// systemd does not atm set up the cpuset controller, so we must manually
// join it. Additionally that is a very finicky controller where each
// level must have a full setup as the default for a new directory is "no cpus"
func joinCpuset(c *configs.Cgroup, pid int) error {
path, err := getSubsystemPath(c, "cpuset")
if err != nil && !cgroups.IsNotFound(err) {
return err
}
s := &fs.CpusetGroup{}
return s.ApplyDir(path, c, pid)
}
// `BlockIODeviceWeight` property of systemd does not work properly, and systemd
// expects device path instead of major minor numbers, which is also confusing
// for users. So we use fs work around for now.
func joinBlkio(c *configs.Cgroup, pid int) error {
_, err := join(c, "blkio", pid)
if err != nil {
return err
}
return nil
}
func joinHugetlb(c *configs.Cgroup, pid int) error {
_, err := join(c, "hugetlb", pid)
if err != nil && !cgroups.IsNotFound(err) {
return err
}
return nil
}
func joinPerfEvent(c *configs.Cgroup, pid int) error {
_, err := join(c, "perf_event", pid)
if err != nil && !cgroups.IsNotFound(err) {
return err
}
return nil
}

View file

@ -1,10 +0,0 @@
// +build linux,!go1.5
package libcontainer
import "syscall"
// GidMappingsEnableSetgroups was added in Go 1.5, so do nothing when building
// with earlier versions
func enableSetgroups(sys *syscall.SysProcAttr) {
}

View file

@ -1,138 +0,0 @@
package validate
import (
"fmt"
"os"
"path/filepath"
"strings"
"github.com/opencontainers/runc/libcontainer/configs"
)
type Validator interface {
Validate(*configs.Config) error
}
func New() Validator {
return &ConfigValidator{}
}
type ConfigValidator struct {
}
func (v *ConfigValidator) Validate(config *configs.Config) error {
if err := v.rootfs(config); err != nil {
return err
}
if err := v.network(config); err != nil {
return err
}
if err := v.hostname(config); err != nil {
return err
}
if err := v.security(config); err != nil {
return err
}
if err := v.usernamespace(config); err != nil {
return err
}
if err := v.sysctl(config); err != nil {
return err
}
return nil
}
// rootfs validates the the rootfs is an absolute path and is not a symlink
// to the container's root filesystem.
func (v *ConfigValidator) rootfs(config *configs.Config) error {
cleaned, err := filepath.Abs(config.Rootfs)
if err != nil {
return err
}
if cleaned, err = filepath.EvalSymlinks(cleaned); err != nil {
return err
}
if config.Rootfs != cleaned {
return fmt.Errorf("%s is not an absolute path or is a symlink", config.Rootfs)
}
return nil
}
func (v *ConfigValidator) network(config *configs.Config) error {
if !config.Namespaces.Contains(configs.NEWNET) {
if len(config.Networks) > 0 || len(config.Routes) > 0 {
return fmt.Errorf("unable to apply network settings without a private NET namespace")
}
}
return nil
}
func (v *ConfigValidator) hostname(config *configs.Config) error {
if config.Hostname != "" && !config.Namespaces.Contains(configs.NEWUTS) {
return fmt.Errorf("unable to set hostname without a private UTS namespace")
}
return nil
}
func (v *ConfigValidator) security(config *configs.Config) error {
// restrict sys without mount namespace
if (len(config.MaskPaths) > 0 || len(config.ReadonlyPaths) > 0) &&
!config.Namespaces.Contains(configs.NEWNS) {
return fmt.Errorf("unable to restrict sys entries without a private MNT namespace")
}
return nil
}
func (v *ConfigValidator) usernamespace(config *configs.Config) error {
if config.Namespaces.Contains(configs.NEWUSER) {
if _, err := os.Stat("/proc/self/ns/user"); os.IsNotExist(err) {
return fmt.Errorf("USER namespaces aren't enabled in the kernel")
}
} else {
if config.UidMappings != nil || config.GidMappings != nil {
return fmt.Errorf("User namespace mappings specified, but USER namespace isn't enabled in the config")
}
}
return nil
}
// sysctl validates that the specified sysctl keys are valid or not.
// /proc/sys isn't completely namespaced and depending on which namespaces
// are specified, a subset of sysctls are permitted.
func (v *ConfigValidator) sysctl(config *configs.Config) error {
validSysctlPrefixes := []string{}
validSysctlMap := make(map[string]bool)
if config.Namespaces.Contains(configs.NEWNET) {
validSysctlPrefixes = append(validSysctlPrefixes, "net.")
}
if config.Namespaces.Contains(configs.NEWIPC) {
validSysctlPrefixes = append(validSysctlPrefixes, "fs.mqueue.")
validSysctlMap = map[string]bool{
"kernel.msgmax": true,
"kernel.msgmnb": true,
"kernel.msgmni": true,
"kernel.sem": true,
"kernel.shmall": true,
"kernel.shmmax": true,
"kernel.shmmni": true,
"kernel.shm_rmid_forced": true,
}
}
for s := range config.Sysctl {
if validSysctlMap[s] {
continue
}
valid := false
for _, vp := range validSysctlPrefixes {
if strings.HasPrefix(s, vp) {
valid = true
break
}
}
if !valid {
return fmt.Errorf("sysctl %q is not permitted in the config", s)
}
}
return nil
}

View file

@ -1,15 +0,0 @@
package libcontainer
import "io"
// Console represents a pseudo TTY.
type Console interface {
io.ReadWriter
io.Closer
// Path returns the filesystem path to the slave side of the pty.
Path() string
// Fd returns the fd for the master of the pty.
Fd() uintptr
}

View file

@ -1,13 +0,0 @@
// +build freebsd
package libcontainer
import (
"errors"
)
// NewConsole returns an initalized console that can be used within a container by copying bytes
// from the master side to the slave that is attached as the tty for the container's init process.
func NewConsole(uid, gid int) (Console, error) {
return nil, errors.New("libcontainer console is not supported on FreeBSD")
}

View file

@ -1,145 +0,0 @@
package libcontainer
import (
"fmt"
"os"
"path/filepath"
"syscall"
"unsafe"
"github.com/opencontainers/runc/libcontainer/label"
)
// NewConsole returns an initalized console that can be used within a container by copying bytes
// from the master side to the slave that is attached as the tty for the container's init process.
func NewConsole(uid, gid int) (Console, error) {
master, err := os.OpenFile("/dev/ptmx", syscall.O_RDWR|syscall.O_NOCTTY|syscall.O_CLOEXEC, 0)
if err != nil {
return nil, err
}
console, err := ptsname(master)
if err != nil {
return nil, err
}
if err := unlockpt(master); err != nil {
return nil, err
}
if err := os.Chmod(console, 0600); err != nil {
return nil, err
}
if err := os.Chown(console, uid, gid); err != nil {
return nil, err
}
return &linuxConsole{
slavePath: console,
master: master,
}, nil
}
// newConsoleFromPath is an internal function returning an initialized console for use inside
// a container's MNT namespace.
func newConsoleFromPath(slavePath string) *linuxConsole {
return &linuxConsole{
slavePath: slavePath,
}
}
// linuxConsole is a linux psuedo TTY for use within a container.
type linuxConsole struct {
master *os.File
slavePath string
}
func (c *linuxConsole) Fd() uintptr {
return c.master.Fd()
}
func (c *linuxConsole) Path() string {
return c.slavePath
}
func (c *linuxConsole) Read(b []byte) (int, error) {
return c.master.Read(b)
}
func (c *linuxConsole) Write(b []byte) (int, error) {
return c.master.Write(b)
}
func (c *linuxConsole) Close() error {
if m := c.master; m != nil {
return m.Close()
}
return nil
}
// mount initializes the console inside the rootfs mounting with the specified mount label
// and applying the correct ownership of the console.
func (c *linuxConsole) mount(rootfs, mountLabel string) error {
oldMask := syscall.Umask(0000)
defer syscall.Umask(oldMask)
if err := label.SetFileLabel(c.slavePath, mountLabel); err != nil {
return err
}
dest := filepath.Join(rootfs, "/dev/console")
f, err := os.Create(dest)
if err != nil && !os.IsExist(err) {
return err
}
if f != nil {
f.Close()
}
return syscall.Mount(c.slavePath, dest, "bind", syscall.MS_BIND, "")
}
// dupStdio opens the slavePath for the console and dups the fds to the current
// processes stdio, fd 0,1,2.
func (c *linuxConsole) dupStdio() error {
slave, err := c.open(syscall.O_RDWR)
if err != nil {
return err
}
fd := int(slave.Fd())
for _, i := range []int{0, 1, 2} {
if err := syscall.Dup3(fd, i, 0); err != nil {
return err
}
}
return nil
}
// open is a clone of os.OpenFile without the O_CLOEXEC used to open the pty slave.
func (c *linuxConsole) open(flag int) (*os.File, error) {
r, e := syscall.Open(c.slavePath, flag, 0)
if e != nil {
return nil, &os.PathError{
Op: "open",
Path: c.slavePath,
Err: e,
}
}
return os.NewFile(uintptr(r), c.slavePath), nil
}
func ioctl(fd uintptr, flag, data uintptr) error {
if _, _, err := syscall.Syscall(syscall.SYS_IOCTL, fd, flag, data); err != 0 {
return err
}
return nil
}
// unlockpt unlocks the slave pseudoterminal device corresponding to the master pseudoterminal referred to by f.
// unlockpt should be called before opening the slave side of a pty.
func unlockpt(f *os.File) error {
var u int32
return ioctl(f.Fd(), syscall.TIOCSPTLCK, uintptr(unsafe.Pointer(&u)))
}
// ptsname retrieves the name of the first available pts for the given master.
func ptsname(f *os.File) (string, error) {
var n int32
if err := ioctl(f.Fd(), syscall.TIOCGPTN, uintptr(unsafe.Pointer(&n))); err != nil {
return "", err
}
return fmt.Sprintf("/dev/pts/%d", n), nil
}

View file

@ -1,30 +0,0 @@
package libcontainer
// NewConsole returns an initalized console that can be used within a container
func NewConsole(uid, gid int) (Console, error) {
return &windowsConsole{}, nil
}
// windowsConsole is a Windows psuedo TTY for use within a container.
type windowsConsole struct {
}
func (c *windowsConsole) Fd() uintptr {
return 0
}
func (c *windowsConsole) Path() string {
return ""
}
func (c *windowsConsole) Read(b []byte) (int, error) {
return 0, nil
}
func (c *windowsConsole) Write(b []byte) (int, error) {
return 0, nil
}
func (c *windowsConsole) Close() error {
return nil
}

View file

@ -1,144 +0,0 @@
// Libcontainer provides a native Go implementation for creating containers
// with namespaces, cgroups, capabilities, and filesystem access controls.
// It allows you to manage the lifecycle of the container performing additional operations
// after the container is created.
package libcontainer
import (
"os"
"time"
"github.com/opencontainers/runc/libcontainer/configs"
)
// The status of a container.
type Status int
const (
// The container exists but has not been run yet
Created Status = iota
// The container exists and is running.
Running
// The container exists, it is in the process of being paused.
Pausing
// The container exists, but all its processes are paused.
Paused
// The container does not exist.
Destroyed
)
func (s Status) String() string {
switch s {
case Created:
return "created"
case Running:
return "running"
case Pausing:
return "pausing"
case Paused:
return "paused"
case Destroyed:
return "destroyed"
default:
return "unknown"
}
}
// BaseState represents the platform agnostic pieces relating to a
// running container's state
type BaseState struct {
// ID is the container ID.
ID string `json:"id"`
// InitProcessPid is the init process id in the parent namespace.
InitProcessPid int `json:"init_process_pid"`
// InitProcessStartTime is the init process start time in clock cycles since boot time.
InitProcessStartTime string `json:"init_process_start"`
// Created is the unix timestamp for the creation time of the container in UTC
Created time.Time `json:"created"`
// Config is the container's configuration.
Config configs.Config `json:"config"`
}
// A libcontainer container object.
//
// Each container is thread-safe within the same process. Since a container can
// be destroyed by a separate process, any function may return that the container
// was not found. BaseContainer includes methods that are platform agnostic.
type BaseContainer interface {
// Returns the ID of the container
ID() string
// Returns the current status of the container.
//
// errors:
// ContainerDestroyed - Container no longer exists,
// Systemerror - System error.
Status() (Status, error)
// State returns the current container's state information.
//
// errors:
// Systemerror - System error.
State() (*State, error)
// Returns the current config of the container.
Config() configs.Config
// Returns the PIDs inside this container. The PIDs are in the namespace of the calling process.
//
// errors:
// ContainerDestroyed - Container no longer exists,
// Systemerror - System error.
//
// Some of the returned PIDs may no longer refer to processes in the Container, unless
// the Container state is PAUSED in which case every PID in the slice is valid.
Processes() ([]int, error)
// Returns statistics for the container.
//
// errors:
// ContainerDestroyed - Container no longer exists,
// Systemerror - System error.
Stats() (*Stats, error)
// Set resources of container as configured
//
// We can use this to change resources when containers are running.
//
// errors:
// Systemerror - System error.
Set(config configs.Config) error
// Start a process inside the container. Returns error if process fails to
// start. You can track process lifecycle with passed Process structure.
//
// errors:
// ContainerDestroyed - Container no longer exists,
// ConfigInvalid - config is invalid,
// ContainerPaused - Container is paused,
// Systemerror - System error.
Start(process *Process) (err error)
// Destroys the container after killing all running processes.
//
// Any event registrations are removed before the container is destroyed.
// No error is returned if the container is already destroyed.
//
// errors:
// Systemerror - System error.
Destroy() error
// Signal sends the provided signal code to the container's initial process.
//
// errors:
// Systemerror - System error.
Signal(s os.Signal) error
}

File diff suppressed because it is too large Load diff

View file

@ -1,20 +0,0 @@
package libcontainer
// State represents a running container's state
type State struct {
BaseState
// Platform specific fields below here
}
// A libcontainer container object.
//
// Each container is thread-safe within the same process. Since a container can
// be destroyed by a separate process, any function may return that the container
// was not found.
type Container interface {
BaseContainer
// Methods below here are platform specific
}

View file

@ -1,36 +0,0 @@
// +build linux freebsd
package libcontainer
// cgroup restoring strategy provided by criu
type cg_mode uint32
const (
CRIU_CG_MODE_SOFT cg_mode = 3 + iota // restore cgroup properties if only dir created by criu
CRIU_CG_MODE_FULL // always restore all cgroups and their properties
CRIU_CG_MODE_STRICT // restore all, requiring them to not present in the system
CRIU_CG_MODE_DEFAULT // the same as CRIU_CG_MODE_SOFT
)
type CriuPageServerInfo struct {
Address string // IP address of CRIU page server
Port int32 // port number of CRIU page server
}
type VethPairName struct {
ContainerInterfaceName string
HostInterfaceName string
}
type CriuOpts struct {
ImagesDirectory string // directory for storing image files
WorkDirectory string // directory to cd and write logs/pidfiles/stats to
LeaveRunning bool // leave container in running state after checkpoint
TcpEstablished bool // checkpoint/restore established TCP connections
ExternalUnixConnections bool // allow external unix connections
ShellJob bool // allow to dump and restore shell jobs
FileLocks bool // handle file locks, for safety
PageServer CriuPageServerInfo // allow to dump to criu page server
VethPairs []VethPairName // pass the veth to criu when restore
ManageCgroupsMode cg_mode // dump or restore cgroup mode
}

View file

@ -1,6 +0,0 @@
package libcontainer
// TODO Windows: This can ultimately be entirely factored out as criu is
// a Unix concept not relevant on Windows.
type CriuOpts struct {
}

View file

@ -1,2 +0,0 @@
gen: criurpc.proto
protoc --go_out=. criurpc.proto

View file

@ -1,689 +0,0 @@
// Code generated by protoc-gen-go.
// source: criurpc.proto
// DO NOT EDIT!
/*
Package criurpc is a generated protocol buffer package.
It is generated from these files:
criurpc.proto
It has these top-level messages:
CriuPageServerInfo
CriuVethPair
ExtMountMap
InheritFd
CgroupRoot
UnixSk
CriuOpts
CriuDumpResp
CriuRestoreResp
CriuNotify
CriuReq
CriuResp
*/
package criurpc
import proto "github.com/golang/protobuf/proto"
import math "math"
// Reference imports to suppress errors if they are not otherwise used.
var _ = proto.Marshal
var _ = math.Inf
type CriuReqType int32
const (
CriuReqType_EMPTY CriuReqType = 0
CriuReqType_DUMP CriuReqType = 1
CriuReqType_RESTORE CriuReqType = 2
CriuReqType_CHECK CriuReqType = 3
CriuReqType_PRE_DUMP CriuReqType = 4
CriuReqType_PAGE_SERVER CriuReqType = 5
CriuReqType_NOTIFY CriuReqType = 6
CriuReqType_CPUINFO_DUMP CriuReqType = 7
CriuReqType_CPUINFO_CHECK CriuReqType = 8
)
var CriuReqType_name = map[int32]string{
0: "EMPTY",
1: "DUMP",
2: "RESTORE",
3: "CHECK",
4: "PRE_DUMP",
5: "PAGE_SERVER",
6: "NOTIFY",
7: "CPUINFO_DUMP",
8: "CPUINFO_CHECK",
}
var CriuReqType_value = map[string]int32{
"EMPTY": 0,
"DUMP": 1,
"RESTORE": 2,
"CHECK": 3,
"PRE_DUMP": 4,
"PAGE_SERVER": 5,
"NOTIFY": 6,
"CPUINFO_DUMP": 7,
"CPUINFO_CHECK": 8,
}
func (x CriuReqType) Enum() *CriuReqType {
p := new(CriuReqType)
*p = x
return p
}
func (x CriuReqType) String() string {
return proto.EnumName(CriuReqType_name, int32(x))
}
func (x *CriuReqType) UnmarshalJSON(data []byte) error {
value, err := proto.UnmarshalJSONEnum(CriuReqType_value, data, "CriuReqType")
if err != nil {
return err
}
*x = CriuReqType(value)
return nil
}
type CriuPageServerInfo struct {
Address *string `protobuf:"bytes,1,opt,name=address" json:"address,omitempty"`
Port *int32 `protobuf:"varint,2,opt,name=port" json:"port,omitempty"`
Pid *int32 `protobuf:"varint,3,opt,name=pid" json:"pid,omitempty"`
Fd *int32 `protobuf:"varint,4,opt,name=fd" json:"fd,omitempty"`
XXX_unrecognized []byte `json:"-"`
}
func (m *CriuPageServerInfo) Reset() { *m = CriuPageServerInfo{} }
func (m *CriuPageServerInfo) String() string { return proto.CompactTextString(m) }
func (*CriuPageServerInfo) ProtoMessage() {}
func (m *CriuPageServerInfo) GetAddress() string {
if m != nil && m.Address != nil {
return *m.Address
}
return ""
}
func (m *CriuPageServerInfo) GetPort() int32 {
if m != nil && m.Port != nil {
return *m.Port
}
return 0
}
func (m *CriuPageServerInfo) GetPid() int32 {
if m != nil && m.Pid != nil {
return *m.Pid
}
return 0
}
func (m *CriuPageServerInfo) GetFd() int32 {
if m != nil && m.Fd != nil {
return *m.Fd
}
return 0
}
type CriuVethPair struct {
IfIn *string `protobuf:"bytes,1,req,name=if_in" json:"if_in,omitempty"`
IfOut *string `protobuf:"bytes,2,req,name=if_out" json:"if_out,omitempty"`
XXX_unrecognized []byte `json:"-"`
}
func (m *CriuVethPair) Reset() { *m = CriuVethPair{} }
func (m *CriuVethPair) String() string { return proto.CompactTextString(m) }
func (*CriuVethPair) ProtoMessage() {}
func (m *CriuVethPair) GetIfIn() string {
if m != nil && m.IfIn != nil {
return *m.IfIn
}
return ""
}
func (m *CriuVethPair) GetIfOut() string {
if m != nil && m.IfOut != nil {
return *m.IfOut
}
return ""
}
type ExtMountMap struct {
Key *string `protobuf:"bytes,1,req,name=key" json:"key,omitempty"`
Val *string `protobuf:"bytes,2,req,name=val" json:"val,omitempty"`
XXX_unrecognized []byte `json:"-"`
}
func (m *ExtMountMap) Reset() { *m = ExtMountMap{} }
func (m *ExtMountMap) String() string { return proto.CompactTextString(m) }
func (*ExtMountMap) ProtoMessage() {}
func (m *ExtMountMap) GetKey() string {
if m != nil && m.Key != nil {
return *m.Key
}
return ""
}
func (m *ExtMountMap) GetVal() string {
if m != nil && m.Val != nil {
return *m.Val
}
return ""
}
type InheritFd struct {
Key *string `protobuf:"bytes,1,req,name=key" json:"key,omitempty"`
Fd *int32 `protobuf:"varint,2,req,name=fd" json:"fd,omitempty"`
XXX_unrecognized []byte `json:"-"`
}
func (m *InheritFd) Reset() { *m = InheritFd{} }
func (m *InheritFd) String() string { return proto.CompactTextString(m) }
func (*InheritFd) ProtoMessage() {}
func (m *InheritFd) GetKey() string {
if m != nil && m.Key != nil {
return *m.Key
}
return ""
}
func (m *InheritFd) GetFd() int32 {
if m != nil && m.Fd != nil {
return *m.Fd
}
return 0
}
type CgroupRoot struct {
Ctrl *string `protobuf:"bytes,1,opt,name=ctrl" json:"ctrl,omitempty"`
Path *string `protobuf:"bytes,2,req,name=path" json:"path,omitempty"`
XXX_unrecognized []byte `json:"-"`
}
func (m *CgroupRoot) Reset() { *m = CgroupRoot{} }
func (m *CgroupRoot) String() string { return proto.CompactTextString(m) }
func (*CgroupRoot) ProtoMessage() {}
func (m *CgroupRoot) GetCtrl() string {
if m != nil && m.Ctrl != nil {
return *m.Ctrl
}
return ""
}
func (m *CgroupRoot) GetPath() string {
if m != nil && m.Path != nil {
return *m.Path
}
return ""
}
type UnixSk struct {
Inode *uint32 `protobuf:"varint,1,req,name=inode" json:"inode,omitempty"`
XXX_unrecognized []byte `json:"-"`
}
func (m *UnixSk) Reset() { *m = UnixSk{} }
func (m *UnixSk) String() string { return proto.CompactTextString(m) }
func (*UnixSk) ProtoMessage() {}
func (m *UnixSk) GetInode() uint32 {
if m != nil && m.Inode != nil {
return *m.Inode
}
return 0
}
type CriuOpts struct {
ImagesDirFd *int32 `protobuf:"varint,1,req,name=images_dir_fd" json:"images_dir_fd,omitempty"`
Pid *int32 `protobuf:"varint,2,opt,name=pid" json:"pid,omitempty"`
LeaveRunning *bool `protobuf:"varint,3,opt,name=leave_running" json:"leave_running,omitempty"`
ExtUnixSk *bool `protobuf:"varint,4,opt,name=ext_unix_sk" json:"ext_unix_sk,omitempty"`
TcpEstablished *bool `protobuf:"varint,5,opt,name=tcp_established" json:"tcp_established,omitempty"`
EvasiveDevices *bool `protobuf:"varint,6,opt,name=evasive_devices" json:"evasive_devices,omitempty"`
ShellJob *bool `protobuf:"varint,7,opt,name=shell_job" json:"shell_job,omitempty"`
FileLocks *bool `protobuf:"varint,8,opt,name=file_locks" json:"file_locks,omitempty"`
LogLevel *int32 `protobuf:"varint,9,opt,name=log_level,def=2" json:"log_level,omitempty"`
LogFile *string `protobuf:"bytes,10,opt,name=log_file" json:"log_file,omitempty"`
Ps *CriuPageServerInfo `protobuf:"bytes,11,opt,name=ps" json:"ps,omitempty"`
NotifyScripts *bool `protobuf:"varint,12,opt,name=notify_scripts" json:"notify_scripts,omitempty"`
Root *string `protobuf:"bytes,13,opt,name=root" json:"root,omitempty"`
ParentImg *string `protobuf:"bytes,14,opt,name=parent_img" json:"parent_img,omitempty"`
TrackMem *bool `protobuf:"varint,15,opt,name=track_mem" json:"track_mem,omitempty"`
AutoDedup *bool `protobuf:"varint,16,opt,name=auto_dedup" json:"auto_dedup,omitempty"`
WorkDirFd *int32 `protobuf:"varint,17,opt,name=work_dir_fd" json:"work_dir_fd,omitempty"`
LinkRemap *bool `protobuf:"varint,18,opt,name=link_remap" json:"link_remap,omitempty"`
Veths []*CriuVethPair `protobuf:"bytes,19,rep,name=veths" json:"veths,omitempty"`
CpuCap *uint32 `protobuf:"varint,20,opt,name=cpu_cap,def=4294967295" json:"cpu_cap,omitempty"`
ForceIrmap *bool `protobuf:"varint,21,opt,name=force_irmap" json:"force_irmap,omitempty"`
ExecCmd []string `protobuf:"bytes,22,rep,name=exec_cmd" json:"exec_cmd,omitempty"`
ExtMnt []*ExtMountMap `protobuf:"bytes,23,rep,name=ext_mnt" json:"ext_mnt,omitempty"`
ManageCgroups *bool `protobuf:"varint,24,opt,name=manage_cgroups" json:"manage_cgroups,omitempty"`
CgRoot []*CgroupRoot `protobuf:"bytes,25,rep,name=cg_root" json:"cg_root,omitempty"`
RstSibling *bool `protobuf:"varint,26,opt,name=rst_sibling" json:"rst_sibling,omitempty"`
InheritFd []*InheritFd `protobuf:"bytes,27,rep,name=inherit_fd" json:"inherit_fd,omitempty"`
AutoExtMnt *bool `protobuf:"varint,28,opt,name=auto_ext_mnt" json:"auto_ext_mnt,omitempty"`
ExtSharing *bool `protobuf:"varint,29,opt,name=ext_sharing" json:"ext_sharing,omitempty"`
ExtMasters *bool `protobuf:"varint,30,opt,name=ext_masters" json:"ext_masters,omitempty"`
SkipMnt []string `protobuf:"bytes,31,rep,name=skip_mnt" json:"skip_mnt,omitempty"`
EnableFs []string `protobuf:"bytes,32,rep,name=enable_fs" json:"enable_fs,omitempty"`
UnixSkIno []*UnixSk `protobuf:"bytes,33,rep,name=unix_sk_ino" json:"unix_sk_ino,omitempty"`
ManageCgroupsMode *uint32 `protobuf:"varint,34,opt,name=manage_cgroups_mode" json:"manage_cgroups_mode,omitempty"`
XXX_unrecognized []byte `json:"-"`
}
func (m *CriuOpts) Reset() { *m = CriuOpts{} }
func (m *CriuOpts) String() string { return proto.CompactTextString(m) }
func (*CriuOpts) ProtoMessage() {}
const Default_CriuOpts_LogLevel int32 = 2
const Default_CriuOpts_CpuCap uint32 = 4294967295
func (m *CriuOpts) GetImagesDirFd() int32 {
if m != nil && m.ImagesDirFd != nil {
return *m.ImagesDirFd
}
return 0
}
func (m *CriuOpts) GetPid() int32 {
if m != nil && m.Pid != nil {
return *m.Pid
}
return 0
}
func (m *CriuOpts) GetLeaveRunning() bool {
if m != nil && m.LeaveRunning != nil {
return *m.LeaveRunning
}
return false
}
func (m *CriuOpts) GetExtUnixSk() bool {
if m != nil && m.ExtUnixSk != nil {
return *m.ExtUnixSk
}
return false
}
func (m *CriuOpts) GetTcpEstablished() bool {
if m != nil && m.TcpEstablished != nil {
return *m.TcpEstablished
}
return false
}
func (m *CriuOpts) GetEvasiveDevices() bool {
if m != nil && m.EvasiveDevices != nil {
return *m.EvasiveDevices
}
return false
}
func (m *CriuOpts) GetShellJob() bool {
if m != nil && m.ShellJob != nil {
return *m.ShellJob
}
return false
}
func (m *CriuOpts) GetFileLocks() bool {
if m != nil && m.FileLocks != nil {
return *m.FileLocks
}
return false
}
func (m *CriuOpts) GetLogLevel() int32 {
if m != nil && m.LogLevel != nil {
return *m.LogLevel
}
return Default_CriuOpts_LogLevel
}
func (m *CriuOpts) GetLogFile() string {
if m != nil && m.LogFile != nil {
return *m.LogFile
}
return ""
}
func (m *CriuOpts) GetPs() *CriuPageServerInfo {
if m != nil {
return m.Ps
}
return nil
}
func (m *CriuOpts) GetNotifyScripts() bool {
if m != nil && m.NotifyScripts != nil {
return *m.NotifyScripts
}
return false
}
func (m *CriuOpts) GetRoot() string {
if m != nil && m.Root != nil {
return *m.Root
}
return ""
}
func (m *CriuOpts) GetParentImg() string {
if m != nil && m.ParentImg != nil {
return *m.ParentImg
}
return ""
}
func (m *CriuOpts) GetTrackMem() bool {
if m != nil && m.TrackMem != nil {
return *m.TrackMem
}
return false
}
func (m *CriuOpts) GetAutoDedup() bool {
if m != nil && m.AutoDedup != nil {
return *m.AutoDedup
}
return false
}
func (m *CriuOpts) GetWorkDirFd() int32 {
if m != nil && m.WorkDirFd != nil {
return *m.WorkDirFd
}
return 0
}
func (m *CriuOpts) GetLinkRemap() bool {
if m != nil && m.LinkRemap != nil {
return *m.LinkRemap
}
return false
}
func (m *CriuOpts) GetVeths() []*CriuVethPair {
if m != nil {
return m.Veths
}
return nil
}
func (m *CriuOpts) GetCpuCap() uint32 {
if m != nil && m.CpuCap != nil {
return *m.CpuCap
}
return Default_CriuOpts_CpuCap
}
func (m *CriuOpts) GetForceIrmap() bool {
if m != nil && m.ForceIrmap != nil {
return *m.ForceIrmap
}
return false
}
func (m *CriuOpts) GetExecCmd() []string {
if m != nil {
return m.ExecCmd
}
return nil
}
func (m *CriuOpts) GetExtMnt() []*ExtMountMap {
if m != nil {
return m.ExtMnt
}
return nil
}
func (m *CriuOpts) GetManageCgroups() bool {
if m != nil && m.ManageCgroups != nil {
return *m.ManageCgroups
}
return false
}
func (m *CriuOpts) GetCgRoot() []*CgroupRoot {
if m != nil {
return m.CgRoot
}
return nil
}
func (m *CriuOpts) GetRstSibling() bool {
if m != nil && m.RstSibling != nil {
return *m.RstSibling
}
return false
}
func (m *CriuOpts) GetInheritFd() []*InheritFd {
if m != nil {
return m.InheritFd
}
return nil
}
func (m *CriuOpts) GetAutoExtMnt() bool {
if m != nil && m.AutoExtMnt != nil {
return *m.AutoExtMnt
}
return false
}
func (m *CriuOpts) GetExtSharing() bool {
if m != nil && m.ExtSharing != nil {
return *m.ExtSharing
}
return false
}
func (m *CriuOpts) GetExtMasters() bool {
if m != nil && m.ExtMasters != nil {
return *m.ExtMasters
}
return false
}
func (m *CriuOpts) GetSkipMnt() []string {
if m != nil {
return m.SkipMnt
}
return nil
}
func (m *CriuOpts) GetEnableFs() []string {
if m != nil {
return m.EnableFs
}
return nil
}
func (m *CriuOpts) GetUnixSkIno() []*UnixSk {
if m != nil {
return m.UnixSkIno
}
return nil
}
func (m *CriuOpts) GetManageCgroupsMode() uint32 {
if m != nil && m.ManageCgroupsMode != nil {
return *m.ManageCgroupsMode
}
return 0
}
type CriuDumpResp struct {
Restored *bool `protobuf:"varint,1,opt,name=restored" json:"restored,omitempty"`
XXX_unrecognized []byte `json:"-"`
}
func (m *CriuDumpResp) Reset() { *m = CriuDumpResp{} }
func (m *CriuDumpResp) String() string { return proto.CompactTextString(m) }
func (*CriuDumpResp) ProtoMessage() {}
func (m *CriuDumpResp) GetRestored() bool {
if m != nil && m.Restored != nil {
return *m.Restored
}
return false
}
type CriuRestoreResp struct {
Pid *int32 `protobuf:"varint,1,req,name=pid" json:"pid,omitempty"`
XXX_unrecognized []byte `json:"-"`
}
func (m *CriuRestoreResp) Reset() { *m = CriuRestoreResp{} }
func (m *CriuRestoreResp) String() string { return proto.CompactTextString(m) }
func (*CriuRestoreResp) ProtoMessage() {}
func (m *CriuRestoreResp) GetPid() int32 {
if m != nil && m.Pid != nil {
return *m.Pid
}
return 0
}
type CriuNotify struct {
Script *string `protobuf:"bytes,1,opt,name=script" json:"script,omitempty"`
Pid *int32 `protobuf:"varint,2,opt,name=pid" json:"pid,omitempty"`
XXX_unrecognized []byte `json:"-"`
}
func (m *CriuNotify) Reset() { *m = CriuNotify{} }
func (m *CriuNotify) String() string { return proto.CompactTextString(m) }
func (*CriuNotify) ProtoMessage() {}
func (m *CriuNotify) GetScript() string {
if m != nil && m.Script != nil {
return *m.Script
}
return ""
}
func (m *CriuNotify) GetPid() int32 {
if m != nil && m.Pid != nil {
return *m.Pid
}
return 0
}
type CriuReq struct {
Type *CriuReqType `protobuf:"varint,1,req,name=type,enum=CriuReqType" json:"type,omitempty"`
Opts *CriuOpts `protobuf:"bytes,2,opt,name=opts" json:"opts,omitempty"`
NotifySuccess *bool `protobuf:"varint,3,opt,name=notify_success" json:"notify_success,omitempty"`
//
// When set service won't close the connection but
// will wait for more req-s to appear. Works not
// for all request types.
KeepOpen *bool `protobuf:"varint,4,opt,name=keep_open" json:"keep_open,omitempty"`
XXX_unrecognized []byte `json:"-"`
}
func (m *CriuReq) Reset() { *m = CriuReq{} }
func (m *CriuReq) String() string { return proto.CompactTextString(m) }
func (*CriuReq) ProtoMessage() {}
func (m *CriuReq) GetType() CriuReqType {
if m != nil && m.Type != nil {
return *m.Type
}
return CriuReqType_EMPTY
}
func (m *CriuReq) GetOpts() *CriuOpts {
if m != nil {
return m.Opts
}
return nil
}
func (m *CriuReq) GetNotifySuccess() bool {
if m != nil && m.NotifySuccess != nil {
return *m.NotifySuccess
}
return false
}
func (m *CriuReq) GetKeepOpen() bool {
if m != nil && m.KeepOpen != nil {
return *m.KeepOpen
}
return false
}
type CriuResp struct {
Type *CriuReqType `protobuf:"varint,1,req,name=type,enum=CriuReqType" json:"type,omitempty"`
Success *bool `protobuf:"varint,2,req,name=success" json:"success,omitempty"`
Dump *CriuDumpResp `protobuf:"bytes,3,opt,name=dump" json:"dump,omitempty"`
Restore *CriuRestoreResp `protobuf:"bytes,4,opt,name=restore" json:"restore,omitempty"`
Notify *CriuNotify `protobuf:"bytes,5,opt,name=notify" json:"notify,omitempty"`
Ps *CriuPageServerInfo `protobuf:"bytes,6,opt,name=ps" json:"ps,omitempty"`
CrErrno *int32 `protobuf:"varint,7,opt,name=cr_errno" json:"cr_errno,omitempty"`
XXX_unrecognized []byte `json:"-"`
}
func (m *CriuResp) Reset() { *m = CriuResp{} }
func (m *CriuResp) String() string { return proto.CompactTextString(m) }
func (*CriuResp) ProtoMessage() {}
func (m *CriuResp) GetType() CriuReqType {
if m != nil && m.Type != nil {
return *m.Type
}
return CriuReqType_EMPTY
}
func (m *CriuResp) GetSuccess() bool {
if m != nil && m.Success != nil {
return *m.Success
}
return false
}
func (m *CriuResp) GetDump() *CriuDumpResp {
if m != nil {
return m.Dump
}
return nil
}
func (m *CriuResp) GetRestore() *CriuRestoreResp {
if m != nil {
return m.Restore
}
return nil
}
func (m *CriuResp) GetNotify() *CriuNotify {
if m != nil {
return m.Notify
}
return nil
}
func (m *CriuResp) GetPs() *CriuPageServerInfo {
if m != nil {
return m.Ps
}
return nil
}
func (m *CriuResp) GetCrErrno() int32 {
if m != nil && m.CrErrno != nil {
return *m.CrErrno
}
return 0
}
func init() {
proto.RegisterEnum("CriuReqType", CriuReqType_name, CriuReqType_value)
}

View file

@ -1,142 +0,0 @@
message criu_page_server_info {
optional string address = 1;
optional int32 port = 2;
optional int32 pid = 3;
optional int32 fd = 4;
}
message criu_veth_pair {
required string if_in = 1;
required string if_out = 2;
};
message ext_mount_map {
required string key = 1;
required string val = 2;
};
message inherit_fd {
required string key = 1;
required int32 fd = 2;
};
message cgroup_root {
optional string ctrl = 1;
required string path = 2;
};
message unix_sk {
required uint32 inode = 1;
};
message criu_opts {
required int32 images_dir_fd = 1;
optional int32 pid = 2; /* if not set on dump, will dump requesting process */
optional bool leave_running = 3;
optional bool ext_unix_sk = 4;
optional bool tcp_established = 5;
optional bool evasive_devices = 6;
optional bool shell_job = 7;
optional bool file_locks = 8;
optional int32 log_level = 9 [default = 2];
optional string log_file = 10; /* No subdirs are allowed. Consider using work-dir */
optional criu_page_server_info ps = 11;
optional bool notify_scripts = 12;
optional string root = 13;
optional string parent_img = 14;
optional bool track_mem = 15;
optional bool auto_dedup = 16;
optional int32 work_dir_fd = 17;
optional bool link_remap = 18;
repeated criu_veth_pair veths = 19;
optional uint32 cpu_cap = 20 [default = 0xffffffff];
optional bool force_irmap = 21;
repeated string exec_cmd = 22;
repeated ext_mount_map ext_mnt = 23;
optional bool manage_cgroups = 24; /* backward compatibility */
repeated cgroup_root cg_root = 25;
optional bool rst_sibling = 26; /* swrk only */
repeated inherit_fd inherit_fd = 27; /* swrk only */
optional bool auto_ext_mnt = 28;
optional bool ext_sharing = 29;
optional bool ext_masters = 30;
repeated string skip_mnt = 31;
repeated string enable_fs = 32;
repeated unix_sk unix_sk_ino = 33;
optional uint32 manage_cgroups_mode = 34;
}
message criu_dump_resp {
optional bool restored = 1;
}
message criu_restore_resp {
required int32 pid = 1;
}
message criu_notify {
optional string script = 1;
optional int32 pid = 2;
}
enum criu_req_type {
EMPTY = 0;
DUMP = 1;
RESTORE = 2;
CHECK = 3;
PRE_DUMP = 4;
PAGE_SERVER = 5;
NOTIFY = 6;
CPUINFO_DUMP = 7;
CPUINFO_CHECK = 8;
}
/*
* Request -- each type corresponds to must-be-there
* request arguments of respective type
*/
message criu_req {
required criu_req_type type = 1;
optional criu_opts opts = 2;
optional bool notify_success = 3;
/*
* When set service won't close the connection but
* will wait for more req-s to appear. Works not
* for all request types.
*/
optional bool keep_open = 4;
}
/*
* Responce -- it states whether the request was served
* and additional request-specific informarion
*/
message criu_resp {
required criu_req_type type = 1;
required bool success = 2;
optional criu_dump_resp dump = 3;
optional criu_restore_resp restore = 4;
optional criu_notify notify = 5;
optional criu_page_server_info ps = 6;
optional int32 cr_errno = 7;
}

View file

@ -1,70 +0,0 @@
package libcontainer
import "io"
// API error code type.
type ErrorCode int
// API error codes.
const (
// Factory errors
IdInUse ErrorCode = iota
InvalidIdFormat
// Container errors
ContainerNotExists
ContainerPaused
ContainerNotStopped
ContainerNotRunning
ContainerNotPaused
// Process errors
NoProcessOps
// Common errors
ConfigInvalid
ConsoleExists
SystemError
)
func (c ErrorCode) String() string {
switch c {
case IdInUse:
return "Id already in use"
case InvalidIdFormat:
return "Invalid format"
case ContainerPaused:
return "Container paused"
case ConfigInvalid:
return "Invalid configuration"
case SystemError:
return "System error"
case ContainerNotExists:
return "Container does not exist"
case ContainerNotStopped:
return "Container is not stopped"
case ContainerNotRunning:
return "Container is not running"
case ConsoleExists:
return "Console exists for process"
case ContainerNotPaused:
return "Container is not paused"
case NoProcessOps:
return "No process operations"
default:
return "Unknown error"
}
}
// API Error type.
type Error interface {
error
// Returns a verbose string including the error message
// and a representation of the stack trace suitable for
// printing.
Detail(w io.Writer) error
// Returns the error code for this error.
Code() ErrorCode
}

View file

@ -1,45 +0,0 @@
package libcontainer
import (
"github.com/opencontainers/runc/libcontainer/configs"
)
type Factory interface {
// Creates a new container with the given id and starts the initial process inside it.
// id must be a string containing only letters, digits and underscores and must contain
// between 1 and 1024 characters, inclusive.
//
// The id must not already be in use by an existing container. Containers created using
// a factory with the same path (and file system) must have distinct ids.
//
// Returns the new container with a running process.
//
// errors:
// IdInUse - id is already in use by a container
// InvalidIdFormat - id has incorrect format
// ConfigInvalid - config is invalid
// Systemerror - System error
//
// On error, any partially created container parts are cleaned up (the operation is atomic).
Create(id string, config *configs.Config) (Container, error)
// Load takes an ID for an existing container and returns the container information
// from the state. This presents a read only view of the container.
//
// errors:
// Path does not exist
// Container is stopped
// System error
Load(id string) (Container, error)
// StartInitialization is an internal API to libcontainer used during the reexec of the
// container.
//
// Errors:
// Pipe connection error
// System error
StartInitialization() error
// Type returns info string about factory type (e.g. lxc, libcontainer...)
Type() string
}

View file

@ -1,295 +0,0 @@
// +build linux
package libcontainer
import (
"encoding/json"
"fmt"
"os"
"os/exec"
"path/filepath"
"regexp"
"strconv"
"syscall"
"github.com/docker/docker/pkg/mount"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fs"
"github.com/opencontainers/runc/libcontainer/cgroups/systemd"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/configs/validate"
"github.com/opencontainers/runc/libcontainer/utils"
)
const (
stateFilename = "state.json"
)
var (
idRegex = regexp.MustCompile(`^[\w_-]+$`)
maxIdLen = 1024
)
// InitArgs returns an options func to configure a LinuxFactory with the
// provided init arguments.
func InitArgs(args ...string) func(*LinuxFactory) error {
return func(l *LinuxFactory) error {
name := args[0]
if filepath.Base(name) == name {
if lp, err := exec.LookPath(name); err == nil {
name = lp
}
} else {
abs, err := filepath.Abs(name)
if err != nil {
return err
}
name = abs
}
l.InitPath = "/proc/self/exe"
l.InitArgs = append([]string{name}, args[1:]...)
return nil
}
}
// InitPath returns an options func to configure a LinuxFactory with the
// provided absolute path to the init binary and arguements.
func InitPath(path string, args ...string) func(*LinuxFactory) error {
return func(l *LinuxFactory) error {
l.InitPath = path
l.InitArgs = args
return nil
}
}
// SystemdCgroups is an options func to configure a LinuxFactory to return
// containers that use systemd to create and manage cgroups.
func SystemdCgroups(l *LinuxFactory) error {
l.NewCgroupsManager = func(config *configs.Cgroup, paths map[string]string) cgroups.Manager {
return &systemd.Manager{
Cgroups: config,
Paths: paths,
}
}
return nil
}
// Cgroupfs is an options func to configure a LinuxFactory to return
// containers that use the native cgroups filesystem implementation to
// create and manage cgroups.
func Cgroupfs(l *LinuxFactory) error {
l.NewCgroupsManager = func(config *configs.Cgroup, paths map[string]string) cgroups.Manager {
return &fs.Manager{
Cgroups: config,
Paths: paths,
}
}
return nil
}
// TmpfsRoot is an option func to mount LinuxFactory.Root to tmpfs.
func TmpfsRoot(l *LinuxFactory) error {
mounted, err := mount.Mounted(l.Root)
if err != nil {
return err
}
if !mounted {
if err := syscall.Mount("tmpfs", l.Root, "tmpfs", 0, ""); err != nil {
return err
}
}
return nil
}
// New returns a linux based container factory based in the root directory and
// configures the factory with the provided option funcs.
func New(root string, options ...func(*LinuxFactory) error) (Factory, error) {
if root != "" {
if err := os.MkdirAll(root, 0700); err != nil {
return nil, newGenericError(err, SystemError)
}
}
l := &LinuxFactory{
Root: root,
Validator: validate.New(),
CriuPath: "criu",
}
InitArgs(os.Args[0], "init")(l)
Cgroupfs(l)
for _, opt := range options {
if err := opt(l); err != nil {
return nil, err
}
}
return l, nil
}
// LinuxFactory implements the default factory interface for linux based systems.
type LinuxFactory struct {
// Root directory for the factory to store state.
Root string
// InitPath is the absolute path to the init binary.
InitPath string
// InitArgs are arguments for calling the init responsibilities for spawning
// a container.
InitArgs []string
// CriuPath is the path to the criu binary used for checkpoint and restore of
// containers.
CriuPath string
// Validator provides validation to container configurations.
Validator validate.Validator
// NewCgroupsManager returns an initialized cgroups manager for a single container.
NewCgroupsManager func(config *configs.Cgroup, paths map[string]string) cgroups.Manager
}
func (l *LinuxFactory) Create(id string, config *configs.Config) (Container, error) {
if l.Root == "" {
return nil, newGenericError(fmt.Errorf("invalid root"), ConfigInvalid)
}
if err := l.validateID(id); err != nil {
return nil, err
}
if err := l.Validator.Validate(config); err != nil {
return nil, newGenericError(err, ConfigInvalid)
}
containerRoot := filepath.Join(l.Root, id)
if _, err := os.Stat(containerRoot); err == nil {
return nil, newGenericError(fmt.Errorf("container with id exists: %v", id), IdInUse)
} else if !os.IsNotExist(err) {
return nil, newGenericError(err, SystemError)
}
if err := os.MkdirAll(containerRoot, 0700); err != nil {
return nil, newGenericError(err, SystemError)
}
c := &linuxContainer{
id: id,
root: containerRoot,
config: config,
initPath: l.InitPath,
initArgs: l.InitArgs,
criuPath: l.CriuPath,
cgroupManager: l.NewCgroupsManager(config.Cgroups, nil),
}
c.state = &stoppedState{c: c}
return c, nil
}
func (l *LinuxFactory) Load(id string) (Container, error) {
if l.Root == "" {
return nil, newGenericError(fmt.Errorf("invalid root"), ConfigInvalid)
}
containerRoot := filepath.Join(l.Root, id)
state, err := l.loadState(containerRoot)
if err != nil {
return nil, err
}
r := &nonChildProcess{
processPid: state.InitProcessPid,
processStartTime: state.InitProcessStartTime,
fds: state.ExternalDescriptors,
}
c := &linuxContainer{
initProcess: r,
id: id,
config: &state.Config,
initPath: l.InitPath,
initArgs: l.InitArgs,
criuPath: l.CriuPath,
cgroupManager: l.NewCgroupsManager(state.Config.Cgroups, state.CgroupPaths),
root: containerRoot,
created: state.Created,
}
c.state = &createdState{c: c, s: Created}
if err := c.refreshState(); err != nil {
return nil, err
}
return c, nil
}
func (l *LinuxFactory) Type() string {
return "libcontainer"
}
// StartInitialization loads a container by opening the pipe fd from the parent to read the configuration and state
// This is a low level implementation detail of the reexec and should not be consumed externally
func (l *LinuxFactory) StartInitialization() (err error) {
fdStr := os.Getenv("_LIBCONTAINER_INITPIPE")
pipefd, err := strconv.Atoi(fdStr)
if err != nil {
return fmt.Errorf("error converting env var _LIBCONTAINER_INITPIPE(%q) to an int: %s", fdStr, err)
}
var (
pipe = os.NewFile(uintptr(pipefd), "pipe")
it = initType(os.Getenv("_LIBCONTAINER_INITTYPE"))
)
defer pipe.Close()
// clear the current process's environment to clean any libcontainer
// specific env vars.
os.Clearenv()
i, err := newContainerInit(it, pipe)
if err != nil {
l.sendError(nil, pipe, err)
return err
}
if err := i.Init(); err != nil {
if !isExecError(err) {
l.sendError(i, pipe, err)
}
return err
}
return nil
}
func (l *LinuxFactory) sendError(i initer, pipe *os.File, err error) {
// We have an error during the initialization of the container's init,
// send it back to the parent process in the form of an initError.
// If container's init successed, syscall.Exec will not return, hence
// this defer function will never be called.
if i != nil {
if _, ok := i.(*linuxStandardInit); ok {
// Synchronisation only necessary for standard init.
if err := utils.WriteJSON(pipe, syncT{procError}); err != nil {
panic(err)
}
}
}
if err := utils.WriteJSON(pipe, newSystemError(err)); err != nil {
panic(err)
}
}
func (l *LinuxFactory) loadState(root string) (*State, error) {
f, err := os.Open(filepath.Join(root, stateFilename))
if err != nil {
if os.IsNotExist(err) {
return nil, newGenericError(err, ContainerNotExists)
}
return nil, newGenericError(err, SystemError)
}
defer f.Close()
var state *State
if err := json.NewDecoder(f).Decode(&state); err != nil {
return nil, newGenericError(err, SystemError)
}
return state, nil
}
func (l *LinuxFactory) validateID(id string) error {
if !idRegex.MatchString(id) {
return newGenericError(fmt.Errorf("invalid id format: %v", id), InvalidIdFormat)
}
if len(id) > maxIdLen {
return newGenericError(fmt.Errorf("invalid id format: %v", id), InvalidIdFormat)
}
return nil
}
func isExecError(err error) bool {
_, ok := err.(*exec.Error)
return ok
}

View file

@ -1,88 +0,0 @@
package libcontainer
import (
"fmt"
"io"
"text/template"
"time"
"github.com/opencontainers/runc/libcontainer/stacktrace"
)
type syncType uint8
const (
procReady syncType = iota
procError
procRun
procHooks
procResume
)
type syncT struct {
Type syncType `json:"type"`
}
var errorTemplate = template.Must(template.New("error").Parse(`Timestamp: {{.Timestamp}}
Code: {{.ECode}}
{{if .Message }}
Message: {{.Message}}
{{end}}
Frames:{{range $i, $frame := .Stack.Frames}}
---
{{$i}}: {{$frame.Function}}
Package: {{$frame.Package}}
File: {{$frame.File}}@{{$frame.Line}}{{end}}
`))
func newGenericError(err error, c ErrorCode) Error {
if le, ok := err.(Error); ok {
return le
}
gerr := &genericError{
Timestamp: time.Now(),
Err: err,
ECode: c,
Stack: stacktrace.Capture(1),
}
if err != nil {
gerr.Message = err.Error()
}
return gerr
}
func newSystemError(err error) Error {
if le, ok := err.(Error); ok {
return le
}
gerr := &genericError{
Timestamp: time.Now(),
Err: err,
ECode: SystemError,
Stack: stacktrace.Capture(1),
}
if err != nil {
gerr.Message = err.Error()
}
return gerr
}
type genericError struct {
Timestamp time.Time
ECode ErrorCode
Err error `json:"-"`
Message string
Stack stacktrace.Stacktrace
}
func (e *genericError) Error() string {
return fmt.Sprintf("[%d] %s: %s", e.ECode, e.ECode, e.Message)
}
func (e *genericError) Code() ErrorCode {
return e.ECode
}
func (e *genericError) Detail(w io.Writer) error {
return errorTemplate.Execute(w, e)
}

View file

@ -1,364 +0,0 @@
// +build linux
package libcontainer
import (
"encoding/json"
"fmt"
"io"
"io/ioutil"
"net"
"os"
"strconv"
"strings"
"syscall"
"github.com/Sirupsen/logrus"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/system"
"github.com/opencontainers/runc/libcontainer/user"
"github.com/opencontainers/runc/libcontainer/utils"
"github.com/vishvananda/netlink"
)
type initType string
const (
initSetns initType = "setns"
initStandard initType = "standard"
)
type pid struct {
Pid int `json:"pid"`
}
// network is an internal struct used to setup container networks.
type network struct {
configs.Network
// TempVethPeerName is a unique temporary veth peer name that was placed into
// the container's namespace.
TempVethPeerName string `json:"temp_veth_peer_name"`
}
// initConfig is used for transferring parameters from Exec() to Init()
type initConfig struct {
Args []string `json:"args"`
Env []string `json:"env"`
Cwd string `json:"cwd"`
Capabilities []string `json:"capabilities"`
ProcessLabel string `json:"process_label"`
AppArmorProfile string `json:"apparmor_profile"`
NoNewPrivileges bool `json:"no_new_privileges"`
User string `json:"user"`
Config *configs.Config `json:"config"`
Console string `json:"console"`
Networks []*network `json:"network"`
PassedFilesCount int `json:"passed_files_count"`
ContainerId string `json:"containerid"`
}
type initer interface {
Init() error
}
func newContainerInit(t initType, pipe *os.File) (initer, error) {
var config *initConfig
if err := json.NewDecoder(pipe).Decode(&config); err != nil {
return nil, err
}
if err := populateProcessEnvironment(config.Env); err != nil {
return nil, err
}
switch t {
case initSetns:
return &linuxSetnsInit{
config: config,
}, nil
case initStandard:
return &linuxStandardInit{
pipe: pipe,
parentPid: syscall.Getppid(),
config: config,
}, nil
}
return nil, fmt.Errorf("unknown init type %q", t)
}
// populateProcessEnvironment loads the provided environment variables into the
// current processes's environment.
func populateProcessEnvironment(env []string) error {
for _, pair := range env {
p := strings.SplitN(pair, "=", 2)
if len(p) < 2 {
return fmt.Errorf("invalid environment '%v'", pair)
}
if err := os.Setenv(p[0], p[1]); err != nil {
return err
}
}
return nil
}
// finalizeNamespace drops the caps, sets the correct user
// and working dir, and closes any leaked file descriptors
// before executing the command inside the namespace
func finalizeNamespace(config *initConfig) error {
// Ensure that all unwanted fds we may have accidentally
// inherited are marked close-on-exec so they stay out of the
// container
if err := utils.CloseExecFrom(config.PassedFilesCount + 3); err != nil {
return err
}
capabilities := config.Config.Capabilities
if config.Capabilities != nil {
capabilities = config.Capabilities
}
w, err := newCapWhitelist(capabilities)
if err != nil {
return err
}
// drop capabilities in bounding set before changing user
if err := w.dropBoundingSet(); err != nil {
return err
}
// preserve existing capabilities while we change users
if err := system.SetKeepCaps(); err != nil {
return err
}
if err := setupUser(config); err != nil {
return err
}
if err := system.ClearKeepCaps(); err != nil {
return err
}
// drop all other capabilities
if err := w.drop(); err != nil {
return err
}
if config.Cwd != "" {
if err := syscall.Chdir(config.Cwd); err != nil {
return err
}
}
return nil
}
// syncParentReady sends to the given pipe a JSON payload which indicates that
// the init is ready to Exec the child process. It then waits for the parent to
// indicate that it is cleared to Exec.
func syncParentReady(pipe io.ReadWriter) error {
// Tell parent.
if err := utils.WriteJSON(pipe, syncT{procReady}); err != nil {
return err
}
// Wait for parent to give the all-clear.
var procSync syncT
if err := json.NewDecoder(pipe).Decode(&procSync); err != nil {
if err == io.EOF {
return fmt.Errorf("parent closed synchronisation channel")
}
if procSync.Type != procRun {
return fmt.Errorf("invalid synchronisation flag from parent")
}
}
return nil
}
// syncParentHooks sends to the given pipe a JSON payload which indicates that
// the parent should execute pre-start hooks. It then waits for the parent to
// indicate that it is cleared to resume.
func syncParentHooks(pipe io.ReadWriter) error {
// Tell parent.
if err := utils.WriteJSON(pipe, syncT{procHooks}); err != nil {
return err
}
// Wait for parent to give the all-clear.
var procSync syncT
if err := json.NewDecoder(pipe).Decode(&procSync); err != nil {
if err == io.EOF {
return fmt.Errorf("parent closed synchronisation channel")
}
if procSync.Type != procResume {
return fmt.Errorf("invalid synchronisation flag from parent")
}
}
return nil
}
// setupUser changes the groups, gid, and uid for the user inside the container
func setupUser(config *initConfig) error {
// Set up defaults.
defaultExecUser := user.ExecUser{
Uid: syscall.Getuid(),
Gid: syscall.Getgid(),
Home: "/",
}
passwdPath, err := user.GetPasswdPath()
if err != nil {
return err
}
groupPath, err := user.GetGroupPath()
if err != nil {
return err
}
execUser, err := user.GetExecUserPath(config.User, &defaultExecUser, passwdPath, groupPath)
if err != nil {
return err
}
var addGroups []int
if len(config.Config.AdditionalGroups) > 0 {
addGroups, err = user.GetAdditionalGroupsPath(config.Config.AdditionalGroups, groupPath)
if err != nil {
return err
}
}
// before we change to the container's user make sure that the processes STDIO
// is correctly owned by the user that we are switching to.
if err := fixStdioPermissions(execUser); err != nil {
return err
}
suppGroups := append(execUser.Sgids, addGroups...)
if err := syscall.Setgroups(suppGroups); err != nil {
return err
}
if err := system.Setgid(execUser.Gid); err != nil {
return err
}
if err := system.Setuid(execUser.Uid); err != nil {
return err
}
// if we didn't get HOME already, set it based on the user's HOME
if envHome := os.Getenv("HOME"); envHome == "" {
if err := os.Setenv("HOME", execUser.Home); err != nil {
return err
}
}
return nil
}
// fixStdioPermissions fixes the permissions of PID 1's STDIO within the container to the specified user.
// The ownership needs to match because it is created outside of the container and needs to be
// localized.
func fixStdioPermissions(u *user.ExecUser) error {
var null syscall.Stat_t
if err := syscall.Stat("/dev/null", &null); err != nil {
return err
}
for _, fd := range []uintptr{
os.Stdin.Fd(),
os.Stderr.Fd(),
os.Stdout.Fd(),
} {
var s syscall.Stat_t
if err := syscall.Fstat(int(fd), &s); err != nil {
return err
}
// skip chown of /dev/null if it was used as one of the STDIO fds.
if s.Rdev == null.Rdev {
continue
}
if err := syscall.Fchown(int(fd), u.Uid, u.Gid); err != nil {
return err
}
}
return nil
}
// setupNetwork sets up and initializes any network interface inside the container.
func setupNetwork(config *initConfig) error {
for _, config := range config.Networks {
strategy, err := getStrategy(config.Type)
if err != nil {
return err
}
if err := strategy.initialize(config); err != nil {
return err
}
}
return nil
}
func setupRoute(config *configs.Config) error {
for _, config := range config.Routes {
_, dst, err := net.ParseCIDR(config.Destination)
if err != nil {
return err
}
src := net.ParseIP(config.Source)
if src == nil {
return fmt.Errorf("Invalid source for route: %s", config.Source)
}
gw := net.ParseIP(config.Gateway)
if gw == nil {
return fmt.Errorf("Invalid gateway for route: %s", config.Gateway)
}
l, err := netlink.LinkByName(config.InterfaceName)
if err != nil {
return err
}
route := &netlink.Route{
Scope: netlink.SCOPE_UNIVERSE,
Dst: dst,
Src: src,
Gw: gw,
LinkIndex: l.Attrs().Index,
}
if err := netlink.RouteAdd(route); err != nil {
return err
}
}
return nil
}
func setupRlimits(config *configs.Config) error {
for _, rlimit := range config.Rlimits {
l := &syscall.Rlimit{Max: rlimit.Hard, Cur: rlimit.Soft}
if err := syscall.Setrlimit(rlimit.Type, l); err != nil {
return fmt.Errorf("error setting rlimit type %v: %v", rlimit.Type, err)
}
}
return nil
}
func setOomScoreAdj(oomScoreAdj int, pid int) error {
path := fmt.Sprintf("/proc/%d/oom_score_adj", pid)
return ioutil.WriteFile(path, []byte(strconv.Itoa(oomScoreAdj)), 0600)
}
// killCgroupProcesses freezes then iterates over all the processes inside the
// manager's cgroups sending a SIGKILL to each process then waiting for them to
// exit.
func killCgroupProcesses(m cgroups.Manager) error {
var procs []*os.Process
if err := m.Freeze(configs.Frozen); err != nil {
logrus.Warn(err)
}
pids, err := m.GetAllPids()
if err != nil {
m.Freeze(configs.Thawed)
return err
}
for _, pid := range pids {
if p, err := os.FindProcess(pid); err == nil {
procs = append(procs, p)
if err := p.Kill(); err != nil {
logrus.Warn(err)
}
}
}
if err := m.Freeze(configs.Thawed); err != nil {
logrus.Warn(err)
}
for _, p := range procs {
if _, err := p.Wait(); err != nil {
logrus.Warn(err)
}
}
return nil
}

View file

@ -1,67 +0,0 @@
// +build linux
package keyctl
import (
"fmt"
"syscall"
"strings"
"strconv"
"unsafe"
)
const KEYCTL_JOIN_SESSION_KEYRING = 1
const KEYCTL_SETPERM = 5
const KEYCTL_DESCRIBE = 6
type KeySerial uint32
func JoinSessionKeyring(name string) (KeySerial, error) {
var _name *byte = nil
var err error
if len(name) > 0 {
_name, err = syscall.BytePtrFromString(name)
if err != nil {
return KeySerial(0), err
}
}
sessKeyId, _, errn := syscall.Syscall(syscall.SYS_KEYCTL, KEYCTL_JOIN_SESSION_KEYRING, uintptr(unsafe.Pointer(_name)), 0)
if errn != 0 {
return 0, fmt.Errorf("could not create session key: %v", errn)
}
return KeySerial(sessKeyId), nil
}
// modify permissions on a keyring by reading the current permissions,
// anding the bits with the given mask (clearing permissions) and setting
// additional permission bits
func ModKeyringPerm(ringId KeySerial, mask, setbits uint32) error {
dest := make([]byte, 1024)
destBytes := unsafe.Pointer(&dest[0])
if _, _, err := syscall.Syscall6(syscall.SYS_KEYCTL, uintptr(KEYCTL_DESCRIBE), uintptr(ringId), uintptr(destBytes), uintptr(len(dest)), 0, 0); err != 0 {
return err
}
res := strings.Split(string(dest), ";")
if len(res) < 5 {
return fmt.Errorf("Destination buffer for key description is too small")
}
// parse permissions
perm64, err := strconv.ParseUint(res[3], 16, 32)
if err != nil {
return err
}
perm := (uint32(perm64) & mask) | setbits
if _, _, err := syscall.Syscall(syscall.SYS_KEYCTL, uintptr(KEYCTL_SETPERM), uintptr(ringId), uintptr(perm)); err != 0 {
return err
}
return nil
}

View file

@ -1,88 +0,0 @@
// +build linux
package libcontainer
import (
"syscall"
"github.com/vishvananda/netlink/nl"
)
// list of known message types we want to send to bootstrap program
// The number is randomly chosen to not conflict with known netlink types
const (
InitMsg uint16 = 62000
CloneFlagsAttr uint16 = 27281
ConsolePathAttr uint16 = 27282
NsPathsAttr uint16 = 27283
UidmapAttr uint16 = 27284
GidmapAttr uint16 = 27285
SetgroupAttr uint16 = 27286
// When syscall.NLA_HDRLEN is in gccgo, take this out.
syscall_NLA_HDRLEN = (syscall.SizeofNlAttr + syscall.NLA_ALIGNTO - 1) & ^(syscall.NLA_ALIGNTO - 1)
)
type Int32msg struct {
Type uint16
Value uint32
}
// int32msg has the following representation
// | nlattr len | nlattr type |
// | uint32 value |
func (msg *Int32msg) Serialize() []byte {
buf := make([]byte, msg.Len())
native := nl.NativeEndian()
native.PutUint16(buf[0:2], uint16(msg.Len()))
native.PutUint16(buf[2:4], msg.Type)
native.PutUint32(buf[4:8], msg.Value)
return buf
}
func (msg *Int32msg) Len() int {
return syscall_NLA_HDRLEN + 4
}
// bytemsg has the following representation
// | nlattr len | nlattr type |
// | value | pad |
type Bytemsg struct {
Type uint16
Value []byte
}
func (msg *Bytemsg) Serialize() []byte {
l := msg.Len()
buf := make([]byte, (l+syscall.NLA_ALIGNTO-1) & ^(syscall.NLA_ALIGNTO-1))
native := nl.NativeEndian()
native.PutUint16(buf[0:2], uint16(l))
native.PutUint16(buf[2:4], msg.Type)
copy(buf[4:], msg.Value)
return buf
}
func (msg *Bytemsg) Len() int {
return syscall_NLA_HDRLEN + len(msg.Value) + 1 // null-terminated
}
type Boolmsg struct {
Type uint16
Value bool
}
func (msg *Boolmsg) Serialize() []byte {
buf := make([]byte, msg.Len())
native := nl.NativeEndian()
native.PutUint16(buf[0:2], uint16(msg.Len()))
native.PutUint16(buf[2:4], msg.Type)
if msg.Value {
buf[4] = 1
} else {
buf[4] = 0
}
return buf
}
func (msg *Boolmsg) Len() int {
return syscall_NLA_HDRLEN + 1
}

View file

@ -1,259 +0,0 @@
// +build linux
package libcontainer
import (
"fmt"
"io/ioutil"
"net"
"path/filepath"
"strconv"
"strings"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/utils"
"github.com/vishvananda/netlink"
)
var strategies = map[string]networkStrategy{
"veth": &veth{},
"loopback": &loopback{},
}
// networkStrategy represents a specific network configuration for
// a container's networking stack
type networkStrategy interface {
create(*network, int) error
initialize(*network) error
detach(*configs.Network) error
attach(*configs.Network) error
}
// getStrategy returns the specific network strategy for the
// provided type.
func getStrategy(tpe string) (networkStrategy, error) {
s, exists := strategies[tpe]
if !exists {
return nil, fmt.Errorf("unknown strategy type %q", tpe)
}
return s, nil
}
// Returns the network statistics for the network interfaces represented by the NetworkRuntimeInfo.
func getNetworkInterfaceStats(interfaceName string) (*NetworkInterface, error) {
out := &NetworkInterface{Name: interfaceName}
// This can happen if the network runtime information is missing - possible if the
// container was created by an old version of libcontainer.
if interfaceName == "" {
return out, nil
}
type netStatsPair struct {
// Where to write the output.
Out *uint64
// The network stats file to read.
File string
}
// Ingress for host veth is from the container. Hence tx_bytes stat on the host veth is actually number of bytes received by the container.
netStats := []netStatsPair{
{Out: &out.RxBytes, File: "tx_bytes"},
{Out: &out.RxPackets, File: "tx_packets"},
{Out: &out.RxErrors, File: "tx_errors"},
{Out: &out.RxDropped, File: "tx_dropped"},
{Out: &out.TxBytes, File: "rx_bytes"},
{Out: &out.TxPackets, File: "rx_packets"},
{Out: &out.TxErrors, File: "rx_errors"},
{Out: &out.TxDropped, File: "rx_dropped"},
}
for _, netStat := range netStats {
data, err := readSysfsNetworkStats(interfaceName, netStat.File)
if err != nil {
return nil, err
}
*(netStat.Out) = data
}
return out, nil
}
// Reads the specified statistics available under /sys/class/net/<EthInterface>/statistics
func readSysfsNetworkStats(ethInterface, statsFile string) (uint64, error) {
data, err := ioutil.ReadFile(filepath.Join("/sys/class/net", ethInterface, "statistics", statsFile))
if err != nil {
return 0, err
}
return strconv.ParseUint(strings.TrimSpace(string(data)), 10, 64)
}
// loopback is a network strategy that provides a basic loopback device
type loopback struct {
}
func (l *loopback) create(n *network, nspid int) error {
return nil
}
func (l *loopback) initialize(config *network) error {
return netlink.LinkSetUp(&netlink.Device{LinkAttrs: netlink.LinkAttrs{Name: "lo"}})
}
func (l *loopback) attach(n *configs.Network) (err error) {
return nil
}
func (l *loopback) detach(n *configs.Network) (err error) {
return nil
}
// veth is a network strategy that uses a bridge and creates
// a veth pair, one that is attached to the bridge on the host and the other
// is placed inside the container's namespace
type veth struct {
}
func (v *veth) detach(n *configs.Network) (err error) {
return netlink.LinkSetMaster(&netlink.Device{LinkAttrs: netlink.LinkAttrs{Name: n.HostInterfaceName}}, nil)
}
// attach a container network interface to an external network
func (v *veth) attach(n *configs.Network) (err error) {
brl, err := netlink.LinkByName(n.Bridge)
if err != nil {
return err
}
br, ok := brl.(*netlink.Bridge)
if !ok {
return fmt.Errorf("Wrong device type %T", brl)
}
host, err := netlink.LinkByName(n.HostInterfaceName)
if err != nil {
return err
}
if err := netlink.LinkSetMaster(host, br); err != nil {
return err
}
if err := netlink.LinkSetMTU(host, n.Mtu); err != nil {
return err
}
if n.HairpinMode {
if err := netlink.LinkSetHairpin(host, true); err != nil {
return err
}
}
if err := netlink.LinkSetUp(host); err != nil {
return err
}
return nil
}
func (v *veth) create(n *network, nspid int) (err error) {
tmpName, err := v.generateTempPeerName()
if err != nil {
return err
}
n.TempVethPeerName = tmpName
if n.Bridge == "" {
return fmt.Errorf("bridge is not specified")
}
veth := &netlink.Veth{
LinkAttrs: netlink.LinkAttrs{
Name: n.HostInterfaceName,
TxQLen: n.TxQueueLen,
},
PeerName: n.TempVethPeerName,
}
if err := netlink.LinkAdd(veth); err != nil {
return err
}
defer func() {
if err != nil {
netlink.LinkDel(veth)
}
}()
if err := v.attach(&n.Network); err != nil {
return err
}
child, err := netlink.LinkByName(n.TempVethPeerName)
if err != nil {
return err
}
return netlink.LinkSetNsPid(child, nspid)
}
func (v *veth) generateTempPeerName() (string, error) {
return utils.GenerateRandomName("veth", 7)
}
func (v *veth) initialize(config *network) error {
peer := config.TempVethPeerName
if peer == "" {
return fmt.Errorf("peer is not specified")
}
child, err := netlink.LinkByName(peer)
if err != nil {
return err
}
if err := netlink.LinkSetDown(child); err != nil {
return err
}
if err := netlink.LinkSetName(child, config.Name); err != nil {
return err
}
// get the interface again after we changed the name as the index also changes.
if child, err = netlink.LinkByName(config.Name); err != nil {
return err
}
if config.MacAddress != "" {
mac, err := net.ParseMAC(config.MacAddress)
if err != nil {
return err
}
if err := netlink.LinkSetHardwareAddr(child, mac); err != nil {
return err
}
}
ip, err := netlink.ParseAddr(config.Address)
if err != nil {
return err
}
if err := netlink.AddrAdd(child, ip); err != nil {
return err
}
if config.IPv6Address != "" {
ip6, err := netlink.ParseAddr(config.IPv6Address)
if err != nil {
return err
}
if err := netlink.AddrAdd(child, ip6); err != nil {
return err
}
}
if err := netlink.LinkSetMTU(child, config.Mtu); err != nil {
return err
}
if err := netlink.LinkSetUp(child); err != nil {
return err
}
if config.Gateway != "" {
gw := net.ParseIP(config.Gateway)
if err := netlink.RouteAdd(&netlink.Route{
Scope: netlink.SCOPE_UNIVERSE,
LinkIndex: child.Attrs().Index,
Gw: gw,
}); err != nil {
return err
}
}
if config.IPv6Gateway != "" {
gw := net.ParseIP(config.IPv6Gateway)
if err := netlink.RouteAdd(&netlink.Route{
Scope: netlink.SCOPE_UNIVERSE,
LinkIndex: child.Attrs().Index,
Gw: gw,
}); err != nil {
return err
}
}
return nil
}

View file

@ -1,89 +0,0 @@
// +build linux
package libcontainer
import (
"fmt"
"io/ioutil"
"os"
"path/filepath"
"syscall"
)
const oomCgroupName = "memory"
type PressureLevel uint
const (
LowPressure PressureLevel = iota
MediumPressure
CriticalPressure
)
func registerMemoryEvent(cgDir string, evName string, arg string) (<-chan struct{}, error) {
evFile, err := os.Open(filepath.Join(cgDir, evName))
if err != nil {
return nil, err
}
fd, _, syserr := syscall.RawSyscall(syscall.SYS_EVENTFD2, 0, syscall.FD_CLOEXEC, 0)
if syserr != 0 {
evFile.Close()
return nil, syserr
}
eventfd := os.NewFile(fd, "eventfd")
eventControlPath := filepath.Join(cgDir, "cgroup.event_control")
data := fmt.Sprintf("%d %d %s", eventfd.Fd(), evFile.Fd(), arg)
if err := ioutil.WriteFile(eventControlPath, []byte(data), 0700); err != nil {
eventfd.Close()
evFile.Close()
return nil, err
}
ch := make(chan struct{})
go func() {
defer func() {
close(ch)
eventfd.Close()
evFile.Close()
}()
buf := make([]byte, 8)
for {
if _, err := eventfd.Read(buf); err != nil {
return
}
// When a cgroup is destroyed, an event is sent to eventfd.
// So if the control path is gone, return instead of notifying.
if _, err := os.Lstat(eventControlPath); os.IsNotExist(err) {
return
}
ch <- struct{}{}
}
}()
return ch, nil
}
// notifyOnOOM returns channel on which you can expect event about OOM,
// if process died without OOM this channel will be closed.
func notifyOnOOM(paths map[string]string) (<-chan struct{}, error) {
dir := paths[oomCgroupName]
if dir == "" {
return nil, fmt.Errorf("path %q missing", oomCgroupName)
}
return registerMemoryEvent(dir, "memory.oom_control", "")
}
func notifyMemoryPressure(paths map[string]string, level PressureLevel) (<-chan struct{}, error) {
dir := paths[oomCgroupName]
if dir == "" {
return nil, fmt.Errorf("path %q missing", oomCgroupName)
}
if level > CriticalPressure {
return nil, fmt.Errorf("invalid pressure level %d", level)
}
levelStr := []string{"low", "medium", "critical"}[level]
return registerMemoryEvent(dir, "memory.pressure_level", levelStr)
}

View file

@ -1,115 +0,0 @@
package libcontainer
import (
"fmt"
"io"
"math"
"os"
)
type processOperations interface {
wait() (*os.ProcessState, error)
signal(sig os.Signal) error
pid() int
}
// Process specifies the configuration and IO for a process inside
// a container.
type Process struct {
// The command to be run followed by any arguments.
Args []string
// Env specifies the environment variables for the process.
Env []string
// User will set the uid and gid of the executing process running inside the container
// local to the container's user and group configuration.
User string
// Cwd will change the processes current working directory inside the container's rootfs.
Cwd string
// Stdin is a pointer to a reader which provides the standard input stream.
Stdin io.Reader
// Stdout is a pointer to a writer which receives the standard output stream.
Stdout io.Writer
// Stderr is a pointer to a writer which receives the standard error stream.
Stderr io.Writer
// ExtraFiles specifies additional open files to be inherited by the container
ExtraFiles []*os.File
// consolePath is the path to the console allocated to the container.
consolePath string
// Capabilities specify the capabilities to keep when executing the process inside the container
// All capabilities not specified will be dropped from the processes capability mask
Capabilities []string
// AppArmorProfile specifies the profile to apply to the process and is
// changed at the time the process is execed
AppArmorProfile string
// Label specifies the label to apply to the process. It is commonly used by selinux
Label string
// NoNewPrivileges controls whether processes can gain additional privileges.
NoNewPrivileges *bool
ops processOperations
}
// Wait waits for the process to exit.
// Wait releases any resources associated with the Process
func (p Process) Wait() (*os.ProcessState, error) {
if p.ops == nil {
return nil, newGenericError(fmt.Errorf("invalid process"), NoProcessOps)
}
return p.ops.wait()
}
// Pid returns the process ID
func (p Process) Pid() (int, error) {
// math.MinInt32 is returned here, because it's invalid value
// for the kill() system call.
if p.ops == nil {
return math.MinInt32, newGenericError(fmt.Errorf("invalid process"), NoProcessOps)
}
return p.ops.pid(), nil
}
// Signal sends a signal to the Process.
func (p Process) Signal(sig os.Signal) error {
if p.ops == nil {
return newGenericError(fmt.Errorf("invalid process"), NoProcessOps)
}
return p.ops.signal(sig)
}
// IO holds the process's STDIO
type IO struct {
Stdin io.WriteCloser
Stdout io.ReadCloser
Stderr io.ReadCloser
}
// NewConsole creates new console for process and returns it
func (p *Process) NewConsole(rootuid int) (Console, error) {
console, err := NewConsole(rootuid, rootuid)
if err != nil {
return nil, err
}
p.consolePath = console.Path()
return console, nil
}
// ConsoleFromPath sets the process's console with the path provided
func (p *Process) ConsoleFromPath(path string) error {
if p.consolePath != "" {
return newGenericError(fmt.Errorf("console path already exists for process"), ConsoleExists)
}
p.consolePath = path
return nil
}

View file

@ -1,476 +0,0 @@
// +build linux
package libcontainer
import (
"encoding/json"
"errors"
"fmt"
"io"
"os"
"os/exec"
"path/filepath"
"strconv"
"syscall"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/system"
"github.com/opencontainers/runc/libcontainer/utils"
)
type parentProcess interface {
// pid returns the pid for the running process.
pid() int
// start starts the process execution.
start() error
// send a SIGKILL to the process and wait for the exit.
terminate() error
// wait waits on the process returning the process state.
wait() (*os.ProcessState, error)
// startTime return's the process start time.
startTime() (string, error)
signal(os.Signal) error
externalDescriptors() []string
setExternalDescriptors(fds []string)
}
type setnsProcess struct {
cmd *exec.Cmd
parentPipe *os.File
childPipe *os.File
cgroupPaths map[string]string
config *initConfig
fds []string
process *Process
bootstrapData io.Reader
}
func (p *setnsProcess) startTime() (string, error) {
return system.GetProcessStartTime(p.pid())
}
func (p *setnsProcess) signal(sig os.Signal) error {
s, ok := sig.(syscall.Signal)
if !ok {
return errors.New("os: unsupported signal type")
}
return syscall.Kill(p.pid(), s)
}
func (p *setnsProcess) start() (err error) {
defer p.parentPipe.Close()
err = p.cmd.Start()
p.childPipe.Close()
if err != nil {
return newSystemError(err)
}
if p.bootstrapData != nil {
if _, err := io.Copy(p.parentPipe, p.bootstrapData); err != nil {
return newSystemError(err)
}
}
if err = p.execSetns(); err != nil {
return newSystemError(err)
}
if len(p.cgroupPaths) > 0 {
if err := cgroups.EnterPid(p.cgroupPaths, p.pid()); err != nil {
return newSystemError(err)
}
}
if err := utils.WriteJSON(p.parentPipe, p.config); err != nil {
return newSystemError(err)
}
// set oom_score_adj
if err := setOomScoreAdj(p.config.Config.OomScoreAdj, p.pid()); err != nil {
return newSystemError(err)
}
if err := syscall.Shutdown(int(p.parentPipe.Fd()), syscall.SHUT_WR); err != nil {
return newSystemError(err)
}
// wait for the child process to fully complete and receive an error message
// if one was encoutered
var ierr *genericError
if err := json.NewDecoder(p.parentPipe).Decode(&ierr); err != nil && err != io.EOF {
return newSystemError(err)
}
// Must be done after Shutdown so the child will exit and we can wait for it.
if ierr != nil {
p.wait()
return newSystemError(ierr)
}
return nil
}
// execSetns runs the process that executes C code to perform the setns calls
// because setns support requires the C process to fork off a child and perform the setns
// before the go runtime boots, we wait on the process to die and receive the child's pid
// over the provided pipe.
func (p *setnsProcess) execSetns() error {
status, err := p.cmd.Process.Wait()
if err != nil {
p.cmd.Wait()
return newSystemError(err)
}
if !status.Success() {
p.cmd.Wait()
return newSystemError(&exec.ExitError{ProcessState: status})
}
var pid *pid
if err := json.NewDecoder(p.parentPipe).Decode(&pid); err != nil {
p.cmd.Wait()
return newSystemError(err)
}
process, err := os.FindProcess(pid.Pid)
if err != nil {
return err
}
p.cmd.Process = process
p.process.ops = p
return nil
}
// terminate sends a SIGKILL to the forked process for the setns routine then waits to
// avoid the process becomming a zombie.
func (p *setnsProcess) terminate() error {
if p.cmd.Process == nil {
return nil
}
err := p.cmd.Process.Kill()
if _, werr := p.wait(); err == nil {
err = werr
}
return err
}
func (p *setnsProcess) wait() (*os.ProcessState, error) {
err := p.cmd.Wait()
// Return actual ProcessState even on Wait error
return p.cmd.ProcessState, err
}
func (p *setnsProcess) pid() int {
return p.cmd.Process.Pid
}
func (p *setnsProcess) externalDescriptors() []string {
return p.fds
}
func (p *setnsProcess) setExternalDescriptors(newFds []string) {
p.fds = newFds
}
type initProcess struct {
cmd *exec.Cmd
parentPipe *os.File
childPipe *os.File
config *initConfig
manager cgroups.Manager
container *linuxContainer
fds []string
process *Process
bootstrapData io.Reader
sharePidns bool
}
func (p *initProcess) pid() int {
return p.cmd.Process.Pid
}
func (p *initProcess) externalDescriptors() []string {
return p.fds
}
// execSetns runs the process that executes C code to perform the setns calls
// because setns support requires the C process to fork off a child and perform the setns
// before the go runtime boots, we wait on the process to die and receive the child's pid
// over the provided pipe.
// This is called by initProcess.start function
func (p *initProcess) execSetns() error {
status, err := p.cmd.Process.Wait()
if err != nil {
p.cmd.Wait()
return err
}
if !status.Success() {
p.cmd.Wait()
return &exec.ExitError{ProcessState: status}
}
var pid *pid
if err := json.NewDecoder(p.parentPipe).Decode(&pid); err != nil {
p.cmd.Wait()
return err
}
process, err := os.FindProcess(pid.Pid)
if err != nil {
return err
}
p.cmd.Process = process
return nil
}
func (p *initProcess) start() error {
defer p.parentPipe.Close()
err := p.cmd.Start()
p.process.ops = p
p.childPipe.Close()
if err != nil {
p.process.ops = nil
return newSystemError(err)
}
if _, err := io.Copy(p.parentPipe, p.bootstrapData); err != nil {
return err
}
if err := p.execSetns(); err != nil {
return newSystemError(err)
}
// Save the standard descriptor names before the container process
// can potentially move them (e.g., via dup2()). If we don't do this now,
// we won't know at checkpoint time which file descriptor to look up.
fds, err := getPipeFds(p.pid())
if err != nil {
return newSystemError(err)
}
p.setExternalDescriptors(fds)
// Do this before syncing with child so that no children
// can escape the cgroup
if err := p.manager.Apply(p.pid()); err != nil {
return newSystemError(err)
}
defer func() {
if err != nil {
// TODO: should not be the responsibility to call here
p.manager.Destroy()
}
}()
if err := p.createNetworkInterfaces(); err != nil {
return newSystemError(err)
}
if err := p.sendConfig(); err != nil {
return newSystemError(err)
}
var (
procSync syncT
sentRun bool
sentResume bool
ierr *genericError
)
dec := json.NewDecoder(p.parentPipe)
loop:
for {
if err := dec.Decode(&procSync); err != nil {
if err == io.EOF {
break loop
}
return newSystemError(err)
}
switch procSync.Type {
case procReady:
if err := p.manager.Set(p.config.Config); err != nil {
return newSystemError(err)
}
// set oom_score_adj
if err := setOomScoreAdj(p.config.Config.OomScoreAdj, p.pid()); err != nil {
return newSystemError(err)
}
// call prestart hooks
if !p.config.Config.Namespaces.Contains(configs.NEWNS) {
if p.config.Config.Hooks != nil {
s := configs.HookState{
Version: p.container.config.Version,
ID: p.container.id,
Pid: p.pid(),
Root: p.config.Config.Rootfs,
}
for _, hook := range p.config.Config.Hooks.Prestart {
if err := hook.Run(s); err != nil {
return newSystemError(err)
}
}
}
}
// Sync with child.
if err := utils.WriteJSON(p.parentPipe, syncT{procRun}); err != nil {
return newSystemError(err)
}
sentRun = true
case procHooks:
if p.config.Config.Hooks != nil {
s := configs.HookState{
Version: p.container.config.Version,
ID: p.container.id,
Pid: p.pid(),
Root: p.config.Config.Rootfs,
}
for _, hook := range p.config.Config.Hooks.Prestart {
if err := hook.Run(s); err != nil {
return newSystemError(err)
}
}
}
// Sync with child.
if err := utils.WriteJSON(p.parentPipe, syncT{procResume}); err != nil {
return newSystemError(err)
}
sentResume = true
case procError:
// wait for the child process to fully complete and receive an error message
// if one was encoutered
if err := dec.Decode(&ierr); err != nil && err != io.EOF {
return newSystemError(err)
}
if ierr != nil {
break loop
}
// Programmer error.
panic("No error following JSON procError payload.")
default:
return newSystemError(fmt.Errorf("invalid JSON synchronisation payload from child"))
}
}
if !sentRun {
return newSystemError(fmt.Errorf("could not synchronise with container process"))
}
if p.config.Config.Namespaces.Contains(configs.NEWNS) && !sentResume {
return newSystemError(fmt.Errorf("could not synchronise after executing prestart hooks with container process"))
}
if err := syscall.Shutdown(int(p.parentPipe.Fd()), syscall.SHUT_WR); err != nil {
return newSystemError(err)
}
// Must be done after Shutdown so the child will exit and we can wait for it.
if ierr != nil {
p.wait()
return newSystemError(ierr)
}
return nil
}
func (p *initProcess) wait() (*os.ProcessState, error) {
err := p.cmd.Wait()
if err != nil {
return p.cmd.ProcessState, err
}
// we should kill all processes in cgroup when init is died if we use host PID namespace
if p.sharePidns {
killCgroupProcesses(p.manager)
}
return p.cmd.ProcessState, nil
}
func (p *initProcess) terminate() error {
if p.cmd.Process == nil {
return nil
}
err := p.cmd.Process.Kill()
if _, werr := p.wait(); err == nil {
err = werr
}
return err
}
func (p *initProcess) startTime() (string, error) {
return system.GetProcessStartTime(p.pid())
}
func (p *initProcess) sendConfig() error {
// send the config to the container's init process, we don't use JSON Encode
// here because there might be a problem in JSON decoder in some cases, see:
// https://github.com/docker/docker/issues/14203#issuecomment-174177790
return utils.WriteJSON(p.parentPipe, p.config)
}
func (p *initProcess) createNetworkInterfaces() error {
for _, config := range p.config.Config.Networks {
strategy, err := getStrategy(config.Type)
if err != nil {
return err
}
n := &network{
Network: *config,
}
if err := strategy.create(n, p.pid()); err != nil {
return err
}
p.config.Networks = append(p.config.Networks, n)
}
return nil
}
func (p *initProcess) signal(sig os.Signal) error {
s, ok := sig.(syscall.Signal)
if !ok {
return errors.New("os: unsupported signal type")
}
return syscall.Kill(p.pid(), s)
}
func (p *initProcess) setExternalDescriptors(newFds []string) {
p.fds = newFds
}
func getPipeFds(pid int) ([]string, error) {
fds := make([]string, 3)
dirPath := filepath.Join("/proc", strconv.Itoa(pid), "/fd")
for i := 0; i < 3; i++ {
f := filepath.Join(dirPath, strconv.Itoa(i))
target, err := os.Readlink(f)
if err != nil {
return fds, err
}
fds[i] = target
}
return fds, nil
}
// InitializeIO creates pipes for use with the process's STDIO
// and returns the opposite side for each
func (p *Process) InitializeIO(rootuid int) (i *IO, err error) {
var fds []uintptr
i = &IO{}
// cleanup in case of an error
defer func() {
if err != nil {
for _, fd := range fds {
syscall.Close(int(fd))
}
}
}()
// STDIN
r, w, err := os.Pipe()
if err != nil {
return nil, err
}
fds = append(fds, r.Fd(), w.Fd())
p.Stdin, i.Stdin = r, w
// STDOUT
if r, w, err = os.Pipe(); err != nil {
return nil, err
}
fds = append(fds, r.Fd(), w.Fd())
p.Stdout, i.Stdout = w, r
// STDERR
if r, w, err = os.Pipe(); err != nil {
return nil, err
}
fds = append(fds, r.Fd(), w.Fd())
p.Stderr, i.Stderr = w, r
// change ownership of the pipes incase we are in a user namespace
for _, fd := range fds {
if err := syscall.Fchown(int(fd), rootuid, rootuid); err != nil {
return nil, err
}
}
return i, nil
}

View file

@ -1,122 +0,0 @@
// +build linux
package libcontainer
import (
"fmt"
"os"
"github.com/opencontainers/runc/libcontainer/system"
)
func newRestoredProcess(pid int, fds []string) (*restoredProcess, error) {
var (
err error
)
proc, err := os.FindProcess(pid)
if err != nil {
return nil, err
}
started, err := system.GetProcessStartTime(pid)
if err != nil {
return nil, err
}
return &restoredProcess{
proc: proc,
processStartTime: started,
fds: fds,
}, nil
}
type restoredProcess struct {
proc *os.Process
processStartTime string
fds []string
}
func (p *restoredProcess) start() error {
return newGenericError(fmt.Errorf("restored process cannot be started"), SystemError)
}
func (p *restoredProcess) pid() int {
return p.proc.Pid
}
func (p *restoredProcess) terminate() error {
err := p.proc.Kill()
if _, werr := p.wait(); err == nil {
err = werr
}
return err
}
func (p *restoredProcess) wait() (*os.ProcessState, error) {
// TODO: how do we wait on the actual process?
// maybe use --exec-cmd in criu
st, err := p.proc.Wait()
if err != nil {
return nil, err
}
return st, nil
}
func (p *restoredProcess) startTime() (string, error) {
return p.processStartTime, nil
}
func (p *restoredProcess) signal(s os.Signal) error {
return p.proc.Signal(s)
}
func (p *restoredProcess) externalDescriptors() []string {
return p.fds
}
func (p *restoredProcess) setExternalDescriptors(newFds []string) {
p.fds = newFds
}
// nonChildProcess represents a process where the calling process is not
// the parent process. This process is created when a factory loads a container from
// a persisted state.
type nonChildProcess struct {
processPid int
processStartTime string
fds []string
}
func (p *nonChildProcess) start() error {
return newGenericError(fmt.Errorf("restored process cannot be started"), SystemError)
}
func (p *nonChildProcess) pid() int {
return p.processPid
}
func (p *nonChildProcess) terminate() error {
return newGenericError(fmt.Errorf("restored process cannot be terminated"), SystemError)
}
func (p *nonChildProcess) wait() (*os.ProcessState, error) {
return nil, newGenericError(fmt.Errorf("restored process cannot be waited on"), SystemError)
}
func (p *nonChildProcess) startTime() (string, error) {
return p.processStartTime, nil
}
func (p *nonChildProcess) signal(s os.Signal) error {
proc, err := os.FindProcess(p.processPid)
if err != nil {
return err
}
return proc.Signal(s)
}
func (p *nonChildProcess) externalDescriptors() []string {
return p.fds
}
func (p *nonChildProcess) setExternalDescriptors(newFds []string) {
p.fds = newFds
}

View file

@ -1,714 +0,0 @@
// +build linux
package libcontainer
import (
"fmt"
"io"
"io/ioutil"
"os"
"os/exec"
"path"
"path/filepath"
"strings"
"syscall"
"time"
"github.com/docker/docker/pkg/mount"
"github.com/docker/docker/pkg/symlink"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/label"
"github.com/opencontainers/runc/libcontainer/system"
libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils"
)
const defaultMountFlags = syscall.MS_NOEXEC | syscall.MS_NOSUID | syscall.MS_NODEV
// setupRootfs sets up the devices, mount points, and filesystems for use inside a
// new mount namespace.
func setupRootfs(config *configs.Config, console *linuxConsole, pipe io.ReadWriter) (err error) {
if err := prepareRoot(config); err != nil {
return newSystemError(err)
}
setupDev := len(config.Devices) != 0
for _, m := range config.Mounts {
for _, precmd := range m.PremountCmds {
if err := mountCmd(precmd); err != nil {
return newSystemError(err)
}
}
if err := mountToRootfs(m, config.Rootfs, config.MountLabel); err != nil {
return newSystemError(err)
}
for _, postcmd := range m.PostmountCmds {
if err := mountCmd(postcmd); err != nil {
return newSystemError(err)
}
}
}
if setupDev {
if err := createDevices(config); err != nil {
return newSystemError(err)
}
if err := setupPtmx(config, console); err != nil {
return newSystemError(err)
}
if err := setupDevSymlinks(config.Rootfs); err != nil {
return newSystemError(err)
}
}
// Signal the parent to run the pre-start hooks.
// The hooks are run after the mounts are setup, but before we switch to the new
// root, so that the old root is still available in the hooks for any mount
// manipulations.
if err := syncParentHooks(pipe); err != nil {
return err
}
if err := syscall.Chdir(config.Rootfs); err != nil {
return newSystemError(err)
}
if config.NoPivotRoot {
err = msMoveRoot(config.Rootfs)
} else {
err = pivotRoot(config.Rootfs, config.PivotDir)
}
if err != nil {
return newSystemError(err)
}
if setupDev {
if err := reOpenDevNull(); err != nil {
return newSystemError(err)
}
}
// remount dev as ro if specifed
for _, m := range config.Mounts {
if m.Destination == "/dev" {
if m.Flags&syscall.MS_RDONLY != 0 {
if err := remountReadonly(m.Destination); err != nil {
return newSystemError(err)
}
}
break
}
}
// set rootfs ( / ) as readonly
if config.Readonlyfs {
if err := setReadonly(); err != nil {
return newSystemError(err)
}
}
syscall.Umask(0022)
return nil
}
func mountCmd(cmd configs.Command) error {
command := exec.Command(cmd.Path, cmd.Args[:]...)
command.Env = cmd.Env
command.Dir = cmd.Dir
if out, err := command.CombinedOutput(); err != nil {
return fmt.Errorf("%#v failed: %s: %v", cmd, string(out), err)
}
return nil
}
func mountToRootfs(m *configs.Mount, rootfs, mountLabel string) error {
var (
dest = m.Destination
)
if !strings.HasPrefix(dest, rootfs) {
dest = filepath.Join(rootfs, dest)
}
switch m.Device {
case "proc", "sysfs":
if err := os.MkdirAll(dest, 0755); err != nil {
return err
}
// Selinux kernels do not support labeling of /proc or /sys
return mountPropagate(m, rootfs, "")
case "mqueue":
if err := os.MkdirAll(dest, 0755); err != nil {
return err
}
if err := mountPropagate(m, rootfs, mountLabel); err != nil {
// older kernels do not support labeling of /dev/mqueue
if err := mountPropagate(m, rootfs, ""); err != nil {
return err
}
}
return label.SetFileLabel(dest, mountLabel)
case "tmpfs":
stat, err := os.Stat(dest)
if err != nil {
if err := os.MkdirAll(dest, 0755); err != nil {
return err
}
}
if err := mountPropagate(m, rootfs, mountLabel); err != nil {
return err
}
if stat != nil {
if err = os.Chmod(dest, stat.Mode()); err != nil {
return err
}
}
return nil
case "bind":
stat, err := os.Stat(m.Source)
if err != nil {
// error out if the source of a bind mount does not exist as we will be
// unable to bind anything to it.
return err
}
// ensure that the destination of the bind mount is resolved of symlinks at mount time because
// any previous mounts can invalidate the next mount's destination.
// this can happen when a user specifies mounts within other mounts to cause breakouts or other
// evil stuff to try to escape the container's rootfs.
if dest, err = symlink.FollowSymlinkInScope(filepath.Join(rootfs, m.Destination), rootfs); err != nil {
return err
}
if err := checkMountDestination(rootfs, dest); err != nil {
return err
}
// update the mount with the correct dest after symlinks are resolved.
m.Destination = dest
if err := createIfNotExists(dest, stat.IsDir()); err != nil {
return err
}
if err := mountPropagate(m, rootfs, mountLabel); err != nil {
return err
}
// bind mount won't change mount options, we need remount to make mount options effective.
// first check that we have non-default options required before attempting a remount
if m.Flags&^(syscall.MS_REC|syscall.MS_REMOUNT|syscall.MS_BIND) != 0 {
// only remount if unique mount options are set
if err := remount(m, rootfs); err != nil {
return err
}
}
if m.Relabel != "" {
if err := label.Validate(m.Relabel); err != nil {
return err
}
shared := label.IsShared(m.Relabel)
if err := label.Relabel(m.Source, mountLabel, shared); err != nil {
return err
}
}
case "cgroup":
binds, err := getCgroupMounts(m)
if err != nil {
return err
}
var merged []string
for _, b := range binds {
ss := filepath.Base(b.Destination)
if strings.Contains(ss, ",") {
merged = append(merged, ss)
}
}
tmpfs := &configs.Mount{
Source: "tmpfs",
Device: "tmpfs",
Destination: m.Destination,
Flags: defaultMountFlags,
Data: "mode=755",
PropagationFlags: m.PropagationFlags,
}
if err := mountToRootfs(tmpfs, rootfs, mountLabel); err != nil {
return err
}
for _, b := range binds {
if err := mountToRootfs(b, rootfs, mountLabel); err != nil {
return err
}
}
// create symlinks for merged cgroups
cwd, err := os.Getwd()
if err != nil {
return err
}
if err := os.Chdir(filepath.Join(rootfs, m.Destination)); err != nil {
return err
}
for _, mc := range merged {
for _, ss := range strings.Split(mc, ",") {
if err := os.Symlink(mc, ss); err != nil {
// if cgroup already exists, then okay(it could have been created before)
if os.IsExist(err) {
continue
}
os.Chdir(cwd)
return err
}
}
}
if err := os.Chdir(cwd); err != nil {
return err
}
if m.Flags&syscall.MS_RDONLY != 0 {
// remount cgroup root as readonly
mcgrouproot := &configs.Mount{
Destination: m.Destination,
Flags: defaultMountFlags | syscall.MS_RDONLY,
}
if err := remount(mcgrouproot, rootfs); err != nil {
return err
}
}
default:
if err := os.MkdirAll(dest, 0755); err != nil {
return err
}
return mountPropagate(m, rootfs, mountLabel)
}
return nil
}
func getCgroupMounts(m *configs.Mount) ([]*configs.Mount, error) {
mounts, err := cgroups.GetCgroupMounts()
if err != nil {
return nil, err
}
cgroupPaths, err := cgroups.ParseCgroupFile("/proc/self/cgroup")
if err != nil {
return nil, err
}
var binds []*configs.Mount
for _, mm := range mounts {
dir, err := mm.GetThisCgroupDir(cgroupPaths)
if err != nil {
return nil, err
}
relDir, err := filepath.Rel(mm.Root, dir)
if err != nil {
return nil, err
}
binds = append(binds, &configs.Mount{
Device: "bind",
Source: filepath.Join(mm.Mountpoint, relDir),
Destination: filepath.Join(m.Destination, strings.Join(mm.Subsystems, ",")),
Flags: syscall.MS_BIND | syscall.MS_REC | m.Flags,
PropagationFlags: m.PropagationFlags,
})
}
return binds, nil
}
// checkMountDestination checks to ensure that the mount destination is not over the top of /proc.
// dest is required to be an abs path and have any symlinks resolved before calling this function.
func checkMountDestination(rootfs, dest string) error {
if libcontainerUtils.CleanPath(rootfs) == libcontainerUtils.CleanPath(dest) {
return fmt.Errorf("mounting into / is prohibited")
}
invalidDestinations := []string{
"/proc",
}
// White list, it should be sub directories of invalid destinations
validDestinations := []string{
// These entries can be bind mounted by files emulated by fuse,
// so commands like top, free displays stats in container.
"/proc/cpuinfo",
"/proc/diskstats",
"/proc/meminfo",
"/proc/stat",
"/proc/net/dev",
}
for _, valid := range validDestinations {
path, err := filepath.Rel(filepath.Join(rootfs, valid), dest)
if err != nil {
return err
}
if path == "." {
return nil
}
}
for _, invalid := range invalidDestinations {
path, err := filepath.Rel(filepath.Join(rootfs, invalid), dest)
if err != nil {
return err
}
if path == "." || !strings.HasPrefix(path, "..") {
return fmt.Errorf("%q cannot be mounted because it is located inside %q", dest, invalid)
}
}
return nil
}
func setupDevSymlinks(rootfs string) error {
var links = [][2]string{
{"/proc/self/fd", "/dev/fd"},
{"/proc/self/fd/0", "/dev/stdin"},
{"/proc/self/fd/1", "/dev/stdout"},
{"/proc/self/fd/2", "/dev/stderr"},
}
// kcore support can be toggled with CONFIG_PROC_KCORE; only create a symlink
// in /dev if it exists in /proc.
if _, err := os.Stat("/proc/kcore"); err == nil {
links = append(links, [2]string{"/proc/kcore", "/dev/core"})
}
for _, link := range links {
var (
src = link[0]
dst = filepath.Join(rootfs, link[1])
)
if err := os.Symlink(src, dst); err != nil && !os.IsExist(err) {
return fmt.Errorf("symlink %s %s %s", src, dst, err)
}
}
return nil
}
// If stdin, stdout, and/or stderr are pointing to `/dev/null` in the parent's rootfs
// this method will make them point to `/dev/null` in this container's rootfs. This
// needs to be called after we chroot/pivot into the container's rootfs so that any
// symlinks are resolved locally.
func reOpenDevNull() error {
var stat, devNullStat syscall.Stat_t
file, err := os.OpenFile("/dev/null", os.O_RDWR, 0)
if err != nil {
return fmt.Errorf("Failed to open /dev/null - %s", err)
}
defer file.Close()
if err := syscall.Fstat(int(file.Fd()), &devNullStat); err != nil {
return err
}
for fd := 0; fd < 3; fd++ {
if err := syscall.Fstat(fd, &stat); err != nil {
return err
}
if stat.Rdev == devNullStat.Rdev {
// Close and re-open the fd.
if err := syscall.Dup3(int(file.Fd()), fd, 0); err != nil {
return err
}
}
}
return nil
}
// Create the device nodes in the container.
func createDevices(config *configs.Config) error {
useBindMount := system.RunningInUserNS() || config.Namespaces.Contains(configs.NEWUSER)
oldMask := syscall.Umask(0000)
for _, node := range config.Devices {
// containers running in a user namespace are not allowed to mknod
// devices so we can just bind mount it from the host.
if err := createDeviceNode(config.Rootfs, node, useBindMount); err != nil {
syscall.Umask(oldMask)
return err
}
}
syscall.Umask(oldMask)
return nil
}
func bindMountDeviceNode(dest string, node *configs.Device) error {
f, err := os.Create(dest)
if err != nil && !os.IsExist(err) {
return err
}
if f != nil {
f.Close()
}
return syscall.Mount(node.Path, dest, "bind", syscall.MS_BIND, "")
}
// Creates the device node in the rootfs of the container.
func createDeviceNode(rootfs string, node *configs.Device, bind bool) error {
dest := filepath.Join(rootfs, node.Path)
if err := os.MkdirAll(filepath.Dir(dest), 0755); err != nil {
return err
}
if bind {
return bindMountDeviceNode(dest, node)
}
if err := mknodDevice(dest, node); err != nil {
if os.IsExist(err) {
return nil
} else if os.IsPermission(err) {
return bindMountDeviceNode(dest, node)
}
return err
}
return nil
}
func mknodDevice(dest string, node *configs.Device) error {
fileMode := node.FileMode
switch node.Type {
case 'c':
fileMode |= syscall.S_IFCHR
case 'b':
fileMode |= syscall.S_IFBLK
default:
return fmt.Errorf("%c is not a valid device type for device %s", node.Type, node.Path)
}
if err := syscall.Mknod(dest, uint32(fileMode), node.Mkdev()); err != nil {
return err
}
return syscall.Chown(dest, int(node.Uid), int(node.Gid))
}
func getMountInfo(mountinfo []*mount.Info, dir string) *mount.Info {
for _, m := range mountinfo {
if m.Mountpoint == dir {
return m
}
}
return nil
}
// Get the parent mount point of directory passed in as argument. Also return
// optional fields.
func getParentMount(rootfs string) (string, string, error) {
var path string
mountinfos, err := mount.GetMounts()
if err != nil {
return "", "", err
}
mountinfo := getMountInfo(mountinfos, rootfs)
if mountinfo != nil {
return rootfs, mountinfo.Optional, nil
}
path = rootfs
for {
path = filepath.Dir(path)
mountinfo = getMountInfo(mountinfos, path)
if mountinfo != nil {
return path, mountinfo.Optional, nil
}
if path == "/" {
break
}
}
// If we are here, we did not find parent mount. Something is wrong.
return "", "", fmt.Errorf("Could not find parent mount of %s", rootfs)
}
// Make parent mount private if it was shared
func rootfsParentMountPrivate(config *configs.Config) error {
sharedMount := false
parentMount, optionalOpts, err := getParentMount(config.Rootfs)
if err != nil {
return err
}
optsSplit := strings.Split(optionalOpts, " ")
for _, opt := range optsSplit {
if strings.HasPrefix(opt, "shared:") {
sharedMount = true
break
}
}
// Make parent mount PRIVATE if it was shared. It is needed for two
// reasons. First of all pivot_root() will fail if parent mount is
// shared. Secondly when we bind mount rootfs it will propagate to
// parent namespace and we don't want that to happen.
if sharedMount {
return syscall.Mount("", parentMount, "", syscall.MS_PRIVATE, "")
}
return nil
}
func prepareRoot(config *configs.Config) error {
flag := syscall.MS_SLAVE | syscall.MS_REC
if config.RootPropagation != 0 {
flag = config.RootPropagation
}
if err := syscall.Mount("", "/", "", uintptr(flag), ""); err != nil {
return err
}
if err := rootfsParentMountPrivate(config); err != nil {
return err
}
return syscall.Mount(config.Rootfs, config.Rootfs, "bind", syscall.MS_BIND|syscall.MS_REC, "")
}
func setReadonly() error {
return syscall.Mount("/", "/", "bind", syscall.MS_BIND|syscall.MS_REMOUNT|syscall.MS_RDONLY|syscall.MS_REC, "")
}
func setupPtmx(config *configs.Config, console *linuxConsole) error {
ptmx := filepath.Join(config.Rootfs, "dev/ptmx")
if err := os.Remove(ptmx); err != nil && !os.IsNotExist(err) {
return err
}
if err := os.Symlink("pts/ptmx", ptmx); err != nil {
return fmt.Errorf("symlink dev ptmx %s", err)
}
if console != nil {
return console.mount(config.Rootfs, config.MountLabel)
}
return nil
}
func pivotRoot(rootfs, pivotBaseDir string) (err error) {
if pivotBaseDir == "" {
pivotBaseDir = "/"
}
tmpDir := filepath.Join(rootfs, pivotBaseDir)
if err := os.MkdirAll(tmpDir, 0755); err != nil {
return fmt.Errorf("can't create tmp dir %s, error %v", tmpDir, err)
}
pivotDir, err := ioutil.TempDir(tmpDir, ".pivot_root")
if err != nil {
return fmt.Errorf("can't create pivot_root dir %s, error %v", pivotDir, err)
}
defer func() {
errVal := os.Remove(pivotDir)
if err == nil {
err = errVal
}
}()
if err := syscall.PivotRoot(rootfs, pivotDir); err != nil {
return fmt.Errorf("pivot_root %s", err)
}
if err := syscall.Chdir("/"); err != nil {
return fmt.Errorf("chdir / %s", err)
}
// path to pivot dir now changed, update
pivotDir = filepath.Join(pivotBaseDir, filepath.Base(pivotDir))
// Make pivotDir rprivate to make sure any of the unmounts don't
// propagate to parent.
if err := syscall.Mount("", pivotDir, "", syscall.MS_PRIVATE|syscall.MS_REC, ""); err != nil {
return err
}
if err := syscall.Unmount(pivotDir, syscall.MNT_DETACH); err != nil {
return fmt.Errorf("unmount pivot_root dir %s", err)
}
return nil
}
func msMoveRoot(rootfs string) error {
if err := syscall.Mount(rootfs, "/", "", syscall.MS_MOVE, ""); err != nil {
return err
}
if err := syscall.Chroot("."); err != nil {
return err
}
return syscall.Chdir("/")
}
// createIfNotExists creates a file or a directory only if it does not already exist.
func createIfNotExists(path string, isDir bool) error {
if _, err := os.Stat(path); err != nil {
if os.IsNotExist(err) {
if isDir {
return os.MkdirAll(path, 0755)
}
if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
return err
}
f, err := os.OpenFile(path, os.O_CREATE, 0755)
if err != nil {
return err
}
f.Close()
}
}
return nil
}
// remountReadonly will bind over the top of an existing path and ensure that it is read-only.
func remountReadonly(path string) error {
for i := 0; i < 5; i++ {
if err := syscall.Mount("", path, "", syscall.MS_REMOUNT|syscall.MS_RDONLY, ""); err != nil && !os.IsNotExist(err) {
switch err {
case syscall.EINVAL:
// Probably not a mountpoint, use bind-mount
if err := syscall.Mount(path, path, "", syscall.MS_BIND, ""); err != nil {
return err
}
return syscall.Mount(path, path, "", syscall.MS_BIND|syscall.MS_REMOUNT|syscall.MS_RDONLY|syscall.MS_REC|defaultMountFlags, "")
case syscall.EBUSY:
time.Sleep(100 * time.Millisecond)
continue
default:
return err
}
}
return nil
}
return fmt.Errorf("unable to mount %s as readonly max retries reached", path)
}
// maskFile bind mounts /dev/null over the top of the specified path inside a container
// to avoid security issues from processes reading information from non-namespace aware mounts ( proc/kcore ).
func maskFile(path string) error {
if err := syscall.Mount("/dev/null", path, "", syscall.MS_BIND, ""); err != nil && !os.IsNotExist(err) {
return err
}
return nil
}
// writeSystemProperty writes the value to a path under /proc/sys as determined from the key.
// For e.g. net.ipv4.ip_forward translated to /proc/sys/net/ipv4/ip_forward.
func writeSystemProperty(key, value string) error {
keyPath := strings.Replace(key, ".", "/", -1)
return ioutil.WriteFile(path.Join("/proc/sys", keyPath), []byte(value), 0644)
}
func remount(m *configs.Mount, rootfs string) error {
var (
dest = m.Destination
)
if !strings.HasPrefix(dest, rootfs) {
dest = filepath.Join(rootfs, dest)
}
if err := syscall.Mount(m.Source, dest, m.Device, uintptr(m.Flags|syscall.MS_REMOUNT), ""); err != nil {
return err
}
return nil
}
// Do the mount operation followed by additional mounts required to take care
// of propagation flags.
func mountPropagate(m *configs.Mount, rootfs string, mountLabel string) error {
var (
dest = m.Destination
data = label.FormatMountLabel(m.Data, mountLabel)
flags = m.Flags
)
if dest == "/dev" {
flags &= ^syscall.MS_RDONLY
}
if !strings.HasPrefix(dest, rootfs) {
dest = filepath.Join(rootfs, dest)
}
if err := syscall.Mount(m.Source, dest, m.Device, uintptr(flags), data); err != nil {
return err
}
for _, pflag := range m.PropagationFlags {
if err := syscall.Mount("", dest, "", uintptr(pflag), ""); err != nil {
return err
}
}
return nil
}

View file

@ -1,71 +0,0 @@
package seccomp
import (
"fmt"
"github.com/opencontainers/runc/libcontainer/configs"
)
var operators = map[string]configs.Operator{
"SCMP_CMP_NE": configs.NotEqualTo,
"SCMP_CMP_LT": configs.LessThan,
"SCMP_CMP_LE": configs.LessThanOrEqualTo,
"SCMP_CMP_EQ": configs.EqualTo,
"SCMP_CMP_GE": configs.GreaterThanOrEqualTo,
"SCMP_CMP_GT": configs.GreaterThan,
"SCMP_CMP_MASKED_EQ": configs.MaskEqualTo,
}
var actions = map[string]configs.Action{
"SCMP_ACT_KILL": configs.Kill,
"SCMP_ACT_ERRNO": configs.Errno,
"SCMP_ACT_TRAP": configs.Trap,
"SCMP_ACT_ALLOW": configs.Allow,
"SCMP_ACT_TRACE": configs.Trace,
}
var archs = map[string]string{
"SCMP_ARCH_X86": "x86",
"SCMP_ARCH_X86_64": "amd64",
"SCMP_ARCH_X32": "x32",
"SCMP_ARCH_ARM": "arm",
"SCMP_ARCH_AARCH64": "arm64",
"SCMP_ARCH_MIPS": "mips",
"SCMP_ARCH_MIPS64": "mips64",
"SCMP_ARCH_MIPS64N32": "mips64n32",
"SCMP_ARCH_MIPSEL": "mipsel",
"SCMP_ARCH_MIPSEL64": "mipsel64",
"SCMP_ARCH_MIPSEL64N32": "mipsel64n32",
}
// ConvertStringToOperator converts a string into a Seccomp comparison operator.
// Comparison operators use the names they are assigned by Libseccomp's header.
// Attempting to convert a string that is not a valid operator results in an
// error.
func ConvertStringToOperator(in string) (configs.Operator, error) {
if op, ok := operators[in]; ok == true {
return op, nil
}
return 0, fmt.Errorf("string %s is not a valid operator for seccomp", in)
}
// ConvertStringToAction converts a string into a Seccomp rule match action.
// Actions use the names they are assigned in Libseccomp's header, though some
// (notable, SCMP_ACT_TRACE) are not available in this implementation and will
// return errors.
// Attempting to convert a string that is not a valid action results in an
// error.
func ConvertStringToAction(in string) (configs.Action, error) {
if act, ok := actions[in]; ok == true {
return act, nil
}
return 0, fmt.Errorf("string %s is not a valid action for seccomp", in)
}
// ConvertStringToArch converts a string into a Seccomp comparison arch.
func ConvertStringToArch(in string) (string, error) {
if arch, ok := archs[in]; ok == true {
return arch, nil
}
return "", fmt.Errorf("string %s is not a valid arch for seccomp", in)
}

View file

@ -1,231 +0,0 @@
// +build linux,cgo,seccomp
package seccomp
import (
"bufio"
"fmt"
"log"
"os"
"strings"
"syscall"
"github.com/opencontainers/runc/libcontainer/configs"
libseccomp "github.com/seccomp/libseccomp-golang"
)
var (
actAllow = libseccomp.ActAllow
actTrap = libseccomp.ActTrap
actKill = libseccomp.ActKill
actTrace = libseccomp.ActTrace.SetReturnCode(int16(syscall.EPERM))
actErrno = libseccomp.ActErrno.SetReturnCode(int16(syscall.EPERM))
// SeccompModeFilter refers to the syscall argument SECCOMP_MODE_FILTER.
SeccompModeFilter = uintptr(2)
)
// Filters given syscalls in a container, preventing them from being used
// Started in the container init process, and carried over to all child processes
// Setns calls, however, require a separate invocation, as they are not children
// of the init until they join the namespace
func InitSeccomp(config *configs.Seccomp) error {
if config == nil {
return fmt.Errorf("cannot initialize Seccomp - nil config passed")
}
defaultAction, err := getAction(config.DefaultAction)
if err != nil {
return fmt.Errorf("error initializing seccomp - invalid default action")
}
filter, err := libseccomp.NewFilter(defaultAction)
if err != nil {
return fmt.Errorf("error creating filter: %s", err)
}
// Add extra architectures
for _, arch := range config.Architectures {
scmpArch, err := libseccomp.GetArchFromString(arch)
if err != nil {
return err
}
if err := filter.AddArch(scmpArch); err != nil {
return err
}
}
// Unset no new privs bit
if err := filter.SetNoNewPrivsBit(false); err != nil {
return fmt.Errorf("error setting no new privileges: %s", err)
}
// Add a rule for each syscall
for _, call := range config.Syscalls {
if call == nil {
return fmt.Errorf("encountered nil syscall while initializing Seccomp")
}
if err = matchCall(filter, call); err != nil {
return err
}
}
if err = filter.Load(); err != nil {
return fmt.Errorf("error loading seccomp filter into kernel: %s", err)
}
return nil
}
// IsEnabled returns if the kernel has been configured to support seccomp.
func IsEnabled() bool {
// Try to read from /proc/self/status for kernels > 3.8
s, err := parseStatusFile("/proc/self/status")
if err != nil {
// Check if Seccomp is supported, via CONFIG_SECCOMP.
if _, _, err := syscall.RawSyscall(syscall.SYS_PRCTL, syscall.PR_GET_SECCOMP, 0, 0); err != syscall.EINVAL {
// Make sure the kernel has CONFIG_SECCOMP_FILTER.
if _, _, err := syscall.RawSyscall(syscall.SYS_PRCTL, syscall.PR_SET_SECCOMP, SeccompModeFilter, 0); err != syscall.EINVAL {
return true
}
}
return false
}
_, ok := s["Seccomp"]
return ok
}
// Convert Libcontainer Action to Libseccomp ScmpAction
func getAction(act configs.Action) (libseccomp.ScmpAction, error) {
switch act {
case configs.Kill:
return actKill, nil
case configs.Errno:
return actErrno, nil
case configs.Trap:
return actTrap, nil
case configs.Allow:
return actAllow, nil
case configs.Trace:
return actTrace, nil
default:
return libseccomp.ActInvalid, fmt.Errorf("invalid action, cannot use in rule")
}
}
// Convert Libcontainer Operator to Libseccomp ScmpCompareOp
func getOperator(op configs.Operator) (libseccomp.ScmpCompareOp, error) {
switch op {
case configs.EqualTo:
return libseccomp.CompareEqual, nil
case configs.NotEqualTo:
return libseccomp.CompareNotEqual, nil
case configs.GreaterThan:
return libseccomp.CompareGreater, nil
case configs.GreaterThanOrEqualTo:
return libseccomp.CompareGreaterEqual, nil
case configs.LessThan:
return libseccomp.CompareLess, nil
case configs.LessThanOrEqualTo:
return libseccomp.CompareLessOrEqual, nil
case configs.MaskEqualTo:
return libseccomp.CompareMaskedEqual, nil
default:
return libseccomp.CompareInvalid, fmt.Errorf("invalid operator, cannot use in rule")
}
}
// Convert Libcontainer Arg to Libseccomp ScmpCondition
func getCondition(arg *configs.Arg) (libseccomp.ScmpCondition, error) {
cond := libseccomp.ScmpCondition{}
if arg == nil {
return cond, fmt.Errorf("cannot convert nil to syscall condition")
}
op, err := getOperator(arg.Op)
if err != nil {
return cond, err
}
return libseccomp.MakeCondition(arg.Index, op, arg.Value, arg.ValueTwo)
}
// Add a rule to match a single syscall
func matchCall(filter *libseccomp.ScmpFilter, call *configs.Syscall) error {
if call == nil || filter == nil {
return fmt.Errorf("cannot use nil as syscall to block")
}
if len(call.Name) == 0 {
return fmt.Errorf("empty string is not a valid syscall")
}
// If we can't resolve the syscall, assume it's not supported on this kernel
// Ignore it, don't error out
callNum, err := libseccomp.GetSyscallFromName(call.Name)
if err != nil {
log.Printf("Error resolving syscall name %s: %s - ignoring syscall.", call.Name, err)
return nil
}
// Convert the call's action to the libseccomp equivalent
callAct, err := getAction(call.Action)
if err != nil {
return err
}
// Unconditional match - just add the rule
if len(call.Args) == 0 {
if err = filter.AddRule(callNum, callAct); err != nil {
return err
}
} else {
// Conditional match - convert the per-arg rules into library format
conditions := []libseccomp.ScmpCondition{}
for _, cond := range call.Args {
newCond, err := getCondition(cond)
if err != nil {
return err
}
conditions = append(conditions, newCond)
}
if err = filter.AddRuleConditional(callNum, callAct, conditions); err != nil {
return err
}
}
return nil
}
func parseStatusFile(path string) (map[string]string, error) {
f, err := os.Open(path)
if err != nil {
return nil, err
}
defer f.Close()
s := bufio.NewScanner(f)
status := make(map[string]string)
for s.Scan() {
if err := s.Err(); err != nil {
return nil, err
}
text := s.Text()
parts := strings.Split(text, ":")
if len(parts) <= 1 {
continue
}
status[parts[0]] = parts[1]
}
return status, nil
}

View file

@ -1,24 +0,0 @@
// +build !linux !cgo !seccomp
package seccomp
import (
"errors"
"github.com/opencontainers/runc/libcontainer/configs"
)
var ErrSeccompNotEnabled = errors.New("seccomp: config provided but seccomp not supported")
// Seccomp not supported, do nothing
func InitSeccomp(config *configs.Seccomp) error {
if config != nil {
return ErrSeccompNotEnabled
}
return nil
}
// IsEnabled returns false, because it is not supported.
func IsEnabled() bool {
return false
}

View file

@ -1,11 +0,0 @@
// +build linux,go1.5
package libcontainer
import "syscall"
// Set the GidMappingsEnableSetgroups member to true, so the process's
// setgroups proc entry wont be set to 'deny' if GidMappings are set
func enableSetgroups(sys *syscall.SysProcAttr) {
sys.GidMappingsEnableSetgroups = true
}

View file

@ -1,56 +0,0 @@
// +build linux
package libcontainer
import (
"fmt"
"os"
"github.com/opencontainers/runc/libcontainer/apparmor"
"github.com/opencontainers/runc/libcontainer/keys"
"github.com/opencontainers/runc/libcontainer/label"
"github.com/opencontainers/runc/libcontainer/seccomp"
"github.com/opencontainers/runc/libcontainer/system"
)
// linuxSetnsInit performs the container's initialization for running a new process
// inside an existing container.
type linuxSetnsInit struct {
config *initConfig
}
func (l *linuxSetnsInit) getSessionRingName() string {
return fmt.Sprintf("_ses.%s", l.config.ContainerId)
}
func (l *linuxSetnsInit) Init() error {
// do not inherit the parent's session keyring
if _, err := keyctl.JoinSessionKeyring(l.getSessionRingName()); err != nil {
return err
}
if err := setupRlimits(l.config.Config); err != nil {
return err
}
if l.config.NoNewPrivileges {
if err := system.Prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil {
return err
}
}
if l.config.Config.Seccomp != nil {
if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil {
return err
}
}
if err := finalizeNamespace(l.config); err != nil {
return err
}
if err := apparmor.ApplyProfile(l.config.AppArmorProfile); err != nil {
return err
}
if l.config.ProcessLabel != "" {
if err := label.SetProcessLabel(l.config.ProcessLabel); err != nil {
return err
}
}
return system.Execv(l.config.Args[0], l.config.Args[0:], os.Environ())
}

View file

@ -1,27 +0,0 @@
package stacktrace
import "runtime"
// Caputure captures a stacktrace for the current calling go program
//
// skip is the number of frames to skip
func Capture(userSkip int) Stacktrace {
var (
skip = userSkip + 1 // add one for our own function
frames []Frame
prevPc uintptr = 0
)
for i := skip; ; i++ {
pc, file, line, ok := runtime.Caller(i)
//detect if caller is repeated to avoid loop, gccgo
//currently runs into a loop without this check
if !ok || pc == prevPc {
break
}
frames = append(frames, NewFrame(pc, file, line))
prevPc = pc
}
return Stacktrace{
Frames: frames,
}
}

View file

@ -1,38 +0,0 @@
package stacktrace
import (
"path/filepath"
"runtime"
"strings"
)
// NewFrame returns a new stack frame for the provided information
func NewFrame(pc uintptr, file string, line int) Frame {
fn := runtime.FuncForPC(pc)
if fn == nil {
return Frame{}
}
pack, name := parseFunctionName(fn.Name())
return Frame{
Line: line,
File: filepath.Base(file),
Package: pack,
Function: name,
}
}
func parseFunctionName(name string) (string, string) {
i := strings.LastIndex(name, ".")
if i == -1 {
return "", name
}
return name[:i], name[i+1:]
}
// Frame contains all the information for a stack frame within a go program
type Frame struct {
File string
Function string
Package string
Line int
}

View file

@ -1,5 +0,0 @@
package stacktrace
type Stacktrace struct {
Frames []Frame
}

View file

@ -1,150 +0,0 @@
// +build linux
package libcontainer
import (
"fmt"
"io"
"os"
"syscall"
"github.com/opencontainers/runc/libcontainer/apparmor"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/keys"
"github.com/opencontainers/runc/libcontainer/label"
"github.com/opencontainers/runc/libcontainer/seccomp"
"github.com/opencontainers/runc/libcontainer/system"
)
type linuxStandardInit struct {
pipe io.ReadWriter
parentPid int
config *initConfig
}
func (l *linuxStandardInit) getSessionRingParams() (string, uint32, uint32) {
var newperms uint32
if l.config.Config.Namespaces.Contains(configs.NEWUSER) {
// with user ns we need 'other' search permissions
newperms = 0x8
} else {
// without user ns we need 'UID' search permissions
newperms = 0x80000
}
// create a unique per session container name that we can
// join in setns; however, other containers can also join it
return fmt.Sprintf("_ses.%s", l.config.ContainerId), 0xffffffff, newperms
}
// PR_SET_NO_NEW_PRIVS isn't exposed in Golang so we define it ourselves copying the value
// the kernel
const PR_SET_NO_NEW_PRIVS = 0x26
func (l *linuxStandardInit) Init() error {
ringname, keepperms, newperms := l.getSessionRingParams()
// do not inherit the parent's session keyring
sessKeyId, err := keyctl.JoinSessionKeyring(ringname)
if err != nil {
return err
}
// make session keyring searcheable
if err := keyctl.ModKeyringPerm(sessKeyId, keepperms, newperms); err != nil {
return err
}
var console *linuxConsole
if l.config.Console != "" {
console = newConsoleFromPath(l.config.Console)
if err := console.dupStdio(); err != nil {
return err
}
}
if console != nil {
if err := system.Setctty(); err != nil {
return err
}
}
if err := setupNetwork(l.config); err != nil {
return err
}
if err := setupRoute(l.config.Config); err != nil {
return err
}
if err := setupRlimits(l.config.Config); err != nil {
return err
}
label.Init()
// InitializeMountNamespace() can be executed only for a new mount namespace
if l.config.Config.Namespaces.Contains(configs.NEWNS) {
if err := setupRootfs(l.config.Config, console, l.pipe); err != nil {
return err
}
}
if hostname := l.config.Config.Hostname; hostname != "" {
if err := syscall.Sethostname([]byte(hostname)); err != nil {
return err
}
}
if err := apparmor.ApplyProfile(l.config.AppArmorProfile); err != nil {
return err
}
if err := label.SetProcessLabel(l.config.ProcessLabel); err != nil {
return err
}
for key, value := range l.config.Config.Sysctl {
if err := writeSystemProperty(key, value); err != nil {
return err
}
}
for _, path := range l.config.Config.ReadonlyPaths {
if err := remountReadonly(path); err != nil {
return err
}
}
for _, path := range l.config.Config.MaskPaths {
if err := maskFile(path); err != nil {
return err
}
}
pdeath, err := system.GetParentDeathSignal()
if err != nil {
return err
}
if l.config.NoNewPrivileges {
if err := system.Prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil {
return err
}
}
// Tell our parent that we're ready to Execv. This must be done before the
// Seccomp rules have been applied, because we need to be able to read and
// write to a socket.
if err := syncParentReady(l.pipe); err != nil {
return err
}
if l.config.Config.Seccomp != nil {
if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil {
return err
}
}
if err := finalizeNamespace(l.config); err != nil {
return err
}
// finalizeNamespace can change user/group which clears the parent death
// signal, so we restore it here.
if err := pdeath.Restore(); err != nil {
return err
}
// compare the parent from the inital start of the init process and make sure that it did not change.
// if the parent changes that means it died and we were reparened to something else so we should
// just kill ourself and not cause problems for someone else.
if syscall.Getppid() != l.parentPid {
return syscall.Kill(syscall.Getpid(), syscall.SIGKILL)
}
return system.Execv(l.config.Args[0], l.config.Args[0:], os.Environ())
}

View file

@ -1,226 +0,0 @@
// +build linux
package libcontainer
import (
"fmt"
"os"
"path/filepath"
"github.com/Sirupsen/logrus"
"github.com/opencontainers/runc/libcontainer/configs"
)
func newStateTransitionError(from, to containerState) error {
return &stateTransitionError{
From: from.status().String(),
To: to.status().String(),
}
}
// stateTransitionError is returned when an invalid state transition happens from one
// state to another.
type stateTransitionError struct {
From string
To string
}
func (s *stateTransitionError) Error() string {
return fmt.Sprintf("invalid state transition from %s to %s", s.From, s.To)
}
type containerState interface {
transition(containerState) error
destroy() error
status() Status
}
func destroy(c *linuxContainer) error {
if !c.config.Namespaces.Contains(configs.NEWPID) {
if err := killCgroupProcesses(c.cgroupManager); err != nil {
logrus.Warn(err)
}
}
err := c.cgroupManager.Destroy()
if rerr := os.RemoveAll(c.root); err == nil {
err = rerr
}
c.initProcess = nil
if herr := runPoststopHooks(c); err == nil {
err = herr
}
c.state = &stoppedState{c: c}
return err
}
func runPoststopHooks(c *linuxContainer) error {
if c.config.Hooks != nil {
s := configs.HookState{
Version: c.config.Version,
ID: c.id,
Root: c.config.Rootfs,
}
for _, hook := range c.config.Hooks.Poststop {
if err := hook.Run(s); err != nil {
return err
}
}
}
return nil
}
// stoppedState represents a container is a stopped/destroyed state.
type stoppedState struct {
c *linuxContainer
}
func (b *stoppedState) status() Status {
return Destroyed
}
func (b *stoppedState) transition(s containerState) error {
switch s.(type) {
case *runningState:
b.c.state = s
return nil
case *restoredState:
b.c.state = s
return nil
case *stoppedState:
return nil
}
return newStateTransitionError(b, s)
}
func (b *stoppedState) destroy() error {
return destroy(b.c)
}
// runningState represents a container that is currently running.
type runningState struct {
c *linuxContainer
}
func (r *runningState) status() Status {
return Running
}
func (r *runningState) transition(s containerState) error {
switch s.(type) {
case *stoppedState:
running, err := r.c.isRunning()
if err != nil {
return err
}
if running {
return newGenericError(fmt.Errorf("container still running"), ContainerNotStopped)
}
r.c.state = s
return nil
case *pausedState:
r.c.state = s
return nil
case *runningState:
return nil
}
return newStateTransitionError(r, s)
}
func (r *runningState) destroy() error {
running, err := r.c.isRunning()
if err != nil {
return err
}
if running {
return newGenericError(fmt.Errorf("container is not destroyed"), ContainerNotStopped)
}
return destroy(r.c)
}
// pausedState represents a container that is currently pause. It cannot be destroyed in a
// paused state and must transition back to running first.
type pausedState struct {
c *linuxContainer
}
func (p *pausedState) status() Status {
return Paused
}
func (p *pausedState) transition(s containerState) error {
switch s.(type) {
case *runningState, *stoppedState:
p.c.state = s
return nil
case *pausedState:
return nil
}
return newStateTransitionError(p, s)
}
func (p *pausedState) destroy() error {
isRunning, err := p.c.isRunning()
if err != nil {
return err
}
if !isRunning {
if err := p.c.cgroupManager.Freeze(configs.Thawed); err != nil {
return err
}
return destroy(p.c)
}
return newGenericError(fmt.Errorf("container is paused"), ContainerPaused)
}
// restoredState is the same as the running state but also has accociated checkpoint
// information that maybe need destroyed when the container is stopped and destory is called.
type restoredState struct {
imageDir string
c *linuxContainer
}
func (r *restoredState) status() Status {
return Running
}
func (r *restoredState) transition(s containerState) error {
switch s.(type) {
case *stoppedState:
return nil
case *runningState:
return nil
}
return newStateTransitionError(r, s)
}
func (r *restoredState) destroy() error {
if _, err := os.Stat(filepath.Join(r.c.root, "checkpoint")); err != nil {
if !os.IsNotExist(err) {
return err
}
}
return destroy(r.c)
}
// createdState is used whenever a container is restored, loaded, or setting additional
// processes inside and it should not be destroyed when it is exiting.
type createdState struct {
c *linuxContainer
s Status
}
func (n *createdState) status() Status {
return n.s
}
func (n *createdState) transition(s containerState) error {
n.c.state = s
return nil
}
func (n *createdState) destroy() error {
if err := n.c.refreshState(); err != nil {
return err
}
return n.c.state.destroy()
}

View file

@ -1,15 +0,0 @@
package libcontainer
type NetworkInterface struct {
// Name is the name of the network interface.
Name string
RxBytes uint64
RxPackets uint64
RxErrors uint64
RxDropped uint64
TxBytes uint64
TxPackets uint64
TxErrors uint64
TxDropped uint64
}

View file

@ -1,5 +0,0 @@
package libcontainer
type Stats struct {
Interfaces []*NetworkInterface
}

View file

@ -1,8 +0,0 @@
package libcontainer
import "github.com/opencontainers/runc/libcontainer/cgroups"
type Stats struct {
Interfaces []*NetworkInterface
CgroupStats *cgroups.Stats
}

View file

@ -1,5 +0,0 @@
package libcontainer
type Stats struct {
Interfaces []*NetworkInterface
}

View file

@ -1,86 +0,0 @@
package utils
import (
"crypto/rand"
"encoding/hex"
"encoding/json"
"io"
"os"
"path/filepath"
"syscall"
)
const (
exitSignalOffset = 128
)
// GenerateRandomName returns a new name joined with a prefix. This size
// specified is used to truncate the randomly generated value
func GenerateRandomName(prefix string, size int) (string, error) {
id := make([]byte, 32)
if _, err := io.ReadFull(rand.Reader, id); err != nil {
return "", err
}
if size > 64 {
size = 64
}
return prefix + hex.EncodeToString(id)[:size], nil
}
// ResolveRootfs ensures that the current working directory is
// not a symlink and returns the absolute path to the rootfs
func ResolveRootfs(uncleanRootfs string) (string, error) {
rootfs, err := filepath.Abs(uncleanRootfs)
if err != nil {
return "", err
}
return filepath.EvalSymlinks(rootfs)
}
// ExitStatus returns the correct exit status for a process based on if it
// was signaled or exited cleanly
func ExitStatus(status syscall.WaitStatus) int {
if status.Signaled() {
return exitSignalOffset + int(status.Signal())
}
return status.ExitStatus()
}
// WriteJSON writes the provided struct v to w using standard json marshaling
func WriteJSON(w io.Writer, v interface{}) error {
data, err := json.Marshal(v)
if err != nil {
return err
}
_, err = w.Write(data)
return err
}
// CleanPath makes a path safe for use with filepath.Join. This is done by not
// only cleaning the path, but also (if the path is relative) adding a leading
// '/' and cleaning it (then removing the leading '/'). This ensures that a
// path resulting from prepending another path will always resolve to lexically
// be a subdirectory of the prefixed path. This is all done lexically, so paths
// that include symlinks won't be safe as a result of using CleanPath.
func CleanPath(path string) string {
// Deal with empty strings nicely.
if path == "" {
return ""
}
// Ensure that all paths are cleaned (especially problematic ones like
// "/../../../../../" which can cause lots of issues).
path = filepath.Clean(path)
// If the path isn't absolute, we need to do more processing to fix paths
// such as "../../../../<etc>/some/path". We also shouldn't convert absolute
// paths to relative ones.
if !filepath.IsAbs(path) {
path = filepath.Clean(string(os.PathSeparator) + path)
// This can't fail, as (by definition) all paths are relative to root.
path, _ = filepath.Rel(string(os.PathSeparator), path)
}
// Clean the path again for good measure.
return filepath.Clean(path)
}

View file

@ -1,33 +0,0 @@
// +build !windows
package utils
import (
"io/ioutil"
"strconv"
"syscall"
)
func CloseExecFrom(minFd int) error {
fdList, err := ioutil.ReadDir("/proc/self/fd")
if err != nil {
return err
}
for _, fi := range fdList {
fd, err := strconv.Atoi(fi.Name())
if err != nil {
// ignore non-numeric file names
continue
}
if fd < minFd {
// ignore descriptors lower than our specified minimum
continue
}
// intentionally ignore errors from syscall.CloseOnExec
syscall.CloseOnExec(fd)
// the cases where this might fail are basically file descriptors that have already been closed (including and especially the one that was created when ioutil.ReadDir did the "opendir" syscall)
}
return nil
}