Pārlūkot izejas kodu

merge master

Docker-DCO-1.1-Signed-off-by: Victor Vieux <victor.vieux@docker.com> (github: vieux)
Victor Vieux 11 gadi atpakaļ
vecāks
revīzija
51d280f944

+ 1 - 1
MAINTAINERS

@@ -6,4 +6,4 @@ Michael Crosby <michael@crosbymichael.com> (@crosbymichael)
 api.go: Victor Vieux <victor@dotcloud.com> (@vieux)
 Dockerfile: Tianon Gravi <admwiggin@gmail.com> (@tianon)
 Makefile: Tianon Gravi <admwiggin@gmail.com> (@tianon)
-Vagrantfile: Daniel Mizyrycki <daniel@dotcloud.com> (@mzdaniel)
+Vagrantfile: Cristian Staretu <cristian.staretu@gmail.com> (@unclejack)

+ 4 - 1
Makefile

@@ -1,4 +1,4 @@
-.PHONY: all binary build cross default docs docs-build docs-shell shell test
+.PHONY: all binary build cross default docs docs-build docs-shell shell test test-integration
 
 GIT_BRANCH := $(shell git rev-parse --abbrev-ref HEAD)
 DOCKER_IMAGE := docker:$(GIT_BRANCH)
@@ -25,6 +25,9 @@ docs-shell: docs-build
 test: build
 	$(DOCKER_RUN_DOCKER) hack/make.sh test test-integration
 
+test-integration: build
+	$(DOCKER_RUN_DOCKER) hack/make.sh test-integration
+
 shell: build
 	$(DOCKER_RUN_DOCKER) bash
 

+ 13 - 3
Vagrantfile

@@ -8,10 +8,9 @@ AWS_BOX_URI = ENV['BOX_URI'] || "https://github.com/mitchellh/vagrant-aws/raw/ma
 AWS_REGION = ENV['AWS_REGION'] || "us-east-1"
 AWS_AMI = ENV['AWS_AMI'] || "ami-69f5a900"
 AWS_INSTANCE_TYPE = ENV['AWS_INSTANCE_TYPE'] || 't1.micro'
-
 FORWARD_DOCKER_PORTS = ENV['FORWARD_DOCKER_PORTS']
-
-SSH_PRIVKEY_PATH = ENV["SSH_PRIVKEY_PATH"]
+SSH_PRIVKEY_PATH = ENV['SSH_PRIVKEY_PATH']
+PRIVATE_NETWORK = ENV['PRIVATE_NETWORK']
 
 # A script to upgrade from the 12.04 kernel to the raring backport kernel (3.8)
 # and install docker.
@@ -174,3 +173,14 @@ if !FORWARD_DOCKER_PORTS.nil?
     end
   end
 end
+
+if !PRIVATE_NETWORK.nil?
+  Vagrant::VERSION < "1.1.0" and Vagrant::Config.run do |config|
+    config.vm.network :hostonly, PRIVATE_NETWORK
+  end
+
+  Vagrant::VERSION >= "1.1.0" and Vagrant.configure("2") do |config|
+    config.vm.network "private_network", ip: PRIVATE_NETWORK
+  end
+end
+

+ 1 - 1
commands.go

@@ -1678,7 +1678,7 @@ func (cli *DockerCli) CmdSearch(args ...string) error {
 	v := url.Values{}
 	v.Set("term", cmd.Arg(0))
 
-	body, _, err := readBody(cli.call("GET", "/images/search?"+v.Encode(), nil, false))
+	body, _, err := readBody(cli.call("GET", "/images/search?"+v.Encode(), nil, true))
 
 	if err != nil {
 		return err

+ 15 - 13
config.go

@@ -23,29 +23,31 @@ type DaemonConfig struct {
 
 // ConfigFromJob creates and returns a new DaemonConfig object
 // by parsing the contents of a job's environment.
-func ConfigFromJob(job *engine.Job) *DaemonConfig {
-	var config DaemonConfig
-	config.Pidfile = job.Getenv("Pidfile")
-	config.Root = job.Getenv("Root")
-	config.AutoRestart = job.GetenvBool("AutoRestart")
+func DaemonConfigFromJob(job *engine.Job) *DaemonConfig {
+	config := &DaemonConfig{
+		Pidfile:                     job.Getenv("Pidfile"),
+		Root:                        job.Getenv("Root"),
+		AutoRestart:                 job.GetenvBool("AutoRestart"),
+		EnableIptables:              job.GetenvBool("EnableIptables"),
+		EnableIpForward:             job.GetenvBool("EnableIpForward"),
+		BridgeIp:                    job.Getenv("BridgeIp"),
+		DefaultIp:                   net.ParseIP(job.Getenv("DefaultIp")),
+		InterContainerCommunication: job.GetenvBool("InterContainerCommunication"),
+		GraphDriver:                 job.Getenv("GraphDriver"),
+	}
 	if dns := job.GetenvList("Dns"); dns != nil {
 		config.Dns = dns
 	}
-	config.EnableIptables = job.GetenvBool("EnableIptables")
-	config.EnableIpForward = job.GetenvBool("EnableIpForward")
 	if br := job.Getenv("BridgeIface"); br != "" {
 		config.BridgeIface = br
 	} else {
 		config.BridgeIface = DefaultNetworkBridge
 	}
-	config.BridgeIp = job.Getenv("BridgeIp")
-	config.DefaultIp = net.ParseIP(job.Getenv("DefaultIp"))
-	config.InterContainerCommunication = job.GetenvBool("InterContainerCommunication")
-	config.GraphDriver = job.Getenv("GraphDriver")
-	if mtu := job.GetenvInt("Mtu"); mtu != -1 {
+	if mtu := job.GetenvInt("Mtu"); mtu != 0 {
 		config.Mtu = mtu
 	} else {
 		config.Mtu = DefaultNetworkMtu
 	}
-	return &config
+
+	return config
 }

+ 58 - 0
container.go

@@ -104,6 +104,46 @@ type Config struct {
 	NetworkDisabled bool
 }
 
+func ContainerConfigFromJob(job *engine.Job) *Config {
+	config := &Config{
+		Hostname:        job.Getenv("Hostname"),
+		Domainname:      job.Getenv("Domainname"),
+		User:            job.Getenv("User"),
+		Memory:          job.GetenvInt64("Memory"),
+		MemorySwap:      job.GetenvInt64("MemorySwap"),
+		CpuShares:       job.GetenvInt64("CpuShares"),
+		AttachStdin:     job.GetenvBool("AttachStdin"),
+		AttachStdout:    job.GetenvBool("AttachStdout"),
+		AttachStderr:    job.GetenvBool("AttachStderr"),
+		Tty:             job.GetenvBool("Tty"),
+		OpenStdin:       job.GetenvBool("OpenStdin"),
+		StdinOnce:       job.GetenvBool("StdinOnce"),
+		Image:           job.Getenv("Image"),
+		VolumesFrom:     job.Getenv("VolumesFrom"),
+		WorkingDir:      job.Getenv("WorkingDir"),
+		NetworkDisabled: job.GetenvBool("NetworkDisabled"),
+	}
+	job.GetenvJson("ExposedPorts", &config.ExposedPorts)
+	job.GetenvJson("Volumes", &config.Volumes)
+	if PortSpecs := job.GetenvList("PortSpecs"); PortSpecs != nil {
+		config.PortSpecs = PortSpecs
+	}
+	if Env := job.GetenvList("Env"); Env != nil {
+		config.Env = Env
+	}
+	if Cmd := job.GetenvList("Cmd"); Cmd != nil {
+		config.Cmd = Cmd
+	}
+	if Dns := job.GetenvList("Dns"); Dns != nil {
+		config.Dns = Dns
+	}
+	if Entrypoint := job.GetenvList("Entrypoint"); Entrypoint != nil {
+		config.Entrypoint = Entrypoint
+	}
+
+	return config
+}
+
 type HostConfig struct {
 	Binds           []string
 	ContainerIDFile string
@@ -114,6 +154,24 @@ type HostConfig struct {
 	PublishAllPorts bool
 }
 
+func ContainerHostConfigFromJob(job *engine.Job) *HostConfig {
+	hostConfig := &HostConfig{
+		ContainerIDFile: job.Getenv("ContainerIDFile"),
+		Privileged:      job.GetenvBool("Privileged"),
+		PublishAllPorts: job.GetenvBool("PublishAllPorts"),
+	}
+	job.GetenvJson("LxcConf", &hostConfig.LxcConf)
+	job.GetenvJson("PortBindings", &hostConfig.PortBindings)
+	if Binds := job.GetenvList("Binds"); Binds != nil {
+		hostConfig.Binds = Binds
+	}
+	if Links := job.GetenvList("Links"); Links != nil {
+		hostConfig.Links = Links
+	}
+
+	return hostConfig
+}
+
 type BindMap struct {
 	SrcPath string
 	DstPath string

+ 2 - 2
contrib/init/systemd/docker.service

@@ -1,11 +1,11 @@
 [Unit]
-Description=Docker Application Container Engine 
+Description=Docker Application Container Engine
 Documentation=http://docs.docker.io
 After=network.target
 
 [Service]
-ExecStartPre=/bin/mount --make-rprivate /
 ExecStart=/usr/bin/docker -d
+Restart=on-failure
 
 [Install]
 WantedBy=multi-user.target

+ 1 - 0
contrib/init/systemd/socket-activation/docker.service

@@ -5,6 +5,7 @@ After=network.target
 
 [Service]
 ExecStart=/usr/bin/docker -d -H fd://
+Restart=on-failure
 
 [Install]
 WantedBy=multi-user.target

+ 7 - 0
contrib/mkimage-rinse.sh

@@ -1,4 +1,11 @@
 #!/usr/bin/env bash
+#
+# Create a base CentOS Docker image.
+
+# This script is useful on systems with rinse available (e.g.,
+# building a CentOS image on Debian).  See contrib/mkimage-yum.sh for
+# a way to build CentOS images on systems with yum installed.
+
 set -e
 
 repo="$1"

+ 90 - 0
contrib/mkimage-yum.sh

@@ -0,0 +1,90 @@
+#!/usr/bin/env bash
+#
+# Create a base CentOS Docker image.
+#
+# This script is useful on systems with yum installed (e.g., building
+# a CentOS image on CentOS).  See contrib/mkimage-rinse.sh for a way
+# to build CentOS images on other systems.
+
+usage() {
+    cat <<EOOPTS
+$(basename $0) [OPTIONS] <name>
+OPTIONS:
+  -y <yumconf>  The path to the yum config to install packages from. The
+                default is /etc/yum.conf.
+EOOPTS
+    exit 1
+}
+
+# option defaults
+yum_config=/etc/yum.conf
+while getopts ":y:h" opt; do
+    case $opt in
+        y)
+            yum_config=$OPTARG
+            ;;
+        h)
+            usage
+            ;;
+        \?)
+            echo "Invalid option: -$OPTARG"
+            usage
+            ;;
+    esac
+done
+shift $((OPTIND - 1))
+name=$1
+
+if [[ -z $name ]]; then
+    usage
+fi
+
+#--------------------
+
+target=$(mktemp -d --tmpdir $(basename $0).XXXXXX)
+
+set -x
+
+for dev in console null zero urandom; do
+    /sbin/MAKEDEV -d "$target"/dev -x $dev
+done
+
+yum -c "$yum_config" --installroot="$target" --setopt=tsflags=nodocs \
+    --setopt=group_package_types=mandatory -y groupinstall Core
+yum -c "$yum_config" --installroot="$mount" -y clean all
+
+cat > "$target"/etc/sysconfig/network <<EOF
+NETWORKING=yes
+HOSTNAME=localhost.localdomain
+EOF
+
+# effectively: febootstrap-minimize --keep-zoneinfo --keep-rpmdb
+# --keep-services "$target".  Stolen from mkimage-rinse.sh
+#  locales
+rm -rf "$target"/usr/{{lib,share}/locale,{lib,lib64}/gconv,bin/localedef,sbin/build-locale-archive}
+#  docs
+rm -rf "$target"/usr/share/{man,doc,info,gnome/help}
+#  cracklib
+rm -rf "$target"/usr/share/cracklib
+#  i18n
+rm -rf "$target"/usr/share/i18n
+#  sln
+rm -rf "$target"/sbin/sln
+#  ldconfig
+rm -rf "$target"/etc/ld.so.cache
+rm -rf "$target"/var/cache/ldconfig/*
+
+version=
+if [ -r "$target"/etc/redhat-release ]; then
+    version="$(sed 's/^[^0-9\]*\([0-9.]\+\).*$/\1/' /etc/redhat-release)"
+fi
+
+if [ -z "$version" ]; then
+    echo >&2 "warning: cannot autodetect OS version, using '$name' as tag"
+    version=$name
+fi
+
+tar --numeric-owner -c -C "$target" . | docker import - $name:$version
+docker run -i -t $name:$version echo success
+
+rm -rf "$target"

+ 4 - 1
docs/sources/articles/baseimages.rst

@@ -37,7 +37,10 @@ There are more example scripts for creating base images in the
 Docker GitHub Repo:
 
 * `BusyBox <https://github.com/dotcloud/docker/blob/master/contrib/mkimage-busybox.sh>`_
-* `CentOS / Scientific Linux CERN (SLC)
+* CentOS / Scientific Linux CERN (SLC) `on Debian/Ubuntu
   <https://github.com/dotcloud/docker/blob/master/contrib/mkimage-rinse.sh>`_
+  or
+  `on CentOS/RHEL/SLC/etc.
+  <https://github.com/dotcloud/docker/blob/master/contrib/mkimage-yum.sh>`_
 * `Debian / Ubuntu
   <https://github.com/dotcloud/docker/blob/master/contrib/mkimage-debootstrap.sh>`_

+ 1 - 0
docs/sources/articles/index.rst

@@ -12,3 +12,4 @@ Articles
 
    security
    baseimages
+   runmetrics

+ 463 - 0
docs/sources/articles/runmetrics.rst

@@ -0,0 +1,463 @@
+:title: Runtime Metrics
+:description: Measure the behavior of running containers
+:keywords: docker, metrics, CPU, memory, disk, IO, run, runtime
+
+.. _run_metrics:
+
+
+Runtime Metrics
+===============
+
+Linux Containers rely on `control groups
+<https://www.kernel.org/doc/Documentation/cgroups/cgroups.txt>`_ which
+not only track groups of processes, but also expose metrics about CPU,
+memory, and block I/O usage. You can access those metrics and obtain
+network usage metrics as well. This is relevant for "pure" LXC
+containers, as well as for Docker containers.
+
+Control Groups
+--------------
+
+Control groups are exposed through a pseudo-filesystem. In recent
+distros, you should find this filesystem under
+``/sys/fs/cgroup``. Under that directory, you will see multiple
+sub-directories, called devices, freezer, blkio, etc.; each
+sub-directory actually corresponds to a different cgroup hierarchy.
+
+On older systems, the control groups might be mounted on ``/cgroup``,
+without distinct hierarchies. In that case, instead of seeing the
+sub-directories, you will see a bunch of files in that directory, and
+possibly some directories corresponding to existing containers.
+
+To figure out where your control groups are mounted, you can run:
+
+::
+
+  grep cgroup /proc/mounts
+
+.. _run_findpid:
+
+Enumerating Cgroups
+-------------------
+
+You can look into ``/proc/cgroups`` to see the different control group
+subsystems known to the system, the hierarchy they belong to, and how
+many groups they contain.
+
+You can also look at ``/proc/<pid>/cgroup`` to see which control
+groups a process belongs to. The control group will be shown as a path
+relative to the root of the hierarchy mountpoint; e.g. ``/`` means
+“this process has not been assigned into a particular group”, while
+``/lxc/pumpkin`` means that the process is likely to be a member of a
+container named ``pumpkin``.
+
+Finding the Cgroup for a Given Container
+----------------------------------------
+
+For each container, one cgroup will be created in each hierarchy. On
+older systems with older versions of the LXC userland tools, the name
+of the cgroup will be the name of the container. With more recent
+versions of the LXC tools, the cgroup will be ``lxc/<container_name>.``
+
+For Docker containers using cgroups, the container name will be the
+full ID or long ID of the container. If a container shows up as
+ae836c95b4c3 in ``docker ps``, its long ID might be something like
+``ae836c95b4c3c9e9179e0e91015512da89fdec91612f63cebae57df9a5444c79``. You
+can look it up with ``docker inspect`` or ``docker ps -notrunc``.
+
+Putting everything together to look at the memory metrics for a Docker
+container, take a look at ``/sys/fs/cgroup/memory/lxc/<longid>/``.
+
+Metrics from Cgroups: Memory, CPU, Block IO
+-------------------------------------------
+
+For each subsystem (memory, CPU, and block I/O), you will find one or
+more pseudo-files containing statistics.
+
+Memory Metrics: ``memory.stat``
+...............................
+
+Memory metrics are found in the "memory" cgroup. Note that the memory
+control group adds a little overhead, because it does very
+fine-grained accounting of the memory usage on your host. Therefore,
+many distros chose to not enable it by default. Generally, to enable
+it, all you have to do is to add some kernel command-line parameters:
+``cgroup_enable=memory swapaccount=1``.
+
+The metrics are in the pseudo-file ``memory.stat``. Here is what it
+will look like:
+
+::
+
+  cache 11492564992
+  rss 1930993664
+  mapped_file 306728960
+  pgpgin 406632648
+  pgpgout 403355412
+  swap 0
+  pgfault 728281223
+  pgmajfault 1724
+  inactive_anon 46608384
+  active_anon 1884520448
+  inactive_file 7003344896
+  active_file 4489052160
+  unevictable 32768
+  hierarchical_memory_limit 9223372036854775807
+  hierarchical_memsw_limit 9223372036854775807
+  total_cache 11492564992
+  total_rss 1930993664
+  total_mapped_file 306728960
+  total_pgpgin 406632648
+  total_pgpgout 403355412
+  total_swap 0
+  total_pgfault 728281223
+  total_pgmajfault 1724
+  total_inactive_anon 46608384
+  total_active_anon 1884520448
+  total_inactive_file 7003344896
+  total_active_file 4489052160
+  total_unevictable 32768
+
+The first half (without the ``total_`` prefix) contains statistics
+relevant to the processes within the cgroup, excluding
+sub-cgroups. The second half (with the ``total_`` prefix) includes
+sub-cgroups as well.
+
+Some metrics are "gauges", i.e. values that can increase or decrease
+(e.g. swap, the amount of swap space used by the members of the
+cgroup). Some others are "counters", i.e. values that can only go up,
+because they represent occurrences of a specific event (e.g. pgfault,
+which indicates the number of page faults which happened since the
+creation of the cgroup; this number can never decrease).
+
+cache 
+  the amount of memory used by the processes of this control group
+  that can be associated precisely with a block on a block
+  device. When you read from and write to files on disk, this amount
+  will increase. This will be the case if you use "conventional" I/O
+  (``open``, ``read``, ``write`` syscalls) as well as mapped files
+  (with ``mmap``). It also accounts for the memory used by ``tmpfs``
+  mounts, though the reasons are unclear.
+
+rss 
+  the amount of memory that *doesn't* correspond to anything on
+  disk: stacks, heaps, and anonymous memory maps.
+
+mapped_file 
+  indicates the amount of memory mapped by the processes in the
+  control group. It doesn't give you information about *how much*
+  memory is used; it rather tells you *how* it is used.
+
+pgfault and pgmajfault 
+  indicate the number of times that a process of the cgroup triggered
+  a "page fault" and a "major fault", respectively. A page fault
+  happens when a process accesses a part of its virtual memory space
+  which is nonexistent or protected. The former can happen if the
+  process is buggy and tries to access an invalid address (it will
+  then be sent a ``SIGSEGV`` signal, typically killing it with the
+  famous ``Segmentation fault`` message). The latter can happen when
+  the process reads from a memory zone which has been swapped out, or
+  which corresponds to a mapped file: in that case, the kernel will
+  load the page from disk, and let the CPU complete the memory
+  access. It can also happen when the process writes to a
+  copy-on-write memory zone: likewise, the kernel will preempt the
+  process, duplicate the memory page, and resume the write operation
+  on the process' own copy of the page. "Major" faults happen when the
+  kernel actually has to read the data from disk. When it just has to
+  duplicate an existing page, or allocate an empty page, it's a
+  regular (or "minor") fault.
+
+swap 
+  the amount of swap currently used by the processes in this cgroup.
+
+active_anon and inactive_anon
+  the amount of *anonymous* memory that has been identified as
+  respectively *active* and *inactive* by the kernel. "Anonymous"
+  memory is the memory that is *not* linked to disk pages. In other
+  words, that's the equivalent of the rss counter described above. In
+  fact, the very definition of the rss counter is **active_anon** +
+  **inactive_anon** - **tmpfs** (where tmpfs is the amount of memory
+  used up by ``tmpfs`` filesystems mounted by this control
+  group). Now, what's the difference between "active" and "inactive"?
+  Pages are initially "active"; and at regular intervals, the kernel
+  sweeps over the memory, and tags some pages as "inactive". Whenever
+  they are accessed again, they are immediately retagged
+  "active". When the kernel is almost out of memory, and time comes to
+  swap out to disk, the kernel will swap "inactive" pages.
+
+active_file and inactive_file
+  cache memory, with *active* and *inactive* similar to the *anon*
+  memory above. The exact formula is cache = **active_file** +
+  **inactive_file** + **tmpfs**. The exact rules used by the kernel to
+  move memory pages between active and inactive sets are different
+  from the ones used for anonymous memory, but the general principle
+  is the same. Note that when the kernel needs to reclaim memory, it
+  is cheaper to reclaim a clean (=non modified) page from this pool,
+  since it can be reclaimed immediately (while anonymous pages and
+  dirty/modified pages have to be written to disk first).
+
+unevictable
+  the amount of memory that cannot be reclaimed; generally, it will
+  account for memory that has been "locked" with ``mlock``. It is
+  often used by crypto frameworks to make sure that secret keys and
+  other sensitive material never gets swapped out to disk.
+
+memory and memsw limits
+  These are not really metrics, but a reminder of the limits applied
+  to this cgroup. The first one indicates the maximum amount of
+  physical memory that can be used by the processes of this control
+  group; the second one indicates the maximum amount of RAM+swap.
+
+Accounting for memory in the page cache is very complex. If two
+processes in different control groups both read the same file
+(ultimately relying on the same blocks on disk), the corresponding
+memory charge will be split between the control groups. It's nice, but
+it also means that when a cgroup is terminated, it could increase the
+memory usage of another cgroup, because they are not splitting the
+cost anymore for those memory pages.
+
+CPU metrics: ``cpuacct.stat``
+.............................
+
+Now that we've covered memory metrics, everything else will look very
+simple in comparison. CPU metrics will be found in the ``cpuacct``
+controller.
+
+For each container, you will find a pseudo-file ``cpuacct.stat``,
+containing the CPU usage accumulated by the processes of the
+container, broken down between ``user`` and ``system`` time. If you're
+not familiar with the distinction, ``user`` is the time during which
+the processes were in direct control of the CPU (i.e. executing
+process code), and ``system`` is the time during which the CPU was
+executing system calls on behalf of those processes.
+
+Those times are expressed in ticks of 1/100th of a second. Actually,
+they are expressed in "user jiffies". There are ``USER_HZ``
+*"jiffies"* per second, and on x86 systems, ``USER_HZ`` is 100. This
+used to map exactly to the number of scheduler "ticks" per second; but
+with the advent of higher frequency scheduling, as well as `tickless
+kernels <http://lwn.net/Articles/549580/>`_, the number of kernel
+ticks wasn't relevant anymore. It stuck around anyway, mainly for
+legacy and compatibility reasons.
+
+Block I/O metrics
+.................
+
+Block I/O is accounted in the ``blkio`` controller. Different metrics
+are scattered across different files. While you can find in-depth
+details in the `blkio-controller
+<https://www.kernel.org/doc/Documentation/cgroups/blkio-controller.txt>`_
+file in the kernel documentation, here is a short list of the most
+relevant ones:
+
+blkio.sectors 
+  contain the number of 512-bytes sectors read and written by the
+  processes member of the cgroup, device by device. Reads and writes
+  are merged in a single counter.
+
+blkio.io_service_bytes 
+  indicates the number of bytes read and written by the cgroup. It has
+  4 counters per device, because for each device, it differentiates
+  between synchronous vs. asynchronous I/O, and reads vs. writes.
+
+blkio.io_serviced
+  the number of I/O operations performed, regardless of their size. It
+  also has 4 counters per device.
+
+blkio.io_queued 
+  indicates the number of I/O operations currently queued for this
+  cgroup. In other words, if the cgroup isn't doing any I/O, this will
+  be zero. Note that the opposite is not true. In other words, if
+  there is no I/O queued, it does not mean that the cgroup is idle
+  (I/O-wise). It could be doing purely synchronous reads on an
+  otherwise quiescent device, which is therefore able to handle them
+  immediately, without queuing. Also, while it is helpful to figure
+  out which cgroup is putting stress on the I/O subsystem, keep in
+  mind that it is a relative quantity. Even if a process group does
+  not perform more I/O, its queue size can increase just because the
+  device load increases because of other devices.
+
+Network Metrics
+---------------
+
+Network metrics are not exposed directly by control groups. There is a
+good explanation for that: network interfaces exist within the context
+of *network namespaces*. The kernel could probably accumulate metrics
+about packets and bytes sent and received by a group of processes, but
+those metrics wouldn't be very useful. You want per-interface metrics
+(because traffic happening on the local ``lo`` interface doesn't
+really count). But since processes in a single cgroup can belong to
+multiple network namespaces, those metrics would be harder to
+interpret: multiple network namespaces means multiple ``lo``
+interfaces, potentially multiple ``eth0`` interfaces, etc.; so this is
+why there is no easy way to gather network metrics with control
+groups.
+
+Instead we can gather network metrics from other sources:
+
+IPtables
+........
+
+IPtables (or rather, the netfilter framework for which iptables is
+just an interface) can do some serious accounting.
+
+For instance, you can setup a rule to account for the outbound HTTP
+traffic on a web server:
+
+::
+
+  iptables -I OUTPUT -p tcp --sport 80
+
+
+There is no ``-j`` or ``-g`` flag, so the rule will just count matched
+packets and go to the following rule.
+
+Later, you can check the values of the counters, with:
+
+::
+
+   iptables -nxvL OUTPUT
+
+Technically, ``-n`` is not required, but it will prevent iptables from
+doing DNS reverse lookups, which are probably useless in this
+scenario.
+
+Counters include packets and bytes. If you want to setup metrics for
+container traffic like this, you could execute a ``for`` loop to add
+two ``iptables`` rules per container IP address (one in each
+direction), in the ``FORWARD`` chain. This will only meter traffic
+going through the NAT layer; you will also have to add traffic going
+through the userland proxy.
+
+Then, you will need to check those counters on a regular basis. If you
+happen to use ``collectd``, there is a nice plugin to automate
+iptables counters collection.
+
+Interface-level counters
+........................
+
+Since each container has a virtual Ethernet interface, you might want
+to check directly the TX and RX counters of this interface. You will
+notice that each container is associated to a virtual Ethernet
+interface in your host, with a name like ``vethKk8Zqi``. Figuring out
+which interface corresponds to which container is, unfortunately,
+difficult.
+
+But for now, the best way is to check the metrics *from within the
+containers*. To accomplish this, you can run an executable from the
+host environment within the network namespace of a container using
+**ip-netns magic**.
+
+The ``ip-netns exec`` command will let you execute any program
+(present in the host system) within any network namespace visible to
+the current process. This means that your host will be able to enter
+the network namespace of your containers, but your containers won't be
+able to access the host, nor their sibling containers. Containers will
+be able to “see” and affect their sub-containers, though.
+
+The exact format of the command is::
+
+  ip netns exec <nsname> <command...>
+
+For example::
+
+  ip netns exec mycontainer netstat -i
+
+``ip netns`` finds the "mycontainer" container by using namespaces
+pseudo-files. Each process belongs to one network namespace, one PID
+namespace, one ``mnt`` namespace, etc., and those namespaces are
+materialized under ``/proc/<pid>/ns/``. For example, the network
+namespace of PID 42 is materialized by the pseudo-file
+``/proc/42/ns/net``.
+
+When you run ``ip netns exec mycontainer ...``, it expects
+``/var/run/netns/mycontainer`` to be one of those
+pseudo-files. (Symlinks are accepted.)
+
+In other words, to execute a command within the network namespace of a
+container, we need to:
+
+* Find out the PID of any process within the container that we want to
+  investigate;
+* Create a symlink from ``/var/run/netns/<somename>`` to
+  ``/proc/<thepid>/ns/net``
+* Execute ``ip netns exec <somename> ....``
+
+Please review :ref:`run_findpid` to learn how to find the cgroup of a
+process running in the container of which you want to measure network
+usage. From there, you can examine the pseudo-file named ``tasks``,
+which contains the PIDs that are in the control group (i.e. in the
+container). Pick any one of them.
+
+Putting everything together, if the "short ID" of a container is held
+in the environment variable ``$CID``, then you can do this::
+
+  TASKS=/sys/fs/cgroup/devices/$CID*/tasks
+  PID=$(head -n 1 $TASKS)
+  mkdir -p /var/run/netns
+  ln -sf /proc/$PID/ns/net /var/run/netns/$CID
+  ip netns exec $CID netstat -i
+
+
+Tips for high-performance metric collection
+-------------------------------------------
+
+Note that running a new process each time you want to update metrics
+is (relatively) expensive. If you want to collect metrics at high
+resolutions, and/or over a large number of containers (think 1000
+containers on a single host), you do not want to fork a new process
+each time.
+
+Here is how to collect metrics from a single process. You will have to
+write your metric collector in C (or any language that lets you do
+low-level system calls). You need to use a special system call,
+``setns()``, which lets the current process enter any arbitrary
+namespace. It requires, however, an open file descriptor to the
+namespace pseudo-file (remember: that’s the pseudo-file in
+``/proc/<pid>/ns/net``).
+
+However, there is a catch: you must not keep this file descriptor
+open. If you do, when the last process of the control group exits, the
+namespace will not be destroyed, and its network resources (like the
+virtual interface of the container) will stay around forever (or
+until you close that file descriptor).
+
+The right approach would be to keep track of the first PID of each
+container, and re-open the namespace pseudo-file each time.
+
+Collecting metrics when a container exits 
+-----------------------------------------
+
+Sometimes, you do not care about real time metric collection, but when
+a container exits, you want to know how much CPU, memory, etc. it has
+used.
+
+Docker makes this difficult because it relies on ``lxc-start``, which
+carefully cleans up after itself, but it is still possible. It is
+usually easier to collect metrics at regular intervals (e.g. every
+minute, with the collectd LXC plugin) and rely on that instead.
+
+But, if you'd still like to gather the stats when a container stops,
+here is how:
+
+For each container, start a collection process, and move it to the
+control groups that you want to monitor by writing its PID to the
+tasks file of the cgroup. The collection process should periodically
+re-read the tasks file to check if it's the last process of the
+control group. (If you also want to collect network statistics as
+explained in the previous section, you should also move the process to
+the appropriate network namespace.)
+
+When the container exits, ``lxc-start`` will try to delete the control
+groups. It will fail, since the control group is still in use; but
+that’s fine. Your process should now detect that it is the only one
+remaining in the group. Now is the right time to collect all the
+metrics you need!
+
+Finally, your process should move itself back to the root control
+group, and remove the container control group. To remove a control
+group, just ``rmdir`` its directory. It's counter-intuitive to
+``rmdir`` a directory as it still contains files; but remember that
+this is a pseudo-filesystem, so usual rules don't apply. After the
+cleanup is done, the collection process can exit safely.
+

+ 10 - 1
docs/sources/installation/ubuntulinux.rst

@@ -217,6 +217,15 @@ To install the latest version of docker, use the standard ``apt-get`` method:
    # install the latest
    sudo apt-get install lxc-docker
 
+Troubleshooting
+^^^^^^^^^^^^^^^
+
+On Linux Mint, the ``cgroups-lite`` package is not installed by default.
+Before Docker will work correctly, you will need to install this via:
+
+.. code-block:: bash
+
+    sudo apt-get update && sudo apt-get install cgroups-lite
 
 .. _ufw:
 
@@ -224,7 +233,7 @@ Docker and UFW
 ^^^^^^^^^^^^^^
 
 Docker uses a bridge to manage container networking. By default, UFW drops all
-`forwarding` traffic. As a result will you need to enable UFW forwarding:
+`forwarding` traffic. As a result you will need to enable UFW forwarding:
 
 .. code-block:: bash
 

+ 4 - 4
docs/sources/reference/builder.rst

@@ -1,12 +1,12 @@
-:title: Build Images (Dockerfile Reference)
+:title: Dockerfile Reference
 :description: Dockerfiles use a simple DSL which allows you to automate the steps you would normally manually take to create an image.
 :keywords: builder, docker, Dockerfile, automation, image creation
 
 .. _dockerbuilder:
 
-===================================
-Build Images (Dockerfile Reference)
-===================================
+====================
+Dockerfile Reference
+====================
 
 **Docker can act as a builder** and read instructions from a text
 ``Dockerfile`` to automate the steps you would otherwise take manually

+ 39 - 0
docs/sources/reference/commandline/cli.rst

@@ -18,6 +18,45 @@ To list available commands, either run ``docker`` with no parameters or execute
 
     ...
 
+.. _cli_options:
+
+Types of Options
+----------------
+
+Boolean
+~~~~~~~
+
+Boolean options look like ``-d=false``. The value you see is the
+default value which gets set if you do **not** use the boolean
+flag. If you do call ``run -d``, that sets the opposite boolean value,
+so in this case, ``true``, and so ``docker run -d`` **will** run in
+"detached" mode, in the background. Other boolean options are similar
+-- specifying them will set the value to the opposite of the default
+value.
+
+Multi
+~~~~~
+
+Options like ``-a=[]`` indicate they can be specified multiple times::
+
+  docker run -a stdin -a stdout -a stderr -i -t ubuntu /bin/bash
+
+Sometimes this can use a more complex value string, as for ``-v``::
+
+  docker run -v /host:/container example/mysql
+
+Strings and Integers
+~~~~~~~~~~~~~~~~~~~~
+
+Options like ``-name=""`` expect a string, and they can only be
+specified once. Options like ``-c=0`` expect an integer, and they can
+only be specified once.
+
+----
+
+Commands
+--------
+
 .. _cli_daemon:
 
 ``daemon``

+ 1 - 0
docs/sources/reference/index.rst

@@ -14,4 +14,5 @@ Contents:
 
    commandline/index
    builder
+   run
    api/index

+ 419 - 0
docs/sources/reference/run.rst

@@ -0,0 +1,419 @@
+:title: Docker Run Reference 
+:description: Configure containers at runtime
+:keywords: docker, run, configure, runtime
+
+.. _run_docker:
+
+====================
+Docker Run Reference
+====================
+
+**Docker runs processes in isolated containers**.  When an operator
+executes ``docker run``, she starts a process with its own file
+system, its own networking, and its own isolated process tree. The
+:ref:`image_def` which starts the process may define defaults related
+to the binary to run, the networking to expose, and more, but ``docker
+run`` gives final control to the operator who starts the container
+from the image. That's the main reason :ref:`cli_run` has more options
+than any other ``docker`` command.
+
+Every one of the :ref:`example_list` shows running containers, and so
+here we try to give more in-depth guidance.
+
+.. contents:: Table of Contents
+   :depth: 2
+
+.. _run_running:
+
+General Form
+============
+
+As you've seen in the :ref:`example_list`, the basic `run` command
+takes this form::
+
+  docker run [OPTIONS] IMAGE[:TAG] [COMMAND] [ARG...]
+
+To learn how to interpret the types of ``[OPTIONS]``, see
+:ref:`cli_options`.
+
+The list of ``[OPTIONS]`` breaks down into two groups: 
+
+1. Settings exclusive to operators, including:
+
+   * Detached or Foreground running,
+   * Container Identification,
+   * Network settings, and
+   * Runtime Constraints on CPU and Memory
+   * Privileges and LXC Configuration
+
+2. Settings shared between operators and developers, where operators
+   can override defaults developers set in images at build time.
+
+Together, the ``docker run [OPTIONS]`` give complete control over
+runtime behavior to the operator, allowing them to override all
+defaults set by the developer during ``docker build`` and nearly all
+the defaults set by the Docker runtime itself.
+
+Operator Exclusive Options
+==========================
+
+Only the operator (the person executing ``docker run``) can set the
+following options.
+
+.. contents::
+   :local:
+
+Detached vs Foreground
+----------------------
+
+When starting a Docker container, you must first decide if you want to
+run the container in the background in a "detached" mode or in the
+default foreground mode::
+
+   -d=false: Detached mode: Run container in the background, print new container id
+
+Detached (-d)
+.............
+
+In detached mode (``-d=true`` or just ``-d``), all I/O should be done
+through network connections or shared volumes because the container is
+no longer listening to the commandline where you executed ``docker
+run``. You can reattach to a detached container with ``docker``
+:ref:`cli_attach`. If you choose to run a container in the detached
+mode, then you cannot use the ``-rm`` option.
+
+Foreground
+..........
+
+In foreground mode (the default when ``-d`` is not specified),
+``docker run`` can start the process in the container and attach the
+console to the process's standard input, output, and standard
+error. It can even pretend to be a TTY (this is what most commandline
+executables expect) and pass along signals. All of that is
+configurable::
+
+   -a=[]          : Attach to ``stdin``, ``stdout`` and/or ``stderr``
+   -t=false       : Allocate a pseudo-tty
+   -sig-proxy=true: Proxify all received signal to the process (even in non-tty mode)
+   -i=false       : Keep STDIN open even if not attached
+
+If you do not specify ``-a`` then Docker will `attach everything
+(stdin,stdout,stderr)
+<https://github.com/dotcloud/docker/blob/75a7f4d90cde0295bcfb7213004abce8d4779b75/commands.go#L1797>`_. You
+can specify to which of the three standard streams (``stdin``, ``stdout``,
+``stderr``) you'd like to connect instead, as in::
+
+   docker run -a stdin -a stdout -i -t ubuntu /bin/bash
+
+For interactive processes (like a shell) you will typically want a tty
+as well as persistent standard input (``stdin``), so you'll use ``-i
+-t`` together in most interactive cases.
+
+Container Identification
+------------------------
+
+Name (-name)
+............
+
+The operator can identify a container in three ways:
+
+* UUID long identifier ("f78375b1c487e03c9438c729345e54db9d20cfa2ac1fc3494b6eb60872e74778")
+* UUID short identifier ("f78375b1c487")
+* Name ("evil_ptolemy")
+
+The UUID identifiers come from the Docker daemon, and if you do not
+assign a name to the container with ``-name`` then the daemon will
+also generate a random string name too. The name can become a handy
+way to add meaning to a container since you can use this name when
+defining :ref:`links <working_with_links_names>` (or any other place
+you need to identify a container). This works for both background and
+foreground Docker containers.
+
+PID Equivalent
+..............
+
+And finally, to help with automation, you can have Docker write the
+container ID out to a file of your choosing. This is similar to how
+some programs might write out their process ID to a file (you've seen
+them as PID files)::
+
+      -cidfile="": Write the container ID to the file
+
+Network Settings
+----------------
+
+::
+   -n=true   : Enable networking for this container
+   -dns=[]   : Set custom dns servers for the container
+
+By default, all containers have networking enabled and they can make
+any outgoing connections. The operator can completely disable
+networking with ``docker run -n`` which disables all incoming and outgoing
+networking. In cases like this, you would perform I/O through files or
+STDIN/STDOUT only.
+
+Your container will use the same DNS servers as the host by default,
+but you can override this with ``-dns``.
+
+Clean Up (-rm)
+--------------
+
+By default a container's file system persists even after the container
+exits. This makes debugging a lot easier (since you can inspect the
+final state) and you retain all your data by default. But if you are
+running short-term **foreground** processes, these container file
+systems can really pile up. If instead you'd like Docker to
+**automatically clean up the container and remove the file system when
+the container exits**, you can add the ``-rm`` flag::
+
+   -rm=false: Automatically remove the container when it exits (incompatible with -d)
+
+
+Runtime Constraints on CPU and Memory
+-------------------------------------
+
+The operator can also adjust the performance parameters of the container::
+
+   -m="": Memory limit (format: <number><optional unit>, where unit = b, k, m or g)
+   -c=0 : CPU shares (relative weight)
+
+The operator can constrain the memory available to a container easily
+with ``docker run -m``. If the host supports swap memory, then the
+``-m`` memory setting can be larger than physical RAM.
+
+Similarly the operator can increase the priority of this container
+with the ``-c`` option. By default, all containers run at the same
+priority and get the same proportion of CPU cycles, but you can tell
+the kernel to give more shares of CPU time to one or more containers
+when you start them via Docker.
+
+Runtime Privilege and LXC Configuration
+---------------------------------------
+
+::
+
+   -privileged=false: Give extended privileges to this container
+   -lxc-conf=[]: Add custom lxc options -lxc-conf="lxc.cgroup.cpuset.cpus = 0,1"
+
+By default, Docker containers are "unprivileged" and cannot, for
+example, run a Docker daemon inside a Docker container. This is
+because by default a container is not allowed to access any devices,
+but a "privileged" container is given access to all devices (see
+lxc-template.go_ and documentation on `cgroups devices
+<https://www.kernel.org/doc/Documentation/cgroups/devices.txt>`_).
+
+When the operator executes ``docker run -privileged``, Docker will
+enable access to all devices on the host as well as set some
+configuration in AppArmor to allow the container nearly all the same
+access to the host as processes running outside containers on the
+host. Additional information about running with ``-privileged`` is
+available on the `Docker Blog
+<http://blog.docker.io/2013/09/docker-can-now-run-within-docker/>`_.
+
+An operator can also specify LXC options using one or more
+``-lxc-conf`` parameters. These can be new parameters or override
+existing parameters from the lxc-template.go_. Note that in the
+future, a given host's Docker daemon may not use LXC, so this is an
+implementation-specific configuration meant for operators already
+familiar with using LXC directly.
+
+.. _lxc-template.go: https://github.com/dotcloud/docker/blob/master/execdriver/lxc/lxc_template.go
+
+
+Overriding ``Dockerfile`` Image Defaults
+========================================
+
+When a developer builds an image from a :ref:`Dockerfile
+<dockerbuilder>` or when she commits it, the developer can set a
+number of default parameters that take effect when the image starts up
+as a container.
+
+Four of the ``Dockerfile`` commands cannot be overridden at runtime:
+``FROM, MAINTAINER, RUN``, and ``ADD``. Everything else has a
+corresponding override in ``docker run``. We'll go through what the
+developer might have set in each ``Dockerfile`` instruction and how the
+operator can override that setting.
+
+.. contents::
+   :local:
+
+CMD (Default Command or Options)
+--------------------------------
+
+Recall the optional ``COMMAND`` in the Docker commandline::
+
+  docker run [OPTIONS] IMAGE[:TAG] [COMMAND] [ARG...]
+
+This command is optional because the person who created the ``IMAGE``
+may have already provided a default ``COMMAND`` using the ``Dockerfile``
+``CMD``. As the operator (the person running a container from the
+image), you can override that ``CMD`` just by specifying a new
+``COMMAND``.
+
+If the image also specifies an ``ENTRYPOINT`` then the ``CMD`` or
+``COMMAND`` get appended as arguments to the ``ENTRYPOINT``.
+
+
+ENTRYPOINT (Default Command to Execute at Runtime)
+--------------------------------------------------
+
+::
+
+   -entrypoint="": Overwrite the default entrypoint set by the image
+
+The ENTRYPOINT of an image is similar to a ``COMMAND`` because it
+specifies what executable to run when the container starts, but it is
+(purposely) more difficult to override. The ``ENTRYPOINT`` gives a
+container its default nature or behavior, so that when you set an
+``ENTRYPOINT`` you can run the container *as if it were that binary*,
+complete with default options, and you can pass in more options via
+the ``COMMAND``. But, sometimes an operator may want to run something else
+inside the container, so you can override the default ``ENTRYPOINT`` at
+runtime by using a string to specify the new ``ENTRYPOINT``. Here is an
+example of how to run a shell in a container that has been set up to
+automatically run something else (like ``/usr/bin/redis-server``)::
+
+  docker run -i -t -entrypoint /bin/bash example/redis
+
+or two examples of how to pass more parameters to that ENTRYPOINT::
+
+  docker run -i -t -entrypoint /bin/bash example/redis -c ls -l
+  docker run -i -t -entrypoint /usr/bin/redis-cli example/redis --help
+
+
+EXPOSE (Incoming Ports)
+-----------------------
+
+The ``Dockerfile`` doesn't give much control over networking, only
+providing the ``EXPOSE`` instruction to give a hint to the operator
+about what incoming ports might provide services. The following
+options work with or override the ``Dockerfile``'s exposed defaults::
+
+   -expose=[]: Expose a port from the container 
+               without publishing it to your host
+   -P=false  : Publish all exposed ports to the host interfaces
+   -p=[]     : Publish a container's port to the host (format: 
+               ip:hostPort:containerPort | ip::containerPort | 
+               hostPort:containerPort) 
+               (use 'docker port' to see the actual mapping)
+   -link=""  : Add link to another container (name:alias)
+
+As mentioned previously, ``EXPOSE`` (and ``-expose``) make a port
+available **in** a container for incoming connections. The port number
+on the inside of the container (where the service listens) does not
+need to be the same number as the port exposed on the outside of the
+container (where clients connect), so inside the container you might
+have an HTTP service listening on port 80 (and so you ``EXPOSE 80`` in
+the ``Dockerfile``), but outside the container the port might be 42800.
+
+To help a new client container reach the server container's internal
+port ``-expose``'d by the operator or ``EXPOSE``'d by the
+developer, the operator has three choices: start the server container
+with ``-P`` or ``-p``, or start the client container with ``-link``.
+
+If the operator uses ``-P`` or ``-p`` then Docker will make the
+exposed port accessible on the host and the ports will be available to
+any client that can reach the host. To find the map between the host
+ports and the exposed ports, use ``docker port``.
+
+If the operator uses ``-link`` when starting the new client container,
+then the client container can access the exposed port via a private
+networking interface. Docker will set some environment variables in
+the client container to help indicate which interface and port to use.
+
+ENV (Environment Variables)
+---------------------------
+
+The operator can **set any environment variable** in the container by
+using one or more ``-e`` flags, even overriding those already defined by the
+developer with a Dockerfile ``ENV``::
+
+   $ docker run -e "deep=purple" -rm ubuntu /bin/bash -c export
+   declare -x HOME="/"
+   declare -x HOSTNAME="85bc26a0e200"
+   declare -x OLDPWD
+   declare -x PATH="/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"
+   declare -x PWD="/"
+   declare -x SHLVL="1"
+   declare -x container="lxc"
+   declare -x deep="purple"
+
+Similarly the operator can set the **hostname** with ``-h``.
+
+``-link name:alias`` also sets environment variables, using the
+*alias* string to define environment variables within the container
+that give the IP and PORT information for connecting to the service
+container. Let's imagine we have a container running Redis::
+
+   # Start the service container, named redis-name
+   $ docker run -d -name redis-name dockerfiles/redis
+   4241164edf6f5aca5b0e9e4c9eccd899b0b8080c64c0cd26efe02166c73208f3
+
+   # The redis-name container exposed port 6379
+   $ docker ps  
+   CONTAINER ID        IMAGE                      COMMAND                CREATED             STATUS              PORTS               NAMES
+   4241164edf6f        dockerfiles/redis:latest   /redis-stable/src/re   5 seconds ago       Up 4 seconds        6379/tcp            redis-name  
+
+   # Note that there are no public ports exposed since we didn't use -p or -P
+   $ docker port 4241164edf6f 6379
+   2014/01/25 00:55:38 Error: No public port '6379' published for 4241164edf6f
+
+
+Yet we can get information about the Redis container's exposed ports
+with ``-link``. Choose an alias that will form a valid environment
+variable!
+
+::
+
+   $ docker run -rm -link redis-name:redis_alias -entrypoint /bin/bash dockerfiles/redis -c export
+   declare -x HOME="/"
+   declare -x HOSTNAME="acda7f7b1cdc"
+   declare -x OLDPWD
+   declare -x PATH="/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"
+   declare -x PWD="/"
+   declare -x REDIS_ALIAS_NAME="/distracted_wright/redis"
+   declare -x REDIS_ALIAS_PORT="tcp://172.17.0.32:6379"
+   declare -x REDIS_ALIAS_PORT_6379_TCP="tcp://172.17.0.32:6379"
+   declare -x REDIS_ALIAS_PORT_6379_TCP_ADDR="172.17.0.32"
+   declare -x REDIS_ALIAS_PORT_6379_TCP_PORT="6379"
+   declare -x REDIS_ALIAS_PORT_6379_TCP_PROTO="tcp"
+   declare -x SHLVL="1"
+   declare -x container="lxc"
+
+And we can use that information to connect from another container as a client::
+
+   $ docker run -i -t -rm -link redis-name:redis_alias -entrypoint /bin/bash dockerfiles/redis -c '/redis-stable/src/redis-cli -h $REDIS_ALIAS_PORT_6379_TCP_ADDR -p $REDIS_ALIAS_PORT_6379_TCP_PORT'
+   172.17.0.32:6379>
+
+VOLUME (Shared Filesystems)
+---------------------------
+
+::
+
+   -v=[]: Create a bind mount with: [host-dir]:[container-dir]:[rw|ro]. 
+          If "container-dir" is missing, then docker creates a new volume.
+   -volumes-from="": Mount all volumes from the given container(s)
+
+The volumes commands are complex enough to have their own
+documentation in section :ref:`volume_def`. A developer can define one
+or more ``VOLUME``\s associated with an image, but only the operator can
+give access from one container to another (or from a container to a
+volume mounted on the host).
+
+USER
+----
+
+The default user within a container is ``root`` (id = 0), but if the
+developer created additional users, those are accessible too. The
+developer can set a default user to run the first process with the
+``Dockerfile USER`` command, but the operator can override it ::
+
+   -u="": Username or UID
+
+WORKDIR
+-------
+
+The default working directory for running binaries within a container is the root directory (``/``), but the developer can set a different default with the ``Dockerfile WORKDIR`` command. The operator can override this with::
+
+   -w="": Working directory inside the container
+

+ 1 - 19
engine/env.go

@@ -60,7 +60,7 @@ func (env *Env) GetInt64(key string) int64 {
 	s := strings.Trim(env.Get(key), " \t")
 	val, err := strconv.ParseInt(s, 10, 64)
 	if err != nil {
-		return -1
+		return 0
 	}
 	return val
 }
@@ -213,24 +213,6 @@ func (env *Env) WriteTo(dst io.Writer) (n int64, err error) {
 	return 0, env.Encode(dst)
 }
 
-func (env *Env) Export(dst interface{}) (err error) {
-	defer func() {
-		if err != nil {
-			err = fmt.Errorf("ExportEnv %s", err)
-		}
-	}()
-	var buf bytes.Buffer
-	// step 1: encode/marshal the env to an intermediary json representation
-	if err := env.Encode(&buf); err != nil {
-		return err
-	}
-	// step 2: decode/unmarshal the intermediary json into the destination object
-	if err := json.NewDecoder(&buf).Decode(dst); err != nil {
-		return err
-	}
-	return nil
-}
-
 func (env *Env) Import(src interface{}) (err error) {
 	defer func() {
 		if err != nil {

+ 1 - 27
engine/env_test.go

@@ -62,7 +62,7 @@ func TestSetenvInt(t *testing.T) {
 	if val := job.GetenvInt("bar"); val != 42 {
 		t.Fatalf("GetenvInt returns incorrect value: %d", val)
 	}
-	if val := job.GetenvInt("nonexistent"); val != -1 {
+	if val := job.GetenvInt("nonexistent"); val != 0 {
 		t.Fatalf("GetenvInt returns incorrect value: %d", val)
 	}
 }
@@ -84,32 +84,6 @@ func TestSetenvList(t *testing.T) {
 	}
 }
 
-func TestImportEnv(t *testing.T) {
-	type dummy struct {
-		DummyInt         int
-		DummyStringArray []string
-	}
-
-	job := mkJob(t, "dummy")
-	if err := job.ImportEnv(&dummy{42, []string{"foo", "bar"}}); err != nil {
-		t.Fatal(err)
-	}
-
-	dmy := dummy{}
-	if err := job.ExportEnv(&dmy); err != nil {
-		t.Fatal(err)
-	}
-
-	if dmy.DummyInt != 42 {
-		t.Fatalf("Expected 42, got %d", dmy.DummyInt)
-	}
-
-	if len(dmy.DummyStringArray) != 2 || dmy.DummyStringArray[0] != "foo" || dmy.DummyStringArray[1] != "bar" {
-		t.Fatalf("Expected {foo, bar}, got %v", dmy.DummyStringArray)
-	}
-
-}
-
 func TestEnviron(t *testing.T) {
 	job := mkJob(t, "dummy")
 	job.Setenv("foo", "bar")

+ 4 - 4
engine/job.go

@@ -102,6 +102,10 @@ func (job *Job) String() string {
 	return fmt.Sprintf("%s.%s%s", job.Eng, job.CallString(), job.StatusString())
 }
 
+func (job *Job) EnvExists(key string) (value bool) {
+	return job.env.Exists(key)
+}
+
 func (job *Job) Getenv(key string) (value string) {
 	return job.env.Get(key)
 }
@@ -172,10 +176,6 @@ func (job *Job) EncodeEnv(dst io.Writer) error {
 	return job.env.Encode(dst)
 }
 
-func (job *Job) ExportEnv(dst interface{}) (err error) {
-	return job.env.Export(dst)
-}
-
 func (job *Job) ImportEnv(src interface{}) (err error) {
 	return job.env.Import(src)
 }

+ 3 - 1
execdriver/lxc/driver.go

@@ -155,7 +155,9 @@ func (d *driver) Run(c *execdriver.Command, startCallback execdriver.StartCallba
 	)
 	go func() {
 		if err := c.Wait(); err != nil {
-			waitErr = err
+			if _, ok := err.(*exec.ExitError); !ok { // Do not propagate the error if it's simply a status code != 0
+				waitErr = err
+			}
 		}
 		close(waitLock)
 	}()

+ 217 - 0
graphdriver/btrfs/btrfs.go

@@ -0,0 +1,217 @@
+// +build linux
+
+package btrfs
+
+/*
+#include <stdlib.h>
+#include <sys/ioctl.h>
+#include <linux/fs.h>
+#include <errno.h>
+#include <sys/types.h>
+#include <dirent.h>
+#include <linux/btrfs.h>
+
+*/
+import "C"
+import (
+	"fmt"
+	"github.com/dotcloud/docker/graphdriver"
+	"os"
+	"path"
+	"syscall"
+	"unsafe"
+)
+
+func init() {
+	graphdriver.Register("btrfs", Init)
+}
+
+func Init(home string) (graphdriver.Driver, error) {
+	rootdir := path.Dir(home)
+
+	var buf syscall.Statfs_t
+	if err := syscall.Statfs(rootdir, &buf); err != nil {
+		return nil, err
+	}
+
+	if buf.Type != 0x9123683E {
+		return nil, fmt.Errorf("%s is not a btrfs filesystem", rootdir)
+	}
+
+	return &Driver{
+		home: home,
+	}, nil
+}
+
+type Driver struct {
+	home string
+}
+
+func (d *Driver) String() string {
+	return "btrfs"
+}
+
+func (d *Driver) Status() [][2]string {
+	return nil
+}
+
+func (d *Driver) Cleanup() error {
+	return nil
+}
+
+func free(p *C.char) {
+	C.free(unsafe.Pointer(p))
+}
+
+func openDir(path string) (*C.DIR, error) {
+	Cpath := C.CString(path)
+	defer free(Cpath)
+
+	dir := C.opendir(Cpath)
+	if dir == nil {
+		return nil, fmt.Errorf("Can't open dir")
+	}
+	return dir, nil
+}
+
+func closeDir(dir *C.DIR) {
+	if dir != nil {
+		C.closedir(dir)
+	}
+}
+
+func getDirFd(dir *C.DIR) uintptr {
+	return uintptr(C.dirfd(dir))
+}
+
+func subvolCreate(path, name string) error {
+	dir, err := openDir(path)
+	if err != nil {
+		return err
+	}
+	defer closeDir(dir)
+
+	var args C.struct_btrfs_ioctl_vol_args
+	for i, c := range []byte(name) {
+		args.name[i] = C.char(c)
+	}
+
+	_, _, errno := syscall.Syscall(syscall.SYS_IOCTL, getDirFd(dir), C.BTRFS_IOC_SUBVOL_CREATE,
+		uintptr(unsafe.Pointer(&args)))
+	if errno != 0 {
+		return fmt.Errorf("Can't create subvolume")
+	}
+	return nil
+}
+
+func subvolSnapshot(src, dest, name string) error {
+	srcDir, err := openDir(src)
+	if err != nil {
+		return err
+	}
+	defer closeDir(srcDir)
+
+	destDir, err := openDir(dest)
+	if err != nil {
+		return err
+	}
+	defer closeDir(destDir)
+
+	var args C.struct_btrfs_ioctl_vol_args_v2
+	args.fd = C.__s64(getDirFd(srcDir))
+	for i, c := range []byte(name) {
+		args.name[i] = C.char(c)
+	}
+
+	_, _, errno := syscall.Syscall(syscall.SYS_IOCTL, getDirFd(destDir), C.BTRFS_IOC_SNAP_CREATE_V2,
+		uintptr(unsafe.Pointer(&args)))
+	if errno != 0 {
+		return fmt.Errorf("Can't create subvolume")
+	}
+	return nil
+}
+
+func subvolDelete(path, name string) error {
+	dir, err := openDir(path)
+	if err != nil {
+		return err
+	}
+	defer closeDir(dir)
+
+	var args C.struct_btrfs_ioctl_vol_args
+	for i, c := range []byte(name) {
+		args.name[i] = C.char(c)
+	}
+
+	_, _, errno := syscall.Syscall(syscall.SYS_IOCTL, getDirFd(dir), C.BTRFS_IOC_SNAP_DESTROY,
+		uintptr(unsafe.Pointer(&args)))
+	if errno != 0 {
+		return fmt.Errorf("Can't create subvolume")
+	}
+	return nil
+}
+
+func (d *Driver) subvolumesDir() string {
+	return path.Join(d.home, "subvolumes")
+}
+
+func (d *Driver) subvolumesDirId(id string) string {
+	return path.Join(d.subvolumesDir(), id)
+}
+
+func (d *Driver) Create(id string, parent string) error {
+	subvolumes := path.Join(d.home, "subvolumes")
+	if err := os.MkdirAll(subvolumes, 0700); err != nil {
+		return err
+	}
+	if parent == "" {
+		if err := subvolCreate(subvolumes, id); err != nil {
+			return err
+		}
+	} else {
+		parentDir, err := d.Get(parent)
+		if err != nil {
+			return err
+		}
+		if err := subvolSnapshot(parentDir, subvolumes, id); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+func (d *Driver) Remove(id string) error {
+	dir := d.subvolumesDirId(id)
+	if _, err := os.Stat(dir); err != nil {
+		return err
+	}
+	if err := subvolDelete(d.subvolumesDir(), id); err != nil {
+		return err
+	}
+	return os.RemoveAll(dir)
+}
+
+func (d *Driver) Get(id string) (string, error) {
+	dir := d.subvolumesDirId(id)
+	st, err := os.Stat(dir)
+	if err != nil {
+		return "", err
+	}
+
+	if !st.IsDir() {
+		return "", fmt.Errorf("%s: not a directory", dir)
+	}
+
+	return dir, nil
+}
+
+func (d *Driver) Put(id string) {
+	// Get() creates no runtime resources (like e.g. mounts)
+	// so this doesn't need to do anything.
+}
+
+func (d *Driver) Exists(id string) bool {
+	dir := d.subvolumesDirId(id)
+	_, err := os.Stat(dir)
+	return err == nil
+}

+ 3 - 0
graphdriver/btrfs/dummy_unsupported.go

@@ -0,0 +1,3 @@
+// +build !linux
+
+package btrfs

+ 2 - 0
graphdriver/driver.go

@@ -41,6 +41,8 @@ var (
 		"aufs",
 		"devicemapper",
 		"vfs",
+		// experimental, has to be enabled manually for now
+		"btrfs",
 	}
 )
 

+ 13 - 1
hack/install.sh

@@ -37,8 +37,10 @@ if command_exists docker || command_exists lxc-docker; then
 	( set -x; sleep 20 )
 fi
 
+user="$(id -un 2>/dev/null || true)"
+
 sh_c='sh -c'
-if [ "$(whoami 2>/dev/null || true)" != 'root' ]; then
+if [ "$user" != 'root' ]; then
 	if command_exists sudo; then
 		sh_c='sudo sh -c'
 	elif command_exists su; then
@@ -124,6 +126,16 @@ case "$lsb_dist" in
 				$sh_c 'docker run busybox echo "Docker has been successfully installed!"'
 			) || true
 		fi
+		your_user=your-user
+		[ "$user" != 'root' ] && your_user="$user"
+		echo
+		echo 'If you would like to use Docker as a non-root user, you should now consider'
+		echo 'adding your user to the "docker" group with something like:'
+		echo
+		echo '  sudo usermod -aG docker' $your_user
+		echo
+		echo 'Remember that you will have to log out and back in for this to take effect!'
+		echo
 		exit 0
 		;;
 		

+ 12 - 6
hack/make.sh

@@ -25,12 +25,18 @@ set -o pipefail
 
 # We're a nice, sexy, little shell script, and people might try to run us;
 # but really, they shouldn't. We want to be in a container!
-RESOLVCONF=$(readlink --canonicalize /etc/resolv.conf)
-grep -q "$RESOLVCONF" /proc/mounts || {
-	echo >&2 "# WARNING! I don't seem to be running in a docker container."
-	echo >&2 "# The result of this command might be an incorrect build, and will not be officially supported."
-	echo >&2 "# Try this: 'make all'"
-}
+if [ "$(pwd)" != '/go/src/github.com/dotcloud/docker' ] || [ -z "$DOCKER_CROSSPLATFORMS" ]; then
+	{
+		echo "# WARNING! I don't seem to be running in the Docker container."
+		echo "# The result of this command might be an incorrect build, and will not be"
+		echo "#   officially supported."
+		echo "#"
+		echo "# Try this instead: make all"
+		echo "#"
+	} >&2
+fi
+
+echo
 
 # List of bundles to create when no argument is passed
 DEFAULT_BUNDLES=(

+ 6 - 1
hack/travis/dco.py

@@ -5,7 +5,7 @@ import yaml
 
 from env import commit_range
 
-commit_format = '-%n hash: "%h"%n author: %aN <%aE>%n message: |%n%w(0,2,2)%B'
+commit_format = '-%n hash: "%h"%n author: %aN <%aE>%n message: |%n%w(0,2,2).%B'
 
 gitlog = subprocess.check_output([
 	'git', 'log', '--reverse',
@@ -24,6 +24,11 @@ p = re.compile(r'^{0} ([^<]+) <([^<>@]+@[^<>]+)> \(github: (\S+)\)$'.format(re.e
 failed_commits = 0
 
 for commit in commits:
+	commit['message'] = commit['message'][1:]
+	# trim off our '.' that exists just to prevent fun YAML parsing issues
+	# see https://github.com/dotcloud/docker/pull/3836#issuecomment-33723094
+	# and https://travis-ci.org/dotcloud/docker/builds/17926783
+	
 	commit['stat'] = subprocess.check_output([
 		'git', 'log', '--format=format:', '--max-count=1',
 		'--name-status', commit['hash'], '--',

+ 24 - 0
integration/server_test.go

@@ -114,6 +114,30 @@ func TestCreateRm(t *testing.T) {
 
 }
 
+func TestCreateNumberHostname(t *testing.T) {
+	eng := NewTestEngine(t)
+	defer mkRuntimeFromEngine(eng, t).Nuke()
+
+	config, _, _, err := docker.ParseRun([]string{"-h", "web.0", unitTestImageID, "echo test"}, nil)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	createTestContainer(eng, config, t)
+}
+
+func TestCreateNumberUsername(t *testing.T) {
+	eng := NewTestEngine(t)
+	defer mkRuntimeFromEngine(eng, t).Nuke()
+
+	config, _, _, err := docker.ParseRun([]string{"-u", "1002", unitTestImageID, "echo test"}, nil)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	createTestContainer(eng, config, t)
+}
+
 func TestCreateRmVolumes(t *testing.T) {
 	eng := NewTestEngine(t)
 	defer mkRuntimeFromEngine(eng, t).Nuke()

+ 30 - 140
network.go

@@ -5,9 +5,9 @@ import (
 	"github.com/dotcloud/docker/networkdriver"
 	"github.com/dotcloud/docker/networkdriver/ipallocator"
 	"github.com/dotcloud/docker/networkdriver/portallocator"
+	"github.com/dotcloud/docker/networkdriver/portmapper"
 	"github.com/dotcloud/docker/pkg/iptables"
 	"github.com/dotcloud/docker/pkg/netlink"
-	"github.com/dotcloud/docker/proxy"
 	"github.com/dotcloud/docker/utils"
 	"io/ioutil"
 	"log"
@@ -159,129 +159,6 @@ func getIfaceAddr(name string) (net.Addr, error) {
 	return addrs4[0], nil
 }
 
-// Port mapper takes care of mapping external ports to containers by setting
-// up iptables rules.
-// It keeps track of all mappings and is able to unmap at will
-type PortMapper struct {
-	tcpMapping map[string]*net.TCPAddr
-	tcpProxies map[string]proxy.Proxy
-	udpMapping map[string]*net.UDPAddr
-	udpProxies map[string]proxy.Proxy
-
-	iptables         *iptables.Chain
-	defaultIp        net.IP
-	proxyFactoryFunc func(net.Addr, net.Addr) (proxy.Proxy, error)
-}
-
-func (mapper *PortMapper) Map(ip net.IP, port int, backendAddr net.Addr) error {
-
-	if _, isTCP := backendAddr.(*net.TCPAddr); isTCP {
-		mapKey := (&net.TCPAddr{Port: port, IP: ip}).String()
-		if _, exists := mapper.tcpProxies[mapKey]; exists {
-			return fmt.Errorf("TCP Port %s is already in use", mapKey)
-		}
-		backendPort := backendAddr.(*net.TCPAddr).Port
-		backendIP := backendAddr.(*net.TCPAddr).IP
-		if mapper.iptables != nil {
-			if err := mapper.iptables.Forward(iptables.Add, ip, port, "tcp", backendIP.String(), backendPort); err != nil {
-				return err
-			}
-		}
-		mapper.tcpMapping[mapKey] = backendAddr.(*net.TCPAddr)
-		proxy, err := mapper.proxyFactoryFunc(&net.TCPAddr{IP: ip, Port: port}, backendAddr)
-		if err != nil {
-			mapper.Unmap(ip, port, "tcp")
-			return err
-		}
-		mapper.tcpProxies[mapKey] = proxy
-		go proxy.Run()
-	} else {
-		mapKey := (&net.UDPAddr{Port: port, IP: ip}).String()
-		if _, exists := mapper.udpProxies[mapKey]; exists {
-			return fmt.Errorf("UDP: Port %s is already in use", mapKey)
-		}
-		backendPort := backendAddr.(*net.UDPAddr).Port
-		backendIP := backendAddr.(*net.UDPAddr).IP
-		if mapper.iptables != nil {
-			if err := mapper.iptables.Forward(iptables.Add, ip, port, "udp", backendIP.String(), backendPort); err != nil {
-				return err
-			}
-		}
-		mapper.udpMapping[mapKey] = backendAddr.(*net.UDPAddr)
-		proxy, err := mapper.proxyFactoryFunc(&net.UDPAddr{IP: ip, Port: port}, backendAddr)
-		if err != nil {
-			mapper.Unmap(ip, port, "udp")
-			return err
-		}
-		mapper.udpProxies[mapKey] = proxy
-		go proxy.Run()
-	}
-	return nil
-}
-
-func (mapper *PortMapper) Unmap(ip net.IP, port int, proto string) error {
-	if proto == "tcp" {
-		mapKey := (&net.TCPAddr{Port: port, IP: ip}).String()
-		backendAddr, ok := mapper.tcpMapping[mapKey]
-		if !ok {
-			return fmt.Errorf("Port tcp/%s is not mapped", mapKey)
-		}
-		if proxy, exists := mapper.tcpProxies[mapKey]; exists {
-			proxy.Close()
-			delete(mapper.tcpProxies, mapKey)
-		}
-		if mapper.iptables != nil {
-			if err := mapper.iptables.Forward(iptables.Delete, ip, port, proto, backendAddr.IP.String(), backendAddr.Port); err != nil {
-				return err
-			}
-		}
-		delete(mapper.tcpMapping, mapKey)
-	} else {
-		mapKey := (&net.UDPAddr{Port: port, IP: ip}).String()
-		backendAddr, ok := mapper.udpMapping[mapKey]
-		if !ok {
-			return fmt.Errorf("Port udp/%s is not mapped", mapKey)
-		}
-		if proxy, exists := mapper.udpProxies[mapKey]; exists {
-			proxy.Close()
-			delete(mapper.udpProxies, mapKey)
-		}
-		if mapper.iptables != nil {
-			if err := mapper.iptables.Forward(iptables.Delete, ip, port, proto, backendAddr.IP.String(), backendAddr.Port); err != nil {
-				return err
-			}
-		}
-		delete(mapper.udpMapping, mapKey)
-	}
-	return nil
-}
-
-func newPortMapper(config *DaemonConfig) (*PortMapper, error) {
-	// We can always try removing the iptables
-	if err := iptables.RemoveExistingChain("DOCKER"); err != nil {
-		return nil, err
-	}
-	var chain *iptables.Chain
-	if config.EnableIptables {
-		var err error
-		chain, err = iptables.NewChain("DOCKER", config.BridgeIface)
-		if err != nil {
-			return nil, fmt.Errorf("Failed to create DOCKER chain: %s", err)
-		}
-	}
-
-	mapper := &PortMapper{
-		tcpMapping:       make(map[string]*net.TCPAddr),
-		tcpProxies:       make(map[string]proxy.Proxy),
-		udpMapping:       make(map[string]*net.UDPAddr),
-		udpProxies:       make(map[string]proxy.Proxy),
-		iptables:         chain,
-		defaultIp:        config.DefaultIp,
-		proxyFactoryFunc: proxy.NewProxy,
-	}
-	return mapper, nil
-}
-
 // Network interface represents the networking stack of a container
 type NetworkInterface struct {
 	IPNet   net.IPNet
@@ -299,7 +176,7 @@ func (iface *NetworkInterface) AllocatePort(port Port, binding PortBinding) (*Na
 		return nil, fmt.Errorf("Trying to allocate port for interface %v, which is disabled", iface) // FIXME
 	}
 
-	ip := iface.manager.portMapper.defaultIp
+	ip := iface.manager.defaultBindingIP
 
 	if binding.HostIp != "" {
 		ip = net.ParseIP(binding.HostIp)
@@ -331,7 +208,7 @@ func (iface *NetworkInterface) AllocatePort(port Port, binding PortBinding) (*Na
 		backend = &net.UDPAddr{IP: iface.IPNet.IP, Port: containerPort}
 	}
 
-	if err := iface.manager.portMapper.Map(ip, extPort, backend); err != nil {
+	if err := portmapper.Map(backend, ip, extPort); err != nil {
 		portallocator.ReleasePort(ip, nat.Port.Proto(), extPort)
 		return nil, err
 	}
@@ -365,7 +242,15 @@ func (iface *NetworkInterface) Release() {
 		}
 		ip := net.ParseIP(nat.Binding.HostIp)
 		utils.Debugf("Unmaping %s/%s:%s", nat.Port.Proto, ip.String(), nat.Binding.HostPort)
-		if err := iface.manager.portMapper.Unmap(ip, hostPort, nat.Port.Proto()); err != nil {
+
+		var host net.Addr
+		if nat.Port.Proto() == "tcp" {
+			host = &net.TCPAddr{IP: ip, Port: hostPort}
+		} else {
+			host = &net.UDPAddr{IP: ip, Port: hostPort}
+		}
+
+		if err := portmapper.Unmap(host); err != nil {
 			log.Printf("Unable to unmap port %s: %s", nat, err)
 		}
 
@@ -382,12 +267,10 @@ func (iface *NetworkInterface) Release() {
 // Network Manager manages a set of network interfaces
 // Only *one* manager per host machine should be used
 type NetworkManager struct {
-	bridgeIface   string
-	bridgeNetwork *net.IPNet
-
-	portMapper *PortMapper
-
-	disabled bool
+	bridgeIface      string
+	bridgeNetwork    *net.IPNet
+	defaultBindingIP net.IP
+	disabled         bool
 }
 
 // Allocate a network interface
@@ -444,7 +327,7 @@ func newNetworkManager(config *DaemonConfig) (*NetworkManager, error) {
 		natArgs := []string{"POSTROUTING", "-t", "nat", "-s", addr.String(), "!", "-d", addr.String(), "-j", "MASQUERADE"}
 
 		if !iptables.Exists(natArgs...) {
-			if output, err := iptables.Raw(append([]string{"-A"}, natArgs...)...); err != nil {
+			if output, err := iptables.Raw(append([]string{"-I"}, natArgs...)...); err != nil {
 				return nil, fmt.Errorf("Unable to enable network bridge NAT: %s", err)
 			} else if len(output) != 0 {
 				return nil, fmt.Errorf("Error iptables postrouting: %s", output)
@@ -508,16 +391,23 @@ func newNetworkManager(config *DaemonConfig) (*NetworkManager, error) {
 		}
 	}
 
-	portMapper, err := newPortMapper(config)
-	if err != nil {
+	// We can always try removing the iptables
+	if err := iptables.RemoveExistingChain("DOCKER"); err != nil {
 		return nil, err
 	}
 
-	manager := &NetworkManager{
-		bridgeIface:   config.BridgeIface,
-		bridgeNetwork: network,
-		portMapper:    portMapper,
+	if config.EnableIptables {
+		chain, err := iptables.NewChain("DOCKER", config.BridgeIface)
+		if err != nil {
+			return nil, err
+		}
+		portmapper.SetIptablesChain(chain)
 	}
 
+	manager := &NetworkManager{
+		bridgeIface:      config.BridgeIface,
+		bridgeNetwork:    network,
+		defaultBindingIP: config.DefaultIp,
+	}
 	return manager, nil
 }

+ 0 - 72
network_test.go

@@ -1,72 +0,0 @@
-package docker
-
-import (
-	"github.com/dotcloud/docker/pkg/iptables"
-	"github.com/dotcloud/docker/proxy"
-	"net"
-	"testing"
-)
-
-type StubProxy struct {
-	frontendAddr *net.Addr
-	backendAddr  *net.Addr
-}
-
-func (proxy *StubProxy) Run()                   {}
-func (proxy *StubProxy) Close()                 {}
-func (proxy *StubProxy) FrontendAddr() net.Addr { return *proxy.frontendAddr }
-func (proxy *StubProxy) BackendAddr() net.Addr  { return *proxy.backendAddr }
-
-func NewStubProxy(frontendAddr, backendAddr net.Addr) (proxy.Proxy, error) {
-	return &StubProxy{
-		frontendAddr: &frontendAddr,
-		backendAddr:  &backendAddr,
-	}, nil
-}
-
-func TestPortMapper(t *testing.T) {
-	// FIXME: is this iptables chain still used anywhere?
-	var chain *iptables.Chain
-	mapper := &PortMapper{
-		tcpMapping:       make(map[string]*net.TCPAddr),
-		tcpProxies:       make(map[string]proxy.Proxy),
-		udpMapping:       make(map[string]*net.UDPAddr),
-		udpProxies:       make(map[string]proxy.Proxy),
-		iptables:         chain,
-		defaultIp:        net.IP("0.0.0.0"),
-		proxyFactoryFunc: NewStubProxy,
-	}
-
-	dstIp1 := net.ParseIP("192.168.0.1")
-	dstIp2 := net.ParseIP("192.168.0.2")
-	srcAddr1 := &net.TCPAddr{Port: 1080, IP: net.ParseIP("172.16.0.1")}
-	srcAddr2 := &net.TCPAddr{Port: 1080, IP: net.ParseIP("172.16.0.2")}
-
-	if err := mapper.Map(dstIp1, 80, srcAddr1); err != nil {
-		t.Fatalf("Failed to allocate port: %s", err)
-	}
-
-	if mapper.Map(dstIp1, 80, srcAddr1) == nil {
-		t.Fatalf("Port is in use - mapping should have failed")
-	}
-
-	if mapper.Map(dstIp1, 80, srcAddr2) == nil {
-		t.Fatalf("Port is in use - mapping should have failed")
-	}
-
-	if err := mapper.Map(dstIp2, 80, srcAddr2); err != nil {
-		t.Fatalf("Failed to allocate port: %s", err)
-	}
-
-	if mapper.Unmap(dstIp1, 80, "tcp") != nil {
-		t.Fatalf("Failed to release port")
-	}
-
-	if mapper.Unmap(dstIp2, 80, "tcp") != nil {
-		t.Fatalf("Failed to release port")
-	}
-
-	if mapper.Unmap(dstIp2, 80, "tcp") == nil {
-		t.Fatalf("Port already released, but no error reported")
-	}
-}

+ 6 - 1
networkdriver/ipallocator/allocator.go

@@ -99,12 +99,17 @@ func getNextIp(address *net.IPNet) (*net.IP, error) {
 		return ip, nil
 	}
 
+	var (
+		firstNetIP = address.IP.To4().Mask(address.Mask)
+		firstAsInt = ipToInt(&firstNetIP) + 1
+	)
+
 	pos = int32(allocated.PullBack())
 	for i := int32(0); i < max; i++ {
 		pos = pos%max + 1
 		next := int32(base + pos)
 
-		if next == ownIP {
+		if next == ownIP || next == firstAsInt {
 			continue
 		}
 

+ 21 - 0
networkdriver/ipallocator/allocator_test.go

@@ -213,6 +213,27 @@ func TestIPAllocator(t *testing.T) {
 	}
 }
 
+func TestAllocateFirstIP(t *testing.T) {
+	defer reset()
+	network := &net.IPNet{
+		IP:   []byte{192, 168, 0, 0},
+		Mask: []byte{255, 255, 255, 0},
+	}
+
+	firstIP := network.IP.To4().Mask(network.Mask)
+	first := ipToInt(&firstIP) + 1
+
+	ip, err := RequestIP(network, nil)
+	if err != nil {
+		t.Fatal(err)
+	}
+	allocated := ipToInt(ip)
+
+	if allocated == first {
+		t.Fatalf("allocated ip should not equal first ip: %d == %d", first, allocated)
+	}
+}
+
 func assertIPEquals(t *testing.T, ip1, ip2 *net.IP) {
 	if !ip1.Equal(*ip2) {
 		t.Fatalf("Expected IP %s, got %s", ip1, ip2)

+ 131 - 0
networkdriver/portmapper/mapper.go

@@ -0,0 +1,131 @@
+package portmapper
+
+import (
+	"errors"
+	"fmt"
+	"github.com/dotcloud/docker/pkg/iptables"
+	"github.com/dotcloud/docker/proxy"
+	"net"
+	"sync"
+)
+
+type mapping struct {
+	proto         string
+	userlandProxy proxy.Proxy
+	host          net.Addr
+	container     net.Addr
+}
+
+var (
+	chain *iptables.Chain
+	lock  sync.Mutex
+
+	// keyed by "ip:port/proto" (see getKey)
+	currentMappings = make(map[string]*mapping)
+	newProxy        = proxy.NewProxy
+)
+
+var (
+	ErrUnknownBackendAddressType = errors.New("unknown container address type not supported")
+	ErrPortMappedForIP           = errors.New("port is already mapped to ip")
+	ErrPortNotMapped             = errors.New("port is not mapped")
+)
+
+func SetIptablesChain(c *iptables.Chain) {
+	chain = c
+}
+
+func Map(container net.Addr, hostIP net.IP, hostPort int) error {
+	lock.Lock()
+	defer lock.Unlock()
+
+	var m *mapping
+	switch container.(type) {
+	case *net.TCPAddr:
+		m = &mapping{
+			proto:     "tcp",
+			host:      &net.TCPAddr{IP: hostIP, Port: hostPort},
+			container: container,
+		}
+	case *net.UDPAddr:
+		m = &mapping{
+			proto:     "udp",
+			host:      &net.UDPAddr{IP: hostIP, Port: hostPort},
+			container: container,
+		}
+	default:
+		return ErrUnknownBackendAddressType
+	}
+
+	key := getKey(m.host)
+	if _, exists := currentMappings[key]; exists {
+		return ErrPortMappedForIP
+	}
+
+	containerIP, containerPort := getIPAndPort(m.container)
+	if err := forward(iptables.Add, m.proto, hostIP, hostPort, containerIP.String(), containerPort); err != nil {
+		return err
+	}
+
+	p, err := newProxy(m.host, m.container)
+	if err != nil {
+		// need to undo the iptables rules before we return
+		forward(iptables.Delete, m.proto, hostIP, hostPort, containerIP.String(), containerPort)
+		return err
+	}
+
+	m.userlandProxy = p
+	currentMappings[key] = m
+
+	go p.Run()
+
+	return nil
+}
+
+func Unmap(host net.Addr) error {
+	lock.Lock()
+	defer lock.Unlock()
+
+	key := getKey(host)
+	data, exists := currentMappings[key]
+	if !exists {
+		return ErrPortNotMapped
+	}
+
+	data.userlandProxy.Close()
+	delete(currentMappings, key)
+
+	containerIP, containerPort := getIPAndPort(data.container)
+	hostIP, hostPort := getIPAndPort(data.host)
+	if err := forward(iptables.Delete, data.proto, hostIP, hostPort, containerIP.String(), containerPort); err != nil {
+		return err
+	}
+	return nil
+}
+
+func getKey(a net.Addr) string {
+	switch t := a.(type) {
+	case *net.TCPAddr:
+		return fmt.Sprintf("%s:%d/%s", t.IP.String(), t.Port, "tcp")
+	case *net.UDPAddr:
+		return fmt.Sprintf("%s:%d/%s", t.IP.String(), t.Port, "udp")
+	}
+	return ""
+}
+
+func getIPAndPort(a net.Addr) (net.IP, int) {
+	switch t := a.(type) {
+	case *net.TCPAddr:
+		return t.IP, t.Port
+	case *net.UDPAddr:
+		return t.IP, t.Port
+	}
+	return nil, 0
+}
+
+func forward(action iptables.Action, proto string, sourceIP net.IP, sourcePort int, containerIP string, containerPort int) error {
+	if chain == nil {
+		return nil
+	}
+	return chain.Forward(action, sourceIP, sourcePort, proto, containerIP, containerPort)
+}

+ 107 - 0
networkdriver/portmapper/mapper_test.go

@@ -0,0 +1,107 @@
+package portmapper
+
+import (
+	"github.com/dotcloud/docker/pkg/iptables"
+	"github.com/dotcloud/docker/proxy"
+	"net"
+	"testing"
+)
+
+func init() {
+	// override this func to mock out the proxy server
+	newProxy = proxy.NewStubProxy
+}
+
+func reset() {
+	chain = nil
+	currentMappings = make(map[string]*mapping)
+}
+
+func TestSetIptablesChain(t *testing.T) {
+	defer reset()
+
+	c := &iptables.Chain{
+		Name:   "TEST",
+		Bridge: "192.168.1.1",
+	}
+
+	if chain != nil {
+		t.Fatal("chain should be nil at init")
+	}
+
+	SetIptablesChain(c)
+	if chain == nil {
+		t.Fatal("chain should not be nil after set")
+	}
+}
+
+func TestMapPorts(t *testing.T) {
+	dstIp1 := net.ParseIP("192.168.0.1")
+	dstIp2 := net.ParseIP("192.168.0.2")
+	dstAddr1 := &net.TCPAddr{IP: dstIp1, Port: 80}
+	dstAddr2 := &net.TCPAddr{IP: dstIp2, Port: 80}
+
+	srcAddr1 := &net.TCPAddr{Port: 1080, IP: net.ParseIP("172.16.0.1")}
+	srcAddr2 := &net.TCPAddr{Port: 1080, IP: net.ParseIP("172.16.0.2")}
+
+	if err := Map(srcAddr1, dstIp1, 80); err != nil {
+		t.Fatalf("Failed to allocate port: %s", err)
+	}
+
+	if Map(srcAddr1, dstIp1, 80) == nil {
+		t.Fatalf("Port is in use - mapping should have failed")
+	}
+
+	if Map(srcAddr2, dstIp1, 80) == nil {
+		t.Fatalf("Port is in use - mapping should have failed")
+	}
+
+	if err := Map(srcAddr2, dstIp2, 80); err != nil {
+		t.Fatalf("Failed to allocate port: %s", err)
+	}
+
+	if Unmap(dstAddr1) != nil {
+		t.Fatalf("Failed to release port")
+	}
+
+	if Unmap(dstAddr2) != nil {
+		t.Fatalf("Failed to release port")
+	}
+
+	if Unmap(dstAddr2) == nil {
+		t.Fatalf("Port already released, but no error reported")
+	}
+}
+
+func TestGetUDPKey(t *testing.T) {
+	addr := &net.UDPAddr{IP: net.ParseIP("192.168.1.5"), Port: 53}
+
+	key := getKey(addr)
+
+	if expected := "192.168.1.5:53/udp"; key != expected {
+		t.Fatalf("expected key %s got %s", expected, key)
+	}
+}
+
+func TestGetTCPKey(t *testing.T) {
+	addr := &net.TCPAddr{IP: net.ParseIP("192.168.1.5"), Port: 80}
+
+	key := getKey(addr)
+
+	if expected := "192.168.1.5:80/tcp"; key != expected {
+		t.Fatalf("expected key %s got %s", expected, key)
+	}
+}
+
+func TestGetUDPIPAndPort(t *testing.T) {
+	addr := &net.UDPAddr{IP: net.ParseIP("192.168.1.5"), Port: 53}
+
+	ip, port := getIPAndPort(addr)
+	if expected := "192.168.1.5"; ip.String() != expected {
+		t.Fatalf("expected ip %s got %s", expected, ip)
+	}
+
+	if ep := 53; port != ep {
+		t.Fatalf("expected port %d got %d", ep, port)
+	}
+}

+ 17 - 0
pkg/iptables/iptables.go

@@ -73,6 +73,23 @@ func (c *Chain) Forward(action Action, ip net.IP, port int, proto, dest_addr str
 	} else if len(output) != 0 {
 		return fmt.Errorf("Error iptables forward: %s", output)
 	}
+
+	fAction := action
+	if fAction == Add {
+		fAction = "-I"
+	}
+	if output, err := Raw(string(fAction), "FORWARD",
+		"!", "-i", c.Bridge,
+		"-o", c.Bridge,
+		"-p", proto,
+		"-d", daddr,
+		"--dport", strconv.Itoa(port),
+		"-j", "ACCEPT"); err != nil {
+		return err
+	} else if len(output) != 0 {
+		return fmt.Errorf("Error iptables forward: %s", output)
+	}
+
 	return nil
 }
 

+ 22 - 0
proxy/stub_proxy.go

@@ -0,0 +1,22 @@
+package proxy
+
+import (
+	"net"
+)
+
+type StubProxy struct {
+	frontendAddr net.Addr
+	backendAddr  net.Addr
+}
+
+func (p *StubProxy) Run()                   {}
+func (p *StubProxy) Close()                 {}
+func (p *StubProxy) FrontendAddr() net.Addr { return p.frontendAddr }
+func (p *StubProxy) BackendAddr() net.Addr  { return p.backendAddr }
+
+func NewStubProxy(frontendAddr, backendAddr net.Addr) (Proxy, error) {
+	return &StubProxy{
+		frontendAddr: frontendAddr,
+		backendAddr:  backendAddr,
+	}, nil
+}

+ 1 - 0
runtime.go

@@ -9,6 +9,7 @@ import (
 	"github.com/dotcloud/docker/execdriver/lxc"
 	"github.com/dotcloud/docker/graphdriver"
 	"github.com/dotcloud/docker/graphdriver/aufs"
+	_ "github.com/dotcloud/docker/graphdriver/btrfs"
 	_ "github.com/dotcloud/docker/graphdriver/devmapper"
 	_ "github.com/dotcloud/docker/graphdriver/vfs"
 	"github.com/dotcloud/docker/networkdriver/portallocator"

+ 20 - 23
server.go

@@ -43,8 +43,7 @@ func init() {
 // The signals SIGINT, SIGQUIT and SIGTERM are intercepted for cleanup.
 func jobInitApi(job *engine.Job) engine.Status {
 	job.Logf("Creating server")
-	// FIXME: ImportEnv deprecates ConfigFromJob
-	srv, err := NewServer(job.Eng, ConfigFromJob(job))
+	srv, err := NewServer(job.Eng, DaemonConfigFromJob(job))
 	if err != nil {
 		return job.Error(err)
 	}
@@ -1012,7 +1011,7 @@ func (srv *Server) Containers(job *engine.Job) engine.Status {
 	}, -1)
 
 	for _, container := range srv.runtime.List() {
-		if !container.State.IsRunning() && !all && n == -1 && since == "" && before == "" {
+		if !container.State.IsRunning() && !all && n <= 0 && since == "" && before == "" {
 			continue
 		}
 		if before != "" && !foundBefore {
@@ -1021,7 +1020,7 @@ func (srv *Server) Containers(job *engine.Job) engine.Status {
 			}
 			continue
 		}
-		if displayed == n {
+		if n > 0 && displayed == n {
 			break
 		}
 		if container.ID == since || utils.TruncateID(container.ID) == since {
@@ -1644,10 +1643,7 @@ func (srv *Server) ContainerCreate(job *engine.Job) engine.Status {
 	} else if len(job.Args) > 1 {
 		return job.Errorf("Usage: %s", job.Name)
 	}
-	var config Config
-	if err := job.ExportEnv(&config); err != nil {
-		return job.Error(err)
-	}
+	config := ContainerConfigFromJob(job)
 	if config.Memory != 0 && config.Memory < 524288 {
 		return job.Errorf("Minimum memory limit allowed is 512k")
 	}
@@ -1668,7 +1664,7 @@ func (srv *Server) ContainerCreate(job *engine.Job) engine.Status {
 		config.Dns = defaultDns
 	}
 
-	container, buildWarnings, err := srv.runtime.Create(&config, name)
+	container, buildWarnings, err := srv.runtime.Create(config, name)
 	if err != nil {
 		if srv.runtime.graph.IsNotExist(err) {
 			_, tag := utils.ParseRepositoryTag(config.Image)
@@ -1699,10 +1695,12 @@ func (srv *Server) ContainerRestart(job *engine.Job) engine.Status {
 	if len(job.Args) != 1 {
 		return job.Errorf("Usage: %s CONTAINER\n", job.Name)
 	}
-	name := job.Args[0]
-	t := job.GetenvInt("t")
-	if t == -1 {
-		t = 10
+	var (
+		name = job.Args[0]
+		t    = 10
+	)
+	if job.EnvExists("t") {
+		t = job.GetenvInt("t")
 	}
 	if container := srv.runtime.Get(name); container != nil {
 		if err := container.Restart(int(t)); err != nil {
@@ -2073,10 +2071,7 @@ func (srv *Server) ContainerStart(job *engine.Job) engine.Status {
 	}
 	// If no environment was set, then no hostconfig was passed.
 	if len(job.Environ()) > 0 {
-		var hostConfig HostConfig
-		if err := job.ExportEnv(&hostConfig); err != nil {
-			return job.Error(err)
-		}
+		hostConfig := ContainerHostConfigFromJob(job)
 		// Validate the HostConfig binds. Make sure that:
 		// 1) the source of a bind mount isn't /
 		//         The bind mount "/:/foo" isn't allowed.
@@ -2101,10 +2096,10 @@ func (srv *Server) ContainerStart(job *engine.Job) engine.Status {
 			}
 		}
 		// Register any links from the host config before starting the container
-		if err := srv.RegisterLinks(container, &hostConfig); err != nil {
+		if err := srv.RegisterLinks(container, hostConfig); err != nil {
 			return job.Error(err)
 		}
-		container.hostConfig = &hostConfig
+		container.hostConfig = hostConfig
 		container.ToDisk()
 	}
 	if err := container.Start(); err != nil {
@@ -2119,10 +2114,12 @@ func (srv *Server) ContainerStop(job *engine.Job) engine.Status {
 	if len(job.Args) != 1 {
 		return job.Errorf("Usage: %s CONTAINER\n", job.Name)
 	}
-	name := job.Args[0]
-	t := job.GetenvInt("t")
-	if t == -1 {
-		t = 10
+	var (
+		name = job.Args[0]
+		t    = 10
+	)
+	if job.EnvExists("t") {
+		t = job.GetenvInt("t")
 	}
 	if container := srv.runtime.Get(name); container != nil {
 		if err := container.Stop(int(t)); err != nil {