Browse Source

Add failure action for rolling updates

This changes the default behavior so that rolling updates will not
proceed once an updated task fails to start, or stops running during the
update. Users can use docker service inspect --pretty servicename to see
the update status, and if it pauses due to a failure, it will explain
that the update is paused, and show the task ID that caused it to pause.
It also shows the time since the update started.

A new --update-failure-action=(pause|continue) flag selects the
behavior. Pause means the update stops once a task fails, continue means
the old behavior of continuing the update anyway.

In the future this will be extended with additional behaviors like
automatic rollback, and flags controlling parameters like how many tasks
need to fail for the update to stop proceeding. This is a minimal
solution for 1.12.

Signed-off-by: Aaron Lehmann <aaron.lehmann@docker.com>
Aaron Lehmann 9 years ago
parent
commit
57ae29aa74

+ 13 - 0
api/client/service/inspect.go

@@ -4,6 +4,7 @@ import (
 	"fmt"
 	"io"
 	"strings"
+	"time"
 
 	"golang.org/x/net/context"
 
@@ -101,6 +102,17 @@ func printService(out io.Writer, service swarm.Service) {
 			fmt.Fprintf(out, " Replicas:\t%d\n", *service.Spec.Mode.Replicated.Replicas)
 		}
 	}
+
+	if service.UpdateStatus.State != "" {
+		fmt.Fprintln(out, "Update status:")
+		fmt.Fprintf(out, " State:\t\t%s\n", service.UpdateStatus.State)
+		fmt.Fprintf(out, " Started:\t%s ago\n", strings.ToLower(units.HumanDuration(time.Since(service.UpdateStatus.StartedAt))))
+		if service.UpdateStatus.State == swarm.UpdateStateCompleted {
+			fmt.Fprintf(out, " Completed:\t%s ago\n", strings.ToLower(units.HumanDuration(time.Since(service.UpdateStatus.CompletedAt))))
+		}
+		fmt.Fprintf(out, " Message:\t%s\n", service.UpdateStatus.Message)
+	}
+
 	fmt.Fprintln(out, "Placement:")
 	if service.Spec.TaskTemplate.Placement != nil && len(service.Spec.TaskTemplate.Placement.Constraints) > 0 {
 		ioutils.FprintfIfNotEmpty(out, " Constraints\t: %s\n", strings.Join(service.Spec.TaskTemplate.Placement.Constraints, ", "))
@@ -110,6 +122,7 @@ func printService(out io.Writer, service swarm.Service) {
 	if service.Spec.UpdateConfig.Delay.Nanoseconds() > 0 {
 		fmt.Fprintf(out, " Delay:\t\t%s\n", service.Spec.UpdateConfig.Delay)
 	}
+	fmt.Fprintf(out, " On failure:\t%s\n", service.Spec.UpdateConfig.FailureAction)
 	fmt.Fprintf(out, "ContainerSpec:\n")
 	printContainerSpec(out, service.Spec.TaskTemplate.ContainerSpec)
 

+ 43 - 39
api/client/service/opts.go

@@ -274,6 +274,7 @@ func (m *MountOpt) Value() []swarm.Mount {
 type updateOptions struct {
 	parallelism uint64
 	delay       time.Duration
+	onFailure   string
 }
 
 type resourceOptions struct {
@@ -455,8 +456,9 @@ func (opts *serviceOptions) ToService() (swarm.ServiceSpec, error) {
 		},
 		Mode: swarm.ServiceMode{},
 		UpdateConfig: &swarm.UpdateConfig{
-			Parallelism: opts.update.parallelism,
-			Delay:       opts.update.delay,
+			Parallelism:   opts.update.parallelism,
+			Delay:         opts.update.delay,
+			FailureAction: opts.update.onFailure,
 		},
 		Networks:     convertNetworks(opts.networks),
 		EndpointSpec: opts.endpoint.ToEndpointSpec(),
@@ -503,6 +505,7 @@ func addServiceFlags(cmd *cobra.Command, opts *serviceOptions) {
 
 	flags.Uint64Var(&opts.update.parallelism, flagUpdateParallelism, 1, "Maximum number of tasks updated simultaneously (0 to update all at once)")
 	flags.DurationVar(&opts.update.delay, flagUpdateDelay, time.Duration(0), "Delay between updates")
+	flags.StringVar(&opts.update.onFailure, flagUpdateFailureAction, "pause", "Action on update failure (pause|continue)")
 
 	flags.StringVar(&opts.endpoint.mode, flagEndpointMode, "", "Endpoint mode (vip or dnsrr)")
 
@@ -513,41 +516,42 @@ func addServiceFlags(cmd *cobra.Command, opts *serviceOptions) {
 }
 
 const (
-	flagConstraint         = "constraint"
-	flagConstraintRemove   = "constraint-rm"
-	flagConstraintAdd      = "constraint-add"
-	flagEndpointMode       = "endpoint-mode"
-	flagEnv                = "env"
-	flagEnvRemove          = "env-rm"
-	flagEnvAdd             = "env-add"
-	flagLabel              = "label"
-	flagLabelRemove        = "label-rm"
-	flagLabelAdd           = "label-add"
-	flagLimitCPU           = "limit-cpu"
-	flagLimitMemory        = "limit-memory"
-	flagMode               = "mode"
-	flagMount              = "mount"
-	flagMountRemove        = "mount-rm"
-	flagMountAdd           = "mount-add"
-	flagName               = "name"
-	flagNetwork            = "network"
-	flagNetworkRemove      = "network-rm"
-	flagNetworkAdd         = "network-add"
-	flagPublish            = "publish"
-	flagPublishRemove      = "publish-rm"
-	flagPublishAdd         = "publish-add"
-	flagReplicas           = "replicas"
-	flagReserveCPU         = "reserve-cpu"
-	flagReserveMemory      = "reserve-memory"
-	flagRestartCondition   = "restart-condition"
-	flagRestartDelay       = "restart-delay"
-	flagRestartMaxAttempts = "restart-max-attempts"
-	flagRestartWindow      = "restart-window"
-	flagStopGracePeriod    = "stop-grace-period"
-	flagUpdateDelay        = "update-delay"
-	flagUpdateParallelism  = "update-parallelism"
-	flagUser               = "user"
-	flagRegistryAuth       = "with-registry-auth"
-	flagLogDriver          = "log-driver"
-	flagLogOpt             = "log-opt"
+	flagConstraint          = "constraint"
+	flagConstraintRemove    = "constraint-rm"
+	flagConstraintAdd       = "constraint-add"
+	flagEndpointMode        = "endpoint-mode"
+	flagEnv                 = "env"
+	flagEnvRemove           = "env-rm"
+	flagEnvAdd              = "env-add"
+	flagLabel               = "label"
+	flagLabelRemove         = "label-rm"
+	flagLabelAdd            = "label-add"
+	flagLimitCPU            = "limit-cpu"
+	flagLimitMemory         = "limit-memory"
+	flagMode                = "mode"
+	flagMount               = "mount"
+	flagMountRemove         = "mount-rm"
+	flagMountAdd            = "mount-add"
+	flagName                = "name"
+	flagNetwork             = "network"
+	flagNetworkRemove       = "network-rm"
+	flagNetworkAdd          = "network-add"
+	flagPublish             = "publish"
+	flagPublishRemove       = "publish-rm"
+	flagPublishAdd          = "publish-add"
+	flagReplicas            = "replicas"
+	flagReserveCPU          = "reserve-cpu"
+	flagReserveMemory       = "reserve-memory"
+	flagRestartCondition    = "restart-condition"
+	flagRestartDelay        = "restart-delay"
+	flagRestartMaxAttempts  = "restart-max-attempts"
+	flagRestartWindow       = "restart-window"
+	flagStopGracePeriod     = "stop-grace-period"
+	flagUpdateDelay         = "update-delay"
+	flagUpdateFailureAction = "update-failure-action"
+	flagUpdateParallelism   = "update-parallelism"
+	flagUser                = "user"
+	flagRegistryAuth        = "with-registry-auth"
+	flagLogDriver           = "log-driver"
+	flagLogOpt              = "log-opt"
 )

+ 2 - 1
api/client/service/update.go

@@ -191,12 +191,13 @@ func updateService(flags *pflag.FlagSet, spec *swarm.ServiceSpec) error {
 		return err
 	}
 
-	if anyChanged(flags, flagUpdateParallelism, flagUpdateDelay) {
+	if anyChanged(flags, flagUpdateParallelism, flagUpdateDelay, flagUpdateFailureAction) {
 		if spec.UpdateConfig == nil {
 			spec.UpdateConfig = &swarm.UpdateConfig{}
 		}
 		updateUint64(flagUpdateParallelism, &spec.UpdateConfig.Parallelism)
 		updateDuration(flagUpdateDelay, &spec.UpdateConfig.Delay)
+		updateString(flagUpdateFailureAction, &spec.UpdateConfig.FailureAction)
 	}
 
 	updateNetworks(flags, &spec.Networks)

+ 1 - 0
contrib/completion/bash/docker

@@ -1726,6 +1726,7 @@ _docker_service_update() {
 		--restart-window
 		--stop-grace-period
 		--update-delay
+		--update-failure-action
 		--update-parallelism
 		--user -u
 		--workdir -w

+ 1 - 0
contrib/completion/zsh/_docker

@@ -1094,6 +1094,7 @@ __docker_service_subcommand() {
         "($help)--restart-window=[Window used to evaluate the restart policy]:window: "
         "($help)--stop-grace-period=[Time to wait before force killing a container]:grace period: "
         "($help)--update-delay=[Delay between updates]:delay: "
+        "($help)--update-failure-action=[Action on update failure]:mode:(pause continue)"
         "($help)--update-parallelism=[Maximum number of tasks updated simultaneously]:number: "
         "($help -u --user)"{-u=,--user=}"[Username or UID]:user:_users"
         "($help)--with-registry-auth[Send registry authentication details to swarm agents]"

+ 37 - 3
daemon/cluster/convert/service.go

@@ -53,9 +53,16 @@ func ServiceFromGRPC(s swarmapi.Service) types.Service {
 		}
 
 		service.Spec.UpdateConfig.Delay, _ = ptypes.Duration(&s.Spec.Update.Delay)
+
+		switch s.Spec.Update.FailureAction {
+		case swarmapi.UpdateConfig_PAUSE:
+			service.Spec.UpdateConfig.FailureAction = types.UpdateFailureActionPause
+		case swarmapi.UpdateConfig_CONTINUE:
+			service.Spec.UpdateConfig.FailureAction = types.UpdateFailureActionContinue
+		}
 	}
 
-	//Mode
+	// Mode
 	switch t := s.Spec.GetMode().(type) {
 	case *swarmapi.ServiceSpec_Global:
 		service.Spec.Mode.Global = &types.GlobalService{}
@@ -65,6 +72,23 @@ func ServiceFromGRPC(s swarmapi.Service) types.Service {
 		}
 	}
 
+	// UpdateStatus
+	service.UpdateStatus = types.UpdateStatus{}
+	if s.UpdateStatus != nil {
+		switch s.UpdateStatus.State {
+		case swarmapi.UpdateStatus_UPDATING:
+			service.UpdateStatus.State = types.UpdateStateUpdating
+		case swarmapi.UpdateStatus_PAUSED:
+			service.UpdateStatus.State = types.UpdateStatePaused
+		case swarmapi.UpdateStatus_COMPLETED:
+			service.UpdateStatus.State = types.UpdateStateCompleted
+		}
+
+		service.UpdateStatus.StartedAt, _ = ptypes.Timestamp(s.UpdateStatus.StartedAt)
+		service.UpdateStatus.CompletedAt, _ = ptypes.Timestamp(s.UpdateStatus.CompletedAt)
+		service.UpdateStatus.Message = s.UpdateStatus.Message
+	}
+
 	return service
 }
 
@@ -111,9 +135,19 @@ func ServiceSpecToGRPC(s types.ServiceSpec) (swarmapi.ServiceSpec, error) {
 	}
 
 	if s.UpdateConfig != nil {
+		var failureAction swarmapi.UpdateConfig_FailureAction
+		switch s.UpdateConfig.FailureAction {
+		case types.UpdateFailureActionPause, "":
+			failureAction = swarmapi.UpdateConfig_PAUSE
+		case types.UpdateFailureActionContinue:
+			failureAction = swarmapi.UpdateConfig_CONTINUE
+		default:
+			return swarmapi.ServiceSpec{}, fmt.Errorf("unrecongized update failure action %s", s.UpdateConfig.FailureAction)
+		}
 		spec.Update = &swarmapi.UpdateConfig{
-			Parallelism: s.UpdateConfig.Parallelism,
-			Delay:       *ptypes.DurationProto(s.UpdateConfig.Delay),
+			Parallelism:   s.UpdateConfig.Parallelism,
+			Delay:         *ptypes.DurationProto(s.UpdateConfig.Delay),
+			FailureAction: failureAction,
 		}
 	}
 

+ 4 - 1
docs/reference/api/docker_remote_api_v1.24.md

@@ -3964,7 +3964,8 @@ Create a service
       },
       "UpdateConfig": {
         "Delay": 30000000000.0,
-        "Parallelism": 2
+        "Parallelism": 2,
+        "FailureAction": "pause"
       },
       "EndpointSpec": {
         "Ports": [
@@ -4054,6 +4055,8 @@ JSON Parameters:
     - **Parallelism** – Maximum number of tasks to be updated in one iteration (0 means unlimited
       parallelism).
     - **Delay** – Amount of time between updates.
+    - **FailureAction** – Action to take if an updated task fails to run, or stops running during the
+      update. Values are `continue` and `pause`.
 - **Networks** – Array of network names or IDs to attach the service to.
 - **Endpoint** – Properties that can be configured to access and load balance a service.
     - **Spec** –

+ 4 - 1
docs/reference/api/docker_remote_api_v1.25.md

@@ -3965,7 +3965,8 @@ Create a service
       },
       "UpdateConfig": {
         "Delay": 30000000000.0,
-        "Parallelism": 2
+        "Parallelism": 2,
+        "FailureAction": "pause"
       },
       "EndpointSpec": {
         "Ports": [
@@ -4055,6 +4056,8 @@ JSON Parameters:
     - **Parallelism** – Maximum number of tasks to be updated in one iteration (0 means unlimited
       parallelism).
     - **Delay** – Amount of time between updates.
+    - **FailureAction** – Action to take if an updated task fails to run, or stops running during the
+      update. Values are `continue` and `pause`.
 - **Networks** – Array of network names or IDs to attach the service to.
 - **Endpoint** – Properties that can be configured to access and load balance a service.
     - **Spec** –

+ 28 - 27
docs/reference/commandline/service_create.md

@@ -17,33 +17,34 @@ Usage:  docker service create [OPTIONS] IMAGE [COMMAND] [ARG...]
 Create a new service
 
 Options:
-      --constraint value             Placement constraints (default [])
-      --endpoint-mode string         Endpoint mode (vip or dnsrr)
-  -e, --env value                    Set environment variables (default [])
-      --help                         Print usage
-  -l, --label value                  Service labels (default [])
-      --limit-cpu value              Limit CPUs (default 0.000)
-      --limit-memory value           Limit Memory (default 0 B)
-      --log-driver string            Logging driver for service
-      --log-opt value                Logging driver options (default [])
-      --mode string                  Service mode (replicated or global) (default "replicated")
-      --mount value                  Attach a mount to the service
-      --name string                  Service name
-      --network value                Network attachments (default [])
-  -p, --publish value                Publish a port as a node port (default [])
-      --replicas value               Number of tasks (default none)
-      --reserve-cpu value            Reserve CPUs (default 0.000)
-      --reserve-memory value         Reserve Memory (default 0 B)
-      --restart-condition string     Restart when condition is met (none, on-failure, or any)
-      --restart-delay value          Delay between restart attempts (default none)
-      --restart-max-attempts value   Maximum number of restarts before giving up (default none)
-      --restart-window value         Window used to evaluate the restart policy (default none)
-      --stop-grace-period value      Time to wait before force killing a container (default none)
-      --update-delay duration        Delay between updates
-      --update-parallelism uint      Maximum number of tasks updated simultaneously (0 to update all at once) (default 1)
-  -u, --user string                  Username or UID
-      --with-registry-auth           Send registry authentication details to Swarm agents
-  -w, --workdir string               Working directory inside the container
+      --constraint value               Placement constraints (default [])
+      --endpoint-mode string           Endpoint mode (vip or dnsrr)
+  -e, --env value                      Set environment variables (default [])
+      --help                           Print usage
+  -l, --label value                    Service labels (default [])
+      --limit-cpu value                Limit CPUs (default 0.000)
+      --limit-memory value             Limit Memory (default 0 B)
+      --log-driver string              Logging driver for service
+      --log-opt value                  Logging driver options (default [])
+      --mode string                    Service mode (replicated or global) (default "replicated")
+      --mount value                    Attach a mount to the service
+      --name string                    Service name
+      --network value                  Network attachments (default [])
+  -p, --publish value                  Publish a port as a node port (default [])
+      --replicas value                 Number of tasks (default none)
+      --reserve-cpu value              Reserve CPUs (default 0.000)
+      --reserve-memory value           Reserve Memory (default 0 B)
+      --restart-condition string       Restart when condition is met (none, on-failure, or any)
+      --restart-delay value            Delay between restart attempts (default none)
+      --restart-max-attempts value     Maximum number of restarts before giving up (default none)
+      --restart-window value           Window used to evaluate the restart policy (default none)
+      --stop-grace-period value        Time to wait before force killing a container (default none)
+      --update-delay duration          Delay between updates
+      --update-failure-action string   Action on update failure (pause|continue) (default "pause")
+      --update-parallelism uint        Maximum number of tasks updated simultaneously (0 to update all at once) (default 1)
+  -u, --user string                    Username or UID
+      --with-registry-auth             Send registry authentication details to Swarm agents
+  -w, --workdir string                 Working directory inside the container
 ```
 
 Creates a service as described by the specified parameters. This command has to

+ 35 - 34
docs/reference/commandline/service_update.md

@@ -17,40 +17,41 @@ Usage:  docker service update [OPTIONS] SERVICE
 Update a service
 
 Options:
-      --args string                  Service command args
-      --constraint-add value         Add or update placement constraints (default [])
-      --constraint-rm value          Remove a constraint (default [])
-      --endpoint-mode string         Endpoint mode (vip or dnsrr)
-      --env-add value                Add or update environment variables (default [])
-      --env-rm value                 Remove an environment variable (default [])
-      --help                         Print usage
-      --image string                 Service image tag
-      --label-add value              Add or update service labels (default [])
-      --label-rm value               Remove a label by its key (default [])
-      --limit-cpu value              Limit CPUs (default 0.000)
-      --limit-memory value           Limit Memory (default 0 B)
-      --log-driver string            Logging driver for service
-      --log-opt value                Logging driver options (default [])
-      --mount-add value              Add or update a mount on a service
-      --mount-rm value               Remove a mount by its target path (default [])
-      --name string                  Service name
-      --network-add value            Add or update network attachments (default [])
-      --network-rm value             Remove a network by name (default [])
-      --publish-add value            Add or update a published port (default [])
-      --publish-rm value             Remove a published port by its target port (default [])
-      --replicas value               Number of tasks (default none)
-      --reserve-cpu value            Reserve CPUs (default 0.000)
-      --reserve-memory value         Reserve Memory (default 0 B)
-      --restart-condition string     Restart when condition is met (none, on-failure, or any)
-      --restart-delay value          Delay between restart attempts (default none)
-      --restart-max-attempts value   Maximum number of restarts before giving up (default none)
-      --restart-window value         Window used to evaluate the restart policy (default none)
-      --stop-grace-period value      Time to wait before force killing a container (default none)
-      --update-delay duration        Delay between updates
-      --update-parallelism uint      Maximum number of tasks updated simultaneously (0 to update all at once) (default 1)
-  -u, --user string                  Username or UID
-      --with-registry-auth           Send registry authentication details to Swarm agents
-  -w, --workdir string               Working directory inside the container
+      --args string                    Service command args
+      --constraint-add value           Add or update placement constraints (default [])
+      --constraint-rm value            Remove a constraint (default [])
+      --endpoint-mode string           Endpoint mode (vip or dnsrr)
+      --env-add value                  Add or update environment variables (default [])
+      --env-rm value                   Remove an environment variable (default [])
+      --help                           Print usage
+      --image string                   Service image tag
+      --label-add value                Add or update service labels (default [])
+      --label-rm value                 Remove a label by its key (default [])
+      --limit-cpu value                Limit CPUs (default 0.000)
+      --limit-memory value             Limit Memory (default 0 B)
+      --log-driver string              Logging driver for service
+      --log-opt value                  Logging driver options (default [])
+      --mount-add value                Add or update a mount on a service
+      --mount-rm value                 Remove a mount by its target path (default [])
+      --name string                    Service name
+      --network-add value              Add or update network attachments (default [])
+      --network-rm value               Remove a network by name (default [])
+      --publish-add value              Add or update a published port (default [])
+      --publish-rm value               Remove a published port by its target port (default [])
+      --replicas value                 Number of tasks (default none)
+      --reserve-cpu value              Reserve CPUs (default 0.000)
+      --reserve-memory value           Reserve Memory (default 0 B)
+      --restart-condition string       Restart when condition is met (none, on-failure, or any)
+      --restart-delay value            Delay between restart attempts (default none)
+      --restart-max-attempts value     Maximum number of restarts before giving up (default none)
+      --restart-window value           Window used to evaluate the restart policy (default none)
+      --stop-grace-period value        Time to wait before force killing a container (default none)
+      --update-delay duration          Delay between updates
+      --update-failure-action string   Action on update failure (pause|continue) (default "pause")
+      --update-parallelism uint        Maximum number of tasks updated simultaneously (0 to update all at once) (default 1)
+  -u, --user string                    Username or UID
+      --with-registry-auth             Send registry authentication details to Swarm agents
+  -w, --workdir string                 Working directory inside the container
 ```
 
 Updates a service as described by the specified parameters. This command has to be run targeting a manager node.

+ 3 - 2
integration-cli/docker_api_swarm_test.go

@@ -791,8 +791,9 @@ func serviceForUpdate(s *swarm.Service) {
 			},
 		},
 		UpdateConfig: &swarm.UpdateConfig{
-			Parallelism: 2,
-			Delay:       8 * time.Second,
+			Parallelism:   2,
+			Delay:         8 * time.Second,
+			FailureAction: swarm.UpdateFailureActionContinue,
 		},
 	}
 	s.Spec.Name = "updatetest"