瀏覽代碼

Service update failure thresholds and rollback

This adds support for two enhancements to swarm service rolling updates:

- Failure thresholds: In Docker 1.12, a service update could be set up
  to either pause or continue after a single failure occurs. This adds
  an --update-max-failure-ratio flag that controls how many tasks need to
  fail to update for the update as a whole to be considered a failure. A
  counterpart flag, --update-monitor, controls how long to monitor each
  task for a failure after starting it during the update.

- Rollback flag: service update --rollback reverts the service to its
  previous version. If a service update encounters task failures, or
  fails to function properly for some other reason, the user can roll back
  the update.

SwarmKit also has the ability to roll back updates automatically after
hitting the failure thresholds, but we've decided not to expose this in
the Docker API/CLI for now, favoring a workflow where the decision to
roll back is always made by an admin. Depending on user feedback, we may
add a "rollback" option to --update-failure-action in the future.

Signed-off-by: Aaron Lehmann <aaron.lehmann@docker.com>
Aaron Lehmann 8 年之前
父節點
當前提交
6d4b527699

+ 1 - 1
api/server/router/swarm/backend.go

@@ -15,7 +15,7 @@ type Backend interface {
 	GetServices(basictypes.ServiceListOptions) ([]types.Service, error)
 	GetServices(basictypes.ServiceListOptions) ([]types.Service, error)
 	GetService(string) (types.Service, error)
 	GetService(string) (types.Service, error)
 	CreateService(types.ServiceSpec, string) (string, error)
 	CreateService(types.ServiceSpec, string) (string, error)
-	UpdateService(string, uint64, types.ServiceSpec, string) error
+	UpdateService(string, uint64, types.ServiceSpec, string, string) error
 	RemoveService(string) error
 	RemoveService(string) error
 	GetNodes(basictypes.NodeListOptions) ([]types.Node, error)
 	GetNodes(basictypes.NodeListOptions) ([]types.Node, error)
 	GetNode(string) (types.Node, error)
 	GetNode(string) (types.Node, error)

+ 3 - 1
api/server/router/swarm/cluster_routes.go

@@ -156,7 +156,9 @@ func (sr *swarmRouter) updateService(ctx context.Context, w http.ResponseWriter,
 	// Get returns "" if the header does not exist
 	// Get returns "" if the header does not exist
 	encodedAuth := r.Header.Get("X-Registry-Auth")
 	encodedAuth := r.Header.Get("X-Registry-Auth")
 
 
-	if err := sr.backend.UpdateService(vars["id"], version, service, encodedAuth); err != nil {
+	registryAuthFrom := r.URL.Query().Get("registryAuthFrom")
+
+	if err := sr.backend.UpdateService(vars["id"], version, service, encodedAuth, registryAuthFrom); err != nil {
 		logrus.Errorf("Error updating service %s: %v", vars["id"], err)
 		logrus.Errorf("Error updating service %s: %v", vars["id"], err)
 		return err
 		return err
 	}
 	}

+ 3 - 3
api/types/swarm/service.go

@@ -90,16 +90,16 @@ type UpdateConfig struct {
 	// be used.
 	// be used.
 	Monitor time.Duration `json:",omitempty"`
 	Monitor time.Duration `json:",omitempty"`
 
 
-	// AllowedFailureFraction is the fraction of tasks that may fail during
+	// MaxFailureRatio is the fraction of tasks that may fail during
 	// an update before the failure action is invoked. Any task created by
 	// an update before the failure action is invoked. Any task created by
 	// the current update which ends up in one of the states REJECTED,
 	// the current update which ends up in one of the states REJECTED,
 	// COMPLETED or FAILED within Monitor from its creation counts as a
 	// COMPLETED or FAILED within Monitor from its creation counts as a
 	// failure. The number of failures is divided by the number of tasks
 	// failure. The number of failures is divided by the number of tasks
 	// being updated, and if this fraction is greater than
 	// being updated, and if this fraction is greater than
-	// AllowedFailureFraction, the failure action is invoked.
+	// MaxFailureRatio, the failure action is invoked.
 	//
 	//
 	// If the failure action is CONTINUE, there is no effect.
 	// If the failure action is CONTINUE, there is no effect.
 	// If the failure action is PAUSE, no more tasks will be updated until
 	// If the failure action is PAUSE, no more tasks will be updated until
 	// another update is started.
 	// another update is started.
-	AllowedFailureFraction float32
+	MaxFailureRatio float32
 }
 }

+ 17 - 1
cli/command/formatter/service.go

@@ -41,10 +41,14 @@ Placement:
 {{- if .HasUpdateConfig }}
 {{- if .HasUpdateConfig }}
 UpdateConfig:
 UpdateConfig:
  Parallelism:	{{ .UpdateParallelism }}
  Parallelism:	{{ .UpdateParallelism }}
-{{- if .HasUpdateDelay -}}
+{{- if .HasUpdateDelay}}
  Delay:		{{ .UpdateDelay }}
  Delay:		{{ .UpdateDelay }}
 {{- end }}
 {{- end }}
  On failure:	{{ .UpdateOnFailure }}
  On failure:	{{ .UpdateOnFailure }}
+{{- if .HasUpdateMonitor}}
+ Monitoring Period: {{ .UpdateMonitor }}
+{{- end }}
+ Max failure ratio: {{ .UpdateMaxFailureRatio }}
 {{- end }}
 {{- end }}
 ContainerSpec:
 ContainerSpec:
  Image:		{{ .ContainerImage }}
  Image:		{{ .ContainerImage }}
@@ -218,6 +222,18 @@ func (ctx *serviceInspectContext) UpdateOnFailure() string {
 	return ctx.Service.Spec.UpdateConfig.FailureAction
 	return ctx.Service.Spec.UpdateConfig.FailureAction
 }
 }
 
 
+func (ctx *serviceInspectContext) HasUpdateMonitor() bool {
+	return ctx.Service.Spec.UpdateConfig.Monitor.Nanoseconds() > 0
+}
+
+func (ctx *serviceInspectContext) UpdateMonitor() time.Duration {
+	return ctx.Service.Spec.UpdateConfig.Monitor
+}
+
+func (ctx *serviceInspectContext) UpdateMaxFailureRatio() float32 {
+	return ctx.Service.Spec.UpdateConfig.MaxFailureRatio
+}
+
 func (ctx *serviceInspectContext) ContainerImage() string {
 func (ctx *serviceInspectContext) ContainerImage() string {
 	return ctx.Service.Spec.TaskTemplate.ContainerSpec.Image
 	return ctx.Service.Spec.TaskTemplate.ContainerSpec.Image
 }
 }

+ 56 - 48
cli/command/service/opts.go

@@ -267,9 +267,11 @@ func (m *MountOpt) Value() []mounttypes.Mount {
 }
 }
 
 
 type updateOptions struct {
 type updateOptions struct {
-	parallelism uint64
-	delay       time.Duration
-	onFailure   string
+	parallelism     uint64
+	delay           time.Duration
+	monitor         time.Duration
+	onFailure       string
+	maxFailureRatio float32
 }
 }
 
 
 type resourceOptions struct {
 type resourceOptions struct {
@@ -458,9 +460,11 @@ func (opts *serviceOptions) ToService() (swarm.ServiceSpec, error) {
 		Networks: convertNetworks(opts.networks),
 		Networks: convertNetworks(opts.networks),
 		Mode:     swarm.ServiceMode{},
 		Mode:     swarm.ServiceMode{},
 		UpdateConfig: &swarm.UpdateConfig{
 		UpdateConfig: &swarm.UpdateConfig{
-			Parallelism:   opts.update.parallelism,
-			Delay:         opts.update.delay,
-			FailureAction: opts.update.onFailure,
+			Parallelism:     opts.update.parallelism,
+			Delay:           opts.update.delay,
+			Monitor:         opts.update.monitor,
+			FailureAction:   opts.update.onFailure,
+			MaxFailureRatio: opts.update.maxFailureRatio,
 		},
 		},
 		EndpointSpec: opts.endpoint.ToEndpointSpec(),
 		EndpointSpec: opts.endpoint.ToEndpointSpec(),
 	}
 	}
@@ -507,7 +511,9 @@ func addServiceFlags(cmd *cobra.Command, opts *serviceOptions) {
 
 
 	flags.Uint64Var(&opts.update.parallelism, flagUpdateParallelism, 1, "Maximum number of tasks updated simultaneously (0 to update all at once)")
 	flags.Uint64Var(&opts.update.parallelism, flagUpdateParallelism, 1, "Maximum number of tasks updated simultaneously (0 to update all at once)")
 	flags.DurationVar(&opts.update.delay, flagUpdateDelay, time.Duration(0), "Delay between updates")
 	flags.DurationVar(&opts.update.delay, flagUpdateDelay, time.Duration(0), "Delay between updates")
+	flags.DurationVar(&opts.update.monitor, flagUpdateMonitor, time.Duration(0), "Duration after each task update to monitor for failure")
 	flags.StringVar(&opts.update.onFailure, flagUpdateFailureAction, "pause", "Action on update failure (pause|continue)")
 	flags.StringVar(&opts.update.onFailure, flagUpdateFailureAction, "pause", "Action on update failure (pause|continue)")
+	flags.Float32Var(&opts.update.maxFailureRatio, flagUpdateMaxFailureRatio, 0, "Failure rate to tolerate during an update")
 
 
 	flags.StringVar(&opts.endpoint.mode, flagEndpointMode, "", "Endpoint mode (vip or dnsrr)")
 	flags.StringVar(&opts.endpoint.mode, flagEndpointMode, "", "Endpoint mode (vip or dnsrr)")
 
 
@@ -518,46 +524,48 @@ func addServiceFlags(cmd *cobra.Command, opts *serviceOptions) {
 }
 }
 
 
 const (
 const (
-	flagConstraint           = "constraint"
-	flagConstraintRemove     = "constraint-rm"
-	flagConstraintAdd        = "constraint-add"
-	flagContainerLabel       = "container-label"
-	flagContainerLabelRemove = "container-label-rm"
-	flagContainerLabelAdd    = "container-label-add"
-	flagEndpointMode         = "endpoint-mode"
-	flagEnv                  = "env"
-	flagEnvRemove            = "env-rm"
-	flagEnvAdd               = "env-add"
-	flagGroupAdd             = "group-add"
-	flagGroupRemove          = "group-rm"
-	flagLabel                = "label"
-	flagLabelRemove          = "label-rm"
-	flagLabelAdd             = "label-add"
-	flagLimitCPU             = "limit-cpu"
-	flagLimitMemory          = "limit-memory"
-	flagMode                 = "mode"
-	flagMount                = "mount"
-	flagMountRemove          = "mount-rm"
-	flagMountAdd             = "mount-add"
-	flagName                 = "name"
-	flagNetwork              = "network"
-	flagPublish              = "publish"
-	flagPublishRemove        = "publish-rm"
-	flagPublishAdd           = "publish-add"
-	flagReplicas             = "replicas"
-	flagReserveCPU           = "reserve-cpu"
-	flagReserveMemory        = "reserve-memory"
-	flagRestartCondition     = "restart-condition"
-	flagRestartDelay         = "restart-delay"
-	flagRestartMaxAttempts   = "restart-max-attempts"
-	flagRestartWindow        = "restart-window"
-	flagStopGracePeriod      = "stop-grace-period"
-	flagUpdateDelay          = "update-delay"
-	flagUpdateFailureAction  = "update-failure-action"
-	flagUpdateParallelism    = "update-parallelism"
-	flagUser                 = "user"
-	flagWorkdir              = "workdir"
-	flagRegistryAuth         = "with-registry-auth"
-	flagLogDriver            = "log-driver"
-	flagLogOpt               = "log-opt"
+	flagConstraint            = "constraint"
+	flagConstraintRemove      = "constraint-rm"
+	flagConstraintAdd         = "constraint-add"
+	flagContainerLabel        = "container-label"
+	flagContainerLabelRemove  = "container-label-rm"
+	flagContainerLabelAdd     = "container-label-add"
+	flagEndpointMode          = "endpoint-mode"
+	flagEnv                   = "env"
+	flagEnvRemove             = "env-rm"
+	flagEnvAdd                = "env-add"
+	flagGroupAdd              = "group-add"
+	flagGroupRemove           = "group-rm"
+	flagLabel                 = "label"
+	flagLabelRemove           = "label-rm"
+	flagLabelAdd              = "label-add"
+	flagLimitCPU              = "limit-cpu"
+	flagLimitMemory           = "limit-memory"
+	flagMode                  = "mode"
+	flagMount                 = "mount"
+	flagMountRemove           = "mount-rm"
+	flagMountAdd              = "mount-add"
+	flagName                  = "name"
+	flagNetwork               = "network"
+	flagPublish               = "publish"
+	flagPublishRemove         = "publish-rm"
+	flagPublishAdd            = "publish-add"
+	flagReplicas              = "replicas"
+	flagReserveCPU            = "reserve-cpu"
+	flagReserveMemory         = "reserve-memory"
+	flagRestartCondition      = "restart-condition"
+	flagRestartDelay          = "restart-delay"
+	flagRestartMaxAttempts    = "restart-max-attempts"
+	flagRestartWindow         = "restart-window"
+	flagStopGracePeriod       = "stop-grace-period"
+	flagUpdateDelay           = "update-delay"
+	flagUpdateFailureAction   = "update-failure-action"
+	flagUpdateMaxFailureRatio = "update-max-failure-ratio"
+	flagUpdateMonitor         = "update-monitor"
+	flagUpdateParallelism     = "update-parallelism"
+	flagUser                  = "user"
+	flagWorkdir               = "workdir"
+	flagRegistryAuth          = "with-registry-auth"
+	flagLogDriver             = "log-driver"
+	flagLogOpt                = "log-opt"
 )
 )

+ 30 - 4
cli/command/service/update.go

@@ -36,6 +36,7 @@ func newUpdateCommand(dockerCli *command.DockerCli) *cobra.Command {
 	flags := cmd.Flags()
 	flags := cmd.Flags()
 	flags.String("image", "", "Service image tag")
 	flags.String("image", "", "Service image tag")
 	flags.String("args", "", "Service command args")
 	flags.String("args", "", "Service command args")
+	flags.Bool("rollback", false, "Rollback to previous specification")
 	addServiceFlags(cmd, opts)
 	addServiceFlags(cmd, opts)
 
 
 	flags.Var(newListOptsVar(), flagEnvRemove, "Remove an environment variable")
 	flags.Var(newListOptsVar(), flagEnvRemove, "Remove an environment variable")
@@ -68,7 +69,20 @@ func runUpdate(dockerCli *command.DockerCli, flags *pflag.FlagSet, serviceID str
 		return err
 		return err
 	}
 	}
 
 
-	err = updateService(flags, &service.Spec)
+	rollback, err := flags.GetBool("rollback")
+	if err != nil {
+		return err
+	}
+
+	spec := &service.Spec
+	if rollback {
+		spec = service.PreviousSpec
+		if spec == nil {
+			return fmt.Errorf("service does not have a previous specification to roll back to")
+		}
+	}
+
+	err = updateService(flags, spec)
 	if err != nil {
 	if err != nil {
 		return err
 		return err
 	}
 	}
@@ -81,15 +95,19 @@ func runUpdate(dockerCli *command.DockerCli, flags *pflag.FlagSet, serviceID str
 	if sendAuth {
 	if sendAuth {
 		// Retrieve encoded auth token from the image reference
 		// Retrieve encoded auth token from the image reference
 		// This would be the old image if it didn't change in this update
 		// This would be the old image if it didn't change in this update
-		image := service.Spec.TaskTemplate.ContainerSpec.Image
+		image := spec.TaskTemplate.ContainerSpec.Image
 		encodedAuth, err := command.RetrieveAuthTokenFromImage(ctx, dockerCli, image)
 		encodedAuth, err := command.RetrieveAuthTokenFromImage(ctx, dockerCli, image)
 		if err != nil {
 		if err != nil {
 			return err
 			return err
 		}
 		}
 		updateOpts.EncodedRegistryAuth = encodedAuth
 		updateOpts.EncodedRegistryAuth = encodedAuth
+	} else if rollback {
+		updateOpts.RegistryAuthFrom = types.RegistryAuthFromPreviousSpec
+	} else {
+		updateOpts.RegistryAuthFrom = types.RegistryAuthFromSpec
 	}
 	}
 
 
-	err = apiClient.ServiceUpdate(ctx, service.ID, service.Version, service.Spec, updateOpts)
+	err = apiClient.ServiceUpdate(ctx, service.ID, service.Version, *spec, updateOpts)
 	if err != nil {
 	if err != nil {
 		return err
 		return err
 	}
 	}
@@ -111,6 +129,12 @@ func updateService(flags *pflag.FlagSet, spec *swarm.ServiceSpec) error {
 		}
 		}
 	}
 	}
 
 
+	updateFloat32 := func(flag string, field *float32) {
+		if flags.Changed(flag) {
+			*field, _ = flags.GetFloat32(flag)
+		}
+	}
+
 	updateDuration := func(flag string, field *time.Duration) {
 	updateDuration := func(flag string, field *time.Duration) {
 		if flags.Changed(flag) {
 		if flags.Changed(flag) {
 			*field, _ = flags.GetDuration(flag)
 			*field, _ = flags.GetDuration(flag)
@@ -195,13 +219,15 @@ func updateService(flags *pflag.FlagSet, spec *swarm.ServiceSpec) error {
 		return err
 		return err
 	}
 	}
 
 
-	if anyChanged(flags, flagUpdateParallelism, flagUpdateDelay, flagUpdateFailureAction) {
+	if anyChanged(flags, flagUpdateParallelism, flagUpdateDelay, flagUpdateMonitor, flagUpdateFailureAction, flagUpdateMaxFailureRatio) {
 		if spec.UpdateConfig == nil {
 		if spec.UpdateConfig == nil {
 			spec.UpdateConfig = &swarm.UpdateConfig{}
 			spec.UpdateConfig = &swarm.UpdateConfig{}
 		}
 		}
 		updateUint64(flagUpdateParallelism, &spec.UpdateConfig.Parallelism)
 		updateUint64(flagUpdateParallelism, &spec.UpdateConfig.Parallelism)
 		updateDuration(flagUpdateDelay, &spec.UpdateConfig.Delay)
 		updateDuration(flagUpdateDelay, &spec.UpdateConfig.Delay)
+		updateDuration(flagUpdateMonitor, &spec.UpdateConfig.Monitor)
 		updateString(flagUpdateFailureAction, &spec.UpdateConfig.FailureAction)
 		updateString(flagUpdateFailureAction, &spec.UpdateConfig.FailureAction)
+		updateFloat32(flagUpdateMaxFailureRatio, &spec.UpdateConfig.MaxFailureRatio)
 	}
 	}
 
 
 	if flags.Changed(flagEndpointMode) {
 	if flags.Changed(flagEndpointMode) {

+ 3 - 0
contrib/completion/bash/docker

@@ -1760,9 +1760,12 @@ _docker_service_update() {
 		--restart-delay
 		--restart-delay
 		--restart-max-attempts
 		--restart-max-attempts
 		--restart-window
 		--restart-window
+		--rollback
 		--stop-grace-period
 		--stop-grace-period
 		--update-delay
 		--update-delay
 		--update-failure-action
 		--update-failure-action
+		--update-max-failure-ratio
+		--update-monitor
 		--update-parallelism
 		--update-parallelism
 		--user -u
 		--user -u
 		--workdir -w
 		--workdir -w

+ 3 - 0
contrib/completion/zsh/_docker

@@ -1108,6 +1108,8 @@ __docker_service_subcommand() {
         "($help)--stop-grace-period=[Time to wait before force killing a container]:grace period: "
         "($help)--stop-grace-period=[Time to wait before force killing a container]:grace period: "
         "($help)--update-delay=[Delay between updates]:delay: "
         "($help)--update-delay=[Delay between updates]:delay: "
         "($help)--update-failure-action=[Action on update failure]:mode:(pause continue)"
         "($help)--update-failure-action=[Action on update failure]:mode:(pause continue)"
+        "($help)--update-max-failure-ratio=[Failure rate to tolerate during an update]:fraction: "
+        "($help)--update-monitor=[Duration after each task update to monitor for failure]:window: "
         "($help)--update-parallelism=[Maximum number of tasks updated simultaneously]:number: "
         "($help)--update-parallelism=[Maximum number of tasks updated simultaneously]:number: "
         "($help -u --user)"{-u=,--user=}"[Username or UID]:user:_users"
         "($help -u --user)"{-u=,--user=}"[Username or UID]:user:_users"
         "($help)--with-registry-auth[Send registry authentication details to swarm agents]"
         "($help)--with-registry-auth[Send registry authentication details to swarm agents]"
@@ -1185,6 +1187,7 @@ __docker_service_subcommand() {
                 "($help)*--container-label-rm=[Remove a container label by its key]:label: " \
                 "($help)*--container-label-rm=[Remove a container label by its key]:label: " \
                 "($help)*--group-rm=[Remove previously added user groups from the container]:group:_groups" \
                 "($help)*--group-rm=[Remove previously added user groups from the container]:group:_groups" \
                 "($help)--image=[Service image tag]:image:__docker_repositories" \
                 "($help)--image=[Service image tag]:image:__docker_repositories" \
+                "($help)--rollback[Rollback to previous specification]" \
                 "($help -)1:service:__docker_complete_services" && ret=0
                 "($help -)1:service:__docker_complete_services" && ret=0
             ;;
             ;;
         (help)
         (help)

+ 13 - 2
daemon/cluster/cluster.go

@@ -913,7 +913,7 @@ func (c *Cluster) GetService(input string) (types.Service, error) {
 }
 }
 
 
 // UpdateService updates existing service to match new properties.
 // UpdateService updates existing service to match new properties.
-func (c *Cluster) UpdateService(serviceIDOrName string, version uint64, spec types.ServiceSpec, encodedAuth string) error {
+func (c *Cluster) UpdateService(serviceIDOrName string, version uint64, spec types.ServiceSpec, encodedAuth string, registryAuthFrom string) error {
 	c.RLock()
 	c.RLock()
 	defer c.RUnlock()
 	defer c.RUnlock()
 
 
@@ -948,7 +948,18 @@ func (c *Cluster) UpdateService(serviceIDOrName string, version uint64, spec typ
 	} else {
 	} else {
 		// this is needed because if the encodedAuth isn't being updated then we
 		// this is needed because if the encodedAuth isn't being updated then we
 		// shouldn't lose it, and continue to use the one that was already present
 		// shouldn't lose it, and continue to use the one that was already present
-		ctnr := currentService.Spec.Task.GetContainer()
+		var ctnr *swarmapi.ContainerSpec
+		switch registryAuthFrom {
+		case apitypes.RegistryAuthFromSpec, "":
+			ctnr = currentService.Spec.Task.GetContainer()
+		case apitypes.RegistryAuthFromPreviousSpec:
+			if currentService.PreviousSpec == nil {
+				return fmt.Errorf("service does not have a previous spec")
+			}
+			ctnr = currentService.PreviousSpec.Task.GetContainer()
+		default:
+			return fmt.Errorf("unsupported registryAuthFromValue")
+		}
 		if ctnr == nil {
 		if ctnr == nil {
 			return fmt.Errorf("service does not use container tasks")
 			return fmt.Errorf("service does not use container tasks")
 		}
 		}

+ 75 - 57
daemon/cluster/convert/service.go

@@ -12,8 +12,43 @@ import (
 
 
 // ServiceFromGRPC converts a grpc Service to a Service.
 // ServiceFromGRPC converts a grpc Service to a Service.
 func ServiceFromGRPC(s swarmapi.Service) types.Service {
 func ServiceFromGRPC(s swarmapi.Service) types.Service {
-	spec := s.Spec
-	containerConfig := spec.Task.Runtime.(*swarmapi.TaskSpec_Container).Container
+	service := types.Service{
+		ID:           s.ID,
+		Spec:         *serviceSpecFromGRPC(&s.Spec),
+		PreviousSpec: serviceSpecFromGRPC(s.PreviousSpec),
+
+		Endpoint: endpointFromGRPC(s.Endpoint),
+	}
+
+	// Meta
+	service.Version.Index = s.Meta.Version.Index
+	service.CreatedAt, _ = ptypes.Timestamp(s.Meta.CreatedAt)
+	service.UpdatedAt, _ = ptypes.Timestamp(s.Meta.UpdatedAt)
+
+	// UpdateStatus
+	service.UpdateStatus = types.UpdateStatus{}
+	if s.UpdateStatus != nil {
+		switch s.UpdateStatus.State {
+		case swarmapi.UpdateStatus_UPDATING:
+			service.UpdateStatus.State = types.UpdateStateUpdating
+		case swarmapi.UpdateStatus_PAUSED:
+			service.UpdateStatus.State = types.UpdateStatePaused
+		case swarmapi.UpdateStatus_COMPLETED:
+			service.UpdateStatus.State = types.UpdateStateCompleted
+		}
+
+		service.UpdateStatus.StartedAt, _ = ptypes.Timestamp(s.UpdateStatus.StartedAt)
+		service.UpdateStatus.CompletedAt, _ = ptypes.Timestamp(s.UpdateStatus.CompletedAt)
+		service.UpdateStatus.Message = s.UpdateStatus.Message
+	}
+
+	return service
+}
+
+func serviceSpecFromGRPC(spec *swarmapi.ServiceSpec) *types.ServiceSpec {
+	if spec == nil {
+		return nil
+	}
 
 
 	serviceNetworks := make([]types.NetworkAttachmentConfig, 0, len(spec.Networks))
 	serviceNetworks := make([]types.NetworkAttachmentConfig, 0, len(spec.Networks))
 	for _, n := range spec.Networks {
 	for _, n := range spec.Networks {
@@ -25,78 +60,57 @@ func ServiceFromGRPC(s swarmapi.Service) types.Service {
 		taskNetworks = append(taskNetworks, types.NetworkAttachmentConfig{Target: n.Target, Aliases: n.Aliases})
 		taskNetworks = append(taskNetworks, types.NetworkAttachmentConfig{Target: n.Target, Aliases: n.Aliases})
 	}
 	}
 
 
-	service := types.Service{
-		ID: s.ID,
-
-		Spec: types.ServiceSpec{
-			TaskTemplate: types.TaskSpec{
-				ContainerSpec: containerSpecFromGRPC(containerConfig),
-				Resources:     resourcesFromGRPC(s.Spec.Task.Resources),
-				RestartPolicy: restartPolicyFromGRPC(s.Spec.Task.Restart),
-				Placement:     placementFromGRPC(s.Spec.Task.Placement),
-				LogDriver:     driverFromGRPC(s.Spec.Task.LogDriver),
-				Networks:      taskNetworks,
-			},
-
-			Networks:     serviceNetworks,
-			EndpointSpec: endpointSpecFromGRPC(s.Spec.Endpoint),
+	containerConfig := spec.Task.Runtime.(*swarmapi.TaskSpec_Container).Container
+	convertedSpec := &types.ServiceSpec{
+		Annotations: types.Annotations{
+			Name:   spec.Annotations.Name,
+			Labels: spec.Annotations.Labels,
 		},
 		},
-		Endpoint: endpointFromGRPC(s.Endpoint),
-	}
 
 
-	// Meta
-	service.Version.Index = s.Meta.Version.Index
-	service.CreatedAt, _ = ptypes.Timestamp(s.Meta.CreatedAt)
-	service.UpdatedAt, _ = ptypes.Timestamp(s.Meta.UpdatedAt)
+		TaskTemplate: types.TaskSpec{
+			ContainerSpec: containerSpecFromGRPC(containerConfig),
+			Resources:     resourcesFromGRPC(spec.Task.Resources),
+			RestartPolicy: restartPolicyFromGRPC(spec.Task.Restart),
+			Placement:     placementFromGRPC(spec.Task.Placement),
+			LogDriver:     driverFromGRPC(spec.Task.LogDriver),
+			Networks:      taskNetworks,
+		},
 
 
-	// Annotations
-	service.Spec.Name = s.Spec.Annotations.Name
-	service.Spec.Labels = s.Spec.Annotations.Labels
+		Networks:     serviceNetworks,
+		EndpointSpec: endpointSpecFromGRPC(spec.Endpoint),
+	}
 
 
 	// UpdateConfig
 	// UpdateConfig
-	if s.Spec.Update != nil {
-		service.Spec.UpdateConfig = &types.UpdateConfig{
-			Parallelism: s.Spec.Update.Parallelism,
+	if spec.Update != nil {
+		convertedSpec.UpdateConfig = &types.UpdateConfig{
+			Parallelism:     spec.Update.Parallelism,
+			MaxFailureRatio: spec.Update.MaxFailureRatio,
 		}
 		}
 
 
-		service.Spec.UpdateConfig.Delay, _ = ptypes.Duration(&s.Spec.Update.Delay)
+		convertedSpec.UpdateConfig.Delay, _ = ptypes.Duration(&spec.Update.Delay)
+		if spec.Update.Monitor != nil {
+			convertedSpec.UpdateConfig.Monitor, _ = ptypes.Duration(spec.Update.Monitor)
+		}
 
 
-		switch s.Spec.Update.FailureAction {
+		switch spec.Update.FailureAction {
 		case swarmapi.UpdateConfig_PAUSE:
 		case swarmapi.UpdateConfig_PAUSE:
-			service.Spec.UpdateConfig.FailureAction = types.UpdateFailureActionPause
+			convertedSpec.UpdateConfig.FailureAction = types.UpdateFailureActionPause
 		case swarmapi.UpdateConfig_CONTINUE:
 		case swarmapi.UpdateConfig_CONTINUE:
-			service.Spec.UpdateConfig.FailureAction = types.UpdateFailureActionContinue
+			convertedSpec.UpdateConfig.FailureAction = types.UpdateFailureActionContinue
 		}
 		}
 	}
 	}
 
 
 	// Mode
 	// Mode
-	switch t := s.Spec.GetMode().(type) {
+	switch t := spec.GetMode().(type) {
 	case *swarmapi.ServiceSpec_Global:
 	case *swarmapi.ServiceSpec_Global:
-		service.Spec.Mode.Global = &types.GlobalService{}
+		convertedSpec.Mode.Global = &types.GlobalService{}
 	case *swarmapi.ServiceSpec_Replicated:
 	case *swarmapi.ServiceSpec_Replicated:
-		service.Spec.Mode.Replicated = &types.ReplicatedService{
+		convertedSpec.Mode.Replicated = &types.ReplicatedService{
 			Replicas: &t.Replicated.Replicas,
 			Replicas: &t.Replicated.Replicas,
 		}
 		}
 	}
 	}
 
 
-	// UpdateStatus
-	service.UpdateStatus = types.UpdateStatus{}
-	if s.UpdateStatus != nil {
-		switch s.UpdateStatus.State {
-		case swarmapi.UpdateStatus_UPDATING:
-			service.UpdateStatus.State = types.UpdateStateUpdating
-		case swarmapi.UpdateStatus_PAUSED:
-			service.UpdateStatus.State = types.UpdateStatePaused
-		case swarmapi.UpdateStatus_COMPLETED:
-			service.UpdateStatus.State = types.UpdateStateCompleted
-		}
-
-		service.UpdateStatus.StartedAt, _ = ptypes.Timestamp(s.UpdateStatus.StartedAt)
-		service.UpdateStatus.CompletedAt, _ = ptypes.Timestamp(s.UpdateStatus.CompletedAt)
-		service.UpdateStatus.Message = s.UpdateStatus.Message
-	}
-
-	return service
+	return convertedSpec
 }
 }
 
 
 // ServiceSpecToGRPC converts a ServiceSpec to a grpc ServiceSpec.
 // ServiceSpecToGRPC converts a ServiceSpec to a grpc ServiceSpec.
@@ -158,9 +172,13 @@ func ServiceSpecToGRPC(s types.ServiceSpec) (swarmapi.ServiceSpec, error) {
 			return swarmapi.ServiceSpec{}, fmt.Errorf("unrecongized update failure action %s", s.UpdateConfig.FailureAction)
 			return swarmapi.ServiceSpec{}, fmt.Errorf("unrecongized update failure action %s", s.UpdateConfig.FailureAction)
 		}
 		}
 		spec.Update = &swarmapi.UpdateConfig{
 		spec.Update = &swarmapi.UpdateConfig{
-			Parallelism:   s.UpdateConfig.Parallelism,
-			Delay:         *ptypes.DurationProto(s.UpdateConfig.Delay),
-			FailureAction: failureAction,
+			Parallelism:     s.UpdateConfig.Parallelism,
+			Delay:           *ptypes.DurationProto(s.UpdateConfig.Delay),
+			FailureAction:   failureAction,
+			MaxFailureRatio: s.UpdateConfig.MaxFailureRatio,
+		}
+		if s.UpdateConfig.Monitor != 0 {
+			spec.Update.Monitor = ptypes.DurationProto(s.UpdateConfig.Monitor)
 		}
 		}
 	}
 	}
 
 

+ 1 - 0
docs/reference/api/docker_remote_api.md

@@ -129,6 +129,7 @@ This section lists each version from latest to oldest.  Each listing includes a
 * `GET /containers/json` now supports a `is-task` filter to filter
 * `GET /containers/json` now supports a `is-task` filter to filter
   containers that are tasks (part of a service in swarm mode).
   containers that are tasks (part of a service in swarm mode).
 * `POST /containers/create` now takes `StopTimeout` field.
 * `POST /containers/create` now takes `StopTimeout` field.
+* `POST /services/create` and `POST /services/(id or name)/update` now accept `Monitor` and `MaxFailureRatio` parameters, which control the response to failures during service updates.
 
 
 ### v1.24 API changes
 ### v1.24 API changes
 
 

+ 23 - 7
docs/reference/api/docker_remote_api_v1.25.md

@@ -4877,7 +4877,9 @@ List services
           },
           },
           "UpdateConfig": {
           "UpdateConfig": {
             "Parallelism": 1,
             "Parallelism": 1,
-            "FailureAction": "pause"
+            "FailureAction": "pause",
+            "Monitor": 15000000000,
+            "MaxFailureRatio": 0.15
           },
           },
           "EndpointSpec": {
           "EndpointSpec": {
             "Mode": "vip",
             "Mode": "vip",
@@ -5077,8 +5079,8 @@ image](#create-an-image) section for more details.
     - **RestartPolicy** – Specification for the restart policy which applies to containers created
     - **RestartPolicy** – Specification for the restart policy which applies to containers created
       as part of this service.
       as part of this service.
         - **Condition** – Condition for restart (`none`, `on-failure`, or `any`).
         - **Condition** – Condition for restart (`none`, `on-failure`, or `any`).
-        - **Delay** – Delay between restart attempts.
-        - **Attempts** – Maximum attempts to restart a given container before giving up (default value
+        - **Delay** – Delay between restart attempts, in nanoseconds.
+        - **MaxAttempts** – Maximum attempts to restart a given container before giving up (default value
           is 0, which is ignored).
           is 0, which is ignored).
         - **Window** – Window is the time window used to evaluate the restart policy (default value is
         - **Window** – Window is the time window used to evaluate the restart policy (default value is
           0, which is unbounded).
           0, which is unbounded).
@@ -5087,9 +5089,12 @@ image](#create-an-image) section for more details.
 - **UpdateConfig** – Specification for the update strategy of the service.
 - **UpdateConfig** – Specification for the update strategy of the service.
     - **Parallelism** – Maximum number of tasks to be updated in one iteration (0 means unlimited
     - **Parallelism** – Maximum number of tasks to be updated in one iteration (0 means unlimited
       parallelism).
       parallelism).
-    - **Delay** – Amount of time between updates.
+    - **Delay** – Amount of time between updates, in nanoseconds.
     - **FailureAction** - Action to take if an updated task fails to run, or stops running during the
     - **FailureAction** - Action to take if an updated task fails to run, or stops running during the
       update. Values are `continue` and `pause`.
       update. Values are `continue` and `pause`.
+    - **Monitor** - Amount of time to monitor each updated task for failures, in nanoseconds.
+    - **MaxFailureRatio** - The fraction of tasks that may fail during an update before the
+      failure action is invoked, specified as a floating point number between 0 and 1. The default is 0.
 - **Networks** – Array of network names or IDs to attach the service to.
 - **Networks** – Array of network names or IDs to attach the service to.
 - **EndpointSpec** – Properties that can be configured to access and load balance a service.
 - **EndpointSpec** – Properties that can be configured to access and load balance a service.
     - **Mode** – The mode of resolution to use for internal load balancing
     - **Mode** – The mode of resolution to use for internal load balancing
@@ -5259,7 +5264,9 @@ image](#create-an-image) section for more details.
         }
         }
       },
       },
       "UpdateConfig": {
       "UpdateConfig": {
-        "Parallelism": 1
+        "Parallelism": 1,
+        "Monitor": 15000000000,
+        "MaxFailureRatio": 0.15
       },
       },
       "EndpointSpec": {
       "EndpointSpec": {
         "Mode": "vip"
         "Mode": "vip"
@@ -5314,7 +5321,7 @@ image](#create-an-image) section for more details.
     - **RestartPolicy** – Specification for the restart policy which applies to containers created
     - **RestartPolicy** – Specification for the restart policy which applies to containers created
       as part of this service.
       as part of this service.
         - **Condition** – Condition for restart (`none`, `on-failure`, or `any`).
         - **Condition** – Condition for restart (`none`, `on-failure`, or `any`).
-        - **Delay** – Delay between restart attempts.
+        - **Delay** – Delay between restart attempts, in nanoseconds.
         - **MaxAttempts** – Maximum attempts to restart a given container before giving up (default value
         - **MaxAttempts** – Maximum attempts to restart a given container before giving up (default value
           is 0, which is ignored).
           is 0, which is ignored).
         - **Window** – Window is the time window used to evaluate the restart policy (default value is
         - **Window** – Window is the time window used to evaluate the restart policy (default value is
@@ -5324,7 +5331,12 @@ image](#create-an-image) section for more details.
 - **UpdateConfig** – Specification for the update strategy of the service.
 - **UpdateConfig** – Specification for the update strategy of the service.
     - **Parallelism** – Maximum number of tasks to be updated in one iteration (0 means unlimited
     - **Parallelism** – Maximum number of tasks to be updated in one iteration (0 means unlimited
       parallelism).
       parallelism).
-    - **Delay** – Amount of time between updates.
+    - **Delay** – Amount of time between updates, in nanoseconds.
+    - **FailureAction** - Action to take if an updated task fails to run, or stops running during the
+      update. Values are `continue` and `pause`.
+    - **Monitor** - Amount of time to monitor each updated task for failures, in nanoseconds.
+    - **MaxFailureRatio** - The fraction of tasks that may fail during an update before the
+      failure action is invoked, specified as a floating point number between 0 and 1. The default is 0.
 - **Networks** – Array of network names or IDs to attach the service to.
 - **Networks** – Array of network names or IDs to attach the service to.
 - **EndpointSpec** – Properties that can be configured to access and load balance a service.
 - **EndpointSpec** – Properties that can be configured to access and load balance a service.
     - **Mode** – The mode of resolution to use for internal load balancing
     - **Mode** – The mode of resolution to use for internal load balancing
@@ -5338,6 +5350,10 @@ image](#create-an-image) section for more details.
 
 
 - **version** – The version number of the service object being updated. This is
 - **version** – The version number of the service object being updated. This is
   required to avoid conflicting writes.
   required to avoid conflicting writes.
+- **registryAuthFrom** - If the X-Registry-Auth header is not specified, this
+  parameter indicates where to find registry authorization credentials. The
+  valid values are `spec` and `previous-spec`. If unspecified, the default is
+  `spec`.
 
 
 **Request Headers**:
 **Request Headers**:
 
 

+ 32 - 30
docs/reference/commandline/service_create.md

@@ -12,36 +12,38 @@ Usage:  docker service create [OPTIONS] IMAGE [COMMAND] [ARG...]
 Create a new service
 Create a new service
 
 
 Options:
 Options:
-      --constraint value               Placement constraints (default [])
-      --container-label value          Service container labels (default [])
-      --endpoint-mode string           Endpoint mode (vip or dnsrr)
-  -e, --env value                      Set environment variables (default [])
-      --group-add value                Add additional user groups to the container (default [])
-      --help                           Print usage
-  -l, --label value                    Service labels (default [])
-      --limit-cpu value                Limit CPUs (default 0.000)
-      --limit-memory value             Limit Memory (default 0 B)
-      --log-driver string              Logging driver for service
-      --log-opt value                  Logging driver options (default [])
-      --mode string                    Service mode (replicated or global) (default "replicated")
-      --mount value                    Attach a mount to the service
-      --name string                    Service name
-      --network value                  Network attachments (default [])
-  -p, --publish value                  Publish a port as a node port (default [])
-      --replicas value                 Number of tasks (default none)
-      --reserve-cpu value              Reserve CPUs (default 0.000)
-      --reserve-memory value           Reserve Memory (default 0 B)
-      --restart-condition string       Restart when condition is met (none, on-failure, or any)
-      --restart-delay value            Delay between restart attempts (default none)
-      --restart-max-attempts value     Maximum number of restarts before giving up (default none)
-      --restart-window value           Window used to evaluate the restart policy (default none)
-      --stop-grace-period value        Time to wait before force killing a container (default none)
-      --update-delay duration          Delay between updates
-      --update-failure-action string   Action on update failure (pause|continue) (default "pause")
-      --update-parallelism uint        Maximum number of tasks updated simultaneously (0 to update all at once) (default 1)
-  -u, --user string                    Username or UID (format: <name|uid>[:<group|gid>])
-      --with-registry-auth             Send registry authentication details to Swarm agents
-  -w, --workdir string                 Working directory inside the container
+      --constraint value                 Placement constraints (default [])
+      --container-label value            Service container labels (default [])
+      --endpoint-mode string             Endpoint mode (vip or dnsrr)
+  -e, --env value                        Set environment variables (default [])
+      --group-add value                  Add additional user groups to the container (default [])
+      --help                             Print usage
+  -l, --label value                      Service labels (default [])
+      --limit-cpu value                  Limit CPUs (default 0.000)
+      --limit-memory value               Limit Memory (default 0 B)
+      --log-driver string                Logging driver for service
+      --log-opt value                    Logging driver options (default [])
+      --mode string                      Service mode (replicated or global) (default "replicated")
+      --mount value                      Attach a mount to the service
+      --name string                      Service name
+      --network value                    Network attachments (default [])
+  -p, --publish value                    Publish a port as a node port (default [])
+      --replicas value                   Number of tasks (default none)
+      --reserve-cpu value                Reserve CPUs (default 0.000)
+      --reserve-memory value             Reserve Memory (default 0 B)
+      --restart-condition string         Restart when condition is met (none, on-failure, or any)
+      --restart-delay value              Delay between restart attempts (default none)
+      --restart-max-attempts value       Maximum number of restarts before giving up (default none)
+      --restart-window value             Window used to evaluate the restart policy (default none)
+      --stop-grace-period value          Time to wait before force killing a container (default none)
+      --update-delay duration            Delay between updates
+      --update-failure-action string     Action on update failure (pause|continue) (default "pause")
+      --update-max-failure-ratio value   Failure rate to tolerate during an update
+      --update-monitor duration          Duration after each task update to monitor for failure (default 0s)
+      --update-parallelism uint          Maximum number of tasks updated simultaneously (0 to update all at once) (default 1)
+  -u, --user string                      Username or UID (format: <name|uid>[:<group|gid>])
+      --with-registry-auth               Send registry authentication details to Swarm agents
+  -w, --workdir string                   Working directory inside the container
 ```
 ```
 
 
 Creates a service as described by the specified parameters. You must run this
 Creates a service as described by the specified parameters. You must run this

+ 40 - 37
docs/reference/commandline/service_update.md

@@ -12,43 +12,46 @@ Usage:  docker service update [OPTIONS] SERVICE
 Update a service
 Update a service
 
 
 Options:
 Options:
-      --args string                    Service command args
-      --constraint-add value           Add or update placement constraints (default [])
-      --constraint-rm value            Remove a constraint (default [])
-      --container-label-add value      Add or update container labels (default [])
-      --container-label-rm value       Remove a container label by its key (default [])
-      --endpoint-mode string           Endpoint mode (vip or dnsrr)
-      --env-add value                  Add or update environment variables (default [])
-      --env-rm value                   Remove an environment variable (default [])
-      --group-add value                Add additional user groups to the container (default [])
-      --group-rm value                 Remove previously added user groups from the container (default [])
-      --help                           Print usage
-      --image string                   Service image tag
-      --label-add value                Add or update service labels (default [])
-      --label-rm value                 Remove a label by its key (default [])
-      --limit-cpu value                Limit CPUs (default 0.000)
-      --limit-memory value             Limit Memory (default 0 B)
-      --log-driver string              Logging driver for service
-      --log-opt value                  Logging driver options (default [])
-      --mount-add value                Add or update a mount on a service
-      --mount-rm value                 Remove a mount by its target path (default [])
-      --name string                    Service name
-      --publish-add value              Add or update a published port (default [])
-      --publish-rm value               Remove a published port by its target port (default [])
-      --replicas value                 Number of tasks (default none)
-      --reserve-cpu value              Reserve CPUs (default 0.000)
-      --reserve-memory value           Reserve Memory (default 0 B)
-      --restart-condition string       Restart when condition is met (none, on-failure, or any)
-      --restart-delay value            Delay between restart attempts (default none)
-      --restart-max-attempts value     Maximum number of restarts before giving up (default none)
-      --restart-window value           Window used to evaluate the restart policy (default none)
-      --stop-grace-period value        Time to wait before force killing a container (default none)
-      --update-delay duration          Delay between updates
-      --update-failure-action string   Action on update failure (pause|continue) (default "pause")
-      --update-parallelism uint        Maximum number of tasks updated simultaneously (0 to update all at once) (default 1)
-  -u, --user string                    Username or UID (format: <name|uid>[:<group|gid>])
-      --with-registry-auth             Send registry authentication details to Swarm agents
-  -w, --workdir string                 Working directory inside the container
+      --args string                      Service command args
+      --constraint-add value             Add or update placement constraints (default [])
+      --constraint-rm value              Remove a constraint (default [])
+      --container-label-add value        Add or update container labels (default [])
+      --container-label-rm value         Remove a container label by its key (default [])
+      --endpoint-mode string             Endpoint mode (vip or dnsrr)
+      --env-add value                    Add or update environment variables (default [])
+      --env-rm value                     Remove an environment variable (default [])
+      --group-add value                  Add additional user groups to the container (default [])
+      --group-rm value                   Remove previously added user groups from the container (default [])
+      --help                             Print usage
+      --image string                     Service image tag
+      --label-add value                  Add or update service labels (default [])
+      --label-rm value                   Remove a label by its key (default [])
+      --limit-cpu value                  Limit CPUs (default 0.000)
+      --limit-memory value               Limit Memory (default 0 B)
+      --log-driver string                Logging driver for service
+      --log-opt value                    Logging driver options (default [])
+      --mount-add value                  Add or update a mount on a service
+      --mount-rm value                   Remove a mount by its target path (default [])
+      --name string                      Service name
+      --publish-add value                Add or update a published port (default [])
+      --publish-rm value                 Remove a published port by its target port (default [])
+      --replicas value                   Number of tasks (default none)
+      --reserve-cpu value                Reserve CPUs (default 0.000)
+      --reserve-memory value             Reserve Memory (default 0 B)
+      --restart-condition string         Restart when condition is met (none, on-failure, or any)
+      --restart-delay value              Delay between restart attempts (default none)
+      --restart-max-attempts value       Maximum number of restarts before giving up (default none)
+      --restart-window value             Window used to evaluate the restart policy (default none)
+      --rollback                         Rollback to previous specification
+      --stop-grace-period value          Time to wait before force killing a container (default none)
+      --update-delay duration            Delay between updates
+      --update-failure-action string     Action on update failure (pause|continue) (default "pause")
+      --update-max-failure-ratio value   Failure rate to tolerate during an update
+      --update-monitor duration          Duration after each task update to monitor for failure (default 0s)
+      --update-parallelism uint          Maximum number of tasks updated simultaneously (0 to update all at once) (default 1)
+  -u, --user string                      Username or UID (format: <name|uid>[:<group|gid>])
+      --with-registry-auth               Send registry authentication details to Swarm agents
+  -w, --workdir string                   Working directory inside the container
 ```
 ```
 
 
 Updates a service as described by the specified parameters. This command has to be run targeting a manager node.
 Updates a service as described by the specified parameters. This command has to be run targeting a manager node.

+ 11 - 4
integration-cli/daemon_swarm.go

@@ -139,8 +139,8 @@ func (d *SwarmDaemon) getServiceTasks(c *check.C, service string) []swarm.Task {
 	return tasks
 	return tasks
 }
 }
 
 
-func (d *SwarmDaemon) checkServiceRunningTasks(c *check.C, service string) func(*check.C) (interface{}, check.CommentInterface) {
-	return func(*check.C) (interface{}, check.CommentInterface) {
+func (d *SwarmDaemon) checkServiceRunningTasks(service string) func(*check.C) (interface{}, check.CommentInterface) {
+	return func(c *check.C) (interface{}, check.CommentInterface) {
 		tasks := d.getServiceTasks(c, service)
 		tasks := d.getServiceTasks(c, service)
 		var runningCount int
 		var runningCount int
 		for _, task := range tasks {
 		for _, task := range tasks {
@@ -152,8 +152,15 @@ func (d *SwarmDaemon) checkServiceRunningTasks(c *check.C, service string) func(
 	}
 	}
 }
 }
 
 
-func (d *SwarmDaemon) checkServiceTasks(c *check.C, service string) func(*check.C) (interface{}, check.CommentInterface) {
-	return func(*check.C) (interface{}, check.CommentInterface) {
+func (d *SwarmDaemon) checkServiceUpdateState(service string) func(*check.C) (interface{}, check.CommentInterface) {
+	return func(c *check.C) (interface{}, check.CommentInterface) {
+		service := d.getService(c, service)
+		return service.UpdateStatus.State, nil
+	}
+}
+
+func (d *SwarmDaemon) checkServiceTasks(service string) func(*check.C) (interface{}, check.CommentInterface) {
+	return func(c *check.C) (interface{}, check.CommentInterface) {
 		tasks := d.getServiceTasks(c, service)
 		tasks := d.getServiceTasks(c, service)
 		return len(tasks), nil
 		return len(tasks), nil
 	}
 	}

+ 83 - 8
integration-cli/docker_api_swarm_test.go

@@ -310,6 +310,63 @@ func (s *DockerSwarmSuite) TestAPISwarmServicesUpdate(c *check.C) {
 	// 3rd batch
 	// 3rd batch
 	waitAndAssert(c, defaultReconciliationTimeout, daemons[0].checkRunningTaskImages, checker.DeepEquals,
 	waitAndAssert(c, defaultReconciliationTimeout, daemons[0].checkRunningTaskImages, checker.DeepEquals,
 		map[string]int{image2: instances})
 		map[string]int{image2: instances})
+
+	// Roll back to the previous version. This uses the CLI because
+	// rollback is a client-side operation.
+	out, err := daemons[0].Cmd("service", "update", "--rollback", id)
+	c.Assert(err, checker.IsNil, check.Commentf(out))
+
+	// first batch
+	waitAndAssert(c, defaultReconciliationTimeout, daemons[0].checkRunningTaskImages, checker.DeepEquals,
+		map[string]int{image2: instances - parallelism, image1: parallelism})
+
+	// 2nd batch
+	waitAndAssert(c, defaultReconciliationTimeout, daemons[0].checkRunningTaskImages, checker.DeepEquals,
+		map[string]int{image2: instances - 2*parallelism, image1: 2 * parallelism})
+
+	// 3rd batch
+	waitAndAssert(c, defaultReconciliationTimeout, daemons[0].checkRunningTaskImages, checker.DeepEquals,
+		map[string]int{image1: instances})
+}
+
+func (s *DockerSwarmSuite) TestApiSwarmServicesFailedUpdate(c *check.C) {
+	const nodeCount = 3
+	var daemons [nodeCount]*SwarmDaemon
+	for i := 0; i < nodeCount; i++ {
+		daemons[i] = s.AddDaemon(c, true, i == 0)
+	}
+	// wait for nodes ready
+	waitAndAssert(c, 5*time.Second, daemons[0].checkNodeReadyCount, checker.Equals, nodeCount)
+
+	// service image at start
+	image1 := "busybox:latest"
+	// target image in update
+	image2 := "busybox:badtag"
+
+	// create service
+	instances := 5
+	id := daemons[0].createService(c, serviceForUpdate, setInstances(instances))
+
+	// wait for tasks ready
+	waitAndAssert(c, defaultReconciliationTimeout, daemons[0].checkRunningTaskImages, checker.DeepEquals,
+		map[string]int{image1: instances})
+
+	// issue service update
+	service := daemons[0].getService(c, id)
+	daemons[0].updateService(c, service, setImage(image2), setFailureAction(swarm.UpdateFailureActionPause), setMaxFailureRatio(0.25), setParallelism(1))
+
+	// should update 2 tasks and then pause
+	waitAndAssert(c, defaultReconciliationTimeout, daemons[0].checkServiceUpdateState(id), checker.Equals, swarm.UpdateStatePaused)
+	v, _ := daemons[0].checkServiceRunningTasks(id)(c)
+	c.Assert(v, checker.Equals, instances-2)
+
+	// Roll back to the previous version. This uses the CLI because
+	// rollback is a client-side operation.
+	out, err := daemons[0].Cmd("service", "update", "--rollback", id)
+	c.Assert(err, checker.IsNil, check.Commentf(out))
+
+	waitAndAssert(c, defaultReconciliationTimeout, daemons[0].checkRunningTaskImages, checker.DeepEquals,
+		map[string]int{image1: instances})
 }
 }
 
 
 func (s *DockerSwarmSuite) TestAPISwarmServiceConstraintRole(c *check.C) {
 func (s *DockerSwarmSuite) TestAPISwarmServiceConstraintRole(c *check.C) {
@@ -326,7 +383,7 @@ func (s *DockerSwarmSuite) TestAPISwarmServiceConstraintRole(c *check.C) {
 	instances := 3
 	instances := 3
 	id := daemons[0].createService(c, simpleTestService, setConstraints(constraints), setInstances(instances))
 	id := daemons[0].createService(c, simpleTestService, setConstraints(constraints), setInstances(instances))
 	// wait for tasks ready
 	// wait for tasks ready
-	waitAndAssert(c, defaultReconciliationTimeout, daemons[0].checkServiceRunningTasks(c, id), checker.Equals, instances)
+	waitAndAssert(c, defaultReconciliationTimeout, daemons[0].checkServiceRunningTasks(id), checker.Equals, instances)
 	// validate tasks are running on worker nodes
 	// validate tasks are running on worker nodes
 	tasks := daemons[0].getServiceTasks(c, id)
 	tasks := daemons[0].getServiceTasks(c, id)
 	for _, task := range tasks {
 	for _, task := range tasks {
@@ -340,7 +397,7 @@ func (s *DockerSwarmSuite) TestAPISwarmServiceConstraintRole(c *check.C) {
 	constraints = []string{"node.role!=worker"}
 	constraints = []string{"node.role!=worker"}
 	id = daemons[0].createService(c, simpleTestService, setConstraints(constraints), setInstances(instances))
 	id = daemons[0].createService(c, simpleTestService, setConstraints(constraints), setInstances(instances))
 	// wait for tasks ready
 	// wait for tasks ready
-	waitAndAssert(c, defaultReconciliationTimeout, daemons[0].checkServiceRunningTasks(c, id), checker.Equals, instances)
+	waitAndAssert(c, defaultReconciliationTimeout, daemons[0].checkServiceRunningTasks(id), checker.Equals, instances)
 	tasks = daemons[0].getServiceTasks(c, id)
 	tasks = daemons[0].getServiceTasks(c, id)
 	// validate tasks are running on manager nodes
 	// validate tasks are running on manager nodes
 	for _, task := range tasks {
 	for _, task := range tasks {
@@ -354,7 +411,7 @@ func (s *DockerSwarmSuite) TestAPISwarmServiceConstraintRole(c *check.C) {
 	constraints = []string{"node.role==nosuchrole"}
 	constraints = []string{"node.role==nosuchrole"}
 	id = daemons[0].createService(c, simpleTestService, setConstraints(constraints), setInstances(instances))
 	id = daemons[0].createService(c, simpleTestService, setConstraints(constraints), setInstances(instances))
 	// wait for tasks created
 	// wait for tasks created
-	waitAndAssert(c, defaultReconciliationTimeout, daemons[0].checkServiceTasks(c, id), checker.Equals, instances)
+	waitAndAssert(c, defaultReconciliationTimeout, daemons[0].checkServiceTasks(id), checker.Equals, instances)
 	// let scheduler try
 	// let scheduler try
 	time.Sleep(250 * time.Millisecond)
 	time.Sleep(250 * time.Millisecond)
 	// validate tasks are not assigned to any node
 	// validate tasks are not assigned to any node
@@ -394,7 +451,7 @@ func (s *DockerSwarmSuite) TestAPISwarmServiceConstraintLabel(c *check.C) {
 	constraints := []string{"node.labels.security==high"}
 	constraints := []string{"node.labels.security==high"}
 	id := daemons[0].createService(c, simpleTestService, setConstraints(constraints), setInstances(instances))
 	id := daemons[0].createService(c, simpleTestService, setConstraints(constraints), setInstances(instances))
 	// wait for tasks ready
 	// wait for tasks ready
-	waitAndAssert(c, defaultReconciliationTimeout, daemons[0].checkServiceRunningTasks(c, id), checker.Equals, instances)
+	waitAndAssert(c, defaultReconciliationTimeout, daemons[0].checkServiceRunningTasks(id), checker.Equals, instances)
 	tasks := daemons[0].getServiceTasks(c, id)
 	tasks := daemons[0].getServiceTasks(c, id)
 	// validate all tasks are running on nodes[0]
 	// validate all tasks are running on nodes[0]
 	for _, task := range tasks {
 	for _, task := range tasks {
@@ -407,7 +464,7 @@ func (s *DockerSwarmSuite) TestAPISwarmServiceConstraintLabel(c *check.C) {
 	constraints = []string{"node.labels.security!=high"}
 	constraints = []string{"node.labels.security!=high"}
 	id = daemons[0].createService(c, simpleTestService, setConstraints(constraints), setInstances(instances))
 	id = daemons[0].createService(c, simpleTestService, setConstraints(constraints), setInstances(instances))
 	// wait for tasks ready
 	// wait for tasks ready
-	waitAndAssert(c, defaultReconciliationTimeout, daemons[0].checkServiceRunningTasks(c, id), checker.Equals, instances)
+	waitAndAssert(c, defaultReconciliationTimeout, daemons[0].checkServiceRunningTasks(id), checker.Equals, instances)
 	tasks = daemons[0].getServiceTasks(c, id)
 	tasks = daemons[0].getServiceTasks(c, id)
 	// validate all tasks are NOT running on nodes[0]
 	// validate all tasks are NOT running on nodes[0]
 	for _, task := range tasks {
 	for _, task := range tasks {
@@ -419,7 +476,7 @@ func (s *DockerSwarmSuite) TestAPISwarmServiceConstraintLabel(c *check.C) {
 	constraints = []string{"node.labels.security==medium"}
 	constraints = []string{"node.labels.security==medium"}
 	id = daemons[0].createService(c, simpleTestService, setConstraints(constraints), setInstances(instances))
 	id = daemons[0].createService(c, simpleTestService, setConstraints(constraints), setInstances(instances))
 	// wait for tasks created
 	// wait for tasks created
-	waitAndAssert(c, defaultReconciliationTimeout, daemons[0].checkServiceTasks(c, id), checker.Equals, instances)
+	waitAndAssert(c, defaultReconciliationTimeout, daemons[0].checkServiceTasks(id), checker.Equals, instances)
 	// let scheduler try
 	// let scheduler try
 	time.Sleep(250 * time.Millisecond)
 	time.Sleep(250 * time.Millisecond)
 	tasks = daemons[0].getServiceTasks(c, id)
 	tasks = daemons[0].getServiceTasks(c, id)
@@ -437,7 +494,7 @@ func (s *DockerSwarmSuite) TestAPISwarmServiceConstraintLabel(c *check.C) {
 	}
 	}
 	id = daemons[0].createService(c, simpleTestService, setConstraints(constraints), setInstances(instances))
 	id = daemons[0].createService(c, simpleTestService, setConstraints(constraints), setInstances(instances))
 	// wait for tasks created
 	// wait for tasks created
-	waitAndAssert(c, defaultReconciliationTimeout, daemons[0].checkServiceTasks(c, id), checker.Equals, instances)
+	waitAndAssert(c, defaultReconciliationTimeout, daemons[0].checkServiceTasks(id), checker.Equals, instances)
 	// let scheduler try
 	// let scheduler try
 	time.Sleep(250 * time.Millisecond)
 	time.Sleep(250 * time.Millisecond)
 	tasks = daemons[0].getServiceTasks(c, id)
 	tasks = daemons[0].getServiceTasks(c, id)
@@ -452,7 +509,7 @@ func (s *DockerSwarmSuite) TestAPISwarmServiceConstraintLabel(c *check.C) {
 		}
 		}
 	})
 	})
 	// wait for tasks ready
 	// wait for tasks ready
-	waitAndAssert(c, defaultReconciliationTimeout, daemons[0].checkServiceRunningTasks(c, id), checker.Equals, instances)
+	waitAndAssert(c, defaultReconciliationTimeout, daemons[0].checkServiceRunningTasks(id), checker.Equals, instances)
 	tasks = daemons[0].getServiceTasks(c, id)
 	tasks = daemons[0].getServiceTasks(c, id)
 	for _, task := range tasks {
 	for _, task := range tasks {
 		c.Assert(task.NodeID, checker.Equals, nodes[1].ID)
 		c.Assert(task.NodeID, checker.Equals, nodes[1].ID)
@@ -1022,6 +1079,24 @@ func setImage(image string) serviceConstructor {
 	}
 	}
 }
 }
 
 
+func setFailureAction(failureAction string) serviceConstructor {
+	return func(s *swarm.Service) {
+		s.Spec.UpdateConfig.FailureAction = failureAction
+	}
+}
+
+func setMaxFailureRatio(maxFailureRatio float32) serviceConstructor {
+	return func(s *swarm.Service) {
+		s.Spec.UpdateConfig.MaxFailureRatio = maxFailureRatio
+	}
+}
+
+func setParallelism(parallelism uint64) serviceConstructor {
+	return func(s *swarm.Service) {
+		s.Spec.UpdateConfig.Parallelism = parallelism
+	}
+}
+
 func setConstraints(constraints []string) serviceConstructor {
 func setConstraints(constraints []string) serviceConstructor {
 	return func(s *swarm.Service) {
 	return func(s *swarm.Service) {
 		if s.Spec.TaskTemplate.Placement == nil {
 		if s.Spec.TaskTemplate.Placement == nil {

+ 1 - 1
integration-cli/docker_cli_swarm_test.go

@@ -349,7 +349,7 @@ func (s *DockerSwarmSuite) TestPsListContainersFilterIsTask(c *check.C) {
 	c.Assert(strings.TrimSpace(out), checker.Not(checker.Equals), "")
 	c.Assert(strings.TrimSpace(out), checker.Not(checker.Equals), "")
 
 
 	// make sure task has been deployed.
 	// make sure task has been deployed.
-	waitAndAssert(c, defaultReconciliationTimeout, d.checkServiceRunningTasks(c, name), checker.Equals, 1)
+	waitAndAssert(c, defaultReconciliationTimeout, d.checkServiceRunningTasks(name), checker.Equals, 1)
 
 
 	// Filter non-tasks
 	// Filter non-tasks
 	out, err = d.Cmd("ps", "-a", "-q", "--filter=is-task=false")
 	out, err = d.Cmd("ps", "-a", "-q", "--filter=is-task=false")