Add failure action for rolling updates

This changes the default behavior so that rolling updates will not
proceed once an updated task fails to start, or stops running during the
update. Users can use docker service inspect --pretty servicename to see
the update status, and if it pauses due to a failure, it will explain
that the update is paused, and show the task ID that caused it to pause.
It also shows the time since the update started.

A new --update-on-failure=(pause|continue) flag selects the
behavior. Pause means the update stops once a task fails, continue means
the old behavior of continuing the update anyway.

In the future this will be extended with additional behaviors like
automatic rollback, and flags controlling parameters like how many tasks
need to fail for the update to stop proceeding. This is a minimal
solution for 1.12.

Signed-off-by: Aaron Lehmann <aaron.lehmann@docker.com>
This commit is contained in:
Aaron Lehmann 2016-07-22 11:35:51 -07:00
parent 3585026c3a
commit 57ae29aa74
11 changed files with 171 additions and 108 deletions

View file

@ -4,6 +4,7 @@ import (
"fmt"
"io"
"strings"
"time"
"golang.org/x/net/context"
@ -101,6 +102,17 @@ func printService(out io.Writer, service swarm.Service) {
fmt.Fprintf(out, " Replicas:\t%d\n", *service.Spec.Mode.Replicated.Replicas)
}
}
if service.UpdateStatus.State != "" {
fmt.Fprintln(out, "Update status:")
fmt.Fprintf(out, " State:\t\t%s\n", service.UpdateStatus.State)
fmt.Fprintf(out, " Started:\t%s ago\n", strings.ToLower(units.HumanDuration(time.Since(service.UpdateStatus.StartedAt))))
if service.UpdateStatus.State == swarm.UpdateStateCompleted {
fmt.Fprintf(out, " Completed:\t%s ago\n", strings.ToLower(units.HumanDuration(time.Since(service.UpdateStatus.CompletedAt))))
}
fmt.Fprintf(out, " Message:\t%s\n", service.UpdateStatus.Message)
}
fmt.Fprintln(out, "Placement:")
if service.Spec.TaskTemplate.Placement != nil && len(service.Spec.TaskTemplate.Placement.Constraints) > 0 {
ioutils.FprintfIfNotEmpty(out, " Constraints\t: %s\n", strings.Join(service.Spec.TaskTemplate.Placement.Constraints, ", "))
@ -110,6 +122,7 @@ func printService(out io.Writer, service swarm.Service) {
if service.Spec.UpdateConfig.Delay.Nanoseconds() > 0 {
fmt.Fprintf(out, " Delay:\t\t%s\n", service.Spec.UpdateConfig.Delay)
}
fmt.Fprintf(out, " On failure:\t%s\n", service.Spec.UpdateConfig.FailureAction)
fmt.Fprintf(out, "ContainerSpec:\n")
printContainerSpec(out, service.Spec.TaskTemplate.ContainerSpec)

View file

@ -274,6 +274,7 @@ func (m *MountOpt) Value() []swarm.Mount {
type updateOptions struct {
parallelism uint64
delay time.Duration
onFailure string
}
type resourceOptions struct {
@ -457,6 +458,7 @@ func (opts *serviceOptions) ToService() (swarm.ServiceSpec, error) {
UpdateConfig: &swarm.UpdateConfig{
Parallelism: opts.update.parallelism,
Delay: opts.update.delay,
FailureAction: opts.update.onFailure,
},
Networks: convertNetworks(opts.networks),
EndpointSpec: opts.endpoint.ToEndpointSpec(),
@ -503,6 +505,7 @@ func addServiceFlags(cmd *cobra.Command, opts *serviceOptions) {
flags.Uint64Var(&opts.update.parallelism, flagUpdateParallelism, 1, "Maximum number of tasks updated simultaneously (0 to update all at once)")
flags.DurationVar(&opts.update.delay, flagUpdateDelay, time.Duration(0), "Delay between updates")
flags.StringVar(&opts.update.onFailure, flagUpdateFailureAction, "pause", "Action on update failure (pause|continue)")
flags.StringVar(&opts.endpoint.mode, flagEndpointMode, "", "Endpoint mode (vip or dnsrr)")
@ -545,6 +548,7 @@ const (
flagRestartWindow = "restart-window"
flagStopGracePeriod = "stop-grace-period"
flagUpdateDelay = "update-delay"
flagUpdateFailureAction = "update-failure-action"
flagUpdateParallelism = "update-parallelism"
flagUser = "user"
flagRegistryAuth = "with-registry-auth"

View file

@ -191,12 +191,13 @@ func updateService(flags *pflag.FlagSet, spec *swarm.ServiceSpec) error {
return err
}
if anyChanged(flags, flagUpdateParallelism, flagUpdateDelay) {
if anyChanged(flags, flagUpdateParallelism, flagUpdateDelay, flagUpdateFailureAction) {
if spec.UpdateConfig == nil {
spec.UpdateConfig = &swarm.UpdateConfig{}
}
updateUint64(flagUpdateParallelism, &spec.UpdateConfig.Parallelism)
updateDuration(flagUpdateDelay, &spec.UpdateConfig.Delay)
updateString(flagUpdateFailureAction, &spec.UpdateConfig.FailureAction)
}
updateNetworks(flags, &spec.Networks)

View file

@ -1726,6 +1726,7 @@ _docker_service_update() {
--restart-window
--stop-grace-period
--update-delay
--update-failure-action
--update-parallelism
--user -u
--workdir -w

View file

@ -1094,6 +1094,7 @@ __docker_service_subcommand() {
"($help)--restart-window=[Window used to evaluate the restart policy]:window: "
"($help)--stop-grace-period=[Time to wait before force killing a container]:grace period: "
"($help)--update-delay=[Delay between updates]:delay: "
"($help)--update-failure-action=[Action on update failure]:mode:(pause continue)"
"($help)--update-parallelism=[Maximum number of tasks updated simultaneously]:number: "
"($help -u --user)"{-u=,--user=}"[Username or UID]:user:_users"
"($help)--with-registry-auth[Send registry authentication details to swarm agents]"

View file

@ -53,9 +53,16 @@ func ServiceFromGRPC(s swarmapi.Service) types.Service {
}
service.Spec.UpdateConfig.Delay, _ = ptypes.Duration(&s.Spec.Update.Delay)
switch s.Spec.Update.FailureAction {
case swarmapi.UpdateConfig_PAUSE:
service.Spec.UpdateConfig.FailureAction = types.UpdateFailureActionPause
case swarmapi.UpdateConfig_CONTINUE:
service.Spec.UpdateConfig.FailureAction = types.UpdateFailureActionContinue
}
}
//Mode
// Mode
switch t := s.Spec.GetMode().(type) {
case *swarmapi.ServiceSpec_Global:
service.Spec.Mode.Global = &types.GlobalService{}
@ -65,6 +72,23 @@ func ServiceFromGRPC(s swarmapi.Service) types.Service {
}
}
// UpdateStatus
service.UpdateStatus = types.UpdateStatus{}
if s.UpdateStatus != nil {
switch s.UpdateStatus.State {
case swarmapi.UpdateStatus_UPDATING:
service.UpdateStatus.State = types.UpdateStateUpdating
case swarmapi.UpdateStatus_PAUSED:
service.UpdateStatus.State = types.UpdateStatePaused
case swarmapi.UpdateStatus_COMPLETED:
service.UpdateStatus.State = types.UpdateStateCompleted
}
service.UpdateStatus.StartedAt, _ = ptypes.Timestamp(s.UpdateStatus.StartedAt)
service.UpdateStatus.CompletedAt, _ = ptypes.Timestamp(s.UpdateStatus.CompletedAt)
service.UpdateStatus.Message = s.UpdateStatus.Message
}
return service
}
@ -111,9 +135,19 @@ func ServiceSpecToGRPC(s types.ServiceSpec) (swarmapi.ServiceSpec, error) {
}
if s.UpdateConfig != nil {
var failureAction swarmapi.UpdateConfig_FailureAction
switch s.UpdateConfig.FailureAction {
case types.UpdateFailureActionPause, "":
failureAction = swarmapi.UpdateConfig_PAUSE
case types.UpdateFailureActionContinue:
failureAction = swarmapi.UpdateConfig_CONTINUE
default:
return swarmapi.ServiceSpec{}, fmt.Errorf("unrecongized update failure action %s", s.UpdateConfig.FailureAction)
}
spec.Update = &swarmapi.UpdateConfig{
Parallelism: s.UpdateConfig.Parallelism,
Delay: *ptypes.DurationProto(s.UpdateConfig.Delay),
FailureAction: failureAction,
}
}

View file

@ -3964,7 +3964,8 @@ Create a service
},
"UpdateConfig": {
"Delay": 30000000000.0,
"Parallelism": 2
"Parallelism": 2,
"FailureAction": "pause"
},
"EndpointSpec": {
"Ports": [
@ -4054,6 +4055,8 @@ JSON Parameters:
- **Parallelism** Maximum number of tasks to be updated in one iteration (0 means unlimited
parallelism).
- **Delay** Amount of time between updates.
- **FailureAction** - Action to take if an updated task fails to run, or stops running during the
update. Values are `continue` and `pause`.
- **Networks** Array of network names or IDs to attach the service to.
- **Endpoint** Properties that can be configured to access and load balance a service.
- **Spec**

View file

@ -3965,7 +3965,8 @@ Create a service
},
"UpdateConfig": {
"Delay": 30000000000.0,
"Parallelism": 2
"Parallelism": 2,
"FailureAction": "pause"
},
"EndpointSpec": {
"Ports": [
@ -4055,6 +4056,8 @@ JSON Parameters:
- **Parallelism** Maximum number of tasks to be updated in one iteration (0 means unlimited
parallelism).
- **Delay** Amount of time between updates.
- **FailureAction** - Action to take if an updated task fails to run, or stops running during the
update. Values are `continue` and `pause`.
- **Networks** Array of network names or IDs to attach the service to.
- **Endpoint** Properties that can be configured to access and load balance a service.
- **Spec**

View file

@ -40,6 +40,7 @@ Options:
--restart-window value Window used to evaluate the restart policy (default none)
--stop-grace-period value Time to wait before force killing a container (default none)
--update-delay duration Delay between updates
--update-failure-action string Action on update failure (pause|continue) (default "pause")
--update-parallelism uint Maximum number of tasks updated simultaneously (0 to update all at once) (default 1)
-u, --user string Username or UID
--with-registry-auth Send registry authentication details to Swarm agents

View file

@ -47,6 +47,7 @@ Options:
--restart-window value Window used to evaluate the restart policy (default none)
--stop-grace-period value Time to wait before force killing a container (default none)
--update-delay duration Delay between updates
--update-failure-action string Action on update failure (pause|continue) (default "pause")
--update-parallelism uint Maximum number of tasks updated simultaneously (0 to update all at once) (default 1)
-u, --user string Username or UID
--with-registry-auth Send registry authentication details to Swarm agents

View file

@ -793,6 +793,7 @@ func serviceForUpdate(s *swarm.Service) {
UpdateConfig: &swarm.UpdateConfig{
Parallelism: 2,
Delay: 8 * time.Second,
FailureAction: swarm.UpdateFailureActionContinue,
},
}
s.Spec.Name = "updatetest"