Переглянути джерело

Merge pull request #30261 from aaronlehmann/rollout-mode

swarm: Add update/rollback order
Brian Goff 8 років тому
батько
коміт
3de58eb2bc

+ 12 - 0
api/swagger.yaml

@@ -2296,6 +2296,12 @@ definitions:
             description: "The fraction of tasks that may fail during an update before the failure action is invoked, specified as a floating point number between 0 and 1."
             type: "number"
             default: 0
+          Order:
+            description: "The order of operations when rolling out an updated task. Either the old task is shut down before the new task is started, or the new task is started before the old task is shut down."
+            type: "string"
+            enum:
+              - "stop-first"
+              - "start-first"
       RollbackConfig:
         description: "Specification for the rollback strategy of the service."
         type: "object"
@@ -2322,6 +2328,12 @@ definitions:
             description: "The fraction of tasks that may fail during a rollback before the failure action is invoked, specified as a floating point number between 0 and 1."
             type: "number"
             default: 0
+          Order:
+            description: "The order of operations when rolling back a task. Either the old task is shut down before the new task is started, or the new task is started before the old task is shut down."
+            type: "string"
+            enum:
+              - "stop-first"
+              - "start-first"
       Networks:
         description: "Array of network names or IDs to attach the service to."
         type: "array"

+ 10 - 0
api/types/swarm/service.go

@@ -77,6 +77,11 @@ const (
 	UpdateFailureActionContinue = "continue"
 	// UpdateFailureActionRollback ROLLBACK
 	UpdateFailureActionRollback = "rollback"
+
+	// UpdateOrderStopFirst shuts down the old task before starting the new one (STOP_FIRST).
+	UpdateOrderStopFirst = "stop-first"
+	// UpdateOrderStartFirst starts the new task before shutting down the old one (START_FIRST).
+	UpdateOrderStartFirst = "start-first"
 )
 
 // UpdateConfig represents the update configuration.
@@ -111,4 +116,9 @@ type UpdateConfig struct {
 	// If the failure action is PAUSE, no more tasks will be updated until
 	// another update is started.
 	MaxFailureRatio float32
+
+	// Order indicates the order of operations when rolling out an updated
+	// task. Either the old task is shut down before the new task is
+	// started, or the new task is started before the old task is shut down.
+	Order string
 }

+ 10 - 0
cli/command/formatter/service.go

@@ -57,6 +57,7 @@ UpdateConfig:
  Monitoring Period: {{ .UpdateMonitor }}
 {{- end }}
  Max failure ratio: {{ .UpdateMaxFailureRatio }}
+ Update order:      {{ .UpdateOrder }}
 {{- end }}
 {{- if .HasRollbackConfig }}
 RollbackConfig:
@@ -69,6 +70,7 @@ RollbackConfig:
  Monitoring Period: {{ .RollbackMonitor }}
 {{- end }}
  Max failure ratio: {{ .RollbackMaxFailureRatio }}
+ Rollback order:    {{ .RollbackOrder }}
 {{- end }}
 ContainerSpec:
  Image:		{{ .ContainerImage }}
@@ -260,6 +262,10 @@ func (ctx *serviceInspectContext) UpdateOnFailure() string {
 	return ctx.Service.Spec.UpdateConfig.FailureAction
 }
 
+func (ctx *serviceInspectContext) UpdateOrder() string {
+	return ctx.Service.Spec.UpdateConfig.Order
+}
+
 func (ctx *serviceInspectContext) HasUpdateMonitor() bool {
 	return ctx.Service.Spec.UpdateConfig.Monitor.Nanoseconds() > 0
 }
@@ -304,6 +310,10 @@ func (ctx *serviceInspectContext) RollbackMaxFailureRatio() float32 {
 	return ctx.Service.Spec.RollbackConfig.MaxFailureRatio
 }
 
+func (ctx *serviceInspectContext) RollbackOrder() string {
+	return ctx.Service.Spec.RollbackConfig.Order
+}
+
 func (ctx *serviceInspectContext) ContainerImage() string {
 	return ctx.Service.Spec.TaskTemplate.ContainerSpec.Image
 }

+ 8 - 0
cli/command/service/opts.go

@@ -188,6 +188,7 @@ type updateOptions struct {
 	monitor         time.Duration
 	onFailure       string
 	maxFailureRatio floatValue
+	order           string
 }
 
 func (opts updateOptions) config() *swarm.UpdateConfig {
@@ -197,6 +198,7 @@ func (opts updateOptions) config() *swarm.UpdateConfig {
 		Monitor:         opts.monitor,
 		FailureAction:   opts.onFailure,
 		MaxFailureRatio: opts.maxFailureRatio.Value(),
+		Order:           opts.order,
 	}
 }
 
@@ -533,6 +535,8 @@ func addServiceFlags(flags *pflag.FlagSet, opts *serviceOptions) {
 	flags.StringVar(&opts.update.onFailure, flagUpdateFailureAction, "pause", `Action on update failure ("pause"|"continue"|"rollback")`)
 	flags.Var(&opts.update.maxFailureRatio, flagUpdateMaxFailureRatio, "Failure rate to tolerate during an update")
 	flags.SetAnnotation(flagUpdateMaxFailureRatio, "version", []string{"1.25"})
+	flags.StringVar(&opts.update.order, flagUpdateOrder, "stop-first", `Update order ("start-first"|"stop-first")`)
+	flags.SetAnnotation(flagUpdateOrder, "version", []string{"1.29"})
 
 	flags.Uint64Var(&opts.rollback.parallelism, flagRollbackParallelism, 1, "Maximum number of tasks rolled back simultaneously (0 to roll back all at once)")
 	flags.SetAnnotation(flagRollbackParallelism, "version", []string{"1.28"})
@@ -544,6 +548,8 @@ func addServiceFlags(flags *pflag.FlagSet, opts *serviceOptions) {
 	flags.SetAnnotation(flagRollbackFailureAction, "version", []string{"1.28"})
 	flags.Var(&opts.rollback.maxFailureRatio, flagRollbackMaxFailureRatio, "Failure rate to tolerate during a rollback")
 	flags.SetAnnotation(flagRollbackMaxFailureRatio, "version", []string{"1.28"})
+	flags.StringVar(&opts.rollback.order, flagRollbackOrder, "stop-first", `Rollback order ("start-first"|"stop-first")`)
+	flags.SetAnnotation(flagRollbackOrder, "version", []string{"1.29"})
 
 	flags.StringVar(&opts.endpoint.mode, flagEndpointMode, "vip", "Endpoint mode (vip or dnsrr)")
 
@@ -633,6 +639,7 @@ const (
 	flagRollbackFailureAction   = "rollback-failure-action"
 	flagRollbackMaxFailureRatio = "rollback-max-failure-ratio"
 	flagRollbackMonitor         = "rollback-monitor"
+	flagRollbackOrder           = "rollback-order"
 	flagRollbackParallelism     = "rollback-parallelism"
 	flagStopGracePeriod         = "stop-grace-period"
 	flagStopSignal              = "stop-signal"
@@ -641,6 +648,7 @@ const (
 	flagUpdateFailureAction     = "update-failure-action"
 	flagUpdateMaxFailureRatio   = "update-max-failure-ratio"
 	flagUpdateMonitor           = "update-monitor"
+	flagUpdateOrder             = "update-order"
 	flagUpdateParallelism       = "update-parallelism"
 	flagUser                    = "user"
 	flagWorkdir                 = "workdir"

+ 4 - 2
cli/command/service/update.go

@@ -320,7 +320,7 @@ func updateService(flags *pflag.FlagSet, spec *swarm.ServiceSpec) error {
 		return err
 	}
 
-	if anyChanged(flags, flagUpdateParallelism, flagUpdateDelay, flagUpdateMonitor, flagUpdateFailureAction, flagUpdateMaxFailureRatio) {
+	if anyChanged(flags, flagUpdateParallelism, flagUpdateDelay, flagUpdateMonitor, flagUpdateFailureAction, flagUpdateMaxFailureRatio, flagUpdateOrder) {
 		if spec.UpdateConfig == nil {
 			spec.UpdateConfig = &swarm.UpdateConfig{}
 		}
@@ -329,9 +329,10 @@ func updateService(flags *pflag.FlagSet, spec *swarm.ServiceSpec) error {
 		updateDuration(flagUpdateMonitor, &spec.UpdateConfig.Monitor)
 		updateString(flagUpdateFailureAction, &spec.UpdateConfig.FailureAction)
 		updateFloatValue(flagUpdateMaxFailureRatio, &spec.UpdateConfig.MaxFailureRatio)
+		updateString(flagUpdateOrder, &spec.UpdateConfig.Order)
 	}
 
-	if anyChanged(flags, flagRollbackParallelism, flagRollbackDelay, flagRollbackMonitor, flagRollbackFailureAction, flagRollbackMaxFailureRatio) {
+	if anyChanged(flags, flagRollbackParallelism, flagRollbackDelay, flagRollbackMonitor, flagRollbackFailureAction, flagRollbackMaxFailureRatio, flagRollbackOrder) {
 		if spec.RollbackConfig == nil {
 			spec.RollbackConfig = &swarm.UpdateConfig{}
 		}
@@ -340,6 +341,7 @@ func updateService(flags *pflag.FlagSet, spec *swarm.ServiceSpec) error {
 		updateDuration(flagRollbackMonitor, &spec.RollbackConfig.Monitor)
 		updateString(flagRollbackFailureAction, &spec.RollbackConfig.FailureAction)
 		updateFloatValue(flagRollbackMaxFailureRatio, &spec.RollbackConfig.MaxFailureRatio)
+		updateString(flagRollbackOrder, &spec.RollbackConfig.Order)
 	}
 
 	if flags.Changed(flagEndpointMode) {

+ 17 - 1
daemon/cluster/convert/service.go

@@ -393,6 +393,13 @@ func updateConfigFromGRPC(updateConfig *swarmapi.UpdateConfig) *types.UpdateConf
 		converted.FailureAction = types.UpdateFailureActionRollback
 	}
 
+	switch updateConfig.Order {
+	case swarmapi.UpdateConfig_STOP_FIRST:
+		converted.Order = types.UpdateOrderStopFirst
+	case swarmapi.UpdateConfig_START_FIRST:
+		converted.Order = types.UpdateOrderStartFirst
+	}
+
 	return converted
 }
 
@@ -415,12 +422,21 @@ func updateConfigToGRPC(updateConfig *types.UpdateConfig) (*swarmapi.UpdateConfi
 	case types.UpdateFailureActionRollback:
 		converted.FailureAction = swarmapi.UpdateConfig_ROLLBACK
 	default:
-		return nil, fmt.Errorf("unrecongized update failure action %s", updateConfig.FailureAction)
+		return nil, fmt.Errorf("unrecognized update failure action %s", updateConfig.FailureAction)
 	}
 	if updateConfig.Monitor != 0 {
 		converted.Monitor = gogotypes.DurationProto(updateConfig.Monitor)
 	}
 
+	switch updateConfig.Order {
+	case types.UpdateOrderStopFirst, "":
+		converted.Order = swarmapi.UpdateConfig_STOP_FIRST
+	case types.UpdateOrderStartFirst:
+		converted.Order = swarmapi.UpdateConfig_START_FIRST
+	default:
+		return nil, fmt.Errorf("unrecognized update order %s", updateConfig.Order)
+	}
+
 	return converted, nil
 }
 

+ 2 - 0
docs/reference/commandline/service_create.md

@@ -65,6 +65,7 @@ Options:
       --rollback-max-failure-ratio float   Failure rate to tolerate during a rollback
       --rollback-monitor duration          Duration after each task rollback to monitor for failure
                                            (ns|us|ms|s|m|h) (default 0s)
+      --rollback-order string              Rollback order ("start-first"|"stop-first") (default "stop-first")
       --rollback-parallelism uint          Maximum number of tasks rolled back simultaneously (0 to roll
                                            back all at once) (default 1)
       --secret secret                      Specify secrets to expose to the service
@@ -75,6 +76,7 @@ Options:
       --update-failure-action string       Action on update failure ("pause"|"continue"|"rollback") (default "pause")
       --update-max-failure-ratio float     Failure rate to tolerate during an update
       --update-monitor duration            Duration after each task update to monitor for failure (ns|us|ms|s|m|h)
+      --update-order string                Update order ("start-first"|"stop-first") (default "stop-first")
       --update-parallelism uint            Maximum number of tasks updated simultaneously (0 to update all at once) (default 1)
   -u, --user string                        Username or UID (format: <name|uid>[:<group|gid>])
       --with-registry-auth                 Send registry authentication details to swarm agents

+ 2 - 0
docs/reference/commandline/service_update.md

@@ -77,6 +77,7 @@ Options:
       --rollback-max-failure-ratio float   Failure rate to tolerate during a rollback
       --rollback-monitor duration          Duration after each task rollback to monitor for failure
                                            (ns|us|ms|s|m|h) (default 0s)
+      --rollback-order string              Rollback order ("start-first"|"stop-first") (default "stop-first")
       --rollback-parallelism uint          Maximum number of tasks rolled back simultaneously (0 to roll
                                            back all at once) (default 1)
       --secret-add secret                  Add or update a secret on a service
@@ -88,6 +89,7 @@ Options:
       --update-failure-action string       Action on update failure ("pause"|"continue"|"rollback") (default "pause")
       --update-max-failure-ratio float     Failure rate to tolerate during an update
       --update-monitor duration            Duration after each task update to monitor for failure (ns|us|ms|s|m|h) 
+      --update-order string                Update order ("start-first"|"stop-first") (default "stop-first")
       --update-parallelism uint            Maximum number of tasks updated simultaneously (0 to update all at once) (default 1)
   -u, --user string                        Username or UID (format: <name|uid>[:<group|gid>])
       --with-registry-auth                 Send registry authentication details to swarm agents

+ 109 - 0
integration-cli/docker_api_swarm_service_test.go

@@ -175,6 +175,115 @@ func (s *DockerSwarmSuite) TestAPISwarmServicesUpdate(c *check.C) {
 		map[string]int{image1: instances})
 }
 
+func (s *DockerSwarmSuite) TestAPISwarmServicesUpdateStartFirst(c *check.C) {
+	d := s.AddDaemon(c, true, true)
+
+	// service image at start
+	image1 := "busybox:latest"
+	// target image in update
+	image2 := "testhealth"
+
+	// service started from this image won't pass health check
+	_, _, err := d.BuildImageWithOut(image2,
+		`FROM busybox
+		HEALTHCHECK --interval=1s --timeout=1s --retries=1024\
+		  CMD cat /status`,
+		true)
+	c.Check(err, check.IsNil)
+
+	// create service
+	instances := 5
+	parallelism := 2
+	rollbackParallelism := 3
+	id := d.CreateService(c, serviceForUpdate, setInstances(instances), setUpdateOrder(swarm.UpdateOrderStartFirst), setRollbackOrder(swarm.UpdateOrderStartFirst))
+
+	checkStartingTasks := func(expected int) []swarm.Task {
+		var startingTasks []swarm.Task
+		waitAndAssert(c, defaultReconciliationTimeout, func(c *check.C) (interface{}, check.CommentInterface) {
+			tasks := d.GetServiceTasks(c, id)
+			startingTasks = nil
+			for _, t := range tasks {
+				if t.Status.State == swarm.TaskStateStarting {
+					startingTasks = append(startingTasks, t)
+				}
+			}
+			return startingTasks, nil
+		}, checker.HasLen, expected)
+
+		return startingTasks
+	}
+
+	makeTasksHealthy := func(tasks []swarm.Task) {
+		for _, t := range tasks {
+			containerID := t.Status.ContainerStatus.ContainerID
+			d.Cmd("exec", containerID, "touch", "/status")
+		}
+	}
+
+	// wait for tasks ready
+	waitAndAssert(c, defaultReconciliationTimeout, d.CheckRunningTaskImages, checker.DeepEquals,
+		map[string]int{image1: instances})
+
+	// issue service update
+	service := d.GetService(c, id)
+	d.UpdateService(c, service, setImage(image2))
+
+	// first batch
+
+	// The old tasks should be running, and the new ones should be starting.
+	startingTasks := checkStartingTasks(parallelism)
+
+	waitAndAssert(c, defaultReconciliationTimeout, d.CheckRunningTaskImages, checker.DeepEquals,
+		map[string]int{image1: instances})
+
+	// make it healthy
+	makeTasksHealthy(startingTasks)
+
+	waitAndAssert(c, defaultReconciliationTimeout, d.CheckRunningTaskImages, checker.DeepEquals,
+		map[string]int{image1: instances - parallelism, image2: parallelism})
+
+	// 2nd batch
+
+	// The old tasks should be running, and the new ones should be starting.
+	startingTasks = checkStartingTasks(parallelism)
+
+	waitAndAssert(c, defaultReconciliationTimeout, d.CheckRunningTaskImages, checker.DeepEquals,
+		map[string]int{image1: instances - parallelism, image2: parallelism})
+
+	// make it healthy
+	makeTasksHealthy(startingTasks)
+
+	waitAndAssert(c, defaultReconciliationTimeout, d.CheckRunningTaskImages, checker.DeepEquals,
+		map[string]int{image1: instances - 2*parallelism, image2: 2 * parallelism})
+
+	// 3rd batch
+
+	// The old tasks should be running, and the new ones should be starting.
+	startingTasks = checkStartingTasks(1)
+
+	waitAndAssert(c, defaultReconciliationTimeout, d.CheckRunningTaskImages, checker.DeepEquals,
+		map[string]int{image1: instances - 2*parallelism, image2: 2 * parallelism})
+
+	// make it healthy
+	makeTasksHealthy(startingTasks)
+
+	waitAndAssert(c, defaultReconciliationTimeout, d.CheckRunningTaskImages, checker.DeepEquals,
+		map[string]int{image2: instances})
+
+	// Roll back to the previous version. This uses the CLI because
+	// rollback is a client-side operation.
+	out, err := d.Cmd("service", "update", "--rollback", id)
+	c.Assert(err, checker.IsNil, check.Commentf(out))
+
+	// first batch
+	waitAndAssert(c, defaultReconciliationTimeout, d.CheckRunningTaskImages, checker.DeepEquals,
+		map[string]int{image2: instances - rollbackParallelism, image1: rollbackParallelism})
+
+	// 2nd batch
+	waitAndAssert(c, defaultReconciliationTimeout, d.CheckRunningTaskImages, checker.DeepEquals,
+		map[string]int{image1: instances})
+}
+
 func (s *DockerSwarmSuite) TestAPISwarmServicesFailedUpdate(c *check.C) {
 	const nodeCount = 3
 	var daemons [nodeCount]*daemon.Swarm

+ 18 - 0
integration-cli/docker_api_swarm_test.go

@@ -596,6 +596,24 @@ func setInstances(replicas int) daemon.ServiceConstructor {
 	}
 }
 
+func setUpdateOrder(order string) daemon.ServiceConstructor {
+	return func(s *swarm.Service) {
+		if s.Spec.UpdateConfig == nil {
+			s.Spec.UpdateConfig = &swarm.UpdateConfig{}
+		}
+		s.Spec.UpdateConfig.Order = order
+	}
+}
+
+func setRollbackOrder(order string) daemon.ServiceConstructor {
+	return func(s *swarm.Service) {
+		if s.Spec.RollbackConfig == nil {
+			s.Spec.RollbackConfig = &swarm.UpdateConfig{}
+		}
+		s.Spec.RollbackConfig.Order = order
+	}
+}
+
 func setImage(image string) daemon.ServiceConstructor {
 	return func(s *swarm.Service) {
 		s.Spec.TaskTemplate.ContainerSpec.Image = image