Retry creating dynamic networks if not found
When a task fails to start, swarmkit may try to restart the task on the
same node, where it may keep failing. This creates a race: while a failed
task is being removed, it may remove the associated network even though
another task (for the same service, or for a different service connected
to the same network) is proceeding to start its container on the
assumption that the network is still present. Fix this by reacting to an
`ErrNoSuchNetwork` error during container start by trying to recreate the
managed networks. If they have been removed, they will be recreated. If
they are already present, nothing bad will happen.
Signed-off-by: Jana Radhakrishnan <mrjana@docker.com>
(cherry picked from commit 117cef5e97)
Signed-off-by: Tibor Vass <tibor@docker.com>
This commit is contained in:
parent
2f6ca79080
commit
769c25c416
1 changed files with 18 additions and 2 deletions
|
@ -6,6 +6,7 @@ import (
|
|||
executorpkg "github.com/docker/docker/daemon/cluster/executor"
|
||||
"github.com/docker/engine-api/types"
|
||||
"github.com/docker/engine-api/types/events"
|
||||
"github.com/docker/libnetwork"
|
||||
"github.com/docker/swarmkit/agent/exec"
|
||||
"github.com/docker/swarmkit/api"
|
||||
"github.com/docker/swarmkit/log"
|
||||
|
@ -160,8 +161,23 @@ func (r *controller) Start(ctx context.Context) error {
|
|||
return exec.ErrTaskStarted
|
||||
}
|
||||
|
||||
if err := r.adapter.start(ctx); err != nil {
|
||||
return errors.Wrap(err, "starting container failed")
|
||||
for {
|
||||
if err := r.adapter.start(ctx); err != nil {
|
||||
if _, ok := err.(libnetwork.ErrNoSuchNetwork); ok {
|
||||
// Retry network creation again if we
|
||||
// failed because some of the networks
|
||||
// were not found.
|
||||
if err := r.adapter.createNetworks(ctx); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
continue
|
||||
}
|
||||
|
||||
return errors.Wrap(err, "starting container failed")
|
||||
}
|
||||
|
||||
break
|
||||
}
|
||||
|
||||
// no health check
|
||||
|
|
Loading…
Reference in a new issue