daemon: rename: don't reload endpoint from datastore

Commit 8b7af1d0f added some code to update the DNSNames of all endpoints attached to a sandbox by loading a new instance of each affected endpoints from the datastore through a call to `Network.EndpointByID()`. This method then calls `Network.getEndpointFromStore()`, that in turn calls `store.GetObject()`, which then calls `cache.get()`, which calls `o.CopyTo(kvObject)`. This effectively creates a fresh new instance of an Endpoint. However, endpoints are already kept in memory by Sandbox, meaning we now have two in-memory instances of the same Endpoint. As it turns out, libnetwork is built around the idea that no two objects representing the same thing should leave in-memory, otherwise breaking mutex locking and optimistic locking (as both instances will have a drifting version tracking ID -- dbIndex in libnetwork parliance). In this specific case, this bug materializes by container rename failing when applied a second time for a given container. An integration test is added to make sure this won't happen again. Signed-off-by: Albin Kerouanton <albinker@gmail.com>
2024-01-23 20:15:45 +01:00 · 2024-01-23 20:15:45 +01:00 · 80c44b4b2e
commit 80c44b4b2e
parent f19f233ca5
7 changed files with 34 additions and 11 deletions
--- a/daemon/rename.go
+++ b/daemon/rename.go
@ -2,6 +2,7 @@ package daemon // import "github.com/docker/docker/daemon"

 import (
 	"context"
+	"fmt"
 	"strings"

 	"github.com/containerd/log"
@ -127,9 +128,9 @@ func (daemon *Daemon) ContainerRename(oldName, newName string) (retErr error) {
 				return err
 			}

-			ep, err := nw.EndpointByID(epConfig.EndpointID)
-			if err != nil {
-				return err
+			ep := sb.GetEndpoint(epConfig.EndpointID)
+			if ep == nil {
+				return fmt.Errorf("no endpoint attached to network %s found", nw.Name())
 			}

 			oldDNSNames := make([]string, len(epConfig.DNSNames))
--- a/integration/container/rename_test.go
+++ b/integration/container/rename_test.go
@ -192,3 +192,25 @@ func TestRenameContainerWithLinkedContainer(t *testing.T) {
 	assert.NilError(t, err)
 	assert.Check(t, is.Equal(db1ID, inspect.ID))
 }
+
+// Regression test for https://github.com/moby/moby/issues/47186
+func TestRenameContainerTwice(t *testing.T) {
+	ctx := setupTest(t)
+	apiClient := testEnv.APIClient()
+
+	ctrName := "c0"
+	container.Run(ctx, t, apiClient, container.WithName("c0"))
+	defer func() {
+		container.Remove(ctx, t, apiClient, ctrName, containertypes.RemoveOptions{
+			Force: true,
+		})
+	}()
+
+	err := apiClient.ContainerRename(ctx, "c0", "c1")
+	assert.NilError(t, err)
+	ctrName = "c1"
+
+	err = apiClient.ContainerRename(ctx, "c1", "c2")
+	assert.NilError(t, err)
+	ctrName = "c2"
+}
--- a/libnetwork/agent.go
+++ b/libnetwork/agent.go
@ -623,7 +623,7 @@ func (ep *Endpoint) addServiceInfoToCluster(sb *Sandbox) error {
 	// In case the deleteServiceInfoToCluster arrives first, this one is happening after the endpoint is
 	// removed from the list, in this situation the delete will bail out not finding any data to cleanup
 	// and the add will bail out not finding the endpoint on the sandbox.
-	if err := sb.getEndpoint(ep.ID()); err == nil {
+	if err := sb.GetEndpoint(ep.ID()); err == nil {
 		log.G(context.TODO()).Warnf("addServiceInfoToCluster suppressing service resolution ep is not anymore in the sandbox %s", ep.ID())
 		return nil
 	}
@ -692,7 +692,7 @@ func (ep *Endpoint) deleteServiceInfoFromCluster(sb *Sandbox, fullRemove bool, m
 	// get caught in disableServceInNetworkDB, but we check here to make the
 	// nature of the condition more clear.
 	// See comment in addServiceInfoToCluster()
-	if err := sb.getEndpoint(ep.ID()); err == nil {
+	if err := sb.GetEndpoint(ep.ID()); err == nil {
 		log.G(context.TODO()).Warnf("deleteServiceInfoFromCluster suppressing service resolution ep is not anymore in the sandbox %s", ep.ID())
 		return nil
 	}
--- a/libnetwork/endpoint_info.go
+++ b/libnetwork/endpoint_info.go
@ -194,7 +194,7 @@ func (ep *Endpoint) Info() EndpointInfo {
 		return ep
 	}

-	return sb.getEndpoint(ep.ID())
+	return sb.GetEndpoint(ep.ID())
 }

 // Iface returns information about the interface which was assigned to
--- a/libnetwork/network.go
+++ b/libnetwork/network.go
@ -1276,8 +1276,8 @@ func (n *Network) EndpointByName(name string) (*Endpoint, error) {
 	return e, nil
 }

-// EndpointByID returns the Endpoint which has the passed id. If not found,
-// the error ErrNoSuchEndpoint is returned.
+// EndpointByID should *never* be called as it's going to create a 2nd instance of an Endpoint. The first one lives in
+// the Sandbox the endpoint is attached to. Instead, the endpoint should be retrieved by calling [Sandbox.Endpoints()].
 func (n *Network) EndpointByID(id string) (*Endpoint, error) {
 	if id == "" {
 		return nil, ErrInvalidID(id)
--- a/libnetwork/sandbox.go
+++ b/libnetwork/sandbox.go
@ -334,7 +334,7 @@ func (sb *Sandbox) removeEndpointRaw(ep *Endpoint) {
 	}
 }

-func (sb *Sandbox) getEndpoint(id string) *Endpoint {
+func (sb *Sandbox) GetEndpoint(id string) *Endpoint {
 	sb.mu.Lock()
 	defer sb.mu.Unlock()

@ -554,7 +554,7 @@ func (sb *Sandbox) DisableService() (err error) {
 }

 func (sb *Sandbox) clearNetworkResources(origEp *Endpoint) error {
-	ep := sb.getEndpoint(origEp.id)
+	ep := sb.GetEndpoint(origEp.id)
 	if ep == nil {
 		return fmt.Errorf("could not find the sandbox endpoint data for endpoint %s",
 			origEp.id)
--- a/libnetwork/service_linux.go
+++ b/libnetwork/service_linux.go
@ -57,7 +57,7 @@ func (n *Network) findLBEndpointSandbox() (*Endpoint, *Sandbox, error) {
 	if !ok {
 		return nil, nil, fmt.Errorf("Unable to get sandbox for %s(%s) in for %s", ep.Name(), ep.ID(), n.ID())
 	}
-	sep := sb.getEndpoint(ep.ID())
+	sep := sb.GetEndpoint(ep.ID())
 	if sep == nil {
 		return nil, nil, fmt.Errorf("Load balancing endpoint %s(%s) removed from %s", ep.Name(), ep.ID(), n.ID())
 	}