Przeglądaj źródła

Work around kernel bugs related to namespaces

This PR attempts to work around bugs present in kernel
versions 3.18-4.0.1 relating to namespace creation
and destruction. This fix attempts to avoid certain
system calls so as not to get into the kernel bug path, as well
as lazily garbage collecting the namespace paths when they are removed.

Signed-off-by: Jana Radhakrishnan <mrjana@docker.com>
Jana Radhakrishnan 10 lat temu
rodzic
commit
3ec19ff62b

+ 111 - 22
libnetwork/sandbox/namespace_linux.go

@@ -4,10 +4,15 @@ import (
 	"fmt"
 	"fmt"
 	"net"
 	"net"
 	"os"
 	"os"
+	"os/exec"
+	"path/filepath"
 	"runtime"
 	"runtime"
 	"sync"
 	"sync"
 	"syscall"
 	"syscall"
+	"time"
 
 
+	log "github.com/Sirupsen/logrus"
+	"github.com/docker/docker/pkg/reexec"
 	"github.com/docker/libnetwork/types"
 	"github.com/docker/libnetwork/types"
 	"github.com/vishvananda/netlink"
 	"github.com/vishvananda/netlink"
 	"github.com/vishvananda/netns"
 	"github.com/vishvananda/netns"
@@ -15,7 +20,13 @@ import (
 
 
 const prefix = "/var/run/docker/netns"
 const prefix = "/var/run/docker/netns"
 
 
-var once sync.Once
+var (
+	// once guards the one-time execution of createBasePath.
+	once             sync.Once
+	// garbagePathMap is the set of namespace file paths queued for lazy
+	// removal; it is filled by addToGarbagePaths and drained by
+	// removeUnusedPaths.
+	garbagePathMap   = make(map[string]bool)
+	// gpmLock protects garbagePathMap.
+	gpmLock          sync.Mutex
+	// gpmWg is non-zero while removeUnusedPaths is deleting a batch of
+	// paths; createNamespaceFile waits on it before creating a file.
+	gpmWg            sync.WaitGroup
+	// gpmCleanupPeriod is the interval, in seconds, between two garbage
+	// collection passes.
+	gpmCleanupPeriod = 60
+)
 
 
 // The networkNamespace type is the linux implementation of the Sandbox
 // The networkNamespace type is the linux implementation of the Sandbox
 // interface. It represents a linux network namespace, and moves an interface
 // interface. It represents a linux network namespace, and moves an interface
@@ -27,11 +38,56 @@ type networkNamespace struct {
 	sync.Mutex
 	sync.Mutex
 }
 }
 
 
+// init registers reexecCreateNamespace under the "netns-create" name so that
+// a re-executed copy of this binary can dispatch to it.
+func init() {
+	reexec.Register("netns-create", reexecCreateNamespace)
+}
+
 func createBasePath() {
 func createBasePath() {
 	err := os.MkdirAll(prefix, 0644)
 	err := os.MkdirAll(prefix, 0644)
 	if err != nil && !os.IsExist(err) {
 	if err != nil && !os.IsExist(err) {
 		panic("Could not create net namespace path directory")
 		panic("Could not create net namespace path directory")
 	}
 	}
+
+	// cleanup any stale namespace files if any
+	cleanupNamespaceFiles()
+
+	// Start the garbage collection go routine
+	go removeUnusedPaths()
+}
+
+// removeUnusedPaths is the garbage collection loop started as a goroutine by
+// createBasePath. It never returns: every gpmCleanupPeriod seconds it takes
+// over the paths queued in garbagePathMap and removes them from the
+// filesystem.
+func removeUnusedPaths() {
+	for {
+		time.Sleep(time.Duration(gpmCleanupPeriod) * time.Second)
+
+		// Snapshot the queued paths and reset the map while holding the
+		// lock, so the removals below run without blocking callers of
+		// addToGarbagePaths/removeFromGarbagePaths.
+		gpmLock.Lock()
+		pathList := make([]string, 0, len(garbagePathMap))
+		for path := range garbagePathMap {
+			pathList = append(pathList, path)
+		}
+		garbagePathMap = make(map[string]bool)
+		// Mark a removal pass as in progress before the lock is released;
+		// createNamespaceFile waits on gpmWg before creating its file.
+		// NOTE(review): this Add may run concurrently with a Wait in
+		// createNamespaceFile — confirm this matches the sync.WaitGroup
+		// reuse rules.
+		gpmWg.Add(1)
+		gpmLock.Unlock()
+
+		// Best effort: errors returned by os.Remove are ignored.
+		for _, path := range pathList {
+			os.Remove(path)
+		}
+
+		gpmWg.Done()
+	}
+}
+
+// addToGarbagePaths queues path for removal by the next pass of
+// removeUnusedPaths. It takes gpmLock for the duration of the update.
+func addToGarbagePaths(path string) {
+	gpmLock.Lock()
+	defer gpmLock.Unlock()
+
+	garbagePathMap[path] = true
+}
+
+// removeFromGarbagePaths drops path from the pending garbage collection set,
+// if it is present, so that a later pass of removeUnusedPaths does not delete
+// it. It takes gpmLock for the duration of the update.
+func removeFromGarbagePaths(path string) {
+	gpmLock.Lock()
+	defer gpmLock.Unlock()
+
+	delete(garbagePathMap, path)
 }
 }
 
 
 // GenerateKey generates a sandbox key based on the passed
 // GenerateKey generates a sandbox key based on the passed
@@ -56,6 +112,16 @@ func NewSandbox(key string, osCreate bool) (Sandbox, error) {
 	return &networkNamespace{path: key, sinfo: info}, nil
 	return &networkNamespace{path: key, sinfo: info}, nil
 }
 }
 
 
+// reexecCreateNamespace is the entry point run in the re-executed child
+// process registered as "netns-create". It bind mounts the child's own network
+// namespace (/proc/self/ns/net) onto the path passed as os.Args[1], and
+// terminates the process through log.Fatal if the path is missing or the mount
+// fails.
+func reexecCreateNamespace() {
+	// os.Args[0] is the reexec command name; the namespace path follows it.
+	if len(os.Args) < 2 {
+		log.Fatal("no namespace path provided")
+	}
+
+	if err := syscall.Mount("/proc/self/ns/net", os.Args[1], "bind", syscall.MS_BIND, ""); err != nil {
+		log.Fatal(err)
+	}
+}
+
 func createNetworkNamespace(path string, osCreate bool) (*Info, error) {
 func createNetworkNamespace(path string, osCreate bool) (*Info, error) {
 	runtime.LockOSThread()
 	runtime.LockOSThread()
 	defer runtime.UnlockOSThread()
 	defer runtime.UnlockOSThread()
@@ -70,23 +136,18 @@ func createNetworkNamespace(path string, osCreate bool) (*Info, error) {
 		return nil, err
 		return nil, err
 	}
 	}
 
 
+	cmd := &exec.Cmd{
+		Path:   reexec.Self(),
+		Args:   append([]string{"netns-create"}, path),
+		Stdout: os.Stdout,
+		Stderr: os.Stderr,
+	}
 	if osCreate {
 	if osCreate {
-		defer netns.Set(origns)
-		newns, err := netns.New()
-		if err != nil {
-			return nil, err
-		}
-		defer newns.Close()
-
-		if err := loopbackUp(); err != nil {
-			return nil, err
-		}
+		cmd.SysProcAttr = &syscall.SysProcAttr{}
+		cmd.SysProcAttr.Cloneflags = syscall.CLONE_NEWNET
 	}
 	}
-
-	procNet := fmt.Sprintf("/proc/%d/task/%d/ns/net", os.Getpid(), syscall.Gettid())
-
-	if err := syscall.Mount(procNet, path, "bind", syscall.MS_BIND, ""); err != nil {
-		return nil, err
+	if err := cmd.Run(); err != nil {
+		return nil, fmt.Errorf("namespace creation reexec command failed: %v", err)
 	}
 	}
 
 
 	interfaces := []*Interface{}
 	interfaces := []*Interface{}
@@ -94,10 +155,27 @@ func createNetworkNamespace(path string, osCreate bool) (*Info, error) {
 	return info, nil
 	return info, nil
 }
 }
 
 
-func cleanupNamespaceFile(path string) {
+// cleanupNamespaceFiles lazily unmounts and removes every stale namespace
+// file found directly under prefix. Cleanup is best effort: failures of the
+// unmount, the removal, or the walk itself are not reported.
+func cleanupNamespaceFiles() {
+	filepath.Walk(prefix, func(path string, info os.FileInfo, err error) error {
+		// Honour the error reported by Walk for this entry; info may be
+		// nil when err is set.
+		if err != nil {
+			return err
+		}
+
+		if info.IsDir() {
+			// Walk calls this function for the root first. Returning
+			// SkipDir there would skip the whole tree and clean nothing,
+			// so descend into the root and skip only nested directories.
+			if path == prefix {
+				return nil
+			}
+			return filepath.SkipDir
+		}
+
+		// Detach a possibly still mounted namespace before removing the
+		// file backing it.
+		syscall.Unmount(path, syscall.MNT_DETACH)
+		os.Remove(path)
+
+		return nil
+	})
+}
+
+func unmountNamespaceFile(path string) {
 	if _, err := os.Stat(path); err == nil {
 	if _, err := os.Stat(path); err == nil {
-		n := &networkNamespace{path: path}
-		n.Destroy()
+		syscall.Unmount(path, syscall.MNT_DETACH)
 	}
 	}
 }
 }
 
 
@@ -105,11 +183,20 @@ func createNamespaceFile(path string) (err error) {
 	var f *os.File
 	var f *os.File
 
 
 	once.Do(createBasePath)
 	once.Do(createBasePath)
-	// cleanup namespace file if it already exists because of a previous ungraceful exit.
-	cleanupNamespaceFile(path)
+	// Remove it from garbage collection list if present
+	removeFromGarbagePaths(path)
+
+	// If the path is there unmount it first
+	unmountNamespaceFile(path)
+
+	// wait for garbage collection to complete if it is in progress
+	// before trying to create the file.
+	gpmWg.Wait()
+
 	if f, err = os.Create(path); err == nil {
 	if f, err = os.Create(path); err == nil {
 		f.Close()
 		f.Close()
 	}
 	}
+
 	return err
 	return err
 }
 }
 
 
@@ -310,5 +397,7 @@ func (n *networkNamespace) Destroy() error {
 		return err
 		return err
 	}
 	}
 
 
-	return os.Remove(n.path)
+	// Stash it into the garbage collection list
+	addToGarbagePaths(n.path)
+	return nil
 }
 }

+ 11 - 0
libnetwork/sandbox/sandbox_linux_test.go

@@ -6,6 +6,7 @@ import (
 	"path/filepath"
 	"path/filepath"
 	"runtime"
 	"runtime"
 	"testing"
 	"testing"
+	"time"
 
 
 	"github.com/docker/libnetwork/netutils"
 	"github.com/docker/libnetwork/netutils"
 	"github.com/vishvananda/netlink"
 	"github.com/vishvananda/netlink"
@@ -31,6 +32,9 @@ func newKey(t *testing.T) (string, error) {
 		return "", err
 		return "", err
 	}
 	}
 
 
+	// Set gpmCleanupPeriod low to make the test run quicker
+	gpmCleanupPeriod = 2
+
 	return name, nil
 	return name, nil
 }
 }
 
 
@@ -146,3 +150,10 @@ func verifySandbox(t *testing.T, s Sandbox) {
 			err)
 			err)
 	}
 	}
 }
 }
+
+// verifyCleanup sleeps for twice gpmCleanupPeriod (in seconds) and then fails
+// the test if the sandbox key path of s can still be stat'ed, i.e. if the path
+// was not removed in that time.
+func verifyCleanup(t *testing.T, s Sandbox) {
+	time.Sleep(time.Duration(gpmCleanupPeriod*2) * time.Second)
+	if _, err := os.Stat(s.Key()); err == nil {
+		t.Fatalf("The sandbox path %s is not getting cleanup event after twice the cleanup period", s.Key())
+	}
+}

+ 11 - 0
libnetwork/sandbox/sandbox_test.go

@@ -2,9 +2,19 @@ package sandbox
 
 
 import (
 import (
 	"net"
 	"net"
+	"os"
 	"testing"
 	"testing"
+
+	"github.com/docker/docker/pkg/reexec"
 )
 )
 
 
+// TestMain is the entry point of the test binary. It gives the reexec package
+// the first chance to run, and only runs the test suite when reexec.Init
+// reports false.
+func TestMain(m *testing.M) {
+	// NOTE(review): reexec.Init presumably returns true when this process
+	// was started as a registered reexec command — confirm against the
+	// reexec package.
+	if reexec.Init() {
+		return
+	}
+	os.Exit(m.Run())
+}
+
 func TestSandboxCreate(t *testing.T) {
 func TestSandboxCreate(t *testing.T) {
 	key, err := newKey(t)
 	key, err := newKey(t)
 	if err != nil {
 	if err != nil {
@@ -44,6 +54,7 @@ func TestSandboxCreate(t *testing.T) {
 
 
 	verifySandbox(t, s)
 	verifySandbox(t, s)
 	s.Destroy()
 	s.Destroy()
+	verifyCleanup(t, s)
 }
 }
 
 
 func TestSandboxCreateTwice(t *testing.T) {
 func TestSandboxCreateTwice(t *testing.T) {