Attempt to protect on disk image store with mutex

During `(*Graph).Register, there was no protection on adding new layers
concurrently. In some cases, this resulted in corruption of a layer by creating
the directory but not the underlying data. This manifested in several different
IO errors reported in the client.  This attempts to fix this by adding a mutex
by Image ID to protect the Register operation.

We do not completely understand the root cause of this corruption other than
the result is somehow tied to this particular function.  This fix has been
confirmed to address the issue through testing.

Unfortunately, this fix does not address existing corruption. The user will
have to remove and re-pull the corrupt layer to stop the error from happening
in the future. This change only ensures that the layer will not become corrupt.

Signed-off-by: Stephen J Day <stephen.day@docker.com>
This commit is contained in:
Stephen J Day 2015-06-04 12:58:58 -07:00
parent 4ab8b88286
commit 7eac23cf8d
2 changed files with 59 additions and 6 deletions

View file

@ -31,9 +31,10 @@ import (
// A Graph is a store for versioned filesystem images and the relationship between them.
type Graph struct {
root string
idIndex *truncindex.TruncIndex
driver graphdriver.Driver
root string
idIndex *truncindex.TruncIndex
driver graphdriver.Driver
imageMutex imageMutex // protect images in driver.
}
var (
@ -153,6 +154,15 @@ func (graph *Graph) Create(layerData archive.ArchiveReader, containerID, contain
// Register imports a pre-existing image into the graph.
func (graph *Graph) Register(img *image.Image, layerData archive.ArchiveReader) (err error) {
if err := image.ValidateID(img.ID); err != nil {
return err
}
// We need this entire operation to be atomic within the engine. Note that
// this doesn't mean Register is fully safe yet.
graph.imageMutex.Lock(img.ID)
defer graph.imageMutex.Unlock(img.ID)
defer func() {
// If any error occurs, remove the new dir from the driver.
// Don't check for errors since the dir might not have been created.
@ -161,9 +171,7 @@ func (graph *Graph) Register(img *image.Image, layerData archive.ArchiveReader)
graph.driver.Remove(img.ID)
}
}()
if err := image.ValidateID(img.ID); err != nil {
return err
}
// (This is a convenience to save time. Race conditions are taken care of by os.Rename)
if graph.Exists(img.ID) {
return fmt.Errorf("Image %s already exists", img.ID)

45
graph/mutex.go Normal file
View file

@ -0,0 +1,45 @@
package graph
import "sync"
// imageMutex provides a lock per image id to protect shared resources in the
// graph. This is only used with registration but should be used when
// manipulating the layer store.
type imageMutex struct {
mus map[string]*sync.Mutex // mutexes by image id.
mu sync.Mutex // protects lock map
// NOTE(stevvooe): The map above will grow to the size of all images ever
// registered during a daemon run. To free these resources, we must
// deallocate after unlock. Doing this safely is non-trivial in the face
// of a very minor leak.
}
// Lock the provided id.
func (im *imageMutex) Lock(id string) {
im.getImageLock(id).Lock()
}
// Unlock the provided id.
func (im *imageMutex) Unlock(id string) {
im.getImageLock(id).Unlock()
}
// getImageLock returns the mutex for the given id. This method will never
// return nil.
func (im *imageMutex) getImageLock(id string) *sync.Mutex {
im.mu.Lock()
defer im.mu.Unlock()
if im.mus == nil { // lazy
im.mus = make(map[string]*sync.Mutex)
}
mu, ok := im.mus[id]
if !ok {
mu = new(sync.Mutex)
im.mus[id] = mu
}
return mu
}