Merge pull request #16589 from mavenugo/ln_vin_ls

Vendoring libnetwork and libkv
Jess Frazelle committed on 2015-09-25 13:31:14 -07:00
commit 3c3d232e0a
62 changed files with 5915 additions and 585 deletions


@@ -813,25 +813,6 @@ func createNetwork(controller libnetwork.NetworkController, dnet string, driver
 	return controller.NewNetwork(driver, dnet, createOptions...)
 }
 
-func (container *Container) secondaryNetworkRequired(ctx context.Context, primaryNetworkType string) bool {
-	switch primaryNetworkType {
-	case "bridge", "none", "host", "container":
-		return false
-	}
-
-	if container.daemon.configStore.DisableBridge {
-		return false
-	}
-
-	if container.Config.ExposedPorts != nil && len(container.Config.ExposedPorts) > 0 {
-		return true
-	}
-	if container.hostConfig.PortBindings != nil && len(container.hostConfig.PortBindings) > 0 {
-		return true
-	}
-	return false
-}
-
 func (container *Container) allocateNetwork(ctx context.Context) error {
 	mode := container.hostConfig.NetworkMode
 	controller := container.daemon.netController
@@ -866,13 +847,6 @@ func (container *Container) allocateNetwork(ctx context.Context) error {
 		service = strings.Replace(service, "/", "", -1)
 	}
 
-	if container.secondaryNetworkRequired(ctx, networkDriver) {
-		// Configure Bridge as secondary network for port binding purposes
-		if err := container.configureNetwork(ctx, "bridge", service, "bridge", false); err != nil {
-			return err
-		}
-	}
-
 	if err := container.configureNetwork(ctx, networkName, service, networkDriver, mode.IsDefault()); err != nil {
 		return err
 	}


@@ -313,15 +313,16 @@ func networkOptions(dconfig *Config) ([]nwconfig.Option, error) {
 	}
 
 	if strings.TrimSpace(dconfig.NetworkKVStore) != "" {
-		kv := strings.Split(dconfig.NetworkKVStore, ":")
+		kv := strings.Split(dconfig.NetworkKVStore, "://")
 		if len(kv) < 2 {
-			return nil, fmt.Errorf("kv store daemon config must be of the form KV-PROVIDER:KV-URL")
+			return nil, fmt.Errorf("kv store daemon config must be of the form KV-PROVIDER://KV-URL")
 		}
 		options = append(options, nwconfig.OptionKVProvider(kv[0]))
-		options = append(options, nwconfig.OptionKVProviderURL(strings.Join(kv[1:], ":")))
+		options = append(options, nwconfig.OptionKVProviderURL(strings.Join(kv[1:], "://")))
 	}
 
 	options = append(options, nwconfig.OptionLabels(dconfig.Labels))
+	options = append(options, driverOptions(dconfig)...)
 	return options, nil
 }
@@ -336,24 +337,13 @@ func initNetworkController(config *Config) (libnetwork.NetworkController, error)
 		return nil, fmt.Errorf("error obtaining controller instance: %v", err)
 	}
 
-	// Initialize default driver "null"
-	if err := controller.ConfigureNetworkDriver("null", options.Generic{}); err != nil {
-		return nil, fmt.Errorf("Error initializing null driver: %v", err)
-	}
-
 	// Initialize default network on "null"
-	if _, err := controller.NewNetwork("null", "none"); err != nil {
+	if _, err := controller.NewNetwork("null", "none", libnetwork.NetworkOptionPersist(false)); err != nil {
 		return nil, fmt.Errorf("Error creating default \"null\" network: %v", err)
 	}
 
-	// Initialize default driver "host"
-	if err := controller.ConfigureNetworkDriver("host", options.Generic{}); err != nil {
-		return nil, fmt.Errorf("Error initializing host driver: %v", err)
-	}
-
 	// Initialize default network on "host"
-	if _, err := controller.NewNetwork("host", "host"); err != nil {
+	if _, err := controller.NewNetwork("host", "host", libnetwork.NetworkOptionPersist(false)); err != nil {
 		return nil, fmt.Errorf("Error creating default \"host\" network: %v", err)
 	}
 
@@ -367,18 +357,22 @@ func initNetworkController(config *Config) (libnetwork.NetworkController, error)
 	return controller, nil
 }
 
-func initBridgeDriver(controller libnetwork.NetworkController, config *Config) error {
-	option := options.Generic{
+func driverOptions(config *Config) []nwconfig.Option {
+	bridgeConfig := options.Generic{
 		"EnableIPForwarding":  config.Bridge.EnableIPForward,
 		"EnableIPTables":      config.Bridge.EnableIPTables,
 		"EnableUserlandProxy": config.Bridge.EnableUserlandProxy}
+	bridgeOption := options.Generic{netlabel.GenericData: bridgeConfig}
 
-	if err := controller.ConfigureNetworkDriver("bridge", options.Generic{netlabel.GenericData: option}); err != nil {
-		return fmt.Errorf("Error initializing bridge driver: %v", err)
-	}
+	dOptions := []nwconfig.Option{}
+	dOptions = append(dOptions, nwconfig.OptionDriverConfig("bridge", bridgeOption))
+	return dOptions
+}
 
+func initBridgeDriver(controller libnetwork.NetworkController, config *Config) error {
 	netOption := options.Generic{
 		"BridgeName":         config.Bridge.Iface,
+		"DefaultBridge":      true,
 		"Mtu":                config.Mtu,
 		"EnableIPMasquerade": config.Bridge.EnableIPMasq,
 		"EnableICC":          config.Bridge.InterContainerCommunication,
@@ -430,7 +424,8 @@ func initBridgeDriver(controller libnetwork.NetworkController, config *Config) error {
 		libnetwork.NetworkOptionGeneric(options.Generic{
 			netlabel.GenericData: netOption,
 			netlabel.EnableIPv6:  config.Bridge.EnableIPv6,
-		}))
+		}),
+		libnetwork.NetworkOptionPersist(false))
 	if err != nil {
 		return fmt.Errorf("Error creating default \"bridge\" network: %v", err)
 	}


@@ -20,18 +20,19 @@ clone git github.com/tchap/go-patricia v2.1.0
 clone git golang.org/x/net 3cffabab72adf04f8e3b01c5baf775361837b5fe https://github.com/golang/net.git
 
 #get libnetwork packages
-clone git github.com/docker/libnetwork e5fea92a6c8a5968bdb8005bf959c6e23113b689
+clone git github.com/docker/libnetwork f5423a097e5da89f9ea206ddf8b93b5ac1f51ee7
 clone git github.com/armon/go-metrics eb0af217e5e9747e41dd5303755356b62d28e3ec
 clone git github.com/hashicorp/go-msgpack 71c2886f5a673a35f909803f38ece5810165097b
 clone git github.com/hashicorp/memberlist 9a1e242e454d2443df330bdd51a436d5a9058fc4
 clone git github.com/hashicorp/serf 7151adcef72687bf95f451a2e0ba15cb19412bf2
-clone git github.com/docker/libkv a0a57ed3755665e9a402a3df315402134eb6625f
+clone git github.com/docker/libkv ea7ff6ae76485ab93ac36799d3e13b1905787ffe
 clone git github.com/vishvananda/netns 604eaf189ee867d8c147fafc28def2394e878d25
 clone git github.com/vishvananda/netlink 4b5dce31de6d42af5bb9811c6d265472199e0fec
 clone git github.com/BurntSushi/toml f706d00e3de6abe700c994cdd545a1a4915af060
 clone git github.com/samuel/go-zookeeper d0e0d8e11f318e000a8cc434616d69e329edc374
 clone git github.com/coreos/go-etcd v2.0.0
 clone git github.com/hashicorp/consul v0.5.2
+clone git github.com/boltdb/bolt v1.0
 
 # get graph and distribution packages
 clone git github.com/docker/distribution ec87e9b6971d831f0eff752ddb54fb64693e51cd # docker/1.8 branch


@@ -0,0 +1,3 @@
*.prof
*.test
/bin/


@@ -0,0 +1,20 @@
The MIT License (MIT)
Copyright (c) 2013 Ben Johnson
Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
the Software, and to permit persons to whom the Software is furnished to do so,
subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.


@@ -0,0 +1,54 @@
TEST=.
BENCH=.
COVERPROFILE=/tmp/c.out
BRANCH=`git rev-parse --abbrev-ref HEAD`
COMMIT=`git rev-parse --short HEAD`
GOLDFLAGS="-X main.branch $(BRANCH) -X main.commit $(COMMIT)"
default: build
bench:
go test -v -test.run=NOTHINCONTAINSTHIS -test.bench=$(BENCH)
# http://cloc.sourceforge.net/
cloc:
@cloc --not-match-f='Makefile|_test.go' .
cover: fmt
go test -coverprofile=$(COVERPROFILE) -test.run=$(TEST) $(COVERFLAG) .
go tool cover -html=$(COVERPROFILE)
rm $(COVERPROFILE)
cpuprofile: fmt
@go test -c
@./bolt.test -test.v -test.run=$(TEST) -test.cpuprofile cpu.prof
# go get github.com/kisielk/errcheck
errcheck:
@echo "=== errcheck ==="
@errcheck github.com/boltdb/bolt
fmt:
@go fmt ./...
get:
@go get -d ./...
build: get
@mkdir -p bin
@go build -ldflags=$(GOLDFLAGS) -a -o bin/bolt ./cmd/bolt
test: fmt
@go get github.com/stretchr/testify/assert
@echo "=== TESTS ==="
@go test -v -cover -test.run=$(TEST)
@echo ""
@echo ""
@echo "=== CLI ==="
@go test -v -test.run=$(TEST) ./cmd/bolt
@echo ""
@echo ""
@echo "=== RACE DETECTOR ==="
@go test -v -race -test.run="TestSimulate_(100op|1000op)"
.PHONY: bench cloc cover cpuprofile fmt memprofile test


@@ -0,0 +1,455 @@
Bolt [![Build Status](https://drone.io/github.com/boltdb/bolt/status.png)](https://drone.io/github.com/boltdb/bolt/latest) [![Coverage Status](https://coveralls.io/repos/boltdb/bolt/badge.png?branch=master)](https://coveralls.io/r/boltdb/bolt?branch=master) [![GoDoc](https://godoc.org/github.com/boltdb/bolt?status.png)](https://godoc.org/github.com/boltdb/bolt) ![Version](http://img.shields.io/badge/version-1.0-green.png)
====
Bolt is a pure Go key/value store inspired by [Howard Chu's][hyc_symas]
[LMDB project][lmdb]. The goal of the project is to provide a simple,
fast, and reliable database for projects that don't require a full database
server such as Postgres or MySQL.
Since Bolt is meant to be used as such a low-level piece of functionality,
simplicity is key. The API will be small and only focus on getting values
and setting values. That's it.
[hyc_symas]: https://twitter.com/hyc_symas
[lmdb]: http://symas.com/mdb/
## Project Status
Bolt is stable and the API is fixed. Full unit test coverage and randomized
black box testing are used to ensure database consistency and thread safety.
Bolt is currently in high-load production environments serving databases as
large as 1TB. Many companies such as Shopify and Heroku use Bolt-backed
services every day.
## Getting Started
### Installing
To start using Bolt, install Go and run `go get`:
```sh
$ go get github.com/boltdb/bolt/...
```
This will retrieve the library and install the `bolt` command line utility into
your `$GOBIN` path.
### Opening a database
The top-level object in Bolt is a `DB`. It is represented as a single file on
your disk and represents a consistent snapshot of your data.
To open your database, simply use the `bolt.Open()` function:
```go
package main
import (
"log"
"github.com/boltdb/bolt"
)
func main() {
// Open the my.db data file in your current directory.
// It will be created if it doesn't exist.
db, err := bolt.Open("my.db", 0600, nil)
if err != nil {
log.Fatal(err)
}
defer db.Close()
...
}
```
Please note that Bolt obtains a file lock on the data file so multiple processes
cannot open the same database at the same time. Opening an already open Bolt
database will cause it to hang until the other process closes it. To prevent
an indefinite wait you can pass a timeout option to the `Open()` function:
```go
db, err := bolt.Open("my.db", 0600, &bolt.Options{Timeout: 1 * time.Second})
```
### Transactions
Bolt allows only one read-write transaction at a time but allows as many
read-only transactions as you want at a time. Each transaction has a consistent
view of the data as it existed when the transaction started.
Individual transactions and all objects created from them (e.g. buckets, keys)
are not thread safe. To work with data in multiple goroutines you must start
a transaction for each one or use locking to ensure only one goroutine accesses
a transaction at a time. Creating a transaction from the `DB` is thread safe.
#### Read-write transactions
To start a read-write transaction, you can use the `DB.Update()` function:
```go
err := db.Update(func(tx *bolt.Tx) error {
...
return nil
})
```
Inside the closure, you have a consistent view of the database. You commit the
transaction by returning `nil` at the end. You can also roll back the transaction
at any point by returning an error. All database operations are allowed inside
a read-write transaction.
Always check the return error as it will report any disk failures that can cause
your transaction to not complete. If you return an error within your closure
it will be passed through.
#### Read-only transactions
To start a read-only transaction, you can use the `DB.View()` function:
```go
err := db.View(func(tx *bolt.Tx) error {
...
return nil
})
```
You also get a consistent view of the database within this closure, however,
no mutating operations are allowed within a read-only transaction. You can only
retrieve buckets, retrieve values, and copy the database within a read-only
transaction.
### Using buckets
Buckets are collections of key/value pairs within the database. All keys in a
bucket must be unique. You can create a bucket using the `DB.CreateBucket()`
function:
```go
db.Update(func(tx *bolt.Tx) error {
b, err := tx.CreateBucket([]byte("MyBucket"))
if err != nil {
return fmt.Errorf("create bucket: %s", err)
}
return nil
})
```
You can also create a bucket only if it doesn't exist by using the
`Tx.CreateBucketIfNotExists()` function. It's a common pattern to call this
function for all your top-level buckets after you open your database so you can
guarantee that they exist for future transactions.
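For example, a minimal sketch of that pattern (the bucket names here are placeholders):
```go
err := db.Update(func(tx *bolt.Tx) error {
	// Ensure every top-level bucket exists before the rest of the
	// application starts running transactions against them.
	for _, name := range []string{"users", "sessions"} {
		if _, err := tx.CreateBucketIfNotExists([]byte(name)); err != nil {
			return fmt.Errorf("create bucket %s: %s", name, err)
		}
	}
	return nil
})
```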
To delete a bucket, simply call the `Tx.DeleteBucket()` function.
### Using key/value pairs
To save a key/value pair to a bucket, use the `Bucket.Put()` function:
```go
db.Update(func(tx *bolt.Tx) error {
b := tx.Bucket([]byte("MyBucket"))
err := b.Put([]byte("answer"), []byte("42"))
return err
})
```
This will set the value of the `"answer"` key to `"42"` in the `MyBucket`
bucket. To retrieve this value, we can use the `Bucket.Get()` function:
```go
db.View(func(tx *bolt.Tx) error {
b := tx.Bucket([]byte("MyBucket"))
v := b.Get([]byte("answer"))
fmt.Printf("The answer is: %s\n", v)
return nil
})
```
The `Get()` function does not return an error because its operation is
guaranteed to work (unless there is some kind of system failure). If the key
exists then it will return its byte slice value. If it doesn't exist then it
will return `nil`. It's important to note that you can have a zero-length value
set to a key, which is different from the key not existing.
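A small sketch of the difference (key names here are placeholders):
```go
db.Update(func(tx *bolt.Tx) error {
	b := tx.Bucket([]byte("MyBucket"))
	// A key stored with a zero-length value still exists...
	if err := b.Put([]byte("flag"), []byte{}); err != nil {
		return err
	}
	fmt.Println(b.Get([]byte("flag")) != nil)    // true: key exists, value is empty
	// ...whereas a key that was never set returns nil.
	fmt.Println(b.Get([]byte("missing")) != nil) // false: no such key
	return nil
})
```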
Use the `Bucket.Delete()` function to delete a key from the bucket.
### Iterating over keys
Bolt stores its keys in byte-sorted order within a bucket. This makes sequential
iteration over these keys extremely fast. To iterate over keys we'll use a
`Cursor`:
```go
db.View(func(tx *bolt.Tx) error {
b := tx.Bucket([]byte("MyBucket"))
c := b.Cursor()
for k, v := c.First(); k != nil; k, v = c.Next() {
fmt.Printf("key=%s, value=%s\n", k, v)
}
return nil
})
```
The cursor allows you to move to a specific point in the list of keys and move
forward or backward through the keys one at a time.
The following functions are available on the cursor:
```
First() Move to the first key.
Last() Move to the last key.
Seek() Move to a specific key.
Next() Move to the next key.
Prev() Move to the previous key.
```
Once you have iterated to the end of the cursor, `Next()` returns `nil`.
You must position the cursor using `First()`, `Last()`, or `Seek()` before
calling `Next()` or `Prev()`; if you do not, these functions return `nil`.
#### Prefix scans
To iterate over a key prefix, you can combine `Seek()` and `bytes.HasPrefix()`:
```go
db.View(func(tx *bolt.Tx) error {
c := tx.Bucket([]byte("MyBucket")).Cursor()
prefix := []byte("1234")
for k, v := c.Seek(prefix); bytes.HasPrefix(k, prefix); k, v = c.Next() {
fmt.Printf("key=%s, value=%s\n", k, v)
}
return nil
})
```
#### Range scans
Another common use case is scanning over a range such as a time range. If you
use a sortable time encoding such as RFC3339 then you can query a specific
date range like this:
```go
db.View(func(tx *bolt.Tx) error {
// Assume our events bucket has RFC3339 encoded time keys.
c := tx.Bucket([]byte("Events")).Cursor()
// Our time range spans the 90's decade.
min := []byte("1990-01-01T00:00:00Z")
max := []byte("2000-01-01T00:00:00Z")
// Iterate over the 90's.
for k, v := c.Seek(min); k != nil && bytes.Compare(k, max) != -1; k, v = c.Next() {
fmt.Printf("%s: %s\n", k, v)
}
return nil
})
```
#### ForEach()
You can also use the function `ForEach()` if you know you'll be iterating over
all the keys in a bucket:
```go
db.View(func(tx *bolt.Tx) error {
b := tx.Bucket([]byte("MyBucket"))
b.ForEach(func(k, v []byte) error {
fmt.Printf("key=%s, value=%s\n", k, v)
return nil
})
return nil
})
```
### Nested buckets
You can also store a bucket in a key to create nested buckets. The API is the
same as the bucket management API on the `DB` object:
```go
func (*Bucket) CreateBucket(key []byte) (*Bucket, error)
func (*Bucket) CreateBucketIfNotExists(key []byte) (*Bucket, error)
func (*Bucket) DeleteBucket(key []byte) error
```
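For instance, a sketch of creating a bucket inside another bucket (the bucket names are placeholders):
```go
db.Update(func(tx *bolt.Tx) error {
	users, err := tx.CreateBucketIfNotExists([]byte("users"))
	if err != nil {
		return err
	}
	// The nested bucket lives inside the "users" bucket's keyspace.
	_, err = users.CreateBucketIfNotExists([]byte("by-email"))
	return err
})
```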
### Database backups
Bolt is a single file so it's easy to back up. You can use the `Tx.Copy()`
function to write a consistent view of the database to a writer. If you call
this from a read-only transaction, it will perform a hot backup and not block
your other database reads and writes. It will also use `O_DIRECT` when available
to prevent page cache thrashing.
One common use case is to back up over HTTP so you can use tools like `cURL` to
do database backups:
```go
func BackupHandleFunc(w http.ResponseWriter, req *http.Request) {
err := db.View(func(tx *bolt.Tx) error {
w.Header().Set("Content-Type", "application/octet-stream")
w.Header().Set("Content-Disposition", `attachment; filename="my.db"`)
w.Header().Set("Content-Length", strconv.Itoa(int(tx.Size())))
return tx.Copy(w)
})
if err != nil {
http.Error(w, err.Error(), http.StatusInternalServerError)
}
}
```
Then you can back up using this command:
```sh
$ curl http://localhost/backup > my.db
```
Or you can open your browser to `http://localhost/backup` and it will download
automatically.
If you want to back up to another file you can use the `Tx.CopyFile()` helper
function.
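For example (a sketch; the destination path and file mode are arbitrary):
```go
err := db.View(func(tx *bolt.Tx) error {
	// Write a consistent snapshot of the database to the given path.
	return tx.CopyFile("/tmp/my.db.backup", 0600)
})
```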
### Statistics
The database keeps a running count of many of the internal operations it
performs so you can better understand what's going on. By grabbing a snapshot
of these stats at two points in time we can see what operations were performed
in that time range.
For example, we could start a goroutine to log stats every 10 seconds:
```go
go func() {
// Grab the initial stats.
prev := db.Stats()
for {
// Wait for 10s.
time.Sleep(10 * time.Second)
// Grab the current stats and diff them.
stats := db.Stats()
diff := stats.Sub(&prev)
// Encode stats to JSON and print to STDERR.
json.NewEncoder(os.Stderr).Encode(diff)
// Save stats for the next loop.
prev = stats
}
}()
```
It's also useful to pipe these stats to a service such as statsd for monitoring
or to provide an HTTP endpoint that will perform a fixed-length sample.
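For instance, a sketch of such an endpoint (the handler path and sample window are assumptions, not part of Bolt's API):
```go
http.HandleFunc("/stats", func(w http.ResponseWriter, r *http.Request) {
	// Take two snapshots a fixed interval apart and report the difference.
	prev := db.Stats()
	time.Sleep(10 * time.Second)
	stats := db.Stats()
	diff := stats.Sub(&prev)
	json.NewEncoder(w).Encode(diff)
})
```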
## Resources
For more information on getting started with Bolt, check out the following articles:
* [Intro to BoltDB: Painless Performant Persistence](http://npf.io/2014/07/intro-to-boltdb-painless-performant-persistence/) by [Nate Finch](https://github.com/natefinch).
## Comparing Bolt to LMDB
Bolt was originally a port of LMDB so it is architecturally similar. Both use
a B+tree, have ACID semantics with fully serializable transactions, and support
lock-free MVCC using a single writer and multiple readers.
The two projects have somewhat diverged. LMDB heavily focuses on raw performance
while Bolt has focused on simplicity and ease of use. For example, LMDB allows
several unsafe actions such as direct writes and append writes for the sake of
performance. Bolt opts to disallow actions which can leave the database in a
corrupted state. The only exception to this in Bolt is `DB.NoSync`.
## Caveats & Limitations
It's important to pick the right tool for the job and Bolt is no exception.
Here are a few things to note when evaluating and using Bolt:
* Bolt is good for read intensive workloads. Sequential write performance is
also fast but random writes can be slow. You can add a write-ahead log or
[transaction coalescer](https://github.com/boltdb/coalescer) in front of Bolt
to mitigate this issue.
* Bolt uses a B+tree internally so there can be a lot of random page access.
SSDs provide a significant performance boost over spinning disks.
* Try to avoid long-running read transactions. Bolt uses copy-on-write so
old pages cannot be reclaimed while an old transaction is using them.
* Byte slices returned from Bolt are only valid during a transaction. Once the
transaction has been committed or rolled back then the memory they point to
can be reused by a new page or can be unmapped from virtual memory and you'll
see an `unexpected fault address` panic when accessing it. Copy any data you
need to keep, as shown in the sketch after this list.
* Be careful when using `Bucket.FillPercent`. Setting a high fill percent for
buckets that have random inserts will cause your database to have very poor
page utilization.
* Use larger buckets in general. Smaller buckets cause poor page utilization
once they become larger than the page size (typically 4KB).
* Bulk loading a lot of random writes into a new bucket can be slow as the
page will not split until the transaction is committed. Randomly inserting
more than 100,000 key/value pairs into a single new bucket in a single
transaction is not advised.
* Bolt uses a memory-mapped file so the underlying operating system handles the
caching of the data. Typically, the OS will cache as much of the file as it
can in memory and will release memory as needed to other processes. This means
that Bolt can show very high memory usage when working with large databases.
However, this is expected and the OS will release memory as needed. Bolt can
handle databases much larger than the available physical RAM.
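A minimal sketch of copying a value out before the transaction ends, so it stays valid afterwards (the bucket and key names are examples):
```go
var out []byte
err := db.View(func(tx *bolt.Tx) error {
	v := tx.Bucket([]byte("MyBucket")).Get([]byte("answer"))
	// v points into the mmap and is only valid inside this transaction,
	// so copy it into memory we own.
	out = append([]byte(nil), v...)
	return nil
})
```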
## Other Projects Using Bolt
Below is a list of public, open source projects that use Bolt:
* [Bazil](https://github.com/bazillion/bazil) - A file system that lets your data reside where it is most convenient for it to reside.
* [DVID](https://github.com/janelia-flyem/dvid) - Added Bolt as optional storage engine and testing it against Basho-tuned leveldb.
* [Skybox Analytics](https://github.com/skybox/skybox) - A standalone funnel analysis tool for web analytics.
* [Scuttlebutt](https://github.com/benbjohnson/scuttlebutt) - Uses Bolt to store and process all Twitter mentions of GitHub projects.
* [Wiki](https://github.com/peterhellberg/wiki) - A tiny wiki using Goji, BoltDB and Blackfriday.
* [ChainStore](https://github.com/nulayer/chainstore) - Simple key-value interface to a variety of storage engines organized as a chain of operations.
* [MetricBase](https://github.com/msiebuhr/MetricBase) - Single-binary version of Graphite.
* [Gitchain](https://github.com/gitchain/gitchain) - Decentralized, peer-to-peer Git repositories aka "Git meets Bitcoin".
* [event-shuttle](https://github.com/sclasen/event-shuttle) - A Unix system service to collect and reliably deliver messages to Kafka.
* [ipxed](https://github.com/kelseyhightower/ipxed) - Web interface and api for ipxed.
* [BoltStore](https://github.com/yosssi/boltstore) - Session store using Bolt.
* [photosite/session](http://godoc.org/bitbucket.org/kardianos/photosite/session) - Sessions for a photo viewing site.
* [LedisDB](https://github.com/siddontang/ledisdb) - A high performance NoSQL, using Bolt as optional storage.
* [ipLocator](https://github.com/AndreasBriese/ipLocator) - A fast ip-geo-location-server using bolt with bloom filters.
* [cayley](https://github.com/google/cayley) - Cayley is an open-source graph database using Bolt as optional backend.
* [bleve](http://www.blevesearch.com/) - A pure Go search engine similar to ElasticSearch that uses Bolt as the default storage backend.
* [tentacool](https://github.com/optiflows/tentacool) - REST api server to manage system stuff (IP, DNS, Gateway...) on a linux server.
* [SkyDB](https://github.com/skydb/sky) - Behavioral analytics database.
If you are using Bolt in a project please send a pull request to add it to the list.


@@ -0,0 +1,4 @@
package bolt
// maxMapSize represents the largest mmap size supported by Bolt.
const maxMapSize = 0xFFFFFFF // 256MB


@@ -0,0 +1,4 @@
package bolt
// maxMapSize represents the largest mmap size supported by Bolt.
const maxMapSize = 0xFFFFFFFFFFFF // 256TB


@@ -0,0 +1,4 @@
package bolt
// maxMapSize represents the largest mmap size supported by Bolt.
const maxMapSize = 0xFFFFFFF // 256MB


@@ -0,0 +1,12 @@
package bolt
import (
"syscall"
)
var odirect = syscall.O_DIRECT
// fdatasync flushes written data to a file descriptor.
func fdatasync(db *DB) error {
return syscall.Fdatasync(int(db.file.Fd()))
}


@@ -0,0 +1,29 @@
package bolt
import (
"syscall"
"unsafe"
)
const (
msAsync = 1 << iota // perform asynchronous writes
msSync // perform synchronous writes
msInvalidate // invalidate cached data
)
var odirect int
func msync(db *DB) error {
_, _, errno := syscall.Syscall(syscall.SYS_MSYNC, uintptr(unsafe.Pointer(db.data)), uintptr(db.datasz), msInvalidate)
if errno != 0 {
return errno
}
return nil
}
func fdatasync(db *DB) error {
if db.data != nil {
return msync(db)
}
return db.file.Sync()
}


@@ -0,0 +1,69 @@
// +build !windows,!plan9
package bolt
import (
"os"
"syscall"
"time"
"unsafe"
)
// flock acquires an advisory lock on a file descriptor.
func flock(f *os.File, timeout time.Duration) error {
var t time.Time
for {
// If we're beyond our timeout then return an error.
// This can only occur after we've attempted a flock once.
if t.IsZero() {
t = time.Now()
} else if timeout > 0 && time.Since(t) > timeout {
return ErrTimeout
}
// Otherwise attempt to obtain an exclusive lock.
err := syscall.Flock(int(f.Fd()), syscall.LOCK_EX|syscall.LOCK_NB)
if err == nil {
return nil
} else if err != syscall.EWOULDBLOCK {
return err
}
// Wait for a bit and try again.
time.Sleep(50 * time.Millisecond)
}
}
// funlock releases an advisory lock on a file descriptor.
func funlock(f *os.File) error {
return syscall.Flock(int(f.Fd()), syscall.LOCK_UN)
}
// mmap memory maps a DB's data file.
func mmap(db *DB, sz int) error {
b, err := syscall.Mmap(int(db.file.Fd()), 0, sz, syscall.PROT_READ, syscall.MAP_SHARED)
if err != nil {
return err
}
// Save the original byte slice and convert to a byte array pointer.
db.dataref = b
db.data = (*[maxMapSize]byte)(unsafe.Pointer(&b[0]))
db.datasz = sz
return nil
}
// munmap unmaps a DB's data file from memory.
func munmap(db *DB) error {
// Ignore the unmap if we have no mapped data.
if db.dataref == nil {
return nil
}
// Unmap using the original byte slice.
err := syscall.Munmap(db.dataref)
db.dataref = nil
db.data = nil
db.datasz = 0
return err
}


@@ -0,0 +1,74 @@
package bolt
import (
"fmt"
"os"
"syscall"
"time"
"unsafe"
)
var odirect int
// fdatasync flushes written data to a file descriptor.
func fdatasync(db *DB) error {
return db.file.Sync()
}
// flock acquires an advisory lock on a file descriptor.
func flock(f *os.File, _ time.Duration) error {
return nil
}
// funlock releases an advisory lock on a file descriptor.
func funlock(f *os.File) error {
return nil
}
// mmap memory maps a DB's data file.
// Based on: https://github.com/edsrzf/mmap-go
func mmap(db *DB, sz int) error {
// Truncate the database to the size of the mmap.
if err := db.file.Truncate(int64(sz)); err != nil {
return fmt.Errorf("truncate: %s", err)
}
// Open a file mapping handle.
sizehi := uint32(sz >> 32)
sizelo := uint32(sz) & 0xffffffff
h, errno := syscall.CreateFileMapping(syscall.Handle(db.file.Fd()), nil, syscall.PAGE_READONLY, sizehi, sizelo, nil)
if h == 0 {
return os.NewSyscallError("CreateFileMapping", errno)
}
// Create the memory map.
addr, errno := syscall.MapViewOfFile(h, syscall.FILE_MAP_READ, 0, 0, uintptr(sz))
if addr == 0 {
return os.NewSyscallError("MapViewOfFile", errno)
}
// Close mapping handle.
if err := syscall.CloseHandle(syscall.Handle(h)); err != nil {
return os.NewSyscallError("CloseHandle", err)
}
// Convert to a byte array.
db.data = ((*[maxMapSize]byte)(unsafe.Pointer(addr)))
db.datasz = sz
return nil
}
// munmap unmaps a pointer from a file.
// Based on: https://github.com/edsrzf/mmap-go
func munmap(db *DB) error {
if db.data == nil {
return nil
}
addr := (uintptr)(unsafe.Pointer(&db.data[0]))
if err := syscall.UnmapViewOfFile(addr); err != nil {
return os.NewSyscallError("UnmapViewOfFile", err)
}
return nil
}


@@ -0,0 +1,10 @@
// +build !windows,!plan9,!linux,!openbsd
package bolt
var odirect int
// fdatasync flushes written data to a file descriptor.
func fdatasync(db *DB) error {
return db.file.Sync()
}


@@ -0,0 +1,728 @@
package bolt
import (
"bytes"
"fmt"
"unsafe"
)
const (
// MaxKeySize is the maximum length of a key, in bytes.
MaxKeySize = 32768
// MaxValueSize is the maximum length of a value, in bytes.
MaxValueSize = 4294967295
)
const (
maxUint = ^uint(0)
minUint = 0
maxInt = int(^uint(0) >> 1)
minInt = -maxInt - 1
)
const bucketHeaderSize = int(unsafe.Sizeof(bucket{}))
const (
minFillPercent = 0.1
maxFillPercent = 1.0
)
// DefaultFillPercent is the percentage that split pages are filled.
// This value can be changed by setting Bucket.FillPercent.
const DefaultFillPercent = 0.5
// Bucket represents a collection of key/value pairs inside the database.
type Bucket struct {
*bucket
tx *Tx // the associated transaction
buckets map[string]*Bucket // subbucket cache
page *page // inline page reference
rootNode *node // materialized node for the root page.
nodes map[pgid]*node // node cache
// Sets the threshold for filling nodes when they split. By default,
// the bucket will fill to 50% but it can be useful to increase this
// amount if you know that your write workloads are mostly append-only.
//
// This is non-persisted across transactions so it must be set in every Tx.
FillPercent float64
}
// bucket represents the on-file representation of a bucket.
// This is stored as the "value" of a bucket key. If the bucket is small enough,
// then its root page can be stored inline in the "value", after the bucket
// header. In the case of inline buckets, the "root" will be 0.
type bucket struct {
root pgid // page id of the bucket's root-level page
sequence uint64 // monotonically incrementing, used by NextSequence()
}
// newBucket returns a new bucket associated with a transaction.
func newBucket(tx *Tx) Bucket {
var b = Bucket{tx: tx, FillPercent: DefaultFillPercent}
if tx.writable {
b.buckets = make(map[string]*Bucket)
b.nodes = make(map[pgid]*node)
}
return b
}
// Tx returns the tx of the bucket.
func (b *Bucket) Tx() *Tx {
return b.tx
}
// Root returns the root of the bucket.
func (b *Bucket) Root() pgid {
return b.root
}
// Writable returns whether the bucket is writable.
func (b *Bucket) Writable() bool {
return b.tx.writable
}
// Cursor creates a cursor associated with the bucket.
// The cursor is only valid as long as the transaction is open.
// Do not use a cursor after the transaction is closed.
func (b *Bucket) Cursor() *Cursor {
// Update transaction statistics.
b.tx.stats.CursorCount++
// Allocate and return a cursor.
return &Cursor{
bucket: b,
stack: make([]elemRef, 0),
}
}
// Bucket retrieves a nested bucket by name.
// Returns nil if the bucket does not exist.
func (b *Bucket) Bucket(name []byte) *Bucket {
if b.buckets != nil {
if child := b.buckets[string(name)]; child != nil {
return child
}
}
// Move cursor to key.
c := b.Cursor()
k, v, flags := c.seek(name)
// Return nil if the key doesn't exist or it is not a bucket.
if !bytes.Equal(name, k) || (flags&bucketLeafFlag) == 0 {
return nil
}
// Otherwise create a bucket and cache it.
var child = b.openBucket(v)
if b.buckets != nil {
b.buckets[string(name)] = child
}
return child
}
// Helper method that re-interprets a sub-bucket value
// from a parent into a Bucket
func (b *Bucket) openBucket(value []byte) *Bucket {
var child = newBucket(b.tx)
// If this is a writable transaction then we need to copy the bucket entry.
// Read-only transactions can point directly at the mmap entry.
if b.tx.writable {
child.bucket = &bucket{}
*child.bucket = *(*bucket)(unsafe.Pointer(&value[0]))
} else {
child.bucket = (*bucket)(unsafe.Pointer(&value[0]))
}
// Save a reference to the inline page if the bucket is inline.
if child.root == 0 {
child.page = (*page)(unsafe.Pointer(&value[bucketHeaderSize]))
}
return &child
}
// CreateBucket creates a new bucket at the given key and returns the new bucket.
// Returns an error if the key already exists, if the bucket name is blank, or if the bucket name is too long.
func (b *Bucket) CreateBucket(key []byte) (*Bucket, error) {
if b.tx.db == nil {
return nil, ErrTxClosed
} else if !b.tx.writable {
return nil, ErrTxNotWritable
} else if len(key) == 0 {
return nil, ErrBucketNameRequired
}
// Move cursor to correct position.
c := b.Cursor()
k, _, flags := c.seek(key)
// Return an error if there is an existing key.
if bytes.Equal(key, k) {
if (flags & bucketLeafFlag) != 0 {
return nil, ErrBucketExists
} else {
return nil, ErrIncompatibleValue
}
}
// Create empty, inline bucket.
var bucket = Bucket{
bucket: &bucket{},
rootNode: &node{isLeaf: true},
FillPercent: DefaultFillPercent,
}
var value = bucket.write()
// Insert into node.
key = cloneBytes(key)
c.node().put(key, key, value, 0, bucketLeafFlag)
// Since subbuckets are not allowed on inline buckets, we need to
// dereference the inline page, if it exists. This will cause the bucket
// to be treated as a regular, non-inline bucket for the rest of the tx.
b.page = nil
return b.Bucket(key), nil
}
// CreateBucketIfNotExists creates a new bucket if it doesn't already exist and returns a reference to it.
// Returns an error if the bucket name is blank, or if the bucket name is too long.
func (b *Bucket) CreateBucketIfNotExists(key []byte) (*Bucket, error) {
child, err := b.CreateBucket(key)
if err == ErrBucketExists {
return b.Bucket(key), nil
} else if err != nil {
return nil, err
}
return child, nil
}
// DeleteBucket deletes a bucket at the given key.
// Returns an error if the bucket does not exist, or if the key represents a non-bucket value.
func (b *Bucket) DeleteBucket(key []byte) error {
if b.tx.db == nil {
return ErrTxClosed
} else if !b.Writable() {
return ErrTxNotWritable
}
// Move cursor to correct position.
c := b.Cursor()
k, _, flags := c.seek(key)
// Return an error if bucket doesn't exist or is not a bucket.
if !bytes.Equal(key, k) {
return ErrBucketNotFound
} else if (flags & bucketLeafFlag) == 0 {
return ErrIncompatibleValue
}
// Recursively delete all child buckets.
child := b.Bucket(key)
err := child.ForEach(func(k, v []byte) error {
if v == nil {
if err := child.DeleteBucket(k); err != nil {
return fmt.Errorf("delete bucket: %s", err)
}
}
return nil
})
if err != nil {
return err
}
// Remove cached copy.
delete(b.buckets, string(key))
// Release all bucket pages to freelist.
child.nodes = nil
child.rootNode = nil
child.free()
// Delete the node if we have a matching key.
c.node().del(key)
return nil
}
// Get retrieves the value for a key in the bucket.
// Returns a nil value if the key does not exist or if the key is a nested bucket.
func (b *Bucket) Get(key []byte) []byte {
k, v, flags := b.Cursor().seek(key)
// Return nil if this is a bucket.
if (flags & bucketLeafFlag) != 0 {
return nil
}
// If our target node isn't the same key as what's passed in then return nil.
if !bytes.Equal(key, k) {
return nil
}
return v
}
// Put sets the value for a key in the bucket.
// If the key exists then its previous value will be overwritten.
// Returns an error if the bucket was created from a read-only transaction, if the key is blank, if the key is too large, or if the value is too large.
func (b *Bucket) Put(key []byte, value []byte) error {
if b.tx.db == nil {
return ErrTxClosed
} else if !b.Writable() {
return ErrTxNotWritable
} else if len(key) == 0 {
return ErrKeyRequired
} else if len(key) > MaxKeySize {
return ErrKeyTooLarge
} else if int64(len(value)) > MaxValueSize {
return ErrValueTooLarge
}
// Move cursor to correct position.
c := b.Cursor()
k, _, flags := c.seek(key)
// Return an error if there is an existing key with a bucket value.
if bytes.Equal(key, k) && (flags&bucketLeafFlag) != 0 {
return ErrIncompatibleValue
}
// Insert into node.
key = cloneBytes(key)
c.node().put(key, key, value, 0, 0)
return nil
}
// Delete removes a key from the bucket.
// If the key does not exist then nothing is done and a nil error is returned.
// Returns an error if the bucket was created from a read-only transaction.
func (b *Bucket) Delete(key []byte) error {
if b.tx.db == nil {
return ErrTxClosed
} else if !b.Writable() {
return ErrTxNotWritable
}
// Move cursor to correct position.
c := b.Cursor()
_, _, flags := c.seek(key)
// Return an error if there is an existing bucket value.
if (flags & bucketLeafFlag) != 0 {
return ErrIncompatibleValue
}
// Delete the node if we have a matching key.
c.node().del(key)
return nil
}
// NextSequence returns an autoincrementing integer for the bucket.
func (b *Bucket) NextSequence() (uint64, error) {
if b.tx.db == nil {
return 0, ErrTxClosed
} else if !b.Writable() {
return 0, ErrTxNotWritable
}
// Increment and return the sequence.
b.bucket.sequence++
return b.bucket.sequence, nil
}
// ForEach executes a function for each key/value pair in a bucket.
// If the provided function returns an error then the iteration is stopped and
// the error is returned to the caller.
func (b *Bucket) ForEach(fn func(k, v []byte) error) error {
if b.tx.db == nil {
return ErrTxClosed
}
c := b.Cursor()
for k, v := c.First(); k != nil; k, v = c.Next() {
if err := fn(k, v); err != nil {
return err
}
}
return nil
}
// Stats returns stats on a bucket.
func (b *Bucket) Stats() BucketStats {
var s, subStats BucketStats
pageSize := b.tx.db.pageSize
s.BucketN += 1
if b.root == 0 {
s.InlineBucketN += 1
}
b.forEachPage(func(p *page, depth int) {
if (p.flags & leafPageFlag) != 0 {
s.KeyN += int(p.count)
// used totals the used bytes for the page
used := pageHeaderSize
if p.count != 0 {
// If page has any elements, add all element headers.
used += leafPageElementSize * int(p.count-1)
// Add all element key, value sizes.
// The computation takes advantage of the fact that the position
// of the last element's key/value equals the total of the sizes
// of all previous elements' keys and values.
// It also includes the last element's header.
lastElement := p.leafPageElement(p.count - 1)
used += int(lastElement.pos + lastElement.ksize + lastElement.vsize)
}
if b.root == 0 {
// For inlined bucket just update the inline stats
s.InlineBucketInuse += used
} else {
// For non-inlined bucket update all the leaf stats
s.LeafPageN++
s.LeafInuse += used
s.LeafOverflowN += int(p.overflow)
// Collect stats from sub-buckets.
// Do that by iterating over all element headers
// looking for the ones with the bucketLeafFlag.
for i := uint16(0); i < p.count; i++ {
e := p.leafPageElement(i)
if (e.flags & bucketLeafFlag) != 0 {
// For any bucket element, open the element value
// and recursively call Stats on the contained bucket.
subStats.Add(b.openBucket(e.value()).Stats())
}
}
}
} else if (p.flags & branchPageFlag) != 0 {
s.BranchPageN++
lastElement := p.branchPageElement(p.count - 1)
// used totals the used bytes for the page
// Add header and all element headers.
used := pageHeaderSize + (branchPageElementSize * int(p.count-1))
// Add size of all keys and values.
// Again, use the fact that the last element's position equals
// the total of key, value sizes of all previous elements.
used += int(lastElement.pos + lastElement.ksize)
s.BranchInuse += used
s.BranchOverflowN += int(p.overflow)
}
// Keep track of maximum page depth.
if depth+1 > s.Depth {
s.Depth = (depth + 1)
}
})
// Alloc stats can be computed from page counts and pageSize.
s.BranchAlloc = (s.BranchPageN + s.BranchOverflowN) * pageSize
s.LeafAlloc = (s.LeafPageN + s.LeafOverflowN) * pageSize
// Add the max depth of sub-buckets to get total nested depth.
s.Depth += subStats.Depth
// Add the stats for all sub-buckets
s.Add(subStats)
return s
}
// forEachPage iterates over every page in a bucket, including inline pages.
func (b *Bucket) forEachPage(fn func(*page, int)) {
// If we have an inline page then just use that.
if b.page != nil {
fn(b.page, 0)
return
}
// Otherwise traverse the page hierarchy.
b.tx.forEachPage(b.root, 0, fn)
}
// forEachPageNode iterates over every page (or node) in a bucket.
// This also includes inline pages.
func (b *Bucket) forEachPageNode(fn func(*page, *node, int)) {
// If we have an inline page or root node then just use that.
if b.page != nil {
fn(b.page, nil, 0)
return
}
b._forEachPageNode(b.root, 0, fn)
}
func (b *Bucket) _forEachPageNode(pgid pgid, depth int, fn func(*page, *node, int)) {
var p, n = b.pageNode(pgid)
// Execute function.
fn(p, n, depth)
// Recursively loop over children.
if p != nil {
if (p.flags & branchPageFlag) != 0 {
for i := 0; i < int(p.count); i++ {
elem := p.branchPageElement(uint16(i))
b._forEachPageNode(elem.pgid, depth+1, fn)
}
}
} else {
if !n.isLeaf {
for _, inode := range n.inodes {
b._forEachPageNode(inode.pgid, depth+1, fn)
}
}
}
}
// spill writes all the nodes for this bucket to dirty pages.
func (b *Bucket) spill() error {
// Spill all child buckets first.
for name, child := range b.buckets {
// If the child bucket is small enough and it has no child buckets then
// write it inline into the parent bucket's page. Otherwise spill it
// like a normal bucket and make the parent value a pointer to the page.
var value []byte
if child.inlineable() {
child.free()
value = child.write()
} else {
if err := child.spill(); err != nil {
return err
}
// Update the child bucket header in this bucket.
value = make([]byte, unsafe.Sizeof(bucket{}))
var bucket = (*bucket)(unsafe.Pointer(&value[0]))
*bucket = *child.bucket
}
// Skip writing the bucket if there are no materialized nodes.
if child.rootNode == nil {
continue
}
// Update parent node.
var c = b.Cursor()
k, _, flags := c.seek([]byte(name))
_assert(bytes.Equal([]byte(name), k), "misplaced bucket header: %x -> %x", []byte(name), k)
_assert(flags&bucketLeafFlag != 0, "unexpected bucket header flag: %x", flags)
c.node().put([]byte(name), []byte(name), value, 0, bucketLeafFlag)
}
// Ignore if there's not a materialized root node.
if b.rootNode == nil {
return nil
}
// Spill nodes.
if err := b.rootNode.spill(); err != nil {
return err
}
b.rootNode = b.rootNode.root()
// Update the root node for this bucket.
_assert(b.rootNode.pgid < b.tx.meta.pgid, "pgid (%d) above high water mark (%d)", b.rootNode.pgid, b.tx.meta.pgid)
b.root = b.rootNode.pgid
return nil
}
// inlineable returns true if a bucket is small enough to be written inline
// and if it contains no subbuckets. Otherwise returns false.
func (b *Bucket) inlineable() bool {
var n = b.rootNode
// Bucket must only contain a single leaf node.
if n == nil || !n.isLeaf {
return false
}
// Bucket is not inlineable if it contains subbuckets or if it goes beyond
// our threshold for inline bucket size.
var size = pageHeaderSize
for _, inode := range n.inodes {
size += leafPageElementSize + len(inode.key) + len(inode.value)
if inode.flags&bucketLeafFlag != 0 {
return false
} else if size > b.maxInlineBucketSize() {
return false
}
}
return true
}
// Returns the maximum total size of a bucket to make it a candidate for inlining.
func (b *Bucket) maxInlineBucketSize() int {
return b.tx.db.pageSize / 4
}
// write allocates and writes a bucket to a byte slice.
func (b *Bucket) write() []byte {
// Allocate the appropriate size.
var n = b.rootNode
var value = make([]byte, bucketHeaderSize+n.size())
// Write a bucket header.
var bucket = (*bucket)(unsafe.Pointer(&value[0]))
*bucket = *b.bucket
// Convert byte slice to a fake page and write the root node.
var p = (*page)(unsafe.Pointer(&value[bucketHeaderSize]))
n.write(p)
return value
}
// rebalance attempts to balance all nodes.
func (b *Bucket) rebalance() {
for _, n := range b.nodes {
n.rebalance()
}
for _, child := range b.buckets {
child.rebalance()
}
}
// node creates a node from a page and associates it with a given parent.
func (b *Bucket) node(pgid pgid, parent *node) *node {
_assert(b.nodes != nil, "nodes map expected")
// Retrieve node if it's already been created.
if n := b.nodes[pgid]; n != nil {
return n
}
// Otherwise create a node and cache it.
n := &node{bucket: b, parent: parent}
if parent == nil {
b.rootNode = n
} else {
parent.children = append(parent.children, n)
}
// Use the inline page if this is an inline bucket.
var p = b.page
if p == nil {
p = b.tx.page(pgid)
}
// Read the page into the node and cache it.
n.read(p)
b.nodes[pgid] = n
// Update statistics.
b.tx.stats.NodeCount++
return n
}
// free recursively frees all pages in the bucket.
func (b *Bucket) free() {
if b.root == 0 {
return
}
var tx = b.tx
b.forEachPageNode(func(p *page, n *node, _ int) {
if p != nil {
tx.db.freelist.free(tx.meta.txid, p)
} else {
n.free()
}
})
b.root = 0
}
// dereference removes all references to the old mmap.
func (b *Bucket) dereference() {
if b.rootNode != nil {
b.rootNode.root().dereference()
}
for _, child := range b.buckets {
child.dereference()
}
}
// pageNode returns the in-memory node, if it exists.
// Otherwise returns the underlying page.
func (b *Bucket) pageNode(id pgid) (*page, *node) {
// Inline buckets have a fake page embedded in their value so treat them
// differently. We'll return the rootNode (if available) or the fake page.
if b.root == 0 {
_assert(id == 0, "inline bucket non-zero page access(2): %d != 0", id)
if b.rootNode != nil {
return nil, b.rootNode
}
return b.page, nil
}
// Check the node cache for non-inline buckets.
if b.nodes != nil {
if n := b.nodes[id]; n != nil {
return nil, n
}
}
// Finally lookup the page from the transaction if no node is materialized.
return b.tx.page(id), nil
}
// BucketStats records statistics about resources used by a bucket.
type BucketStats struct {
// Page count statistics.
BranchPageN int // number of logical branch pages
BranchOverflowN int // number of physical branch overflow pages
LeafPageN int // number of logical leaf pages
LeafOverflowN int // number of physical leaf overflow pages
// Tree statistics.
KeyN int // number of keys/value pairs
Depth int // number of levels in B+tree
// Page size utilization.
BranchAlloc int // bytes allocated for physical branch pages
BranchInuse int // bytes actually used for branch data
LeafAlloc int // bytes allocated for physical leaf pages
LeafInuse int // bytes actually used for leaf data
// Bucket statistics
BucketN int // total number of buckets including the top bucket
InlineBucketN int // total number of inlined buckets
InlineBucketInuse int // bytes used for inlined buckets (also accounted for in LeafInuse)
}
func (s *BucketStats) Add(other BucketStats) {
s.BranchPageN += other.BranchPageN
s.BranchOverflowN += other.BranchOverflowN
s.LeafPageN += other.LeafPageN
s.LeafOverflowN += other.LeafOverflowN
s.KeyN += other.KeyN
if s.Depth < other.Depth {
s.Depth = other.Depth
}
s.BranchAlloc += other.BranchAlloc
s.BranchInuse += other.BranchInuse
s.LeafAlloc += other.LeafAlloc
s.LeafInuse += other.LeafInuse
s.BucketN += other.BucketN
s.InlineBucketN += other.InlineBucketN
s.InlineBucketInuse += other.InlineBucketInuse
}
// cloneBytes returns a copy of a given slice.
func cloneBytes(v []byte) []byte {
var clone = make([]byte, len(v))
copy(clone, v)
return clone
}


@@ -0,0 +1,376 @@
package bolt
import (
"bytes"
"sort"
)
// Cursor represents an iterator that can traverse over all key/value pairs in a bucket in sorted order.
// Cursors see nested buckets with value == nil.
// Cursors can be obtained from a transaction and are valid as long as the transaction is open.
//
// Changing data while traversing with a cursor may cause it to be invalidated
// and return unexpected keys and/or values. You must reposition your cursor
// after mutating data.
type Cursor struct {
bucket *Bucket
stack []elemRef
}
// Bucket returns the bucket that this cursor was created from.
func (c *Cursor) Bucket() *Bucket {
return c.bucket
}
// First moves the cursor to the first item in the bucket and returns its key and value.
// If the bucket is empty then a nil key and value are returned.
func (c *Cursor) First() (key []byte, value []byte) {
_assert(c.bucket.tx.db != nil, "tx closed")
c.stack = c.stack[:0]
p, n := c.bucket.pageNode(c.bucket.root)
c.stack = append(c.stack, elemRef{page: p, node: n, index: 0})
c.first()
k, v, flags := c.keyValue()
if (flags & uint32(bucketLeafFlag)) != 0 {
return k, nil
}
return k, v
}
// Last moves the cursor to the last item in the bucket and returns its key and value.
// If the bucket is empty then a nil key and value are returned.
func (c *Cursor) Last() (key []byte, value []byte) {
_assert(c.bucket.tx.db != nil, "tx closed")
c.stack = c.stack[:0]
p, n := c.bucket.pageNode(c.bucket.root)
ref := elemRef{page: p, node: n}
ref.index = ref.count() - 1
c.stack = append(c.stack, ref)
c.last()
k, v, flags := c.keyValue()
if (flags & uint32(bucketLeafFlag)) != 0 {
return k, nil
}
return k, v
}
// Next moves the cursor to the next item in the bucket and returns its key and value.
// If the cursor is at the end of the bucket then a nil key and value are returned.
func (c *Cursor) Next() (key []byte, value []byte) {
_assert(c.bucket.tx.db != nil, "tx closed")
k, v, flags := c.next()
if (flags & uint32(bucketLeafFlag)) != 0 {
return k, nil
}
return k, v
}
// Prev moves the cursor to the previous item in the bucket and returns its key and value.
// If the cursor is at the beginning of the bucket then a nil key and value are returned.
func (c *Cursor) Prev() (key []byte, value []byte) {
_assert(c.bucket.tx.db != nil, "tx closed")
// Attempt to move back one element until we're successful.
// Move up the stack as we hit the beginning of each page in our stack.
for i := len(c.stack) - 1; i >= 0; i-- {
elem := &c.stack[i]
if elem.index > 0 {
elem.index--
break
}
c.stack = c.stack[:i]
}
// If we've hit the end then return nil.
if len(c.stack) == 0 {
return nil, nil
}
// Move down the stack to find the last element of the last leaf under this branch.
c.last()
k, v, flags := c.keyValue()
if (flags & uint32(bucketLeafFlag)) != 0 {
return k, nil
}
return k, v
}
// Seek moves the cursor to a given key and returns it.
// If the key does not exist then the next key is used. If no keys
// follow, a nil key is returned.
func (c *Cursor) Seek(seek []byte) (key []byte, value []byte) {
k, v, flags := c.seek(seek)
// If we ended up after the last element of a page then move to the next one.
if ref := &c.stack[len(c.stack)-1]; ref.index >= ref.count() {
k, v, flags = c.next()
}
if k == nil {
return nil, nil
} else if (flags & uint32(bucketLeafFlag)) != 0 {
return k, nil
}
return k, v
}
// Delete removes the current key/value under the cursor from the bucket.
// Delete fails if current key/value is a bucket or if the transaction is not writable.
func (c *Cursor) Delete() error {
if c.bucket.tx.db == nil {
return ErrTxClosed
} else if !c.bucket.Writable() {
return ErrTxNotWritable
}
key, _, flags := c.keyValue()
// Return an error if current value is a bucket.
if (flags & bucketLeafFlag) != 0 {
return ErrIncompatibleValue
}
c.node().del(key)
return nil
}
// seek moves the cursor to a given key and returns it.
// If the key does not exist then the next key is used.
func (c *Cursor) seek(seek []byte) (key []byte, value []byte, flags uint32) {
_assert(c.bucket.tx.db != nil, "tx closed")
// Start from root page/node and traverse to correct page.
c.stack = c.stack[:0]
c.search(seek, c.bucket.root)
ref := &c.stack[len(c.stack)-1]
// If the cursor is pointing to the end of page/node then return nil.
if ref.index >= ref.count() {
return nil, nil, 0
}
// If this is a bucket then return a nil value.
return c.keyValue()
}
// first moves the cursor to the first leaf element under the last page in the stack.
func (c *Cursor) first() {
for {
// Exit when we hit a leaf page.
var ref = &c.stack[len(c.stack)-1]
if ref.isLeaf() {
break
}
// Keep adding pages pointing to the first element to the stack.
var pgid pgid
if ref.node != nil {
pgid = ref.node.inodes[ref.index].pgid
} else {
pgid = ref.page.branchPageElement(uint16(ref.index)).pgid
}
p, n := c.bucket.pageNode(pgid)
c.stack = append(c.stack, elemRef{page: p, node: n, index: 0})
}
}
// last moves the cursor to the last leaf element under the last page in the stack.
func (c *Cursor) last() {
for {
// Exit when we hit a leaf page.
ref := &c.stack[len(c.stack)-1]
if ref.isLeaf() {
break
}
// Keep adding pages pointing to the last element in the stack.
var pgid pgid
if ref.node != nil {
pgid = ref.node.inodes[ref.index].pgid
} else {
pgid = ref.page.branchPageElement(uint16(ref.index)).pgid
}
p, n := c.bucket.pageNode(pgid)
var nextRef = elemRef{page: p, node: n}
nextRef.index = nextRef.count() - 1
c.stack = append(c.stack, nextRef)
}
}
// next moves to the next leaf element and returns the key and value.
// If the cursor is at the last leaf element then it stays there and returns nil.
func (c *Cursor) next() (key []byte, value []byte, flags uint32) {
// Attempt to move over one element until we're successful.
// Move up the stack as we hit the end of each page in our stack.
var i int
for i = len(c.stack) - 1; i >= 0; i-- {
elem := &c.stack[i]
if elem.index < elem.count()-1 {
elem.index++
break
}
}
// If we've hit the root page then stop and return. This will leave the
// cursor on the last element of the last page.
if i == -1 {
return nil, nil, 0
}
// Otherwise start from where we left off in the stack and find the
// first element of the first leaf page.
c.stack = c.stack[:i+1]
c.first()
return c.keyValue()
}
// search recursively performs a binary search against a given page/node until it finds a given key.
func (c *Cursor) search(key []byte, pgid pgid) {
p, n := c.bucket.pageNode(pgid)
if p != nil {
_assert((p.flags&(branchPageFlag|leafPageFlag)) != 0, "invalid page type: %d: %x", p.id, p.flags)
}
e := elemRef{page: p, node: n}
c.stack = append(c.stack, e)
// If we're on a leaf page/node then find the specific node.
if e.isLeaf() {
c.nsearch(key)
return
}
if n != nil {
c.searchNode(key, n)
return
}
c.searchPage(key, p)
}
func (c *Cursor) searchNode(key []byte, n *node) {
var exact bool
index := sort.Search(len(n.inodes), func(i int) bool {
// TODO(benbjohnson): Optimize this range search. It's a bit hacky right now.
// sort.Search() finds the lowest index where f() != -1 but we need the highest index.
ret := bytes.Compare(n.inodes[i].key, key)
if ret == 0 {
exact = true
}
return ret != -1
})
if !exact && index > 0 {
index--
}
c.stack[len(c.stack)-1].index = index
// Recursively search to the next page.
c.search(key, n.inodes[index].pgid)
}
func (c *Cursor) searchPage(key []byte, p *page) {
// Binary search for the correct range.
inodes := p.branchPageElements()
var exact bool
index := sort.Search(int(p.count), func(i int) bool {
// TODO(benbjohnson): Optimize this range search. It's a bit hacky right now.
// sort.Search() finds the lowest index where f() != -1 but we need the highest index.
ret := bytes.Compare(inodes[i].key(), key)
if ret == 0 {
exact = true
}
return ret != -1
})
if !exact && index > 0 {
index--
}
c.stack[len(c.stack)-1].index = index
// Recursively search to the next page.
c.search(key, inodes[index].pgid)
}
// nsearch searches the leaf node on the top of the stack for a key.
func (c *Cursor) nsearch(key []byte) {
e := &c.stack[len(c.stack)-1]
p, n := e.page, e.node
// If we have a node then search its inodes.
if n != nil {
index := sort.Search(len(n.inodes), func(i int) bool {
return bytes.Compare(n.inodes[i].key, key) != -1
})
e.index = index
return
}
// If we have a page then search its leaf elements.
inodes := p.leafPageElements()
index := sort.Search(int(p.count), func(i int) bool {
return bytes.Compare(inodes[i].key(), key) != -1
})
e.index = index
}
// keyValue returns the key and value of the current leaf element.
func (c *Cursor) keyValue() ([]byte, []byte, uint32) {
ref := &c.stack[len(c.stack)-1]
if ref.count() == 0 || ref.index >= ref.count() {
return nil, nil, 0
}
// Retrieve value from node.
if ref.node != nil {
inode := &ref.node.inodes[ref.index]
return inode.key, inode.value, inode.flags
}
// Or retrieve value from page.
elem := ref.page.leafPageElement(uint16(ref.index))
return elem.key(), elem.value(), elem.flags
}
// node returns the node that the cursor is currently positioned on.
func (c *Cursor) node() *node {
_assert(len(c.stack) > 0, "accessing a node with a zero-length cursor stack")
// If the top of the stack is a leaf node then just return it.
if ref := &c.stack[len(c.stack)-1]; ref.node != nil && ref.isLeaf() {
return ref.node
}
// Start from root and traverse down the hierarchy.
var n = c.stack[0].node
if n == nil {
n = c.bucket.node(c.stack[0].page.id, nil)
}
for _, ref := range c.stack[:len(c.stack)-1] {
_assert(!n.isLeaf, "expected branch node")
n = n.childAt(int(ref.index))
}
_assert(n.isLeaf, "expected leaf node")
return n
}
// elemRef represents a reference to an element on a given page/node.
type elemRef struct {
page *page
node *node
index int
}
// isLeaf returns whether the ref is pointing at a leaf page/node.
func (r *elemRef) isLeaf() bool {
if r.node != nil {
return r.node.isLeaf
}
return (r.page.flags & leafPageFlag) != 0
}
// count returns the number of inodes or page elements.
func (r *elemRef) count() int {
if r.node != nil {
return len(r.node.inodes)
}
return int(r.page.count)
}

689
vendor/src/github.com/boltdb/bolt/db.go vendored Normal file
View file

@ -0,0 +1,689 @@
package bolt
import (
"fmt"
"hash/fnv"
"os"
"runtime"
"runtime/debug"
"strings"
"sync"
"time"
"unsafe"
)
// The smallest size that the mmap can be.
const minMmapSize = 1 << 22 // 4MB
// The largest step that can be taken when remapping the mmap.
const maxMmapStep = 1 << 30 // 1GB
// The data file format version.
const version = 2
// Represents a marker value to indicate that a file is a Bolt DB.
const magic uint32 = 0xED0CDAED
// IgnoreNoSync specifies whether the NoSync field of a DB is ignored when
// syncing changes to a file. This is required as some operating systems,
// such as OpenBSD, do not have a unified buffer cache (UBC) and writes
// must be synchronized using the msync(2) syscall.
const IgnoreNoSync = runtime.GOOS == "openbsd"
// DB represents a collection of buckets persisted to a file on disk.
// All data access is performed through transactions which can be obtained through the DB.
// All the functions on DB will return an ErrDatabaseNotOpen if accessed before Open() is called.
type DB struct {
// When enabled, the database will perform a Check() after every commit.
// A panic is issued if the database is in an inconsistent state. This
// flag has a large performance impact so it should only be used for
// debugging purposes.
StrictMode bool
// Setting the NoSync flag will cause the database to skip fsync()
// calls after each commit. This can be useful when bulk loading data
// into a database and you can restart the bulk load in the event of
// a system failure or database corruption. Do not set this flag for
// normal use.
//
// If the package global IgnoreNoSync constant is true, this value is
// ignored. See the comment on that constant for more details.
//
// THIS IS UNSAFE. PLEASE USE WITH CAUTION.
NoSync bool
path string
file *os.File
dataref []byte
data *[maxMapSize]byte
datasz int
meta0 *meta
meta1 *meta
pageSize int
opened bool
rwtx *Tx
txs []*Tx
freelist *freelist
stats Stats
rwlock sync.Mutex // Allows only one writer at a time.
metalock sync.Mutex // Protects meta page access.
mmaplock sync.RWMutex // Protects mmap access during remapping.
statlock sync.RWMutex // Protects stats access.
ops struct {
writeAt func(b []byte, off int64) (n int, err error)
}
}
// Path returns the path to the currently open database file.
func (db *DB) Path() string {
return db.path
}
// GoString returns the Go string representation of the database.
func (db *DB) GoString() string {
return fmt.Sprintf("bolt.DB{path:%q}", db.path)
}
// String returns the string representation of the database.
func (db *DB) String() string {
return fmt.Sprintf("DB<%q>", db.path)
}
// Open creates and opens a database at the given path.
// If the file does not exist then it will be created automatically.
// Passing in nil options will cause Bolt to open the database with the default options.
func Open(path string, mode os.FileMode, options *Options) (*DB, error) {
var db = &DB{opened: true}
// Set default options if no options are provided.
if options == nil {
options = DefaultOptions
}
// Open data file and separate sync handler for metadata writes.
db.path = path
var err error
if db.file, err = os.OpenFile(db.path, os.O_RDWR|os.O_CREATE, mode); err != nil {
_ = db.close()
return nil, err
}
// Lock file so that other processes using Bolt cannot use the database
// at the same time. This would cause corruption since the two processes
// would write meta pages and free pages separately.
if err := flock(db.file, options.Timeout); err != nil {
_ = db.close()
return nil, err
}
// Default values for test hooks.
db.ops.writeAt = db.file.WriteAt
// Initialize the database if it doesn't exist.
if info, err := db.file.Stat(); err != nil {
return nil, fmt.Errorf("stat error: %s", err)
} else if info.Size() == 0 {
// Initialize new files with meta pages.
if err := db.init(); err != nil {
return nil, err
}
} else {
// Read the first meta page to determine the page size.
var buf [0x1000]byte
if _, err := db.file.ReadAt(buf[:], 0); err == nil {
m := db.pageInBuffer(buf[:], 0).meta()
if err := m.validate(); err != nil {
return nil, fmt.Errorf("meta0 error: %s", err)
}
db.pageSize = int(m.pageSize)
}
}
// Memory map the data file.
if err := db.mmap(0); err != nil {
_ = db.close()
return nil, err
}
// Read in the freelist.
db.freelist = newFreelist()
db.freelist.read(db.page(db.meta().freelist))
// Mark the database as opened and return.
return db, nil
}
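As a usage sketch (the path and error handling are illustrative, not taken from this file), opening and closing a database with default options looks like:

package main

import (
	"log"

	"github.com/boltdb/bolt"
)

func main() {
	// Passing nil options selects DefaultOptions: wait indefinitely for the file lock.
	db, err := bolt.Open("/tmp/example.db", 0600, nil)
	if err != nil {
		log.Fatal(err)
	}
	defer db.Close()
}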
// mmap opens the underlying memory-mapped file and initializes the meta references.
// minsz is the minimum size that the new mmap can be.
func (db *DB) mmap(minsz int) error {
db.mmaplock.Lock()
defer db.mmaplock.Unlock()
// Dereference all mmap references before unmapping.
if db.rwtx != nil {
db.rwtx.root.dereference()
}
// Unmap existing data before continuing.
if err := db.munmap(); err != nil {
return err
}
info, err := db.file.Stat()
if err != nil {
return fmt.Errorf("mmap stat error: %s", err)
} else if int(info.Size()) < db.pageSize*2 {
return fmt.Errorf("file size too small")
}
// Ensure the size is at least the minimum size.
var size = int(info.Size())
if size < minsz {
size = minsz
}
size = db.mmapSize(size)
// Memory-map the data file as a byte slice.
if err := mmap(db, size); err != nil {
return err
}
// Save references to the meta pages.
db.meta0 = db.page(0).meta()
db.meta1 = db.page(1).meta()
// Validate the meta pages.
if err := db.meta0.validate(); err != nil {
return fmt.Errorf("meta0 error: %s", err)
}
if err := db.meta1.validate(); err != nil {
return fmt.Errorf("meta1 error: %s", err)
}
return nil
}
// munmap unmaps the data file from memory.
func (db *DB) munmap() error {
if err := munmap(db); err != nil {
return fmt.Errorf("unmap error: " + err.Error())
}
return nil
}
// mmapSize determines the appropriate size for the mmap given the current size
// of the database. The minimum size is 4MB and doubles until it reaches 1GB.
func (db *DB) mmapSize(size int) int {
if size <= minMmapSize {
return minMmapSize
} else if size < maxMmapStep {
size *= 2
} else {
size += maxMmapStep
}
// Ensure that the mmap size is a multiple of the page size.
if (size % db.pageSize) != 0 {
size = ((size / db.pageSize) + 1) * db.pageSize
}
return size
}
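For example, with a 4KB page size a 6MB data file maps to 12MB (doubled, already a multiple of the page size), while a 1.5GB file grows by a single 1GB step to 2.5GB.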
// init creates a new database file and initializes its meta pages.
func (db *DB) init() error {
// Set the page size to the OS page size.
db.pageSize = os.Getpagesize()
// Create two meta pages on a buffer.
buf := make([]byte, db.pageSize*4)
for i := 0; i < 2; i++ {
p := db.pageInBuffer(buf[:], pgid(i))
p.id = pgid(i)
p.flags = metaPageFlag
// Initialize the meta page.
m := p.meta()
m.magic = magic
m.version = version
m.pageSize = uint32(db.pageSize)
m.freelist = 2
m.root = bucket{root: 3}
m.pgid = 4
m.txid = txid(i)
}
// Write an empty freelist at page id 2.
p := db.pageInBuffer(buf[:], pgid(2))
p.id = pgid(2)
p.flags = freelistPageFlag
p.count = 0
// Write an empty leaf page at page id 3.
p = db.pageInBuffer(buf[:], pgid(3))
p.id = pgid(3)
p.flags = leafPageFlag
p.count = 0
// Write the buffer to our data file.
if _, err := db.ops.writeAt(buf, 0); err != nil {
return err
}
if err := fdatasync(db); err != nil {
return err
}
return nil
}
// Close releases all database resources.
// All transactions must be closed before closing the database.
func (db *DB) Close() error {
db.metalock.Lock()
defer db.metalock.Unlock()
return db.close()
}
func (db *DB) close() error {
db.opened = false
db.freelist = nil
db.path = ""
// Clear ops.
db.ops.writeAt = nil
// Close the mmap.
if err := db.munmap(); err != nil {
return err
}
// Close file handles.
if db.file != nil {
// Unlock the file.
_ = funlock(db.file)
// Close the file descriptor.
if err := db.file.Close(); err != nil {
return fmt.Errorf("db file close: %s", err)
}
db.file = nil
}
return nil
}
// Begin starts a new transaction.
// Multiple read-only transactions can be used concurrently but only one
// write transaction can be used at a time. Starting multiple write transactions
// will cause the calls to block and be serialized until the current write
// transaction finishes.
//
// IMPORTANT: You must close read-only transactions after you are finished or
// else the database will not reclaim old pages.
func (db *DB) Begin(writable bool) (*Tx, error) {
if writable {
return db.beginRWTx()
}
return db.beginTx()
}
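A minimal sketch of a manually managed read-only transaction, reusing the imports from the Open sketch above (the "widgets" bucket name is hypothetical):

func readManually(db *bolt.DB) error {
	tx, err := db.Begin(false) // read-only
	if err != nil {
		return err
	}
	// Read-only transactions must be rolled back, not committed, so the
	// writer can eventually reclaim the pages they reference.
	defer tx.Rollback()

	_ = tx.Bucket([]byte("widgets")) // nil if the bucket does not exist
	return nil
}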
func (db *DB) beginTx() (*Tx, error) {
// Lock the meta pages while we initialize the transaction. We obtain
// the meta lock before the mmap lock because that's the order that the
// write transaction will obtain them.
db.metalock.Lock()
// Obtain a read-only lock on the mmap. When the mmap is remapped it will
// obtain a write lock so all transactions must finish before it can be
// remapped.
db.mmaplock.RLock()
// Exit if the database is not open yet.
if !db.opened {
db.mmaplock.RUnlock()
db.metalock.Unlock()
return nil, ErrDatabaseNotOpen
}
// Create a transaction associated with the database.
t := &Tx{}
t.init(db)
// Keep track of transaction until it closes.
db.txs = append(db.txs, t)
n := len(db.txs)
// Unlock the meta pages.
db.metalock.Unlock()
// Update the transaction stats.
db.statlock.Lock()
db.stats.TxN++
db.stats.OpenTxN = n
db.statlock.Unlock()
return t, nil
}
func (db *DB) beginRWTx() (*Tx, error) {
// Obtain writer lock. This is released by the transaction when it closes.
// This enforces only one writer transaction at a time.
db.rwlock.Lock()
// Once we have the writer lock then we can lock the meta pages so that
// we can set up the transaction.
db.metalock.Lock()
defer db.metalock.Unlock()
// Exit if the database is not open yet.
if !db.opened {
db.rwlock.Unlock()
return nil, ErrDatabaseNotOpen
}
// Create a transaction associated with the database.
t := &Tx{writable: true}
t.init(db)
db.rwtx = t
// Free any pages associated with closed read-only transactions.
var minid txid = 0xFFFFFFFFFFFFFFFF
for _, t := range db.txs {
if t.meta.txid < minid {
minid = t.meta.txid
}
}
if minid > 0 {
db.freelist.release(minid - 1)
}
return t, nil
}
// removeTx removes a transaction from the database.
func (db *DB) removeTx(tx *Tx) {
// Release the read lock on the mmap.
db.mmaplock.RUnlock()
// Use the meta lock to restrict access to the DB object.
db.metalock.Lock()
// Remove the transaction.
for i, t := range db.txs {
if t == tx {
db.txs = append(db.txs[:i], db.txs[i+1:]...)
break
}
}
n := len(db.txs)
// Unlock the meta pages.
db.metalock.Unlock()
// Merge statistics.
db.statlock.Lock()
db.stats.OpenTxN = n
db.stats.TxStats.add(&tx.stats)
db.statlock.Unlock()
}
// Update executes a function within the context of a read-write managed transaction.
// If no error is returned from the function then the transaction is committed.
// If an error is returned then the entire transaction is rolled back.
// Any error that is returned from the function or returned from the commit is
// returned from the Update() method.
//
// Attempting to manually commit or rollback within the function will cause a panic.
func (db *DB) Update(fn func(*Tx) error) error {
t, err := db.Begin(true)
if err != nil {
return err
}
// Make sure the transaction rolls back in the event of a panic.
defer func() {
if t.db != nil {
t.rollback()
}
}()
// Mark as a managed tx so that the inner function cannot manually commit.
t.managed = true
// If an error is returned from the function then rollback and return error.
err = fn(t)
t.managed = false
if err != nil {
_ = t.Rollback()
return err
}
return t.Commit()
}
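Continuing the Open sketch above (bucket and key names hypothetical), a managed read-write transaction looks like:

err = db.Update(func(tx *bolt.Tx) error {
	b, err := tx.CreateBucketIfNotExists([]byte("widgets"))
	if err != nil {
		return err
	}
	// Returning nil commits; returning any error rolls the whole transaction back.
	return b.Put([]byte("foo"), []byte("bar"))
})
if err != nil {
	log.Fatal(err)
}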
// View executes a function within the context of a managed read-only transaction.
// Any error that is returned from the function is returned from the View() method.
//
// Attempting to manually rollback within the function will cause a panic.
func (db *DB) View(fn func(*Tx) error) error {
t, err := db.Begin(false)
if err != nil {
return err
}
// Make sure the transaction rolls back in the event of a panic.
defer func() {
if t.db != nil {
t.rollback()
}
}()
// Mark as a managed tx so that the inner function cannot manually rollback.
t.managed = true
// If an error is returned from the function then pass it through.
err = fn(t)
t.managed = false
if err != nil {
_ = t.Rollback()
return err
}
if err := t.Rollback(); err != nil {
return err
}
return nil
}
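And the read-only counterpart, again with hypothetical names:

err = db.View(func(tx *bolt.Tx) error {
	b := tx.Bucket([]byte("widgets"))
	if b == nil {
		return bolt.ErrBucketNotFound
	}
	log.Printf("foo=%s", b.Get([]byte("foo")))
	return nil
})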
// Stats retrieves ongoing performance stats for the database.
// This is only updated when a transaction closes.
func (db *DB) Stats() Stats {
db.statlock.RLock()
defer db.statlock.RUnlock()
return db.stats
}
// Info provides internal access to the raw data bytes from the C cursor; use
// carefully, or not at all.
func (db *DB) Info() *Info {
return &Info{uintptr(unsafe.Pointer(&db.data[0])), db.pageSize}
}
// page retrieves a page reference from the mmap based on the current page size.
func (db *DB) page(id pgid) *page {
pos := id * pgid(db.pageSize)
return (*page)(unsafe.Pointer(&db.data[pos]))
}
// pageInBuffer retrieves a page reference from a given byte array based on the current page size.
func (db *DB) pageInBuffer(b []byte, id pgid) *page {
return (*page)(unsafe.Pointer(&b[id*pgid(db.pageSize)]))
}
// meta retrieves the current meta page reference.
func (db *DB) meta() *meta {
if db.meta0.txid > db.meta1.txid {
return db.meta0
}
return db.meta1
}
// allocate returns a contiguous block of memory starting at a given page.
func (db *DB) allocate(count int) (*page, error) {
// Allocate a temporary buffer for the page.
buf := make([]byte, count*db.pageSize)
p := (*page)(unsafe.Pointer(&buf[0]))
p.overflow = uint32(count - 1)
// Use pages from the freelist if they are available.
if p.id = db.freelist.allocate(count); p.id != 0 {
return p, nil
}
// Resize mmap() if we're at the end.
p.id = db.rwtx.meta.pgid
var minsz = int((p.id+pgid(count))+1) * db.pageSize
if minsz >= db.datasz {
if err := db.mmap(minsz); err != nil {
return nil, fmt.Errorf("mmap allocate error: %s", err)
}
}
// Move the page id high water mark.
db.rwtx.meta.pgid += pgid(count)
return p, nil
}
// Options represents the options that can be set when opening a database.
type Options struct {
// Timeout is the amount of time to wait to obtain a file lock.
// When set to zero it will wait indefinitely. This option is only
// available on Darwin and Linux.
Timeout time.Duration
}
// DefaultOptions represents the options used if nil options are passed into Open().
// No timeout is used which will cause Bolt to wait indefinitely for a lock.
var DefaultOptions = &Options{
Timeout: 0,
}
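A sketch of opening with an explicit lock timeout instead of the default indefinite wait (adds a time import; path hypothetical):

db, err := bolt.Open("/tmp/example.db", 0600, &bolt.Options{Timeout: time.Second})
if err == bolt.ErrTimeout {
	// Another process held the file lock for more than a second.
	return
}
defer db.Close()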
// Stats represents statistics about the database.
type Stats struct {
// Freelist stats
FreePageN int // total number of free pages on the freelist
PendingPageN int // total number of pending pages on the freelist
FreeAlloc int // total bytes allocated in free pages
FreelistInuse int // total bytes used by the freelist
// Transaction stats
TxN int // total number of started read transactions
OpenTxN int // number of currently open read transactions
TxStats TxStats // global, ongoing stats.
}
// Sub calculates and returns the difference between two sets of database stats.
// This is useful when obtaining stats at two different points in time and
// you need the performance counters that occurred within that time span.
func (s *Stats) Sub(other *Stats) Stats {
if other == nil {
return *s
}
var diff Stats
diff.FreePageN = s.FreePageN
diff.PendingPageN = s.PendingPageN
diff.FreeAlloc = s.FreeAlloc
diff.FreelistInuse = s.FreelistInuse
diff.TxN = s.TxN - other.TxN
diff.TxStats = s.TxStats.Sub(&other.TxStats)
return diff
}
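A sketch of sampling the counters over a window, matching the direction of the subtraction above (newer stats as the receiver, older as the argument):

prev := db.Stats()
// ... run some workload ...
diff := db.Stats().Sub(&prev)
log.Printf("read transactions started in window: %d", diff.TxN)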
func (s *Stats) add(other *Stats) {
s.TxStats.add(&other.TxStats)
}
type Info struct {
Data uintptr
PageSize int
}
type meta struct {
magic uint32
version uint32
pageSize uint32
flags uint32
root bucket
freelist pgid
pgid pgid
txid txid
checksum uint64
}
// validate checks the marker bytes and version of the meta page to ensure it matches this binary.
func (m *meta) validate() error {
if m.checksum != 0 && m.checksum != m.sum64() {
return ErrChecksum
} else if m.magic != magic {
return ErrInvalid
} else if m.version != version {
return ErrVersionMismatch
}
return nil
}
// copy copies one meta object to another.
func (m *meta) copy(dest *meta) {
*dest = *m
}
// write writes the meta onto a page.
func (m *meta) write(p *page) {
_assert(m.root.root < m.pgid, "root bucket pgid (%d) above high water mark (%d)", m.root.root, m.pgid)
_assert(m.freelist < m.pgid, "freelist pgid (%d) above high water mark (%d)", m.freelist, m.pgid)
// Page id is either going to be 0 or 1, which we can determine by the transaction ID.
p.id = pgid(m.txid % 2)
p.flags |= metaPageFlag
// Calculate the checksum.
m.checksum = m.sum64()
m.copy(p.meta())
}
// sum64 generates the checksum for the meta.
func (m *meta) sum64() uint64 {
var h = fnv.New64a()
_, _ = h.Write((*[unsafe.Offsetof(meta{}.checksum)]byte)(unsafe.Pointer(m))[:])
return h.Sum64()
}
// _assert will panic with a given formatted message if the given condition is false.
func _assert(condition bool, msg string, v ...interface{}) {
if !condition {
panic(fmt.Sprintf("assertion failed: "+msg, v...))
}
}
func warn(v ...interface{}) {
fmt.Fprintln(os.Stderr, v...)
}
func warnf(msg string, v ...interface{}) {
fmt.Fprintf(os.Stderr, msg+"\n", v...)
}
func printstack() {
stack := strings.Join(strings.Split(string(debug.Stack()), "\n")[2:], "\n")
fmt.Fprintln(os.Stderr, stack)
}

44
vendor/src/github.com/boltdb/bolt/doc.go vendored Normal file
View file

@ -0,0 +1,44 @@
/*
Package bolt implements a low-level key/value store in pure Go. It supports
fully serializable transactions, ACID semantics, and lock-free MVCC with
multiple readers and a single writer. Bolt can be used for projects that
want a simple data store without the need to add large dependencies such as
Postgres or MySQL.
Bolt is a single-level, zero-copy, B+tree data store. This means that Bolt is
optimized for fast read access and does not require recovery in the event of a
system crash. Transactions which have not finished committing will simply be
rolled back in the event of a crash.
The design of Bolt is based on Howard Chu's LMDB database project.
Bolt currently works on Windows, Mac OS X, and Linux.
Basics
There are only a few types in Bolt: DB, Bucket, Tx, and Cursor. The DB is
a collection of buckets and is represented by a single file on disk. A bucket is
a collection of unique keys that are associated with values.
Transactions provide either read-only or read-write access to the database.
Read-only transactions can retrieve key/value pairs and can use Cursors to
iterate over the dataset sequentially. Read-write transactions can create and
delete buckets and can insert and remove keys. Only one read-write transaction
is allowed at a time.
Caveats
The database uses a read-only, memory-mapped data file to ensure that
applications cannot corrupt the database; however, this means that keys and
values returned from Bolt cannot be changed. Writing to a read-only byte slice
will cause Go to panic.
Keys and values retrieved from the database are only valid for the life of
the transaction. When used outside the transaction, these byte slices can
point to different data or can point to invalid memory, which will cause a panic.
*/
package bolt
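The caveat above matters in practice: to keep a value past its transaction, copy it out. A sketch assuming a "widgets" bucket with a "foo" key already exists:

var value []byte
err = db.View(func(tx *bolt.Tx) error {
	v := tx.Bucket([]byte("widgets")).Get([]byte("foo"))
	// v points into the read-only mmap; copy before the transaction closes.
	value = append([]byte(nil), v...)
	return nil
})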

66
vendor/src/github.com/boltdb/bolt/errors.go vendored Normal file
View file

@ -0,0 +1,66 @@
package bolt
import "errors"
// These errors can be returned when opening or calling methods on a DB.
var (
// ErrDatabaseNotOpen is returned when a DB instance is accessed before it
// is opened or after it is closed.
ErrDatabaseNotOpen = errors.New("database not open")
// ErrDatabaseOpen is returned when opening a database that is
// already open.
ErrDatabaseOpen = errors.New("database already open")
// ErrInvalid is returned when a data file is not a Bolt-formatted database.
ErrInvalid = errors.New("invalid database")
// ErrVersionMismatch is returned when the data file was created with a
// different version of Bolt.
ErrVersionMismatch = errors.New("version mismatch")
// ErrChecksum is returned when either meta page checksum does not match.
ErrChecksum = errors.New("checksum error")
// ErrTimeout is returned when a database cannot obtain an exclusive lock
// on the data file after the timeout passed to Open().
ErrTimeout = errors.New("timeout")
)
// These errors can occur when beginning or committing a Tx.
var (
// ErrTxNotWritable is returned when performing a write operation on a
// read-only transaction.
ErrTxNotWritable = errors.New("tx not writable")
// ErrTxClosed is returned when committing or rolling back a transaction
// that has already been committed or rolled back.
ErrTxClosed = errors.New("tx closed")
)
// These errors can occur when putting or deleting a value or a bucket.
var (
// ErrBucketNotFound is returned when trying to access a bucket that has
// not been created yet.
ErrBucketNotFound = errors.New("bucket not found")
// ErrBucketExists is returned when creating a bucket that already exists.
ErrBucketExists = errors.New("bucket already exists")
// ErrBucketNameRequired is returned when creating a bucket with a blank name.
ErrBucketNameRequired = errors.New("bucket name required")
// ErrKeyRequired is returned when inserting a zero-length key.
ErrKeyRequired = errors.New("key required")
// ErrKeyTooLarge is returned when inserting a key that is larger than MaxKeySize.
ErrKeyTooLarge = errors.New("key too large")
// ErrValueTooLarge is returned when inserting a value that is larger than MaxValueSize.
ErrValueTooLarge = errors.New("value too large")
// ErrIncompatibleValue is returned when trying to create or delete a bucket
// on an existing non-bucket key or when trying to create or delete a
// non-bucket key on an existing bucket key.
ErrIncompatibleValue = errors.New("incompatible value")
)

234
vendor/src/github.com/boltdb/bolt/freelist.go vendored Normal file
View file

@ -0,0 +1,234 @@
package bolt
import (
"sort"
"unsafe"
)
// freelist represents a list of all pages that are available for allocation.
// It also tracks pages that have been freed but are still in use by open transactions.
type freelist struct {
ids []pgid // all free and available free page ids.
pending map[txid][]pgid // mapping of soon-to-be free page ids by tx.
cache map[pgid]bool // fast lookup of all free and pending page ids.
}
// newFreelist returns an empty, initialized freelist.
func newFreelist() *freelist {
return &freelist{
pending: make(map[txid][]pgid),
cache: make(map[pgid]bool),
}
}
// size returns the size of the page after serialization.
func (f *freelist) size() int {
return pageHeaderSize + (int(unsafe.Sizeof(pgid(0))) * f.count())
}
// count returns the count of pages on the freelist.
func (f *freelist) count() int {
return f.free_count() + f.pending_count()
}
// free_count returns the count of free pages.
func (f *freelist) free_count() int {
return len(f.ids)
}
// pending_count returns the count of pending pages.
func (f *freelist) pending_count() int {
var count int
for _, list := range f.pending {
count += len(list)
}
return count
}
// all returns a list of all free ids and all pending ids in one sorted list.
func (f *freelist) all() []pgid {
ids := make([]pgid, len(f.ids))
copy(ids, f.ids)
for _, list := range f.pending {
ids = append(ids, list...)
}
sort.Sort(pgids(ids))
return ids
}
// allocate returns the starting page id of a contiguous list of pages of a given size.
// If a contiguous block cannot be found then 0 is returned.
func (f *freelist) allocate(n int) pgid {
if len(f.ids) == 0 {
return 0
}
var initial, previd pgid
for i, id := range f.ids {
_assert(id > 1, "invalid page allocation: %d", id)
// Reset initial page if this is not contiguous.
if previd == 0 || id-previd != 1 {
initial = id
}
// If we found a contiguous block then remove it and return it.
if (id-initial)+1 == pgid(n) {
// If we're allocating off the beginning then take the fast path
// and just adjust the existing slice. This will use extra memory
// temporarily but the append() in free() will realloc the slice
// as is necessary.
if (i + 1) == n {
f.ids = f.ids[i+1:]
} else {
copy(f.ids[i-n+1:], f.ids[i+1:])
f.ids = f.ids[:len(f.ids)-n]
}
// Remove from the free cache.
for i := pgid(0); i < pgid(n); i++ {
delete(f.cache, initial+i)
}
return initial
}
previd = id
}
return 0
}
// free releases a page and its overflow for a given transaction id.
// If the page is already free then a panic will occur.
func (f *freelist) free(txid txid, p *page) {
_assert(p.id > 1, "cannot free page 0 or 1: %d", p.id)
// Free page and all its overflow pages.
var ids = f.pending[txid]
for id := p.id; id <= p.id+pgid(p.overflow); id++ {
// Verify that page is not already free.
_assert(!f.cache[id], "page %d already freed", id)
// Add to the freelist and cache.
ids = append(ids, id)
f.cache[id] = true
}
f.pending[txid] = ids
}
// release moves all page ids for a transaction id (or older) to the freelist.
func (f *freelist) release(txid txid) {
for tid, ids := range f.pending {
if tid <= txid {
// Move transaction's pending pages to the available freelist.
// Don't remove from the cache since the page is still free.
f.ids = append(f.ids, ids...)
delete(f.pending, tid)
}
}
sort.Sort(pgids(f.ids))
}
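These types are unexported, so the free/release/allocate lifecycle can only be exercised from inside package bolt; a hypothetical in-package test sketch (uses the standard testing package):

func TestFreelistSketch(t *testing.T) {
	f := newFreelist()
	// Pend pages 12 and 13 (one overflow page) under txid 100.
	f.free(100, &page{id: 12, overflow: 1})
	// Release txid 100, moving the pending ids onto the available list.
	f.release(100)
	// allocate finds the contiguous two-page run starting at id 12.
	if got := f.allocate(2); got != 12 {
		t.Fatalf("allocate(2) = %d, want 12", got)
	}
}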
// rollback removes the pages from a given pending tx.
func (f *freelist) rollback(txid txid) {
// Remove page ids from cache.
for _, id := range f.pending[txid] {
delete(f.cache, id)
}
// Remove pages from pending list.
delete(f.pending, txid)
}
// freed returns whether a given page is in the free list.
func (f *freelist) freed(pgid pgid) bool {
return f.cache[pgid]
}
// read initializes the freelist from a freelist page.
func (f *freelist) read(p *page) {
// If the page.count is at the max uint16 value (64k) then it's considered
// an overflow and the size of the freelist is stored as the first element.
idx, count := 0, int(p.count)
if count == 0xFFFF {
idx = 1
count = int(((*[maxAllocSize]pgid)(unsafe.Pointer(&p.ptr)))[0])
}
// Copy the list of page ids from the freelist.
ids := ((*[maxAllocSize]pgid)(unsafe.Pointer(&p.ptr)))[idx : idx+count]
f.ids = make([]pgid, len(ids))
copy(f.ids, ids)
// Make sure they're sorted.
sort.Sort(pgids(f.ids))
// Rebuild the page cache.
f.reindex()
}
// write writes the page ids onto a freelist page. All free and pending ids are
// saved to disk since in the event of a program crash, all pending ids will
// become free.
func (f *freelist) write(p *page) error {
// Combine the old free pgids and pgids waiting on an open transaction.
ids := f.all()
// Update the header flag.
p.flags |= freelistPageFlag
// The page.count can only hold up to 64k elements so if we overflow that
// number then we handle it by putting the size in the first element.
if len(ids) < 0xFFFF {
p.count = uint16(len(ids))
copy(((*[maxAllocSize]pgid)(unsafe.Pointer(&p.ptr)))[:], ids)
} else {
p.count = 0xFFFF
((*[maxAllocSize]pgid)(unsafe.Pointer(&p.ptr)))[0] = pgid(len(ids))
copy(((*[maxAllocSize]pgid)(unsafe.Pointer(&p.ptr)))[1:], ids)
}
return nil
}
// reload reads the freelist from a page and filters out pending items.
func (f *freelist) reload(p *page) {
f.read(p)
// Build a cache of only pending pages.
pcache := make(map[pgid]bool)
for _, pendingIDs := range f.pending {
for _, pendingID := range pendingIDs {
pcache[pendingID] = true
}
}
// Check each page in the freelist and build a new available freelist
// with any pages not in the pending lists.
var a []pgid
for _, id := range f.ids {
if !pcache[id] {
a = append(a, id)
}
}
f.ids = a
// Once the available list is rebuilt then rebuild the free cache so that
// it includes the available and pending free pages.
f.reindex()
}
// reindex rebuilds the free cache based on available and pending free lists.
func (f *freelist) reindex() {
f.cache = make(map[pgid]bool)
for _, id := range f.ids {
f.cache[id] = true
}
for _, pendingIDs := range f.pending {
for _, pendingID := range pendingIDs {
f.cache[pendingID] = true
}
}
}

616
vendor/src/github.com/boltdb/bolt/node.go vendored Normal file
View file

@ -0,0 +1,616 @@
package bolt
import (
"bytes"
"sort"
"unsafe"
)
// node represents an in-memory, deserialized page.
type node struct {
bucket *Bucket
isLeaf bool
unbalanced bool
spilled bool
key []byte
pgid pgid
parent *node
children nodes
inodes inodes
}
// root returns the top-level node this node is attached to.
func (n *node) root() *node {
if n.parent == nil {
return n
}
return n.parent.root()
}
// minKeys returns the minimum number of inodes this node should have.
func (n *node) minKeys() int {
if n.isLeaf {
return 1
}
return 2
}
// size returns the size of the node after serialization.
func (n *node) size() int {
sz, elsz := pageHeaderSize, n.pageElementSize()
for i := 0; i < len(n.inodes); i++ {
item := &n.inodes[i]
sz += elsz + len(item.key) + len(item.value)
}
return sz
}
// sizeLessThan returns true if the node is less than a given size.
// This is an optimization to avoid calculating a large node when we only need
// to know if it fits inside a certain page size.
func (n *node) sizeLessThan(v int) bool {
sz, elsz := pageHeaderSize, n.pageElementSize()
for i := 0; i < len(n.inodes); i++ {
item := &n.inodes[i]
sz += elsz + len(item.key) + len(item.value)
if sz >= v {
return false
}
}
return true
}
// pageElementSize returns the size of each page element based on the type of node.
func (n *node) pageElementSize() int {
if n.isLeaf {
return leafPageElementSize
}
return branchPageElementSize
}
// childAt returns the child node at a given index.
func (n *node) childAt(index int) *node {
_assert(!n.isLeaf, "invalid childAt(%d) on a leaf node", index)
return n.bucket.node(n.inodes[index].pgid, n)
}
// childIndex returns the index of a given child node.
func (n *node) childIndex(child *node) int {
index := sort.Search(len(n.inodes), func(i int) bool { return bytes.Compare(n.inodes[i].key, child.key) != -1 })
return index
}
// numChildren returns the number of children.
func (n *node) numChildren() int {
return len(n.inodes)
}
// nextSibling returns the next node with the same parent.
func (n *node) nextSibling() *node {
if n.parent == nil {
return nil
}
index := n.parent.childIndex(n)
if index >= n.parent.numChildren()-1 {
return nil
}
return n.parent.childAt(index + 1)
}
// prevSibling returns the previous node with the same parent.
func (n *node) prevSibling() *node {
if n.parent == nil {
return nil
}
index := n.parent.childIndex(n)
if index == 0 {
return nil
}
return n.parent.childAt(index - 1)
}
// put inserts a key/value.
func (n *node) put(oldKey, newKey, value []byte, pgid pgid, flags uint32) {
_assert(pgid < n.bucket.tx.meta.pgid, "pgid (%d) above high water mark (%d)", pgid, n.bucket.tx.meta.pgid)
_assert(len(oldKey) > 0, "put: zero-length old key")
_assert(len(newKey) > 0, "put: zero-length new key")
// Find insertion index.
index := sort.Search(len(n.inodes), func(i int) bool { return bytes.Compare(n.inodes[i].key, oldKey) != -1 })
// Add capacity and shift nodes if we don't have an exact match and need to insert.
exact := (len(n.inodes) > 0 && index < len(n.inodes) && bytes.Equal(n.inodes[index].key, oldKey))
if !exact {
n.inodes = append(n.inodes, inode{})
copy(n.inodes[index+1:], n.inodes[index:])
}
inode := &n.inodes[index]
inode.flags = flags
inode.key = newKey
inode.value = value
inode.pgid = pgid
_assert(len(inode.key) > 0, "put: zero-length inode key")
}
// del removes a key from the node.
func (n *node) del(key []byte) {
// Find index of key.
index := sort.Search(len(n.inodes), func(i int) bool { return bytes.Compare(n.inodes[i].key, key) != -1 })
// Exit if the key isn't found.
if index >= len(n.inodes) || !bytes.Equal(n.inodes[index].key, key) {
return
}
// Delete inode from the node.
n.inodes = append(n.inodes[:index], n.inodes[index+1:]...)
// Mark the node as needing rebalancing.
n.unbalanced = true
}
// read initializes the node from a page.
func (n *node) read(p *page) {
n.pgid = p.id
n.isLeaf = ((p.flags & leafPageFlag) != 0)
n.inodes = make(inodes, int(p.count))
for i := 0; i < int(p.count); i++ {
inode := &n.inodes[i]
if n.isLeaf {
elem := p.leafPageElement(uint16(i))
inode.flags = elem.flags
inode.key = elem.key()
inode.value = elem.value()
} else {
elem := p.branchPageElement(uint16(i))
inode.pgid = elem.pgid
inode.key = elem.key()
}
_assert(len(inode.key) > 0, "read: zero-length inode key")
}
// Save first key so we can find the node in the parent when we spill.
if len(n.inodes) > 0 {
n.key = n.inodes[0].key
_assert(len(n.key) > 0, "read: zero-length node key")
} else {
n.key = nil
}
}
// write writes the items onto one or more pages.
func (n *node) write(p *page) {
// Initialize page.
if n.isLeaf {
p.flags |= leafPageFlag
} else {
p.flags |= branchPageFlag
}
_assert(len(n.inodes) < 0xFFFF, "inode overflow: %d (pgid=%d)", len(n.inodes), p.id)
p.count = uint16(len(n.inodes))
// Loop over each item and write it to the page.
b := (*[maxAllocSize]byte)(unsafe.Pointer(&p.ptr))[n.pageElementSize()*len(n.inodes):]
for i, item := range n.inodes {
_assert(len(item.key) > 0, "write: zero-length inode key")
// Write the page element.
if n.isLeaf {
elem := p.leafPageElement(uint16(i))
elem.pos = uint32(uintptr(unsafe.Pointer(&b[0])) - uintptr(unsafe.Pointer(elem)))
elem.flags = item.flags
elem.ksize = uint32(len(item.key))
elem.vsize = uint32(len(item.value))
} else {
elem := p.branchPageElement(uint16(i))
elem.pos = uint32(uintptr(unsafe.Pointer(&b[0])) - uintptr(unsafe.Pointer(elem)))
elem.ksize = uint32(len(item.key))
elem.pgid = item.pgid
_assert(elem.pgid != p.id, "write: circular dependency occurred")
}
// Write data for the element to the end of the page.
copy(b[0:], item.key)
b = b[len(item.key):]
copy(b[0:], item.value)
b = b[len(item.value):]
}
// DEBUG ONLY: n.dump()
}
// split breaks up a node into multiple smaller nodes, if appropriate.
// This should only be called from the spill() function.
func (n *node) split(pageSize int) []*node {
var nodes []*node
node := n
for {
// Split node into two.
a, b := node.splitTwo(pageSize)
nodes = append(nodes, a)
// If we can't split then exit the loop.
if b == nil {
break
}
// Set node to b so it gets split on the next iteration.
node = b
}
return nodes
}
// splitTwo breaks up a node into two smaller nodes, if appropriate.
// This should only be called from the split() function.
func (n *node) splitTwo(pageSize int) (*node, *node) {
// Ignore the split if the page doesn't have at least enough nodes for
// two pages or if the nodes can fit in a single page.
if len(n.inodes) <= (minKeysPerPage*2) || n.sizeLessThan(pageSize) {
return n, nil
}
// Determine the threshold before starting a new node.
var fillPercent = n.bucket.FillPercent
if fillPercent < minFillPercent {
fillPercent = minFillPercent
} else if fillPercent > maxFillPercent {
fillPercent = maxFillPercent
}
threshold := int(float64(pageSize) * fillPercent)
// Determine split position and sizes of the two pages.
splitIndex, _ := n.splitIndex(threshold)
// Split node into two separate nodes.
// If there's no parent then we'll need to create one.
if n.parent == nil {
n.parent = &node{bucket: n.bucket, children: []*node{n}}
}
// Create a new node and add it to the parent.
next := &node{bucket: n.bucket, isLeaf: n.isLeaf, parent: n.parent}
n.parent.children = append(n.parent.children, next)
// Split inodes across two nodes.
next.inodes = n.inodes[splitIndex:]
n.inodes = n.inodes[:splitIndex]
// Update the statistics.
n.bucket.tx.stats.Split++
return n, next
}
// splitIndex finds the position where a page will fill a given threshold.
// It returns the index as well as the size of the first page.
// This is only called from split().
func (n *node) splitIndex(threshold int) (index, sz int) {
sz = pageHeaderSize
// Loop until we only have the minimum number of keys required for the second page.
for i := 0; i < len(n.inodes)-minKeysPerPage; i++ {
index = i
inode := n.inodes[i]
elsize := n.pageElementSize() + len(inode.key) + len(inode.value)
// If we have at least the minimum number of keys and adding another
// node would put us over the threshold then exit and return.
if i >= minKeysPerPage && sz+elsize > threshold {
break
}
// Add the element size to the total size.
sz += elsize
}
return
}
// spill writes the nodes to dirty pages and splits nodes as it goes.
// Returns an error if dirty pages cannot be allocated.
func (n *node) spill() error {
var tx = n.bucket.tx
if n.spilled {
return nil
}
// Spill child nodes first. Child nodes can materialize sibling nodes in
// the case of split-merge so we cannot use a range loop. We have to check
// the children size on every loop iteration.
sort.Sort(n.children)
for i := 0; i < len(n.children); i++ {
if err := n.children[i].spill(); err != nil {
return err
}
}
// We no longer need the child list because it's only used for spill tracking.
n.children = nil
// Split nodes into appropriate sizes. The first node will always be n.
var nodes = n.split(tx.db.pageSize)
for _, node := range nodes {
// Add node's page to the freelist if it's not new.
if node.pgid > 0 {
tx.db.freelist.free(tx.meta.txid, tx.page(node.pgid))
node.pgid = 0
}
// Allocate contiguous space for the node.
p, err := tx.allocate((node.size() / tx.db.pageSize) + 1)
if err != nil {
return err
}
// Write the node.
_assert(p.id < tx.meta.pgid, "pgid (%d) above high water mark (%d)", p.id, tx.meta.pgid)
node.pgid = p.id
node.write(p)
node.spilled = true
// Insert into parent inodes.
if node.parent != nil {
var key = node.key
if key == nil {
key = node.inodes[0].key
}
node.parent.put(key, node.inodes[0].key, nil, node.pgid, 0)
node.key = node.inodes[0].key
_assert(len(node.key) > 0, "spill: zero-length node key")
}
// Update the statistics.
tx.stats.Spill++
}
// If the root node split and created a new root then we need to spill that
// as well. We'll clear out the children to make sure it doesn't try to respill.
if n.parent != nil && n.parent.pgid == 0 {
n.children = nil
return n.parent.spill()
}
return nil
}
// rebalance attempts to combine the node with sibling nodes if the node fill
// size is below a threshold or if there are not enough keys.
func (n *node) rebalance() {
if !n.unbalanced {
return
}
n.unbalanced = false
// Update statistics.
n.bucket.tx.stats.Rebalance++
// Ignore if node is above threshold (25%) and has enough keys.
var threshold = n.bucket.tx.db.pageSize / 4
if n.size() > threshold && len(n.inodes) > n.minKeys() {
return
}
// Root node has special handling.
if n.parent == nil {
// If root node is a branch and only has one node then collapse it.
if !n.isLeaf && len(n.inodes) == 1 {
// Move root's child up.
child := n.bucket.node(n.inodes[0].pgid, n)
n.isLeaf = child.isLeaf
n.inodes = child.inodes[:]
n.children = child.children
// Reparent all child nodes being moved.
for _, inode := range n.inodes {
if child, ok := n.bucket.nodes[inode.pgid]; ok {
child.parent = n
}
}
// Remove old child.
child.parent = nil
delete(n.bucket.nodes, child.pgid)
child.free()
}
return
}
// If node has no keys then just remove it.
if n.numChildren() == 0 {
n.parent.del(n.key)
n.parent.removeChild(n)
delete(n.bucket.nodes, n.pgid)
n.free()
n.parent.rebalance()
return
}
_assert(n.parent.numChildren() > 1, "parent must have at least 2 children")
// Destination node is right sibling if idx == 0, otherwise left sibling.
var target *node
var useNextSibling = (n.parent.childIndex(n) == 0)
if useNextSibling {
target = n.nextSibling()
} else {
target = n.prevSibling()
}
// If target node has extra nodes then just move one over.
if target.numChildren() > target.minKeys() {
if useNextSibling {
// Reparent and move node.
if child, ok := n.bucket.nodes[target.inodes[0].pgid]; ok {
child.parent.removeChild(child)
child.parent = n
child.parent.children = append(child.parent.children, child)
}
n.inodes = append(n.inodes, target.inodes[0])
target.inodes = target.inodes[1:]
// Update target key on parent.
target.parent.put(target.key, target.inodes[0].key, nil, target.pgid, 0)
target.key = target.inodes[0].key
_assert(len(target.key) > 0, "rebalance(1): zero-length node key")
} else {
// Reparent and move node.
if child, ok := n.bucket.nodes[target.inodes[len(target.inodes)-1].pgid]; ok {
child.parent.removeChild(child)
child.parent = n
child.parent.children = append(child.parent.children, child)
}
n.inodes = append(n.inodes, inode{})
copy(n.inodes[1:], n.inodes)
n.inodes[0] = target.inodes[len(target.inodes)-1]
target.inodes = target.inodes[:len(target.inodes)-1]
}
// Update parent key for node.
n.parent.put(n.key, n.inodes[0].key, nil, n.pgid, 0)
n.key = n.inodes[0].key
_assert(len(n.key) > 0, "rebalance(2): zero-length node key")
return
}
// If both this node and the target node are too small then merge them.
if useNextSibling {
// Reparent all child nodes being moved.
for _, inode := range target.inodes {
if child, ok := n.bucket.nodes[inode.pgid]; ok {
child.parent.removeChild(child)
child.parent = n
child.parent.children = append(child.parent.children, child)
}
}
// Copy over inodes from target and remove target.
n.inodes = append(n.inodes, target.inodes...)
n.parent.del(target.key)
n.parent.removeChild(target)
delete(n.bucket.nodes, target.pgid)
target.free()
} else {
// Reparent all child nodes being moved.
for _, inode := range n.inodes {
if child, ok := n.bucket.nodes[inode.pgid]; ok {
child.parent.removeChild(child)
child.parent = target
child.parent.children = append(child.parent.children, child)
}
}
// Copy over inodes to target and remove node.
target.inodes = append(target.inodes, n.inodes...)
n.parent.del(n.key)
n.parent.removeChild(n)
delete(n.bucket.nodes, n.pgid)
n.free()
}
// Either this node or the target node was deleted from the parent so rebalance it.
n.parent.rebalance()
}
// removeChild removes a node from the list of in-memory children.
// This does not affect the inodes.
func (n *node) removeChild(target *node) {
for i, child := range n.children {
if child == target {
n.children = append(n.children[:i], n.children[i+1:]...)
return
}
}
}
// dereference causes the node to copy all its inode key/value references to heap memory.
// This is required when the mmap is reallocated so inodes are not pointing to stale data.
func (n *node) dereference() {
if n.key != nil {
key := make([]byte, len(n.key))
copy(key, n.key)
n.key = key
_assert(n.pgid == 0 || len(n.key) > 0, "dereference: zero-length node key on existing node")
}
for i := range n.inodes {
inode := &n.inodes[i]
key := make([]byte, len(inode.key))
copy(key, inode.key)
inode.key = key
_assert(len(inode.key) > 0, "dereference: zero-length inode key")
value := make([]byte, len(inode.value))
copy(value, inode.value)
inode.value = value
}
// Recursively dereference children.
for _, child := range n.children {
child.dereference()
}
// Update statistics.
n.bucket.tx.stats.NodeDeref++
}
// free adds the node's underlying page to the freelist.
func (n *node) free() {
if n.pgid != 0 {
n.bucket.tx.db.freelist.free(n.bucket.tx.meta.txid, n.bucket.tx.page(n.pgid))
n.pgid = 0
}
}
// dump writes the contents of the node to STDERR for debugging purposes.
/*
func (n *node) dump() {
// Write node header.
var typ = "branch"
if n.isLeaf {
typ = "leaf"
}
warnf("[NODE %d {type=%s count=%d}]", n.pgid, typ, len(n.inodes))
// Write out abbreviated version of each item.
for _, item := range n.inodes {
if n.isLeaf {
if item.flags&bucketLeafFlag != 0 {
bucket := (*bucket)(unsafe.Pointer(&item.value[0]))
warnf("+L %08x -> (bucket root=%d)", trunc(item.key, 4), bucket.root)
} else {
warnf("+L %08x -> %08x", trunc(item.key, 4), trunc(item.value, 4))
}
} else {
warnf("+B %08x -> pgid=%d", trunc(item.key, 4), item.pgid)
}
}
warn("")
}
*/
type nodes []*node
func (s nodes) Len() int { return len(s) }
func (s nodes) Swap(i, j int) { s[i], s[j] = s[j], s[i] }
func (s nodes) Less(i, j int) bool { return bytes.Compare(s[i].inodes[0].key, s[j].inodes[0].key) == -1 }
// inode represents an internal node inside of a node.
// It can be used to point to elements in a page or point
// to an element which hasn't been added to a page yet.
type inode struct {
flags uint32
pgid pgid
key []byte
value []byte
}
type inodes []inode

135
vendor/src/github.com/boltdb/bolt/page.go vendored Normal file
View file

@ -0,0 +1,135 @@
package bolt
import (
"fmt"
"os"
"unsafe"
)
const pageHeaderSize = int(unsafe.Offsetof(((*page)(nil)).ptr))
const maxAllocSize = 0xFFFFFFF
const minKeysPerPage = 2
const branchPageElementSize = int(unsafe.Sizeof(branchPageElement{}))
const leafPageElementSize = int(unsafe.Sizeof(leafPageElement{}))
const (
branchPageFlag = 0x01
leafPageFlag = 0x02
metaPageFlag = 0x04
freelistPageFlag = 0x10
)
const (
bucketLeafFlag = 0x01
)
type pgid uint64
type page struct {
id pgid
flags uint16
count uint16
overflow uint32
ptr uintptr
}
// typ returns a human readable page type string used for debugging.
func (p *page) typ() string {
if (p.flags & branchPageFlag) != 0 {
return "branch"
} else if (p.flags & leafPageFlag) != 0 {
return "leaf"
} else if (p.flags & metaPageFlag) != 0 {
return "meta"
} else if (p.flags & freelistPageFlag) != 0 {
return "freelist"
}
return fmt.Sprintf("unknown<%02x>", p.flags)
}
// meta returns a pointer to the metadata section of the page.
func (p *page) meta() *meta {
return (*meta)(unsafe.Pointer(&p.ptr))
}
// leafPageElement retrieves the leaf node by index.
func (p *page) leafPageElement(index uint16) *leafPageElement {
n := &((*[0x7FFFFFF]leafPageElement)(unsafe.Pointer(&p.ptr)))[index]
return n
}
// leafPageElements retrieves a list of leaf nodes.
func (p *page) leafPageElements() []leafPageElement {
return ((*[0x7FFFFFF]leafPageElement)(unsafe.Pointer(&p.ptr)))[:]
}
// branchPageElement retrieves the branch node by index.
func (p *page) branchPageElement(index uint16) *branchPageElement {
return &((*[0x7FFFFFF]branchPageElement)(unsafe.Pointer(&p.ptr)))[index]
}
// branchPageElements retrieves a list of branch nodes.
func (p *page) branchPageElements() []branchPageElement {
return ((*[0x7FFFFFF]branchPageElement)(unsafe.Pointer(&p.ptr)))[:]
}
// hexdump writes n bytes of the page to STDERR as hex output.
func (p *page) hexdump(n int) {
buf := (*[maxAllocSize]byte)(unsafe.Pointer(p))[:n]
fmt.Fprintf(os.Stderr, "%x\n", buf)
}
type pages []*page
func (s pages) Len() int { return len(s) }
func (s pages) Swap(i, j int) { s[i], s[j] = s[j], s[i] }
func (s pages) Less(i, j int) bool { return s[i].id < s[j].id }
// branchPageElement represents a node on a branch page.
type branchPageElement struct {
pos uint32
ksize uint32
pgid pgid
}
// key returns a byte slice of the node key.
func (n *branchPageElement) key() []byte {
buf := (*[maxAllocSize]byte)(unsafe.Pointer(n))
return buf[n.pos : n.pos+n.ksize]
}
// leafPageElement represents a node on a leaf page.
type leafPageElement struct {
flags uint32
pos uint32
ksize uint32
vsize uint32
}
// key returns a byte slice of the node key.
func (n *leafPageElement) key() []byte {
buf := (*[maxAllocSize]byte)(unsafe.Pointer(n))
return buf[n.pos : n.pos+n.ksize]
}
// value returns a byte slice of the node value.
func (n *leafPageElement) value() []byte {
buf := (*[maxAllocSize]byte)(unsafe.Pointer(n))
return buf[n.pos+n.ksize : n.pos+n.ksize+n.vsize]
}
// PageInfo represents human readable information about a page.
type PageInfo struct {
ID int
Type string
Count int
OverflowCount int
}
type pgids []pgid
func (s pgids) Len() int { return len(s) }
func (s pgids) Swap(i, j int) { s[i], s[j] = s[j], s[i] }
func (s pgids) Less(i, j int) bool { return s[i] < s[j] }

580
vendor/src/github.com/boltdb/bolt/tx.go vendored Normal file
View file

@ -0,0 +1,580 @@
package bolt
import (
"fmt"
"io"
"os"
"sort"
"time"
"unsafe"
)
// txid represents the internal transaction identifier.
type txid uint64
// Tx represents a read-only or read/write transaction on the database.
// Read-only transactions can be used for retrieving values for keys and creating cursors.
// Read/write transactions can create and remove buckets and create and remove keys.
//
// IMPORTANT: You must commit or rollback transactions when you are done with
// them. Pages cannot be reclaimed by the writer until no more transactions
// are using them. A long-running read transaction can cause the database to
// quickly grow.
type Tx struct {
writable bool
managed bool
db *DB
meta *meta
root Bucket
pages map[pgid]*page
stats TxStats
commitHandlers []func()
}
// init initializes the transaction.
func (tx *Tx) init(db *DB) {
tx.db = db
tx.pages = nil
// Copy the meta page since it can be changed by the writer.
tx.meta = &meta{}
db.meta().copy(tx.meta)
// Copy over the root bucket.
tx.root = newBucket(tx)
tx.root.bucket = &bucket{}
*tx.root.bucket = tx.meta.root
// Increment the transaction id and add a page cache for writable transactions.
if tx.writable {
tx.pages = make(map[pgid]*page)
tx.meta.txid += txid(1)
}
}
// ID returns the transaction id.
func (tx *Tx) ID() int {
return int(tx.meta.txid)
}
// DB returns a reference to the database that created the transaction.
func (tx *Tx) DB() *DB {
return tx.db
}
// Size returns the current database size in bytes as seen by this transaction.
func (tx *Tx) Size() int64 {
return int64(tx.meta.pgid) * int64(tx.db.pageSize)
}
// Writable returns whether the transaction can perform write operations.
func (tx *Tx) Writable() bool {
return tx.writable
}
// Cursor creates a cursor associated with the root bucket.
// All items in the cursor will return a nil value because all root bucket keys point to buckets.
// The cursor is only valid as long as the transaction is open.
// Do not use a cursor after the transaction is closed.
func (tx *Tx) Cursor() *Cursor {
return tx.root.Cursor()
}
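Because values at the root are nil, iteration is usually done on a named bucket's cursor instead; a sketch with hypothetical names:

err = db.View(func(tx *bolt.Tx) error {
	c := tx.Bucket([]byte("widgets")).Cursor()
	for k, v := c.First(); k != nil; k, v = c.Next() {
		log.Printf("%s=%s", k, v)
	}
	return nil
})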
// Stats retrieves a copy of the current transaction statistics.
func (tx *Tx) Stats() TxStats {
return tx.stats
}
// Bucket retrieves a bucket by name.
// Returns nil if the bucket does not exist.
func (tx *Tx) Bucket(name []byte) *Bucket {
return tx.root.Bucket(name)
}
// CreateBucket creates a new bucket.
// Returns an error if the bucket already exists, if the bucket name is blank, or if the bucket name is too long.
func (tx *Tx) CreateBucket(name []byte) (*Bucket, error) {
return tx.root.CreateBucket(name)
}
// CreateBucketIfNotExists creates a new bucket if it doesn't already exist.
// Returns an error if the bucket name is blank, or if the bucket name is too long.
func (tx *Tx) CreateBucketIfNotExists(name []byte) (*Bucket, error) {
return tx.root.CreateBucketIfNotExists(name)
}
// DeleteBucket deletes a bucket.
// Returns an error if the bucket cannot be found or if the key represents a non-bucket value.
func (tx *Tx) DeleteBucket(name []byte) error {
return tx.root.DeleteBucket(name)
}
// ForEach executes a function for each bucket in the root.
// If the provided function returns an error then the iteration is stopped and
// the error is returned to the caller.
func (tx *Tx) ForEach(fn func(name []byte, b *Bucket) error) error {
return tx.root.ForEach(func(k, v []byte) error {
if err := fn(k, tx.root.Bucket(k)); err != nil {
return err
}
return nil
})
}
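For example, listing the top-level buckets:

err = db.View(func(tx *bolt.Tx) error {
	return tx.ForEach(func(name []byte, b *bolt.Bucket) error {
		log.Printf("bucket: %s", name)
		return nil
	})
})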
// OnCommit adds a handler function to be executed after the transaction successfully commits.
func (tx *Tx) OnCommit(fn func()) {
tx.commitHandlers = append(tx.commitHandlers, fn)
}
// Commit writes all changes to disk and updates the meta page.
// Returns an error if a disk write error occurs.
func (tx *Tx) Commit() error {
_assert(!tx.managed, "managed tx commit not allowed")
if tx.db == nil {
return ErrTxClosed
} else if !tx.writable {
return ErrTxNotWritable
}
// TODO(benbjohnson): Use vectorized I/O to write out dirty pages.
// Rebalance nodes which have had deletions.
var startTime = time.Now()
tx.root.rebalance()
if tx.stats.Rebalance > 0 {
tx.stats.RebalanceTime += time.Since(startTime)
}
// Spill data onto dirty pages.
startTime = time.Now()
if err := tx.root.spill(); err != nil {
tx.rollback()
return err
}
tx.stats.SpillTime += time.Since(startTime)
// Free the old root bucket.
tx.meta.root.root = tx.root.root
// Free the freelist and allocate new pages for it. This will overestimate
// the size of the freelist but not underestimate the size (which would be bad).
tx.db.freelist.free(tx.meta.txid, tx.db.page(tx.meta.freelist))
p, err := tx.allocate((tx.db.freelist.size() / tx.db.pageSize) + 1)
if err != nil {
tx.rollback()
return err
}
if err := tx.db.freelist.write(p); err != nil {
tx.rollback()
return err
}
tx.meta.freelist = p.id
// Write dirty pages to disk.
startTime = time.Now()
if err := tx.write(); err != nil {
tx.rollback()
return err
}
// If strict mode is enabled then perform a consistency check.
// Only the first consistency error is reported in the panic.
if tx.db.StrictMode {
if err, ok := <-tx.Check(); ok {
panic("check fail: " + err.Error())
}
}
// Write meta to disk.
if err := tx.writeMeta(); err != nil {
tx.rollback()
return err
}
tx.stats.WriteTime += time.Since(startTime)
// Finalize the transaction.
tx.close()
// Execute commit handlers now that the locks have been removed.
for _, fn := range tx.commitHandlers {
fn()
}
return nil
}
// Rollback closes the transaction and ignores all previous updates.
func (tx *Tx) Rollback() error {
_assert(!tx.managed, "managed tx rollback not allowed")
if tx.db == nil {
return ErrTxClosed
}
tx.rollback()
return nil
}
func (tx *Tx) rollback() {
if tx.db == nil {
return
}
if tx.writable {
tx.db.freelist.rollback(tx.meta.txid)
tx.db.freelist.reload(tx.db.page(tx.db.meta().freelist))
}
tx.close()
}
func (tx *Tx) close() {
if tx.db == nil {
return
}
if tx.writable {
// Grab freelist stats.
var freelistFreeN = tx.db.freelist.free_count()
var freelistPendingN = tx.db.freelist.pending_count()
var freelistAlloc = tx.db.freelist.size()
// Remove writer lock.
tx.db.rwlock.Unlock()
// Merge statistics.
tx.db.statlock.Lock()
tx.db.stats.FreePageN = freelistFreeN
tx.db.stats.PendingPageN = freelistPendingN
tx.db.stats.FreeAlloc = (freelistFreeN + freelistPendingN) * tx.db.pageSize
tx.db.stats.FreelistInuse = freelistAlloc
tx.db.stats.TxStats.add(&tx.stats)
tx.db.statlock.Unlock()
} else {
tx.db.removeTx(tx)
}
tx.db = nil
}
// Copy writes the entire database to a writer.
// A reader transaction is maintained during the copy so it is safe to continue
// using the database while a copy is in progress.
// Copy will write exactly tx.Size() bytes into the writer.
func (tx *Tx) Copy(w io.Writer) error {
var f *os.File
var err error
// Attempt to open reader directly.
if f, err = os.OpenFile(tx.db.path, os.O_RDONLY|odirect, 0); err != nil {
// Fallback to a regular open if that doesn't work.
if f, err = os.OpenFile(tx.db.path, os.O_RDONLY, 0); err != nil {
return err
}
}
// Copy the meta pages.
tx.db.metalock.Lock()
_, err = io.CopyN(w, f, int64(tx.db.pageSize*2))
tx.db.metalock.Unlock()
if err != nil {
_ = f.Close()
return fmt.Errorf("meta copy: %s", err)
}
// Copy data pages.
if _, err := io.CopyN(w, f, tx.Size()-int64(tx.db.pageSize*2)); err != nil {
_ = f.Close()
return err
}
return f.Close()
}
// CopyFile copies the entire database to file at the given path.
// A reader transaction is maintained during the copy so it is safe to continue
// using the database while a copy is in progress.
func (tx *Tx) CopyFile(path string, mode os.FileMode) error {
f, err := os.OpenFile(path, os.O_RDWR|os.O_CREATE|os.O_TRUNC, mode)
if err != nil {
return err
}
err = tx.Copy(f)
if err != nil {
_ = f.Close()
return err
}
return f.Close()
}
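A sketch of a hot backup from inside a read transaction (destination path hypothetical):

err = db.View(func(tx *bolt.Tx) error {
	// Readers and the single writer keep working while the copy runs.
	return tx.CopyFile("/tmp/example-backup.db", 0600)
})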
// Check performs several consistency checks on the database for this transaction.
// An error is returned if any inconsistency is found.
//
// It can be safely run concurrently on a writable transaction. However, this
// incurs a high cost for large databases and databases with a lot of subbuckets
// because of caching. This overhead can be removed if running on a read-only
// transaction; however, it is not safe to execute other writer transactions at
// the same time.
func (tx *Tx) Check() <-chan error {
ch := make(chan error)
go tx.check(ch)
return ch
}
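A sketch of draining the error channel; the range loop exits once check() closes it:

err = db.View(func(tx *bolt.Tx) error {
	for err := range tx.Check() {
		log.Println(err)
	}
	return nil
})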
func (tx *Tx) check(ch chan error) {
// Check if any pages are double freed.
freed := make(map[pgid]bool)
for _, id := range tx.db.freelist.all() {
if freed[id] {
ch <- fmt.Errorf("page %d: already freed", id)
}
freed[id] = true
}
// Track every reachable page.
reachable := make(map[pgid]*page)
reachable[0] = tx.page(0) // meta0
reachable[1] = tx.page(1) // meta1
for i := uint32(0); i <= tx.page(tx.meta.freelist).overflow; i++ {
reachable[tx.meta.freelist+pgid(i)] = tx.page(tx.meta.freelist)
}
// Recursively check buckets.
tx.checkBucket(&tx.root, reachable, freed, ch)
// Ensure all pages below high water mark are either reachable or freed.
for i := pgid(0); i < tx.meta.pgid; i++ {
_, isReachable := reachable[i]
if !isReachable && !freed[i] {
ch <- fmt.Errorf("page %d: unreachable unfreed", int(i))
}
}
// Close the channel to signal completion.
close(ch)
}
func (tx *Tx) checkBucket(b *Bucket, reachable map[pgid]*page, freed map[pgid]bool, ch chan error) {
// Ignore inline buckets.
if b.root == 0 {
return
}
// Check every page used by this bucket.
b.tx.forEachPage(b.root, 0, func(p *page, _ int) {
if p.id > tx.meta.pgid {
ch <- fmt.Errorf("page %d: out of bounds: %d", int(p.id), int(b.tx.meta.pgid))
}
// Ensure each page is only referenced once.
for i := pgid(0); i <= pgid(p.overflow); i++ {
var id = p.id + i
if _, ok := reachable[id]; ok {
ch <- fmt.Errorf("page %d: multiple references", int(id))
}
reachable[id] = p
}
// We should only encounter un-freed leaf and branch pages.
if freed[p.id] {
ch <- fmt.Errorf("page %d: reachable freed", int(p.id))
} else if (p.flags&branchPageFlag) == 0 && (p.flags&leafPageFlag) == 0 {
ch <- fmt.Errorf("page %d: invalid type: %s", int(p.id), p.typ())
}
})
// Check each bucket within this bucket.
_ = b.ForEach(func(k, v []byte) error {
if child := b.Bucket(k); child != nil {
tx.checkBucket(child, reachable, freed, ch)
}
return nil
})
}
// allocate returns a contiguous block of memory starting at a given page.
func (tx *Tx) allocate(count int) (*page, error) {
p, err := tx.db.allocate(count)
if err != nil {
return nil, err
}
// Save to our page cache.
tx.pages[p.id] = p
// Update statistics.
tx.stats.PageCount++
tx.stats.PageAlloc += count * tx.db.pageSize
return p, nil
}
// write writes any dirty pages to disk.
func (tx *Tx) write() error {
// Sort pages by id.
pages := make(pages, 0, len(tx.pages))
for _, p := range tx.pages {
pages = append(pages, p)
}
sort.Sort(pages)
// Write pages to disk in order.
for _, p := range pages {
size := (int(p.overflow) + 1) * tx.db.pageSize
buf := (*[maxAllocSize]byte)(unsafe.Pointer(p))[:size]
offset := int64(p.id) * int64(tx.db.pageSize)
if _, err := tx.db.ops.writeAt(buf, offset); err != nil {
return err
}
// Update statistics.
tx.stats.Write++
}
if !tx.db.NoSync || IgnoreNoSync {
if err := fdatasync(tx.db); err != nil {
return err
}
}
// Clear out page cache.
tx.pages = make(map[pgid]*page)
return nil
}
// writeMeta writes the meta to the disk.
func (tx *Tx) writeMeta() error {
// Create a temporary buffer for the meta page.
buf := make([]byte, tx.db.pageSize)
p := tx.db.pageInBuffer(buf, 0)
tx.meta.write(p)
// Write the meta page to file.
if _, err := tx.db.ops.writeAt(buf, int64(p.id)*int64(tx.db.pageSize)); err != nil {
return err
}
if !tx.db.NoSync || IgnoreNoSync {
if err := fdatasync(tx.db); err != nil {
return err
}
}
// Update statistics.
tx.stats.Write++
return nil
}
// page returns a reference to the page with a given id.
// If the page has been written to then a temporary buffered page is returned.
func (tx *Tx) page(id pgid) *page {
// Check the dirty pages first.
if tx.pages != nil {
if p, ok := tx.pages[id]; ok {
return p
}
}
// Otherwise return directly from the mmap.
return tx.db.page(id)
}
// forEachPage iterates over every page within a given page and executes a function.
func (tx *Tx) forEachPage(pgid pgid, depth int, fn func(*page, int)) {
p := tx.page(pgid)
// Execute function.
fn(p, depth)
// Recursively loop over children.
if (p.flags & branchPageFlag) != 0 {
for i := 0; i < int(p.count); i++ {
elem := p.branchPageElement(uint16(i))
tx.forEachPage(elem.pgid, depth+1, fn)
}
}
}
// Page returns page information for a given page number.
// This is only safe for concurrent use when used by a writable transaction.
func (tx *Tx) Page(id int) (*PageInfo, error) {
if tx.db == nil {
return nil, ErrTxClosed
} else if pgid(id) >= tx.meta.pgid {
return nil, nil
}
// Build the page info.
p := tx.db.page(pgid(id))
info := &PageInfo{
ID: id,
Count: int(p.count),
OverflowCount: int(p.overflow),
}
// Determine the type (or if it's free).
if tx.db.freelist.freed(pgid(id)) {
info.Type = "free"
} else {
info.Type = p.typ()
}
return info, nil
}
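Since `Page` returns nil with no error once the id passes the high-water mark, walking the whole file is straightforward. A sketch, assuming an open `*bolt.DB` (a writable transaction, per the concurrency note above):
```go
// Tally page types ("free", "leaf", "branch", ...) across the file.
func pageHistogram(db *bolt.DB) (map[string]int, error) {
	counts := map[string]int{}
	err := db.Update(func(tx *bolt.Tx) error {
		for i := 0; ; i++ {
			p, err := tx.Page(i)
			if err != nil {
				return err
			}
			if p == nil {
				return nil // past the high water mark
			}
			counts[p.Type]++
		}
	})
	return counts, err
}
```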
// TxStats represents statistics about the actions performed by the transaction.
type TxStats struct {
// Page statistics.
PageCount int // number of page allocations
PageAlloc int // total bytes allocated
// Cursor statistics.
CursorCount int // number of cursors created
// Node statistics
NodeCount int // number of node allocations
NodeDeref int // number of node dereferences
// Rebalance statistics.
Rebalance int // number of node rebalances
RebalanceTime time.Duration // total time spent rebalancing
// Split/Spill statistics.
Split int // number of nodes split
Spill int // number of nodes spilled
SpillTime time.Duration // total time spent spilling
// Write statistics.
Write int // number of writes performed
WriteTime time.Duration // total time spent writing to disk
}
func (s *TxStats) add(other *TxStats) {
s.PageCount += other.PageCount
s.PageAlloc += other.PageAlloc
s.CursorCount += other.CursorCount
s.NodeCount += other.NodeCount
s.NodeDeref += other.NodeDeref
s.Rebalance += other.Rebalance
s.RebalanceTime += other.RebalanceTime
s.Split += other.Split
s.Spill += other.Spill
s.SpillTime += other.SpillTime
s.Write += other.Write
s.WriteTime += other.WriteTime
}
// Sub calculates and returns the difference between two sets of transaction stats.
// This is useful when obtaining stats at two different points in time and
// you need the performance counters that occurred within that time span.
func (s *TxStats) Sub(other *TxStats) TxStats {
var diff TxStats
diff.PageCount = s.PageCount - other.PageCount
diff.PageAlloc = s.PageAlloc - other.PageAlloc
diff.CursorCount = s.CursorCount - other.CursorCount
diff.NodeCount = s.NodeCount - other.NodeCount
diff.NodeDeref = s.NodeDeref - other.NodeDeref
diff.Rebalance = s.Rebalance - other.Rebalance
diff.RebalanceTime = s.RebalanceTime - other.RebalanceTime
diff.Split = s.Split - other.Split
diff.Spill = s.Spill - other.Spill
diff.SpillTime = s.SpillTime - other.SpillTime
diff.Write = s.Write - other.Write
diff.WriteTime = s.WriteTime - other.WriteTime
return diff
}
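`Sub` supports the pattern of sampling `db.Stats()` at two points and diffing the embedded `TxStats`. A sketch with an assumed ten-second window:
```go
// Report transaction counters accumulated during the window.
func sampleTxStats(db *bolt.DB) {
	prev := db.Stats()
	time.Sleep(10 * time.Second)
	cur := db.Stats()
	diff := cur.TxStats.Sub(&prev.TxStats)
	fmt.Printf("writes: %d, time writing: %v\n", diff.Write, diff.WriteTime)
}
```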

View file

@ -0,0 +1,425 @@
Attribution-ShareAlike 4.0 International
=======================================================================
Creative Commons Corporation ("Creative Commons") is not a law firm and
does not provide legal services or legal advice. Distribution of
Creative Commons public licenses does not create a lawyer-client or
other relationship. Creative Commons makes its licenses and related
information available on an "as-is" basis. Creative Commons gives no
warranties regarding its licenses, any material licensed under their
terms and conditions, or any related information. Creative Commons
disclaims all liability for damages resulting from their use to the
fullest extent possible.
Using Creative Commons Public Licenses
Creative Commons public licenses provide a standard set of terms and
conditions that creators and other rights holders may use to share
original works of authorship and other material subject to copyright
and certain other rights specified in the public license below. The
following considerations are for informational purposes only, are not
exhaustive, and do not form part of our licenses.
Considerations for licensors: Our public licenses are
intended for use by those authorized to give the public
permission to use material in ways otherwise restricted by
copyright and certain other rights. Our licenses are
irrevocable. Licensors should read and understand the terms
and conditions of the license they choose before applying it.
Licensors should also secure all rights necessary before
applying our licenses so that the public can reuse the
material as expected. Licensors should clearly mark any
material not subject to the license. This includes other CC-
licensed material, or material used under an exception or
limitation to copyright. More considerations for licensors:
wiki.creativecommons.org/Considerations_for_licensors
Considerations for the public: By using one of our public
licenses, a licensor grants the public permission to use the
licensed material under specified terms and conditions. If
the licensor's permission is not necessary for any reason--for
example, because of any applicable exception or limitation to
copyright--then that use is not regulated by the license. Our
licenses grant only permissions under copyright and certain
other rights that a licensor has authority to grant. Use of
the licensed material may still be restricted for other
reasons, including because others have copyright or other
rights in the material. A licensor may make special requests,
such as asking that all changes be marked or described.
Although not required by our licenses, you are encouraged to
respect those requests where reasonable. More_considerations
for the public:
wiki.creativecommons.org/Considerations_for_licensees
=======================================================================
Creative Commons Attribution-ShareAlike 4.0 International Public
License
By exercising the Licensed Rights (defined below), You accept and agree
to be bound by the terms and conditions of this Creative Commons
Attribution-ShareAlike 4.0 International Public License ("Public
License"). To the extent this Public License may be interpreted as a
contract, You are granted the Licensed Rights in consideration of Your
acceptance of these terms and conditions, and the Licensor grants You
such rights in consideration of benefits the Licensor receives from
making the Licensed Material available under these terms and
conditions.
Section 1 -- Definitions.
a. Adapted Material means material subject to Copyright and Similar
Rights that is derived from or based upon the Licensed Material
and in which the Licensed Material is translated, altered,
arranged, transformed, or otherwise modified in a manner requiring
permission under the Copyright and Similar Rights held by the
Licensor. For purposes of this Public License, where the Licensed
Material is a musical work, performance, or sound recording,
Adapted Material is always produced where the Licensed Material is
synched in timed relation with a moving image.
b. Adapter's License means the license You apply to Your Copyright
and Similar Rights in Your contributions to Adapted Material in
accordance with the terms and conditions of this Public License.
c. BY-SA Compatible License means a license listed at
creativecommons.org/compatiblelicenses, approved by Creative
Commons as essentially the equivalent of this Public License.
d. Copyright and Similar Rights means copyright and/or similar rights
closely related to copyright including, without limitation,
performance, broadcast, sound recording, and Sui Generis Database
Rights, without regard to how the rights are labeled or
categorized. For purposes of this Public License, the rights
specified in Section 2(b)(1)-(2) are not Copyright and Similar
Rights.
e. Effective Technological Measures means those measures that, in the
absence of proper authority, may not be circumvented under laws
fulfilling obligations under Article 11 of the WIPO Copyright
Treaty adopted on December 20, 1996, and/or similar international
agreements.
f. Exceptions and Limitations means fair use, fair dealing, and/or
any other exception or limitation to Copyright and Similar Rights
that applies to Your use of the Licensed Material.
g. License Elements means the license attributes listed in the name
of a Creative Commons Public License. The License Elements of this
Public License are Attribution and ShareAlike.
h. Licensed Material means the artistic or literary work, database,
or other material to which the Licensor applied this Public
License.
i. Licensed Rights means the rights granted to You subject to the
terms and conditions of this Public License, which are limited to
all Copyright and Similar Rights that apply to Your use of the
Licensed Material and that the Licensor has authority to license.
j. Licensor means the individual(s) or entity(ies) granting rights
under this Public License.
k. Share means to provide material to the public by any means or
process that requires permission under the Licensed Rights, such
as reproduction, public display, public performance, distribution,
dissemination, communication, or importation, and to make material
available to the public including in ways that members of the
public may access the material from a place and at a time
individually chosen by them.
l. Sui Generis Database Rights means rights other than copyright
resulting from Directive 96/9/EC of the European Parliament and of
the Council of 11 March 1996 on the legal protection of databases,
as amended and/or succeeded, as well as other essentially
equivalent rights anywhere in the world.
m. You means the individual or entity exercising the Licensed Rights
under this Public License. Your has a corresponding meaning.
Section 2 -- Scope.
a. License grant.
1. Subject to the terms and conditions of this Public License,
the Licensor hereby grants You a worldwide, royalty-free,
non-sublicensable, non-exclusive, irrevocable license to
exercise the Licensed Rights in the Licensed Material to:
a. reproduce and Share the Licensed Material, in whole or
in part; and
b. produce, reproduce, and Share Adapted Material.
2. Exceptions and Limitations. For the avoidance of doubt, where
Exceptions and Limitations apply to Your use, this Public
License does not apply, and You do not need to comply with
its terms and conditions.
3. Term. The term of this Public License is specified in Section
6(a).
4. Media and formats; technical modifications allowed. The
Licensor authorizes You to exercise the Licensed Rights in
all media and formats whether now known or hereafter created,
and to make technical modifications necessary to do so. The
Licensor waives and/or agrees not to assert any right or
authority to forbid You from making technical modifications
necessary to exercise the Licensed Rights, including
technical modifications necessary to circumvent Effective
Technological Measures. For purposes of this Public License,
simply making modifications authorized by this Section 2(a)
(4) never produces Adapted Material.
5. Downstream recipients.
a. Offer from the Licensor -- Licensed Material. Every
recipient of the Licensed Material automatically
receives an offer from the Licensor to exercise the
Licensed Rights under the terms and conditions of this
Public License.
b. Additional offer from the Licensor -- Adapted Material.
Every recipient of Adapted Material from You
automatically receives an offer from the Licensor to
exercise the Licensed Rights in the Adapted Material
under the conditions of the Adapter's License You apply.
c. No downstream restrictions. You may not offer or impose
any additional or different terms or conditions on, or
apply any Effective Technological Measures to, the
Licensed Material if doing so restricts exercise of the
Licensed Rights by any recipient of the Licensed
Material.
6. No endorsement. Nothing in this Public License constitutes or
may be construed as permission to assert or imply that You
are, or that Your use of the Licensed Material is, connected
with, or sponsored, endorsed, or granted official status by,
the Licensor or others designated to receive attribution as
provided in Section 3(a)(1)(A)(i).
b. Other rights.
1. Moral rights, such as the right of integrity, are not
licensed under this Public License, nor are publicity,
privacy, and/or other similar personality rights; however, to
the extent possible, the Licensor waives and/or agrees not to
assert any such rights held by the Licensor to the limited
extent necessary to allow You to exercise the Licensed
Rights, but not otherwise.
2. Patent and trademark rights are not licensed under this
Public License.
3. To the extent possible, the Licensor waives any right to
collect royalties from You for the exercise of the Licensed
Rights, whether directly or through a collecting society
under any voluntary or waivable statutory or compulsory
licensing scheme. In all other cases the Licensor expressly
reserves any right to collect such royalties.
Section 3 -- License Conditions.
Your exercise of the Licensed Rights is expressly made subject to the
following conditions.
a. Attribution.
1. If You Share the Licensed Material (including in modified
form), You must:
a. retain the following if it is supplied by the Licensor
with the Licensed Material:
i. identification of the creator(s) of the Licensed
Material and any others designated to receive
attribution, in any reasonable manner requested by
the Licensor (including by pseudonym if
designated);
ii. a copyright notice;
iii. a notice that refers to this Public License;
iv. a notice that refers to the disclaimer of
warranties;
v. a URI or hyperlink to the Licensed Material to the
extent reasonably practicable;
b. indicate if You modified the Licensed Material and
retain an indication of any previous modifications; and
c. indicate the Licensed Material is licensed under this
Public License, and include the text of, or the URI or
hyperlink to, this Public License.
2. You may satisfy the conditions in Section 3(a)(1) in any
reasonable manner based on the medium, means, and context in
which You Share the Licensed Material. For example, it may be
reasonable to satisfy the conditions by providing a URI or
hyperlink to a resource that includes the required
information.
3. If requested by the Licensor, You must remove any of the
information required by Section 3(a)(1)(A) to the extent
reasonably practicable.
b. ShareAlike.
In addition to the conditions in Section 3(a), if You Share
Adapted Material You produce, the following conditions also apply.
1. The Adapter's License You apply must be a Creative Commons
license with the same License Elements, this version or
later, or a BY-SA Compatible License.
2. You must include the text of, or the URI or hyperlink to, the
Adapter's License You apply. You may satisfy this condition
in any reasonable manner based on the medium, means, and
context in which You Share Adapted Material.
3. You may not offer or impose any additional or different terms
or conditions on, or apply any Effective Technological
Measures to, Adapted Material that restrict exercise of the
rights granted under the Adapter's License You apply.
Section 4 -- Sui Generis Database Rights.
Where the Licensed Rights include Sui Generis Database Rights that
apply to Your use of the Licensed Material:
a. for the avoidance of doubt, Section 2(a)(1) grants You the right
to extract, reuse, reproduce, and Share all or a substantial
portion of the contents of the database;
b. if You include all or a substantial portion of the database
contents in a database in which You have Sui Generis Database
Rights, then the database in which You have Sui Generis Database
Rights (but not its individual contents) is Adapted Material,
including for purposes of Section 3(b); and
c. You must comply with the conditions in Section 3(a) if You Share
all or a substantial portion of the contents of the database.
For the avoidance of doubt, this Section 4 supplements and does not
replace Your obligations under this Public License where the Licensed
Rights include other Copyright and Similar Rights.
Section 5 -- Disclaimer of Warranties and Limitation of Liability.
a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE
EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS
AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF
ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS,
IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION,
WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR
PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS,
ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT
KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT
ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU.
b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE
TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION,
NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT,
INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES,
COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR
USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN
ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR
DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR
IN PART, THIS LIMITATION MAY NOT APPLY TO YOU.
c. The disclaimer of warranties and limitation of liability provided
above shall be interpreted in a manner that, to the extent
possible, most closely approximates an absolute disclaimer and
waiver of all liability.
Section 6 -- Term and Termination.
a. This Public License applies for the term of the Copyright and
Similar Rights licensed here. However, if You fail to comply with
this Public License, then Your rights under this Public License
terminate automatically.
b. Where Your right to use the Licensed Material has terminated under
Section 6(a), it reinstates:
1. automatically as of the date the violation is cured, provided
it is cured within 30 days of Your discovery of the
violation; or
2. upon express reinstatement by the Licensor.
For the avoidance of doubt, this Section 6(b) does not affect any
right the Licensor may have to seek remedies for Your violations
of this Public License.
c. For the avoidance of doubt, the Licensor may also offer the
Licensed Material under separate terms or conditions or stop
distributing the Licensed Material at any time; however, doing so
will not terminate this Public License.
d. Sections 1, 5, 6, 7, and 8 survive termination of this Public
License.
Section 7 -- Other Terms and Conditions.
a. The Licensor shall not be bound by any additional or different
terms or conditions communicated by You unless expressly agreed.
b. Any arrangements, understandings, or agreements regarding the
Licensed Material not stated herein are separate from and
independent of the terms and conditions of this Public License.
Section 8 -- Interpretation.
a. For the avoidance of doubt, this Public License does not, and
shall not be interpreted to, reduce, limit, restrict, or impose
conditions on any use of the Licensed Material that could lawfully
be made without permission under this Public License.
b. To the extent possible, if any provision of this Public License is
deemed unenforceable, it shall be automatically reformed to the
minimum extent necessary to make it enforceable. If the provision
cannot be reformed, it shall be severed from this Public License
without affecting the enforceability of the remaining terms and
conditions.
c. No term or condition of this Public License will be waived and no
failure to comply consented to unless expressly agreed to by the
Licensor.
d. Nothing in this Public License constitutes or may be interpreted
as a limitation upon, or waiver of, any privileges and immunities
that apply to the Licensor or You, including from the legal
processes of any jurisdiction or authority.
=======================================================================
Creative Commons is not a party to its public licenses.
Notwithstanding, Creative Commons may elect to apply one of its public
licenses to material it publishes and in those instances will be
considered the "Licensor." Except for the limited purpose of indicating
that material is shared under a Creative Commons public license or as
otherwise permitted by the Creative Commons policies published at
creativecommons.org/policies, Creative Commons does not authorize the
use of the trademark "Creative Commons" or any other trademark or logo
of Creative Commons without its prior written consent including,
without limitation, in connection with any unauthorized modifications
to any of its public licenses or any other arrangements,
understandings, or agreements concerning use of licensed material. For
the avoidance of doubt, this paragraph does not form part of the public
licenses.
Creative Commons may be contacted at creativecommons.org.

View file

@ -68,6 +68,10 @@ func main() {
You can find other usage examples for `libkv` under the `docker/swarm` or `docker/libnetwork` repositories.
## TLS
The etcd backend supports etcd servers that require TLS Client Authentication. Zookeeper and Consul support are planned. This feature is somewhat experimental and the store.ClientTLSConfig struct may change to accommodate the additional backends.
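A minimal sketch of wiring client certificates through `store.Config`; the endpoint and certificate paths below are placeholders:
```go
kv, err := libkv.NewStore(
	store.ETCD,
	[]string{"etcd.example.com:2379"},
	&store.Config{
		ClientTLS: &store.ClientTLSConfig{
			CertFile:   "/etc/ssl/etcd/client.pem",
			KeyFile:    "/etc/ssl/etcd/client-key.pem",
			CACertFile: "/etc/ssl/etcd/ca.pem",
		},
	},
)
if err != nil {
	log.Fatal("cannot create store:", err)
}
```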
## Warning
There are a few consistency issues with *etcd* around the notions of *directory* and *key*. If you want to use the three KV backends in an interchangeable way, you should only put data on leaves (see [Issue 20](https://github.com/docker/libkv/issues/20) for more details). This will be fixed when the *etcd* API v3 becomes available (API v3 drops the *directory/key* distinction). An official tagged release of *libkv* is likely to come after this issue is marked as **solved**.
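In practice this means writing values only at leaf paths and treating intermediate segments purely as directories, as in this sketch (the `kv` store handle and key names are assumed):
```go
// Portable across consul/etcd/zk: the value lives on the leaf only.
_ = kv.Put("nodes/node1/addr", []byte("192.168.1.10:2375"), nil)
// Avoid also storing a value at "nodes/node1": etcd treats it as a
// directory and rejects a value write there, while consul accepts it.
```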
@ -113,4 +117,4 @@ Want to hack on libkv? [Docker's contributions guidelines](https://github.com/do
## Copyright and license
Code and documentation copyright 2015 Docker, inc. Code released under the Apache 2.0 license. Docs released under Creative commons.
Copyright © 2014-2015 Docker, Inc. All rights reserved, except as follows. Code is released under the Apache 2.0 license. Documentation is licensed to end users under the Creative Commons Attribution-ShareAlike 4.0 International License under the terms and conditions set forth in the file "LICENSE.docs". You may obtain a duplicate copy of the same license, titled CC-BY-SA-4.0, at http://creativecommons.org/licenses/by-sa/4.0/.

View file

@ -0,0 +1,327 @@
package boltdb
import (
"bytes"
"encoding/binary"
"errors"
"os"
"path/filepath"
"sync/atomic"
"github.com/boltdb/bolt"
"github.com/docker/libkv"
"github.com/docker/libkv/store"
)
var (
// ErrMultipleEndpointsUnsupported is thrown when multiple endpoints are specified
// for BoltDB. The endpoint has to be a local file path.
ErrMultipleEndpointsUnsupported = errors.New("boltdb supports one endpoint and should be a file path")
// ErrBoltBucketNotFound is thrown when the specified BoltDB bucket doesn't exist in the DB.
ErrBoltBucketNotFound = errors.New("boltdb bucket doesn't exist")
// ErrBoltBucketOptionMissing is thrown when the boltBucket config option is missing.
ErrBoltBucketOptionMissing = errors.New("boltBucket config option missing")
// ErrBoltAPIUnsupported is thrown when an API unsupported by the BoltDB backend is called.
ErrBoltAPIUnsupported = errors.New("API not supported by BoltDB backend")
)
// BoltDB type implements the Store interface
type BoltDB struct {
client *bolt.DB
boltBucket []byte
dbIndex uint64
}
const (
libkvmetadatalen = 8
)
// Register registers boltdb to libkv
func Register() {
libkv.AddStore(store.BOLTDB, New)
}
// New opens a new BoltDB connection to the specified path and bucket
func New(endpoints []string, options *store.Config) (store.Store, error) {
if len(endpoints) > 1 {
return nil, ErrMultipleEndpointsUnsupported
}
if (options == nil) || (len(options.Bucket) == 0) {
return nil, ErrBoltBucketOptionMissing
}
dir, _ := filepath.Split(endpoints[0])
if err := os.MkdirAll(dir, 0750); err != nil {
return nil, err
}
var boltOptions *bolt.Options
if options != nil {
boltOptions = &bolt.Options{Timeout: options.ConnectionTimeout}
}
db, err := bolt.Open(endpoints[0], 0644, boltOptions)
if err != nil {
return nil, err
}
b := &BoltDB{}
b.client = db
b.boltBucket = []byte(options.Bucket)
return b, nil
}
// Get the value at "key". BoltDB doesn't provide an inbuilt last modified index with every kv pair. It's implemented
// by an atomic counter maintained by libkv and appended to the value passed by the client.
func (b *BoltDB) Get(key string) (*store.KVPair, error) {
var val []byte
db := b.client
err := db.View(func(tx *bolt.Tx) error {
bucket := tx.Bucket(b.boltBucket)
if bucket == nil {
return ErrBoltBucketNotFound
}
val = bucket.Get([]byte(key))
return nil
})
if len(val) == 0 {
return nil, store.ErrKeyNotFound
}
if err != nil {
return nil, err
}
dbIndex := binary.LittleEndian.Uint64(val[:libkvmetadatalen])
val = val[libkvmetadatalen:]
return &store.KVPair{Key: key, Value: val, LastIndex: dbIndex}, nil
}
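The bookkeeping above gives every stored value the layout `[8-byte little-endian index | client value]`. A standalone sketch of the round trip:
```go
// Encode as Put does, decode as Get does.
func encodeDecode() {
	buf := make([]byte, 8)
	binary.LittleEndian.PutUint64(buf, 42)
	stored := append(buf, []byte("payload")...)

	index := binary.LittleEndian.Uint64(stored[:8]) // 42 (what Get strips off)
	value := stored[8:]                             // "payload" (what Get returns)
	fmt.Println(index, string(value))
}
```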
// Put the key/value pair. Index number metadata is prepended to the value.
func (b *BoltDB) Put(key string, value []byte, opts *store.WriteOptions) error {
var dbIndex uint64
db := b.client
dbval := make([]byte, libkvmetadatalen)
err := db.Update(func(tx *bolt.Tx) error {
bucket, err := tx.CreateBucketIfNotExists(b.boltBucket)
if err != nil {
return err
}
dbIndex = atomic.AddUint64(&b.dbIndex, 1)
binary.LittleEndian.PutUint64(dbval, dbIndex)
dbval = append(dbval, value...)
err = bucket.Put([]byte(key), dbval)
if err != nil {
return err
}
return nil
})
return err
}
// Delete the value for the given key.
func (b *BoltDB) Delete(key string) error {
db := b.client
err := db.Update(func(tx *bolt.Tx) error {
bucket := tx.Bucket(b.boltBucket)
if bucket == nil {
return ErrBoltBucketNotFound
}
err := bucket.Delete([]byte(key))
return err
})
return err
}
// Exists checks if the key exists inside the store
func (b *BoltDB) Exists(key string) (bool, error) {
var val []byte
db := b.client
err := db.View(func(tx *bolt.Tx) error {
bucket := tx.Bucket(b.boltBucket)
if bucket == nil {
return ErrBoltBucketNotFound
}
val = bucket.Get([]byte(key))
return nil
})
if len(val) == 0 {
return false, err
}
return true, err
}
// List returns the range of keys starting with the passed in prefix
func (b *BoltDB) List(keyPrefix string) ([]*store.KVPair, error) {
kv := []*store.KVPair{}
db := b.client
err := db.View(func(tx *bolt.Tx) error {
bucket := tx.Bucket(b.boltBucket)
if bucket == nil {
return ErrBoltBucketNotFound
}
cursor := bucket.Cursor()
prefix := []byte(keyPrefix)
for key, val := cursor.Seek(prefix); bytes.HasPrefix(key, prefix); key, val = cursor.Next() {
dbIndex := binary.LittleEndian.Uint64(val[:libkvmetadatalen])
val = val[libkvmetadatalen:]
kv = append(kv, &store.KVPair{
Key: string(key),
Value: val,
LastIndex: dbIndex,
})
}
return nil
})
if len(kv) == 0 {
return nil, store.ErrKeyNotFound
}
return kv, err
}
// AtomicDelete deletes a value at "key" if the key
// has not been modified in the meantime, throws an
// error if this is the case
func (b *BoltDB) AtomicDelete(key string, previous *store.KVPair) (bool, error) {
var val []byte
var dbIndex uint64
if previous == nil {
return false, store.ErrPreviousNotSpecified
}
db := b.client
err := db.Update(func(tx *bolt.Tx) error {
bucket := tx.Bucket(b.boltBucket)
if bucket == nil {
return ErrBoltBucketNotFound
}
val = bucket.Get([]byte(key))
if len(val) == 0 {
return store.ErrKeyNotFound
}
dbIndex = binary.LittleEndian.Uint64(val[:libkvmetadatalen])
if dbIndex != previous.LastIndex {
return store.ErrKeyModified
}
err := bucket.Delete([]byte(key))
return err
})
if err != nil {
return false, err
}
return true, err
}
// AtomicPut puts a value at "key" if the key has not been
// modified since the last Put, throws an error if this is the case
func (b *BoltDB) AtomicPut(key string, value []byte, previous *store.KVPair, options *store.WriteOptions) (bool, *store.KVPair, error) {
var val []byte
var dbIndex uint64
dbval := make([]byte, libkvmetadatalen)
db := b.client
err := db.Update(func(tx *bolt.Tx) error {
var err error
bucket := tx.Bucket(b.boltBucket)
if bucket == nil {
if previous != nil {
return ErrBoltBucketNotFound
}
bucket, err = tx.CreateBucket(b.boltBucket)
if err != nil {
return err
}
}
// AtomicPut is equivalent to Put if previous is nil and the key
// doesn't exist in the DB.
val = bucket.Get([]byte(key))
if previous == nil && len(val) != 0 {
return store.ErrKeyModified
}
if previous != nil {
if len(val) == 0 {
return store.ErrKeyNotFound
}
dbIndex = binary.LittleEndian.Uint64(val[:libkvmetadatalen])
if dbIndex != previous.LastIndex {
return store.ErrKeyModified
}
}
dbIndex = atomic.AddUint64(&b.dbIndex, 1)
binary.LittleEndian.PutUint64(dbval, dbIndex)
dbval = append(dbval, value...)
return bucket.Put([]byte(key), dbval)
})
if err != nil {
return false, nil, err
}
updated := &store.KVPair{
Key: key,
Value: value,
LastIndex: dbIndex,
}
return true, updated, nil
}
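A typical compare-and-swap loop over this call might look as follows; the `kv` handle and the `next` helper are assumptions, not part of this backend:
```go
// Retry until the read-modify-write lands on an unmodified key.
func casUpdate(kv store.Store, key string, next func(*store.KVPair) []byte) error {
	for {
		pair, err := kv.Get(key)
		if err != nil && err != store.ErrKeyNotFound {
			return err
		}
		// With pair == nil this behaves like a plain Put on a fresh key.
		ok, _, err := kv.AtomicPut(key, next(pair), pair, nil)
		if err == store.ErrKeyModified {
			continue // raced with another writer; retry
		}
		if err != nil {
			return err
		}
		if ok {
			return nil
		}
	}
}
```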
// Close the db connection to the BoltDB
func (b *BoltDB) Close() {
db := b.client
db.Close()
}
// DeleteTree deletes a range of keys with a given prefix
func (b *BoltDB) DeleteTree(keyPrefix string) error {
db := b.client
err := db.Update(func(tx *bolt.Tx) error {
bucket := tx.Bucket(b.boltBucket)
if bucket == nil {
return ErrBoltBucketNotFound
}
cursor := bucket.Cursor()
prefix := []byte(keyPrefix)
for key, _ := cursor.Seek(prefix); bytes.HasPrefix(key, prefix); key, _ = cursor.Next() {
_ = bucket.Delete([]byte(key))
}
return nil
})
return err
}
// NewLock has to be implemented at the library level since it's not supported by BoltDB
func (b *BoltDB) NewLock(key string, options *store.LockOptions) (store.Locker, error) {
return nil, ErrBoltAPIUnsupported
}
// Watch has to be implemented at the library level since it's not supported by BoltDB
func (b *BoltDB) Watch(key string, stopCh <-chan struct{}) (<-chan *store.KVPair, error) {
return nil, ErrBoltAPIUnsupported
}
// WatchTree has to be implemented at the library level since it's not supported by BoltDB
func (b *BoltDB) WatchTree(directory string, stopCh <-chan struct{}) (<-chan []*store.KVPair, error) {
return nil, ErrBoltAPIUnsupported
}
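A minimal end-to-end sketch of this backend through the `libkv` front door; the database path and bucket name are placeholders:
```go
package main

import (
	"fmt"
	"log"
	"time"

	"github.com/docker/libkv"
	"github.com/docker/libkv/store"
	"github.com/docker/libkv/store/boltdb"
)

func main() {
	boltdb.Register()
	kv, err := libkv.NewStore(store.BOLTDB, []string{"/tmp/libkv.db"}, &store.Config{
		Bucket:            "example",
		ConnectionTimeout: 10 * time.Second,
	})
	if err != nil {
		log.Fatal(err)
	}
	defer kv.Close()

	if err := kv.Put("hello", []byte("world"), nil); err != nil {
		log.Fatal(err)
	}
	pair, err := kv.Get("hello")
	if err != nil {
		log.Fatal(err)
	}
	fmt.Printf("%s -> %s (index %d)\n", pair.Key, pair.Value, pair.LastIndex)
}
```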

View file

@ -43,13 +43,28 @@ func Register() {
func New(addrs []string, options *store.Config) (store.Store, error) {
s := &Etcd{}
entries := store.CreateEndpoints(addrs, "http")
s.client = etcd.NewClient(entries)
var (
entries []string
err error
)
// Create the etcd client
if options != nil && options.ClientTLS != nil {
entries = store.CreateEndpoints(addrs, "https")
s.client, err = etcd.NewTLSClient(entries, options.ClientTLS.CertFile, options.ClientTLS.KeyFile, options.ClientTLS.CACertFile)
if err != nil {
return nil, err
}
} else {
entries = store.CreateEndpoints(addrs, "http")
s.client = etcd.NewClient(entries)
}
// Set options
if options != nil {
// Plain TLS config overrides ClientTLS if specified
if options.TLS != nil {
s.setTLS(options.TLS)
s.setTLS(options.TLS, addrs)
}
if options.ConnectionTimeout != 0 {
s.setTimeout(options.ConnectionTimeout)
@ -67,16 +82,10 @@ func New(addrs []string, options *store.Config) (store.Store, error) {
return s, nil
}
// SetTLS sets the tls configuration given the path
// of certificate files
func (s *Etcd) setTLS(tls *tls.Config) {
// Change to https scheme
var addrs []string
entries := s.client.GetCluster()
for _, entry := range entries {
addrs = append(addrs, strings.Replace(entry, "http", "https", -1))
}
s.client.SetCluster(addrs)
// SetTLS sets the tls configuration given a tls.Config scheme
func (s *Etcd) setTLS(tls *tls.Config, addrs []string) {
entries := store.CreateEndpoints(addrs, "https")
s.client.SetCluster(entries)
// Set transport
t := http.Transport{

View file

@ -39,11 +39,20 @@ var (
// Config contains the options for a storage client
type Config struct {
ClientTLS *ClientTLSConfig
TLS *tls.Config
ConnectionTimeout time.Duration
Bucket string
}
// ClientTLSConfig contains data for a Client TLS configuration in the form
// the etcd client wants it. Eventually we'll adapt it for ZK and Consul.
type ClientTLSConfig struct {
CertFile string
KeyFile string
CACertFile string
}
// Store represents the backend K/V storage
// Each store should support every call listed
// here. Or it couldn't be implemented as a K/V

View file

@ -10,15 +10,10 @@ cidocker = docker run ${ciargs} ${dockerargs} golang:1.4
all: ${build_image}.created build check integration-tests clean
integration-tests: ./cmd/dnet/dnet
@if [ ! -d ./integration-tmp ]; then \
mkdir -p ./integration-tmp; \
git clone https://github.com/sstephenson/bats.git ./integration-tmp/bats; \
./integration-tmp/bats/install.sh ./integration-tmp; \
fi
@./integration-tmp/bin/bats ./test/integration/dnet
@./test/integration/dnet/run-integration-tests.sh
./cmd/dnet/dnet:
make build-local
make build
clean:
@if [ -e ./cmd/dnet/dnet ]; then \
@ -76,7 +71,7 @@ run-tests:
done
@echo "Done running tests"
check-local: check-format check-code start-services run-tests
check-local: check-format check-code start-services run-tests
install-deps:
apt-get update && apt-get -y install iptables zookeeperd

View file

@ -17,61 +17,55 @@ There are many networking solutions available to suit a broad range of use-cases
```go
// Create a new controller instance
controller, err := libnetwork.New()
if err != nil {
return
}
// Select and configure the network driver
networkType := "bridge"
// Select and configure the network driver
networkType := "bridge"
// Create a new controller instance
driverOptions := options.Generic{}
genericOption := make(map[string]interface{})
genericOption[netlabel.GenericData] = driverOptions
controller, err := libnetwork.New(config.OptionDriverConfig(networkType, genericOption))
if err != nil {
return
}
driverOptions := options.Generic{}
genericOption := make(map[string]interface{})
genericOption[netlabel.GenericData] = driverOptions
err = controller.ConfigureNetworkDriver(networkType, genericOption)
if err != nil {
return
}
// Create a network for containers to join.
// NewNetwork accepts Variadic optional arguments that libnetwork and Drivers can use.
network, err := controller.NewNetwork(networkType, "network1")
if err != nil {
return
}
// Create a network for containers to join.
// NewNetwork accepts Variadic optional arguments that libnetwork and Drivers can use.
network, err := controller.NewNetwork(networkType, "network1")
if err != nil {
return
}
// For each new container: allocate IP and interfaces. The returned network
// settings will be used for container infos (inspect and such), as well as
// iptables rules for port publishing. This info is contained or accessible
// from the returned endpoint.
ep, err := network.CreateEndpoint("Endpoint1")
if err != nil {
return
}
// For each new container: allocate IP and interfaces. The returned network
// settings will be used for container infos (inspect and such), as well as
// iptables rules for port publishing. This info is contained or accessible
// from the returned endpoint.
ep, err := network.CreateEndpoint("Endpoint1")
if err != nil {
return
}
// Create the sandbox for the container.
// NewSandbox accepts Variadic optional arguments which libnetwork can use.
sbx, err := controller.NewSandbox("container1",
libnetwork.OptionHostname("test"),
libnetwork.OptionDomainname("docker.io"))
// Create the sandbox for the container.
sbx, err := controller.NewSandbox("container1",
libnetwork.OptionHostname("test"),
libnetwork.OptionDomainname("docker.io"))
// A sandbox can join the endpoint via the join api.
// Join accepts Variadic arguments which libnetwork and Drivers can use.
err = ep.Join(sbx)
if err != nil {
return
}
// A sandbox can join the endpoint via the join api.
err = ep.Join(sbx)
if err != nil {
return
}
// libnetwork client can check the endpoint's operational data via the Info() API
epInfo, err := ep.DriverInfo()
mapData, ok := epInfo[netlabel.PortMap]
// libnetwork client can check the endpoint's operational data via the Info() API
epInfo, err := ep.DriverInfo()
mapData, ok := epInfo[netlabel.PortMap]
if ok {
portMapping, ok := mapData.([]types.PortBinding)
if ok {
portMapping, ok := mapData.([]types.PortBinding)
if ok {
fmt.Printf("Current port mapping for endpoint %s: %v", ep.Name(), portMapping)
}
fmt.Printf("Current port mapping for endpoint %s: %v", ep.Name(), portMapping)
}
}
```
#### Current Status
Please watch this space for updates on the progress.
@ -87,4 +81,3 @@ Want to hack on libnetwork? [Docker's contributions guidelines](https://github.c
## Copyright and license
Code and documentation copyright 2015 Docker, Inc. Code released under the Apache 2.0 license. Docs released under Creative Commons.

View file

@ -291,9 +291,6 @@ func processCreateDefaults(c libnetwork.NetworkController, nc *networkCreate) {
if _, ok := gData["BridgeName"]; !ok {
gData["BridgeName"] = nc.Name
}
if _, ok := gData["AllowNonDefaultBridge"]; !ok {
gData["AllowNonDefaultBridge"] = "true"
}
nc.Options[netlabel.GenericData] = genericData
}
}

View file

@ -87,12 +87,12 @@ func (s *sequence) toString() string {
}
// GetAvailableBit returns the position of the first unset bit in the bitmask represented by this sequence
func (s *sequence) getAvailableBit() (uint32, uint32, error) {
func (s *sequence) getAvailableBit(from uint32) (uint32, uint32, error) {
if s.block == blockMAX || s.count == 0 {
return invalidPos, invalidPos, fmt.Errorf("no available bit")
return invalidPos, invalidPos, errNoBitAvailable
}
bits := uint32(0)
bitSel := blockFirstBit
bits := from
bitSel := blockFirstBit >> from
for bitSel > 0 && s.block&bitSel != 0 {
bitSel >>= 1
bits++
@ -186,12 +186,23 @@ func (h *Handle) getCopy() *Handle {
}
}
// SetAnyInRange atomically sets the first unset bit in the specified range in the sequence and returns the corresponding ordinal
func (h *Handle) SetAnyInRange(start, end uint32) (uint32, error) {
if end <= start || end >= h.bits {
return invalidPos, fmt.Errorf("invalid bit range [%d, %d]", start, end)
}
if h.Unselected() == 0 {
return invalidPos, errNoBitAvailable
}
return h.set(0, start, end, true, false)
}
// SetAny atomically sets the first unset bit in the sequence and returns the corresponding ordinal
func (h *Handle) SetAny() (uint32, error) {
if h.Unselected() == 0 {
return invalidPos, errNoBitAvailable
}
return h.set(0, true, false)
return h.set(0, 0, h.bits-1, true, false)
}
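A sketch of ranged allocation from the caller's side; the `NewHandle` arguments follow this vendoring's signature, and the nil datastore (no persistence) is an assumption:
```go
// Reserve the first free ordinal within [128, 255] of a 1024-bit mask.
func reserveInRange() (uint32, error) {
	h, err := bitseq.NewHandle("ipam", nil, "pool0", 1024)
	if err != nil {
		return 0, err
	}
	// Returns errNoBitAvailable once the range is exhausted.
	return h.SetAnyInRange(128, 255)
}
```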
// Set atomically sets the corresponding bit in the sequence
@ -199,7 +210,7 @@ func (h *Handle) Set(ordinal uint32) error {
if err := h.validateOrdinal(ordinal); err != nil {
return err
}
_, err := h.set(ordinal, false, false)
_, err := h.set(ordinal, 0, 0, false, false)
return err
}
@ -208,7 +219,7 @@ func (h *Handle) Unset(ordinal uint32) error {
if err := h.validateOrdinal(ordinal); err != nil {
return err
}
_, err := h.set(ordinal, false, true)
_, err := h.set(ordinal, 0, 0, false, true)
return err
}
@ -225,7 +236,7 @@ func (h *Handle) IsSet(ordinal uint32) bool {
}
// set/reset the bit
func (h *Handle) set(ordinal uint32, any bool, release bool) (uint32, error) {
func (h *Handle) set(ordinal, start, end uint32, any bool, release bool) (uint32, error) {
var (
bitPos uint32
bytePos uint32
@ -240,8 +251,11 @@ func (h *Handle) set(ordinal uint32, any bool, release bool) (uint32, error) {
bytePos, bitPos = ordinalToPos(ordinal)
} else {
if any {
bytePos, bitPos, err = getFirstAvailable(h.head)
bytePos, bitPos, err = getFirstAvailable(h.head, start)
ret = posToOrdinal(bytePos, bitPos)
if end < ret {
err = errNoBitAvailable
}
} else {
bytePos, bitPos, err = checkIfAvailable(h.head, ordinal)
ret = ordinal
@ -285,7 +299,7 @@ func (h *Handle) set(ordinal uint32, any bool, release bool) (uint32, error) {
// check is needed to cover the case where the number of bits is not a multiple of blockLen
func (h *Handle) validateOrdinal(ordinal uint32) error {
if ordinal > h.bits {
if ordinal >= h.bits {
return fmt.Errorf("bit does not belong to the sequence")
}
return nil
@ -353,16 +367,24 @@ func (h *Handle) String() string {
h.app, h.id, h.dbIndex, h.bits, h.unselected, h.head.toString())
}
// getFirstAvailable looks for the first unset bit in passed mask
func getFirstAvailable(head *sequence) (uint32, uint32, error) {
byteIndex := uint32(0)
current := head
// getFirstAvailable looks for the first unset bit in passed mask starting from start
func getFirstAvailable(head *sequence, start uint32) (uint32, uint32, error) {
// Find sequence which contains the start bit
byteStart, bitStart := ordinalToPos(start)
current, _, _, inBlockBytePos := findSequence(head, byteStart)
// Derive this sequence's offsets
byteOffset := byteStart - inBlockBytePos
bitOffset := inBlockBytePos*8 + bitStart
for current != nil {
if current.block != blockMAX {
bytePos, bitPos, err := current.getAvailableBit()
return byteIndex + bytePos, bitPos, err
bytePos, bitPos, err := current.getAvailableBit(bitOffset)
return byteOffset + bytePos, bitPos, err
}
byteIndex += current.count * blockBytes
// Moving to next block: Reset bit offset.
bitOffset = 0
byteOffset += current.count * blockBytes
current = current.next
}
return invalidPos, invalidPos, errNoBitAvailable
@ -371,8 +393,7 @@ func getFirstAvailable(head *sequence) (uint32, uint32, error) {
// checkIfAvailable checks if the bit correspondent to the specified ordinal is unset
// If the ordinal is beyond the sequence limits, a negative response is returned
func checkIfAvailable(head *sequence, ordinal uint32) (uint32, uint32, error) {
bytePos := ordinal / 8
bitPos := ordinal % 8
bytePos, bitPos := ordinalToPos(ordinal)
// Find the sequence containing this byte
current, _, _, inBlockBytePos := findSequence(head, bytePos)

View file

@ -70,6 +70,16 @@ func (h *Handle) Exists() bool {
return h.dbExists
}
// Skip provides a way for a KV Object to avoid persisting it in the KV Store
func (h *Handle) Skip() bool {
return false
}
// DataScope method returns the storage scope of the datastore
func (h *Handle) DataScope() datastore.DataScope {
return datastore.GlobalScope
}
func (h *Handle) watchForChanges() error {
h.Lock()
store := h.store

View file

@ -236,21 +236,22 @@ func (cli *NetworkCli) CmdServiceLs(chain string, args ...string) error {
wr := tabwriter.NewWriter(cli.out, 20, 1, 3, ' ', 0)
// unless quiet (-q) is specified, print field titles
if !*quiet {
fmt.Fprintln(wr, "SERVICE ID\tNAME\tNETWORK\tCONTAINER")
fmt.Fprintln(wr, "SERVICE ID\tNAME\tNETWORK\tCONTAINER\tSANDBOX")
}
for _, sr := range serviceResources {
ID := sr.ID
bkID, err := getBackendID(cli, ID)
bkID, sbID, err := getBackendID(cli, ID)
if err != nil {
return err
}
if !*noTrunc {
ID = stringid.TruncateID(ID)
bkID = stringid.TruncateID(bkID)
sbID = stringid.TruncateID(sbID)
}
if !*quiet {
fmt.Fprintf(wr, "%s\t%s\t%s\t%s\n", ID, sr.Name, sr.Network, bkID)
fmt.Fprintf(wr, "%s\t%s\t%s\t%s\t%s\n", ID, sr.Name, sr.Network, bkID, sbID)
} else {
fmt.Fprintln(wr, ID)
}
@ -260,24 +261,26 @@ func (cli *NetworkCli) CmdServiceLs(chain string, args ...string) error {
return nil
}
func getBackendID(cli *NetworkCli, servID string) (string, error) {
func getBackendID(cli *NetworkCli, servID string) (string, string, error) {
var (
obj []byte
err error
bk string
sb string
)
if obj, _, err = readBody(cli.call("GET", "/services/"+servID+"/backend", nil, nil)); err == nil {
var sr SandboxResource
if err := json.NewDecoder(bytes.NewReader(obj)).Decode(&sr); err == nil {
bk = sr.ContainerID
sb = sr.ID
} else {
// Only print a message, don't make the calling CLI fail for this
fmt.Fprintf(cli.out, "Failed to retrieve backend list for service %s (%v)\n", servID, err)
}
}
return bk, err
return bk, sb, err
}
// CmdServiceInfo handles service info UI

View file

@ -5,14 +5,15 @@ import (
"github.com/BurntSushi/toml"
log "github.com/Sirupsen/logrus"
"github.com/docker/libkv/store"
"github.com/docker/libnetwork/netlabel"
)
// Config encapsulates configurations of various Libnetwork components
type Config struct {
Daemon DaemonCfg
Cluster ClusterCfg
Datastore DatastoreCfg
Daemon DaemonCfg
Cluster ClusterCfg
GlobalStore, LocalStore DatastoreCfg
}
// DaemonCfg represents libnetwork core configuration
@ -21,6 +22,7 @@ type DaemonCfg struct {
DefaultNetwork string
DefaultDriver string
Labels []string
DriverCfg map[string]interface{}
}
// ClusterCfg represents cluster configuration
@ -40,6 +42,7 @@ type DatastoreCfg struct {
type DatastoreClientCfg struct {
Provider string
Address string
Config *store.Config
}
// ParseConfig parses the libnetwork configuration file
@ -71,6 +74,13 @@ func OptionDefaultDriver(dd string) Option {
}
}
// OptionDriverConfig returns an option setter for driver configuration.
func OptionDriverConfig(networkType string, config map[string]interface{}) Option {
return func(c *Config) {
c.Daemon.DriverCfg[networkType] = config
}
}
// OptionLabels function returns an option setter for labels
func OptionLabels(labels []string) Option {
return func(c *Config) {
@ -86,7 +96,7 @@ func OptionLabels(labels []string) Option {
func OptionKVProvider(provider string) Option {
return func(c *Config) {
log.Infof("Option OptionKVProvider: %s", provider)
c.Datastore.Client.Provider = strings.TrimSpace(provider)
c.GlobalStore.Client.Provider = strings.TrimSpace(provider)
}
}
@ -94,7 +104,7 @@ func OptionKVProvider(provider string) Option {
func OptionKVProviderURL(url string) Option {
return func(c *Config) {
log.Infof("Option OptionKVProviderURL: %s", url)
c.Datastore.Client.Address = strings.TrimSpace(url)
c.GlobalStore.Client.Address = strings.TrimSpace(url)
}
}
@ -114,3 +124,27 @@ func IsValidName(name string) bool {
}
return true
}
// OptionLocalKVProvider function returns an option setter for kvstore provider
func OptionLocalKVProvider(provider string) Option {
return func(c *Config) {
log.Infof("Option OptionLocalKVProvider: %s", provider)
c.LocalStore.Client.Provider = strings.TrimSpace(provider)
}
}
// OptionLocalKVProviderURL function returns an option setter for kvstore url
func OptionLocalKVProviderURL(url string) Option {
return func(c *Config) {
log.Infof("Option OptionLocalKVProviderURL: %s", url)
c.LocalStore.Client.Address = strings.TrimSpace(url)
}
}
// OptionLocalKVProviderConfig function returns an option setter for kvstore config
func OptionLocalKVProviderConfig(config *store.Config) Option {
return func(c *Config) {
log.Infof("Option OptionLocalKVProviderConfig: %v", config)
c.LocalStore.Client.Config = config
}
}
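A sketch of combining the new local-store options when creating a controller; the provider string, path, and bucket are placeholders:
```go
func newController() (libnetwork.NetworkController, error) {
	return libnetwork.New(
		config.OptionLocalKVProvider("boltdb"),
		config.OptionLocalKVProviderURL("/var/lib/docker/network/files/local-kv.db"),
		config.OptionLocalKVProviderConfig(&store.Config{
			Bucket:            "libnetwork",
			ConnectionTimeout: time.Second,
		}),
	)
}
```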

View file

@ -2,16 +2,13 @@
Package libnetwork provides the basic functionality and extension points to
create network namespaces and allocate interfaces for containers to use.
// Create a new controller instance
controller, _err := libnetwork.New(nil)
// Select and configure the network driver
networkType := "bridge"
// Create a new controller instance
driverOptions := options.Generic{}
genericOption := make(map[string]interface{})
genericOption[netlabel.GenericData] = driverOptions
err := controller.ConfigureNetworkDriver(networkType, genericOption)
controller, err := libnetwork.New(config.OptionDriverConfig(networkType, genericOption))
if err != nil {
return
}
@ -32,11 +29,14 @@ create network namespaces and allocate interfaces for containers to use.
return
}
// A container can join the endpoint by providing the container ID to the join api.
// Join accepts Variadic arguments which will be made use of by libnetwork and Drivers
err = ep.Join("container1",
libnetwork.JoinOptionHostname("test"),
libnetwork.JoinOptionDomainname("docker.io"))
// Create the sandbox for the container.
// NewSandbox accepts Variadic optional arguments which libnetwork can use.
sbx, err := controller.NewSandbox("container1",
libnetwork.OptionHostname("test"),
libnetwork.OptionDomainname("docker.io"))
// A sandbox can join the endpoint via the join api.
err = ep.Join(sbx)
if err != nil {
return
}
@ -47,7 +47,6 @@ import (
"container/heap"
"fmt"
"net"
"strings"
"sync"
log "github.com/Sirupsen/logrus"
@ -57,7 +56,6 @@ import (
"github.com/docker/libnetwork/datastore"
"github.com/docker/libnetwork/driverapi"
"github.com/docker/libnetwork/hostdiscovery"
"github.com/docker/libnetwork/netlabel"
"github.com/docker/libnetwork/osl"
"github.com/docker/libnetwork/types"
)
@ -68,9 +66,6 @@ type NetworkController interface {
// ID provides an unique identity for the controller
ID() string
// ConfigureNetworkDriver applies the passed options to the driver instance for the specified network type
ConfigureNetworkDriver(networkType string, options map[string]interface{}) error
// Config method returns the bootup configuration for the controller
Config() config.Config
@ -125,13 +120,13 @@ type endpointTable map[string]*endpoint
type sandboxTable map[string]*sandbox
type controller struct {
id string
networks networkTable
drivers driverTable
sandboxes sandboxTable
cfg *config.Config
store datastore.DataStore
extKeyListener net.Listener
id string
networks networkTable
drivers driverTable
sandboxes sandboxTable
cfg *config.Config
globalStore, localStore datastore.DataStore
extKeyListener net.Listener
sync.Mutex
}
@ -139,7 +134,11 @@ type controller struct {
func New(cfgOptions ...config.Option) (NetworkController, error) {
var cfg *config.Config
if len(cfgOptions) > 0 {
cfg = &config.Config{}
cfg = &config.Config{
Daemon: config.DaemonCfg{
DriverCfg: make(map[string]interface{}),
},
}
cfg.ProcessOptions(cfgOptions...)
}
c := &controller{
@ -153,7 +152,7 @@ func New(cfgOptions ...config.Option) (NetworkController, error) {
}
if cfg != nil {
if err := c.initDataStore(); err != nil {
if err := c.initGlobalStore(); err != nil {
// Failing to initialize the datastore is a bad situation to be in.
// But it cannot fail creating the Controller
log.Debugf("Failed to Initialize Datastore due to %v. Operating in non-clustered mode", err)
@ -163,6 +162,9 @@ func New(cfgOptions ...config.Option) (NetworkController, error) {
// But it cannot fail creating the Controller
log.Debugf("Failed to Initialize Discovery : %v", err)
}
if err := c.initLocalStore(); err != nil {
log.Debugf("Failed to Initialize LocalDatastore due to %v.", err)
}
}
if err := c.startExternalKeyListener(); err != nil {
@ -207,16 +209,6 @@ func (c *controller) Config() config.Config {
return *c.cfg
}
func (c *controller) ConfigureNetworkDriver(networkType string, options map[string]interface{}) error {
c.Lock()
dd, ok := c.drivers[networkType]
c.Unlock()
if !ok {
return NetworkTypeError(networkType)
}
return dd.driver.Config(options)
}
func (c *controller) RegisterDriver(networkType string, driver driverapi.Driver, capability driverapi.Capability) error {
if !config.IsValidName(networkType) {
return ErrInvalidName(networkType)
@ -228,32 +220,8 @@ func (c *controller) RegisterDriver(networkType string, driver driverapi.Driver,
return driverapi.ErrActiveRegistration(networkType)
}
c.drivers[networkType] = &driverData{driver, capability}
if c.cfg == nil {
c.Unlock()
return nil
}
opt := make(map[string]interface{})
for _, label := range c.cfg.Daemon.Labels {
if strings.HasPrefix(label, netlabel.DriverPrefix+"."+networkType) {
opt[netlabel.Key(label)] = netlabel.Value(label)
}
}
if capability.Scope == driverapi.GlobalScope && c.validateDatastoreConfig() {
opt[netlabel.KVProvider] = c.cfg.Datastore.Client.Provider
opt[netlabel.KVProviderURL] = c.cfg.Datastore.Client.Address
}
c.Unlock()
if len(opt) != 0 {
if err := driver.Config(opt); err != nil {
return err
}
}
return nil
}
@ -280,6 +248,7 @@ func (c *controller) NewNetwork(networkType, name string, options ...NetworkOpti
id: stringid.GenerateRandomID(),
ctrlr: c,
endpoints: endpointTable{},
persist: true,
}
network.processOptions(options...)
@ -288,7 +257,7 @@ func (c *controller) NewNetwork(networkType, name string, options ...NetworkOpti
return nil, err
}
if err := c.updateNetworkToStore(network); err != nil {
if err := c.updateToStore(network); err != nil {
log.Warnf("couldn't create network %s: %v", network.name, err)
if e := network.Delete(); e != nil {
log.Warnf("couldn't cleanup network %s: %v", network.name, e)
@ -317,6 +286,7 @@ func (c *controller) addNetwork(n *network) error {
n.Lock()
n.svcRecords = svcMap{}
n.driver = dd.driver
n.dataScope = dd.capability.DataScope
d := n.driver
n.Unlock()
@ -324,8 +294,10 @@ func (c *controller) addNetwork(n *network) error {
if err := d.CreateNetwork(n.id, n.generic); err != nil {
return err
}
if err := n.watchEndpoints(); err != nil {
return err
if n.isGlobalScoped() {
if err := n.watchEndpoints(); err != nil {
return err
}
}
c.Lock()
c.networks[n.id] = n
@ -515,20 +487,10 @@ func (c *controller) loadDriver(networkType string) (*driverData, error) {
return dd, nil
}
func (c *controller) isDriverGlobalScoped(networkType string) (bool, error) {
c.Lock()
dd, ok := c.drivers[networkType]
c.Unlock()
if !ok {
return false, types.NotFoundErrorf("driver not found for %s", networkType)
}
if dd.capability.Scope == driverapi.GlobalScope {
return true, nil
}
return false, nil
}
func (c *controller) Stop() {
if c.localStore != nil {
c.localStore.KVStore().Close()
}
c.stopExternalKeyListener()
osl.GC()
}

View file

@ -6,6 +6,7 @@ import (
"github.com/docker/libkv"
"github.com/docker/libkv/store"
"github.com/docker/libkv/store/boltdb"
"github.com/docker/libkv/store/consul"
"github.com/docker/libkv/store/etcd"
"github.com/docker/libkv/store/zookeeper"
@ -58,8 +59,22 @@ type KV interface {
// True if the object exists in the datastore, false if it hasn't been stored yet.
// When SetIndex() is called, the object has been stored.
Exists() bool
// DataScope indicates the storage scope of the KV object
DataScope() DataScope
// Skip provides a way for a KV Object to avoid persisting it in the KV Store
Skip() bool
}
// DataScope indicates the storage scope
type DataScope int
const (
// LocalScope indicates to store the KV object in local datastore such as boltdb
LocalScope DataScope = iota
// GlobalScope indicates to store the KV object in global datastore such as consul/etcd/zookeeper
GlobalScope
)
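// Illustrative sketch (hypothetical helper, not part of this API): a KV
// object's DataScope selects the backend it is persisted in, per the
// constants above.
func backendForScope(scope DataScope) string {
	if scope == LocalScope {
		return "boltdb" // embedded local datastore
	}
	return "consul" // or etcd/zookeeper, whichever global store is configured
}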
const (
// NetworkKeyPrefix is the prefix for network key in the kv store
NetworkKeyPrefix = "network"
@ -73,6 +88,7 @@ func init() {
consul.Register()
zookeeper.Register()
etcd.Register()
boltdb.Register()
}
// Key provides a convenient method to create a Key
@ -94,8 +110,11 @@ func ParseKey(key string) ([]string, error) {
}
// newClient is used to connect to the KV store
func newClient(kv string, addrs string) (DataStore, error) {
store, err := libkv.NewStore(store.Backend(kv), []string{addrs}, &store.Config{})
func newClient(kv string, addrs string, config *store.Config) (DataStore, error) {
if config == nil {
config = &store.Config{}
}
store, err := libkv.NewStore(store.Backend(kv), []string{addrs}, config)
if err != nil {
return nil, err
}
@ -109,7 +128,7 @@ func NewDataStore(cfg *config.DatastoreCfg) (DataStore, error) {
return nil, types.BadRequestErrorf("invalid configuration passed to datastore")
}
// TODO : cfg.Embedded case
return newClient(cfg.Client.Provider, cfg.Client.Address)
return newClient(cfg.Client.Provider, cfg.Client.Address, cfg.Client.Config)
}
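// Usage sketch (the file path is an assumption; the bucket name and timeout
// mirror the local-store defaults introduced by this change): building the
// embedded boltdb-backed store through NewDataStore.
//
//	cfg := &config.DatastoreCfg{
//		Client: config.DatastoreClientCfg{
//			Provider: "boltdb",
//			Address:  "/var/lib/docker/network/files/boltdb.db", // hypothetical path
//			Config: &store.Config{
//				Bucket:            "libnetwork",
//				ConnectionTimeout: 3 * time.Second,
//			},
//		},
//	}
//	ds, err := NewDataStore(cfg)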
// NewCustomDataStore can be used by clients to plug in a custom datastore that adheres to store.Store

View file

@ -0,0 +1,159 @@
package libnetwork
import (
"fmt"
"github.com/docker/libnetwork/netlabel"
"github.com/docker/libnetwork/options"
"github.com/docker/libnetwork/types"
)
const (
libnGWNetwork = "docker_gwbridge"
gwEPlen = 12
)
/*
libnetwork creates a bridge network "docker_gwbridge" for providing a
default gateway to containers when none of the container's endpoints
has a GW set by the driver. ICC is set to false on the GW_bridge network.
If a driver can't provide external connectivity, it can choose not to set
the GW IP for the endpoint.
The endpoint on the GW_bridge network is managed dynamically by libnetwork,
i.e.:
- it is created when an endpoint without a GW joins the container
- it is deleted when an endpoint with a GW joins the container
*/
func (sb *sandbox) setupDefaultGW(srcEp *endpoint) error {
var createOptions []EndpointOption
c := srcEp.getNetwork().getController()
// check if the container already has a GW endpoint
if ep := sb.getEndpointInGWNetwork(); ep != nil {
return nil
}
n, err := c.NetworkByName(libnGWNetwork)
if err != nil {
if _, ok := err.(types.NotFoundError); !ok {
return err
}
n, err = c.createGWNetwork()
if err != nil {
return err
}
}
if opt, ok := srcEp.generic[netlabel.PortMap]; ok {
if pb, ok := opt.([]types.PortBinding); ok {
createOptions = append(createOptions, CreateOptionPortMapping(pb))
}
}
if opt, ok := srcEp.generic[netlabel.ExposedPorts]; ok {
if exp, ok := opt.([]types.TransportPort); ok {
createOptions = append(createOptions, CreateOptionExposedPorts(exp))
}
}
eplen := gwEPlen
if len(sb.containerID) < gwEPlen {
eplen = len(sb.containerID)
}
newEp, err := n.CreateEndpoint("gateway_"+sb.containerID[0:eplen], createOptions...)
if err != nil {
return fmt.Errorf("container %s: endpoint create on GW Network failed: %v", sb.containerID, err)
}
epLocal := newEp.(*endpoint)
if err := epLocal.sbJoin(sb); err != nil {
return fmt.Errorf("container %s: endpoint join on GW Network failed: %v", sb.containerID, err)
}
return nil
}
func (sb *sandbox) clearDefaultGW() error {
var ep *endpoint
if ep = sb.getEndpointInGWNetwork(); ep == nil {
return nil
}
if err := ep.sbLeave(sb); err != nil {
return fmt.Errorf("container %s: endpoint leaving GW Network failed: %v", sb.containerID, err)
}
if err := ep.Delete(); err != nil {
return fmt.Errorf("container %s: deleting endpoint on GW Network failed: %v", sb.containerID, err)
}
return nil
}
func (c *controller) createGWNetwork() (Network, error) {
netOption := options.Generic{
"BridgeName": libnGWNetwork,
"EnableICC": false,
"EnableIPMasquerade": true,
}
n, err := c.NewNetwork("bridge", libnGWNetwork,
NetworkOptionGeneric(options.Generic{
netlabel.GenericData: netOption,
netlabel.EnableIPv6: false,
}))
if err != nil {
return nil, fmt.Errorf("error creating external connectivity network: %v", err)
}
return n, err
}
func (sb *sandbox) needDefaultGW() bool {
var needGW bool
for _, ep := range sb.getConnectedEndpoints() {
if ep.endpointInGWNetwork() {
continue
}
if ep.getNetwork().Type() == "null" || ep.getNetwork().Type() == "host" {
continue
}
// TODO v6 needs to be handled.
if len(ep.Gateway()) > 0 {
return false
}
needGW = true
}
return needGW
}
func (sb *sandbox) getEndpointInGWNetwork() *endpoint {
for _, ep := range sb.getConnectedEndpoints() {
if ep.getNetwork().name == libnGWNetwork {
return ep
}
}
return nil
}
func (ep *endpoint) endpointInGWNetwork() bool {
return ep.getNetwork().name == libnGWNetwork
}
func (sb *sandbox) getEPwithoutGateway() *endpoint {
for _, ep := range sb.getConnectedEndpoints() {
if ep.getNetwork().Type() == "null" || ep.getNetwork().Type() == "host" {
continue
}
if len(ep.Gateway()) == 0 {
return ep
}
}
return nil
}
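// Flow summary (as wired into endpoint join/leave elsewhere in this change):
// after an endpoint joins or leaves a sandbox, needDefaultGW() checks whether
// the sandbox is still left without external connectivity; if so,
// setupDefaultGW attaches a docker_gwbridge endpoint, otherwise clearDefaultGW
// detaches and deletes it.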

View file

@ -1,15 +1,16 @@
package driverapi
import "net"
import (
"net"
"github.com/docker/libnetwork/datastore"
)
// NetworkPluginEndpointType represents the Endpoint Type used by Plugin system
const NetworkPluginEndpointType = "NetworkDriver"
// Driver is an interface that every plugin driver needs to implement.
type Driver interface {
// Push driver specific config to the driver
Config(options map[string]interface{}) error
// CreateNetwork invokes the driver method to create a network passing
// the network id and network specific config. The config mechanism will
// eventually be replaced with labels which are yet to be introduced.
@ -101,17 +102,7 @@ type DriverCallback interface {
RegisterDriver(name string, driver Driver, capability Capability) error
}
// Scope indicates the drivers scope capability
type Scope int
const (
// LocalScope represents the driver capable of providing networking services for containers in a single host
LocalScope Scope = iota
// GlobalScope represents the driver capable of providing networking services for containers across hosts
GlobalScope
)
// Capability represents the high level capabilities of the drivers which libnetwork can make use of
type Capability struct {
Scope Scope
DataScope datastore.DataScope
}

View file

@ -0,0 +1,55 @@
package libnetwork
import (
"strings"
"github.com/docker/libnetwork/driverapi"
"github.com/docker/libnetwork/netlabel"
)
type initializer struct {
fn func(driverapi.DriverCallback, map[string]interface{}) error
ntype string
}
func initDrivers(c *controller) error {
for _, i := range getInitializers() {
if err := i.fn(c, makeDriverConfig(c, i.ntype)); err != nil {
return err
}
}
return nil
}
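// Illustrative summary (hypothetical driver body elided): with this change a
// built-in driver's Init receives its config map up front and registers a
// DataScope capability, replacing the removed RegisterDriver-then-Config flow:
//
//	func Init(dc driverapi.DriverCallback, config map[string]interface{}) error {
//		c := driverapi.Capability{DataScope: datastore.LocalScope}
//		return dc.RegisterDriver(networkType, &driver{}, c)
//	}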
func makeDriverConfig(c *controller, ntype string) map[string]interface{} {
if c.cfg == nil {
return nil
}
config := make(map[string]interface{})
if c.validateGlobalStoreConfig() {
config[netlabel.KVProvider] = c.cfg.GlobalStore.Client.Provider
config[netlabel.KVProviderURL] = c.cfg.GlobalStore.Client.Address
}
for _, label := range c.cfg.Daemon.Labels {
if !strings.HasPrefix(netlabel.Key(label), netlabel.DriverPrefix+"."+ntype) {
continue
}
config[netlabel.Key(label)] = netlabel.Value(label)
}
drvCfg, ok := c.cfg.Daemon.DriverCfg[ntype]
if !ok {
return config
}
for k, v := range drvCfg.(map[string]interface{}) {
config[k] = v
}
return config
}
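// Example (assumed label syntax): a daemon label scoped to a single driver,
// such as
//
//	com.docker.network.driver.overlay.bind_interface=eth0
//
// passes the DriverPrefix+"."+ntype check above only for ntype "overlay", so
// the key/value lands in that driver's config map and reaches its Init.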

View file

@ -14,6 +14,7 @@ import (
"syscall"
"github.com/Sirupsen/logrus"
"github.com/docker/libnetwork/datastore"
"github.com/docker/libnetwork/driverapi"
"github.com/docker/libnetwork/ipallocator"
"github.com/docker/libnetwork/iptables"
@ -47,18 +48,18 @@ type configuration struct {
// networkConfiguration for network specific configuration
type networkConfiguration struct {
BridgeName string
AddressIPv4 *net.IPNet
FixedCIDR *net.IPNet
FixedCIDRv6 *net.IPNet
EnableIPv6 bool
EnableIPMasquerade bool
EnableICC bool
Mtu int
DefaultGatewayIPv4 net.IP
DefaultGatewayIPv6 net.IP
DefaultBindingIP net.IP
AllowNonDefaultBridge bool
BridgeName string
AddressIPv4 *net.IPNet
FixedCIDR *net.IPNet
FixedCIDRv6 *net.IPNet
EnableIPv6 bool
EnableIPMasquerade bool
EnableICC bool
Mtu int
DefaultGatewayIPv4 net.IP
DefaultGatewayIPv6 net.IP
DefaultBindingIP net.IP
DefaultBridge bool
}
// endpointConfiguration represents the user specified configuration for the sandbox endpoint
@ -97,7 +98,6 @@ type bridgeNetwork struct {
type driver struct {
config *configuration
configured bool
network *bridgeNetwork
natChain *iptables.ChainInfo
filterChain *iptables.ChainInfo
@ -106,13 +106,13 @@ type driver struct {
}
// New constructs a new bridge driver
func newDriver() driverapi.Driver {
func newDriver() *driver {
ipAllocator = ipallocator.New()
return &driver{networks: map[string]*bridgeNetwork{}, config: &configuration{}}
}
// Init registers a new instance of bridge driver
func Init(dc driverapi.DriverCallback) error {
func Init(dc driverapi.DriverCallback, config map[string]interface{}) error {
if _, err := os.Stat("/proc/sys/net/bridge"); err != nil {
if out, err := exec.Command("modprobe", "-va", "bridge", "br_netfilter").CombinedOutput(); err != nil {
logrus.Warnf("Running modprobe bridge br_netfilter failed with message: %s, error: %v", out, err)
@ -128,10 +128,15 @@ func Init(dc driverapi.DriverCallback) error {
logrus.Warnf("Failed to remove existing iptables entries in %s : %v", DockerChain, err)
}
c := driverapi.Capability{
Scope: driverapi.LocalScope,
d := newDriver()
if err := d.configure(config); err != nil {
return err
}
return dc.RegisterDriver(networkType, newDriver(), c)
c := driverapi.Capability{
DataScope: datastore.LocalScope,
}
return dc.RegisterDriver(networkType, d, c)
}
// Validate performs a static validation on the network configuration parameters.
@ -244,13 +249,13 @@ func (c *networkConfiguration) fromMap(data map[string]interface{}) error {
}
}
if i, ok := data["AllowNonDefaultBridge"]; ok && i != nil {
if i, ok := data["DefaultBridge"]; ok && i != nil {
if s, ok := i.(string); ok {
if c.AllowNonDefaultBridge, err = strconv.ParseBool(s); err != nil {
return types.BadRequestErrorf("failed to parse AllowNonDefaultBridge value: %s", err.Error())
if c.DefaultBridge, err = strconv.ParseBool(s); err != nil {
return types.BadRequestErrorf("failed to parse DefaultBridge value: %s", err.Error())
}
} else {
return types.BadRequestErrorf("invalid type for AllowNonDefaultBridge value")
return types.BadRequestErrorf("invalid type for DefaultBridge value")
}
}
@ -426,17 +431,13 @@ func (c *networkConfiguration) conflictsWithNetworks(id string, others []*bridge
return nil
}
func (d *driver) Config(option map[string]interface{}) error {
func (d *driver) configure(option map[string]interface{}) error {
var config *configuration
var err error
d.Lock()
defer d.Unlock()
if d.configured {
return &ErrConfigExists{}
}
genericData, ok := option[netlabel.GenericData]
if !ok || genericData == nil {
return nil
@ -469,7 +470,6 @@ func (d *driver) Config(option map[string]interface{}) error {
}
}
d.configured = true
d.config = config
return nil
}
@ -516,7 +516,7 @@ func parseNetworkGenericOptions(data interface{}) (*networkConfiguration, error)
return config, err
}
func parseNetworkOptions(option options.Generic) (*networkConfiguration, error) {
func parseNetworkOptions(id string, option options.Generic) (*networkConfiguration, error) {
var err error
config := &networkConfiguration{}
@ -537,6 +537,9 @@ func parseNetworkOptions(option options.Generic) (*networkConfiguration, error)
return nil, err
}
if config.BridgeName == "" && config.DefaultBridge == false {
config.BridgeName = "br-" + id[:12]
}
return config, nil
}
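// Example: for a network id starting with "b1c2d3e4f5a6...", the generated
// name above is "br-b1c2d3e4f5a6", i.e. "br-" plus the first 12 characters
// of the network id.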
@ -567,20 +570,12 @@ func (d *driver) getNetworks() []*bridgeNetwork {
// Create a new network using bridge plugin
func (d *driver) CreateNetwork(id string, option map[string]interface{}) error {
var (
err error
configLocked bool
)
var err error
defer osl.InitOSContext()()
// Sanity checks
d.Lock()
if !d.configured {
configLocked = true
d.configured = true
}
if _, ok := d.networks[id]; ok {
d.Unlock()
return types.ForbiddenErrorf("network %s exists", id)
@ -588,7 +583,7 @@ func (d *driver) CreateNetwork(id string, option map[string]interface{}) error {
d.Unlock()
// Parse and validate the config. It should not conflict with existing networks' config
config, err := parseNetworkOptions(option)
config, err := parseNetworkOptions(id, option)
if err != nil {
return err
}
@ -619,10 +614,6 @@ func (d *driver) CreateNetwork(id string, option map[string]interface{}) error {
defer func() {
if err != nil {
d.Lock()
if configLocked {
d.configured = false
}
delete(d.networks, id)
d.Unlock()
}

View file

@ -211,6 +211,17 @@ func (ndbee NonDefaultBridgeExistError) Error() string {
// Forbidden denotes the type of this error
func (ndbee NonDefaultBridgeExistError) Forbidden() {}
// NonDefaultBridgeNeedsIPError is returned when a non-default
// bridge config is passed but it has no IP address configured
type NonDefaultBridgeNeedsIPError string
func (ndbee NonDefaultBridgeNeedsIPError) Error() string {
return fmt.Sprintf("bridge device with non-default name %s must have a valid IP address", string(ndbee))
}
// Forbidden denotes the type of this error
func (ndbee NonDefaultBridgeNeedsIPError) Forbidden() {}
// FixedCIDRv4Error is returned when fixed-cidrv4 configuration
// failed.
type FixedCIDRv4Error struct {

View file

@ -15,7 +15,7 @@ func setupDevice(config *networkConfiguration, i *bridgeInterface) error {
// We only attempt to create the bridge when the requested device name is
// the default one.
if config.BridgeName != DefaultBridgeName && !config.AllowNonDefaultBridge {
if config.BridgeName != DefaultBridgeName && config.DefaultBridge {
return NonDefaultBridgeExistError(config.BridgeName)
}

View file

@ -53,8 +53,8 @@ func setupBridgeIPv4(config *networkConfiguration, i *bridgeInterface) error {
// Do not try to configure IPv4 on a non-default bridge unless you are
// specifically asked to do so.
if config.BridgeName != DefaultBridgeName && !config.AllowNonDefaultBridge {
return NonDefaultBridgeExistError(config.BridgeName)
if config.BridgeName != DefaultBridgeName && config.DefaultBridge {
return NonDefaultBridgeNeedsIPError(config.BridgeName)
}
bridgeIPv4, err := electBridgeIPv4(config)

View file

@ -3,6 +3,7 @@ package host
import (
"sync"
"github.com/docker/libnetwork/datastore"
"github.com/docker/libnetwork/driverapi"
"github.com/docker/libnetwork/types"
)
@ -15,17 +16,13 @@ type driver struct {
}
// Init registers a new instance of host driver
func Init(dc driverapi.DriverCallback) error {
func Init(dc driverapi.DriverCallback, config map[string]interface{}) error {
c := driverapi.Capability{
Scope: driverapi.LocalScope,
DataScope: datastore.LocalScope,
}
return dc.RegisterDriver(networkType, &driver{}, c)
}
func (d *driver) Config(option map[string]interface{}) error {
return nil
}
func (d *driver) CreateNetwork(id string, option map[string]interface{}) error {
d.Lock()
defer d.Unlock()

View file

@ -3,6 +3,7 @@ package null
import (
"sync"
"github.com/docker/libnetwork/datastore"
"github.com/docker/libnetwork/driverapi"
"github.com/docker/libnetwork/types"
)
@ -15,17 +16,13 @@ type driver struct {
}
// Init registers a new instance of null driver
func Init(dc driverapi.DriverCallback) error {
func Init(dc driverapi.DriverCallback, config map[string]interface{}) error {
c := driverapi.Capability{
Scope: driverapi.LocalScope,
DataScope: datastore.LocalScope,
}
return dc.RegisterDriver(networkType, &driver{}, c)
}
func (d *driver) Config(option map[string]interface{}) error {
return nil
}
func (d *driver) CreateNetwork(id string, option map[string]interface{}) error {
d.Lock()
defer d.Unlock()

View file

@ -72,11 +72,6 @@ func (d *driver) Join(nid, eid string, sboxKey string, jinfo driverapi.JoinInfo,
}
}
err = jinfo.SetGateway(bridgeIP.IP)
if err != nil {
return err
}
d.peerDbAdd(nid, eid, ep.addr.IP, ep.mac,
d.serfInstance.LocalMember().Addr, true)
d.notifyCh <- ovNotify{

View file

@ -40,6 +40,10 @@ func (d *driver) CreateNetwork(id string, option map[string]interface{}) error {
return fmt.Errorf("invalid network id")
}
if err := d.configure(); err != nil {
return err
}
n := &network{
id: id,
driver: d,
@ -297,6 +301,10 @@ func (n *network) Exists() bool {
return n.dbExists
}
func (n *network) Skip() bool {
return false
}
func (n *network) SetValue(value []byte) error {
var vni uint32
err := json.Unmarshal(value, &vni)
@ -306,6 +314,10 @@ func (n *network) SetValue(value []byte) error {
return err
}
func (n *network) DataScope() datastore.DataScope {
return datastore.GlobalScope
}
func (n *network) writeToStore() error {
return n.driver.store.PutObjectAtomic(n)
}

View file

@ -6,6 +6,7 @@ import (
"net"
"sync"
"github.com/docker/libkv/store"
"github.com/docker/libnetwork/config"
"github.com/docker/libnetwork/datastore"
"github.com/docker/libnetwork/driverapi"
@ -30,6 +31,7 @@ type driver struct {
exitCh chan chan struct{}
ifaceName string
neighIP string
config map[string]interface{}
peerDb peerNetworkMap
serfInstance *serf.Serf
networks networkTable
@ -67,19 +69,22 @@ func onceInit() {
}
// Init registers a new instance of overlay driver
func Init(dc driverapi.DriverCallback) error {
func Init(dc driverapi.DriverCallback, config map[string]interface{}) error {
once.Do(onceInit)
c := driverapi.Capability{
Scope: driverapi.GlobalScope,
DataScope: datastore.GlobalScope,
}
return dc.RegisterDriver(networkType, &driver{
d := &driver{
networks: networkTable{},
peerDb: peerNetworkMap{
mp: map[string]peerMap{},
},
}, c)
config: config,
}
return dc.RegisterDriver(networkType, d, c)
}
// Fini cleans up the driver resources
@ -95,23 +100,27 @@ func Fini(drv driverapi.Driver) {
}
}
func (d *driver) Config(option map[string]interface{}) error {
func (d *driver) configure() error {
var onceDone bool
var err error
if len(d.config) == 0 {
return nil
}
d.Do(func() {
onceDone = true
if ifaceName, ok := option[netlabel.OverlayBindInterface]; ok {
if ifaceName, ok := d.config[netlabel.OverlayBindInterface]; ok {
d.ifaceName = ifaceName.(string)
}
if neighIP, ok := option[netlabel.OverlayNeighborIP]; ok {
if neighIP, ok := d.config[netlabel.OverlayNeighborIP]; ok {
d.neighIP = neighIP.(string)
}
provider, provOk := option[netlabel.KVProvider]
provURL, urlOk := option[netlabel.KVProviderURL]
provider, provOk := d.config[netlabel.KVProvider]
provURL, urlOk := d.config[netlabel.KVProviderURL]
if provOk && urlOk {
cfg := &config.DatastoreCfg{
@ -120,6 +129,10 @@ func (d *driver) Config(option map[string]interface{}) error {
Address: provURL.(string),
},
}
provConfig, confOk := d.config[netlabel.KVProviderConfig]
if confOk {
cfg.Client.Config = provConfig.(*store.Config)
}
d.store, err = datastore.NewDataStore(cfg)
if err != nil {
err = fmt.Errorf("failed to initialize data store: %v", err)
@ -146,10 +159,6 @@ func (d *driver) Config(option map[string]interface{}) error {
})
if !onceDone {
return fmt.Errorf("config already applied to driver")
}
return err
}

View file

@ -6,6 +6,7 @@ import (
log "github.com/Sirupsen/logrus"
"github.com/docker/docker/pkg/plugins"
"github.com/docker/libnetwork/datastore"
"github.com/docker/libnetwork/driverapi"
"github.com/docker/libnetwork/drivers/remote/api"
"github.com/docker/libnetwork/types"
@ -26,7 +27,7 @@ func newDriver(name string, client *plugins.Client) driverapi.Driver {
// Init makes sure a remote driver is registered when a network driver
// plugin is activated.
func Init(dc driverapi.DriverCallback) error {
func Init(dc driverapi.DriverCallback, config map[string]interface{}) error {
plugins.Handle(driverapi.NetworkPluginEndpointType, func(name string, client *plugins.Client) {
// negotiate driver capability with client
d := newDriver(name, client)
@ -52,9 +53,9 @@ func (d *driver) getCapabilities() (*driverapi.Capability, error) {
c := &driverapi.Capability{}
switch capResp.Scope {
case "global":
c.Scope = driverapi.GlobalScope
c.DataScope = datastore.GlobalScope
case "local":
c.Scope = driverapi.LocalScope
c.DataScope = datastore.LocalScope
default:
return nil, fmt.Errorf("invalid capability: expecting 'local' or 'global', got %s", capResp.Scope)
}

View file

@ -1,6 +1,9 @@
package windows
import "github.com/docker/libnetwork/driverapi"
import (
"github.com/docker/libnetwork/datastore"
"github.com/docker/libnetwork/driverapi"
)
const networkType = "windows"
@ -9,17 +12,13 @@ const networkType = "windows"
type driver struct{}
// Init registers a new instance of the windows driver
func Init(dc driverapi.DriverCallback) error {
func Init(dc driverapi.DriverCallback, config map[string]interface{}) error {
c := driverapi.Capability{
Scope: driverapi.LocalScope,
DataScope: datastore.LocalScope,
}
return dc.RegisterDriver(networkType, &driver{}, c)
}
func (d *driver) Config(option map[string]interface{}) error {
return nil
}
func (d *driver) CreateNetwork(id string, option map[string]interface{}) error {
return nil
}

View file

@ -1,19 +1,13 @@
package libnetwork
import (
"github.com/docker/libnetwork/driverapi"
"github.com/docker/libnetwork/drivers/null"
"github.com/docker/libnetwork/drivers/remote"
)
func initDrivers(dc driverapi.DriverCallback) error {
for _, fn := range [](func(driverapi.DriverCallback) error){
null.Init,
remote.Init,
} {
if err := fn(dc); err != nil {
return err
}
func getInitializers() []initializer {
return []initializer{
{null.Init, "null"},
{remote.Init, "remote"},
}
return nil
}

View file

@ -1,25 +1,19 @@
package libnetwork
import (
"github.com/docker/libnetwork/driverapi"
"github.com/docker/libnetwork/drivers/bridge"
"github.com/docker/libnetwork/drivers/host"
"github.com/docker/libnetwork/drivers/null"
o "github.com/docker/libnetwork/drivers/overlay"
"github.com/docker/libnetwork/drivers/overlay"
"github.com/docker/libnetwork/drivers/remote"
)
func initDrivers(dc driverapi.DriverCallback) error {
for _, fn := range [](func(driverapi.DriverCallback) error){
bridge.Init,
host.Init,
null.Init,
remote.Init,
o.Init,
} {
if err := fn(dc); err != nil {
return err
}
func getInitializers() []initializer {
return []initializer{
{bridge.Init, "bridge"},
{host.Init, "host"},
{null.Init, "null"},
{remote.Init, "remote"},
{overlay.Init, "overlay"},
}
return nil
}

View file

@ -1,17 +1,9 @@
package libnetwork
import (
"github.com/docker/libnetwork/driverapi"
"github.com/docker/libnetwork/drivers/windows"
)
import "github.com/docker/libnetwork/drivers/windows"
func initDrivers(dc driverapi.DriverCallback) error {
for _, fn := range [](func(driverapi.DriverCallback) error){
windows.Init,
} {
if err := fn(dc); err != nil {
return err
}
func getInitializers() []initializer {
return []initializer{
{windows.Init, "windows"},
}
return nil
}

View file

@ -5,6 +5,7 @@ import (
"encoding/json"
"fmt"
"net"
"strings"
"sync"
log "github.com/Sirupsen/logrus"
@ -130,15 +131,15 @@ func (ep *endpoint) KeyPrefix() []string {
return []string{datastore.EndpointKeyPrefix, ep.getNetwork().id}
}
func (ep *endpoint) networkIDFromKey(key []string) (string, error) {
// endpoint Key structure : endpoint/network-id/endpoint-id
// it's an invalid key if the key doesn't have all the 3 key elements above
if key == nil || len(key) < 3 || key[0] != datastore.EndpointKeyPrefix {
func (ep *endpoint) networkIDFromKey(key string) (string, error) {
// endpoint Key structure : docker/libnetwork/endpoint/${network-id}/${endpoint-id}
// it's an invalid key if the key doesn't have all the 5 key elements above
keyElements := strings.Split(key, "/")
if !strings.HasPrefix(key, datastore.Key(datastore.EndpointKeyPrefix)) || len(keyElements) < 5 {
return "", fmt.Errorf("invalid endpoint key : %v", key)
}
// network-id is placed at index=1. pls refer to endpoint.Key() method
return key[1], nil
// network-id is placed at index=3. please refer to the endpoint.Key() method
return keyElements[3], nil
}
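// Example: the key "docker/libnetwork/endpoint/<network-id>/<endpoint-id>"
// splits on "/" into five elements, with the network id at index 3 as the
// comment above documents.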
func (ep *endpoint) Value() []byte {
@ -172,6 +173,10 @@ func (ep *endpoint) Exists() bool {
return ep.dbExists
}
func (ep *endpoint) Skip() bool {
return ep.getNetwork().Skip()
}
func (ep *endpoint) processOptions(options ...EndpointOption) {
ep.Lock()
defer ep.Unlock()
@ -183,40 +188,7 @@ func (ep *endpoint) processOptions(options ...EndpointOption) {
}
}
// joinLeaveStart waits to ensure there are no joins or leaves in progress and
// marks this join/leave in progress without race
func (ep *endpoint) joinLeaveStart() {
ep.Lock()
defer ep.Unlock()
for ep.joinLeaveDone != nil {
joinLeaveDone := ep.joinLeaveDone
ep.Unlock()
select {
case <-joinLeaveDone:
}
ep.Lock()
}
ep.joinLeaveDone = make(chan struct{})
}
// joinLeaveEnd marks the end of this join/leave operation and
// signals the same without race to other join and leave waiters
func (ep *endpoint) joinLeaveEnd() {
ep.Lock()
defer ep.Unlock()
if ep.joinLeaveDone != nil {
close(ep.joinLeaveDone)
ep.joinLeaveDone = nil
}
}
func (ep *endpoint) Join(sbox Sandbox, options ...EndpointOption) error {
var err error
if sbox == nil {
return types.BadRequestErrorf("endpoint cannot be joined by nil container")
@ -227,8 +199,18 @@ func (ep *endpoint) Join(sbox Sandbox, options ...EndpointOption) error {
return types.BadRequestErrorf("not a valid Sandbox interface")
}
ep.joinLeaveStart()
defer ep.joinLeaveEnd()
sb.joinLeaveStart()
defer sb.joinLeaveEnd()
return ep.sbJoin(sbox, options...)
}
func (ep *endpoint) sbJoin(sbox Sandbox, options ...EndpointOption) error {
var err error
sb, ok := sbox.(*sandbox)
if !ok {
return types.BadRequestErrorf("not a valid Sandbox interface")
}
ep.Lock()
if ep.sandboxID != "" {
@ -281,8 +263,10 @@ func (ep *endpoint) Join(sbox Sandbox, options ...EndpointOption) error {
return err
}
if err = network.ctrlr.updateEndpointToStore(ep); err != nil {
return err
if !ep.isLocalScoped() {
if err = network.ctrlr.updateToStore(ep); err != nil {
return err
}
}
sb.Lock()
@ -304,7 +288,11 @@ func (ep *endpoint) Join(sbox Sandbox, options ...EndpointOption) error {
if err = sb.populateNetworkResources(ep); err != nil {
return err
}
return nil
if sb.needDefaultGW() {
return sb.setupDefaultGW(ep)
}
return sb.clearDefaultGW()
}
func (ep *endpoint) hasInterface(iName string) bool {
@ -315,9 +303,6 @@ func (ep *endpoint) hasInterface(iName string) bool {
}
func (ep *endpoint) Leave(sbox Sandbox, options ...EndpointOption) error {
ep.joinLeaveStart()
defer ep.joinLeaveEnd()
if sbox == nil || sbox.ID() == "" || sbox.Key() == "" {
return types.BadRequestErrorf("invalid Sandbox passed to enpoint leave: %v", sbox)
}
@ -327,6 +312,18 @@ func (ep *endpoint) Leave(sbox Sandbox, options ...EndpointOption) error {
return types.BadRequestErrorf("not a valid Sandbox interface")
}
sb.joinLeaveStart()
defer sb.joinLeaveEnd()
return ep.sbLeave(sbox, options...)
}
func (ep *endpoint) sbLeave(sbox Sandbox, options ...EndpointOption) error {
sb, ok := sbox.(*sandbox)
if !ok {
return types.BadRequestErrorf("not a valid Sandbox interface")
}
ep.Lock()
sid := ep.sandboxID
ep.Unlock()
@ -350,18 +347,31 @@ func (ep *endpoint) Leave(sbox Sandbox, options ...EndpointOption) error {
d := n.driver
n.Unlock()
if err := c.updateEndpointToStore(ep); err != nil {
ep.Lock()
ep.sandboxID = sid
ep.Unlock()
return err
if !ep.isLocalScoped() {
if err := c.updateToStore(ep); err != nil {
ep.Lock()
ep.sandboxID = sid
ep.Unlock()
return err
}
}
if err := d.Leave(n.id, ep.id); err != nil {
return err
}
return sb.clearNetworkResources(ep)
if err := sb.clearNetworkResources(ep); err != nil {
return err
}
if sb.needDefaultGW() {
ep := sb.getEPwithoutGateway()
if ep == nil {
return fmt.Errorf("endpoint without GW expected, but not found")
}
return sb.setupDefaultGW(ep)
}
return sb.clearDefaultGW()
}
func (ep *endpoint) Delete() error {
@ -379,27 +389,31 @@ func (ep *endpoint) Delete() error {
n.Unlock()
ep.Unlock()
if err = ctrlr.deleteEndpointFromStore(ep); err != nil {
return err
if !ep.isLocalScoped() {
if err = ctrlr.deleteFromStore(ep); err != nil {
return err
}
}
defer func() {
if err != nil {
ep.SetIndex(0)
if e := ctrlr.updateEndpointToStore(ep); e != nil {
log.Warnf("failed to recreate endpoint in store %s : %v", name, err)
ep.dbExists = false
if !ep.isLocalScoped() {
if e := ctrlr.updateToStore(ep); e != nil {
log.Warnf("failed to recreate endpoint in store %s : %v", name, e)
}
}
}
}()
// Update the endpoint count in network and update it in the datastore
n.DecEndpointCnt()
if err = ctrlr.updateNetworkToStore(n); err != nil {
if err = ctrlr.updateToStore(n); err != nil {
return err
}
defer func() {
if err != nil {
n.IncEndpointCnt()
if e := ctrlr.updateNetworkToStore(n); e != nil {
if e := ctrlr.updateToStore(n); e != nil {
log.Warnf("failed to update network %s : %v", n.name, e)
}
}
@ -525,3 +539,13 @@ func JoinOptionPriority(ep Endpoint, prio int) EndpointOption {
sb.epPriority[ep.id] = prio
}
}
func (ep *endpoint) DataScope() datastore.DataScope {
ep.Lock()
defer ep.Unlock()
return ep.network.dataScope
}
func (ep *endpoint) isLocalScoped() bool {
return ep.DataScope() == datastore.LocalScope
}

View file

@ -30,6 +30,9 @@ const (
// KVProviderURL constant represents the KV provider URL
KVProviderURL = DriverPrefix + ".kv_provider_url"
// KVProviderConfig constant represents the KV provider Config
KVProviderConfig = DriverPrefix + ".kv_provider_config"
// OverlayBindInterface constant represents overlay driver bind interface
OverlayBindInterface = DriverPrefix + ".overlay.bind_interface"

View file

@ -68,7 +68,9 @@ type network struct {
dbIndex uint64
svcRecords svcMap
dbExists bool
persist bool
stopWatchCh chan struct{}
dataScope datastore.DataScope
sync.Mutex
}
@ -140,6 +142,18 @@ func (n *network) Exists() bool {
return n.dbExists
}
func (n *network) Skip() bool {
n.Lock()
defer n.Unlock()
return !n.persist
}
func (n *network) DataScope() datastore.DataScope {
n.Lock()
defer n.Unlock()
return n.dataScope
}
func (n *network) EndpointCnt() uint64 {
n.Lock()
defer n.Unlock()
@ -167,6 +181,7 @@ func (n *network) MarshalJSON() ([]byte, error) {
netMap["endpointCnt"] = n.endpointCnt
netMap["enableIPv6"] = n.enableIPv6
netMap["generic"] = n.generic
netMap["persist"] = n.persist
return json.Marshal(netMap)
}
@ -184,6 +199,9 @@ func (n *network) UnmarshalJSON(b []byte) (err error) {
if netMap["generic"] != nil {
n.generic = netMap["generic"].(map[string]interface{})
}
if netMap["persist"] != nil {
n.persist = netMap["persist"].(bool)
}
return nil
}
@ -203,6 +221,13 @@ func NetworkOptionGeneric(generic map[string]interface{}) NetworkOption {
}
}
// NetworkOptionPersist returns an option setter to set persistence policy for a network
func NetworkOptionPersist(persist bool) NetworkOption {
return func(n *network) {
n.persist = persist
}
}
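// Usage sketch (controller variable and network name hypothetical): disabling
// persistence at creation time makes Skip() return true, so the store helpers
// silently skip the object:
//
//	n, err := ctrlr.NewNetwork("bridge", "scratch-net", NetworkOptionPersist(false))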
func (n *network) processOptions(options ...NetworkOption) {
for _, opt := range options {
if opt != nil {
@ -233,13 +258,22 @@ func (n *network) Delete() error {
// deleteFromStore performs an atomic delete operation and the network.endpointCnt field will help
// prevent any possible race between endpoint join and network delete
if err = ctrlr.deleteNetworkFromStore(n); err != nil {
if err = ctrlr.deleteFromStore(n); err != nil {
if err == datastore.ErrKeyModified {
return types.InternalErrorf("operation in progress. delete failed for network %s. Please try again.")
}
return err
}
defer func() {
if err != nil {
n.dbExists = false
if e := ctrlr.updateToStore(n); e != nil {
log.Warnf("failed to recreate network in store %s : %v", n.name, e)
}
}
}()
if err = n.deleteNetwork(); err != nil {
return err
}
@ -315,13 +349,13 @@ func (n *network) CreateEndpoint(name string, options ...EndpointOption) (Endpoi
n.Unlock()
n.IncEndpointCnt()
if err = ctrlr.updateNetworkToStore(n); err != nil {
if err = ctrlr.updateToStore(n); err != nil {
return nil, err
}
defer func() {
if err != nil {
n.DecEndpointCnt()
if err = ctrlr.updateNetworkToStore(n); err != nil {
if err = ctrlr.updateToStore(n); err != nil {
log.Warnf("endpoint count cleanup failed when updating network for %s : %v", name, err)
}
}
@ -337,8 +371,10 @@ func (n *network) CreateEndpoint(name string, options ...EndpointOption) (Endpoi
}
}()
if err = ctrlr.updateEndpointToStore(ep); err != nil {
return nil, err
if !ep.isLocalScoped() {
if err = ctrlr.updateToStore(ep); err != nil {
return nil, err
}
}
return ep, nil
@ -398,11 +434,8 @@ func (n *network) EndpointByID(id string) (Endpoint, error) {
return nil, ErrNoSuchEndpoint(id)
}
func (n *network) isGlobalScoped() (bool, error) {
n.Lock()
c := n.ctrlr
n.Unlock()
return c.isDriverGlobalScoped(n.networkType)
func (n *network) isGlobalScoped() bool {
return n.DataScope() == datastore.GlobalScope
}
func (n *network) updateSvcRecord(ep *endpoint, isAdd bool) {

View file

@ -54,16 +54,15 @@ func (sb *sandbox) processOptions(options ...SandboxOption) {
type epHeap []*endpoint
type sandbox struct {
id string
containerID string
config containerConfig
osSbox osl.Sandbox
controller *controller
refCnt int
endpoints epHeap
epPriority map[string]int
//hostsPath string
//resolvConfPath string
id string
containerID string
config containerConfig
osSbox osl.Sandbox
controller *controller
refCnt int
hostsOnce sync.Once
endpoints epHeap
epPriority map[string]int
joinLeaveDone chan struct{}
sync.Mutex
}
@ -149,6 +148,11 @@ func (sb *sandbox) Delete() error {
// Detach from all endpoints
for _, ep := range sb.getConnectedEndpoints() {
// endpoint in the Gateway network will be cleaned up
// when the sandbox no longer needs external connectivity
if ep.endpointInGWNetwork() {
continue
}
if err := ep.Leave(sb); err != nil {
log.Warnf("Failed detaching sandbox %s from endpoint %s: %v\n", sb.ID(), ep.ID(), err)
}
@ -342,15 +346,16 @@ func (sb *sandbox) populateNetworkResources(ep *endpoint) error {
}
}
sb.Lock()
highEp := sb.endpoints[0]
sb.Unlock()
if ep == highEp {
if err := sb.updateGateway(ep); err != nil {
return err
for _, gwep := range sb.getConnectedEndpoints() {
if len(gwep.Gateway()) > 0 {
if gwep != ep {
return nil
}
if err := sb.updateGateway(gwep); err != nil {
return err
}
}
}
return nil
}
@ -389,26 +394,33 @@ func (sb *sandbox) clearNetworkResources(ep *endpoint) error {
return nil
}
highEpBefore := sb.endpoints[0]
var (
i int
e *endpoint
gwepBefore, gwepAfter *endpoint
index = -1
)
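// Locate the endpoint being removed and the current gateway endpoint,
// i.e. the first endpoint in priority order that has a gateway set.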
for i, e = range sb.endpoints {
for i, e := range sb.endpoints {
if e == ep {
index = i
}
if len(e.Gateway()) > 0 && gwepBefore == nil {
gwepBefore = e
}
if index != -1 && gwepBefore != nil {
break
}
}
heap.Remove(&sb.endpoints, i)
var highEpAfter *endpoint
if len(sb.endpoints) > 0 {
highEpAfter = sb.endpoints[0]
heap.Remove(&sb.endpoints, index)
for _, e := range sb.endpoints {
if len(e.Gateway()) > 0 {
gwepAfter = e
break
}
}
delete(sb.epPriority, ep.ID())
sb.Unlock()
if highEpBefore != highEpAfter {
sb.updateGateway(highEpAfter)
if gwepAfter != nil && gwepBefore != gwepAfter {
sb.updateGateway(gwepAfter)
}
return nil
@ -447,22 +459,47 @@ func (sb *sandbox) buildHostsFile() error {
}
func (sb *sandbox) updateHostsFile(ifaceIP string, svcRecords []etchosts.Record) error {
var err error
if sb.config.originHostsPath != "" {
return nil
}
// Rebuild the hosts file accounting for the passed interface IP and service records
extraContent := make([]etchosts.Record, 0, len(sb.config.extraHosts)+len(svcRecords))
max := func(a, b int) int {
if a < b {
return b
}
for _, extraHost := range sb.config.extraHosts {
extraContent = append(extraContent, etchosts.Record{Hosts: extraHost.name, IP: extraHost.IP})
return a
}
extraContent := make([]etchosts.Record, 0,
max(len(sb.config.extraHosts), len(svcRecords)))
sb.hostsOnce.Do(func() {
// Rebuild the hosts file accounting for the passed
// interface IP and service records
for _, extraHost := range sb.config.extraHosts {
extraContent = append(extraContent,
etchosts.Record{Hosts: extraHost.name, IP: extraHost.IP})
}
err = etchosts.Build(sb.config.hostsPath, ifaceIP,
sb.config.hostName, sb.config.domainName, extraContent)
})
if err != nil {
return err
}
extraContent = extraContent[:0]
for _, svc := range svcRecords {
extraContent = append(extraContent, svc)
}
return etchosts.Build(sb.config.hostsPath, ifaceIP, sb.config.hostName, sb.config.domainName, extraContent)
sb.addHostsEntries(extraContent)
return nil
}
func (sb *sandbox) addHostsEntries(recs []etchosts.Record) {
@ -629,6 +666,38 @@ func (sb *sandbox) updateDNS(ipv6Enabled bool) error {
return os.Rename(tmpResolvFile.Name(), sb.config.resolvConfPath)
}
// joinLeaveStart waits to ensure there are no joins or leaves in progress and
// marks this join/leave in progress without race
func (sb *sandbox) joinLeaveStart() {
sb.Lock()
defer sb.Unlock()
for sb.joinLeaveDone != nil {
joinLeaveDone := sb.joinLeaveDone
sb.Unlock()
<-joinLeaveDone
sb.Lock()
}
sb.joinLeaveDone = make(chan struct{})
}
// joinLeaveEnd marks the end of this join/leave operation and
// signals the same without race to other join and leave waiters
func (sb *sandbox) joinLeaveEnd() {
sb.Lock()
defer sb.Unlock()
if sb.joinLeaveDone != nil {
close(sb.joinLeaveDone)
sb.joinLeaveDone = nil
}
}
// OptionHostname function returns an option setter for hostname option to
// be passed to NewSandbox method.
func OptionHostname(name string) SandboxOption {
@ -748,6 +817,17 @@ func (eh epHeap) Less(i, j int) bool {
ci, _ := eh[i].getSandbox()
cj, _ := eh[j].getSandbox()
epi := eh[i]
epj := eh[j]
if epi.endpointInGWNetwork() {
return false
}
if epj.endpointInGWNetwork() {
return true
}
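// Endpoints on the gateway network get the lowest priority, so the
// docker_gwbridge endpoint can never win the gateway election.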
cip, ok := ci.epPriority[eh[i].ID()]
if !ok {
cip = 0

View file

@ -138,7 +138,7 @@ func (c *controller) acceptClientConnections(sock string, l net.Listener) {
conn, err := l.Accept()
if err != nil {
if _, err1 := os.Stat(sock); os.IsNotExist(err1) {
logrus.Warnf("Unix socket %s doesnt exist. cannot accept client connections", sock)
logrus.Debugf("Unix socket %s doesnt exist. cannot accept client connections", sock)
return
}
logrus.Errorf("Error accepting connection %v", err)

View file

@ -3,44 +3,87 @@ package libnetwork
import (
"encoding/json"
"fmt"
"time"
log "github.com/Sirupsen/logrus"
"github.com/docker/libkv/store"
"github.com/docker/libnetwork/config"
"github.com/docker/libnetwork/datastore"
)
func (c *controller) validateDatastoreConfig() bool {
return c.cfg != nil && c.cfg.Datastore.Client.Provider != "" && c.cfg.Datastore.Client.Address != ""
var (
defaultBoltTimeout = 3 * time.Second
defaultLocalStoreConfig = config.DatastoreCfg{
Embedded: true,
Client: config.DatastoreClientCfg{
Provider: "boltdb",
Address: defaultPrefix + "/boltdb.db",
Config: &store.Config{
Bucket: "libnetwork",
ConnectionTimeout: defaultBoltTimeout,
},
},
}
)
func (c *controller) validateGlobalStoreConfig() bool {
return c.cfg != nil && c.cfg.GlobalStore.Client.Provider != "" && c.cfg.GlobalStore.Client.Address != ""
}
func (c *controller) initDataStore() error {
func (c *controller) initGlobalStore() error {
c.Lock()
cfg := c.cfg
c.Unlock()
if !c.validateDatastoreConfig() {
return fmt.Errorf("datastore initialization requires a valid configuration")
if !c.validateGlobalStoreConfig() {
return fmt.Errorf("globalstore initialization requires a valid configuration")
}
store, err := datastore.NewDataStore(&cfg.Datastore)
store, err := datastore.NewDataStore(&cfg.GlobalStore)
if err != nil {
return err
}
c.Lock()
c.store = store
c.globalStore = store
c.Unlock()
nws, err := c.getNetworksFromStore()
nws, err := c.getNetworksFromStore(true)
if err == nil {
c.processNetworkUpdate(nws, nil)
} else if err != datastore.ErrKeyNotFound {
log.Warnf("failed to read networks from datastore during init : %v", err)
log.Warnf("failed to read networks from globalstore during init : %v", err)
}
return c.watchNetworks()
}
func (c *controller) getNetworksFromStore() ([]*store.KVPair, error) {
func (c *controller) initLocalStore() error {
c.Lock()
cs := c.store
cfg := c.cfg
c.Unlock()
localStore, err := datastore.NewDataStore(c.getLocalStoreConfig(cfg))
if err != nil {
return err
}
c.Lock()
c.localStore = localStore
c.Unlock()
nws, err := c.getNetworksFromStore(false)
if err == nil {
c.processNetworkUpdate(nws, nil)
} else if err != datastore.ErrKeyNotFound {
log.Warnf("failed to read networks from localstore during init : %v", err)
}
return nil
}
func (c *controller) getNetworksFromStore(global bool) ([]*store.KVPair, error) {
var cs datastore.DataStore
c.Lock()
if global {
cs = c.globalStore
} else {
cs = c.localStore
}
c.Unlock()
return cs.KVStore().List(datastore.Key(datastore.NetworkKeyPrefix))
}
@ -54,50 +97,6 @@ func (c *controller) newNetworkFromStore(n *network) error {
return c.addNetwork(n)
}
func (c *controller) updateNetworkToStore(n *network) error {
global, err := n.isGlobalScoped()
if err != nil || !global {
return err
}
c.Lock()
cs := c.store
c.Unlock()
if cs == nil {
log.Debugf("datastore not initialized. Network %s is not added to the store", n.Name())
return nil
}
return cs.PutObjectAtomic(n)
}
func (c *controller) deleteNetworkFromStore(n *network) error {
global, err := n.isGlobalScoped()
if err != nil || !global {
return err
}
c.Lock()
cs := c.store
c.Unlock()
if cs == nil {
log.Debugf("datastore not initialized. Network %s is not deleted from datastore", n.Name())
return nil
}
if err := cs.DeleteObjectAtomic(n); err != nil {
return err
}
return nil
}
func (c *controller) getNetworkFromStore(nid string) (*network, error) {
n := network{id: nid}
if err := c.store.GetObject(datastore.Key(n.Key()...), &n); err != nil {
return nil, err
}
return &n, nil
}
func (c *controller) newEndpointFromStore(key string, ep *endpoint) error {
ep.Lock()
n := ep.network
@ -113,52 +112,30 @@ func (c *controller) newEndpointFromStore(key string, ep *endpoint) error {
return err
}
func (c *controller) updateEndpointToStore(ep *endpoint) error {
ep.Lock()
n := ep.network
name := ep.name
ep.Unlock()
global, err := n.isGlobalScoped()
if err != nil || !global {
return err
func (c *controller) updateToStore(kvObject datastore.KV) error {
if kvObject.Skip() {
return nil
}
c.Lock()
cs := c.store
c.Unlock()
cs := c.getDataStore(kvObject.DataScope())
if cs == nil {
log.Debugf("datastore not initialized. endpoint %s is not added to the store", name)
log.Debugf("datastore not initialized. kv object %s is not added to the store", datastore.Key(kvObject.Key()...))
return nil
}
return cs.PutObjectAtomic(ep)
return cs.PutObjectAtomic(kvObject)
}
func (c *controller) getEndpointFromStore(eid string) (*endpoint, error) {
ep := endpoint{id: eid}
if err := c.store.GetObject(datastore.Key(ep.Key()...), &ep); err != nil {
return nil, err
func (c *controller) deleteFromStore(kvObject datastore.KV) error {
if kvObject.Skip() {
return nil
}
return &ep, nil
}
func (c *controller) deleteEndpointFromStore(ep *endpoint) error {
ep.Lock()
n := ep.network
ep.Unlock()
global, err := n.isGlobalScoped()
if err != nil || !global {
return err
}
c.Lock()
cs := c.store
c.Unlock()
cs := c.getDataStore(kvObject.DataScope())
if cs == nil {
log.Debugf("datastore not initialized. endpoint %s is not deleted from datastore", ep.Name())
log.Debugf("datastore not initialized. kv object %s is not deleted from datastore", datastore.Key(kvObject.Key()...))
return nil
}
if err := cs.DeleteObjectAtomic(ep); err != nil {
if err := cs.DeleteObjectAtomic(kvObject); err != nil {
return err
}
@ -166,12 +143,12 @@ func (c *controller) deleteEndpointFromStore(ep *endpoint) error {
}
func (c *controller) watchNetworks() error {
if !c.validateDatastoreConfig() {
if !c.validateGlobalStoreConfig() {
return nil
}
c.Lock()
cs := c.store
cs := c.globalStore
c.Unlock()
networkKey := datastore.Key(datastore.NetworkKeyPrefix)
@ -191,8 +168,7 @@ func (c *controller) watchNetworks() error {
lview := c.networks
c.Unlock()
for k, v := range lview {
global, _ := v.isGlobalScoped()
if global {
if v.isGlobalScoped() {
tmpview[k] = v
}
}
@ -207,7 +183,7 @@ func (c *controller) watchNetworks() error {
continue
}
tmp := network{}
if err := c.store.GetObject(datastore.Key(existing.Key()...), &tmp); err != datastore.ErrKeyNotFound {
if err := c.globalStore.GetObject(datastore.Key(existing.Key()...), &tmp); err != datastore.ErrKeyNotFound {
continue
}
if err := existing.deleteNetwork(); err != nil {
@ -221,12 +197,12 @@ func (c *controller) watchNetworks() error {
}
func (n *network) watchEndpoints() error {
if !n.ctrlr.validateDatastoreConfig() {
if n.Skip() || !n.ctrlr.validateGlobalStoreConfig() {
return nil
}
n.Lock()
cs := n.ctrlr.store
cs := n.ctrlr.globalStore
tmp := endpoint{network: n}
n.stopWatchCh = make(chan struct{})
stopCh := n.stopWatchCh
@ -251,28 +227,11 @@ func (n *network) watchEndpoints() error {
lview := n.endpoints
n.Unlock()
for k, v := range lview {
global, _ := v.network.isGlobalScoped()
if global {
if v.network.isGlobalScoped() {
tmpview[k] = v
}
}
for _, epe := range eps {
var ep endpoint
err := json.Unmarshal(epe.Value, &ep)
if err != nil {
log.Error(err)
continue
}
delete(tmpview, ep.id)
ep.SetIndex(epe.LastIndex)
ep.network = n
if n.ctrlr.processEndpointUpdate(&ep) {
err = n.ctrlr.newEndpointFromStore(epe.Key, &ep)
if err != nil {
log.Error(err)
}
}
}
n.ctrlr.processEndpointsUpdate(eps, &tmpview)
// Delete processing
for k := range tmpview {
n.Lock()
@ -381,3 +340,53 @@ func ensureKeys(key string, cs datastore.DataStore) error {
}
return cs.KVStore().Put(key, []byte{}, nil)
}
func (c *controller) getLocalStoreConfig(cfg *config.Config) *config.DatastoreCfg {
if cfg != nil && cfg.LocalStore.Client.Provider != "" && cfg.LocalStore.Client.Address != "" {
return &cfg.LocalStore
}
return &defaultLocalStoreConfig
}
func (c *controller) getDataStore(dataScope datastore.DataScope) (dataStore datastore.DataStore) {
c.Lock()
if dataScope == datastore.GlobalScope {
dataStore = c.globalStore
} else if dataScope == datastore.LocalScope {
dataStore = c.localStore
}
c.Unlock()
return
}
func (c *controller) processEndpointsUpdate(eps []*store.KVPair, prune *endpointTable) {
for _, epe := range eps {
var ep endpoint
err := json.Unmarshal(epe.Value, &ep)
if err != nil {
log.Error(err)
continue
}
if prune != nil {
delete(*prune, ep.id)
}
ep.SetIndex(epe.LastIndex)
nid, err := ep.networkIDFromKey(epe.Key)
if err != nil {
log.Error(err)
continue
}
n, err := c.NetworkByID(nid)
if err != nil {
log.Error(err)
continue
}
ep.network = n.(*network)
if c.processEndpointUpdate(&ep) {
err = c.newEndpointFromStore(epe.Key, &ep)
if err != nil {
log.Error(err)
}
}
}
}