Add overlay network support for < 3.16 kernels

Add support for overlay networking in older kernels.

The following was done to achieve this:
    + Create the vxlan network in host namespace.
    + This may create conflicts with other private
      networks so check for conflicts and fail a
      join if there is any conflict.
    + Add iptables-based filtering to only allow
      subnet bridges in the same network to forward
      traffic, while bridges of different networks will
      not be able to forward between each other. Also
      block traffic to the overlay network originating
      from the host itself.

Signed-off-by: Jana Radhakrishnan <mrjana@docker.com>
This commit is contained in:
Jana Radhakrishnan 2015-12-10 14:35:49 -08:00
parent db8f663df0
commit b7d0fefabc
8 changed files with 372 additions and 39 deletions

View file

@ -0,0 +1,119 @@
package overlay
import (
"fmt"
"sync"
"github.com/Sirupsen/logrus"
"github.com/docker/libnetwork/iptables"
)
// globalChain is the name of the iptables chain shared by every overlay
// network. setFilters hooks it into the OUTPUT and FORWARD chains and adds
// the per-bridge jumps to each network's own chain inside it.
const globalChain = "DOCKER-OVERLAY"
// filterOnce guards the one-time creation of globalChain, which is triggered
// from setNetworkChain.
var filterOnce sync.Once
// rawIPTables hands args to iptables.Raw and converts the outcome into a
// single error value. The call is treated as failed both when iptables.Raw
// returns an error and when it returns any output at all; nil is returned
// only for a silent, error-free invocation.
func rawIPTables(args ...string) error {
	out, err := iptables.Raw(args...)
	if err != nil {
		return fmt.Errorf("unable to add overlay filter: %v", err)
	}
	if len(out) != 0 {
		// Any output from the command is reported as the failure reason.
		return fmt.Errorf("unable to add overlay filter: %s", string(out))
	}
	return nil
}
// setupGlobalChain creates the shared overlay chain (globalChain) and appends
// a RETURN rule to it. Failures are only logged; if the chain cannot be
// created the RETURN rule is not attempted.
func setupGlobalChain() {
	err := rawIPTables("-N", globalChain)
	if err != nil {
		logrus.Errorf("could not create global overlay chain: %v", err)
		return
	}

	err = rawIPTables("-A", globalChain, "-j", "RETURN")
	if err != nil {
		logrus.Errorf("could not install default return chain in the overlay global chain: %v", err)
	}
}
// setNetworkChain creates or deletes the per-network iptables chain cname.
//
// With remove == false the chain is created ("-N") and a DROP rule is
// appended to it. With remove == true the chain is flushed ("-F") and then
// deleted ("-X"). The first failing iptables invocation is returned as an
// error.
func setNetworkChain(cname string, remove bool) error {
	const (
		createOp = "-N"
		deleteOp = "-X"
	)

	// The global overlay chain is set up once, on the first call.
	filterOnce.Do(setupGlobalChain)

	if remove {
		// Flush the chain's rules before deleting it.
		if err := rawIPTables("-F", cname); err != nil {
			return fmt.Errorf("failed to flush overlay network chain %s rules: %v", cname, err)
		}
		if err := rawIPTables(deleteOp, cname); err != nil {
			return fmt.Errorf("failed network chain operation %q for chain %s: %v", deleteOp, cname, err)
		}
		return nil
	}

	if err := rawIPTables(createOp, cname); err != nil {
		return fmt.Errorf("failed network chain operation %q for chain %s: %v", createOp, cname, err)
	}
	if err := rawIPTables("-A", cname, "-j", "DROP"); err != nil {
		return fmt.Errorf("failed adding default drop rule to overlay network chain %s: %v", cname, err)
	}
	return nil
}
// addNetworkChain creates the per-network iptables chain cname by calling
// setNetworkChain in its non-remove mode.
func addNetworkChain(cname string) error {
	const remove = false
	return setNetworkChain(cname, remove)
}
// removeNetworkChain flushes and deletes the per-network iptables chain cname
// by calling setNetworkChain in its remove mode.
func removeNetworkChain(cname string) error {
	const remove = true
	return setNetworkChain(cname, remove)
}
// setFilters installs or removes the iptables rules that tie the bridge
// brName to the per-network chain cname.
//
// When adding (remove == false) it first moves the jump to globalChain to the
// top of both the OUTPUT and FORWARD chains, deleting an existing hook before
// re-inserting it. It then makes sure the following two rules are present
// (add) or absent (remove):
//
//   - in globalChain: traffic going out of brName jumps to cname
//   - in cname:       traffic coming in from brName is accepted
//
// Each rule is only touched when its current presence differs from the
// desired state, so repeated calls with the same arguments are no-ops for
// these two rules. The returned error names the operation that failed.
func setFilters(cname, brName string, remove bool) error {
	opt, action := "-I", "add"
	if remove {
		opt, action = "-D", "remove"
	}

	// Every time filters are set for a new subnet, move the global overlay
	// hook to the top of both the OUTPUT and FORWARD chains.
	if !remove {
		for _, chain := range []string{"OUTPUT", "FORWARD"} {
			if iptables.Exists(iptables.Filter, chain, "-j", globalChain) {
				if err := rawIPTables("-D", chain, "-j", globalChain); err != nil {
					return fmt.Errorf("failed to delete overlay hook in chain %s while moving the hook: %v", chain, err)
				}
			}

			if err := rawIPTables("-I", chain, "-j", globalChain); err != nil {
				return fmt.Errorf("failed to insert overlay hook in chain %s: %v", chain, err)
			}
		}
	}

	// Insert/delete the rule that jumps to the per-network chain. The rule
	// needs changing only when it exists and must go, or is missing and must
	// be added, i.e. when exists == remove.
	if exists := iptables.Exists(iptables.Filter, globalChain, "-o", brName, "-j", cname); exists == remove {
		if err := rawIPTables(opt, globalChain, "-o", brName, "-j", cname); err != nil {
			return fmt.Errorf("failed to %s per-bridge filter rule for bridge %s, network chain %s: %v", action, brName, cname, err)
		}
	}

	// Same reasoning for the ACCEPT rule inside the per-network chain.
	if exists := iptables.Exists(iptables.Filter, cname, "-i", brName, "-j", "ACCEPT"); exists == remove {
		if err := rawIPTables(opt, cname, "-i", brName, "-j", "ACCEPT"); err != nil {
			return fmt.Errorf("failed to %s overlay filter rule for network chain %s, bridge %s: %v", action, cname, brName, err)
		}
	}

	return nil
}
// addFilters installs the overlay filter rules for bridge brName in network
// chain cname by calling setFilters in its non-remove mode.
func addFilters(cname, brName string) error {
	const remove = false
	return setFilters(cname, brName, remove)
}
// removeFilters deletes the overlay filter rules for bridge brName in network
// chain cname by calling setFilters in its remove mode.
func removeFilters(cname, brName string) error {
	const remove = true
	return setFilters(cname, brName, remove)
}

View file

@ -4,6 +4,7 @@ import (
"encoding/json"
"fmt"
"net"
"os"
"sync"
"syscall"
@ -12,11 +13,17 @@ import (
"github.com/docker/libnetwork/driverapi"
"github.com/docker/libnetwork/netutils"
"github.com/docker/libnetwork/osl"
"github.com/docker/libnetwork/resolvconf"
"github.com/docker/libnetwork/types"
"github.com/vishvananda/netlink"
"github.com/vishvananda/netlink/nl"
)
var (
// hostMode is set to true by setHostMode, either when the
// _OVERLAY_HOST_MODE environment variable is non-empty or when a test
// vxlan interface cannot be moved into a network namespace.
hostMode bool
// hostModeOnce is used by initSandbox to run setHostMode a single time.
hostModeOnce sync.Once
)
type networkTable map[string]*network
type subnet struct {
@ -87,22 +94,6 @@ func (d *driver) CreateNetwork(id string, option map[string]interface{}, ipV4Dat
return nil
}
/* func (d *driver) createNetworkfromStore(nid string) (*network, error) {
n := &network{
id: nid,
driver: d,
endpoints: endpointTable{},
once: &sync.Once{},
subnets: []*subnet{},
}
err := d.store.GetObject(datastore.Key(n.Key()...), n)
if err != nil {
return nil, fmt.Errorf("unable to get network %q from data store, %v", nid, err)
}
return n, nil
}*/
func (d *driver) DeleteNetwork(nid string) error {
if nid == "" {
return fmt.Errorf("invalid network id")
@ -171,6 +162,12 @@ func (n *network) destroySandbox() {
}
for _, s := range n.subnets {
if hostMode {
if err := removeFilters(n.id[:12], s.brName); err != nil {
logrus.Warnf("Could not remove overlay filters: %v", err)
}
}
if s.vxlanName != "" {
err := deleteVxlan(s.vxlanName)
if err != nil {
@ -178,17 +175,88 @@ func (n *network) destroySandbox() {
}
}
}
if hostMode {
if err := removeNetworkChain(n.id[:12]); err != nil {
logrus.Warnf("could not remove network chain: %v", err)
}
}
sbox.Destroy()
n.setSandbox(nil)
}
}
func (n *network) initSubnetSandbox(s *subnet) error {
// create a bridge and vxlan device for this subnet and move it to the sandbox
brName, err := netutils.GenerateIfaceName("bridge", 7)
if err != nil {
return err
// setHostMode decides whether the overlay driver has to run in host mode and
// records the result in the package-level hostMode flag.
//
// Host mode is forced when the _OVERLAY_HOST_MODE environment variable is
// non-empty. Otherwise a throw-away vxlan interface is created and the
// function tries to set its namespace to the one referenced by
// /proc/self/ns/net; if that move fails, hostMode is enabled. Any failure
// while preparing the probe is logged and leaves hostMode untouched.
func setHostMode() {
	const (
		probeName = "testvxlan"
		nsPath    = "/proc/self/ns/net"
	)

	if os.Getenv("_OVERLAY_HOST_MODE") != "" {
		hostMode = true
		return
	}

	if err := createVxlan(probeName, 1); err != nil {
		logrus.Errorf("Failed to create testvxlan interface: %v", err)
		return
	}
	// The probe interface is always cleaned up, whatever the outcome.
	defer deleteVxlan(probeName)

	nsFile, err := os.Open(nsPath)
	if err != nil {
		logrus.Errorf("Failed to open path %s for network namespace for setting host mode: %v", nsPath, err)
		return
	}
	defer nsFile.Close()

	targetFD := int(nsFile.Fd())

	link, err := netlink.LinkByName(probeName)
	if err != nil {
		logrus.Errorf("Failed to get link testvxlan: %v", err)
		return
	}

	// Not being able to move the vxlan interface to a namespace means we
	// have to fall back to host mode.
	if moveErr := netlink.LinkSetNsFd(link, targetFD); moveErr != nil {
		hostMode = true
	}
}
// generateVxlanName builds the vxlan interface name for subnet s in the form
// "vx-<vxlan id as 6-digit zero-padded hex>-<first 5 characters of n.id>".
func (n *network) generateVxlanName(s *subnet) string {
	return fmt.Sprintf("vx-%06x-%s", n.vxlanID(s), n.id[:5])
}
// generateBridgeName builds the bridge interface name for subnet s in the
// form "ov-<vxlan id as 6-digit zero-padded hex>-<first 5 characters of n.id>".
func (n *network) generateBridgeName(s *subnet) string {
	return fmt.Sprintf("ov-%06x-%s", n.vxlanID(s), n.id[:5])
}
// isOverlap reports whether nw conflicts with the host's configuration. It
// returns true when netutils.CheckNameserverOverlaps or
// netutils.CheckRouteOverlaps returns an error for nw. The nameserver list is
// taken from resolvconf.Get; if that call fails, the nameserver check runs
// against an empty list.
func isOverlap(nw *net.IPNet) bool {
	var resolvers []string
	if conf, err := resolvconf.Get(); err == nil {
		resolvers = resolvconf.GetNameserversAsCIDR(conf.Content)
	}

	// Short-circuit evaluation keeps the route check from running once a
	// nameserver conflict has been found.
	return netutils.CheckNameserverOverlaps(resolvers, nw) != nil ||
		netutils.CheckRouteOverlaps(nw) != nil
}
func (n *network) initSubnetSandbox(s *subnet) error {
if hostMode && isOverlap(s.subnetIP) {
return fmt.Errorf("overlay subnet %s has conflicts in the host while running in host mode", s.subnetIP.String())
}
// create a bridge and vxlan device for this subnet and move it to the sandbox
brName := n.generateBridgeName(s)
sbox := n.sandbox()
if err := sbox.AddInterface(brName, "br",
@ -197,7 +265,12 @@ func (n *network) initSubnetSandbox(s *subnet) error {
return fmt.Errorf("bridge creation in sandbox failed for subnet %q: %v", s.subnetIP.String(), err)
}
vxlanName, err := createVxlan(n.vxlanID(s))
vxlanName := n.generateVxlanName(s)
// Try to delete the vxlan interface if already present
deleteVxlan(vxlanName)
err := createVxlan(vxlanName, n.vxlanID(s))
if err != nil {
return err
}
@ -207,6 +280,12 @@ func (n *network) initSubnetSandbox(s *subnet) error {
return fmt.Errorf("vxlan interface creation failed for subnet %q: %v", s.subnetIP.String(), err)
}
if hostMode {
if err := addFilters(n.id[:12], brName); err != nil {
return err
}
}
n.Lock()
s.vxlanName = vxlanName
s.brName = brName
@ -220,8 +299,16 @@ func (n *network) initSandbox() error {
n.initEpoch++
n.Unlock()
hostModeOnce.Do(setHostMode)
if hostMode {
if err := addNetworkChain(n.id[:12]); err != nil {
return err
}
}
sbox, err := osl.NewSandbox(
osl.GenerateKey(fmt.Sprintf("%d-", n.initEpoch)+n.id), true)
osl.GenerateKey(fmt.Sprintf("%d-", n.initEpoch)+n.id), !hostMode)
if err != nil {
return fmt.Errorf("could not create network sandbox: %v", err)
}

View file

@ -47,14 +47,9 @@ func createVethPair() (string, string, error) {
return name1, name2, nil
}
func createVxlan(vni uint32) (string, error) {
func createVxlan(name string, vni uint32) error {
defer osl.InitOSContext()()
name, err := netutils.GenerateIfaceName("vxlan", 7)
if err != nil {
return "", fmt.Errorf("error generating vxlan name: %v", err)
}
vxlan := &netlink.Vxlan{
LinkAttrs: netlink.LinkAttrs{Name: name},
VxlanId: int(vni),
@ -66,10 +61,10 @@ func createVxlan(vni uint32) (string, error) {
}
if err := netlink.LinkAdd(vxlan); err != nil {
return "", fmt.Errorf("error creating vxlan interface: %v", err)
return fmt.Errorf("error creating vxlan interface: %v", err)
}
return name, nil
return nil
}
func deleteVxlan(name string) error {

View file

@ -109,6 +109,7 @@ func (i *nwIface) Remove() error {
n.Lock()
path := n.path
isDefault := n.isDefault
n.Unlock()
return nsInvoke(path, func(nsFD int) error { return nil }, func(callerFD int) error {
@ -134,7 +135,7 @@ func (i *nwIface) Remove() error {
if err := netlink.LinkDel(iface); err != nil {
return fmt.Errorf("failed deleting bridge %q: %v", i.SrcName(), err)
}
} else {
} else if !isDefault {
// Move the network interface to caller namespace.
if err := netlink.LinkSetNsFd(iface, callerFD); err != nil {
fmt.Println("LinkSetNsPid failed: ", err)
@ -213,9 +214,15 @@ func (n *networkNamespace) AddInterface(srcName, dstPrefix string, options ...If
}
n.Lock()
i.dstName = fmt.Sprintf("%s%d", i.dstName, n.nextIfIndex)
n.nextIfIndex++
if n.isDefault {
i.dstName = i.srcName
} else {
i.dstName = fmt.Sprintf("%s%d", i.dstName, n.nextIfIndex)
n.nextIfIndex++
}
path := n.path
isDefault := n.isDefault
n.Unlock()
return nsInvoke(path, func(nsFD int) error {
@ -231,9 +238,13 @@ func (n *networkNamespace) AddInterface(srcName, dstPrefix string, options ...If
return fmt.Errorf("failed to get link by name %q: %v", i.srcName, err)
}
// Move the network interface to the destination namespace.
if err := netlink.LinkSetNsFd(iface, nsFD); err != nil {
return fmt.Errorf("failed to set namespace on link %q: %v", i.srcName, err)
// Move the network interface to the destination
// namespace only if the namespace is not a default
// type
if !isDefault {
if err := netlink.LinkSetNsFd(iface, nsFD); err != nil {
return fmt.Errorf("failed to set namespace on link %q: %v", i.srcName, err)
}
}
return nil

View file

@ -41,6 +41,7 @@ type networkNamespace struct {
staticRoutes []*types.StaticRoute
neighbors []*neigh
nextIfIndex int
isDefault bool
sync.Mutex
}
@ -146,7 +147,7 @@ func NewSandbox(key string, osCreate bool) (Sandbox, error) {
return nil, err
}
return &networkNamespace{path: key}, nil
return &networkNamespace{path: key, isDefault: !osCreate}, nil
}
func (n *networkNamespace) InterfaceOptions() IfaceOptionSetter {

View file

@ -163,6 +163,7 @@ EOF
--name=${name} \
--privileged \
-p ${hport}:${cport} \
-e _OVERLAY_HOST_MODE \
-v $(pwd)/:/go/src/github.com/docker/libnetwork \
-v /tmp:/tmp \
-v $(pwd)/${TMPC_ROOT}:/scratch \
@ -215,6 +216,21 @@ function runc() {
dnet_exec ${dnet} "umount /var/run/netns/c && rm /var/run/netns/c"
}
# runc_nofail <dnet-container> <sandbox-id> <command>
#
# Runs <command> through dnet_exec inside the network namespace bound from
# /var/run/docker/netns/<sandbox-id>, chrooted into /scratch/rootfs.
# errexit is switched off around the command so that a failure does not abort
# the caller; the command's exit code is left in the global variable `status`
# for the caller to inspect.
function runc_nofail() {
local dnet
dnet=${1}
shift
# After the shift, ${1} is the sandbox id and ${2} the command to run.
dnet_exec ${dnet} "cp /var/lib/docker/network/files/${1}*/* /scratch/rootfs/etc"
dnet_exec ${dnet} "mkdir -p /var/run/netns"
# Bind the sandbox's namespace file to /var/run/netns/c so that
# `ip netns exec c` can address it by name.
dnet_exec ${dnet} "touch /var/run/netns/c && mount -o bind /var/run/docker/netns/${1} /var/run/netns/c"
# Let the command fail without terminating the script; capture its exit code.
set +e
dnet_exec ${dnet} "ip netns exec c unshare -fmuip --mount-proc chroot \"/scratch/rootfs\" /bin/sh -c \"/bin/mount -t proc proc /proc && ${2}\""
status="$?"
set -e
# Undo the namespace bind mount created above.
dnet_exec ${dnet} "umount /var/run/netns/c && rm /var/run/netns/c"
}
function start_etcd() {
local bridge_ip
stop_etcd
@ -442,3 +458,83 @@ function test_overlay_singlehost() {
dnet_cmd $(inst_id2port 1) network rm multihost
}
# test_overlay_hostmode <dnet-suffix>
#
# Creates two overlay networks (multihost1, multihost2) on dnet instance 1,
# attaches two containers to each, and then checks for every ordered pair of
# distinct containers that:
#   - pings by name within the same network succeed,
#   - pings by IP across the two networks fail,
#   - pings by IP from the dnet container itself fail.
# Finally all containers and both networks are removed again.
function test_overlay_hostmode() {
dnet_suffix=$1
shift
echo $(docker ps)
start=1
end=2
# Setup overlay network and connect containers to it
dnet_cmd $(inst_id2port 1) network create -d overlay multihost1
dnet_cmd $(inst_id2port 1) network create -d overlay multihost2
dnet_cmd $(inst_id2port 1) network ls
for i in `seq ${start} ${end}`;
do
dnet_cmd $(inst_id2port 1) container create mh1_${i}
net_connect 1 mh1_${i} multihost1
done
for i in `seq ${start} ${end}`;
do
dnet_cmd $(inst_id2port 1) container create mh2_${i}
net_connect 1 mh2_${i} multihost2
done
# Now test connectivity between all the containers using service names
for i in `seq ${start} ${end}`;
do
for j in `seq ${start} ${end}`;
do
if [ "$i" -eq "$j" ]; then
continue
fi
# Find the IP addresses of the j containers on both networks
# NOTE(review): the address is taken as the 11th whitespace-separated
# field of the nslookup output; this presumably depends on the exact
# output format of the nslookup in the test rootfs - confirm.
hrun runc $(dnet_container_name 1 $dnet_suffix) $(get_sbox_id 1 mh1_${i}) "nslookup mh1_$j"
mh1_j_ip=$(echo ${output} | awk '{print $11}')
hrun runc $(dnet_container_name 1 $dnet_suffix) $(get_sbox_id 1 mh2_${i}) "nslookup mh2_$j"
mh2_j_ip=$(echo ${output} | awk '{print $11}')
# Ping the j containers in the same network and ensure they are successful
runc $(dnet_container_name 1 $dnet_suffix) $(get_sbox_id 1 mh1_${i}) \
"ping -c 1 mh1_$j"
runc $(dnet_container_name 1 $dnet_suffix) $(get_sbox_id 1 mh2_${i}) \
"ping -c 1 mh2_$j"
# Try pinging j container IPs from the container in the other network and make sure that they are not successful
runc_nofail $(dnet_container_name 1 $dnet_suffix) $(get_sbox_id 1 mh1_${i}) "ping -c 1 ${mh2_j_ip}"
[ "${status}" -ne 0 ]
runc_nofail $(dnet_container_name 1 $dnet_suffix) $(get_sbox_id 1 mh2_${i}) "ping -c 1 ${mh1_j_ip}"
[ "${status}" -ne 0 ]
# Try pinging the j container IPs from the host (dnet container in this case) and make sure that they are not successful
# NOTE(review): the whole ping command is passed to `docker exec -it` as a
# single quoted argument. Verify that ping is really executed here;
# if docker exec itself fails, the non-zero status checks below would
# pass without exercising the filter rules.
hrun docker exec -it $(dnet_container_name 1 $dnet_suffix) "ping -c 1 ${mh1_j_ip}"
[ "${status}" -ne 0 ]
hrun docker exec -it $(dnet_container_name 1 $dnet_suffix) "ping -c 1 ${mh2_j_ip}"
[ "${status}" -ne 0 ]
done
done
# Teardown the container connections and the network
for i in `seq ${start} ${end}`;
do
net_disconnect 1 mh1_${i} multihost1
dnet_cmd $(inst_id2port 1) container rm mh1_${i}
done
for i in `seq ${start} ${end}`;
do
net_disconnect 1 mh2_${i} multihost2
dnet_cmd $(inst_id2port 1) container rm mh2_${i}
done
dnet_cmd $(inst_id2port 1) network rm multihost1
dnet_cmd $(inst_id2port 1) network rm multihost2
}

View file

@ -0,0 +1,9 @@
# -*- mode: sh -*-
#!/usr/bin/env bats
load helpers
# Runs the host-mode overlay scenario (test_overlay_hostmode) with the
# "consul" dnet suffix.
# NOTE(review): skip_for_circleci presumably skips this test on CircleCI -
# confirm against the helpers file.
@test "Test overlay network hostmode with consul" {
skip_for_circleci
test_overlay_hostmode consul
}

View file

@ -56,6 +56,21 @@ function run_overlay_consul_tests() {
unset cmap[dnet-3-consul]
}
# Runs the overlay host-mode bats suite against a single consul-backed dnet
# instance. _OVERLAY_HOST_MODE is exported for the duration of the run (the
# overlay driver's setHostMode enables host mode when it is non-empty) and is
# unset again afterwards. start/stop output is appended to
# ${INTEGRATION_ROOT}/test.log.
function run_overlay_consul_host_tests() {
export _OVERLAY_HOST_MODE="true"
## Setup
start_dnet 1 consul 1>>${INTEGRATION_ROOT}/test.log 2>&1
cmap[dnet-1-consul]=dnet-1-consul
## Run the test cases
./integration-tmp/bin/bats ./test/integration/dnet/overlay-consul-host.bats
## Teardown
stop_dnet 1 consul 1>>${INTEGRATION_ROOT}/test.log 2>&1
unset cmap[dnet-1-consul]
unset _OVERLAY_HOST_MODE
}
function run_overlay_zk_tests() {
## Test overlay network with zookeeper
start_dnet 1 zookeeper 1>>${INTEGRATION_ROOT}/test.log 2>&1
@ -207,7 +222,7 @@ if [ -z "$SUITES" ]; then
# old kernel and limited docker environment.
suites="dnet simple_consul multi_consul multi_zk multi_etcd"
else
suites="dnet simple_consul multi_consul multi_zk multi_etcd bridge overlay_consul overlay_zk overlay_etcd"
suites="dnet simple_consul multi_consul multi_zk multi_etcd bridge overlay_consul overlay_consul_host overlay_zk overlay_etcd"
fi
else
suites="$SUITES"