Переглянути джерело

Import the ssd tool in libnetwork

Signed-off-by: Flavio Crisciani <flavio.crisciani@docker.com>
Flavio Crisciani 7 роки тому
батько
коміт
a16d469867
3 змінених файлів з 261 додано та 0 видалено
  1. 34 0
      libnetwork/cmd/ssd/Dockerfile
  2. 47 0
      libnetwork/cmd/ssd/README.md
  3. 180 0
      libnetwork/cmd/ssd/ssd.py

+ 34 - 0
libnetwork/cmd/ssd/Dockerfile

@@ -0,0 +1,34 @@
+# Image for the ssd troubleshooting tool: Alpine with Python 2, docker-py and
+# the command-line utilities ssd.py shells out to (nsenter from util-linux,
+# ipvsadm, iptables) plus general network debugging tools.
+FROM alpine:3.7
+ENV PACKAGES="\
+    musl \
+    linux-headers \
+    build-base \
+    util-linux \
+    bash \
+    git \
+    ca-certificates \
+    python2 \
+    python2-dev \
+    py-setuptools \
+    iproute2 \
+    iptables \
+    curl \
+    strace \
+    drill \
+    ipvsadm \
+    iperf \
+    ethtool \
+"
+
+# Install the packages, make sure the unversioned python / python-config /
+# easy_install / pip names exist, and bootstrap an up-to-date pip.
+RUN echo \
+    && apk add --no-cache $PACKAGES \
+    && if [[ ! -e /usr/bin/python ]];        then ln -sf /usr/bin/python2.7 /usr/bin/python; fi \
+    && if [[ ! -e /usr/bin/python-config ]]; then ln -sf /usr/bin/python2.7-config /usr/bin/python-config; fi \
+    && if [[ ! -e /usr/bin/easy_install ]];  then ln -sf /usr/bin/easy_install-2.7 /usr/bin/easy_install; fi \
+    && easy_install pip \
+    && pip install --upgrade pip \
+    && if [[ ! -e /usr/bin/pip ]]; then ln -sf /usr/bin/pip2.7 /usr/bin/pip; fi \
+    && echo
+
+# COPY is sufficient for a plain local file; ADD's archive/URL handling is not needed.
+COPY ssd.py /
+# Fetch docker-py over https: the unauthenticated git:// protocol is no longer
+# served by GitHub.
+RUN pip install git+https://github.com/docker/docker-py.git
+ENTRYPOINT [ "python", "/ssd.py"]

+ 47 - 0
libnetwork/cmd/ssd/README.md

@@ -0,0 +1,47 @@
+# Docker Swarm Service Driller (ssd)
+
+ssd is a troubleshooting utility for Docker swarm networks. 
+
+### control-plane and datapath consistency check on a node
+ssd checks the consistency between the Docker network control plane (the docker daemon's in-memory state) and the kernel data-path programming. Currently the tool only checks the consistency of the load balancer (implemented using IPVS).
+
+The following shows the ssd status in a three-node swarm cluster for an overlay network `ov2` that has three services running, each replicated to 3 instances.
+
+````bash
+vagrant@net-1:~/code/go/src/github.com/docker/docker-e2e/tests$ docker run -v /var/run/docker.sock:/var/run/docker.sock -v /var/run/docker/netns:/var/run/docker/netns --privileged --net=host sanimej/ssd ov2
+Verifying LB programming for containers on network ov2
+Verifying container /s2.3.ltrdwef0iqf90rqauw3ehcs56...
+service s2... OK
+service s3... OK
+service s1... OK
+Verifying container /s3.3.nyhwvdvnocb4wftyhb8dr4fj8...
+service s2... OK
+service s3... OK
+service s1... OK
+Verifying container /s1.3.wwx5tuxhnvoz5vrb8ohphby0r...
+service s2... OK
+service s3... OK
+service s1... OK
+Verifying LB programming for containers on network ingress
+Verifying container Ingress...
+service web... OK
+````
+
+For the ingress network, ssd also checks the iptables programming required to direct an incoming packet addressed to `<host ip>:<published port>` to the right `<backend ip>:<target port>`.
+
+### control-plane consistency check across nodes in a cluster
+
+Docker networking uses a gossip protocol to synchronize networking state across the nodes in a cluster. ssd's `gossip-consistency` command verifies whether the state maintained by all the nodes is consistent.
+
+````bash
+In a three-node cluster with services running on an overlay network ov2, ssd gossip-consistency shows:
+
+vagrant@net-1:~/code/go/src/github.com/docker/docker-e2e/tests$ docker run -v /var/run/docker.sock:/var/run/docker.sock -v /var/run/docker/netns:/var/run/docker/netns --privileged sanimej/ssd ov2 gossip-consistency
+Node id: sjfp0ca8f43rvnab6v7f21gq0 gossip hash c57d89094dbb574a37930393278dc282
+
+Node id: bg228r3q9095grj4wxkqs80oe gossip hash c57d89094dbb574a37930393278dc282
+
+Node id: 6jylcraipcv2pxdricqe77j5q gossip hash c57d89094dbb574a37930393278dc282
+````
+
+This is the hash digest of the control-plane state for the network `ov2` on each of the cluster nodes. If the values do not match, running `docker network inspect --verbose` on the individual nodes can help identify the specific difference.

+ 180 - 0
libnetwork/cmd/ssd/ssd.py

@@ -0,0 +1,180 @@
+#!/usr/bin/python
+
+import sys, signal, time
+import docker
+import re
+import subprocess
+import json
+import hashlib
+
+ipv4match = re.compile(
+    r'(25[0-5]|2[0-4][0-9]|[01]?[0-9]?[0-9]).' +
+    r'(25[0-5]|2[0-4][0-9]|[01]?[0-9]?[0-9]).' +
+    r'(25[0-5]|2[0-4][0-9]|[01]?[0-9]?[0-9]).' +
+    r'(25[0-5]|2[0-4][0-9]|[01]?[0-9]?[0-9])'
+)
+
+def check_iptables(name, plist):
+    replace = (':', ',')
+    ports = []
+    for port in plist:
+        for r in replace:
+            port = port.replace(r, ' ')
+
+        p = port.split()
+        ports.append((p[1], p[3]))
+
+    # get the ingress sandbox's docker_gwbridge network IP.
+    # published ports get DNAT'ed to this IP.
+    ip = subprocess.check_output(['/usr/bin/nsenter', '--net=/var/run/docker/netns/ingress_sbox', '/bin/bash', '-c', 'ifconfig eth1 | grep \"inet\\ addr\" | cut -d: -f2 | cut -d\" \" -f1'])
+    ip = ip.rstrip()
+
+    for p in ports:
+        rule = '/sbin/iptables -t nat -C DOCKER-INGRESS -p tcp --dport {0} -j DNAT --to {1}:{2}'.format(p[1], ip, p[1])
+        try:
+            subprocess.check_output(["/bin/bash", "-c", rule])
+        except subprocess.CalledProcessError as e:
+            print "Service {0}: host iptables DNAT rule for port {1} -> ingress sandbox {2}:{3} missing".format(name, p[1], ip, p[1])
+
+def get_namespaces(data, ingress=False):
+    if ingress is True:
+        return {"Ingress":"/var/run/docker/netns/ingress_sbox"}
+    else:
+        spaces =[]
+        for c in data["Containers"]:
+            sandboxes = {str(c) for c in data["Containers"]}
+
+        containers = {}
+        for s in sandboxes:
+            spaces.append(str(cli.inspect_container(s)["NetworkSettings"]["SandboxKey"]))
+            inspect = cli.inspect_container(s)
+            containers[str(inspect["Name"])] = str(inspect["NetworkSettings"]["SandboxKey"])
+        return containers
+
+
+def check_network(nw_name, ingress=False):
+
+    print "Verifying LB programming for containers on network %s" % nw_name
+
+    data = cli.inspect_network(nw_name, verbose=True)
+
+    services = data["Services"]
+    fwmarks = {str(service): str(svalue["LocalLBIndex"]) for service, svalue in services.items()}
+
+    stasks = {}
+    for service, svalue in services.items():
+        if service == "":
+            continue
+        tasks = []
+        for task in svalue["Tasks"]:
+            tasks.append(str(task["EndpointIP"]))
+        stasks[fwmarks[str(service)]] = tasks
+
+        # for services in ingress network verify the iptables rules
+        # that direct ingress (published port) to backend (target port)
+        if ingress is True:
+            check_iptables(service, svalue["Ports"])
+
+    containers = get_namespaces(data, ingress)
+    for container, namespace in containers.items():
+        print "Verifying container %s..." % container
+        ipvs = subprocess.check_output(['/usr/bin/nsenter', '--net=%s' % namespace, '/usr/sbin/ipvsadm', '-ln'])
+
+        mark = ""
+        realmark = {}
+        for line in ipvs.splitlines():
+            if "FWM" in line:
+                mark = re.findall("[0-9]+", line)[0]
+                realmark[str(mark)] = []
+            elif "->" in line:
+                if mark == "":
+                    continue
+                ip = ipv4match.search(line)
+                if ip is not None:
+                    realmark[mark].append(format(ip.group(0)))
+            else:
+                mark = ""
+        for key in realmark.keys():
+            if key not in stasks:
+                print "LB Index %s" % key, "present in IPVS but missing in docker daemon"
+                del realmark[key]
+
+        for key in stasks.keys():
+            if key not in realmark:
+                print "LB Index %s" % key, "present in docker daemon but missing in IPVS"
+                del stasks[key]
+
+        for key in realmark:
+            service = "--Invalid--"
+            for sname, idx in fwmarks.items():
+                if key == idx:
+                    service = sname
+            if len(set(realmark[key])) != len(set(stasks[key])):
+                print "Incorrect LB Programming for service %s" % service
+                print "control-plane backend tasks:"
+                for task in stasks[key]:
+                    print task
+                print "kernel IPVS backend tasks:"
+                for task in realmark[key]:
+                    print task
+            else:
+                print "service %s... OK" % service
+
+if __name__ == '__main__':
+    if len(sys.argv) < 2:
+        print 'Usage: ssd.py network-name [gossip-consistency]'
+        sys.exit()
+
+    cli = docker.APIClient(base_url='unix://var/run/docker.sock', version='auto')
+    if len(sys.argv) == 3:
+        command = sys.argv[2]
+    else:
+        command = 'default'
+
+    if command == 'gossip-consistency':
+        cspec = docker.types.ContainerSpec(
+            image='sanimej/ssd',
+            args=[sys.argv[1], 'gossip-hash'],
+            mounts=[docker.types.Mount('/var/run/docker.sock', '/var/run/docker.sock', type='bind')]
+        )
+        mode = docker.types.ServiceMode(
+            mode='global'
+        )
+        task_template = docker.types.TaskTemplate(cspec)
+
+        cli.create_service(task_template, name='gossip-hash', mode=mode)
+        #TODO change to a deterministic way to check if the service is up.
+        time.sleep(5)
+        output = cli.service_logs('gossip-hash', stdout=True, stderr=True, details=True)
+        for line in output:
+            print("Node id: %s gossip hash %s" % (line[line.find("=")+1:line.find(",")], line[line.find(" ")+1:]))
+        if cli.remove_service('gossip-hash') is not True:
+            print("Deleting gossip-hash service failed")
+    elif command == 'gossip-hash':
+        data = cli.inspect_network(sys.argv[1], verbose=True)
+        services = data["Services"]
+        md5 = hashlib.md5()
+        entries = []
+        for service, value in services.items():
+            entries.append(service)
+            entries.append(value["VIP"])
+            for task in value["Tasks"]:
+                for key, val in task.items():
+                    if isinstance(val, dict):
+                        for k, v in val.items():
+                            entries.append(v)
+                    else:
+                        entries.append(val)
+        entries.sort()
+        for e in entries:
+            md5.update(e)
+        print(md5.hexdigest())
+        sys.stdout.flush()
+        while True:
+           signal.pause()
+    elif command == 'default':
+        if sys.argv[1] == "ingress":
+            check_network("ingress", ingress=True)
+        else:
+            check_network(sys.argv[1])
+            check_network("ingress", ingress=True)