Merge pull request #32505 from fcrisciani/conntrack_test

Conntrack flush support
This commit is contained in:
Sebastiaan van Stijn 2017-04-11 16:40:56 +02:00 committed by GitHub
commit f30e94a495
20 changed files with 919 additions and 23 deletions

View file

@ -12,6 +12,7 @@ import (
"os"
"path/filepath"
"strings"
"syscall"
"time"
"github.com/docker/docker/api/types"
@ -1789,3 +1790,56 @@ func (s *DockerNetworkSuite) TestDockerNetworkDisconnectFromBridge(c *check.C) {
_, _, err := dockerCmdWithError("network", "disconnect", network, name)
c.Assert(err, check.IsNil)
}
// TestConntrackFlowsLeak covers the failure scenario of ticket: https://github.com/docker/docker/issues/8795
// Validates that conntrack is correctly cleaned once a container is destroyed
func (s *DockerNetworkSuite) TestConntrackFlowsLeak(c *check.C) {
testRequires(c, IsAmd64, DaemonIsLinux, Network)
// Create a new network
dockerCmd(c, "network", "create", "--subnet=192.168.10.0/24", "--gateway=192.168.10.1", "-o", "com.docker.network.bridge.host_binding_ipv4=192.168.10.1", "testbind")
assertNwIsAvailable(c, "testbind")
// Launch the server, this will remain listening on an exposed port and reply to any request in a ping/pong fashion
cmd := "while true; do echo hello | nc -w 1 -lu 8080; done"
_, _, err := dockerCmdWithError("run", "-d", "--name", "server", "--net", "testbind", "-p", "8080:8080/udp", "appropriate/nc", "sh", "-c", cmd)
c.Assert(err, check.IsNil)
// Launch a container client, here the objective is to create a flow that is natted in order to expose the bug
cmd = "echo world | nc -q 1 -u 192.168.10.1 8080"
_, _, err = dockerCmdWithError("run", "-d", "--name", "client", "--net=host", "appropriate/nc", "sh", "-c", cmd)
c.Assert(err, check.IsNil)
// Get all the flows using netlink
flows, err := netlink.ConntrackTableList(netlink.ConntrackTable, syscall.AF_INET)
c.Assert(err, check.IsNil)
var flowMatch int
for _, flow := range flows {
// count only the flows that we are interested in, skipping others that can be laying around the host
if flow.Forward.Protocol == syscall.IPPROTO_UDP &&
flow.Forward.DstIP.Equal(net.ParseIP("192.168.10.1")) &&
flow.Forward.DstPort == 8080 {
flowMatch++
}
}
// The client should have created only 1 flow
c.Assert(flowMatch, checker.Equals, 1)
// Now delete the server, this will trigger the conntrack cleanup
err = deleteContainer("server")
c.Assert(err, checker.IsNil)
// Fetch again all the flows and validate that there is no server flow in the conntrack laying around
flows, err = netlink.ConntrackTableList(netlink.ConntrackTable, syscall.AF_INET)
c.Assert(err, check.IsNil)
flowMatch = 0
for _, flow := range flows {
if flow.Forward.Protocol == syscall.IPPROTO_UDP &&
flow.Forward.DstIP.Equal(net.ParseIP("192.168.10.1")) &&
flow.Forward.DstPort == 8080 {
flowMatch++
}
}
// All the flows have to be gone
c.Assert(flowMatch, checker.Equals, 0)
}

View file

@ -24,7 +24,7 @@ github.com/RackSec/srslog 456df3a81436d29ba874f3590eeeee25d666f8a5
github.com/imdario/mergo 0.2.1
#get libnetwork packages
github.com/docker/libnetwork ab8f7e61743aa7e54c5d0dad0551543adadc33cf
github.com/docker/libnetwork b13e0604016a4944025aaff521d9c125850b0d04
github.com/docker/go-events 18b43f1bc85d9cdd42c05a6cd2d444c7a200a894
github.com/armon/go-radix e39d623f12e8e41c7b5529e9a9dd67a1e2261f80
github.com/armon/go-metrics eb0af217e5e9747e41dd5303755356b62d28e3ec
@ -34,7 +34,7 @@ github.com/hashicorp/go-multierror fcdddc395df1ddf4247c69bd436e84cfa0733f7e
github.com/hashicorp/serf 598c54895cc5a7b1a24a398d635e8c0ea0959870
github.com/docker/libkv 1d8431073ae03cdaedb198a89722f3aab6d418ef
github.com/vishvananda/netns 604eaf189ee867d8c147fafc28def2394e878d25
github.com/vishvananda/netlink c682914b0b231f6cad204a86e565551e51d387c0
github.com/vishvananda/netlink 1e86b2bee5b6a7d377e4c02bb7f98209d6a7297c
github.com/BurntSushi/toml f706d00e3de6abe700c994cdd545a1a4915af060
github.com/samuel/go-zookeeper d0e0d8e11f318e000a8cc434616d69e329edc374
github.com/deckarep/golang-set ef32fa3046d9f249d399f98ebaf9be944430fd1d

View file

@ -47,6 +47,7 @@ import (
"container/heap"
"fmt"
"net"
"path/filepath"
"strings"
"sync"
"time"
@ -979,6 +980,8 @@ func (c *controller) NewSandbox(containerID string, options ...SandboxOption) (s
if sb.ingress {
c.ingressSandbox = sb
sb.config.hostsPath = filepath.Join(c.cfg.Daemon.DataDir, "/network/files/hosts")
sb.config.resolvConfPath = filepath.Join(c.cfg.Daemon.DataDir, "/network/files/resolv.conf")
sb.id = "ingress_sbox"
}
c.Unlock()

View file

@ -1346,6 +1346,13 @@ func (d *driver) RevokeExternalConnectivity(nid, eid string) error {
endpoint.portMapping = nil
// Clean the connection tracker state of the host for the specific endpoint
// The host kernel keeps track of the connections (TCP and UDP), so if a new endpoint gets the same IP of
// this one (that is going down), is possible that some of the packets would not be routed correctly inside
// the new endpoint
// Deeper details: https://github.com/docker/docker/issues/8795
clearEndpointConnections(d.nlh, endpoint)
if err = d.storeUpdate(endpoint); err != nil {
return fmt.Errorf("failed to update bridge endpoint %s to store: %v", endpoint.id[0:7], err)
}

View file

@ -7,6 +7,7 @@ import (
"github.com/Sirupsen/logrus"
"github.com/docker/libnetwork/iptables"
"github.com/vishvananda/netlink"
)
// DockerChain: DOCKER iptable chain name
@ -348,3 +349,15 @@ func setupInternalNetworkRules(bridgeIface string, addr net.Addr, icc, insert bo
}
return nil
}
func clearEndpointConnections(nlh *netlink.Handle, ep *bridgeEndpoint) {
var ipv4List []net.IP
var ipv6List []net.IP
if ep.addr != nil {
ipv4List = append(ipv4List, ep.addr.IP)
}
if ep.addrv6 != nil {
ipv6List = append(ipv6List, ep.addrv6.IP)
}
iptables.DeleteConntrackEntries(nlh, ipv4List, ipv6List)
}

View file

@ -665,7 +665,7 @@ func (ep *endpoint) hasInterface(iName string) bool {
func (ep *endpoint) Leave(sbox Sandbox, options ...EndpointOption) error {
if sbox == nil || sbox.ID() == "" || sbox.Key() == "" {
return types.BadRequestErrorf("invalid Sandbox passed to enpoint leave: %v", sbox)
return types.BadRequestErrorf("invalid Sandbox passed to endpoint leave: %v", sbox)
}
sb, ok := sbox.(*sandbox)

View file

@ -129,7 +129,7 @@ type ActiveEndpointsError struct {
}
func (aee *ActiveEndpointsError) Error() string {
return fmt.Sprintf("network %s has active endpoints", aee.name)
return fmt.Sprintf("network %s id %s has active endpoints", aee.name, aee.id)
}
// Forbidden denotes the type of this error

View file

@ -0,0 +1,59 @@
package iptables
import (
"errors"
"net"
"syscall"
"github.com/Sirupsen/logrus"
"github.com/vishvananda/netlink"
)
var (
// ErrConntrackNotConfigurable means that conntrack module is not loaded or does not have the netlink module loaded
ErrConntrackNotConfigurable = errors.New("conntrack is not available")
)
// IsConntrackProgrammable returns true if the handle supports the NETLINK_NETFILTER and the base modules are loaded
func IsConntrackProgrammable(nlh *netlink.Handle) bool {
return nlh.SupportsNetlinkFamily(syscall.NETLINK_NETFILTER)
}
// DeleteConntrackEntries deletes all the conntrack connections on the host for the specified IP
// Returns the number of flows deleted for IPv4, IPv6 else error
func DeleteConntrackEntries(nlh *netlink.Handle, ipv4List []net.IP, ipv6List []net.IP) (uint, uint, error) {
if !IsConntrackProgrammable(nlh) {
return 0, 0, ErrConntrackNotConfigurable
}
var totalIPv4FlowPurged uint
for _, ipAddress := range ipv4List {
flowPurged, err := purgeConntrackState(nlh, syscall.AF_INET, ipAddress)
if err != nil {
logrus.Warnf("Failed to delete conntrack state for %s: %v", ipAddress, err)
continue
}
totalIPv4FlowPurged += flowPurged
}
var totalIPv6FlowPurged uint
for _, ipAddress := range ipv6List {
flowPurged, err := purgeConntrackState(nlh, syscall.AF_INET6, ipAddress)
if err != nil {
logrus.Warnf("Failed to delete conntrack state for %s: %v", ipAddress, err)
continue
}
totalIPv6FlowPurged += flowPurged
}
logrus.Debugf("DeleteConntrackEntries purged ipv4:%d, ipv6:%d", totalIPv4FlowPurged, totalIPv6FlowPurged)
return totalIPv4FlowPurged, totalIPv6FlowPurged, nil
}
func purgeConntrackState(nlh *netlink.Handle, family netlink.InetFamily, ipAddress net.IP) (uint, error) {
filter := &netlink.ConntrackFilter{}
// NOTE: doing the flush using the ipAddress is safe because today there cannot be multiple networks with the same subnet
// so it will not be possible to flush flows that are of other containers
filter.AddIP(netlink.ConntrackNatAnyIP, ipAddress)
return nlh.ConntrackDeleteFilter(netlink.ConntrackTable, family, filter)
}

View file

@ -100,14 +100,14 @@ func detectIptables() {
supportsCOpt = supportsCOption(mj, mn, mc)
}
func initIptables() {
func initDependencies() {
probe()
initFirewalld()
detectIptables()
}
func initCheck() error {
initOnce.Do(initIptables)
initOnce.Do(initDependencies)
if iptablesPath == "" {
return ErrIptablesNotFound

View file

@ -88,12 +88,25 @@ func (nDB *NetworkDB) handleNodeEvent(nEvent *NodeEvent) bool {
}
func (nDB *NetworkDB) handleNetworkEvent(nEvent *NetworkEvent) bool {
var flushEntries bool
// Update our local clock if the received messages has newer
// time.
nDB.networkClock.Witness(nEvent.LTime)
nDB.Lock()
defer nDB.Unlock()
defer func() {
nDB.Unlock()
// When a node leaves a network on the last task removal cleanup the
// local entries for this network & node combination. When the tasks
// on a network are removed we could have missed the gossip updates.
// Not doing this cleanup can leave stale entries because bulksyncs
// from the node will no longer include this network state.
//
// deleteNodeNetworkEntries takes nDB lock.
if flushEntries {
nDB.deleteNodeNetworkEntries(nEvent.NetworkID, nEvent.NodeName)
}
}()
if nEvent.NodeName == nDB.config.NodeName {
return false
@ -121,6 +134,7 @@ func (nDB *NetworkDB) handleNetworkEvent(nEvent *NetworkEvent) bool {
n.leaving = nEvent.Type == NetworkEventTypeLeave
if n.leaving {
n.reapTime = reapInterval
flushEntries = true
}
nDB.addNetworkNode(nEvent.NetworkID, nEvent.NodeName)

View file

@ -372,6 +372,37 @@ func (nDB *NetworkDB) deleteNetworkEntriesForNode(deletedNode string) {
nDB.Unlock()
}
func (nDB *NetworkDB) deleteNodeNetworkEntries(nid, node string) {
nDB.Lock()
nDB.indexes[byNetwork].WalkPrefix(fmt.Sprintf("/%s", nid),
func(path string, v interface{}) bool {
oldEntry := v.(*entry)
params := strings.Split(path[1:], "/")
nid := params[0]
tname := params[1]
key := params[2]
if oldEntry.node != node {
return false
}
entry := &entry{
ltime: oldEntry.ltime,
node: node,
value: oldEntry.value,
deleting: true,
reapTime: reapInterval,
}
nDB.indexes[byTable].Insert(fmt.Sprintf("/%s/%s/%s", tname, nid, key), entry)
nDB.indexes[byNetwork].Insert(fmt.Sprintf("/%s/%s/%s", nid, tname, key), entry)
nDB.broadcaster.Write(makeEvent(opDelete, tname, nid, key, entry.value))
return false
})
nDB.Unlock()
}
func (nDB *NetworkDB) deleteNodeTableEntries(node string) {
nDB.Lock()
nDB.indexes[byTable].Walk(func(path string, v interface{}) bool {

View file

@ -75,13 +75,28 @@ func NlHandle() *netlink.Handle {
func getSupportedNlFamilies() []int {
fams := []int{syscall.NETLINK_ROUTE}
// NETLINK_XFRM test
if err := loadXfrmModules(); err != nil {
if checkXfrmSocket() != nil {
logrus.Warnf("Could not load necessary modules for IPSEC rules: %v", err)
return fams
} else {
fams = append(fams, syscall.NETLINK_XFRM)
}
} else {
fams = append(fams, syscall.NETLINK_XFRM)
}
return append(fams, syscall.NETLINK_XFRM)
// NETLINK_NETFILTER test
if err := loadNfConntrackModules(); err != nil {
if checkNfSocket() != nil {
logrus.Warnf("Could not load necessary modules for Conntrack: %v", err)
} else {
fams = append(fams, syscall.NETLINK_NETFILTER)
}
} else {
fams = append(fams, syscall.NETLINK_NETFILTER)
}
return fams
}
func loadXfrmModules() error {
@ -103,3 +118,23 @@ func checkXfrmSocket() error {
syscall.Close(fd)
return nil
}
func loadNfConntrackModules() error {
if out, err := exec.Command("modprobe", "-va", "nf_conntrack").CombinedOutput(); err != nil {
return fmt.Errorf("Running modprobe nf_conntrack failed with message: `%s`, error: %v", strings.TrimSpace(string(out)), err)
}
if out, err := exec.Command("modprobe", "-va", "nf_conntrack_netlink").CombinedOutput(); err != nil {
return fmt.Errorf("Running modprobe nf_conntrack_netlink failed with message: `%s`, error: %v", strings.TrimSpace(string(out)), err)
}
return nil
}
// API check on required nf_conntrack* modules (nf_conntrack, nf_conntrack_netlink)
func checkNfSocket() error {
fd, err := syscall.Socket(syscall.AF_NETLINK, syscall.SOCK_RAW, syscall.NETLINK_NETFILTER)
if err != nil {
return err
}
syscall.Close(fd)
return nil
}

View file

@ -644,13 +644,6 @@ func (sb *sandbox) SetKey(basePath string) error {
sb.Lock()
sb.osSbox = osSbox
sb.Unlock()
defer func() {
if err != nil {
sb.Lock()
sb.osSbox = nil
sb.Unlock()
}
}()
// If the resolver was setup before stop it and set it up in the
// new osl sandbox.

View file

@ -27,6 +27,19 @@ func (h *Handle) AddrAdd(link Link, addr *Addr) error {
return h.addrHandle(link, addr, req)
}
// AddrReplace will replace (or, if not present, add) an IP address on a link device.
// Equivalent to: `ip addr replace $addr dev $link`
func AddrReplace(link Link, addr *Addr) error {
return pkgHandle.AddrReplace(link, addr)
}
// AddrReplace will replace (or, if not present, add) an IP address on a link device.
// Equivalent to: `ip addr replace $addr dev $link`
func (h *Handle) AddrReplace(link Link, addr *Addr) error {
req := h.newNetlinkRequest(syscall.RTM_NEWADDR, syscall.NLM_F_CREATE|syscall.NLM_F_REPLACE|syscall.NLM_F_ACK)
return h.addrHandle(link, addr, req)
}
// AddrDel will delete an IP address from a link device.
// Equivalent to: `ip addr del $addr dev $link`
func AddrDel(link Link, addr *Addr) error {

View file

@ -0,0 +1,344 @@
package netlink
import (
"bytes"
"encoding/binary"
"errors"
"fmt"
"net"
"syscall"
"github.com/vishvananda/netlink/nl"
)
// ConntrackTableType Conntrack table for the netlink operation
type ConntrackTableType uint8
const (
// ConntrackTable Conntrack table
// https://github.com/torvalds/linux/blob/master/include/uapi/linux/netfilter/nfnetlink.h -> #define NFNL_SUBSYS_CTNETLINK 1
ConntrackTable = 1
// ConntrackExpectTable Conntrack expect table
// https://github.com/torvalds/linux/blob/master/include/uapi/linux/netfilter/nfnetlink.h -> #define NFNL_SUBSYS_CTNETLINK_EXP 2
ConntrackExpectTable = 2
)
const (
// backward compatibility with golang 1.6 which does not have io.SeekCurrent
seekCurrent = 1
)
// InetFamily Family type
type InetFamily uint8
// -L [table] [options] List conntrack or expectation table
// -G [table] parameters Get conntrack or expectation
// -I [table] parameters Create a conntrack or expectation
// -U [table] parameters Update a conntrack
// -E [table] [options] Show events
// -C [table] Show counter
// -S Show statistics
// ConntrackTableList returns the flow list of a table of a specific family
// conntrack -L [table] [options] List conntrack or expectation table
func ConntrackTableList(table ConntrackTableType, family InetFamily) ([]*ConntrackFlow, error) {
return pkgHandle.ConntrackTableList(table, family)
}
// ConntrackTableFlush flushes all the flows of a specified table
// conntrack -F [table] Flush table
// The flush operation applies to all the family types
func ConntrackTableFlush(table ConntrackTableType) error {
return pkgHandle.ConntrackTableFlush(table)
}
// ConntrackDeleteFilter deletes entries on the specified table on the base of the filter
// conntrack -D [table] parameters Delete conntrack or expectation
func ConntrackDeleteFilter(table ConntrackTableType, family InetFamily, filter *ConntrackFilter) (uint, error) {
return pkgHandle.ConntrackDeleteFilter(table, family, filter)
}
// ConntrackTableList returns the flow list of a table of a specific family using the netlink handle passed
// conntrack -L [table] [options] List conntrack or expectation table
func (h *Handle) ConntrackTableList(table ConntrackTableType, family InetFamily) ([]*ConntrackFlow, error) {
res, err := h.dumpConntrackTable(table, family)
if err != nil {
return nil, err
}
// Deserialize all the flows
var result []*ConntrackFlow
for _, dataRaw := range res {
result = append(result, parseRawData(dataRaw))
}
return result, nil
}
// ConntrackTableFlush flushes all the flows of a specified table using the netlink handle passed
// conntrack -F [table] Flush table
// The flush operation applies to all the family types
func (h *Handle) ConntrackTableFlush(table ConntrackTableType) error {
req := h.newConntrackRequest(table, syscall.AF_INET, nl.IPCTNL_MSG_CT_DELETE, syscall.NLM_F_ACK)
_, err := req.Execute(syscall.NETLINK_NETFILTER, 0)
return err
}
// ConntrackDeleteFilter deletes entries on the specified table on the base of the filter using the netlink handle passed
// conntrack -D [table] parameters Delete conntrack or expectation
func (h *Handle) ConntrackDeleteFilter(table ConntrackTableType, family InetFamily, filter *ConntrackFilter) (uint, error) {
res, err := h.dumpConntrackTable(table, family)
if err != nil {
return 0, err
}
var matched uint
for _, dataRaw := range res {
flow := parseRawData(dataRaw)
if match := filter.MatchConntrackFlow(flow); match {
req2 := h.newConntrackRequest(table, family, nl.IPCTNL_MSG_CT_DELETE, syscall.NLM_F_ACK)
// skip the first 4 byte that are the netfilter header, the newConntrackRequest is adding it already
req2.AddRawData(dataRaw[4:])
req2.Execute(syscall.NETLINK_NETFILTER, 0)
matched++
}
}
return matched, nil
}
func (h *Handle) newConntrackRequest(table ConntrackTableType, family InetFamily, operation, flags int) *nl.NetlinkRequest {
// Create the Netlink request object
req := h.newNetlinkRequest((int(table)<<8)|operation, flags)
// Add the netfilter header
msg := &nl.Nfgenmsg{
NfgenFamily: uint8(family),
Version: nl.NFNETLINK_V0,
ResId: 0,
}
req.AddData(msg)
return req
}
func (h *Handle) dumpConntrackTable(table ConntrackTableType, family InetFamily) ([][]byte, error) {
req := h.newConntrackRequest(table, family, nl.IPCTNL_MSG_CT_GET, syscall.NLM_F_DUMP)
return req.Execute(syscall.NETLINK_NETFILTER, 0)
}
// The full conntrack flow structure is very complicated and can be found in the file:
// http://git.netfilter.org/libnetfilter_conntrack/tree/include/internal/object.h
// For the time being, the structure below allows to parse and extract the base information of a flow
type ipTuple struct {
SrcIP net.IP
DstIP net.IP
Protocol uint8
SrcPort uint16
DstPort uint16
}
type ConntrackFlow struct {
FamilyType uint8
Forward ipTuple
Reverse ipTuple
}
func (s *ConntrackFlow) String() string {
// conntrack cmd output:
// udp 17 src=127.0.0.1 dst=127.0.0.1 sport=4001 dport=1234 [UNREPLIED] src=127.0.0.1 dst=127.0.0.1 sport=1234 dport=4001
return fmt.Sprintf("%s\t%d src=%s dst=%s sport=%d dport=%d\tsrc=%s dst=%s sport=%d dport=%d",
nl.L4ProtoMap[s.Forward.Protocol], s.Forward.Protocol,
s.Forward.SrcIP.String(), s.Forward.DstIP.String(), s.Forward.SrcPort, s.Forward.DstPort,
s.Reverse.SrcIP.String(), s.Reverse.DstIP.String(), s.Reverse.SrcPort, s.Reverse.DstPort)
}
// This method parse the ip tuple structure
// The message structure is the following:
// <len, [CTA_IP_V4_SRC|CTA_IP_V6_SRC], 16 bytes for the IP>
// <len, [CTA_IP_V4_DST|CTA_IP_V6_DST], 16 bytes for the IP>
// <len, NLA_F_NESTED|nl.CTA_TUPLE_PROTO, 1 byte for the protocol, 3 bytes of padding>
// <len, CTA_PROTO_SRC_PORT, 2 bytes for the source port, 2 bytes of padding>
// <len, CTA_PROTO_DST_PORT, 2 bytes for the source port, 2 bytes of padding>
func parseIpTuple(reader *bytes.Reader, tpl *ipTuple) {
for i := 0; i < 2; i++ {
_, t, _, v := parseNfAttrTLV(reader)
switch t {
case nl.CTA_IP_V4_SRC, nl.CTA_IP_V6_SRC:
tpl.SrcIP = v
case nl.CTA_IP_V4_DST, nl.CTA_IP_V6_DST:
tpl.DstIP = v
}
}
// Skip the next 4 bytes nl.NLA_F_NESTED|nl.CTA_TUPLE_PROTO
reader.Seek(4, seekCurrent)
_, t, _, v := parseNfAttrTLV(reader)
if t == nl.CTA_PROTO_NUM {
tpl.Protocol = uint8(v[0])
}
// Skip some padding 3 bytes
reader.Seek(3, seekCurrent)
for i := 0; i < 2; i++ {
_, t, _ := parseNfAttrTL(reader)
switch t {
case nl.CTA_PROTO_SRC_PORT:
parseBERaw16(reader, &tpl.SrcPort)
case nl.CTA_PROTO_DST_PORT:
parseBERaw16(reader, &tpl.DstPort)
}
// Skip some padding 2 byte
reader.Seek(2, seekCurrent)
}
}
func parseNfAttrTLV(r *bytes.Reader) (isNested bool, attrType, len uint16, value []byte) {
isNested, attrType, len = parseNfAttrTL(r)
value = make([]byte, len)
binary.Read(r, binary.BigEndian, &value)
return isNested, attrType, len, value
}
func parseNfAttrTL(r *bytes.Reader) (isNested bool, attrType, len uint16) {
binary.Read(r, nl.NativeEndian(), &len)
len -= nl.SizeofNfattr
binary.Read(r, nl.NativeEndian(), &attrType)
isNested = (attrType & nl.NLA_F_NESTED) == nl.NLA_F_NESTED
attrType = attrType & (nl.NLA_F_NESTED - 1)
return isNested, attrType, len
}
func parseBERaw16(r *bytes.Reader, v *uint16) {
binary.Read(r, binary.BigEndian, v)
}
func parseRawData(data []byte) *ConntrackFlow {
s := &ConntrackFlow{}
// First there is the Nfgenmsg header
// consume only the family field
reader := bytes.NewReader(data)
binary.Read(reader, nl.NativeEndian(), &s.FamilyType)
// skip rest of the Netfilter header
reader.Seek(3, seekCurrent)
// The message structure is the following:
// <len, NLA_F_NESTED|CTA_TUPLE_ORIG> 4 bytes
// <len, NLA_F_NESTED|CTA_TUPLE_IP> 4 bytes
// flow information of the forward flow
// <len, NLA_F_NESTED|CTA_TUPLE_REPLY> 4 bytes
// <len, NLA_F_NESTED|CTA_TUPLE_IP> 4 bytes
// flow information of the reverse flow
for reader.Len() > 0 {
nested, t, l := parseNfAttrTL(reader)
if nested && t == nl.CTA_TUPLE_ORIG {
if nested, t, _ = parseNfAttrTL(reader); nested && t == nl.CTA_TUPLE_IP {
parseIpTuple(reader, &s.Forward)
}
} else if nested && t == nl.CTA_TUPLE_REPLY {
if nested, t, _ = parseNfAttrTL(reader); nested && t == nl.CTA_TUPLE_IP {
parseIpTuple(reader, &s.Reverse)
// Got all the useful information stop parsing
break
} else {
// Header not recognized skip it
reader.Seek(int64(l), seekCurrent)
}
}
}
return s
}
// Conntrack parameters and options:
// -n, --src-nat ip source NAT ip
// -g, --dst-nat ip destination NAT ip
// -j, --any-nat ip source or destination NAT ip
// -m, --mark mark Set mark
// -c, --secmark secmark Set selinux secmark
// -e, --event-mask eventmask Event mask, eg. NEW,DESTROY
// -z, --zero Zero counters while listing
// -o, --output type[,...] Output format, eg. xml
// -l, --label label[,...] conntrack labels
// Common parameters and options:
// -s, --src, --orig-src ip Source address from original direction
// -d, --dst, --orig-dst ip Destination address from original direction
// -r, --reply-src ip Source addres from reply direction
// -q, --reply-dst ip Destination address from reply direction
// -p, --protonum proto Layer 4 Protocol, eg. 'tcp'
// -f, --family proto Layer 3 Protocol, eg. 'ipv6'
// -t, --timeout timeout Set timeout
// -u, --status status Set status, eg. ASSURED
// -w, --zone value Set conntrack zone
// --orig-zone value Set zone for original direction
// --reply-zone value Set zone for reply direction
// -b, --buffer-size Netlink socket buffer size
// --mask-src ip Source mask address
// --mask-dst ip Destination mask address
// Filter types
type ConntrackFilterType uint8
const (
ConntrackOrigSrcIP = iota // -orig-src ip Source address from original direction
ConntrackOrigDstIP // -orig-dst ip Destination address from original direction
ConntrackNatSrcIP // -src-nat ip Source NAT ip
ConntrackNatDstIP // -dst-nat ip Destination NAT ip
ConntrackNatAnyIP // -any-nat ip Source or destination NAT ip
)
type ConntrackFilter struct {
ipFilter map[ConntrackFilterType]net.IP
}
// AddIP adds an IP to the conntrack filter
func (f *ConntrackFilter) AddIP(tp ConntrackFilterType, ip net.IP) error {
if f.ipFilter == nil {
f.ipFilter = make(map[ConntrackFilterType]net.IP)
}
if _, ok := f.ipFilter[tp]; ok {
return errors.New("Filter attribute already present")
}
f.ipFilter[tp] = ip
return nil
}
// MatchConntrackFlow applies the filter to the flow and returns true if the flow matches the filter
// false otherwise
func (f *ConntrackFilter) MatchConntrackFlow(flow *ConntrackFlow) bool {
if len(f.ipFilter) == 0 {
// empty filter always not match
return false
}
match := true
// -orig-src ip Source address from original direction
if elem, found := f.ipFilter[ConntrackOrigSrcIP]; found {
match = match && elem.Equal(flow.Forward.SrcIP)
}
// -orig-dst ip Destination address from original direction
if elem, found := f.ipFilter[ConntrackOrigDstIP]; match && found {
match = match && elem.Equal(flow.Forward.DstIP)
}
// -src-nat ip Source NAT ip
if elem, found := f.ipFilter[ConntrackNatSrcIP]; match && found {
match = match && elem.Equal(flow.Reverse.SrcIP)
}
// -dst-nat ip Destination NAT ip
if elem, found := f.ipFilter[ConntrackNatDstIP]; match && found {
match = match && elem.Equal(flow.Reverse.DstIP)
}
// -any-nat ip Source or destination NAT ip
if elem, found := f.ipFilter[ConntrackNatAnyIP]; match && found {
match = match && (elem.Equal(flow.Reverse.SrcIP) || elem.Equal(flow.Reverse.DstIP))
}
return match
}

View file

@ -0,0 +1,53 @@
// +build !linux
package netlink
// ConntrackTableType Conntrack table for the netlink operation
type ConntrackTableType uint8
// InetFamily Family type
type InetFamily uint8
// ConntrackFlow placeholder
type ConntrackFlow struct{}
// ConntrackFilter placeholder
type ConntrackFilter struct{}
// ConntrackTableList returns the flow list of a table of a specific family
// conntrack -L [table] [options] List conntrack or expectation table
func ConntrackTableList(table ConntrackTableType, family InetFamily) ([]*ConntrackFlow, error) {
return nil, ErrNotImplemented
}
// ConntrackTableFlush flushes all the flows of a specified table
// conntrack -F [table] Flush table
// The flush operation applies to all the family types
func ConntrackTableFlush(table ConntrackTableType) error {
return ErrNotImplemented
}
// ConntrackDeleteFilter deletes entries on the specified table on the base of the filter
// conntrack -D [table] parameters Delete conntrack or expectation
func ConntrackDeleteFilter(table ConntrackTableType, family InetFamily, filter *ConntrackFilter) (uint, error) {
return 0, ErrNotImplemented
}
// ConntrackTableList returns the flow list of a table of a specific family using the netlink handle passed
// conntrack -L [table] [options] List conntrack or expectation table
func (h *Handle) ConntrackTableList(table ConntrackTableType, family InetFamily) ([]*ConntrackFlow, error) {
return nil, ErrNotImplemented
}
// ConntrackTableFlush flushes all the flows of a specified table using the netlink handle passed
// conntrack -F [table] Flush table
// The flush operation applies to all the family types
func (h *Handle) ConntrackTableFlush(table ConntrackTableType) error {
return ErrNotImplemented
}
// ConntrackDeleteFilter deletes entries on the specified table on the base of the filter using the netlink handle passed
// conntrack -D [table] parameters Delete conntrack or expectation
func (h *Handle) ConntrackDeleteFilter(table ConntrackTableType, family InetFamily, filter *ConntrackFilter) (uint, error) {
return 0, ErrNotImplemented
}

View file

@ -1,6 +1,10 @@
package netlink
import "fmt"
import (
"fmt"
"github.com/vishvananda/netlink/nl"
)
type Filter interface {
Attrs() *FilterAttrs
@ -180,11 +184,46 @@ func NewMirredAction(redirIndex int) *MirredAction {
}
}
// Constants used in TcU32Sel.Flags.
const (
TC_U32_TERMINAL = nl.TC_U32_TERMINAL
TC_U32_OFFSET = nl.TC_U32_OFFSET
TC_U32_VAROFFSET = nl.TC_U32_VAROFFSET
TC_U32_EAT = nl.TC_U32_EAT
)
// Sel of the U32 filters that contains multiple TcU32Key. This is the copy
// and the frontend representation of nl.TcU32Sel. It is serialized into canonical
// nl.TcU32Sel with the appropriate endianness.
type TcU32Sel struct {
Flags uint8
Offshift uint8
Nkeys uint8
Pad uint8
Offmask uint16
Off uint16
Offoff int16
Hoff int16
Hmask uint32
Keys []TcU32Key
}
// TcU32Key contained of Sel in the U32 filters. This is the copy and the frontend
// representation of nl.TcU32Key. It is serialized into chanonical nl.TcU32Sel
// with the appropriate endianness.
type TcU32Key struct {
Mask uint32
Val uint32
Off int32
OffMask int32
}
// U32 filters on many packet related properties
type U32 struct {
FilterAttrs
ClassId uint32
RedirIndex int
Sel *TcU32Sel
Actions []Action
}

View file

@ -6,6 +6,7 @@ import (
"errors"
"fmt"
"syscall"
"unsafe"
"github.com/vishvananda/netlink/nl"
)
@ -128,12 +129,34 @@ func (h *Handle) FilterAdd(filter Filter) error {
options := nl.NewRtAttr(nl.TCA_OPTIONS, nil)
if u32, ok := filter.(*U32); ok {
// match all
sel := nl.TcU32Sel{
Nkeys: 1,
Flags: nl.TC_U32_TERMINAL,
// Convert TcU32Sel into nl.TcU32Sel as it is without copy.
sel := (*nl.TcU32Sel)(unsafe.Pointer(u32.Sel))
if sel == nil {
// match all
sel = &nl.TcU32Sel{
Nkeys: 1,
Flags: nl.TC_U32_TERMINAL,
}
sel.Keys = append(sel.Keys, nl.TcU32Key{})
}
sel.Keys = append(sel.Keys, nl.TcU32Key{})
if native != networkOrder {
// Copy Tcu32Sel.
cSel := sel
keys := make([]nl.TcU32Key, cap(sel.Keys))
copy(keys, sel.Keys)
cSel.Keys = keys
sel = cSel
// Handle the endianness of attributes
sel.Offmask = native.Uint16(htons(sel.Offmask))
sel.Hmask = native.Uint32(htonl(sel.Hmask))
for _, key := range sel.Keys {
key.Mask = native.Uint32(htonl(key.Mask))
key.Val = native.Uint32(htonl(key.Val))
}
}
sel.Nkeys = uint8(len(sel.Keys))
nl.NewRtAttrChild(options, nl.TCA_U32_SEL, sel.Serialize())
if u32.ClassId != 0 {
nl.NewRtAttrChild(options, nl.TCA_U32_CLASSID, nl.Uint32Attr(u32.ClassId))
@ -425,6 +448,16 @@ func parseU32Data(filter Filter, data []syscall.NetlinkRouteAttr) (bool, error)
case nl.TCA_U32_SEL:
detailed = true
sel := nl.DeserializeTcU32Sel(datum.Value)
u32.Sel = (*TcU32Sel)(unsafe.Pointer(sel))
if native != networkOrder {
// Handle the endianness of attributes
u32.Sel.Offmask = native.Uint16(htons(sel.Offmask))
u32.Sel.Hmask = native.Uint32(htonl(sel.Hmask))
for _, key := range u32.Sel.Keys {
key.Mask = native.Uint32(htonl(key.Mask))
key.Val = native.Uint32(htonl(key.Val))
}
}
// only parse if we have a very basic redirect
if sel.Flags&nl.TC_U32_TERMINAL == 0 || sel.Nkeys != 1 {
return detailed, nil
@ -443,6 +476,8 @@ func parseU32Data(filter Filter, data []syscall.NetlinkRouteAttr) (bool, error)
u32.RedirIndex = int(action.Ifindex)
}
}
case nl.TCA_U32_CLASSID:
u32.ClassId = native.Uint32(datum.Value)
}
}
return detailed, nil

View file

@ -0,0 +1,189 @@
package nl
import "unsafe"
// Track the message sizes for the correct serialization/deserialization
const (
SizeofNfgenmsg = 4
SizeofNfattr = 4
SizeofNfConntrack = 376
SizeofNfctTupleHead = 52
)
var L4ProtoMap = map[uint8]string{
6: "tcp",
17: "udp",
}
// All the following constants are coming from:
// https://github.com/torvalds/linux/blob/master/include/uapi/linux/netfilter/nfnetlink_conntrack.h
// enum cntl_msg_types {
// IPCTNL_MSG_CT_NEW,
// IPCTNL_MSG_CT_GET,
// IPCTNL_MSG_CT_DELETE,
// IPCTNL_MSG_CT_GET_CTRZERO,
// IPCTNL_MSG_CT_GET_STATS_CPU,
// IPCTNL_MSG_CT_GET_STATS,
// IPCTNL_MSG_CT_GET_DYING,
// IPCTNL_MSG_CT_GET_UNCONFIRMED,
//
// IPCTNL_MSG_MAX
// };
const (
IPCTNL_MSG_CT_GET = 1
IPCTNL_MSG_CT_DELETE = 2
)
// #define NFNETLINK_V0 0
const (
NFNETLINK_V0 = 0
)
// #define NLA_F_NESTED (1 << 15)
const (
NLA_F_NESTED = (1 << 15)
)
// enum ctattr_type {
// CTA_UNSPEC,
// CTA_TUPLE_ORIG,
// CTA_TUPLE_REPLY,
// CTA_STATUS,
// CTA_PROTOINFO,
// CTA_HELP,
// CTA_NAT_SRC,
// #define CTA_NAT CTA_NAT_SRC /* backwards compatibility */
// CTA_TIMEOUT,
// CTA_MARK,
// CTA_COUNTERS_ORIG,
// CTA_COUNTERS_REPLY,
// CTA_USE,
// CTA_ID,
// CTA_NAT_DST,
// CTA_TUPLE_MASTER,
// CTA_SEQ_ADJ_ORIG,
// CTA_NAT_SEQ_ADJ_ORIG = CTA_SEQ_ADJ_ORIG,
// CTA_SEQ_ADJ_REPLY,
// CTA_NAT_SEQ_ADJ_REPLY = CTA_SEQ_ADJ_REPLY,
// CTA_SECMARK, /* obsolete */
// CTA_ZONE,
// CTA_SECCTX,
// CTA_TIMESTAMP,
// CTA_MARK_MASK,
// CTA_LABELS,
// CTA_LABELS_MASK,
// __CTA_MAX
// };
const (
CTA_TUPLE_ORIG = 1
CTA_TUPLE_REPLY = 2
CTA_STATUS = 3
CTA_TIMEOUT = 8
CTA_MARK = 9
CTA_PROTOINFO = 4
)
// enum ctattr_tuple {
// CTA_TUPLE_UNSPEC,
// CTA_TUPLE_IP,
// CTA_TUPLE_PROTO,
// CTA_TUPLE_ZONE,
// __CTA_TUPLE_MAX
// };
// #define CTA_TUPLE_MAX (__CTA_TUPLE_MAX - 1)
const (
CTA_TUPLE_IP = 1
CTA_TUPLE_PROTO = 2
)
// enum ctattr_ip {
// CTA_IP_UNSPEC,
// CTA_IP_V4_SRC,
// CTA_IP_V4_DST,
// CTA_IP_V6_SRC,
// CTA_IP_V6_DST,
// __CTA_IP_MAX
// };
// #define CTA_IP_MAX (__CTA_IP_MAX - 1)
const (
CTA_IP_V4_SRC = 1
CTA_IP_V4_DST = 2
CTA_IP_V6_SRC = 3
CTA_IP_V6_DST = 4
)
// enum ctattr_l4proto {
// CTA_PROTO_UNSPEC,
// CTA_PROTO_NUM,
// CTA_PROTO_SRC_PORT,
// CTA_PROTO_DST_PORT,
// CTA_PROTO_ICMP_ID,
// CTA_PROTO_ICMP_TYPE,
// CTA_PROTO_ICMP_CODE,
// CTA_PROTO_ICMPV6_ID,
// CTA_PROTO_ICMPV6_TYPE,
// CTA_PROTO_ICMPV6_CODE,
// __CTA_PROTO_MAX
// };
// #define CTA_PROTO_MAX (__CTA_PROTO_MAX - 1)
const (
CTA_PROTO_NUM = 1
CTA_PROTO_SRC_PORT = 2
CTA_PROTO_DST_PORT = 3
)
// enum ctattr_protoinfo {
// CTA_PROTOINFO_UNSPEC,
// CTA_PROTOINFO_TCP,
// CTA_PROTOINFO_DCCP,
// CTA_PROTOINFO_SCTP,
// __CTA_PROTOINFO_MAX
// };
// #define CTA_PROTOINFO_MAX (__CTA_PROTOINFO_MAX - 1)
const (
CTA_PROTOINFO_TCP = 1
)
// enum ctattr_protoinfo_tcp {
// CTA_PROTOINFO_TCP_UNSPEC,
// CTA_PROTOINFO_TCP_STATE,
// CTA_PROTOINFO_TCP_WSCALE_ORIGINAL,
// CTA_PROTOINFO_TCP_WSCALE_REPLY,
// CTA_PROTOINFO_TCP_FLAGS_ORIGINAL,
// CTA_PROTOINFO_TCP_FLAGS_REPLY,
// __CTA_PROTOINFO_TCP_MAX
// };
// #define CTA_PROTOINFO_TCP_MAX (__CTA_PROTOINFO_TCP_MAX - 1)
const (
CTA_PROTOINFO_TCP_STATE = 1
CTA_PROTOINFO_TCP_WSCALE_ORIGINAL = 2
CTA_PROTOINFO_TCP_WSCALE_REPLY = 3
CTA_PROTOINFO_TCP_FLAGS_ORIGINAL = 4
CTA_PROTOINFO_TCP_FLAGS_REPLY = 5
)
// /* General form of address family dependent message.
// */
// struct nfgenmsg {
// __u8 nfgen_family; /* AF_xxx */
// __u8 version; /* nfnetlink version */
// __be16 res_id; /* resource id */
// };
type Nfgenmsg struct {
NfgenFamily uint8
Version uint8
ResId uint16 // big endian
}
func (msg *Nfgenmsg) Len() int {
return SizeofNfgenmsg
}
func DeserializeNfgenmsg(b []byte) *Nfgenmsg {
return (*Nfgenmsg)(unsafe.Pointer(&b[0:SizeofNfgenmsg][0]))
}
func (msg *Nfgenmsg) Serialize() []byte {
return (*(*[SizeofNfgenmsg]byte)(unsafe.Pointer(msg)))[:]
}

View file

@ -24,7 +24,7 @@ const (
)
// SupportedNlFamilies contains the list of netlink families this netlink package supports
var SupportedNlFamilies = []int{syscall.NETLINK_ROUTE, syscall.NETLINK_XFRM}
var SupportedNlFamilies = []int{syscall.NETLINK_ROUTE, syscall.NETLINK_XFRM, syscall.NETLINK_NETFILTER}
var nextSeqNr uint32
@ -321,6 +321,7 @@ func (a *RtAttr) Serialize() []byte {
type NetlinkRequest struct {
syscall.NlMsghdr
Data []NetlinkRequestData
RawData []byte
Sockets map[int]*SocketHandle
}
@ -332,6 +333,8 @@ func (req *NetlinkRequest) Serialize() []byte {
dataBytes[i] = data.Serialize()
length = length + len(dataBytes[i])
}
length += len(req.RawData)
req.Len = uint32(length)
b := make([]byte, length)
hdr := (*(*[syscall.SizeofNlMsghdr]byte)(unsafe.Pointer(req)))[:]
@ -343,6 +346,10 @@ func (req *NetlinkRequest) Serialize() []byte {
next = next + 1
}
}
// Add the raw data if any
if len(req.RawData) > 0 {
copy(b[next:length], req.RawData)
}
return b
}
@ -352,6 +359,13 @@ func (req *NetlinkRequest) AddData(data NetlinkRequestData) {
}
}
// AddRawData adds raw bytes to the end of the NetlinkRequest object during serialization
func (req *NetlinkRequest) AddRawData(data []byte) {
if data != nil {
req.RawData = append(req.RawData, data...)
}
}
// Execute the request against a the given sockType.
// Returns a list of netlink messages in serialized format, optionally filtered
// by resType.