2 سال پیش · 214e200f95
--- a/libnetwork/drivers/overlay/bpf.go
+++ b/libnetwork/drivers/overlay/bpf.go
@@ -8,18 +8,21 @@ import (
 
				 	"golang.org/x/net/bpf"
			
 
				 )
			
 
				 
			
 
				-// vniMatchBPF returns a BPF program suitable for passing to the iptables bpf
			
 
				-// match which matches on the VXAN Network ID of encapsulated packets. The
			
 
				-// program assumes that it will be used in a rule which only matches UDP
			
 
				-// datagrams.
			
 
				+// vniMatchBPF returns a BPF program suitable for passing to the iptables and
			
 
				+// ip6tables bpf match which matches on the VXAN Network ID of encapsulated
			
 
				+// packets. The program assumes that it will be used in a rule which only
			
 
				+// matches UDP datagrams.
			
 
				 func vniMatchBPF(vni uint32) []bpf.RawInstruction {
			
 
				 	asm, err := bpf.Assemble([]bpf.Instruction{
			
 
				-		bpf.LoadMemShift{Off: 0},                                    // ldx 4*([0] & 0xf) ; Load length of IPv4 header into X
			
 
				-		bpf.LoadIndirect{Off: 12, Size: 4},                          // ld [x + 12]       ; Load VXLAN ID (UDP header + 4 bytes) into A
			
 
				-		bpf.ALUOpConstant{Op: bpf.ALUOpAnd, Val: 0xffffff00},        // and #0xffffff00   ; VXLAN ID is in top 24 bits
			
 
				-		bpf.JumpIf{Cond: bpf.JumpEqual, Val: vni << 8, SkipTrue: 1}, // jeq ($vni << 8), match
			
 
				-		bpf.RetConstant{Val: 0},                                     // ret #0
			
 
				-		bpf.RetConstant{Val: ^uint32(0)},                            // match: ret #-1
			
 
				+		// Load offset of UDP payload into X.
			
 
				+		bpf.LoadExtension{Num: bpf.ExtPayloadOffset}, // ld poff
			
 
				+		bpf.TAX{}, // tax
			
 
				+
			
 
				+		bpf.LoadIndirect{Off: 4, Size: 4},                      // ld [x + 4] ; Load VXLAN ID into top 24 bits of A
			
 
				+		bpf.ALUOpConstant{Op: bpf.ALUOpShiftRight, Val: 8},     // rsh #8     ; A >>= 8
			
 
				+		bpf.JumpIf{Cond: bpf.JumpEqual, Val: vni, SkipTrue: 1}, // jeq $vni, match
			
 
				+		bpf.RetConstant{Val: 0},                                // ret #0
			
 
				+		bpf.RetConstant{Val: ^uint32(0)},                       // match: ret #-1
			
 
				 	})
			
 
				 	// bpf.Assemble() only errors if an instruction is invalid. As the only variable
			
 
				 	// part of the program is an instruction value for which the entire range is
			
--- a/libnetwork/drivers/overlay/bpf_linux_test.go
+++ b/libnetwork/drivers/overlay/bpf_linux_test.go
@@ -0,0 +1,227 @@
 
				+package overlay
			
 
				+
			
 
				+import (
			
 
				+	"bytes"
			
 
				+	"encoding/binary"
			
 
				+	"errors"
			
 
				+	"fmt"
			
 
				+	"net"
			
 
				+	"net/netip"
			
 
				+	"testing"
			
 
				+	"time"
			
 
				+
			
 
				+	"golang.org/x/net/bpf"
			
 
				+	"golang.org/x/net/ipv4"
			
 
				+	"golang.org/x/sys/unix"
			
 
				+)
			
 
				+
			
 
				+func TestVNIMatchBPF(t *testing.T) {
			
 
				+	// The BPF filter program under test uses Linux extensions which are not
			
 
				+	// emulated by any user-space BPF interpreters. It is also classic BPF,
			
 
				+	// which cannot be tested in-kernel using the bpf(BPF_PROG_RUN) syscall.
			
 
				+	// The best we can do without actually programming it into an iptables
			
 
				+	// rule and end-to-end testing it is to attach it as a socket filter to
			
 
				+	// a raw socket and test which loopback packets make it through.
			
 
				+	//
			
 
				+	// Modern kernels transpile cBPF programs into eBPF for execution, so a
			
 
				+	// possible future direction would be to extract the transpiler and
			
 
				+	// convert the program under test to eBPF so it could be loaded and run
			
 
				+	// using the bpf(2) syscall.
			
 
				+	// https://elixir.bootlin.com/linux/v6.2/source/net/core/filter.c#L559
			
 
				+	// Though the effort would be better spent on adding nftables support to
			
 
				+	// libnetwork so this whole BPF program could be replaced with a native
			
 
				+	// nftables '@th' match expression.
			
 
				+	//
			
 
				+	// The filter could be manually e2e-tested for both IPv4 and IPv6 by
			
 
				+	// programming ip[6]tables rules which log matching packets and sending
			
 
				+	// test packets loopback using netcat. All the necessary information
			
 
				+	// (bytecode and an acceptable test vector) is logged by this test.
			
 
				+	//
			
 
				+	//     $ sudo ip6tables -A INPUT -p udp -s ::1 -d ::1 -m bpf \
			
 
				+	//         --bytecode "${bpf_program_under_test}" \
			
 
				+	//         -j LOG --log-prefix '[IPv6 VNI match]:'
			
 
				+	//     $ <<<"${udp_payload_hexdump}" xxd -r -p | nc -u -6 localhost 30000
			
 
				+	//     $ sudo dmesg
			
 
				+
			
 
				+	loopback := net.IPv4(127, 0, 0, 1)
			
 
				+
			
 
				+	// Reserve an ephemeral UDP port for loopback testing.
			
 
				+	// Binding to a TUN device would be more hermetic, but is much more effort to set up.
			
 
				+	reservation, err := net.ListenUDP("udp", &net.UDPAddr{IP: loopback, Port: 0})
			
 
				+	if err != nil {
			
 
				+		t.Fatal(err)
			
 
				+	}
			
 
				+	defer reservation.Close()
			
 
				+	daddr := reservation.LocalAddr().(*net.UDPAddr).AddrPort()
			
 
				+
			
 
				+	sender, err := net.DialUDP("udp", nil, reservation.LocalAddr().(*net.UDPAddr))
			
 
				+	if err != nil {
			
 
				+		t.Fatal(err)
			
 
				+	}
			
 
				+	defer sender.Close()
			
 
				+	saddr := sender.LocalAddr().(*net.UDPAddr).AddrPort()
			
 
				+
			
 
				+	// There doesn't seem to be a way to receive the entire Layer-3 IPv6
			
 
				+	// packet including the fixed IP header using the portable raw sockets
			
 
				+	// API. That can only be done from an AF_PACKET socket, and it is
			
 
				+	// unclear whether 'ld poff' would behave the same in a BPF program
			
 
				+	// attached to such a socket as in an xt_bpf match.
			
 
				+	c, err := net.ListenIP("ip4:udp", &net.IPAddr{IP: loopback})
			
 
				+	if err != nil {
			
 
				+		if errors.Is(err, unix.EPERM) {
			
 
				+			t.Skip("test requires CAP_NET_RAW")
			
 
				+		}
			
 
				+		t.Fatal(err)
			
 
				+	}
			
 
				+	defer c.Close()
			
 
				+
			
 
				+	pc := ipv4.NewPacketConn(c)
			
 
				+
			
 
				+	testvectors := []uint32{
			
 
				+		0,
			
 
				+		1,
			
 
				+		0x08,
			
 
				+		42,
			
 
				+		0x80,
			
 
				+		0xfe,
			
 
				+		0xff,
			
 
				+		0x100,
			
 
				+		0xfff,  // 4095
			
 
				+		0x1000, // 4096
			
 
				+		0x1001,
			
 
				+		0x10000,
			
 
				+		0xfffffe,
			
 
				+		0xffffff, // Max VNI
			
 
				+	}
			
 
				+	for _, vni := range []uint32{1, 42, 0x100, 0x1000, 0xfffffe, 0xffffff} {
			
 
				+		t.Run(fmt.Sprintf("vni=%d", vni), func(t *testing.T) {
			
 
				+			setBPF(t, pc, vniMatchBPF(vni))
			
 
				+
			
 
				+			for _, v := range testvectors {
			
 
				+				pkt := appendVXLANHeader(nil, v)
			
 
				+				pkt = append(pkt, []byte{0xde, 0xad, 0xbe, 0xef}...)
			
 
				+				if _, err := sender.Write(pkt); err != nil {
			
 
				+					t.Fatal(err)
			
 
				+				}
			
 
				+
			
 
				+				rpkt, ok := readUDPPacketFromRawSocket(t, pc, saddr, daddr)
			
 
				+				// Sanity check: the only packets readUDPPacketFromRawSocket
			
 
				+				// should return are ones we sent.
			
 
				+				if ok && !bytes.Equal(pkt, rpkt) {
			
 
				+					t.Fatalf("received unexpected packet: % x", rpkt)
			
 
				+				}
			
 
				+				if ok != (v == vni) {
			
 
				+					t.Errorf("unexpected packet tagged with vni=%d (got %v, want %v)", v, ok, v == vni)
			
 
				+				}
			
 
				+			}
			
 
				+		})
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+func appendVXLANHeader(b []byte, vni uint32) []byte {
			
 
				+	// https://tools.ietf.org/html/rfc7348#section-5
			
 
				+	b = append(b, []byte{0x08, 0x00, 0x00, 0x00}...)
			
 
				+	return binary.BigEndian.AppendUint32(b, vni<<8)
			
 
				+}
			
 
				+
			
 
				+func setBPF(t *testing.T, c *ipv4.PacketConn, fprog []bpf.RawInstruction) {
			
 
				+	// https://natanyellin.com/posts/ebpf-filtering-done-right/
			
 
				+	blockall, _ := bpf.Assemble([]bpf.Instruction{bpf.RetConstant{Val: 0}})
			
 
				+	if err := c.SetBPF(blockall); err != nil {
			
 
				+		t.Fatal(err)
			
 
				+	}
			
 
				+	ms := make([]ipv4.Message, 100)
			
 
				+	for {
			
 
				+		n, err := c.ReadBatch(ms, unix.MSG_DONTWAIT)
			
 
				+		if err != nil {
			
 
				+			if errors.Is(err, unix.EAGAIN) {
			
 
				+				break
			
 
				+			}
			
 
				+			t.Fatal(err)
			
 
				+		}
			
 
				+		if n == 0 {
			
 
				+			break
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				+	t.Logf("setting socket filter: %v", marshalXTBPF(fprog))
			
 
				+	if err := c.SetBPF(fprog); err != nil {
			
 
				+		t.Fatal(err)
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+// readUDPPacketFromRawSocket reads raw IP packets from pc until a UDP packet
			
 
				+// which matches the (src, dst) 4-tuple is found or the receive buffer is empty,
			
 
				+// and returns the payload of the UDP packet.
			
 
				+func readUDPPacketFromRawSocket(t *testing.T, pc *ipv4.PacketConn, src, dst netip.AddrPort) ([]byte, bool) {
			
 
				+	t.Helper()
			
 
				+
			
 
				+	ms := []ipv4.Message{
			
 
				+		{Buffers: [][]byte{make([]byte, 1500)}},
			
 
				+	}
			
 
				+
			
 
				+	// Set a time limit to prevent an infinite loop if there is a lot of
			
 
				+	// loopback traffic being captured which prevents the buffer from
			
 
				+	// emptying.
			
 
				+	deadline := time.Now().Add(1 * time.Second)
			
 
				+	for time.Now().Before(deadline) {
			
 
				+		n, err := pc.ReadBatch(ms, unix.MSG_DONTWAIT)
			
 
				+		if err != nil {
			
 
				+			if !errors.Is(err, unix.EAGAIN) {
			
 
				+				t.Fatal(err)
			
 
				+			}
			
 
				+			break
			
 
				+		}
			
 
				+		if n == 0 {
			
 
				+			break
			
 
				+		}
			
 
				+		pkt := ms[0].Buffers[0][:ms[0].N]
			
 
				+		psrc, pdst, payload, ok := parseUDP(pkt)
			
 
				+		// Discard captured packets which belong to other unrelated flows.
			
 
				+		if !ok || psrc != src || pdst != dst {
			
 
				+			t.Logf("discarding packet:\n% x", pkt)
			
 
				+			continue
			
 
				+		}
			
 
				+		t.Logf("received packet (%v -> %v):\n% x", psrc, pdst, payload)
			
 
				+		// While not strictly required, copy payload into a new
			
 
				+		// slice which does not share a backing array with pkt
			
 
				+		// so the IP and UDP headers can be garbage collected.
			
 
				+		return append([]byte(nil), payload...), true
			
 
				+	}
			
 
				+	return nil, false
			
 
				+}
			
 
				+
			
 
				+func parseIPv4(b []byte) (src, dst netip.Addr, protocol byte, payload []byte, ok bool) {
			
 
				+	if len(b) < 20 {
			
 
				+		return netip.Addr{}, netip.Addr{}, 0, nil, false
			
 
				+	}
			
 
				+	hlen := int(b[0]&0x0f) * 4
			
 
				+	if hlen < 20 {
			
 
				+		return netip.Addr{}, netip.Addr{}, 0, nil, false
			
 
				+	}
			
 
				+	src, _ = netip.AddrFromSlice(b[12:16])
			
 
				+	dst, _ = netip.AddrFromSlice(b[16:20])
			
 
				+	protocol = b[9]
			
 
				+	payload = b[hlen:]
			
 
				+	return src, dst, protocol, payload, true
			
 
				+}
			
 
				+
			
 
				+// parseUDP parses the IP and UDP headers from the raw Layer-3 packet data in b.
			
 
				+func parseUDP(b []byte) (src, dst netip.AddrPort, payload []byte, ok bool) {
			
 
				+	srcip, dstip, protocol, ippayload, ok := parseIPv4(b)
			
 
				+	if !ok {
			
 
				+		return netip.AddrPort{}, netip.AddrPort{}, nil, false
			
 
				+	}
			
 
				+	if protocol != 17 {
			
 
				+		return netip.AddrPort{}, netip.AddrPort{}, nil, false
			
 
				+	}
			
 
				+	if len(ippayload) < 8 {
			
 
				+		return netip.AddrPort{}, netip.AddrPort{}, nil, false
			
 
				+	}
			
 
				+	sport := binary.BigEndian.Uint16(ippayload[0:2])
			
 
				+	dport := binary.BigEndian.Uint16(ippayload[2:4])
			
 
				+	src = netip.AddrPortFrom(srcip, sport)
			
 
				+	dst = netip.AddrPortFrom(dstip, dport)
			
 
				+	payload = ippayload[8:]
			
 
				+	return src, dst, payload, true
			
 
				+}