0ct0pu5
/
moby


			
				
					
						
						
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227
							package overlay

import (
	"bytes"
	"encoding/binary"
	"errors"
	"fmt"
	"net"
	"net/netip"
	"testing"
	"time"

	"golang.org/x/net/bpf"
	"golang.org/x/net/ipv4"
	"golang.org/x/sys/unix"
)

func TestVNIMatchBPF(t *testing.T) {
	// The BPF filter program under test uses Linux extensions which are not
	// emulated by any user-space BPF interpreters. It is also classic BPF,
	// which cannot be tested in-kernel using the bpf(BPF_PROG_RUN) syscall.
	// The best we can do without actually programming it into an iptables
	// rule and end-to-end testing it is to attach it as a socket filter to
	// a raw socket and test which loopback packets make it through.
	//
	// Modern kernels transpile cBPF programs into eBPF for execution, so a
	// possible future direction would be to extract the transpiler and
	// convert the program under test to eBPF so it could be loaded and run
	// using the bpf(2) syscall.
	// https://elixir.bootlin.com/linux/v6.2/source/net/core/filter.c#L559
	// Though the effort would be better spent on adding nftables support to
	// libnetwork so this whole BPF program could be replaced with a native
	// nftables '@th' match expression.
	//
	// The filter could be manually e2e-tested for both IPv4 and IPv6 by
	// programming ip[6]tables rules which log matching packets and sending
	// test packets loopback using netcat. All the necessary information
	// (bytecode and an acceptable test vector) is logged by this test.
	//
	//     $ sudo ip6tables -A INPUT -p udp -s ::1 -d ::1 -m bpf \
	//         --bytecode "${bpf_program_under_test}" \
	//         -j LOG --log-prefix '[IPv6 VNI match]:'
	//     $ <<<"${udp_payload_hexdump}" xxd -r -p | nc -u -6 localhost 30000
	//     $ sudo dmesg

	loopback := net.IPv4(127, 0, 0, 1)

	// Reserve an ephemeral UDP port for loopback testing.
	// Binding to a TUN device would be more hermetic, but is much more effort to set up.
	reservation, err := net.ListenUDP("udp", &net.UDPAddr{IP: loopback, Port: 0})
	if err != nil {
		t.Fatal(err)
	}
	defer reservation.Close()
	daddr := reservation.LocalAddr().(*net.UDPAddr).AddrPort()

	sender, err := net.DialUDP("udp", nil, reservation.LocalAddr().(*net.UDPAddr))
	if err != nil {
		t.Fatal(err)
	}
	defer sender.Close()
	saddr := sender.LocalAddr().(*net.UDPAddr).AddrPort()

	// There doesn't seem to be a way to receive the entire Layer-3 IPv6
	// packet including the fixed IP header using the portable raw sockets
	// API. That can only be done from an AF_PACKET socket, and it is
	// unclear whether 'ld poff' would behave the same in a BPF program
	// attached to such a socket as in an xt_bpf match.
	c, err := net.ListenIP("ip4:udp", &net.IPAddr{IP: loopback})
	if err != nil {
		if errors.Is(err, unix.EPERM) {
			t.Skip("test requires CAP_NET_RAW")
		}
		t.Fatal(err)
	}
	defer c.Close()

	pc := ipv4.NewPacketConn(c)

	testvectors := []uint32{
		0,
		1,
		0x08,
		42,
		0x80,
		0xfe,
		0xff,
		0x100,
		0xfff,  // 4095
		0x1000, // 4096
		0x1001,
		0x10000,
		0xfffffe,
		0xffffff, // Max VNI
	}
	for _, vni := range []uint32{1, 42, 0x100, 0x1000, 0xfffffe, 0xffffff} {
		t.Run(fmt.Sprintf("vni=%d", vni), func(t *testing.T) {
			setBPF(t, pc, vniMatchBPF(vni))

			for _, v := range testvectors {
				pkt := appendVXLANHeader(nil, v)
				pkt = append(pkt, []byte{0xde, 0xad, 0xbe, 0xef}...)
				if _, err := sender.Write(pkt); err != nil {
					t.Fatal(err)
				}

				rpkt, ok := readUDPPacketFromRawSocket(t, pc, saddr, daddr)
				// Sanity check: the only packets readUDPPacketFromRawSocket
				// should return are ones we sent.
				if ok && !bytes.Equal(pkt, rpkt) {
					t.Fatalf("received unexpected packet: % x", rpkt)
				}
				if ok != (v == vni) {
					t.Errorf("unexpected packet tagged with vni=%d (got %v, want %v)", v, ok, v == vni)
				}
			}
		})
	}
}

func appendVXLANHeader(b []byte, vni uint32) []byte {
	// https://tools.ietf.org/html/rfc7348#section-5
	b = append(b, []byte{0x08, 0x00, 0x00, 0x00}...)
	return binary.BigEndian.AppendUint32(b, vni<<8)
}

func setBPF(t *testing.T, c *ipv4.PacketConn, fprog []bpf.RawInstruction) {
	// https://natanyellin.com/posts/ebpf-filtering-done-right/
	blockall, _ := bpf.Assemble([]bpf.Instruction{bpf.RetConstant{Val: 0}})
	if err := c.SetBPF(blockall); err != nil {
		t.Fatal(err)
	}
	ms := make([]ipv4.Message, 100)
	for {
		n, err := c.ReadBatch(ms, unix.MSG_DONTWAIT)
		if err != nil {
			if errors.Is(err, unix.EAGAIN) {
				break
			}
			t.Fatal(err)
		}
		if n == 0 {
			break
		}
	}

	t.Logf("setting socket filter: %v", marshalXTBPF(fprog))
	if err := c.SetBPF(fprog); err != nil {
		t.Fatal(err)
	}
}

// readUDPPacketFromRawSocket reads raw IP packets from pc until a UDP packet
// which matches the (src, dst) 4-tuple is found or the receive buffer is empty,
// and returns the payload of the UDP packet.
func readUDPPacketFromRawSocket(t *testing.T, pc *ipv4.PacketConn, src, dst netip.AddrPort) ([]byte, bool) {
	t.Helper()

	ms := []ipv4.Message{
		{Buffers: [][]byte{make([]byte, 1500)}},
	}

	// Set a time limit to prevent an infinite loop if there is a lot of
	// loopback traffic being captured which prevents the buffer from
	// emptying.
	deadline := time.Now().Add(1 * time.Second)
	for time.Now().Before(deadline) {
		n, err := pc.ReadBatch(ms, unix.MSG_DONTWAIT)
		if err != nil {
			if !errors.Is(err, unix.EAGAIN) {
				t.Fatal(err)
			}
			break
		}
		if n == 0 {
			break
		}
		pkt := ms[0].Buffers[0][:ms[0].N]
		psrc, pdst, payload, ok := parseUDP(pkt)
		// Discard captured packets which belong to other unrelated flows.
		if !ok || psrc != src || pdst != dst {
			t.Logf("discarding packet:\n% x", pkt)
			continue
		}
		t.Logf("received packet (%v -> %v):\n% x", psrc, pdst, payload)
		// While not strictly required, copy payload into a new
		// slice which does not share a backing array with pkt
		// so the IP and UDP headers can be garbage collected.
		return append([]byte(nil), payload...), true
	}
	return nil, false
}

func parseIPv4(b []byte) (src, dst netip.Addr, protocol byte, payload []byte, ok bool) {
	if len(b) < 20 {
		return netip.Addr{}, netip.Addr{}, 0, nil, false
	}
	hlen := int(b[0]&0x0f) * 4
	if hlen < 20 {
		return netip.Addr{}, netip.Addr{}, 0, nil, false
	}
	src, _ = netip.AddrFromSlice(b[12:16])
	dst, _ = netip.AddrFromSlice(b[16:20])
	protocol = b[9]
	payload = b[hlen:]
	return src, dst, protocol, payload, true
}

// parseUDP parses the IP and UDP headers from the raw Layer-3 packet data in b.
func parseUDP(b []byte) (src, dst netip.AddrPort, payload []byte, ok bool) {
	srcip, dstip, protocol, ippayload, ok := parseIPv4(b)
	if !ok {
		return netip.AddrPort{}, netip.AddrPort{}, nil, false
	}
	if protocol != 17 {
		return netip.AddrPort{}, netip.AddrPort{}, nil, false
	}
	if len(ippayload) < 8 {
		return netip.AddrPort{}, netip.AddrPort{}, nil, false
	}
	sport := binary.BigEndian.Uint16(ippayload[0:2])
	dport := binary.BigEndian.Uint16(ippayload[2:4])
	src = netip.AddrPortFrom(srcip, sport)
	dst = netip.AddrPortFrom(dstip, dport)
	payload = ippayload[8:]
	return src, dst, payload, true
}