Pārlūkot izejas kodu

libnet: add OTEL tracing to the embedded DNS

This change creates a few OTEL spans and plumb context through the DNS
resolver and DNS backends (ie. Sandbox and Network). This should help
better understand how much lock contention impacts performance, and
help debug issues related to DNS queries (we basically have no
visibility into what's happening here right now).

Signed-off-by: Albin Kerouanton <albinker@gmail.com>
Albin Kerouanton 1 gadu atpakaļ
vecāks
revīzija
4de8459265

+ 14 - 11
libnetwork/libnetwork_internal_test.go

@@ -1,6 +1,7 @@
 package libnetwork
 package libnetwork
 
 
 import (
 import (
+	"context"
 	"encoding/json"
 	"encoding/json"
 	"fmt"
 	"fmt"
 	"net"
 	"net"
@@ -418,7 +419,8 @@ func TestSRVServiceQuery(t *testing.T) {
 
 
 	c.svcRecords[n.ID()] = sr
 	c.svcRecords[n.ID()] = sr
 
 
-	_, ip := ep.Info().Sandbox().ResolveService("_http._tcp.web.swarm")
+	ctx := context.Background()
+	_, ip := ep.Info().Sandbox().ResolveService(ctx, "_http._tcp.web.swarm")
 
 
 	if len(ip) == 0 {
 	if len(ip) == 0 {
 		t.Fatal(err)
 		t.Fatal(err)
@@ -427,7 +429,7 @@ func TestSRVServiceQuery(t *testing.T) {
 		t.Fatal(err)
 		t.Fatal(err)
 	}
 	}
 
 
-	_, ip = ep.Info().Sandbox().ResolveService("_host_http._tcp.web.swarm")
+	_, ip = ep.Info().Sandbox().ResolveService(ctx, "_host_http._tcp.web.swarm")
 
 
 	if len(ip) == 0 {
 	if len(ip) == 0 {
 		t.Fatal(err)
 		t.Fatal(err)
@@ -437,7 +439,7 @@ func TestSRVServiceQuery(t *testing.T) {
 	}
 	}
 
 
 	// Service name with invalid protocol name. Should fail without error
 	// Service name with invalid protocol name. Should fail without error
-	_, ip = ep.Info().Sandbox().ResolveService("_http._icmp.web.swarm")
+	_, ip = ep.Info().Sandbox().ResolveService(ctx, "_http._icmp.web.swarm")
 	if len(ip) != 0 {
 	if len(ip) != 0 {
 		t.Fatal("Valid response for invalid service name")
 		t.Fatal("Valid response for invalid service name")
 	}
 	}
@@ -490,7 +492,8 @@ func TestServiceVIPReuse(t *testing.T) {
 
 
 	ipToResolve := netutils.ReverseIP("192.168.0.1")
 	ipToResolve := netutils.ReverseIP("192.168.0.1")
 
 
-	ipList, _ := n.ResolveName("service_test", types.IPv4)
+	ctx := context.Background()
+	ipList, _ := n.ResolveName(ctx, "service_test", types.IPv4)
 	if len(ipList) == 0 {
 	if len(ipList) == 0 {
 		t.Fatal("There must be the VIP")
 		t.Fatal("There must be the VIP")
 	}
 	}
@@ -500,7 +503,7 @@ func TestServiceVIPReuse(t *testing.T) {
 	if ipList[0].String() != "192.168.0.1" {
 	if ipList[0].String() != "192.168.0.1" {
 		t.Fatal("The service VIP is 192.168.0.1")
 		t.Fatal("The service VIP is 192.168.0.1")
 	}
 	}
-	name := n.ResolveIP(ipToResolve)
+	name := n.ResolveIP(ctx, ipToResolve)
 	if name == "" {
 	if name == "" {
 		t.Fatal("It must return a name")
 		t.Fatal("It must return a name")
 	}
 	}
@@ -510,7 +513,7 @@ func TestServiceVIPReuse(t *testing.T) {
 
 
 	// Delete service record for one of the services, the IP should remain because one service is still associated with it
 	// Delete service record for one of the services, the IP should remain because one service is still associated with it
 	n.deleteSvcRecords("ep1", "service_test", "serviceID1", net.ParseIP("192.168.0.1"), net.IP{}, true, "test")
 	n.deleteSvcRecords("ep1", "service_test", "serviceID1", net.ParseIP("192.168.0.1"), net.IP{}, true, "test")
-	ipList, _ = n.ResolveName("service_test", types.IPv4)
+	ipList, _ = n.ResolveName(ctx, "service_test", types.IPv4)
 	if len(ipList) == 0 {
 	if len(ipList) == 0 {
 		t.Fatal("There must be the VIP")
 		t.Fatal("There must be the VIP")
 	}
 	}
@@ -520,7 +523,7 @@ func TestServiceVIPReuse(t *testing.T) {
 	if ipList[0].String() != "192.168.0.1" {
 	if ipList[0].String() != "192.168.0.1" {
 		t.Fatal("The service VIP is 192.168.0.1")
 		t.Fatal("The service VIP is 192.168.0.1")
 	}
 	}
-	name = n.ResolveIP(ipToResolve)
+	name = n.ResolveIP(ctx, ipToResolve)
 	if name == "" {
 	if name == "" {
 		t.Fatal("It must return a name")
 		t.Fatal("It must return a name")
 	}
 	}
@@ -530,7 +533,7 @@ func TestServiceVIPReuse(t *testing.T) {
 
 
 	// Delete again the service using the previous service ID, nothing should happen
 	// Delete again the service using the previous service ID, nothing should happen
 	n.deleteSvcRecords("ep2", "service_test", "serviceID1", net.ParseIP("192.168.0.1"), net.IP{}, true, "test")
 	n.deleteSvcRecords("ep2", "service_test", "serviceID1", net.ParseIP("192.168.0.1"), net.IP{}, true, "test")
-	ipList, _ = n.ResolveName("service_test", types.IPv4)
+	ipList, _ = n.ResolveName(ctx, "service_test", types.IPv4)
 	if len(ipList) == 0 {
 	if len(ipList) == 0 {
 		t.Fatal("There must be the VIP")
 		t.Fatal("There must be the VIP")
 	}
 	}
@@ -540,7 +543,7 @@ func TestServiceVIPReuse(t *testing.T) {
 	if ipList[0].String() != "192.168.0.1" {
 	if ipList[0].String() != "192.168.0.1" {
 		t.Fatal("The service VIP is 192.168.0.1")
 		t.Fatal("The service VIP is 192.168.0.1")
 	}
 	}
-	name = n.ResolveIP(ipToResolve)
+	name = n.ResolveIP(ctx, ipToResolve)
 	if name == "" {
 	if name == "" {
 		t.Fatal("It must return a name")
 		t.Fatal("It must return a name")
 	}
 	}
@@ -550,11 +553,11 @@ func TestServiceVIPReuse(t *testing.T) {
 
 
 	// Delete now using the second service ID, now all the entries should be gone
 	// Delete now using the second service ID, now all the entries should be gone
 	n.deleteSvcRecords("ep2", "service_test", "serviceID2", net.ParseIP("192.168.0.1"), net.IP{}, true, "test")
 	n.deleteSvcRecords("ep2", "service_test", "serviceID2", net.ParseIP("192.168.0.1"), net.IP{}, true, "test")
-	ipList, _ = n.ResolveName("service_test", types.IPv4)
+	ipList, _ = n.ResolveName(ctx, "service_test", types.IPv4)
 	if len(ipList) != 0 {
 	if len(ipList) != 0 {
 		t.Fatal("All the VIPs should be gone now")
 		t.Fatal("All the VIPs should be gone now")
 	}
 	}
-	name = n.ResolveIP(ipToResolve)
+	name = n.ResolveIP(ctx, ipToResolve)
 	if name != "" {
 	if name != "" {
 		t.Fatalf("It must return empty no more services associated, instead:%s", name)
 		t.Fatalf("It must return empty no more services associated, instead:%s", name)
 	}
 	}

+ 15 - 4
libnetwork/network.go

@@ -23,6 +23,9 @@ import (
 	"github.com/docker/docker/libnetwork/scope"
 	"github.com/docker/docker/libnetwork/scope"
 	"github.com/docker/docker/libnetwork/types"
 	"github.com/docker/docker/libnetwork/types"
 	"github.com/docker/docker/pkg/stringid"
 	"github.com/docker/docker/pkg/stringid"
+	"go.opentelemetry.io/otel"
+	"go.opentelemetry.io/otel/attribute"
+	"go.opentelemetry.io/otel/trace"
 )
 )
 
 
 // EndpointWalker is a client provided function which will be used to walk the Endpoints.
 // EndpointWalker is a client provided function which will be used to walk the Endpoints.
@@ -1955,12 +1958,20 @@ func (n *Network) hasLoadBalancerEndpoint() bool {
 	return len(n.loadBalancerIP) != 0
 	return len(n.loadBalancerIP) != 0
 }
 }
 
 
-func (n *Network) ResolveName(req string, ipType int) ([]net.IP, bool) {
+func (n *Network) ResolveName(ctx context.Context, req string, ipType int) ([]net.IP, bool) {
 	var ipv6Miss bool
 	var ipv6Miss bool
 
 
 	c := n.getController()
 	c := n.getController()
 	networkID := n.ID()
 	networkID := n.ID()
+
+	_, span := otel.Tracer("").Start(ctx, "Network.ResolveName", trace.WithAttributes(
+		attribute.String("libnet.network.name", n.Name()),
+		attribute.String("libnet.network.id", networkID),
+	))
+	defer span.End()
+
 	c.mu.Lock()
 	c.mu.Lock()
+	// TODO(aker): release the lock earlier
 	defer c.mu.Unlock()
 	defer c.mu.Unlock()
 	sr, ok := c.svcRecords[networkID]
 	sr, ok := c.svcRecords[networkID]
 
 
@@ -2019,7 +2030,7 @@ func (n *Network) HandleQueryResp(name string, ip net.IP) {
 	}
 	}
 }
 }
 
 
-func (n *Network) ResolveIP(ip string) string {
+func (n *Network) ResolveIP(_ context.Context, ip string) string {
 	networkID := n.ID()
 	networkID := n.ID()
 	c := n.getController()
 	c := n.getController()
 	c.mu.Lock()
 	c.mu.Lock()
@@ -2049,13 +2060,13 @@ func (n *Network) ResolveIP(ip string) string {
 	return elem.name + "." + nwName
 	return elem.name + "." + nwName
 }
 }
 
 
-func (n *Network) ResolveService(name string) ([]*net.SRV, []net.IP) {
+func (n *Network) ResolveService(ctx context.Context, name string) ([]*net.SRV, []net.IP) {
 	c := n.getController()
 	c := n.getController()
 
 
 	srv := []*net.SRV{}
 	srv := []*net.SRV{}
 	ip := []net.IP{}
 	ip := []net.IP{}
 
 
-	log.G(context.TODO()).Debugf("Service name To resolve: %v", name)
+	log.G(ctx).Debugf("Service name To resolve: %v", name)
 
 
 	// There are DNS implementations that allow SRV queries for names not in
 	// There are DNS implementations that allow SRV queries for names not in
 	// the format defined by RFC 2782. Hence specific validations checks are
 	// the format defined by RFC 2782. Hence specific validations checks are

+ 79 - 42
libnetwork/resolver.go

@@ -2,6 +2,7 @@ package libnetwork
 
 
 import (
 import (
 	"context"
 	"context"
+	"errors"
 	"fmt"
 	"fmt"
 	"math/rand"
 	"math/rand"
 	"net"
 	"net"
@@ -14,6 +15,10 @@ import (
 	"github.com/docker/docker/libnetwork/types"
 	"github.com/docker/docker/libnetwork/types"
 	"github.com/miekg/dns"
 	"github.com/miekg/dns"
 	"github.com/sirupsen/logrus"
 	"github.com/sirupsen/logrus"
+	"go.opentelemetry.io/otel"
+	"go.opentelemetry.io/otel/attribute"
+	"go.opentelemetry.io/otel/codes"
+	"go.opentelemetry.io/otel/trace"
 	"golang.org/x/sync/semaphore"
 	"golang.org/x/sync/semaphore"
 	"golang.org/x/time/rate"
 	"golang.org/x/time/rate"
 )
 )
@@ -26,13 +31,13 @@ type DNSBackend interface {
 	// the networks the sandbox is connected to. For IPv6 queries, second return
 	// the networks the sandbox is connected to. For IPv6 queries, second return
 	// value will be true if the name exists in docker domain but doesn't have an
 	// value will be true if the name exists in docker domain but doesn't have an
 	// IPv6 address. Such queries shouldn't be forwarded to external nameservers.
 	// IPv6 address. Such queries shouldn't be forwarded to external nameservers.
-	ResolveName(name string, iplen int) ([]net.IP, bool)
+	ResolveName(ctx context.Context, name string, iplen int) ([]net.IP, bool)
 	// ResolveIP returns the service name for the passed in IP. IP is in reverse dotted
 	// ResolveIP returns the service name for the passed in IP. IP is in reverse dotted
 	// notation; the format used for DNS PTR records
 	// notation; the format used for DNS PTR records
-	ResolveIP(name string) string
+	ResolveIP(ctx context.Context, name string) string
 	// ResolveService returns all the backend details about the containers or hosts
 	// ResolveService returns all the backend details about the containers or hosts
 	// backing a service. Its purpose is to satisfy an SRV query
 	// backing a service. Its purpose is to satisfy an SRV query
-	ResolveService(name string) ([]*net.SRV, []net.IP)
+	ResolveService(ctx context.Context, name string) ([]*net.SRV, []net.IP)
 	// ExecFunc allows a function to be executed in the context of the backend
 	// ExecFunc allows a function to be executed in the context of the backend
 	// on behalf of the resolver.
 	// on behalf of the resolver.
 	ExecFunc(f func()) error
 	ExecFunc(f func()) error
@@ -92,9 +97,9 @@ func NewResolver(address string, proxyDNS bool, backend DNSBackend) *Resolver {
 	}
 	}
 }
 }
 
 
-func (r *Resolver) log() *logrus.Logger {
+func (r *Resolver) log(ctx context.Context) *logrus.Logger {
 	if r.logger == nil {
 	if r.logger == nil {
-		return log.G(context.TODO()).Logger
+		return log.G(ctx).Logger
 	}
 	}
 	return r.logger
 	return r.logger
 }
 }
@@ -146,7 +151,7 @@ func (r *Resolver) Start() error {
 	r.server = s
 	r.server = s
 	go func() {
 	go func() {
 		if err := s.ActivateAndServe(); err != nil {
 		if err := s.ActivateAndServe(); err != nil {
-			r.log().WithError(err).Error("[resolver] failed to start PacketConn DNS server")
+			r.log(context.TODO()).WithError(err).Error("[resolver] failed to start PacketConn DNS server")
 		}
 		}
 	}()
 	}()
 
 
@@ -154,7 +159,7 @@ func (r *Resolver) Start() error {
 	r.tcpServer = tcpServer
 	r.tcpServer = tcpServer
 	go func() {
 	go func() {
 		if err := tcpServer.ActivateAndServe(); err != nil {
 		if err := tcpServer.ActivateAndServe(); err != nil {
-			r.log().WithError(err).Error("[resolver] failed to start TCP DNS server")
+			r.log(context.TODO()).WithError(err).Error("[resolver] failed to start TCP DNS server")
 		}
 		}
 	}()
 	}()
 	return nil
 	return nil
@@ -224,10 +229,10 @@ func createRespMsg(query *dns.Msg) *dns.Msg {
 	return resp
 	return resp
 }
 }
 
 
-func (r *Resolver) handleMXQuery(query *dns.Msg) (*dns.Msg, error) {
+func (r *Resolver) handleMXQuery(ctx context.Context, query *dns.Msg) (*dns.Msg, error) {
 	name := query.Question[0].Name
 	name := query.Question[0].Name
-	addrv4, _ := r.backend.ResolveName(name, types.IPv4)
-	addrv6, _ := r.backend.ResolveName(name, types.IPv6)
+	addrv4, _ := r.backend.ResolveName(ctx, name, types.IPv4)
+	addrv6, _ := r.backend.ResolveName(ctx, name, types.IPv6)
 
 
 	if addrv4 == nil && addrv6 == nil {
 	if addrv4 == nil && addrv6 == nil {
 		return nil, nil
 		return nil, nil
@@ -241,17 +246,17 @@ func (r *Resolver) handleMXQuery(query *dns.Msg) (*dns.Msg, error) {
 	return resp, nil
 	return resp, nil
 }
 }
 
 
-func (r *Resolver) handleIPQuery(query *dns.Msg, ipType int) (*dns.Msg, error) {
+func (r *Resolver) handleIPQuery(ctx context.Context, query *dns.Msg, ipType int) (*dns.Msg, error) {
 	var (
 	var (
 		addr     []net.IP
 		addr     []net.IP
 		ipv6Miss bool
 		ipv6Miss bool
 		name     = query.Question[0].Name
 		name     = query.Question[0].Name
 	)
 	)
-	addr, ipv6Miss = r.backend.ResolveName(name, ipType)
+	addr, ipv6Miss = r.backend.ResolveName(ctx, name, ipType)
 
 
 	if addr == nil && ipv6Miss {
 	if addr == nil && ipv6Miss {
 		// Send a reply without any Answer sections
 		// Send a reply without any Answer sections
-		r.log().Debugf("[resolver] lookup name %s present without IPv6 address", name)
+		r.log(ctx).Debugf("[resolver] lookup name %s present without IPv6 address", name)
 		resp := createRespMsg(query)
 		resp := createRespMsg(query)
 		return resp, nil
 		return resp, nil
 	}
 	}
@@ -259,7 +264,7 @@ func (r *Resolver) handleIPQuery(query *dns.Msg, ipType int) (*dns.Msg, error) {
 		return nil, nil
 		return nil, nil
 	}
 	}
 
 
-	r.log().Debugf("[resolver] lookup for %s: IP %v", name, addr)
+	r.log(ctx).Debugf("[resolver] lookup for %s: IP %v", name, addr)
 
 
 	resp := createRespMsg(query)
 	resp := createRespMsg(query)
 	if len(addr) > 1 {
 	if len(addr) > 1 {
@@ -283,7 +288,7 @@ func (r *Resolver) handleIPQuery(query *dns.Msg, ipType int) (*dns.Msg, error) {
 	return resp, nil
 	return resp, nil
 }
 }
 
 
-func (r *Resolver) handlePTRQuery(query *dns.Msg) (*dns.Msg, error) {
+func (r *Resolver) handlePTRQuery(ctx context.Context, query *dns.Msg) (*dns.Msg, error) {
 	ptr := query.Question[0].Name
 	ptr := query.Question[0].Name
 	name, after, found := strings.Cut(ptr, ptrIPv4domain)
 	name, after, found := strings.Cut(ptr, ptrIPv4domain)
 	if !found || after != "" {
 	if !found || after != "" {
@@ -295,12 +300,12 @@ func (r *Resolver) handlePTRQuery(query *dns.Msg) (*dns.Msg, error) {
 		return nil, nil
 		return nil, nil
 	}
 	}
 
 
-	host := r.backend.ResolveIP(name)
+	host := r.backend.ResolveIP(ctx, name)
 	if host == "" {
 	if host == "" {
 		return nil, nil
 		return nil, nil
 	}
 	}
 
 
-	r.log().Debugf("[resolver] lookup for IP %s: name %s", name, host)
+	r.log(ctx).Debugf("[resolver] lookup for IP %s: name %s", name, host)
 	fqdn := dns.Fqdn(host)
 	fqdn := dns.Fqdn(host)
 
 
 	resp := createRespMsg(query)
 	resp := createRespMsg(query)
@@ -311,9 +316,9 @@ func (r *Resolver) handlePTRQuery(query *dns.Msg) (*dns.Msg, error) {
 	return resp, nil
 	return resp, nil
 }
 }
 
 
-func (r *Resolver) handleSRVQuery(query *dns.Msg) (*dns.Msg, error) {
+func (r *Resolver) handleSRVQuery(ctx context.Context, query *dns.Msg) (*dns.Msg, error) {
 	svc := query.Question[0].Name
 	svc := query.Question[0].Name
-	srv, ip := r.backend.ResolveService(svc)
+	srv, ip := r.backend.ResolveService(ctx, svc)
 
 
 	if len(srv) == 0 {
 	if len(srv) == 0 {
 		return nil, nil
 		return nil, nil
@@ -351,29 +356,37 @@ func (r *Resolver) serveDNS(w dns.ResponseWriter, query *dns.Msg) {
 	queryName := query.Question[0].Name
 	queryName := query.Question[0].Name
 	queryType := query.Question[0].Qtype
 	queryType := query.Question[0].Qtype
 
 
+	ctx, span := otel.Tracer("").Start(context.Background(), "resolver.serveDNS", trace.WithAttributes(
+		attribute.String("libnet.resolver.query.name", queryName),
+		attribute.String("libnet.resolver.query.type", dns.TypeToString[queryType]),
+	))
+	defer span.End()
+
 	switch queryType {
 	switch queryType {
 	case dns.TypeA:
 	case dns.TypeA:
-		resp, err = r.handleIPQuery(query, types.IPv4)
+		resp, err = r.handleIPQuery(ctx, query, types.IPv4)
 	case dns.TypeAAAA:
 	case dns.TypeAAAA:
-		resp, err = r.handleIPQuery(query, types.IPv6)
+		resp, err = r.handleIPQuery(ctx, query, types.IPv6)
 	case dns.TypeMX:
 	case dns.TypeMX:
-		resp, err = r.handleMXQuery(query)
+		resp, err = r.handleMXQuery(ctx, query)
 	case dns.TypePTR:
 	case dns.TypePTR:
-		resp, err = r.handlePTRQuery(query)
+		resp, err = r.handlePTRQuery(ctx, query)
 	case dns.TypeSRV:
 	case dns.TypeSRV:
-		resp, err = r.handleSRVQuery(query)
+		resp, err = r.handleSRVQuery(ctx, query)
 	default:
 	default:
-		r.log().Debugf("[resolver] query type %s is not supported by the embedded DNS and will be forwarded to external DNS", dns.TypeToString[queryType])
+		r.log(ctx).Debugf("[resolver] query type %s is not supported by the embedded DNS and will be forwarded to external DNS", dns.TypeToString[queryType])
 	}
 	}
 
 
 	reply := func(msg *dns.Msg) {
 	reply := func(msg *dns.Msg) {
 		if err = w.WriteMsg(msg); err != nil {
 		if err = w.WriteMsg(msg); err != nil {
-			r.log().WithError(err).Errorf("[resolver] failed to write response")
+			r.log(ctx).WithError(err).Errorf("[resolver] failed to write response")
+			span.RecordError(err)
+			span.SetStatus(codes.Error, "WriteMsg failed")
 		}
 		}
 	}
 	}
 
 
 	if err != nil {
 	if err != nil {
-		r.log().WithError(err).Errorf("[resolver] failed to handle query: %s (%s)", queryName, dns.TypeToString[queryType])
+		r.log(ctx).WithError(err).Errorf("[resolver] failed to handle query: %s (%s)", queryName, dns.TypeToString[queryType])
 		reply(new(dns.Msg).SetRcode(query, dns.RcodeServerFailure))
 		reply(new(dns.Msg).SetRcode(query, dns.RcodeServerFailure))
 		return
 		return
 	}
 	}
@@ -393,6 +406,9 @@ func (r *Resolver) serveDNS(w dns.ResponseWriter, query *dns.Msg) {
 			}
 			}
 		}
 		}
 		resp.Truncate(maxSize)
 		resp.Truncate(maxSize)
+		span.AddEvent("found local record", trace.WithAttributes(
+			attribute.String("libnet.resolver.resp", resp.String()),
+		))
 		reply(resp)
 		reply(resp)
 		return
 		return
 	}
 	}
@@ -406,7 +422,7 @@ func (r *Resolver) serveDNS(w dns.ResponseWriter, query *dns.Msg) {
 			!strings.Contains(strings.TrimSuffix(queryName, "."), ".") {
 			!strings.Contains(strings.TrimSuffix(queryName, "."), ".") {
 			resp = createRespMsg(query)
 			resp = createRespMsg(query)
 		} else {
 		} else {
-			resp = r.forwardExtDNS(w.LocalAddr().Network(), query)
+			resp = r.forwardExtDNS(ctx, w.LocalAddr().Network(), query)
 		}
 		}
 	}
 	}
 
 
@@ -448,25 +464,31 @@ func (r *Resolver) dialExtDNS(proto string, server extDNSEntry) (net.Conn, error
 	return extConn, nil
 	return extConn, nil
 }
 }
 
 
-func (r *Resolver) forwardExtDNS(proto string, query *dns.Msg) *dns.Msg {
+func (r *Resolver) forwardExtDNS(ctx context.Context, proto string, query *dns.Msg) *dns.Msg {
+	ctx, span := otel.Tracer("").Start(ctx, "resolver.forwardExtDNS")
+	defer span.End()
+
 	for _, extDNS := range r.extDNSList {
 	for _, extDNS := range r.extDNSList {
 		if extDNS.IPStr == "" {
 		if extDNS.IPStr == "" {
 			break
 			break
 		}
 		}
 
 
 		// limits the number of outstanding concurrent queries.
 		// limits the number of outstanding concurrent queries.
-		ctx, cancel := context.WithTimeout(context.Background(), extIOTimeout)
+		ctx, cancel := context.WithTimeout(ctx, extIOTimeout)
 		err := r.fwdSem.Acquire(ctx, 1)
 		err := r.fwdSem.Acquire(ctx, 1)
 		cancel()
 		cancel()
+
 		if err != nil {
 		if err != nil {
-			r.logInverval.Do(func() {
-				r.log().Errorf("[resolver] more than %v concurrent queries", maxConcurrent)
-			})
+			if errors.Is(err, context.DeadlineExceeded) {
+				r.logInverval.Do(func() {
+					r.log(ctx).Errorf("[resolver] more than %v concurrent queries", maxConcurrent)
+				})
+			}
 			return new(dns.Msg).SetRcode(query, dns.RcodeRefused)
 			return new(dns.Msg).SetRcode(query, dns.RcodeRefused)
 		}
 		}
 		resp := func() *dns.Msg {
 		resp := func() *dns.Msg {
 			defer r.fwdSem.Release(1)
 			defer r.fwdSem.Release(1)
-			return r.exchange(proto, extDNS, query)
+			return r.exchange(ctx, proto, extDNS, query)
 		}()
 		}()
 		if resp == nil {
 		if resp == nil {
 			continue
 			continue
@@ -476,7 +498,7 @@ func (r *Resolver) forwardExtDNS(proto string, query *dns.Msg) *dns.Msg {
 		case dns.RcodeServerFailure, dns.RcodeRefused:
 		case dns.RcodeServerFailure, dns.RcodeRefused:
 			// Server returned FAILURE: continue with the next external DNS server
 			// Server returned FAILURE: continue with the next external DNS server
 			// Server returned REFUSED: this can be a transitional status, so continue with the next external DNS server
 			// Server returned REFUSED: this can be a transitional status, so continue with the next external DNS server
-			r.log().Debugf("[resolver] external DNS %s:%s returned failure:\n%s", proto, extDNS.IPStr, resp)
+			r.log(ctx).Debugf("[resolver] external DNS %s:%s returned failure:\n%s", proto, extDNS.IPStr, resp)
 			continue
 			continue
 		}
 		}
 		answers := 0
 		answers := 0
@@ -486,34 +508,46 @@ func (r *Resolver) forwardExtDNS(proto string, query *dns.Msg) *dns.Msg {
 			case dns.TypeA:
 			case dns.TypeA:
 				answers++
 				answers++
 				ip := rr.(*dns.A).A
 				ip := rr.(*dns.A).A
-				r.log().Debugf("[resolver] received A record %q for %q from %s:%s", ip, h.Name, proto, extDNS.IPStr)
+				r.log(ctx).Debugf("[resolver] received A record %q for %q from %s:%s", ip, h.Name, proto, extDNS.IPStr)
 				r.backend.HandleQueryResp(h.Name, ip)
 				r.backend.HandleQueryResp(h.Name, ip)
 			case dns.TypeAAAA:
 			case dns.TypeAAAA:
 				answers++
 				answers++
 				ip := rr.(*dns.AAAA).AAAA
 				ip := rr.(*dns.AAAA).AAAA
-				r.log().Debugf("[resolver] received AAAA record %q for %q from %s:%s", ip, h.Name, proto, extDNS.IPStr)
+				r.log(ctx).Debugf("[resolver] received AAAA record %q for %q from %s:%s", ip, h.Name, proto, extDNS.IPStr)
 				r.backend.HandleQueryResp(h.Name, ip)
 				r.backend.HandleQueryResp(h.Name, ip)
 			}
 			}
 		}
 		}
 		if len(resp.Answer) == 0 {
 		if len(resp.Answer) == 0 {
-			r.log().Debugf("[resolver] external DNS %s:%s returned response with no answers:\n%s", proto, extDNS.IPStr, resp)
+			r.log(ctx).Debugf("[resolver] external DNS %s:%s returned response with no answers:\n%s", proto, extDNS.IPStr, resp)
 		}
 		}
 		resp.Compress = true
 		resp.Compress = true
+		span.AddEvent("response from upstream server", trace.WithAttributes(
+			attribute.String("libnet.resolver.resp", resp.String()),
+		))
 		return resp
 		return resp
 	}
 	}
 
 
+	span.AddEvent("no response from upstream servers")
 	return nil
 	return nil
 }
 }
 
 
-func (r *Resolver) exchange(proto string, extDNS extDNSEntry, query *dns.Msg) *dns.Msg {
+func (r *Resolver) exchange(ctx context.Context, proto string, extDNS extDNSEntry, query *dns.Msg) *dns.Msg {
+	ctx, span := otel.Tracer("").Start(ctx, "resolver.exchange", trace.WithAttributes(
+		attribute.String("libnet.resolver.upstream.proto", proto),
+		attribute.String("libnet.resolver.upstream.address", extDNS.IPStr),
+		attribute.Bool("libnet.resolver.upstream.host-loopback", extDNS.HostLoopback)))
+	defer span.End()
+
 	extConn, err := r.dialExtDNS(proto, extDNS)
 	extConn, err := r.dialExtDNS(proto, extDNS)
 	if err != nil {
 	if err != nil {
-		r.log().WithError(err).Warn("[resolver] connect failed")
+		r.log(ctx).WithError(err).Warn("[resolver] connect failed")
+		span.RecordError(err)
+		span.SetStatus(codes.Error, "dialExtDNS failed")
 		return nil
 		return nil
 	}
 	}
 	defer extConn.Close()
 	defer extConn.Close()
 
 
-	logger := r.log().WithFields(log.Fields{
+	logger := r.log(ctx).WithFields(log.Fields{
 		"dns-server":  extConn.RemoteAddr().Network() + ":" + extConn.RemoteAddr().String(),
 		"dns-server":  extConn.RemoteAddr().Network() + ":" + extConn.RemoteAddr().String(),
 		"client-addr": extConn.LocalAddr().Network() + ":" + extConn.LocalAddr().String(),
 		"client-addr": extConn.LocalAddr().Network() + ":" + extConn.LocalAddr().String(),
 		"question":    query.Question[0].String(),
 		"question":    query.Question[0].String(),
@@ -534,13 +568,16 @@ func (r *Resolver) exchange(proto string, extDNS extDNSEntry, query *dns.Msg) *d
 		UDPSize: dns.MaxMsgSize,
 		UDPSize: dns.MaxMsgSize,
 	}).ExchangeWithConn(query, &dns.Conn{Conn: extConn})
 	}).ExchangeWithConn(query, &dns.Conn{Conn: extConn})
 	if err != nil {
 	if err != nil {
-		r.log().WithError(err).Errorf("[resolver] failed to query DNS server: %s, query: %s", extConn.RemoteAddr().String(), query.Question[0].String())
+		r.log(ctx).WithError(err).Errorf("[resolver] failed to query DNS server: %s, query: %s", extConn.RemoteAddr().String(), query.Question[0].String())
+		span.RecordError(err)
+		span.SetStatus(codes.Error, "ExchangeWithConn failed")
 		return nil
 		return nil
 	}
 	}
 
 
 	if resp == nil {
 	if resp == nil {
 		// Should be impossible, so make noise if it happens anyway.
 		// Should be impossible, so make noise if it happens anyway.
 		logger.Error("[resolver] external DNS returned empty response")
 		logger.Error("[resolver] external DNS returned empty response")
+		span.SetStatus(codes.Error, "External DNS returned empty response")
 	}
 	}
 	return resp
 	return resp
 }
 }

+ 4 - 2
libnetwork/resolver_test.go

@@ -242,7 +242,9 @@ func (w tlogWriter) Write(p []byte) (n int, err error) {
 
 
 type noopDNSBackend struct{ DNSBackend }
 type noopDNSBackend struct{ DNSBackend }
 
 
-func (noopDNSBackend) ResolveName(name string, iplen int) ([]net.IP, bool) { return nil, false }
+func (noopDNSBackend) ResolveName(_ context.Context, name string, iplen int) ([]net.IP, bool) {
+	return nil, false
+}
 
 
 func (noopDNSBackend) ExecFunc(f func()) error { f(); return nil }
 func (noopDNSBackend) ExecFunc(f func()) error { f(); return nil }
 
 
@@ -286,7 +288,7 @@ func TestReplySERVFAIL(t *testing.T) {
 
 
 type badSRVDNSBackend struct{ noopDNSBackend }
 type badSRVDNSBackend struct{ noopDNSBackend }
 
 
-func (badSRVDNSBackend) ResolveService(name string) ([]*net.SRV, []net.IP) {
+func (badSRVDNSBackend) ResolveService(_ context.Context, _ string) ([]*net.SRV, []net.IP) {
 	return []*net.SRV{nil, nil, nil}, nil // Mismatched slice lengths
 	return []*net.SRV{nil, nil, nil}, nil // Mismatched slice lengths
 }
 }
 
 

+ 22 - 12
libnetwork/sandbox.go

@@ -13,6 +13,9 @@ import (
 	"github.com/docker/docker/libnetwork/etchosts"
 	"github.com/docker/docker/libnetwork/etchosts"
 	"github.com/docker/docker/libnetwork/osl"
 	"github.com/docker/docker/libnetwork/osl"
 	"github.com/docker/docker/libnetwork/types"
 	"github.com/docker/docker/libnetwork/types"
+	"go.opentelemetry.io/otel"
+	"go.opentelemetry.io/otel/attribute"
+	"go.opentelemetry.io/otel/trace"
 )
 )
 
 
 // SandboxOption is an option setter function type used to pass various options to
 // SandboxOption is an option setter function type used to pass various options to
@@ -351,13 +354,13 @@ func (sb *Sandbox) HandleQueryResp(name string, ip net.IP) {
 	}
 	}
 }
 }
 
 
-func (sb *Sandbox) ResolveIP(ip string) string {
+func (sb *Sandbox) ResolveIP(ctx context.Context, ip string) string {
 	var svc string
 	var svc string
-	log.G(context.TODO()).Debugf("IP To resolve %v", ip)
+	log.G(ctx).Debugf("IP To resolve %v", ip)
 
 
 	for _, ep := range sb.Endpoints() {
 	for _, ep := range sb.Endpoints() {
 		n := ep.getNetwork()
 		n := ep.getNetwork()
-		svc = n.ResolveIP(ip)
+		svc = n.ResolveIP(ctx, ip)
 		if len(svc) != 0 {
 		if len(svc) != 0 {
 			return svc
 			return svc
 		}
 		}
@@ -368,8 +371,8 @@ func (sb *Sandbox) ResolveIP(ip string) string {
 
 
 // ResolveService returns all the backend details about the containers or hosts
 // ResolveService returns all the backend details about the containers or hosts
 // backing a service. Its purpose is to satisfy an SRV query.
 // backing a service. Its purpose is to satisfy an SRV query.
-func (sb *Sandbox) ResolveService(name string) ([]*net.SRV, []net.IP) {
-	log.G(context.TODO()).Debugf("Service name To resolve: %v", name)
+func (sb *Sandbox) ResolveService(ctx context.Context, name string) ([]*net.SRV, []net.IP) {
+	log.G(ctx).Debugf("Service name To resolve: %v", name)
 
 
 	// There are DNS implementations that allow SRV queries for names not in
 	// There are DNS implementations that allow SRV queries for names not in
 	// the format defined by RFC 2782. Hence specific validations checks are
 	// the format defined by RFC 2782. Hence specific validations checks are
@@ -381,7 +384,7 @@ func (sb *Sandbox) ResolveService(name string) ([]*net.SRV, []net.IP) {
 	for _, ep := range sb.Endpoints() {
 	for _, ep := range sb.Endpoints() {
 		n := ep.getNetwork()
 		n := ep.getNetwork()
 
 
-		srv, ip := n.ResolveService(name)
+		srv, ip := n.ResolveService(ctx, name)
 		if len(srv) > 0 {
 		if len(srv) > 0 {
 			return srv, ip
 			return srv, ip
 		}
 		}
@@ -421,7 +424,7 @@ func getLocalNwEndpoints(epList []*Endpoint) []*Endpoint {
 	return eps
 	return eps
 }
 }
 
 
-func (sb *Sandbox) ResolveName(name string, ipType int) ([]net.IP, bool) {
+func (sb *Sandbox) ResolveName(ctx context.Context, name string, ipType int) ([]net.IP, bool) {
 	// Embedded server owns the docker network domain. Resolution should work
 	// Embedded server owns the docker network domain. Resolution should work
 	// for both container_name and container_name.network_name
 	// for both container_name and container_name.network_name
 	// We allow '.' in service name and network name. For a name a.b.c.d the
 	// We allow '.' in service name and network name. For a name a.b.c.d the
@@ -431,7 +434,7 @@ func (sb *Sandbox) ResolveName(name string, ipType int) ([]net.IP, bool) {
 	// {a.b in network c.d},
 	// {a.b in network c.d},
 	// {a in network b.c.d},
 	// {a in network b.c.d},
 
 
-	log.G(context.TODO()).Debugf("Name To resolve: %v", name)
+	log.G(ctx).Debugf("Name To resolve: %v", name)
 	name = strings.TrimSuffix(name, ".")
 	name = strings.TrimSuffix(name, ".")
 	reqName := []string{name}
 	reqName := []string{name}
 	networkName := []string{""}
 	networkName := []string{""}
@@ -468,7 +471,7 @@ func (sb *Sandbox) ResolveName(name string, ipType int) ([]net.IP, bool) {
 
 
 	for i := 0; i < len(reqName); i++ {
 	for i := 0; i < len(reqName); i++ {
 		// First check for local container alias
 		// First check for local container alias
-		ip, ipv6Miss := sb.resolveName(reqName[i], networkName[i], epList, true, ipType)
+		ip, ipv6Miss := sb.resolveName(ctx, reqName[i], networkName[i], epList, true, ipType)
 		if ip != nil {
 		if ip != nil {
 			return ip, false
 			return ip, false
 		}
 		}
@@ -477,7 +480,7 @@ func (sb *Sandbox) ResolveName(name string, ipType int) ([]net.IP, bool) {
 		}
 		}
 
 
 		// Resolve the actual container name
 		// Resolve the actual container name
-		ip, ipv6Miss = sb.resolveName(reqName[i], networkName[i], epList, false, ipType)
+		ip, ipv6Miss = sb.resolveName(ctx, reqName[i], networkName[i], epList, false, ipType)
 		if ip != nil {
 		if ip != nil {
 			return ip, false
 			return ip, false
 		}
 		}
@@ -488,7 +491,14 @@ func (sb *Sandbox) ResolveName(name string, ipType int) ([]net.IP, bool) {
 	return nil, false
 	return nil, false
 }
 }
 
 
-func (sb *Sandbox) resolveName(nameOrAlias string, networkName string, epList []*Endpoint, lookupAlias bool, ipType int) (_ []net.IP, ipv6Miss bool) {
+func (sb *Sandbox) resolveName(ctx context.Context, nameOrAlias string, networkName string, epList []*Endpoint, lookupAlias bool, ipType int) (_ []net.IP, ipv6Miss bool) {
+	ctx, span := otel.Tracer("").Start(ctx, "Sandbox.resolveName", trace.WithAttributes(
+		attribute.String("libnet.resolver.name-or-alias", nameOrAlias),
+		attribute.String("libnet.network.name", networkName),
+		attribute.Bool("libnet.resolver.alias-lookup", lookupAlias),
+		attribute.Int("libnet.resolver.ip-family", ipType)))
+	defer span.End()
+
 	for _, ep := range epList {
 	for _, ep := range epList {
 		if lookupAlias && len(ep.aliases) == 0 {
 		if lookupAlias && len(ep.aliases) == 0 {
 			continue
 			continue
@@ -519,7 +529,7 @@ func (sb *Sandbox) resolveName(nameOrAlias string, networkName string, epList []
 			}
 			}
 		}
 		}
 
 
-		ip, miss := nw.ResolveName(name, ipType)
+		ip, miss := nw.ResolveName(ctx, name, ipType)
 		if ip != nil {
 		if ip != nil {
 			return ip, false
 			return ip, false
 		}
 		}