Browse Source

Merge pull request #46551 from akerouanton/libnet-resolver-otel

libnet: add OTEL tracing to the embedded DNS
Sebastiaan van Stijn 1 year ago
parent
commit
26c5d1ea0d

+ 14 - 11
libnetwork/libnetwork_internal_test.go

@@ -1,6 +1,7 @@
 package libnetwork
 package libnetwork
 
 
 import (
 import (
+	"context"
 	"encoding/json"
 	"encoding/json"
 	"fmt"
 	"fmt"
 	"net"
 	"net"
@@ -418,7 +419,8 @@ func TestSRVServiceQuery(t *testing.T) {
 
 
 	c.svcRecords[n.ID()] = sr
 	c.svcRecords[n.ID()] = sr
 
 
-	_, ip := ep.Info().Sandbox().ResolveService("_http._tcp.web.swarm")
+	ctx := context.Background()
+	_, ip := ep.Info().Sandbox().ResolveService(ctx, "_http._tcp.web.swarm")
 
 
 	if len(ip) == 0 {
 	if len(ip) == 0 {
 		t.Fatal(err)
 		t.Fatal(err)
@@ -427,7 +429,7 @@ func TestSRVServiceQuery(t *testing.T) {
 		t.Fatal(err)
 		t.Fatal(err)
 	}
 	}
 
 
-	_, ip = ep.Info().Sandbox().ResolveService("_host_http._tcp.web.swarm")
+	_, ip = ep.Info().Sandbox().ResolveService(ctx, "_host_http._tcp.web.swarm")
 
 
 	if len(ip) == 0 {
 	if len(ip) == 0 {
 		t.Fatal(err)
 		t.Fatal(err)
@@ -437,7 +439,7 @@ func TestSRVServiceQuery(t *testing.T) {
 	}
 	}
 
 
 	// Service name with invalid protocol name. Should fail without error
 	// Service name with invalid protocol name. Should fail without error
-	_, ip = ep.Info().Sandbox().ResolveService("_http._icmp.web.swarm")
+	_, ip = ep.Info().Sandbox().ResolveService(ctx, "_http._icmp.web.swarm")
 	if len(ip) != 0 {
 	if len(ip) != 0 {
 		t.Fatal("Valid response for invalid service name")
 		t.Fatal("Valid response for invalid service name")
 	}
 	}
@@ -490,7 +492,8 @@ func TestServiceVIPReuse(t *testing.T) {
 
 
 	ipToResolve := netutils.ReverseIP("192.168.0.1")
 	ipToResolve := netutils.ReverseIP("192.168.0.1")
 
 
-	ipList, _ := n.ResolveName("service_test", types.IPv4)
+	ctx := context.Background()
+	ipList, _ := n.ResolveName(ctx, "service_test", types.IPv4)
 	if len(ipList) == 0 {
 	if len(ipList) == 0 {
 		t.Fatal("There must be the VIP")
 		t.Fatal("There must be the VIP")
 	}
 	}
@@ -500,7 +503,7 @@ func TestServiceVIPReuse(t *testing.T) {
 	if ipList[0].String() != "192.168.0.1" {
 	if ipList[0].String() != "192.168.0.1" {
 		t.Fatal("The service VIP is 192.168.0.1")
 		t.Fatal("The service VIP is 192.168.0.1")
 	}
 	}
-	name := n.ResolveIP(ipToResolve)
+	name := n.ResolveIP(ctx, ipToResolve)
 	if name == "" {
 	if name == "" {
 		t.Fatal("It must return a name")
 		t.Fatal("It must return a name")
 	}
 	}
@@ -510,7 +513,7 @@ func TestServiceVIPReuse(t *testing.T) {
 
 
 	// Delete service record for one of the services, the IP should remain because one service is still associated with it
 	// Delete service record for one of the services, the IP should remain because one service is still associated with it
 	n.deleteSvcRecords("ep1", "service_test", "serviceID1", net.ParseIP("192.168.0.1"), net.IP{}, true, "test")
 	n.deleteSvcRecords("ep1", "service_test", "serviceID1", net.ParseIP("192.168.0.1"), net.IP{}, true, "test")
-	ipList, _ = n.ResolveName("service_test", types.IPv4)
+	ipList, _ = n.ResolveName(ctx, "service_test", types.IPv4)
 	if len(ipList) == 0 {
 	if len(ipList) == 0 {
 		t.Fatal("There must be the VIP")
 		t.Fatal("There must be the VIP")
 	}
 	}
@@ -520,7 +523,7 @@ func TestServiceVIPReuse(t *testing.T) {
 	if ipList[0].String() != "192.168.0.1" {
 	if ipList[0].String() != "192.168.0.1" {
 		t.Fatal("The service VIP is 192.168.0.1")
 		t.Fatal("The service VIP is 192.168.0.1")
 	}
 	}
-	name = n.ResolveIP(ipToResolve)
+	name = n.ResolveIP(ctx, ipToResolve)
 	if name == "" {
 	if name == "" {
 		t.Fatal("It must return a name")
 		t.Fatal("It must return a name")
 	}
 	}
@@ -530,7 +533,7 @@ func TestServiceVIPReuse(t *testing.T) {
 
 
 	// Delete again the service using the previous service ID, nothing should happen
 	// Delete again the service using the previous service ID, nothing should happen
 	n.deleteSvcRecords("ep2", "service_test", "serviceID1", net.ParseIP("192.168.0.1"), net.IP{}, true, "test")
 	n.deleteSvcRecords("ep2", "service_test", "serviceID1", net.ParseIP("192.168.0.1"), net.IP{}, true, "test")
-	ipList, _ = n.ResolveName("service_test", types.IPv4)
+	ipList, _ = n.ResolveName(ctx, "service_test", types.IPv4)
 	if len(ipList) == 0 {
 	if len(ipList) == 0 {
 		t.Fatal("There must be the VIP")
 		t.Fatal("There must be the VIP")
 	}
 	}
@@ -540,7 +543,7 @@ func TestServiceVIPReuse(t *testing.T) {
 	if ipList[0].String() != "192.168.0.1" {
 	if ipList[0].String() != "192.168.0.1" {
 		t.Fatal("The service VIP is 192.168.0.1")
 		t.Fatal("The service VIP is 192.168.0.1")
 	}
 	}
-	name = n.ResolveIP(ipToResolve)
+	name = n.ResolveIP(ctx, ipToResolve)
 	if name == "" {
 	if name == "" {
 		t.Fatal("It must return a name")
 		t.Fatal("It must return a name")
 	}
 	}
@@ -550,11 +553,11 @@ func TestServiceVIPReuse(t *testing.T) {
 
 
 	// Delete now using the second service ID, now all the entries should be gone
 	// Delete now using the second service ID, now all the entries should be gone
 	n.deleteSvcRecords("ep2", "service_test", "serviceID2", net.ParseIP("192.168.0.1"), net.IP{}, true, "test")
 	n.deleteSvcRecords("ep2", "service_test", "serviceID2", net.ParseIP("192.168.0.1"), net.IP{}, true, "test")
-	ipList, _ = n.ResolveName("service_test", types.IPv4)
+	ipList, _ = n.ResolveName(ctx, "service_test", types.IPv4)
 	if len(ipList) != 0 {
 	if len(ipList) != 0 {
 		t.Fatal("All the VIPs should be gone now")
 		t.Fatal("All the VIPs should be gone now")
 	}
 	}
-	name = n.ResolveIP(ipToResolve)
+	name = n.ResolveIP(ctx, ipToResolve)
 	if name != "" {
 	if name != "" {
 		t.Fatalf("It must return empty no more services associated, instead:%s", name)
 		t.Fatalf("It must return empty no more services associated, instead:%s", name)
 	}
 	}

+ 15 - 4
libnetwork/network.go

@@ -23,6 +23,9 @@ import (
 	"github.com/docker/docker/libnetwork/scope"
 	"github.com/docker/docker/libnetwork/scope"
 	"github.com/docker/docker/libnetwork/types"
 	"github.com/docker/docker/libnetwork/types"
 	"github.com/docker/docker/pkg/stringid"
 	"github.com/docker/docker/pkg/stringid"
+	"go.opentelemetry.io/otel"
+	"go.opentelemetry.io/otel/attribute"
+	"go.opentelemetry.io/otel/trace"
 )
 )
 
 
 // EndpointWalker is a client provided function which will be used to walk the Endpoints.
 // EndpointWalker is a client provided function which will be used to walk the Endpoints.
@@ -1955,12 +1958,20 @@ func (n *Network) hasLoadBalancerEndpoint() bool {
 	return len(n.loadBalancerIP) != 0
 	return len(n.loadBalancerIP) != 0
 }
 }
 
 
-func (n *Network) ResolveName(req string, ipType int) ([]net.IP, bool) {
+func (n *Network) ResolveName(ctx context.Context, req string, ipType int) ([]net.IP, bool) {
 	var ipv6Miss bool
 	var ipv6Miss bool
 
 
 	c := n.getController()
 	c := n.getController()
 	networkID := n.ID()
 	networkID := n.ID()
+
+	_, span := otel.Tracer("").Start(ctx, "Network.ResolveName", trace.WithAttributes(
+		attribute.String("libnet.network.name", n.Name()),
+		attribute.String("libnet.network.id", networkID),
+	))
+	defer span.End()
+
 	c.mu.Lock()
 	c.mu.Lock()
+	// TODO(aker): release the lock earlier
 	defer c.mu.Unlock()
 	defer c.mu.Unlock()
 	sr, ok := c.svcRecords[networkID]
 	sr, ok := c.svcRecords[networkID]
 
 
@@ -2019,7 +2030,7 @@ func (n *Network) HandleQueryResp(name string, ip net.IP) {
 	}
 	}
 }
 }
 
 
-func (n *Network) ResolveIP(ip string) string {
+func (n *Network) ResolveIP(_ context.Context, ip string) string {
 	networkID := n.ID()
 	networkID := n.ID()
 	c := n.getController()
 	c := n.getController()
 	c.mu.Lock()
 	c.mu.Lock()
@@ -2049,13 +2060,13 @@ func (n *Network) ResolveIP(ip string) string {
 	return elem.name + "." + nwName
 	return elem.name + "." + nwName
 }
 }
 
 
-func (n *Network) ResolveService(name string) ([]*net.SRV, []net.IP) {
+func (n *Network) ResolveService(ctx context.Context, name string) ([]*net.SRV, []net.IP) {
 	c := n.getController()
 	c := n.getController()
 
 
 	srv := []*net.SRV{}
 	srv := []*net.SRV{}
 	ip := []net.IP{}
 	ip := []net.IP{}
 
 
-	log.G(context.TODO()).Debugf("Service name To resolve: %v", name)
+	log.G(ctx).Debugf("Service name To resolve: %v", name)
 
 
 	// There are DNS implementations that allow SRV queries for names not in
 	// There are DNS implementations that allow SRV queries for names not in
 	// the format defined by RFC 2782. Hence specific validations checks are
 	// the format defined by RFC 2782. Hence specific validations checks are

+ 80 - 44
libnetwork/resolver.go

@@ -2,6 +2,7 @@ package libnetwork
 
 
 import (
 import (
 	"context"
 	"context"
+	"errors"
 	"fmt"
 	"fmt"
 	"math/rand"
 	"math/rand"
 	"net"
 	"net"
@@ -13,7 +14,10 @@ import (
 	"github.com/containerd/containerd/log"
 	"github.com/containerd/containerd/log"
 	"github.com/docker/docker/libnetwork/types"
 	"github.com/docker/docker/libnetwork/types"
 	"github.com/miekg/dns"
 	"github.com/miekg/dns"
-	"github.com/sirupsen/logrus"
+	"go.opentelemetry.io/otel"
+	"go.opentelemetry.io/otel/attribute"
+	"go.opentelemetry.io/otel/codes"
+	"go.opentelemetry.io/otel/trace"
 	"golang.org/x/sync/semaphore"
 	"golang.org/x/sync/semaphore"
 	"golang.org/x/time/rate"
 	"golang.org/x/time/rate"
 )
 )
@@ -26,13 +30,13 @@ type DNSBackend interface {
 	// the networks the sandbox is connected to. For IPv6 queries, second return
 	// the networks the sandbox is connected to. For IPv6 queries, second return
 	// value will be true if the name exists in docker domain but doesn't have an
 	// value will be true if the name exists in docker domain but doesn't have an
 	// IPv6 address. Such queries shouldn't be forwarded to external nameservers.
 	// IPv6 address. Such queries shouldn't be forwarded to external nameservers.
-	ResolveName(name string, iplen int) ([]net.IP, bool)
+	ResolveName(ctx context.Context, name string, iplen int) ([]net.IP, bool)
 	// ResolveIP returns the service name for the passed in IP. IP is in reverse dotted
 	// ResolveIP returns the service name for the passed in IP. IP is in reverse dotted
 	// notation; the format used for DNS PTR records
 	// notation; the format used for DNS PTR records
-	ResolveIP(name string) string
+	ResolveIP(ctx context.Context, name string) string
 	// ResolveService returns all the backend details about the containers or hosts
 	// ResolveService returns all the backend details about the containers or hosts
 	// backing a service. Its purpose is to satisfy an SRV query
 	// backing a service. Its purpose is to satisfy an SRV query
-	ResolveService(name string) ([]*net.SRV, []net.IP)
+	ResolveService(ctx context.Context, name string) ([]*net.SRV, []net.IP)
 	// ExecFunc allows a function to be executed in the context of the backend
 	// ExecFunc allows a function to be executed in the context of the backend
 	// on behalf of the resolver.
 	// on behalf of the resolver.
 	ExecFunc(f func()) error
 	ExecFunc(f func()) error
@@ -73,7 +77,7 @@ type Resolver struct {
 	listenAddress string
 	listenAddress string
 	proxyDNS      bool
 	proxyDNS      bool
 	startCh       chan struct{}
 	startCh       chan struct{}
-	logger        *logrus.Logger
+	logger        *log.Entry
 
 
 	fwdSem      *semaphore.Weighted // Limit the number of concurrent external DNS requests in-flight
 	fwdSem      *semaphore.Weighted // Limit the number of concurrent external DNS requests in-flight
 	logInverval rate.Sometimes      // Rate-limit logging about hitting the fwdSem limit
 	logInverval rate.Sometimes      // Rate-limit logging about hitting the fwdSem limit
@@ -92,9 +96,9 @@ func NewResolver(address string, proxyDNS bool, backend DNSBackend) *Resolver {
 	}
 	}
 }
 }
 
 
-func (r *Resolver) log() *logrus.Logger {
+func (r *Resolver) log(ctx context.Context) *log.Entry {
 	if r.logger == nil {
 	if r.logger == nil {
-		return log.G(context.TODO()).Logger
+		return log.G(ctx)
 	}
 	}
 	return r.logger
 	return r.logger
 }
 }
@@ -146,7 +150,7 @@ func (r *Resolver) Start() error {
 	r.server = s
 	r.server = s
 	go func() {
 	go func() {
 		if err := s.ActivateAndServe(); err != nil {
 		if err := s.ActivateAndServe(); err != nil {
-			r.log().WithError(err).Error("[resolver] failed to start PacketConn DNS server")
+			r.log(context.TODO()).WithError(err).Error("[resolver] failed to start PacketConn DNS server")
 		}
 		}
 	}()
 	}()
 
 
@@ -154,7 +158,7 @@ func (r *Resolver) Start() error {
 	r.tcpServer = tcpServer
 	r.tcpServer = tcpServer
 	go func() {
 	go func() {
 		if err := tcpServer.ActivateAndServe(); err != nil {
 		if err := tcpServer.ActivateAndServe(); err != nil {
-			r.log().WithError(err).Error("[resolver] failed to start TCP DNS server")
+			r.log(context.TODO()).WithError(err).Error("[resolver] failed to start TCP DNS server")
 		}
 		}
 	}()
 	}()
 	return nil
 	return nil
@@ -224,10 +228,10 @@ func createRespMsg(query *dns.Msg) *dns.Msg {
 	return resp
 	return resp
 }
 }
 
 
-func (r *Resolver) handleMXQuery(query *dns.Msg) (*dns.Msg, error) {
+func (r *Resolver) handleMXQuery(ctx context.Context, query *dns.Msg) (*dns.Msg, error) {
 	name := query.Question[0].Name
 	name := query.Question[0].Name
-	addrv4, _ := r.backend.ResolveName(name, types.IPv4)
-	addrv6, _ := r.backend.ResolveName(name, types.IPv6)
+	addrv4, _ := r.backend.ResolveName(ctx, name, types.IPv4)
+	addrv6, _ := r.backend.ResolveName(ctx, name, types.IPv6)
 
 
 	if addrv4 == nil && addrv6 == nil {
 	if addrv4 == nil && addrv6 == nil {
 		return nil, nil
 		return nil, nil
@@ -241,17 +245,17 @@ func (r *Resolver) handleMXQuery(query *dns.Msg) (*dns.Msg, error) {
 	return resp, nil
 	return resp, nil
 }
 }
 
 
-func (r *Resolver) handleIPQuery(query *dns.Msg, ipType int) (*dns.Msg, error) {
+func (r *Resolver) handleIPQuery(ctx context.Context, query *dns.Msg, ipType int) (*dns.Msg, error) {
 	var (
 	var (
 		addr     []net.IP
 		addr     []net.IP
 		ipv6Miss bool
 		ipv6Miss bool
 		name     = query.Question[0].Name
 		name     = query.Question[0].Name
 	)
 	)
-	addr, ipv6Miss = r.backend.ResolveName(name, ipType)
+	addr, ipv6Miss = r.backend.ResolveName(ctx, name, ipType)
 
 
 	if addr == nil && ipv6Miss {
 	if addr == nil && ipv6Miss {
 		// Send a reply without any Answer sections
 		// Send a reply without any Answer sections
-		r.log().Debugf("[resolver] lookup name %s present without IPv6 address", name)
+		r.log(ctx).Debugf("[resolver] lookup name %s present without IPv6 address", name)
 		resp := createRespMsg(query)
 		resp := createRespMsg(query)
 		return resp, nil
 		return resp, nil
 	}
 	}
@@ -259,7 +263,7 @@ func (r *Resolver) handleIPQuery(query *dns.Msg, ipType int) (*dns.Msg, error) {
 		return nil, nil
 		return nil, nil
 	}
 	}
 
 
-	r.log().Debugf("[resolver] lookup for %s: IP %v", name, addr)
+	r.log(ctx).Debugf("[resolver] lookup for %s: IP %v", name, addr)
 
 
 	resp := createRespMsg(query)
 	resp := createRespMsg(query)
 	if len(addr) > 1 {
 	if len(addr) > 1 {
@@ -283,7 +287,7 @@ func (r *Resolver) handleIPQuery(query *dns.Msg, ipType int) (*dns.Msg, error) {
 	return resp, nil
 	return resp, nil
 }
 }
 
 
-func (r *Resolver) handlePTRQuery(query *dns.Msg) (*dns.Msg, error) {
+func (r *Resolver) handlePTRQuery(ctx context.Context, query *dns.Msg) (*dns.Msg, error) {
 	ptr := query.Question[0].Name
 	ptr := query.Question[0].Name
 	name, after, found := strings.Cut(ptr, ptrIPv4domain)
 	name, after, found := strings.Cut(ptr, ptrIPv4domain)
 	if !found || after != "" {
 	if !found || after != "" {
@@ -295,12 +299,12 @@ func (r *Resolver) handlePTRQuery(query *dns.Msg) (*dns.Msg, error) {
 		return nil, nil
 		return nil, nil
 	}
 	}
 
 
-	host := r.backend.ResolveIP(name)
+	host := r.backend.ResolveIP(ctx, name)
 	if host == "" {
 	if host == "" {
 		return nil, nil
 		return nil, nil
 	}
 	}
 
 
-	r.log().Debugf("[resolver] lookup for IP %s: name %s", name, host)
+	r.log(ctx).Debugf("[resolver] lookup for IP %s: name %s", name, host)
 	fqdn := dns.Fqdn(host)
 	fqdn := dns.Fqdn(host)
 
 
 	resp := createRespMsg(query)
 	resp := createRespMsg(query)
@@ -311,9 +315,9 @@ func (r *Resolver) handlePTRQuery(query *dns.Msg) (*dns.Msg, error) {
 	return resp, nil
 	return resp, nil
 }
 }
 
 
-func (r *Resolver) handleSRVQuery(query *dns.Msg) (*dns.Msg, error) {
+func (r *Resolver) handleSRVQuery(ctx context.Context, query *dns.Msg) (*dns.Msg, error) {
 	svc := query.Question[0].Name
 	svc := query.Question[0].Name
-	srv, ip := r.backend.ResolveService(svc)
+	srv, ip := r.backend.ResolveService(ctx, svc)
 
 
 	if len(srv) == 0 {
 	if len(srv) == 0 {
 		return nil, nil
 		return nil, nil
@@ -351,29 +355,37 @@ func (r *Resolver) serveDNS(w dns.ResponseWriter, query *dns.Msg) {
 	queryName := query.Question[0].Name
 	queryName := query.Question[0].Name
 	queryType := query.Question[0].Qtype
 	queryType := query.Question[0].Qtype
 
 
+	ctx, span := otel.Tracer("").Start(context.Background(), "resolver.serveDNS", trace.WithAttributes(
+		attribute.String("libnet.resolver.query.name", queryName),
+		attribute.String("libnet.resolver.query.type", dns.TypeToString[queryType]),
+	))
+	defer span.End()
+
 	switch queryType {
 	switch queryType {
 	case dns.TypeA:
 	case dns.TypeA:
-		resp, err = r.handleIPQuery(query, types.IPv4)
+		resp, err = r.handleIPQuery(ctx, query, types.IPv4)
 	case dns.TypeAAAA:
 	case dns.TypeAAAA:
-		resp, err = r.handleIPQuery(query, types.IPv6)
+		resp, err = r.handleIPQuery(ctx, query, types.IPv6)
 	case dns.TypeMX:
 	case dns.TypeMX:
-		resp, err = r.handleMXQuery(query)
+		resp, err = r.handleMXQuery(ctx, query)
 	case dns.TypePTR:
 	case dns.TypePTR:
-		resp, err = r.handlePTRQuery(query)
+		resp, err = r.handlePTRQuery(ctx, query)
 	case dns.TypeSRV:
 	case dns.TypeSRV:
-		resp, err = r.handleSRVQuery(query)
+		resp, err = r.handleSRVQuery(ctx, query)
 	default:
 	default:
-		r.log().Debugf("[resolver] query type %s is not supported by the embedded DNS and will be forwarded to external DNS", dns.TypeToString[queryType])
+		r.log(ctx).Debugf("[resolver] query type %s is not supported by the embedded DNS and will be forwarded to external DNS", dns.TypeToString[queryType])
 	}
 	}
 
 
 	reply := func(msg *dns.Msg) {
 	reply := func(msg *dns.Msg) {
 		if err = w.WriteMsg(msg); err != nil {
 		if err = w.WriteMsg(msg); err != nil {
-			r.log().WithError(err).Errorf("[resolver] failed to write response")
+			r.log(ctx).WithError(err).Errorf("[resolver] failed to write response")
+			span.RecordError(err)
+			span.SetStatus(codes.Error, "WriteMsg failed")
 		}
 		}
 	}
 	}
 
 
 	if err != nil {
 	if err != nil {
-		r.log().WithError(err).Errorf("[resolver] failed to handle query: %s (%s)", queryName, dns.TypeToString[queryType])
+		r.log(ctx).WithError(err).Errorf("[resolver] failed to handle query: %s (%s)", queryName, dns.TypeToString[queryType])
 		reply(new(dns.Msg).SetRcode(query, dns.RcodeServerFailure))
 		reply(new(dns.Msg).SetRcode(query, dns.RcodeServerFailure))
 		return
 		return
 	}
 	}
@@ -393,6 +405,9 @@ func (r *Resolver) serveDNS(w dns.ResponseWriter, query *dns.Msg) {
 			}
 			}
 		}
 		}
 		resp.Truncate(maxSize)
 		resp.Truncate(maxSize)
+		span.AddEvent("found local record", trace.WithAttributes(
+			attribute.String("libnet.resolver.resp", resp.String()),
+		))
 		reply(resp)
 		reply(resp)
 		return
 		return
 	}
 	}
@@ -406,7 +421,7 @@ func (r *Resolver) serveDNS(w dns.ResponseWriter, query *dns.Msg) {
 			!strings.Contains(strings.TrimSuffix(queryName, "."), ".") {
 			!strings.Contains(strings.TrimSuffix(queryName, "."), ".") {
 			resp = createRespMsg(query)
 			resp = createRespMsg(query)
 		} else {
 		} else {
-			resp = r.forwardExtDNS(w.LocalAddr().Network(), query)
+			resp = r.forwardExtDNS(ctx, w.LocalAddr().Network(), query)
 		}
 		}
 	}
 	}
 
 
@@ -448,25 +463,31 @@ func (r *Resolver) dialExtDNS(proto string, server extDNSEntry) (net.Conn, error
 	return extConn, nil
 	return extConn, nil
 }
 }
 
 
-func (r *Resolver) forwardExtDNS(proto string, query *dns.Msg) *dns.Msg {
+func (r *Resolver) forwardExtDNS(ctx context.Context, proto string, query *dns.Msg) *dns.Msg {
+	ctx, span := otel.Tracer("").Start(ctx, "resolver.forwardExtDNS")
+	defer span.End()
+
 	for _, extDNS := range r.extDNSList {
 	for _, extDNS := range r.extDNSList {
 		if extDNS.IPStr == "" {
 		if extDNS.IPStr == "" {
 			break
 			break
 		}
 		}
 
 
 		// limits the number of outstanding concurrent queries.
 		// limits the number of outstanding concurrent queries.
-		ctx, cancel := context.WithTimeout(context.Background(), extIOTimeout)
+		ctx, cancel := context.WithTimeout(ctx, extIOTimeout)
 		err := r.fwdSem.Acquire(ctx, 1)
 		err := r.fwdSem.Acquire(ctx, 1)
 		cancel()
 		cancel()
+
 		if err != nil {
 		if err != nil {
-			r.logInverval.Do(func() {
-				r.log().Errorf("[resolver] more than %v concurrent queries", maxConcurrent)
-			})
+			if errors.Is(err, context.DeadlineExceeded) {
+				r.logInverval.Do(func() {
+					r.log(ctx).Errorf("[resolver] more than %v concurrent queries", maxConcurrent)
+				})
+			}
 			return new(dns.Msg).SetRcode(query, dns.RcodeRefused)
 			return new(dns.Msg).SetRcode(query, dns.RcodeRefused)
 		}
 		}
 		resp := func() *dns.Msg {
 		resp := func() *dns.Msg {
 			defer r.fwdSem.Release(1)
 			defer r.fwdSem.Release(1)
-			return r.exchange(proto, extDNS, query)
+			return r.exchange(ctx, proto, extDNS, query)
 		}()
 		}()
 		if resp == nil {
 		if resp == nil {
 			continue
 			continue
@@ -476,7 +497,7 @@ func (r *Resolver) forwardExtDNS(proto string, query *dns.Msg) *dns.Msg {
 		case dns.RcodeServerFailure, dns.RcodeRefused:
 		case dns.RcodeServerFailure, dns.RcodeRefused:
 			// Server returned FAILURE: continue with the next external DNS server
 			// Server returned FAILURE: continue with the next external DNS server
 			// Server returned REFUSED: this can be a transitional status, so continue with the next external DNS server
 			// Server returned REFUSED: this can be a transitional status, so continue with the next external DNS server
-			r.log().Debugf("[resolver] external DNS %s:%s returned failure:\n%s", proto, extDNS.IPStr, resp)
+			r.log(ctx).Debugf("[resolver] external DNS %s:%s returned failure:\n%s", proto, extDNS.IPStr, resp)
 			continue
 			continue
 		}
 		}
 		answers := 0
 		answers := 0
@@ -486,34 +507,46 @@ func (r *Resolver) forwardExtDNS(proto string, query *dns.Msg) *dns.Msg {
 			case dns.TypeA:
 			case dns.TypeA:
 				answers++
 				answers++
 				ip := rr.(*dns.A).A
 				ip := rr.(*dns.A).A
-				r.log().Debugf("[resolver] received A record %q for %q from %s:%s", ip, h.Name, proto, extDNS.IPStr)
+				r.log(ctx).Debugf("[resolver] received A record %q for %q from %s:%s", ip, h.Name, proto, extDNS.IPStr)
 				r.backend.HandleQueryResp(h.Name, ip)
 				r.backend.HandleQueryResp(h.Name, ip)
 			case dns.TypeAAAA:
 			case dns.TypeAAAA:
 				answers++
 				answers++
 				ip := rr.(*dns.AAAA).AAAA
 				ip := rr.(*dns.AAAA).AAAA
-				r.log().Debugf("[resolver] received AAAA record %q for %q from %s:%s", ip, h.Name, proto, extDNS.IPStr)
+				r.log(ctx).Debugf("[resolver] received AAAA record %q for %q from %s:%s", ip, h.Name, proto, extDNS.IPStr)
 				r.backend.HandleQueryResp(h.Name, ip)
 				r.backend.HandleQueryResp(h.Name, ip)
 			}
 			}
 		}
 		}
 		if len(resp.Answer) == 0 {
 		if len(resp.Answer) == 0 {
-			r.log().Debugf("[resolver] external DNS %s:%s returned response with no answers:\n%s", proto, extDNS.IPStr, resp)
+			r.log(ctx).Debugf("[resolver] external DNS %s:%s returned response with no answers:\n%s", proto, extDNS.IPStr, resp)
 		}
 		}
 		resp.Compress = true
 		resp.Compress = true
+		span.AddEvent("response from upstream server", trace.WithAttributes(
+			attribute.String("libnet.resolver.resp", resp.String()),
+		))
 		return resp
 		return resp
 	}
 	}
 
 
+	span.AddEvent("no response from upstream servers")
 	return nil
 	return nil
 }
 }
 
 
-func (r *Resolver) exchange(proto string, extDNS extDNSEntry, query *dns.Msg) *dns.Msg {
+func (r *Resolver) exchange(ctx context.Context, proto string, extDNS extDNSEntry, query *dns.Msg) *dns.Msg {
+	ctx, span := otel.Tracer("").Start(ctx, "resolver.exchange", trace.WithAttributes(
+		attribute.String("libnet.resolver.upstream.proto", proto),
+		attribute.String("libnet.resolver.upstream.address", extDNS.IPStr),
+		attribute.Bool("libnet.resolver.upstream.host-loopback", extDNS.HostLoopback)))
+	defer span.End()
+
 	extConn, err := r.dialExtDNS(proto, extDNS)
 	extConn, err := r.dialExtDNS(proto, extDNS)
 	if err != nil {
 	if err != nil {
-		r.log().WithError(err).Warn("[resolver] connect failed")
+		r.log(ctx).WithError(err).Warn("[resolver] connect failed")
+		span.RecordError(err)
+		span.SetStatus(codes.Error, "dialExtDNS failed")
 		return nil
 		return nil
 	}
 	}
 	defer extConn.Close()
 	defer extConn.Close()
 
 
-	logger := r.log().WithFields(log.Fields{
+	logger := r.log(ctx).WithFields(log.Fields{
 		"dns-server":  extConn.RemoteAddr().Network() + ":" + extConn.RemoteAddr().String(),
 		"dns-server":  extConn.RemoteAddr().Network() + ":" + extConn.RemoteAddr().String(),
 		"client-addr": extConn.LocalAddr().Network() + ":" + extConn.LocalAddr().String(),
 		"client-addr": extConn.LocalAddr().Network() + ":" + extConn.LocalAddr().String(),
 		"question":    query.Question[0].String(),
 		"question":    query.Question[0].String(),
@@ -534,13 +567,16 @@ func (r *Resolver) exchange(proto string, extDNS extDNSEntry, query *dns.Msg) *d
 		UDPSize: dns.MaxMsgSize,
 		UDPSize: dns.MaxMsgSize,
 	}).ExchangeWithConn(query, &dns.Conn{Conn: extConn})
 	}).ExchangeWithConn(query, &dns.Conn{Conn: extConn})
 	if err != nil {
 	if err != nil {
-		r.log().WithError(err).Errorf("[resolver] failed to query DNS server: %s, query: %s", extConn.RemoteAddr().String(), query.Question[0].String())
+		r.log(ctx).WithError(err).Errorf("[resolver] failed to query DNS server: %s, query: %s", extConn.RemoteAddr().String(), query.Question[0].String())
+		span.RecordError(err)
+		span.SetStatus(codes.Error, "ExchangeWithConn failed")
 		return nil
 		return nil
 	}
 	}
 
 
 	if resp == nil {
 	if resp == nil {
 		// Should be impossible, so make noise if it happens anyway.
 		// Should be impossible, so make noise if it happens anyway.
 		logger.Error("[resolver] external DNS returned empty response")
 		logger.Error("[resolver] external DNS returned empty response")
+		span.SetStatus(codes.Error, "External DNS returned empty response")
 	}
 	}
 	return resp
 	return resp
 }
 }

+ 6 - 4
libnetwork/resolver_test.go

@@ -226,11 +226,11 @@ func TestOversizedDNSReply(t *testing.T) {
 	checkDNSRRType(t, resp.Answer[0].Header().Rrtype, dns.TypeA)
 	checkDNSRRType(t, resp.Answer[0].Header().Rrtype, dns.TypeA)
 }
 }
 
 
-func testLogger(t *testing.T) *logrus.Logger {
+func testLogger(t *testing.T) *logrus.Entry {
 	logger := logrus.New()
 	logger := logrus.New()
 	logger.SetLevel(logrus.DebugLevel)
 	logger.SetLevel(logrus.DebugLevel)
 	logger.SetOutput(tlogWriter{t})
 	logger.SetOutput(tlogWriter{t})
-	return logger
+	return logrus.NewEntry(logger)
 }
 }
 
 
 type tlogWriter struct{ t *testing.T }
 type tlogWriter struct{ t *testing.T }
@@ -242,7 +242,9 @@ func (w tlogWriter) Write(p []byte) (n int, err error) {
 
 
 type noopDNSBackend struct{ DNSBackend }
 type noopDNSBackend struct{ DNSBackend }
 
 
-func (noopDNSBackend) ResolveName(name string, iplen int) ([]net.IP, bool) { return nil, false }
+func (noopDNSBackend) ResolveName(_ context.Context, name string, iplen int) ([]net.IP, bool) {
+	return nil, false
+}
 
 
 func (noopDNSBackend) ExecFunc(f func()) error { f(); return nil }
 func (noopDNSBackend) ExecFunc(f func()) error { f(); return nil }
 
 
@@ -286,7 +288,7 @@ func TestReplySERVFAIL(t *testing.T) {
 
 
 type badSRVDNSBackend struct{ noopDNSBackend }
 type badSRVDNSBackend struct{ noopDNSBackend }
 
 
-func (badSRVDNSBackend) ResolveService(name string) ([]*net.SRV, []net.IP) {
+func (badSRVDNSBackend) ResolveService(_ context.Context, _ string) ([]*net.SRV, []net.IP) {
 	return []*net.SRV{nil, nil, nil}, nil // Mismatched slice lengths
 	return []*net.SRV{nil, nil, nil}, nil // Mismatched slice lengths
 }
 }
 
 

+ 22 - 12
libnetwork/sandbox.go

@@ -13,6 +13,9 @@ import (
 	"github.com/docker/docker/libnetwork/etchosts"
 	"github.com/docker/docker/libnetwork/etchosts"
 	"github.com/docker/docker/libnetwork/osl"
 	"github.com/docker/docker/libnetwork/osl"
 	"github.com/docker/docker/libnetwork/types"
 	"github.com/docker/docker/libnetwork/types"
+	"go.opentelemetry.io/otel"
+	"go.opentelemetry.io/otel/attribute"
+	"go.opentelemetry.io/otel/trace"
 )
 )
 
 
 // SandboxOption is an option setter function type used to pass various options to
 // SandboxOption is an option setter function type used to pass various options to
@@ -351,13 +354,13 @@ func (sb *Sandbox) HandleQueryResp(name string, ip net.IP) {
 	}
 	}
 }
 }
 
 
-func (sb *Sandbox) ResolveIP(ip string) string {
+func (sb *Sandbox) ResolveIP(ctx context.Context, ip string) string {
 	var svc string
 	var svc string
-	log.G(context.TODO()).Debugf("IP To resolve %v", ip)
+	log.G(ctx).Debugf("IP To resolve %v", ip)
 
 
 	for _, ep := range sb.Endpoints() {
 	for _, ep := range sb.Endpoints() {
 		n := ep.getNetwork()
 		n := ep.getNetwork()
-		svc = n.ResolveIP(ip)
+		svc = n.ResolveIP(ctx, ip)
 		if len(svc) != 0 {
 		if len(svc) != 0 {
 			return svc
 			return svc
 		}
 		}
@@ -368,8 +371,8 @@ func (sb *Sandbox) ResolveIP(ip string) string {
 
 
 // ResolveService returns all the backend details about the containers or hosts
 // ResolveService returns all the backend details about the containers or hosts
 // backing a service. Its purpose is to satisfy an SRV query.
 // backing a service. Its purpose is to satisfy an SRV query.
-func (sb *Sandbox) ResolveService(name string) ([]*net.SRV, []net.IP) {
-	log.G(context.TODO()).Debugf("Service name To resolve: %v", name)
+func (sb *Sandbox) ResolveService(ctx context.Context, name string) ([]*net.SRV, []net.IP) {
+	log.G(ctx).Debugf("Service name To resolve: %v", name)
 
 
 	// There are DNS implementations that allow SRV queries for names not in
 	// There are DNS implementations that allow SRV queries for names not in
 	// the format defined by RFC 2782. Hence specific validations checks are
 	// the format defined by RFC 2782. Hence specific validations checks are
@@ -381,7 +384,7 @@ func (sb *Sandbox) ResolveService(name string) ([]*net.SRV, []net.IP) {
 	for _, ep := range sb.Endpoints() {
 	for _, ep := range sb.Endpoints() {
 		n := ep.getNetwork()
 		n := ep.getNetwork()
 
 
-		srv, ip := n.ResolveService(name)
+		srv, ip := n.ResolveService(ctx, name)
 		if len(srv) > 0 {
 		if len(srv) > 0 {
 			return srv, ip
 			return srv, ip
 		}
 		}
@@ -421,7 +424,7 @@ func getLocalNwEndpoints(epList []*Endpoint) []*Endpoint {
 	return eps
 	return eps
 }
 }
 
 
-func (sb *Sandbox) ResolveName(name string, ipType int) ([]net.IP, bool) {
+func (sb *Sandbox) ResolveName(ctx context.Context, name string, ipType int) ([]net.IP, bool) {
 	// Embedded server owns the docker network domain. Resolution should work
 	// Embedded server owns the docker network domain. Resolution should work
 	// for both container_name and container_name.network_name
 	// for both container_name and container_name.network_name
 	// We allow '.' in service name and network name. For a name a.b.c.d the
 	// We allow '.' in service name and network name. For a name a.b.c.d the
@@ -431,7 +434,7 @@ func (sb *Sandbox) ResolveName(name string, ipType int) ([]net.IP, bool) {
 	// {a.b in network c.d},
 	// {a.b in network c.d},
 	// {a in network b.c.d},
 	// {a in network b.c.d},
 
 
-	log.G(context.TODO()).Debugf("Name To resolve: %v", name)
+	log.G(ctx).Debugf("Name To resolve: %v", name)
 	name = strings.TrimSuffix(name, ".")
 	name = strings.TrimSuffix(name, ".")
 	reqName := []string{name}
 	reqName := []string{name}
 	networkName := []string{""}
 	networkName := []string{""}
@@ -468,7 +471,7 @@ func (sb *Sandbox) ResolveName(name string, ipType int) ([]net.IP, bool) {
 
 
 	for i := 0; i < len(reqName); i++ {
 	for i := 0; i < len(reqName); i++ {
 		// First check for local container alias
 		// First check for local container alias
-		ip, ipv6Miss := sb.resolveName(reqName[i], networkName[i], epList, true, ipType)
+		ip, ipv6Miss := sb.resolveName(ctx, reqName[i], networkName[i], epList, true, ipType)
 		if ip != nil {
 		if ip != nil {
 			return ip, false
 			return ip, false
 		}
 		}
@@ -477,7 +480,7 @@ func (sb *Sandbox) ResolveName(name string, ipType int) ([]net.IP, bool) {
 		}
 		}
 
 
 		// Resolve the actual container name
 		// Resolve the actual container name
-		ip, ipv6Miss = sb.resolveName(reqName[i], networkName[i], epList, false, ipType)
+		ip, ipv6Miss = sb.resolveName(ctx, reqName[i], networkName[i], epList, false, ipType)
 		if ip != nil {
 		if ip != nil {
 			return ip, false
 			return ip, false
 		}
 		}
@@ -488,7 +491,14 @@ func (sb *Sandbox) ResolveName(name string, ipType int) ([]net.IP, bool) {
 	return nil, false
 	return nil, false
 }
 }
 
 
-func (sb *Sandbox) resolveName(nameOrAlias string, networkName string, epList []*Endpoint, lookupAlias bool, ipType int) (_ []net.IP, ipv6Miss bool) {
+func (sb *Sandbox) resolveName(ctx context.Context, nameOrAlias string, networkName string, epList []*Endpoint, lookupAlias bool, ipType int) (_ []net.IP, ipv6Miss bool) {
+	ctx, span := otel.Tracer("").Start(ctx, "Sandbox.resolveName", trace.WithAttributes(
+		attribute.String("libnet.resolver.name-or-alias", nameOrAlias),
+		attribute.String("libnet.network.name", networkName),
+		attribute.Bool("libnet.resolver.alias-lookup", lookupAlias),
+		attribute.Int("libnet.resolver.ip-family", ipType)))
+	defer span.End()
+
 	for _, ep := range epList {
 	for _, ep := range epList {
 		if lookupAlias && len(ep.aliases) == 0 {
 		if lookupAlias && len(ep.aliases) == 0 {
 			continue
 			continue
@@ -519,7 +529,7 @@ func (sb *Sandbox) resolveName(nameOrAlias string, networkName string, epList []
 			}
 			}
 		}
 		}
 
 
-		ip, miss := nw.ResolveName(name, ipType)
+		ip, miss := nw.ResolveName(ctx, name, ipType)
 		if ip != nil {
 		if ip != nil {
 			return ip, false
 			return ip, false
 		}
 		}