Просмотр исходного кода

Merge pull request #43219 from corhere/sdjournal

Rewrite journald LogReader and unit test it
Sebastiaan van Stijn 3 лет назад
Родитель
Сommit
9100e3c087

+ 1 - 3
Dockerfile

@@ -333,6 +333,7 @@ RUN --mount=type=cache,sharing=locked,id=moby-dev-aptlib,target=/var/lib/apt \
             python3-setuptools \
             python3-wheel \
             sudo \
+            systemd-journal-remote \
             thin-provisioning-tools \
             uidmap \
             vim \
@@ -386,9 +387,6 @@ RUN --mount=type=cache,sharing=locked,id=moby-dev-aptlib,target=/var/lib/apt \
             dbus-user-session \
             systemd \
             systemd-sysv
-RUN mkdir -p hack \
-  && curl -o hack/dind-systemd https://raw.githubusercontent.com/AkihiroSuda/containerized-systemd/b70bac0daeea120456764248164c21684ade7d0d/docker-entrypoint.sh \
-  && chmod +x hack/dind-systemd
 ENTRYPOINT ["hack/dind-systemd"]
 
 FROM dev-systemd-${SYSTEMD} AS dev

+ 3 - 0
daemon/logger/journald/doc.go

@@ -0,0 +1,3 @@
+// Package journald provides the log driver for forwarding server logs
+// to endpoints that receive the systemd format.
+package journald // import "github.com/docker/docker/daemon/logger/journald"

+ 50 - 0
daemon/logger/journald/internal/export/export.go

@@ -0,0 +1,50 @@
+// Package export implements a serializer for the systemd Journal Export Format
+// as documented at https://systemd.io/JOURNAL_EXPORT_FORMATS/
+package export // import "github.com/docker/docker/daemon/logger/journald/internal/export"
+
+import (
+	"encoding/binary"
+	"fmt"
+	"io"
+	"unicode/utf8"
+)
+
+// Returns whether s can be serialized as a field value "as they are" without
+// the special binary safe serialization.
+func isSerializableAsIs(s string) bool {
+	if !utf8.ValidString(s) {
+		return false
+	}
+	for _, c := range s {
+		if c < ' ' && c != '\t' {
+			return false
+		}
+	}
+	return true
+}
+
+// WriteField writes the field serialized to Journal Export format to w.
+//
+// The variable name must consist only of uppercase characters, numbers and
+// underscores. No validation or sanitization is performed.
+func WriteField(w io.Writer, variable, value string) error {
+	if isSerializableAsIs(value) {
+		_, err := fmt.Fprintf(w, "%s=%s\n", variable, value)
+		return err
+	}
+
+	if _, err := fmt.Fprintln(w, variable); err != nil {
+		return err
+	}
+	if err := binary.Write(w, binary.LittleEndian, uint64(len(value))); err != nil {
+		return err
+	}
+	_, err := fmt.Fprintln(w, value)
+	return err
+}
+
+// WriteEndOfEntry terminates the journal entry.
+func WriteEndOfEntry(w io.Writer) error {
+	_, err := fmt.Fprintln(w)
+	return err
+}

+ 27 - 0
daemon/logger/journald/internal/export/export_test.go

@@ -0,0 +1,27 @@
+package export_test
+
+import (
+	"bytes"
+	"testing"
+
+	"github.com/docker/docker/daemon/logger/journald/internal/export"
+	"gotest.tools/v3/assert"
+	"gotest.tools/v3/golden"
+)
+
+func TestExportSerialization(t *testing.T) {
+	must := func(err error) { t.Helper(); assert.NilError(t, err) }
+	var buf bytes.Buffer
+	must(export.WriteField(&buf, "_TRANSPORT", "journal"))
+	must(export.WriteField(&buf, "MESSAGE", "this is a single-line message.\t🚀"))
+	must(export.WriteField(&buf, "EMPTY_VALUE", ""))
+	must(export.WriteField(&buf, "NEWLINE", "\n"))
+	must(export.WriteEndOfEntry(&buf))
+
+	must(export.WriteField(&buf, "MESSAGE", "this is a\nmulti line\nmessage"))
+	must(export.WriteField(&buf, "INVALID_UTF8", "a\x80b"))
+	must(export.WriteField(&buf, "BINDATA", "\x00\x01\x02\x03"))
+	must(export.WriteEndOfEntry(&buf))
+
+	golden.Assert(t, buf.String(), "export-serialization.golden")
+}

BIN
daemon/logger/journald/internal/export/testdata/export-serialization.golden


+ 149 - 0
daemon/logger/journald/internal/fake/sender.go

@@ -0,0 +1,149 @@
+// Package fake implements a journal writer for testing which is decoupled from
+// the system's journald.
+//
+// The systemd project does not have any facilities to support testing of
+// journal reader clients (although it has been requested:
+// https://github.com/systemd/systemd/issues/14120) so we have to get creative.
+// The systemd-journal-remote command reads serialized journal entries in the
+// Journal Export Format and writes them to journal files. This format is
+// well-documented and straightforward to generate.
+package fake // import "github.com/docker/docker/daemon/logger/journald/internal/fake"
+
+import (
+	"bytes"
+	"errors"
+	"fmt"
+	"os"
+	"os/exec"
+	"regexp"
+	"strconv"
+	"testing"
+	"time"
+
+	"code.cloudfoundry.org/clock"
+	"github.com/coreos/go-systemd/v22/journal"
+	"gotest.tools/v3/assert"
+
+	"github.com/docker/docker/daemon/logger/journald/internal/export"
+)
+
+// The systemd-journal-remote command is not conventionally installed on $PATH.
+// The manpage from upstream systemd lists the command as
+// /usr/lib/systemd/systemd-journal-remote, but Debian installs it to
+// /lib/systemd instead.
+var cmdPaths = []string{
+	"/usr/lib/systemd/systemd-journal-remote",
+	"/lib/systemd/systemd-journal-remote",
+	"systemd-journal-remote", // Check $PATH anyway, just in case.
+}
+
+// ErrCommandNotFound is returned when the systemd-journal-remote command could
+// not be located at the well-known paths or $PATH.
+var ErrCommandNotFound = errors.New("systemd-journal-remote command not found")
+
+// JournalRemoteCmdPath searches for the systemd-journal-remote command in
+// well-known paths and the directories named in the $PATH environment variable.
+func JournalRemoteCmdPath() (string, error) {
+	for _, p := range cmdPaths {
+		if path, err := exec.LookPath(p); err == nil {
+			return path, nil
+		}
+	}
+	return "", ErrCommandNotFound
+}
+
+// Sender fakes github.com/coreos/go-systemd/v22/journal.Send, writing journal
+// entries to an arbitrary journal file without depending on a running journald
+// process.
+type Sender struct {
+	CmdName    string
+	OutputPath string
+
+	// Clock for timestamping sent messages.
+	Clock clock.Clock
+	// Whether to assign the event's realtime timestamp to the time
+	// specified by the SYSLOG_TIMESTAMP variable value. This is roughly
+	// analogous to journald receiving the event and assigning it a
+	// timestamp in zero time after the SYSLOG_TIMESTAMP value was set,
+	// which is higly unrealistic in practice.
+	AssignEventTimestampFromSyslogTimestamp bool
+}
+
+// New constructs a new Sender which will write journal entries to outpath. The
+// file name must end in '.journal' and the directory must already exist. The
+// journal file will be created if it does not exist. An existing journal file
+// will be appended to.
+func New(outpath string) (*Sender, error) {
+	p, err := JournalRemoteCmdPath()
+	if err != nil {
+		return nil, err
+	}
+	sender := &Sender{
+		CmdName:    p,
+		OutputPath: outpath,
+		Clock:      clock.NewClock(),
+	}
+	return sender, nil
+}
+
+// NewT is like New but will skip the test if the systemd-journal-remote command
+// is not available.
+func NewT(t *testing.T, outpath string) *Sender {
+	t.Helper()
+	s, err := New(outpath)
+	if errors.Is(err, ErrCommandNotFound) {
+		t.Skip(err)
+	}
+	assert.NilError(t, err)
+	return s
+}
+
+var validVarName = regexp.MustCompile("^[A-Z0-9][A-Z0-9_]*$")
+
+// Send is a drop-in replacement for
+// github.com/coreos/go-systemd/v22/journal.Send.
+func (s *Sender) Send(message string, priority journal.Priority, vars map[string]string) error {
+	var buf bytes.Buffer
+	// https://systemd.io/JOURNAL_EXPORT_FORMATS/ says "if you are
+	// generating this format you shouldn’t care about these special
+	// double-underscore fields," yet systemd-journal-remote treats entries
+	// without a __REALTIME_TIMESTAMP as invalid and discards them.
+	// Reported upstream: https://github.com/systemd/systemd/issues/22411
+	var ts time.Time
+	if sts := vars["SYSLOG_TIMESTAMP"]; s.AssignEventTimestampFromSyslogTimestamp && sts != "" {
+		var err error
+		if ts, err = time.Parse(time.RFC3339Nano, sts); err != nil {
+			return fmt.Errorf("fake: error parsing SYSLOG_TIMESTAMP value %q: %w", ts, err)
+		}
+	} else {
+		ts = s.Clock.Now()
+	}
+	if err := export.WriteField(&buf, "__REALTIME_TIMESTAMP", strconv.FormatInt(ts.UnixMicro(), 10)); err != nil {
+		return fmt.Errorf("fake: error writing entry to systemd-journal-remote: %w", err)
+	}
+	if err := export.WriteField(&buf, "MESSAGE", message); err != nil {
+		return fmt.Errorf("fake: error writing entry to systemd-journal-remote: %w", err)
+	}
+	if err := export.WriteField(&buf, "PRIORITY", strconv.Itoa(int(priority))); err != nil {
+		return fmt.Errorf("fake: error writing entry to systemd-journal-remote: %w", err)
+	}
+	for k, v := range vars {
+		if !validVarName.MatchString(k) {
+			return fmt.Errorf("fake: invalid journal-entry variable name %q", k)
+		}
+		if err := export.WriteField(&buf, k, v); err != nil {
+			return fmt.Errorf("fake: error writing entry to systemd-journal-remote: %w", err)
+		}
+	}
+	if err := export.WriteEndOfEntry(&buf); err != nil {
+		return fmt.Errorf("fake: error writing entry to systemd-journal-remote: %w", err)
+	}
+
+	// Invoke the command separately for each entry to ensure that the entry
+	// has been flushed to disk when Send returns.
+	cmd := exec.Command(s.CmdName, "--output", s.OutputPath, "-")
+	cmd.Stdin = &buf
+	cmd.Stdout = os.Stdout
+	cmd.Stderr = os.Stderr
+	return cmd.Run()
+}

+ 3 - 0
daemon/logger/journald/internal/sdjournal/doc.go

@@ -0,0 +1,3 @@
+// Package sdjournal provides a Go interface to the systemd journal read API by
+// wrapping the libsystemd C library.
+package sdjournal // import "github.com/docker/docker/daemon/logger/journald/internal/sdjournal"

+ 264 - 0
daemon/logger/journald/internal/sdjournal/sdjournal.go

@@ -0,0 +1,264 @@
+//go:build linux && cgo && !static_build && journald
+// +build linux,cgo,!static_build,journald
+
+package sdjournal // import "github.com/docker/docker/daemon/logger/journald/internal/sdjournal"
+
+// #cgo pkg-config: libsystemd
+// #include <stdlib.h>
+// #include <systemd/sd-journal.h>
+//
+// static int add_match(sd_journal *j, _GoString_ s) {
+// 	return sd_journal_add_match(j, _GoStringPtr(s), _GoStringLen(s));
+// }
+import "C"
+import (
+	"fmt"
+	"runtime"
+	"strings"
+	"syscall"
+	"time"
+	"unsafe"
+)
+
+// Status is an sd-journal status code.
+type Status int
+
+// Status values for Process() and Wait().
+const (
+	StatusNOP        = Status(C.SD_JOURNAL_NOP)        // SD_JOURNAL_NOP
+	StatusAPPEND     = Status(C.SD_JOURNAL_APPEND)     // SD_JOURNAL_APPEND
+	StatusINVALIDATE = Status(C.SD_JOURNAL_INVALIDATE) // SD_JOURNAL_INVALIDATE
+)
+
+const (
+	// ErrInvalidReadPointer is the error returned when the read pointer is
+	// in an invalid position.
+	ErrInvalidReadPointer = syscall.EADDRNOTAVAIL
+)
+
+// Journal is a handle to an open journald journal.
+type Journal struct {
+	j      *C.sd_journal
+	noCopy noCopy //nolint:structcheck,unused // Exists only to mark values uncopyable for `go vet`.
+}
+
+// Open opens the log journal for reading.
+//
+// The returned Journal value may only be used from the same operating system
+// thread which Open was called from. Using it from only a single goroutine is
+// not sufficient; runtime.LockOSThread must also be used.
+func Open(flags int) (*Journal, error) {
+	j := &Journal{}
+	if rc := C.sd_journal_open(&j.j, C.int(flags)); rc != 0 {
+		return nil, fmt.Errorf("journald: error opening journal: %w", syscall.Errno(-rc))
+	}
+	runtime.SetFinalizer(j, (*Journal).Close)
+	return j, nil
+}
+
+// OpenDir opens the journal files at the specified absolute directory path for
+// reading.
+//
+// The returned Journal value may only be used from the same operating system
+// thread which Open was called from. Using it from only a single goroutine is
+// not sufficient; runtime.LockOSThread must also be used.
+func OpenDir(path string, flags int) (*Journal, error) {
+	j := &Journal{}
+	cpath := C.CString(path)
+	defer C.free(unsafe.Pointer(cpath))
+	if rc := C.sd_journal_open_directory(&j.j, cpath, C.int(flags)); rc != 0 {
+		return nil, fmt.Errorf("journald: error opening journal: %w", syscall.Errno(-rc))
+	}
+	runtime.SetFinalizer(j, (*Journal).Close)
+	return j, nil
+}
+
+// Close closes the journal. The return value is always nil.
+func (j *Journal) Close() error {
+	if j.j != nil {
+		C.sd_journal_close(j.j)
+		runtime.SetFinalizer(j, nil)
+		j.j = nil
+	}
+	return nil
+}
+
+// Process processes journal events.
+//
+// https://www.freedesktop.org/software/systemd/man/sd_journal_process.html
+func (j *Journal) Process() (Status, error) {
+	s := C.sd_journal_process(j.j)
+	if s < 0 {
+		return 0, fmt.Errorf("journald: error processing events: %w", syscall.Errno(-s))
+	}
+	return Status(s), nil
+}
+
+// InitializeInotify sets up change notifications for the journal.
+func (j *Journal) InitializeInotify() error {
+	if rc := C.sd_journal_get_fd(j.j); rc < 0 {
+		return fmt.Errorf("journald: error initializing inotify watches: %w", syscall.Errno(-rc))
+	}
+	return nil
+}
+
+// AddMatch adds a match by which to filter the entries of the journal file.
+//
+// https://www.freedesktop.org/software/systemd/man/sd_journal_add_match.html
+func (j *Journal) AddMatch(field, value string) error {
+	m := field + "=" + value
+	if rc := C.add_match(j.j, m); rc != 0 {
+		return fmt.Errorf("journald: error adding match %q: %w", m, syscall.Errno(-rc))
+	}
+	return nil
+}
+
+// Next advances the read pointer to the next entry.
+func (j *Journal) Next() (bool, error) {
+	rc := C.sd_journal_next(j.j)
+	if rc < 0 {
+		return false, fmt.Errorf("journald: error advancing read pointer: %w", syscall.Errno(-rc))
+	}
+	return rc > 0, nil
+}
+
+// Previous sets back the read pointer to the previous entry.
+func (j *Journal) Previous() (bool, error) {
+	rc := C.sd_journal_previous(j.j)
+	if rc < 0 {
+		return false, fmt.Errorf("journald: error setting back read pointer: %w", syscall.Errno(-rc))
+	}
+	return rc > 0, nil
+}
+
+// PreviousSkip sets back the read pointer by skip entries, returning the number
+// of entries set back. skip must be less than or equal to 2147483647
+// (2**31 - 1).
+//
+// skip == 0 is a special case: PreviousSkip(0) resolves the read pointer to a
+// discrete position without setting it back to a different entry. The trouble
+// is, it always returns zero on recent libsystemd versions. There is no way to
+// tell from the return values whether or not it successfully resolved the read
+// pointer to a discrete entry.
+// https://github.com/systemd/systemd/pull/5930#issuecomment-300878104
+func (j *Journal) PreviousSkip(skip uint) (int, error) {
+	rc := C.sd_journal_previous_skip(j.j, C.uint64_t(skip))
+	if rc < 0 {
+		return 0, fmt.Errorf("journald: error setting back read pointer: %w", syscall.Errno(-rc))
+	}
+	return int(rc), nil
+}
+
+// SeekHead sets the read pointer to the position before the oldest available entry.
+//
+// BUG: SeekHead() followed by Previous() has unexpected behavior.
+// https://github.com/systemd/systemd/issues/17662
+func (j *Journal) SeekHead() error {
+	if rc := C.sd_journal_seek_head(j.j); rc != 0 {
+		return fmt.Errorf("journald: error seeking to head of journal: %w", syscall.Errno(-rc))
+	}
+	return nil
+}
+
+// SeekTail sets the read pointer to the position after the most recent available entry.
+//
+// BUG: SeekTail() followed by Next() has unexpected behavior.
+// https://github.com/systemd/systemd/issues/9934
+func (j *Journal) SeekTail() error {
+	if rc := C.sd_journal_seek_tail(j.j); rc != 0 {
+		return fmt.Errorf("journald: error seeking to tail of journal: %w", syscall.Errno(-rc))
+	}
+	return nil
+}
+
+// SeekRealtime seeks to a position with a realtime (wallclock) timestamp after t.
+//
+// Note that the realtime clock is not necessarily monotonic. If a realtime
+// timestamp is ambiguous, the position seeked to is not defined.
+func (j *Journal) SeekRealtime(t time.Time) error {
+	if rc := C.sd_journal_seek_realtime_usec(j.j, C.uint64_t(t.UnixMicro())); rc != 0 {
+		return fmt.Errorf("journald: error seeking to time %v: %w", t, syscall.Errno(-rc))
+	}
+	return nil
+}
+
+// Wait blocks until the journal gets changed or timeout has elapsed.
+// Pass a negative timeout to wait indefinitely.
+func (j *Journal) Wait(timeout time.Duration) (Status, error) {
+	var dur C.uint64_t
+	if timeout < 0 {
+		// Wait indefinitely.
+		dur = ^C.uint64_t(0) // (uint64_t) -1
+	} else {
+		dur = C.uint64_t(timeout.Microseconds())
+	}
+	s := C.sd_journal_wait(j.j, dur)
+	if s < 0 {
+		return 0, fmt.Errorf("journald: error waiting for event: %w", syscall.Errno(-s))
+	}
+	return Status(s), nil
+}
+
+// Realtime returns the realtime timestamp of the current journal entry.
+func (j *Journal) Realtime() (time.Time, error) {
+	var stamp C.uint64_t
+	if rc := C.sd_journal_get_realtime_usec(j.j, &stamp); rc != 0 {
+		return time.Time{}, fmt.Errorf("journald: error getting journal entry timestamp: %w", syscall.Errno(-rc))
+	}
+	return time.UnixMicro(int64(stamp)), nil
+}
+
+// Data returns all data fields for the current journal entry.
+func (j *Journal) Data() (map[string]string, error) {
+	// Copying all the data fields for the entry into a map is more optimal
+	// than you might think. Doing so has time complexity O(N), where N is
+	// the number of fields in the entry. Looking up a data field in the map
+	// is amortized O(1), so the total complexity to look up M data fields
+	// is O(N+M). By comparison, looking up a data field using the
+	// sd_journal_get_data function has time complexity of O(N) as it is
+	// implemented as a linear search through the entry's fields. Therefore
+	// looking up M data fields in an entry by calling sd_journal_get_data
+	// in a loop would have time complexity of O(N*M).
+
+	m := make(map[string]string)
+	j.restartData()
+	for {
+		var (
+			data unsafe.Pointer
+			len  C.size_t
+		)
+		rc := C.sd_journal_enumerate_data(j.j, &data, &len)
+		if rc == 0 {
+			return m, nil
+		} else if rc < 0 {
+			return m, fmt.Errorf("journald: error enumerating entry data: %w", syscall.Errno(-rc))
+		}
+
+		kv := strings.SplitN(C.GoStringN((*C.char)(data), C.int(len)), "=", 2)
+		m[kv[0]] = kv[1]
+	}
+}
+
+func (j *Journal) restartData() {
+	C.sd_journal_restart_data(j.j)
+}
+
+// SetDataThreshold may be used to change the data field size threshold for data
+// returned by j.Data(). The threshold is a hint only; larger data fields might
+// still be returned.
+//
+// The default threshold is 64K. To retrieve the complete data fields this
+// threshold should be turned off by setting it to 0.
+//
+// https://www.freedesktop.org/software/systemd/man/sd_journal_set_data_threshold.html
+func (j *Journal) SetDataThreshold(v uint) error {
+	if rc := C.sd_journal_set_data_threshold(j.j, C.size_t(v)); rc != 0 {
+		return fmt.Errorf("journald: error setting journal data threshold: %w", syscall.Errno(-rc))
+	}
+	return nil
+}
+
+type noCopy struct{}
+
+func (noCopy) Lock()   {}
+func (noCopy) Unlock() {}

+ 80 - 15
daemon/logger/journald/journald.go

@@ -1,26 +1,70 @@
 //go:build linux
 // +build linux
 
-// Package journald provides the log driver for forwarding server logs
-// to endpoints that receive the systemd format.
 package journald // import "github.com/docker/docker/daemon/logger/journald"
 
 import (
 	"fmt"
 	"strconv"
+	"sync/atomic"
+	"time"
 	"unicode"
 
 	"github.com/coreos/go-systemd/v22/journal"
+
 	"github.com/docker/docker/daemon/logger"
 	"github.com/docker/docker/daemon/logger/loggerutils"
+	"github.com/docker/docker/pkg/stringid"
 )
 
 const name = "journald"
 
+// Well-known user journal fields.
+// https://www.freedesktop.org/software/systemd/man/systemd.journal-fields.html
+const (
+	fieldSyslogIdentifier = "SYSLOG_IDENTIFIER"
+	fieldSyslogTimestamp  = "SYSLOG_TIMESTAMP"
+)
+
+// User journal fields used by the log driver.
+const (
+	fieldContainerID     = "CONTAINER_ID"
+	fieldContainerIDFull = "CONTAINER_ID_FULL"
+	fieldContainerName   = "CONTAINER_NAME"
+	fieldContainerTag    = "CONTAINER_TAG"
+	fieldImageName       = "IMAGE_NAME"
+
+	// Fields used to serialize PLogMetaData.
+
+	fieldPLogID         = "CONTAINER_PARTIAL_ID"
+	fieldPLogOrdinal    = "CONTAINER_PARTIAL_ORDINAL"
+	fieldPLogLast       = "CONTAINER_PARTIAL_LAST"
+	fieldPartialMessage = "CONTAINER_PARTIAL_MESSAGE"
+
+	fieldLogEpoch   = "CONTAINER_LOG_EPOCH"
+	fieldLogOrdinal = "CONTAINER_LOG_ORDINAL"
+)
+
+var waitUntilFlushed func(*journald) error
+
 type journald struct {
+	// Sequence number of the most recent message sent by this instance of
+	// the log driver, starting from 1. Corollary: ordinal == 0 implies no
+	// messages have been sent by this instance.
+	ordinal uint64 // Placed first in struct to ensure 8-byte alignment for atomic ops.
+	// Epoch identifier to distinguish sequence numbers from this instance
+	// vs. other instances.
+	epoch string
+
 	vars map[string]string // additional variables and values to send to the journal along with the log message
 
 	closed chan struct{}
+
+	// Overrides for unit tests.
+
+	sendToJournal   func(message string, priority journal.Priority, vars map[string]string) error
+	journalReadDir  string //nolint:structcheck,unused // Referenced in read.go, which has more restrictive build constraints.
+	readSyncTimeout time.Duration
 }
 
 func init() {
@@ -59,19 +103,26 @@ func New(info logger.Info) (logger.Logger, error) {
 		return nil, fmt.Errorf("journald is not enabled on this host")
 	}
 
+	return new(info)
+}
+
+func new(info logger.Info) (*journald, error) {
 	// parse log tag
 	tag, err := loggerutils.ParseLogTag(info, loggerutils.DefaultTemplate)
 	if err != nil {
 		return nil, err
 	}
 
+	epoch := stringid.GenerateRandomID()
+
 	vars := map[string]string{
-		"CONTAINER_ID":      info.ContainerID[:12],
-		"CONTAINER_ID_FULL": info.ContainerID,
-		"CONTAINER_NAME":    info.Name(),
-		"CONTAINER_TAG":     tag,
-		"IMAGE_NAME":        info.ImageName(),
-		"SYSLOG_IDENTIFIER": tag,
+		fieldContainerID:      info.ContainerID[:12],
+		fieldContainerIDFull:  info.ContainerID,
+		fieldContainerName:    info.Name(),
+		fieldContainerTag:     tag,
+		fieldImageName:        info.ImageName(),
+		fieldSyslogIdentifier: tag,
+		fieldLogEpoch:         epoch,
 	}
 	extraAttrs, err := info.ExtraAttributes(sanitizeKeyMod)
 	if err != nil {
@@ -80,7 +131,12 @@ func New(info logger.Info) (logger.Logger, error) {
 	for k, v := range extraAttrs {
 		vars[k] = v
 	}
-	return &journald{vars: vars, closed: make(chan struct{})}, nil
+	return &journald{
+		epoch:         epoch,
+		vars:          vars,
+		closed:        make(chan struct{}),
+		sendToJournal: journal.Send,
+	}, nil
 }
 
 // We don't actually accept any options, but we have to supply a callback for
@@ -105,12 +161,15 @@ func (s *journald) Log(msg *logger.Message) error {
 	for k, v := range s.vars {
 		vars[k] = v
 	}
+	if !msg.Timestamp.IsZero() {
+		vars[fieldSyslogTimestamp] = msg.Timestamp.Format(time.RFC3339Nano)
+	}
 	if msg.PLogMetaData != nil {
-		vars["CONTAINER_PARTIAL_ID"] = msg.PLogMetaData.ID
-		vars["CONTAINER_PARTIAL_ORDINAL"] = strconv.Itoa(msg.PLogMetaData.Ordinal)
-		vars["CONTAINER_PARTIAL_LAST"] = strconv.FormatBool(msg.PLogMetaData.Last)
+		vars[fieldPLogID] = msg.PLogMetaData.ID
+		vars[fieldPLogOrdinal] = strconv.Itoa(msg.PLogMetaData.Ordinal)
+		vars[fieldPLogLast] = strconv.FormatBool(msg.PLogMetaData.Last)
 		if !msg.PLogMetaData.Last {
-			vars["CONTAINER_PARTIAL_MESSAGE"] = "true"
+			vars[fieldPartialMessage] = "true"
 		}
 	}
 
@@ -118,10 +177,13 @@ func (s *journald) Log(msg *logger.Message) error {
 	source := msg.Source
 	logger.PutMessage(msg)
 
+	seq := atomic.AddUint64(&s.ordinal, 1)
+	vars[fieldLogOrdinal] = strconv.FormatUint(seq, 10)
+
 	if source == "stderr" {
-		return journal.Send(line, journal.PriErr, vars)
+		return s.sendToJournal(line, journal.PriErr, vars)
 	}
-	return journal.Send(line, journal.PriInfo, vars)
+	return s.sendToJournal(line, journal.PriInfo, vars)
 }
 
 func (s *journald) Name() string {
@@ -130,5 +192,8 @@ func (s *journald) Name() string {
 
 func (s *journald) Close() error {
 	close(s.closed)
+	if waitUntilFlushed != nil {
+		return waitUntilFlushed(s)
+	}
 	return nil
 }

+ 0 - 7
daemon/logger/journald/journald_unsupported.go

@@ -1,7 +0,0 @@
-//go:build !linux
-// +build !linux
-
-package journald // import "github.com/docker/docker/daemon/logger/journald"
-
-type journald struct {
-}

+ 380 - 293
daemon/logger/journald/read.go

@@ -3,369 +3,456 @@
 
 package journald // import "github.com/docker/docker/daemon/logger/journald"
 
-// #include <sys/types.h>
-// #include <sys/poll.h>
-// #include <systemd/sd-journal.h>
-// #include <errno.h>
-// #include <stdio.h>
-// #include <stdlib.h>
-// #include <string.h>
-// #include <time.h>
-// #include <unistd.h>
-//
-//static int get_message(sd_journal *j, const char **msg, size_t *length, int *partial)
-//{
-//	int rc;
-//	size_t plength;
-//	*msg = NULL;
-//	*length = 0;
-//	plength = strlen("CONTAINER_PARTIAL_MESSAGE=true");
-//	rc = sd_journal_get_data(j, "CONTAINER_PARTIAL_MESSAGE", (const void **) msg, length);
-//	*partial = ((rc == 0) && (*length == plength) && (memcmp(*msg, "CONTAINER_PARTIAL_MESSAGE=true", plength) == 0));
-//	rc = sd_journal_get_data(j, "MESSAGE", (const void **) msg, length);
-//	if (rc == 0) {
-//		if (*length > 8) {
-//			(*msg) += 8;
-//			*length -= 8;
-//		} else {
-//			*msg = NULL;
-//			*length = 0;
-//			rc = -ENOENT;
-//		}
-//	}
-//	return rc;
-//}
-//static int get_priority(sd_journal *j, int *priority)
-//{
-//	const void *data;
-//	size_t i, length;
-//	int rc;
-//	*priority = -1;
-//	rc = sd_journal_get_data(j, "PRIORITY", &data, &length);
-//	if (rc == 0) {
-//		if ((length > 9) && (strncmp(data, "PRIORITY=", 9) == 0)) {
-//			*priority = 0;
-//			for (i = 9; i < length; i++) {
-//				*priority = *priority * 10 + ((const char *)data)[i] - '0';
-//			}
-//			if (length > 9) {
-//				rc = 0;
-//			}
-//		}
-//	}
-//	return rc;
-//}
-//static int is_attribute_field(const char *msg, size_t length)
-//{
-//	static const struct known_field {
-//		const char *name;
-//		size_t length;
-//	} fields[] = {
-//		{"MESSAGE", sizeof("MESSAGE") - 1},
-//		{"MESSAGE_ID", sizeof("MESSAGE_ID") - 1},
-//		{"PRIORITY", sizeof("PRIORITY") - 1},
-//		{"CODE_FILE", sizeof("CODE_FILE") - 1},
-//		{"CODE_LINE", sizeof("CODE_LINE") - 1},
-//		{"CODE_FUNC", sizeof("CODE_FUNC") - 1},
-//		{"ERRNO", sizeof("ERRNO") - 1},
-//		{"SYSLOG_FACILITY", sizeof("SYSLOG_FACILITY") - 1},
-//		{"SYSLOG_IDENTIFIER", sizeof("SYSLOG_IDENTIFIER") - 1},
-//		{"SYSLOG_PID", sizeof("SYSLOG_PID") - 1},
-//		{"CONTAINER_NAME", sizeof("CONTAINER_NAME") - 1},
-//		{"CONTAINER_ID", sizeof("CONTAINER_ID") - 1},
-//		{"CONTAINER_ID_FULL", sizeof("CONTAINER_ID_FULL") - 1},
-//		{"CONTAINER_TAG", sizeof("CONTAINER_TAG") - 1},
-//	};
-//	unsigned int i;
-//	void *p;
-//	if ((length < 1) || (msg[0] == '_') || ((p = memchr(msg, '=', length)) == NULL)) {
-//		return -1;
-//	}
-//	length = ((const char *) p) - msg;
-//	for (i = 0; i < sizeof(fields) / sizeof(fields[0]); i++) {
-//		if ((fields[i].length == length) && (memcmp(fields[i].name, msg, length) == 0)) {
-//			return -1;
-//		}
-//	}
-//	return 0;
-//}
-//static int get_attribute_field(sd_journal *j, const char **msg, size_t *length)
-//{
-//	int rc;
-//	*msg = NULL;
-//	*length = 0;
-//	while ((rc = sd_journal_enumerate_data(j, (const void **) msg, length)) > 0) {
-//		if (is_attribute_field(*msg, *length) == 0) {
-//			break;
-//		}
-//		rc = -ENOENT;
-//	}
-//	return rc;
-//}
-import "C"
-
 import (
 	"errors"
-	"strings"
+	"runtime"
+	"strconv"
+	"sync/atomic"
 	"time"
-	"unsafe"
 
 	"github.com/coreos/go-systemd/v22/journal"
+	"github.com/sirupsen/logrus"
+
 	"github.com/docker/docker/api/types/backend"
 	"github.com/docker/docker/daemon/logger"
-	"github.com/sirupsen/logrus"
+	"github.com/docker/docker/daemon/logger/journald/internal/sdjournal"
 )
 
-// CErr converts error code returned from a sd_journal_* function
-// (which returns -errno) to a string
-func CErr(ret C.int) string {
-	return C.GoString(C.strerror(-ret))
+const closedDrainTimeout = 5 * time.Second
+
+// Fields which we know are not user-provided attribute fields.
+var wellKnownFields = map[string]bool{
+	"MESSAGE":             true,
+	"MESSAGE_ID":          true,
+	"PRIORITY":            true,
+	"CODE_FILE":           true,
+	"CODE_LINE":           true,
+	"CODE_FUNC":           true,
+	"ERRNO":               true,
+	"SYSLOG_FACILITY":     true,
+	fieldSyslogIdentifier: true,
+	"SYSLOG_PID":          true,
+	fieldSyslogTimestamp:  true,
+	fieldContainerName:    true,
+	fieldContainerID:      true,
+	fieldContainerIDFull:  true,
+	fieldContainerTag:     true,
+	fieldImageName:        true,
+	fieldPLogID:           true,
+	fieldPLogOrdinal:      true,
+	fieldPLogLast:         true,
+	fieldPartialMessage:   true,
+	fieldLogEpoch:         true,
+	fieldLogOrdinal:       true,
 }
 
-func (s *journald) drainJournal(logWatcher *logger.LogWatcher, j *C.sd_journal, oldCursor *C.char, untilUnixMicro uint64) (*C.char, bool, int) {
-	var (
-		msg, data, cursor *C.char
-		length            C.size_t
-		stamp             C.uint64_t
-		priority, partial C.int
-		done              bool
-		shown             int
-	)
-
-	// Walk the journal from here forward until we run out of new entries
-	// or we reach the until value (if provided).
-drain:
-	for {
-		// Try not to send a given entry twice.
-		if oldCursor != nil {
-			for C.sd_journal_test_cursor(j, oldCursor) > 0 {
-				if C.sd_journal_next(j) <= 0 {
-					break drain
-				}
-			}
+type reader struct {
+	s           *journald
+	j           *sdjournal.Journal
+	logWatcher  *logger.LogWatcher
+	config      logger.ReadConfig
+	maxOrdinal  uint64
+	initialized bool
+	ready       chan struct{}
+}
+
+func getMessage(d map[string]string) (line []byte, ok bool) {
+	m, ok := d["MESSAGE"]
+	if ok {
+		line = []byte(m)
+		if d[fieldPartialMessage] != "true" {
+			line = append(line, "\n"...)
 		}
-		// Read and send the logged message, if there is one to read.
-		i := C.get_message(j, &msg, &length, &partial)
-		if i != -C.ENOENT && i != -C.EADDRNOTAVAIL {
-			// Read the entry's timestamp.
-			if C.sd_journal_get_realtime_usec(j, &stamp) != 0 {
-				break
+	}
+	return line, ok
+}
+
+func getPriority(d map[string]string) (journal.Priority, bool) {
+	if pri, ok := d["PRIORITY"]; ok {
+		i, err := strconv.Atoi(pri)
+		return journal.Priority(i), err == nil
+	}
+	return -1, false
+}
+
+// getSource recovers the stream name from the entry data by mapping from the
+// journal priority field back to the stream that we would have assigned that
+// value.
+func getSource(d map[string]string) string {
+	source := ""
+	if priority, ok := getPriority(d); ok {
+		if priority == journal.PriErr {
+			source = "stderr"
+		} else if priority == journal.PriInfo {
+			source = "stdout"
+		}
+	}
+	return source
+}
+
+func getAttrs(d map[string]string) []backend.LogAttr {
+	var attrs []backend.LogAttr
+	for k, v := range d {
+		if k[0] != '_' && !wellKnownFields[k] {
+			attrs = append(attrs, backend.LogAttr{Key: k, Value: v})
+		}
+	}
+	return attrs
+}
+
+// errDrainDone is the error returned by drainJournal to signal that there are
+// no more log entries to send to the log watcher.
+var errDrainDone = errors.New("journald drain done")
+
+// drainJournal reads and sends log messages from the journal.
+//
+// drainJournal returns errDrainDone when a terminal stopping condition has been
+// reached: the watch consumer is gone, a log entry is read which has a
+// timestamp after until (if until is nonzero), or the log driver is closed and
+// the last message logged has been sent from the journal. If the end of the
+// journal is reached without encountering a terminal stopping condition, a nil
+// error is returned.
+func (r *reader) drainJournal() error {
+	if !r.initialized {
+		defer func() {
+			r.signalReady()
+			r.initialized = true
+		}()
+
+		var (
+			err          error
+			seekedToTail bool
+		)
+		if r.config.Tail >= 0 {
+			if r.config.Until.IsZero() {
+				err = r.j.SeekTail()
+				seekedToTail = true
+			} else {
+				err = r.j.SeekRealtime(r.config.Until)
 			}
-			// Break if the timestamp exceeds any provided until flag.
-			if untilUnixMicro != 0 && untilUnixMicro < uint64(stamp) {
-				done = true
-				break
+		} else {
+			if r.config.Since.IsZero() {
+				err = r.j.SeekHead()
+			} else {
+				err = r.j.SeekRealtime(r.config.Since)
 			}
+		}
+		if err != nil {
+			return err
+		}
 
-			// Set up the time and text of the entry.
-			timestamp := time.Unix(int64(stamp)/1000000, (int64(stamp)%1000000)*1000)
-			line := C.GoBytes(unsafe.Pointer(msg), C.int(length))
-			if partial == 0 {
-				line = append(line, "\n"...)
+		// SeekTail() followed by Next() behaves incorrectly, so we need
+		// to work around the bug by ensuring the first discrete
+		// movement of the read pointer is Previous() or PreviousSkip().
+		// PreviousSkip() is called inside the loop when config.Tail > 0
+		// so the only special case requiring special handling is
+		// config.Tail == 0.
+		// https://github.com/systemd/systemd/issues/9934
+		if seekedToTail && r.config.Tail == 0 {
+			// Resolve the read pointer to the last entry in the
+			// journal so that the call to Next() inside the loop
+			// advances past it.
+			if ok, err := r.j.Previous(); err != nil || !ok {
+				return err
 			}
-			// Recover the stream name by mapping
-			// from the journal priority back to
-			// the stream that we would have
-			// assigned that value.
-			source := ""
-			if C.get_priority(j, &priority) == 0 {
-				if priority == C.int(journal.PriErr) {
-					source = "stderr"
-				} else if priority == C.int(journal.PriInfo) {
-					source = "stdout"
-				}
+		}
+	}
+
+	for i := 0; ; i++ {
+		if !r.initialized && i == 0 && r.config.Tail > 0 {
+			if n, err := r.j.PreviousSkip(uint(r.config.Tail)); err != nil || n == 0 {
+				return err
 			}
-			// Retrieve the values of any variables we're adding to the journal.
-			var attrs []backend.LogAttr
-			C.sd_journal_restart_data(j)
-			for C.get_attribute_field(j, &data, &length) > C.int(0) {
-				kv := strings.SplitN(C.GoStringN(data, C.int(length)), "=", 2)
-				attrs = append(attrs, backend.LogAttr{Key: kv[0], Value: kv[1]})
+		} else if ok, err := r.j.Next(); err != nil || !ok {
+			return err
+		}
+
+		if !r.initialized && i == 0 {
+			// The cursor is in a position which will be unaffected
+			// by subsequent logging.
+			r.signalReady()
+		}
+
+		// Read the entry's timestamp.
+		timestamp, err := r.j.Realtime()
+		if err != nil {
+			return err
+		}
+		// Check if the PreviousSkip went too far back. Check only the
+		// initial position as we are comparing wall-clock timestamps,
+		// which may not be monotonic. We don't want to skip over
+		// messages sent later in time just because the clock moved
+		// backwards.
+		if !r.initialized && i == 0 && r.config.Tail > 0 && timestamp.Before(r.config.Since) {
+			r.j.SeekRealtime(r.config.Since)
+			continue
+		}
+		if !r.config.Until.IsZero() && r.config.Until.Before(timestamp) {
+			return errDrainDone
+		}
+
+		// Read and send the logged message, if there is one to read.
+		data, err := r.j.Data()
+		if err != nil {
+			return err
+		}
+
+		if data[fieldLogEpoch] == r.s.epoch {
+			seq, err := strconv.ParseUint(data[fieldLogOrdinal], 10, 64)
+			if err == nil && seq > r.maxOrdinal {
+				r.maxOrdinal = seq
 			}
+		}
 
+		if line, ok := getMessage(data); ok {
 			// Send the log message, unless the consumer is gone
-			select {
-			case <-logWatcher.WatchConsumerGone():
-				done = true // we won't be able to write anything anymore
-				break drain
-			case logWatcher.Msg <- &logger.Message{
+			msg := &logger.Message{
 				Line:      line,
-				Source:    source,
+				Source:    getSource(data),
 				Timestamp: timestamp.In(time.UTC),
-				Attrs:     attrs,
-			}:
-				shown++
+				Attrs:     getAttrs(data),
 			}
-			// Call sd_journal_process() periodically during the processing loop
-			// to close any opened file descriptors for rotated (deleted) journal files.
-			if shown%1024 == 0 {
-				if ret := C.sd_journal_process(j); ret < 0 {
-					// log a warning but ignore it for now
-					logrus.WithField("container", s.vars["CONTAINER_ID_FULL"]).
-						WithField("error", CErr(ret)).
-						Warn("journald: error processing journal")
+			// The daemon timestamp will differ from the "trusted"
+			// timestamp of when the event was received by journald.
+			// We can efficiently seek around the journal by the
+			// event timestamp, and the event timestamp is what
+			// journalctl displays. The daemon timestamp is just an
+			// application-supplied field with no special
+			// significance; libsystemd won't help us seek to the
+			// entry with the closest timestamp.
+			/*
+				if sts := data["SYSLOG_TIMESTAMP"]; sts != "" {
+					if tv, err := time.Parse(time.RFC3339Nano, sts); err == nil {
+						msg.Timestamp = tv
+					}
 				}
+			*/
+			select {
+			case <-r.logWatcher.WatchConsumerGone():
+				return errDrainDone
+			case r.logWatcher.Msg <- msg:
 			}
 		}
-		// If we're at the end of the journal, we're done (for now).
-		if C.sd_journal_next(j) <= 0 {
-			break
-		}
-	}
 
-	// free(NULL) is safe
-	C.free(unsafe.Pointer(oldCursor))
-	if C.sd_journal_get_cursor(j, &cursor) != 0 {
-		// ensure that we won't be freeing an address that's invalid
-		cursor = nil
+		// Call sd_journal_process() periodically during the processing loop
+		// to close any opened file descriptors for rotated (deleted) journal files.
+		if i != 0 && i%1024 == 0 {
+			if _, err := r.j.Process(); err != nil {
+				// log a warning but ignore it for now
+				logrus.WithField("container", r.s.vars[fieldContainerIDFull]).
+					WithField("error", err).
+					Warn("journald: error processing journal")
+			}
+		}
 	}
-	return cursor, done, shown
 }
 
-func (s *journald) followJournal(logWatcher *logger.LogWatcher, j *C.sd_journal, cursor *C.char, untilUnixMicro uint64) *C.char {
-	defer close(logWatcher.Msg)
+func (r *reader) readJournal() error {
+	caughtUp := atomic.LoadUint64(&r.s.ordinal)
+	if err := r.drainJournal(); err != nil {
+		if err != errDrainDone {
+			return err
+		}
+		return nil
+	}
 
-	waitTimeout := C.uint64_t(250000) // 0.25s
+	var drainTimeout <-chan time.Time
+	if !r.config.Follow {
+		if r.s.readSyncTimeout == 0 {
+			return nil
+		}
+		tmr := time.NewTimer(r.s.readSyncTimeout)
+		defer tmr.Stop()
+		drainTimeout = tmr.C
+	}
 
-LOOP:
 	for {
-		status := C.sd_journal_wait(j, waitTimeout)
-		if status < 0 {
-			logWatcher.Err <- errors.New("error waiting for journal: " + CErr(status))
-			break
+		status, err := r.j.Wait(250 * time.Millisecond)
+		if err != nil {
+			return err
 		}
 		select {
-		case <-logWatcher.WatchConsumerGone():
-			break LOOP // won't be able to write anything anymore
-		case <-s.closed:
+		case <-r.logWatcher.WatchConsumerGone():
+			return nil // won't be able to write anything anymore
+		case <-drainTimeout:
+			// Container is gone but we haven't found the end of the
+			// logs within the timeout. Maybe it was dropped by
+			// journald, e.g. due to rate-limiting.
+			return nil
+		case <-r.s.closed:
 			// container is gone, drain journal
+			lastSeq := atomic.LoadUint64(&r.s.ordinal)
+			if r.maxOrdinal >= lastSeq {
+				// All caught up with the logger!
+				return nil
+			}
+			if drainTimeout == nil {
+				tmr := time.NewTimer(closedDrainTimeout)
+				defer tmr.Stop()
+				drainTimeout = tmr.C
+			}
 		default:
 			// container is still alive
-			if status == C.SD_JOURNAL_NOP {
+			if status == sdjournal.StatusNOP {
 				// no new data -- keep waiting
 				continue
 			}
 		}
-		newCursor, done, recv := s.drainJournal(logWatcher, j, cursor, untilUnixMicro)
-		cursor = newCursor
-		if done || (status == C.SD_JOURNAL_NOP && recv == 0) {
-			break
+		err = r.drainJournal()
+		if err != nil {
+			if err != errDrainDone {
+				return err
+			}
+			return nil
+		}
+		if !r.config.Follow && r.s.readSyncTimeout > 0 && r.maxOrdinal >= caughtUp {
+			return nil
 		}
 	}
-
-	return cursor
 }
 
-func (s *journald) readLogs(logWatcher *logger.LogWatcher, config logger.ReadConfig) {
-	var (
-		j              *C.sd_journal
-		cmatch, cursor *C.char
-		stamp          C.uint64_t
-		sinceUnixMicro uint64
-		untilUnixMicro uint64
-	)
+func (r *reader) readLogs() {
+	defer close(r.logWatcher.Msg)
+
+	// Make sure the ready channel is closed in the event of an early
+	// return.
+	defer r.signalReady()
+
+	// Quoting https://www.freedesktop.org/software/systemd/man/sd-journal.html:
+	//     Functions that operate on sd_journal objects are thread
+	//     agnostic — given sd_journal pointer may only be used from one
+	//     specific thread at all times (and it has to be the very same one
+	//     during the entire lifetime of the object), but multiple,
+	//     independent threads may use multiple, independent objects safely.
+	//
+	// sdjournal.Open returns a value which wraps an sd_journal pointer so
+	// we need to abide by those rules.
+	runtime.LockOSThread()
+	defer runtime.UnlockOSThread()
 
 	// Get a handle to the journal.
-	if rc := C.sd_journal_open(&j, C.int(0)); rc != 0 {
-		logWatcher.Err <- errors.New("error opening journal: " + CErr(rc))
-		close(logWatcher.Msg)
+	var err error
+	if r.s.journalReadDir != "" {
+		r.j, err = sdjournal.OpenDir(r.s.journalReadDir, 0)
+	} else {
+		r.j, err = sdjournal.Open(0)
+	}
+	if err != nil {
+		r.logWatcher.Err <- err
 		return
 	}
-	if config.Follow {
+	defer r.j.Close()
+
+	if r.config.Follow {
 		// Initialize library inotify watches early
-		if rc := C.sd_journal_get_fd(j); rc < 0 {
-			logWatcher.Err <- errors.New("error getting journald fd: " + CErr(rc))
-			close(logWatcher.Msg)
+		if err := r.j.InitializeInotify(); err != nil {
+			r.logWatcher.Err <- err
 			return
 		}
 	}
-	// If we end up following the log, we can set the journal context
-	// pointer and the channel pointer to nil so that we won't close them
-	// here, potentially while the goroutine that uses them is still
-	// running.  Otherwise, close them when we return from this function.
-	following := false
-	defer func() {
-		if !following {
-			close(logWatcher.Msg)
-		}
-		C.sd_journal_close(j)
-	}()
+
 	// Remove limits on the size of data items that we'll retrieve.
-	if rc := C.sd_journal_set_data_threshold(j, C.size_t(0)); rc != 0 {
-		logWatcher.Err <- errors.New("error setting journal data threshold: " + CErr(rc))
+	if err := r.j.SetDataThreshold(0); err != nil {
+		r.logWatcher.Err <- err
 		return
 	}
 	// Add a match to have the library do the searching for us.
-	cmatch = C.CString("CONTAINER_ID_FULL=" + s.vars["CONTAINER_ID_FULL"])
-	defer C.free(unsafe.Pointer(cmatch))
-	if rc := C.sd_journal_add_match(j, unsafe.Pointer(cmatch), C.strlen(cmatch)); rc != 0 {
-		logWatcher.Err <- errors.New("error setting journal match: " + CErr(rc))
+	if err := r.j.AddMatch(fieldContainerIDFull, r.s.vars[fieldContainerIDFull]); err != nil {
+		r.logWatcher.Err <- err
 		return
 	}
-	// If we have a cutoff time, convert it to Unix time once.
-	if !config.Since.IsZero() {
-		nano := config.Since.UnixNano()
-		sinceUnixMicro = uint64(nano / 1000)
+
+	if err := r.readJournal(); err != nil {
+		r.logWatcher.Err <- err
+		return
 	}
-	// If we have an until value, convert it too
-	if !config.Until.IsZero() {
-		nano := config.Until.UnixNano()
-		untilUnixMicro = uint64(nano / 1000)
+}
+
+func (r *reader) signalReady() {
+	select {
+	case <-r.ready:
+	default:
+		close(r.ready)
 	}
-	if config.Tail >= 0 {
-		// If until time provided, start from there.
-		// Otherwise start at the end of the journal.
-		if untilUnixMicro != 0 {
-			if rc := C.sd_journal_seek_realtime_usec(j, C.uint64_t(untilUnixMicro)); rc != 0 {
-				logWatcher.Err <- errors.New("error seeking provided until value: " + CErr(rc))
-				return
-			}
-		} else if rc := C.sd_journal_seek_tail(j); rc != 0 {
-			logWatcher.Err <- errors.New("error seeking to end of journal: " + CErr(rc))
-			return
+}
+
+func (s *journald) ReadLogs(config logger.ReadConfig) *logger.LogWatcher {
+	r := &reader{
+		s:          s,
+		logWatcher: logger.NewLogWatcher(),
+		config:     config,
+		ready:      make(chan struct{}),
+	}
+	go r.readLogs()
+	// Block until the reader is in position to read from the current config
+	// location to prevent race conditions in tests.
+	<-r.ready
+	return r.logWatcher
+}
+
+func waitUntilFlushedImpl(s *journald) error {
+	if s.readSyncTimeout == 0 {
+		return nil
+	}
+
+	ordinal := atomic.LoadUint64(&s.ordinal)
+	if ordinal == 0 {
+		return nil // No logs were sent; nothing to wait for.
+	}
+
+	flushed := make(chan error)
+	go func() {
+		defer close(flushed)
+		runtime.LockOSThread()
+
+		var (
+			j   *sdjournal.Journal
+			err error
+		)
+		if s.journalReadDir != "" {
+			j, err = sdjournal.OpenDir(s.journalReadDir, 0)
+		} else {
+			j, err = sdjournal.Open(0)
 		}
-		// (Try to) skip backwards by the requested number of lines...
-		if C.sd_journal_previous_skip(j, C.uint64_t(config.Tail)) >= 0 {
-			// ...but not before "since"
-			if sinceUnixMicro != 0 &&
-				C.sd_journal_get_realtime_usec(j, &stamp) == 0 &&
-				uint64(stamp) < sinceUnixMicro {
-				C.sd_journal_seek_realtime_usec(j, C.uint64_t(sinceUnixMicro))
-			}
+		if err != nil {
+			flushed <- err
+			return
 		}
-	} else {
-		// Start at the beginning of the journal.
-		if rc := C.sd_journal_seek_head(j); rc != 0 {
-			logWatcher.Err <- errors.New("error seeking to start of journal: " + CErr(rc))
+		defer j.Close()
+
+		if err := j.AddMatch(fieldContainerIDFull, s.vars[fieldContainerIDFull]); err != nil {
+			flushed <- err
 			return
 		}
-		// If we have a cutoff date, fast-forward to it.
-		if sinceUnixMicro != 0 {
-			if rc := C.sd_journal_seek_realtime_usec(j, C.uint64_t(sinceUnixMicro)); rc != 0 {
-				logWatcher.Err <- errors.New("error seeking to start time in journal: " + CErr(rc))
-				return
-			}
+		if err := j.AddMatch(fieldLogEpoch, s.epoch); err != nil {
+			flushed <- err
+			return
 		}
-		if rc := C.sd_journal_next(j); rc < 0 {
-			logWatcher.Err <- errors.New("error skipping to next journal entry: " + CErr(rc))
+		if err := j.AddMatch(fieldLogOrdinal, strconv.FormatUint(ordinal, 10)); err != nil {
+			flushed <- err
 			return
 		}
-	}
-	if config.Tail != 0 { // special case for --tail 0
-		cursor, _, _ = s.drainJournal(logWatcher, j, nil, untilUnixMicro)
-	}
-	if config.Follow {
-		cursor = s.followJournal(logWatcher, j, cursor, untilUnixMicro)
-		// Let followJournal handle freeing the journal context
-		// object and closing the channel.
-		following = true
-	}
 
-	C.free(unsafe.Pointer(cursor))
+		deadline := time.Now().Add(s.readSyncTimeout)
+		for time.Now().Before(deadline) {
+			if ok, err := j.Next(); ok {
+				// Found it!
+				return
+			} else if err != nil {
+				flushed <- err
+				return
+			}
+			if _, err := j.Wait(100 * time.Millisecond); err != nil {
+				flushed <- err
+				return
+			}
+		}
+		logrus.WithField("container", s.vars[fieldContainerIDFull]).
+			Warn("journald: deadline exceeded waiting for logs to be committed to journal")
+	}()
+	return <-flushed
 }
 
-func (s *journald) ReadLogs(config logger.ReadConfig) *logger.LogWatcher {
-	logWatcher := logger.NewLogWatcher()
-	go s.readLogs(logWatcher, config)
-	return logWatcher
+func init() {
+	waitUntilFlushed = waitUntilFlushedImpl
 }

+ 0 - 7
daemon/logger/journald/read_native.go

@@ -1,7 +0,0 @@
-//go:build linux && cgo && !static_build && journald
-// +build linux,cgo,!static_build,journald
-
-package journald // import "github.com/docker/docker/daemon/logger/journald"
-
-// #cgo pkg-config: libsystemd
-import "C"

+ 105 - 0
daemon/logger/journald/read_test.go

@@ -0,0 +1,105 @@
+//go:build linux && cgo && !static_build && journald
+// +build linux,cgo,!static_build,journald
+
+package journald // import "github.com/docker/docker/daemon/logger/journald"
+
+import (
+	"testing"
+	"time"
+
+	"github.com/coreos/go-systemd/v22/journal"
+	"gotest.tools/v3/assert"
+
+	"github.com/docker/docker/daemon/logger"
+	"github.com/docker/docker/daemon/logger/journald/internal/fake"
+	"github.com/docker/docker/daemon/logger/loggertest"
+)
+
+func TestLogRead(t *testing.T) {
+	r := loggertest.Reader{
+		Factory: func(t *testing.T, info logger.Info) func(*testing.T) logger.Logger {
+			journalDir := t.TempDir()
+
+			// Fill the journal with irrelevant events which the
+			// LogReader needs to filter out.
+			rotatedJournal := fake.NewT(t, journalDir+"/rotated.journal")
+			rotatedJournal.AssignEventTimestampFromSyslogTimestamp = true
+			l, err := new(logger.Info{
+				ContainerID:   "wrongone0001",
+				ContainerName: "fake",
+			})
+			assert.NilError(t, err)
+			l.sendToJournal = rotatedJournal.Send
+			assert.NilError(t, l.Log(&logger.Message{Source: "stdout", Timestamp: time.Now().Add(-1 * 30 * time.Minute), Line: []byte("stdout of a different container in a rotated journal file")}))
+			assert.NilError(t, l.Log(&logger.Message{Source: "stderr", Timestamp: time.Now().Add(-1 * 30 * time.Minute), Line: []byte("stderr of a different container in a rotated journal file")}))
+			assert.NilError(t, rotatedJournal.Send("a log message from a totally different process in a rotated journal", journal.PriInfo, nil))
+
+			activeJournal := fake.NewT(t, journalDir+"/fake.journal")
+			activeJournal.AssignEventTimestampFromSyslogTimestamp = true
+			l, err = new(logger.Info{
+				ContainerID:   "wrongone0002",
+				ContainerName: "fake",
+			})
+			assert.NilError(t, err)
+			l.sendToJournal = activeJournal.Send
+			assert.NilError(t, l.Log(&logger.Message{Source: "stdout", Timestamp: time.Now().Add(-1 * 30 * time.Minute), Line: []byte("stdout of a different container in the active journal file")}))
+			assert.NilError(t, l.Log(&logger.Message{Source: "stderr", Timestamp: time.Now().Add(-1 * 30 * time.Minute), Line: []byte("stderr of a different container in the active journal file")}))
+			assert.NilError(t, rotatedJournal.Send("a log message from a totally different process in the active journal", journal.PriInfo, nil))
+
+			return func(t *testing.T) logger.Logger {
+				s := make(chan sendit, 100)
+				t.Cleanup(func() { close(s) })
+				go func() {
+					for m := range s {
+						<-m.after
+						activeJournal.Send(m.message, m.priority, m.vars)
+						if m.sent != nil {
+							close(m.sent)
+						}
+					}
+				}()
+				l, err := new(info)
+				assert.NilError(t, err)
+				l.journalReadDir = journalDir
+
+				sl := &syncLogger{journald: l}
+				l.sendToJournal = func(message string, priority journal.Priority, vars map[string]string) error {
+					sent := make(chan struct{})
+					s <- sendit{
+						message:  message,
+						priority: priority,
+						vars:     vars,
+						after:    time.After(150 * time.Millisecond),
+						sent:     sent,
+					}
+					sl.waitOn = sent
+					return nil
+				}
+				l.readSyncTimeout = 3 * time.Second
+				return sl
+			}
+		},
+	}
+	t.Run("Tail", r.TestTail)
+	t.Run("Follow", r.TestFollow)
+}
+
+type sendit struct {
+	message  string
+	priority journal.Priority
+	vars     map[string]string
+	after    <-chan time.Time
+	sent     chan<- struct{}
+}
+
+type syncLogger struct {
+	*journald
+	waitOn <-chan struct{}
+}
+
+func (l *syncLogger) Sync() error {
+	if l.waitOn != nil {
+		<-l.waitOn
+	}
+	return nil
+}

+ 22 - 4
daemon/logger/loggertest/logreader.go

@@ -10,11 +10,24 @@ import (
 	"github.com/google/go-cmp/cmp"
 	"github.com/google/go-cmp/cmp/cmpopts"
 	"gotest.tools/v3/assert"
+	"gotest.tools/v3/assert/opt"
 
 	"github.com/docker/docker/api/types/backend"
 	"github.com/docker/docker/daemon/logger"
 )
 
+type syncer interface {
+	// Sync commits the current logs to stable storage such that the most
+	// recently-logged message can be immediately read back by a LogReader.
+	Sync() error
+}
+
+func syncLogger(t *testing.T, l logger.Logger) {
+	if sl, ok := l.(syncer); ok {
+		assert.NilError(t, sl.Sync())
+	}
+}
+
 // Reader tests that a logger.LogReader implementation behaves as it should.
 type Reader struct {
 	// Factory returns a function which constructs loggers for the container
@@ -25,6 +38,9 @@ type Reader struct {
 }
 
 var compareLog cmp.Options = []cmp.Option{
+	// Not all log drivers can round-trip timestamps at full nanosecond
+	// precision.
+	opt.TimeWithThreshold(time.Millisecond),
 	// The json-log driver does not round-trip PLogMetaData and API users do
 	// not expect it.
 	cmpopts.IgnoreFields(logger.Message{}, "PLogMetaData"),
@@ -47,6 +63,7 @@ func makeTestMessages() []*logger.Message {
 		{Source: "stderr", Timestamp: time.Now().Add(-1 * 15 * time.Minute), Line: []byte("continued"), PLogMetaData: &backend.PartialLogMetaData{ID: "bbbbbbbb", Ordinal: 2, Last: true}},
 		{Source: "stderr", Timestamp: time.Now().Add(-1 * 10 * time.Minute), Line: []byte("a really long message " + strings.Repeat("a", 4096))},
 		{Source: "stderr", Timestamp: time.Now().Add(-1 * 10 * time.Minute), Line: []byte("just one more message")},
+		{Source: "stdout", Timestamp: time.Now().Add(-1 * 90 * time.Minute), Line: []byte("someone adjusted the clock")},
 	}
 
 }
@@ -311,6 +328,7 @@ func (tr Reader) TestFollow(t *testing.T) {
 
 		mm := makeTestMessages()
 		logMessages(t, l, mm[0:2])
+		syncLogger(t, l)
 
 		lw := l.(logger.LogReader).ReadLogs(logger.ReadConfig{Tail: 0, Follow: true})
 		defer lw.ConsumerGone()
@@ -337,6 +355,7 @@ func (tr Reader) TestFollow(t *testing.T) {
 
 		mm := makeTestMessages()
 		expected := logMessages(t, l, mm[0:2])[1:]
+		syncLogger(t, l)
 
 		lw := l.(logger.LogReader).ReadLogs(logger.ReadConfig{Tail: 1, Follow: true})
 		defer lw.ConsumerGone()
@@ -519,13 +538,12 @@ func readMessage(t *testing.T, lw *logger.LogWatcher) *logger.Message {
 			select {
 			case err, open := <-lw.Err:
 				t.Errorf("unexpected receive on lw.Err with closed lw.Msg: err=%v, open=%v", err, open)
-				return nil
 			default:
 			}
+			return nil
 		}
-		if msg != nil {
-			t.Logf("loggertest: ReadMessage [%v %v] %s", msg.Source, msg.Timestamp, msg.Line)
-		}
+		assert.Assert(t, msg != nil)
+		t.Logf("[%v] %s: %s", msg.Timestamp, msg.Source, msg.Line)
 		return msg
 	}
 }

+ 10 - 35
daemon/logger/loggerutils/follow.go

@@ -4,7 +4,6 @@ import (
 	"fmt"
 	"io"
 	"os"
-	"time"
 
 	"github.com/docker/docker/daemon/logger"
 	"github.com/pkg/errors"
@@ -12,10 +11,10 @@ import (
 )
 
 type follow struct {
-	LogFile      *LogFile
-	Watcher      *logger.LogWatcher
-	Decoder      Decoder
-	Since, Until time.Time
+	LogFile   *LogFile
+	Watcher   *logger.LogWatcher
+	Decoder   Decoder
+	Forwarder *forwarder
 
 	log *logrus.Entry
 	c   chan logPos
@@ -49,7 +48,7 @@ func (fl *follow) Do(f *os.File, read logPos) {
 				fl.Watcher.Err <- err
 				return
 			}
-			if fl.decode(f) {
+			if !fl.forward(f) {
 				return
 			}
 
@@ -91,7 +90,7 @@ func (fl *follow) Do(f *os.File, read logPos) {
 			read.size = 0
 		}
 
-		if fl.decode(io.NewSectionReader(f, read.size, wrote.size-read.size)) {
+		if !fl.forward(io.NewSectionReader(f, read.size, wrote.size-read.size)) {
 			return
 		}
 		read = wrote
@@ -132,34 +131,10 @@ func (fl *follow) nextPos(current logPos) (next logPos, ok bool) {
 	return next, true
 }
 
-// decode decodes log messages from r and sends messages with timestamps between
-// Since and Until to the log watcher.
+// forward decodes log messages from r and forwards them to the log watcher.
 //
-// The return value, done, signals whether following should end due to a
-// condition encountered during decode.
-func (fl *follow) decode(r io.Reader) (done bool) {
+// The return value, cont, signals whether following should continue.
+func (fl *follow) forward(r io.Reader) (cont bool) {
 	fl.Decoder.Reset(r)
-	for {
-		msg, err := fl.Decoder.Decode()
-		if err != nil {
-			if errors.Is(err, io.EOF) {
-				return false
-			}
-			fl.Watcher.Err <- err
-			return true
-		}
-
-		if !fl.Since.IsZero() && msg.Timestamp.Before(fl.Since) {
-			continue
-		}
-		if !fl.Until.IsZero() && msg.Timestamp.After(fl.Until) {
-			return true
-		}
-		// send the message, unless the consumer is gone
-		select {
-		case fl.Watcher.Msg <- msg:
-		case <-fl.Watcher.WatchConsumerGone():
-			return true
-		}
-	}
+	return fl.Forwarder.Do(fl.Watcher, fl.Decoder)
 }

+ 41 - 20
daemon/logger/loggerutils/logfile.go

@@ -411,6 +411,7 @@ func (w *LogFile) readLogsLocked(currentPos logPos, config logger.ReadConfig, wa
 	defer dec.Close()
 
 	currentChunk := io.NewSectionReader(currentFile, 0, currentPos.size)
+	fwd := newForwarder(config)
 
 	if config.Tail != 0 {
 		// TODO(@cpuguy83): Instead of opening every file, only get the files which
@@ -449,7 +450,7 @@ func (w *LogFile) readLogsLocked(currentPos logPos, config logger.ReadConfig, wa
 			readers = append(readers, currentChunk)
 		}
 
-		ok := tailFiles(readers, watcher, dec, w.getTailReader, config)
+		ok := tailFiles(readers, watcher, dec, w.getTailReader, config.Tail, fwd)
 		closeFiles()
 		if !ok {
 			return
@@ -463,11 +464,10 @@ func (w *LogFile) readLogsLocked(currentPos logPos, config logger.ReadConfig, wa
 	}
 
 	(&follow{
-		LogFile: w,
-		Watcher: watcher,
-		Decoder: dec,
-		Since:   config.Since,
-		Until:   config.Until,
+		LogFile:   w,
+		Watcher:   watcher,
+		Decoder:   dec,
+		Forwarder: fwd,
 	}).Do(currentFile, currentPos)
 }
 
@@ -573,7 +573,7 @@ func decompress(dst io.WriteSeeker, src io.ReadSeeker) error {
 	return rc.Close()
 }
 
-func tailFiles(files []SizeReaderAt, watcher *logger.LogWatcher, dec Decoder, getTailReader GetTailReaderFunc, config logger.ReadConfig) (cont bool) {
+func tailFiles(files []SizeReaderAt, watcher *logger.LogWatcher, dec Decoder, getTailReader GetTailReaderFunc, nLines int, fwd *forwarder) (cont bool) {
 	ctx, cancel := context.WithCancel(context.Background())
 	defer cancel()
 
@@ -583,20 +583,18 @@ func tailFiles(files []SizeReaderAt, watcher *logger.LogWatcher, dec Decoder, ge
 		select {
 		case <-ctx.Done():
 		case <-watcher.WatchConsumerGone():
-			cont = false
 			cancel()
 		}
 	}()
 
 	readers := make([]io.Reader, 0, len(files))
 
-	if config.Tail > 0 {
-		nLines := config.Tail
+	if nLines > 0 {
 		for i := len(files) - 1; i >= 0 && nLines > 0; i-- {
 			tail, n, err := getTailReader(ctx, files[i], nLines)
 			if err != nil {
 				watcher.Err <- errors.Wrap(err, "error finding file position to start log tailing")
-				return
+				return false
 			}
 			nLines -= n
 			readers = append([]io.Reader{tail}, readers...)
@@ -609,24 +607,47 @@ func tailFiles(files []SizeReaderAt, watcher *logger.LogWatcher, dec Decoder, ge
 
 	rdr := io.MultiReader(readers...)
 	dec.Reset(rdr)
+	return fwd.Do(watcher, dec)
+}
+
+type forwarder struct {
+	since, until time.Time
+}
 
+func newForwarder(config logger.ReadConfig) *forwarder {
+	return &forwarder{since: config.Since, until: config.Until}
+}
+
+// Do reads log messages from dec and sends the messages matching the filter
+// conditions to watcher. Do returns cont=true iff it has read all messages from
+// dec without encountering a message with a timestamp which is after the
+// configured until time.
+func (fwd *forwarder) Do(watcher *logger.LogWatcher, dec Decoder) (cont bool) {
 	for {
 		msg, err := dec.Decode()
 		if err != nil {
-			if !errors.Is(err, io.EOF) {
-				watcher.Err <- err
+			if errors.Is(err, io.EOF) {
+				return true
 			}
-			return
+			watcher.Err <- err
+			return false
 		}
-		if !config.Since.IsZero() && msg.Timestamp.Before(config.Since) {
-			continue
+		if !fwd.since.IsZero() {
+			if msg.Timestamp.Before(fwd.since) {
+				continue
+			}
+			// We've found our first message with a timestamp >= since. As message
+			// timestamps might not be monotonic, we need to skip the since check for all
+			// subsequent messages so we do not filter out later messages which happen to
+			// have timestamps before since.
+			fwd.since = time.Time{}
 		}
-		if !config.Until.IsZero() && msg.Timestamp.After(config.Until) {
-			return
+		if !fwd.until.IsZero() && msg.Timestamp.After(fwd.until) {
+			return false
 		}
 		select {
-		case <-ctx.Done():
-			return
+		case <-watcher.WatchConsumerGone():
+			return false
 		case watcher.Msg <- msg:
 		}
 	}

+ 4 - 2
daemon/logger/loggerutils/logfile_test.go

@@ -64,19 +64,21 @@ func TestTailFiles(t *testing.T) {
 	for desc, config := range map[string]logger.ReadConfig{} {
 		t.Run(desc, func(t *testing.T) {
 			started := make(chan struct{})
+			fwd := newForwarder(config)
 			go func() {
 				close(started)
-				tailFiles(files, watcher, dec, tailReader, config)
+				tailFiles(files, watcher, dec, tailReader, config.Tail, fwd)
 			}()
 			<-started
 		})
 	}
 
 	config := logger.ReadConfig{Tail: 2}
+	fwd := newForwarder(config)
 	started := make(chan struct{})
 	go func() {
 		close(started)
-		tailFiles(files, watcher, dec, tailReader, config)
+		tailFiles(files, watcher, dec, tailReader, config.Tail, fwd)
 	}()
 	<-started
 

+ 62 - 0
hack/dind-systemd

@@ -0,0 +1,62 @@
+#!/bin/bash
+set -e
+container=docker
+export container
+
+if [ $# -eq 0 ]; then
+	echo >&2 'ERROR: No command specified. You probably want to run `journalctl -f`, or maybe `bash`?'
+	exit 1
+fi
+
+if [ ! -t 0 ]; then
+	echo >&2 'ERROR: TTY needs to be enabled (`docker run -t ...`).'
+	exit 1
+fi
+
+env > /etc/docker-entrypoint-env
+
+cat > /etc/systemd/system/docker-entrypoint.target << EOF
+[Unit]
+Description=the target for docker-entrypoint.service
+Requires=docker-entrypoint.service systemd-logind.service systemd-user-sessions.service
+EOF
+
+quoted_args="$(printf " %q" "${@}")"
+echo "${quoted_args}" > /etc/docker-entrypoint-cmd
+
+cat > /etc/systemd/system/docker-entrypoint.service << EOF
+[Unit]
+Description=docker-entrypoint.service
+
+[Service]
+ExecStart=/bin/bash -exc "source /etc/docker-entrypoint-cmd"
+# EXIT_STATUS is either an exit code integer or a signal name string, see systemd.exec(5)
+ExecStopPost=/bin/bash -ec "if echo \${EXIT_STATUS} | grep [A-Z] > /dev/null; then echo >&2 \"got signal \${EXIT_STATUS}\"; systemctl exit \$(( 128 + \$( kill -l \${EXIT_STATUS} ) )); else systemctl exit \${EXIT_STATUS}; fi"
+StandardInput=tty-force
+StandardOutput=inherit
+StandardError=inherit
+WorkingDirectory=$(pwd)
+EnvironmentFile=/etc/docker-entrypoint-env
+
+[Install]
+WantedBy=multi-user.target
+EOF
+
+systemctl mask systemd-firstboot.service systemd-udevd.service
+systemctl unmask systemd-logind
+systemctl enable docker-entrypoint.service
+
+systemd=
+if [ -x /lib/systemd/systemd ]; then
+	systemd=/lib/systemd/systemd
+elif [ -x /usr/lib/systemd/systemd ]; then
+	systemd=/usr/lib/systemd/systemd
+elif [ -x /sbin/init ]; then
+	systemd=/sbin/init
+else
+	echo >&2 'ERROR: systemd is not installed'
+	exit 1
+fi
+systemd_args="--show-status=false --unit=docker-entrypoint.target"
+echo "$0: starting $systemd $systemd_args"
+exec $systemd $systemd_args

+ 1 - 1
hack/test/unit

@@ -12,7 +12,7 @@
 #
 set -eux -o pipefail
 
-BUILDFLAGS=(-tags 'netgo libdm_no_deferred_remove')
+BUILDFLAGS=(-tags 'netgo libdm_no_deferred_remove journald')
 TESTFLAGS+=" -test.timeout=${TIMEOUT:-5m}"
 TESTDIRS="${TESTDIRS:-./...}"
 exclude_paths='/vendor/|/integration'

+ 1 - 1
vendor.mod

@@ -9,6 +9,7 @@ go 1.17
 require (
 	cloud.google.com/go v0.93.3
 	cloud.google.com/go/logging v1.4.2
+	code.cloudfoundry.org/clock v1.0.0
 	github.com/Azure/go-ansiterm v0.0.0-20210617225240-d185dfc1b5a1
 	github.com/Graylog2/go-gelf v0.0.0-20191017102106-1550ee647df0
 	github.com/Microsoft/go-winio v0.5.2
@@ -87,7 +88,6 @@ require (
 )
 
 require (
-	code.cloudfoundry.org/clock v1.0.0 // indirect
 	github.com/agext/levenshtein v1.2.3 // indirect
 	github.com/armon/circbuf v0.0.0-20190214190532-5111143e8da2 // indirect
 	github.com/armon/go-metrics v0.0.0-20180917152333-f0300d1749da // indirect

+ 118 - 0
vendor/gotest.tools/v3/assert/opt/opt.go

@@ -0,0 +1,118 @@
+/*Package opt provides common go-cmp.Options for use with assert.DeepEqual.
+ */
+package opt // import "gotest.tools/v3/assert/opt"
+
+import (
+	"fmt"
+	"reflect"
+	"strings"
+	"time"
+
+	gocmp "github.com/google/go-cmp/cmp"
+)
+
+// DurationWithThreshold returns a gocmp.Comparer for comparing time.Duration. The
+// Comparer returns true if the difference between the two Duration values is
+// within the threshold and neither value is zero.
+func DurationWithThreshold(threshold time.Duration) gocmp.Option {
+	return gocmp.Comparer(cmpDuration(threshold))
+}
+
+func cmpDuration(threshold time.Duration) func(x, y time.Duration) bool {
+	return func(x, y time.Duration) bool {
+		if x == 0 || y == 0 {
+			return false
+		}
+		delta := x - y
+		return delta <= threshold && delta >= -threshold
+	}
+}
+
+// TimeWithThreshold returns a gocmp.Comparer for comparing time.Time. The
+// Comparer returns true if the difference between the two Time values is
+// within the threshold and neither value is zero.
+func TimeWithThreshold(threshold time.Duration) gocmp.Option {
+	return gocmp.Comparer(cmpTime(threshold))
+}
+
+func cmpTime(threshold time.Duration) func(x, y time.Time) bool {
+	return func(x, y time.Time) bool {
+		if x.IsZero() || y.IsZero() {
+			return false
+		}
+		delta := x.Sub(y)
+		return delta <= threshold && delta >= -threshold
+	}
+}
+
+// PathString is a gocmp.FilterPath filter that returns true when path.String()
+// matches any of the specs.
+//
+// The path spec is a dot separated string where each segment is a field name.
+// Slices, Arrays, and Maps are always matched against every element in the
+// sequence. gocmp.Indirect, gocmp.Transform, and gocmp.TypeAssertion are always
+// ignored.
+//
+// Note: this path filter is not type safe. Incorrect paths will be silently
+// ignored. Consider using a type safe path filter for more complex paths.
+func PathString(specs ...string) func(path gocmp.Path) bool {
+	return func(path gocmp.Path) bool {
+		for _, spec := range specs {
+			if path.String() == spec {
+				return true
+			}
+		}
+		return false
+	}
+}
+
+// PathDebug is a gocmp.FilerPath filter that always returns false. It prints
+// each path it receives. It can be used to debug path matching problems.
+func PathDebug(path gocmp.Path) bool {
+	fmt.Printf("PATH string=%s gostring=%s\n", path, path.GoString())
+	for _, step := range path {
+		fmt.Printf("  STEP %s\ttype=%s\t%s\n",
+			formatStepType(step), step.Type(), stepTypeFields(step))
+	}
+	return false
+}
+
+func formatStepType(step gocmp.PathStep) string {
+	return strings.Title(strings.TrimPrefix(reflect.TypeOf(step).String(), "*cmp."))
+}
+
+func stepTypeFields(step gocmp.PathStep) string {
+	switch typed := step.(type) {
+	case gocmp.StructField:
+		return fmt.Sprintf("name=%s", typed.Name())
+	case gocmp.MapIndex:
+		return fmt.Sprintf("key=%s", typed.Key().Interface())
+	case gocmp.Transform:
+		return fmt.Sprintf("name=%s", typed.Name())
+	case gocmp.SliceIndex:
+		return fmt.Sprintf("name=%d", typed.Key())
+	}
+	return ""
+}
+
+// PathField is a gocmp.FilerPath filter that matches a struct field by name.
+// PathField will match every instance of the field in a recursive or nested
+// structure.
+func PathField(structType interface{}, field string) func(gocmp.Path) bool {
+	typ := reflect.TypeOf(structType)
+	if typ.Kind() != reflect.Struct {
+		panic(fmt.Sprintf("type %s is not a struct", typ))
+	}
+	if _, ok := typ.FieldByName(field); !ok {
+		panic(fmt.Sprintf("type %s does not have field %s", typ, field))
+	}
+
+	return func(path gocmp.Path) bool {
+		return path.Index(-2).Type() == typ && isStructField(path.Index(-1), field)
+	}
+}
+
+func isStructField(step gocmp.PathStep, name string) bool {
+	field, ok := step.(gocmp.StructField)
+	return ok && field.Name() == name
+}

+ 187 - 0
vendor/gotest.tools/v3/golden/golden.go

@@ -0,0 +1,187 @@
+/*Package golden provides tools for comparing large mutli-line strings.
+
+Golden files are files in the ./testdata/ subdirectory of the package under test.
+Golden files can be automatically updated to match new values by running
+`go test pkgname -test.update-golden`. To ensure the update is correct
+compare the diff of the old expected value to the new expected value.
+*/
+package golden // import "gotest.tools/v3/golden"
+
+import (
+	"bytes"
+	"flag"
+	"fmt"
+	"io/ioutil"
+	"os"
+	"path/filepath"
+
+	"gotest.tools/v3/assert"
+	"gotest.tools/v3/assert/cmp"
+	"gotest.tools/v3/internal/format"
+)
+
+var flagUpdate = flag.Bool("test.update-golden", false, "update golden file")
+
+type helperT interface {
+	Helper()
+}
+
+// NormalizeCRLFToLF enables end-of-line normalization for actual values passed
+// to Assert and String, as well as the values saved to golden files with
+// -test.update-golden.
+//
+// Defaults to true. If you use the core.autocrlf=true git setting on windows
+// you will need to set this to false.
+//
+// The value may be set to false by setting GOTESTTOOLS_GOLDEN_NormalizeCRLFToLF=false
+// in the environment before running tests.
+//
+// The default value may change in a future major release.
+var NormalizeCRLFToLF = os.Getenv("GOTESTTOOLS_GOLDEN_NormalizeCRLFToLF") != "false"
+
+// FlagUpdate returns true when the -test.update-golden flag has been set.
+func FlagUpdate() bool {
+	return *flagUpdate
+}
+
+// Open opens the file in ./testdata
+func Open(t assert.TestingT, filename string) *os.File {
+	if ht, ok := t.(helperT); ok {
+		ht.Helper()
+	}
+	f, err := os.Open(Path(filename))
+	assert.NilError(t, err)
+	return f
+}
+
+// Get returns the contents of the file in ./testdata
+func Get(t assert.TestingT, filename string) []byte {
+	if ht, ok := t.(helperT); ok {
+		ht.Helper()
+	}
+	expected, err := ioutil.ReadFile(Path(filename))
+	assert.NilError(t, err)
+	return expected
+}
+
+// Path returns the full path to a file in ./testdata
+func Path(filename string) string {
+	if filepath.IsAbs(filename) {
+		return filename
+	}
+	return filepath.Join("testdata", filename)
+}
+
+func removeCarriageReturn(in []byte) []byte {
+	if !NormalizeCRLFToLF {
+		return in
+	}
+	return bytes.Replace(in, []byte("\r\n"), []byte("\n"), -1)
+}
+
+// Assert compares actual to the expected value in the golden file.
+//
+// Running `go test pkgname -test.update-golden` will write the value of actual
+// to the golden file.
+//
+// This is equivalent to assert.Assert(t, String(actual, filename))
+func Assert(t assert.TestingT, actual string, filename string, msgAndArgs ...interface{}) {
+	if ht, ok := t.(helperT); ok {
+		ht.Helper()
+	}
+	assert.Assert(t, String(actual, filename), msgAndArgs...)
+}
+
+// String compares actual to the contents of filename and returns success
+// if the strings are equal.
+//
+// Running `go test pkgname -test.update-golden` will write the value of actual
+// to the golden file.
+//
+// Any \r\n substrings in actual are converted to a single \n character
+// before comparing it to the expected string. When updating the golden file the
+// normalized version will be written to the file. This allows Windows to use
+// the same golden files as other operating systems.
+func String(actual string, filename string) cmp.Comparison {
+	return func() cmp.Result {
+		actualBytes := removeCarriageReturn([]byte(actual))
+		result, expected := compare(actualBytes, filename)
+		if result != nil {
+			return result
+		}
+		diff := format.UnifiedDiff(format.DiffConfig{
+			A:    string(expected),
+			B:    string(actualBytes),
+			From: "expected",
+			To:   "actual",
+		})
+		return cmp.ResultFailure("\n" + diff + failurePostamble(filename))
+	}
+}
+
+func failurePostamble(filename string) string {
+	return fmt.Sprintf(`
+
+You can run 'go test . -test.update-golden' to automatically update %s to the new expected value.'
+`, Path(filename))
+}
+
+// AssertBytes compares actual to the expected value in the golden.
+//
+// Running `go test pkgname -test.update-golden` will write the value of actual
+// to the golden file.
+//
+// This is equivalent to assert.Assert(t, Bytes(actual, filename))
+func AssertBytes(
+	t assert.TestingT,
+	actual []byte,
+	filename string,
+	msgAndArgs ...interface{},
+) {
+	if ht, ok := t.(helperT); ok {
+		ht.Helper()
+	}
+	assert.Assert(t, Bytes(actual, filename), msgAndArgs...)
+}
+
+// Bytes compares actual to the contents of filename and returns success
+// if the bytes are equal.
+//
+// Running `go test pkgname -test.update-golden` will write the value of actual
+// to the golden file.
+func Bytes(actual []byte, filename string) cmp.Comparison {
+	return func() cmp.Result {
+		result, expected := compare(actual, filename)
+		if result != nil {
+			return result
+		}
+		msg := fmt.Sprintf("%v (actual) != %v (expected)", actual, expected)
+		return cmp.ResultFailure(msg + failurePostamble(filename))
+	}
+}
+
+func compare(actual []byte, filename string) (cmp.Result, []byte) {
+	if err := update(filename, actual); err != nil {
+		return cmp.ResultFromError(err), nil
+	}
+	expected, err := ioutil.ReadFile(Path(filename))
+	if err != nil {
+		return cmp.ResultFromError(err), nil
+	}
+	if bytes.Equal(expected, actual) {
+		return cmp.ResultSuccess, nil
+	}
+	return nil, expected
+}
+
+func update(filename string, actual []byte) error {
+	if !*flagUpdate {
+		return nil
+	}
+	if dir := filepath.Dir(Path(filename)); dir != "." {
+		if err := os.MkdirAll(dir, 0755); err != nil {
+			return err
+		}
+	}
+	return ioutil.WriteFile(Path(filename), actual, 0644)
+}

+ 2 - 0
vendor/modules.txt

@@ -1105,8 +1105,10 @@ google.golang.org/protobuf/types/known/wrapperspb
 ## explicit; go 1.13
 gotest.tools/v3/assert
 gotest.tools/v3/assert/cmp
+gotest.tools/v3/assert/opt
 gotest.tools/v3/env
 gotest.tools/v3/fs
+gotest.tools/v3/golden
 gotest.tools/v3/icmd
 gotest.tools/v3/internal/assert
 gotest.tools/v3/internal/cleanup