Browse Source

Merge pull request #43634 from ameyag/win-port-conflict

Fix for Windows port conflict with published ports in host mode for overlay.
Sebastiaan van Stijn 3 years ago
parent
commit
bfd7fabdfd

+ 5 - 4
libnetwork/drivers/windows/windows.go

@@ -490,10 +490,11 @@ func ConvertPortBindings(portBindings []types.PortBinding) ([]json.RawMessage, e
 		}
 
 		encodedPolicy, err := json.Marshal(hcsshim.NatPolicy{
-			Type:         "NAT",
-			ExternalPort: elem.HostPort,
-			InternalPort: elem.Port,
-			Protocol:     elem.Proto.String(),
+			Type:                 "NAT",
+			ExternalPort:         elem.HostPort,
+			InternalPort:         elem.Port,
+			Protocol:             elem.Proto.String(),
+			ExternalPortReserved: true,
 		})
 
 		if err != nil {

+ 1 - 1
vendor.mod

@@ -11,7 +11,7 @@ require (
 	cloud.google.com/go/logging v1.4.2
 	github.com/Graylog2/go-gelf v0.0.0-20191017102106-1550ee647df0
 	github.com/Microsoft/go-winio v0.5.2
-	github.com/Microsoft/hcsshim v0.9.2
+	github.com/Microsoft/hcsshim v0.9.3
 	github.com/RackSec/srslog v0.0.0-20180709174129-a4725f04ec91
 	github.com/armon/go-radix v0.0.0-20180808171621-7fddfc383310
 	github.com/aws/aws-sdk-go v1.31.6

+ 2 - 1
vendor.sum

@@ -93,8 +93,9 @@ github.com/Microsoft/hcsshim v0.8.16/go.mod h1:o5/SZqmR7x9JNKsW3pu+nqHm0MF8vbA+V
 github.com/Microsoft/hcsshim v0.8.20/go.mod h1:+w2gRZ5ReXQhFOrvSQeNfhrYB/dg3oDwTOcER2fw4I4=
 github.com/Microsoft/hcsshim v0.8.21/go.mod h1:+w2gRZ5ReXQhFOrvSQeNfhrYB/dg3oDwTOcER2fw4I4=
 github.com/Microsoft/hcsshim v0.8.23/go.mod h1:4zegtUJth7lAvFyc6cH2gGQ5B3OFQim01nnU2M8jKDg=
-github.com/Microsoft/hcsshim v0.9.2 h1:wB06W5aYFfUB3IvootYAY2WnOmIdgPGfqSI6tufQNnY=
 github.com/Microsoft/hcsshim v0.9.2/go.mod h1:7pLA8lDk46WKDWlVsENo92gC0XFa8rbKfyFRBqxEbCc=
+github.com/Microsoft/hcsshim v0.9.3 h1:k371PzBuRrz2b+ebGuI2nVgVhgsVX60jMfSw80NECxo=
+github.com/Microsoft/hcsshim v0.9.3/go.mod h1:7pLA8lDk46WKDWlVsENo92gC0XFa8rbKfyFRBqxEbCc=
 github.com/Microsoft/hcsshim/test v0.0.0-20201218223536-d3e5debf77da/go.mod h1:5hlzMzRKMLyo42nCZ9oml8AdTlq/0cvIaBv6tK1RehU=
 github.com/Microsoft/hcsshim/test v0.0.0-20210227013316-43a75bb4edd3/go.mod h1:mw7qgWloBUl75W/gVH3cQszUg1+gUITj7D6NY7ywVnY=
 github.com/NYTimes/gziphandler v0.0.0-20170623195520-56545f4a5d46/go.mod h1:3wb06e3pkSAbeQ52E9H9iFoQsEEwGN64994WTCIhntQ=

+ 179 - 9
vendor/github.com/Microsoft/hcsshim/internal/hcs/system.go

@@ -4,17 +4,22 @@ import (
 	"context"
 	"encoding/json"
 	"errors"
+	"fmt"
 	"strings"
 	"sync"
 	"syscall"
+	"time"
 
 	"github.com/Microsoft/hcsshim/internal/cow"
 	"github.com/Microsoft/hcsshim/internal/hcs/schema1"
 	hcsschema "github.com/Microsoft/hcsshim/internal/hcs/schema2"
+	"github.com/Microsoft/hcsshim/internal/jobobject"
 	"github.com/Microsoft/hcsshim/internal/log"
+	"github.com/Microsoft/hcsshim/internal/logfields"
 	"github.com/Microsoft/hcsshim/internal/oc"
 	"github.com/Microsoft/hcsshim/internal/timeout"
 	"github.com/Microsoft/hcsshim/internal/vmcompute"
+	"github.com/sirupsen/logrus"
 	"go.opencensus.io/trace"
 )
 
@@ -28,7 +33,8 @@ type System struct {
 	waitBlock      chan struct{}
 	waitError      error
 	exitError      error
-	os, typ        string
+	os, typ, owner string
+	startTime      time.Time
 }
 
 func newSystem(id string) *System {
@@ -38,6 +44,11 @@ func newSystem(id string) *System {
 	}
 }
 
+// Implementation detail for silo naming, this should NOT be relied upon very heavily.
+func siloNameFmt(containerID string) string {
+	return fmt.Sprintf(`\Container_%s`, containerID)
+}
+
 // CreateComputeSystem creates a new compute system with the given configuration but does not start it.
 func CreateComputeSystem(ctx context.Context, id string, hcsDocumentInterface interface{}) (_ *System, err error) {
 	operation := "hcs::CreateComputeSystem"
@@ -127,6 +138,7 @@ func (computeSystem *System) getCachedProperties(ctx context.Context) error {
 	}
 	computeSystem.typ = strings.ToLower(props.SystemType)
 	computeSystem.os = strings.ToLower(props.RuntimeOSType)
+	computeSystem.owner = strings.ToLower(props.Owner)
 	if computeSystem.os == "" && computeSystem.typ == "container" {
 		// Pre-RS5 HCS did not return the OS, but it only supported containers
 		// that ran Windows.
@@ -195,7 +207,7 @@ func (computeSystem *System) Start(ctx context.Context) (err error) {
 	if err != nil {
 		return makeSystemError(computeSystem, operation, err, events)
 	}
-
+	computeSystem.startTime = time.Now()
 	return nil
 }
 
@@ -324,11 +336,115 @@ func (computeSystem *System) Properties(ctx context.Context, types ...schema1.Pr
 	return properties, nil
 }
 
-// PropertiesV2 returns the requested container properties targeting a V2 schema container.
-func (computeSystem *System) PropertiesV2(ctx context.Context, types ...hcsschema.PropertyType) (*hcsschema.Properties, error) {
-	computeSystem.handleLock.RLock()
-	defer computeSystem.handleLock.RUnlock()
+// queryInProc handles querying for container properties without reaching out to HCS. `props`
+// will be updated to contain any data returned from the queries present in `types`. If any properties
+// failed to be queried they will be tallied up and returned in as the first return value. Failures on
+// query are NOT considered errors; the only failure case for this method is if the containers job object
+// cannot be opened.
+func (computeSystem *System) queryInProc(ctx context.Context, props *hcsschema.Properties, types []hcsschema.PropertyType) ([]hcsschema.PropertyType, error) {
+	// In the future we can make use of some new functionality in the HCS that allows you
+	// to pass a job object for HCS to use for the container. Currently, the only way we'll
+	// be able to open the job/silo is if we're running as SYSTEM.
+	jobOptions := &jobobject.Options{
+		UseNTVariant: true,
+		Name:         siloNameFmt(computeSystem.id),
+	}
+	job, err := jobobject.Open(ctx, jobOptions)
+	if err != nil {
+		return nil, err
+	}
+	defer job.Close()
+
+	var fallbackQueryTypes []hcsschema.PropertyType
+	for _, propType := range types {
+		switch propType {
+		case hcsschema.PTStatistics:
+			// Handle a bad caller asking for the same type twice. No use in re-querying if this is
+			// filled in already.
+			if props.Statistics == nil {
+				props.Statistics, err = computeSystem.statisticsInProc(job)
+				if err != nil {
+					log.G(ctx).WithError(err).Warn("failed to get statistics in-proc")
+
+					fallbackQueryTypes = append(fallbackQueryTypes, propType)
+				}
+			}
+		default:
+			fallbackQueryTypes = append(fallbackQueryTypes, propType)
+		}
+	}
+
+	return fallbackQueryTypes, nil
+}
+
+// statisticsInProc emulates what HCS does to grab statistics for a given container with a small
+// change to make grabbing the private working set total much more efficient.
+func (computeSystem *System) statisticsInProc(job *jobobject.JobObject) (*hcsschema.Statistics, error) {
+	// Start timestamp for these stats before we grab them to match HCS
+	timestamp := time.Now()
+
+	memInfo, err := job.QueryMemoryStats()
+	if err != nil {
+		return nil, err
+	}
+
+	processorInfo, err := job.QueryProcessorStats()
+	if err != nil {
+		return nil, err
+	}
+
+	storageInfo, err := job.QueryStorageStats()
+	if err != nil {
+		return nil, err
+	}
+
+	// This calculates the private working set more efficiently than HCS does. HCS calls NtQuerySystemInformation
+	// with the class SystemProcessInformation which returns an array containing system information for *every*
+	// process running on the machine. They then grab the pids that are running in the container and filter down
+	// the entries in the array to only what's running in that silo and start tallying up the total. This doesn't
+	// work well as performance should get worse if more processess are running on the machine in general and not
+	// just in the container. All of the additional information besides the WorkingSetPrivateSize field is ignored
+	// as well which isn't great and is wasted work to fetch.
+	//
+	// HCS only let's you grab statistics in an all or nothing fashion, so we can't just grab the private
+	// working set ourselves and ask for everything else seperately. The optimization we can make here is
+	// to open the silo ourselves and do the same queries for the rest of the info, as well as calculating
+	// the private working set in a more efficient manner by:
+	//
+	// 1. Find the pids running in the silo
+	// 2. Get a process handle for every process (only need PROCESS_QUERY_LIMITED_INFORMATION access)
+	// 3. Call NtQueryInformationProcess on each process with the class ProcessVmCounters
+	// 4. Tally up the total using the field PrivateWorkingSetSize in VM_COUNTERS_EX2.
+	privateWorkingSet, err := job.QueryPrivateWorkingSet()
+	if err != nil {
+		return nil, err
+	}
 
+	return &hcsschema.Statistics{
+		Timestamp:          timestamp,
+		ContainerStartTime: computeSystem.startTime,
+		Uptime100ns:        uint64(time.Since(computeSystem.startTime).Nanoseconds()) / 100,
+		Memory: &hcsschema.MemoryStats{
+			MemoryUsageCommitBytes:            memInfo.JobMemory,
+			MemoryUsageCommitPeakBytes:        memInfo.PeakJobMemoryUsed,
+			MemoryUsagePrivateWorkingSetBytes: privateWorkingSet,
+		},
+		Processor: &hcsschema.ProcessorStats{
+			RuntimeKernel100ns: uint64(processorInfo.TotalKernelTime),
+			RuntimeUser100ns:   uint64(processorInfo.TotalUserTime),
+			TotalRuntime100ns:  uint64(processorInfo.TotalKernelTime + processorInfo.TotalUserTime),
+		},
+		Storage: &hcsschema.StorageStats{
+			ReadCountNormalized:  uint64(storageInfo.ReadStats.IoCount),
+			ReadSizeBytes:        storageInfo.ReadStats.TotalSize,
+			WriteCountNormalized: uint64(storageInfo.WriteStats.IoCount),
+			WriteSizeBytes:       storageInfo.WriteStats.TotalSize,
+		},
+	}, nil
+}
+
+// hcsPropertiesV2Query is a helper to make a HcsGetComputeSystemProperties call using the V2 schema property types.
+func (computeSystem *System) hcsPropertiesV2Query(ctx context.Context, types []hcsschema.PropertyType) (*hcsschema.Properties, error) {
 	operation := "hcs::System::PropertiesV2"
 
 	queryBytes, err := json.Marshal(hcsschema.PropertyQuery{PropertyTypes: types})
@@ -345,12 +461,66 @@ func (computeSystem *System) PropertiesV2(ctx context.Context, types ...hcsschem
 	if propertiesJSON == "" {
 		return nil, ErrUnexpectedValue
 	}
-	properties := &hcsschema.Properties{}
-	if err := json.Unmarshal([]byte(propertiesJSON), properties); err != nil {
+	props := &hcsschema.Properties{}
+	if err := json.Unmarshal([]byte(propertiesJSON), props); err != nil {
 		return nil, makeSystemError(computeSystem, operation, err, nil)
 	}
 
-	return properties, nil
+	return props, nil
+}
+
+// PropertiesV2 returns the requested compute systems properties targeting a V2 schema compute system.
+func (computeSystem *System) PropertiesV2(ctx context.Context, types ...hcsschema.PropertyType) (_ *hcsschema.Properties, err error) {
+	computeSystem.handleLock.RLock()
+	defer computeSystem.handleLock.RUnlock()
+
+	// Let HCS tally up the total for VM based queries instead of querying ourselves.
+	if computeSystem.typ != "container" {
+		return computeSystem.hcsPropertiesV2Query(ctx, types)
+	}
+
+	// Define a starter Properties struct with the default fields returned from every
+	// query. Owner is only returned from Statistics but it's harmless to include.
+	properties := &hcsschema.Properties{
+		Id:            computeSystem.id,
+		SystemType:    computeSystem.typ,
+		RuntimeOsType: computeSystem.os,
+		Owner:         computeSystem.owner,
+	}
+
+	logEntry := log.G(ctx)
+	// First lets try and query ourselves without reaching to HCS. If any of the queries fail
+	// we'll take note and fallback to querying HCS for any of the failed types.
+	fallbackTypes, err := computeSystem.queryInProc(ctx, properties, types)
+	if err == nil && len(fallbackTypes) == 0 {
+		return properties, nil
+	} else if err != nil {
+		logEntry.WithError(fmt.Errorf("failed to query compute system properties in-proc: %w", err))
+		fallbackTypes = types
+	}
+
+	logEntry.WithFields(logrus.Fields{
+		logfields.ContainerID: computeSystem.id,
+		"propertyTypes":       fallbackTypes,
+	}).Info("falling back to HCS for property type queries")
+
+	hcsProperties, err := computeSystem.hcsPropertiesV2Query(ctx, fallbackTypes)
+	if err != nil {
+		return nil, err
+	}
+
+	// Now add in anything that we might have successfully queried in process.
+	if properties.Statistics != nil {
+		hcsProperties.Statistics = properties.Statistics
+		hcsProperties.Owner = properties.Owner
+	}
+
+	// For future support for querying processlist in-proc as well.
+	if properties.ProcessList != nil {
+		hcsProperties.ProcessList = properties.ProcessList
+	}
+
+	return hcsProperties, nil
 }
 
 // Pause pauses the execution of the computeSystem. This feature is not enabled in TP5.

+ 5 - 4
vendor/github.com/Microsoft/hcsshim/internal/hns/hnspolicy.go

@@ -21,10 +21,11 @@ const (
 )
 
 type NatPolicy struct {
-	Type         PolicyType `json:"Type"`
-	Protocol     string     `json:",omitempty"`
-	InternalPort uint16     `json:",omitempty"`
-	ExternalPort uint16     `json:",omitempty"`
+	Type                 PolicyType `json:"Type"`
+	Protocol             string     `json:",omitempty"`
+	InternalPort         uint16     `json:",omitempty"`
+	ExternalPort         uint16     `json:",omitempty"`
+	ExternalPortReserved bool       `json:",omitempty"`
 }
 
 type QosPolicy struct {

+ 111 - 0
vendor/github.com/Microsoft/hcsshim/internal/jobobject/iocp.go

@@ -0,0 +1,111 @@
+package jobobject
+
+import (
+	"context"
+	"fmt"
+	"sync"
+	"unsafe"
+
+	"github.com/Microsoft/hcsshim/internal/log"
+	"github.com/Microsoft/hcsshim/internal/queue"
+	"github.com/Microsoft/hcsshim/internal/winapi"
+	"github.com/sirupsen/logrus"
+	"golang.org/x/sys/windows"
+)
+
+var (
+	ioInitOnce sync.Once
+	initIOErr  error
+	// Global iocp handle that will be re-used for every job object
+	ioCompletionPort windows.Handle
+	// Mapping of job handle to queue to place notifications in.
+	jobMap sync.Map
+)
+
+// MsgAllProcessesExited is a type representing a message that every process in a job has exited.
+type MsgAllProcessesExited struct{}
+
+// MsgUnimplemented represents a message that we are aware of, but that isn't implemented currently.
+// This should not be treated as an error.
+type MsgUnimplemented struct{}
+
+// pollIOCP polls the io completion port forever.
+func pollIOCP(ctx context.Context, iocpHandle windows.Handle) {
+	var (
+		overlapped uintptr
+		code       uint32
+		key        uintptr
+	)
+
+	for {
+		err := windows.GetQueuedCompletionStatus(iocpHandle, &code, &key, (**windows.Overlapped)(unsafe.Pointer(&overlapped)), windows.INFINITE)
+		if err != nil {
+			log.G(ctx).WithError(err).Error("failed to poll for job object message")
+			continue
+		}
+		if val, ok := jobMap.Load(key); ok {
+			msq, ok := val.(*queue.MessageQueue)
+			if !ok {
+				log.G(ctx).WithField("value", msq).Warn("encountered non queue type in job map")
+				continue
+			}
+			notification, err := parseMessage(code, overlapped)
+			if err != nil {
+				log.G(ctx).WithFields(logrus.Fields{
+					"code":       code,
+					"overlapped": overlapped,
+				}).Warn("failed to parse job object message")
+				continue
+			}
+			if err := msq.Write(notification); err == queue.ErrQueueClosed {
+				// Write will only return an error when the queue is closed.
+				// The only time a queue would ever be closed is when we call `Close` on
+				// the job it belongs to which also removes it from the jobMap, so something
+				// went wrong here. We can't return as this is reading messages for all jobs
+				// so just log it and move on.
+				log.G(ctx).WithFields(logrus.Fields{
+					"code":       code,
+					"overlapped": overlapped,
+				}).Warn("tried to write to a closed queue")
+				continue
+			}
+		} else {
+			log.G(ctx).Warn("received a message for a job not present in the mapping")
+		}
+	}
+}
+
+func parseMessage(code uint32, overlapped uintptr) (interface{}, error) {
+	// Check code and parse out relevant information related to that notification
+	// that we care about. For now all we handle is the message that all processes
+	// in the job have exited.
+	switch code {
+	case winapi.JOB_OBJECT_MSG_ACTIVE_PROCESS_ZERO:
+		return MsgAllProcessesExited{}, nil
+	// Other messages for completeness and a check to make sure that if we fall
+	// into the default case that this is a code we don't know how to handle.
+	case winapi.JOB_OBJECT_MSG_END_OF_JOB_TIME:
+	case winapi.JOB_OBJECT_MSG_END_OF_PROCESS_TIME:
+	case winapi.JOB_OBJECT_MSG_ACTIVE_PROCESS_LIMIT:
+	case winapi.JOB_OBJECT_MSG_NEW_PROCESS:
+	case winapi.JOB_OBJECT_MSG_EXIT_PROCESS:
+	case winapi.JOB_OBJECT_MSG_ABNORMAL_EXIT_PROCESS:
+	case winapi.JOB_OBJECT_MSG_PROCESS_MEMORY_LIMIT:
+	case winapi.JOB_OBJECT_MSG_JOB_MEMORY_LIMIT:
+	case winapi.JOB_OBJECT_MSG_NOTIFICATION_LIMIT:
+	default:
+		return nil, fmt.Errorf("unknown job notification type: %d", code)
+	}
+	return MsgUnimplemented{}, nil
+}
+
+// Assigns an IO completion port to get notified of events for the registered job
+// object.
+func attachIOCP(job windows.Handle, iocp windows.Handle) error {
+	info := winapi.JOBOBJECT_ASSOCIATE_COMPLETION_PORT{
+		CompletionKey:  job,
+		CompletionPort: iocp,
+	}
+	_, err := windows.SetInformationJobObject(job, windows.JobObjectAssociateCompletionPortInformation, uintptr(unsafe.Pointer(&info)), uint32(unsafe.Sizeof(info)))
+	return err
+}

+ 499 - 0
vendor/github.com/Microsoft/hcsshim/internal/jobobject/jobobject.go

@@ -0,0 +1,499 @@
+package jobobject
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"sync"
+	"unsafe"
+
+	"github.com/Microsoft/hcsshim/internal/queue"
+	"github.com/Microsoft/hcsshim/internal/winapi"
+	"golang.org/x/sys/windows"
+)
+
+// This file provides higher level constructs for the win32 job object API.
+// Most of the core creation and management functions are already present in "golang.org/x/sys/windows"
+// (CreateJobObject, AssignProcessToJobObject, etc.) as well as most of the limit information
+// structs and associated limit flags. Whatever is not present from the job object API
+// in golang.org/x/sys/windows is located in /internal/winapi.
+//
+// https://docs.microsoft.com/en-us/windows/win32/procthread/job-objects
+
+// JobObject is a high level wrapper around a Windows job object. Holds a handle to
+// the job, a queue to receive iocp notifications about the lifecycle
+// of the job and a mutex for synchronized handle access.
+type JobObject struct {
+	handle     windows.Handle
+	mq         *queue.MessageQueue
+	handleLock sync.RWMutex
+}
+
+// JobLimits represents the resource constraints that can be applied to a job object.
+type JobLimits struct {
+	CPULimit           uint32
+	CPUWeight          uint32
+	MemoryLimitInBytes uint64
+	MaxIOPS            int64
+	MaxBandwidth       int64
+}
+
+type CPURateControlType uint32
+
+const (
+	WeightBased CPURateControlType = iota
+	RateBased
+)
+
+// Processor resource controls
+const (
+	cpuLimitMin  = 1
+	cpuLimitMax  = 10000
+	cpuWeightMin = 1
+	cpuWeightMax = 9
+)
+
+var (
+	ErrAlreadyClosed = errors.New("the handle has already been closed")
+	ErrNotRegistered = errors.New("job is not registered to receive notifications")
+)
+
+// Options represents the set of configurable options when making or opening a job object.
+type Options struct {
+	// `Name` specifies the name of the job object if a named job object is desired.
+	Name string
+	// `Notifications` specifies if the job will be registered to receive notifications.
+	// Defaults to false.
+	Notifications bool
+	// `UseNTVariant` specifies if we should use the `Nt` variant of Open/CreateJobObject.
+	// Defaults to false.
+	UseNTVariant bool
+}
+
+// Create creates a job object.
+//
+// If options.Name is an empty string, the job will not be assigned a name.
+//
+// If options.Notifications are not enabled `PollNotifications` will return immediately with error `errNotRegistered`.
+//
+// If `options` is nil, use default option values.
+//
+// Returns a JobObject structure and an error if there is one.
+func Create(ctx context.Context, options *Options) (_ *JobObject, err error) {
+	if options == nil {
+		options = &Options{}
+	}
+
+	var jobName *winapi.UnicodeString
+	if options.Name != "" {
+		jobName, err = winapi.NewUnicodeString(options.Name)
+		if err != nil {
+			return nil, err
+		}
+	}
+
+	var jobHandle windows.Handle
+	if options.UseNTVariant {
+		oa := winapi.ObjectAttributes{
+			Length:     unsafe.Sizeof(winapi.ObjectAttributes{}),
+			ObjectName: jobName,
+			Attributes: 0,
+		}
+		status := winapi.NtCreateJobObject(&jobHandle, winapi.JOB_OBJECT_ALL_ACCESS, &oa)
+		if status != 0 {
+			return nil, winapi.RtlNtStatusToDosError(status)
+		}
+	} else {
+		var jobNameBuf *uint16
+		if jobName != nil && jobName.Buffer != nil {
+			jobNameBuf = jobName.Buffer
+		}
+		jobHandle, err = windows.CreateJobObject(nil, jobNameBuf)
+		if err != nil {
+			return nil, err
+		}
+	}
+
+	defer func() {
+		if err != nil {
+			windows.Close(jobHandle)
+		}
+	}()
+
+	job := &JobObject{
+		handle: jobHandle,
+	}
+
+	// If the IOCP we'll be using to receive messages for all jobs hasn't been
+	// created, create it and start polling.
+	if options.Notifications {
+		mq, err := setupNotifications(ctx, job)
+		if err != nil {
+			return nil, err
+		}
+		job.mq = mq
+	}
+
+	return job, nil
+}
+
+// Open opens an existing job object with name provided in `options`. If no name is provided
+// return an error since we need to know what job object to open.
+//
+// If options.Notifications is false `PollNotifications` will return immediately with error `errNotRegistered`.
+//
+// Returns a JobObject structure and an error if there is one.
+func Open(ctx context.Context, options *Options) (_ *JobObject, err error) {
+	if options == nil || (options != nil && options.Name == "") {
+		return nil, errors.New("no job object name specified to open")
+	}
+
+	unicodeJobName, err := winapi.NewUnicodeString(options.Name)
+	if err != nil {
+		return nil, err
+	}
+
+	var jobHandle windows.Handle
+	if options != nil && options.UseNTVariant {
+		oa := winapi.ObjectAttributes{
+			Length:     unsafe.Sizeof(winapi.ObjectAttributes{}),
+			ObjectName: unicodeJobName,
+			Attributes: 0,
+		}
+		status := winapi.NtOpenJobObject(&jobHandle, winapi.JOB_OBJECT_ALL_ACCESS, &oa)
+		if status != 0 {
+			return nil, winapi.RtlNtStatusToDosError(status)
+		}
+	} else {
+		jobHandle, err = winapi.OpenJobObject(winapi.JOB_OBJECT_ALL_ACCESS, false, unicodeJobName.Buffer)
+		if err != nil {
+			return nil, err
+		}
+	}
+
+	defer func() {
+		if err != nil {
+			windows.Close(jobHandle)
+		}
+	}()
+
+	job := &JobObject{
+		handle: jobHandle,
+	}
+
+	// If the IOCP we'll be using to receive messages for all jobs hasn't been
+	// created, create it and start polling.
+	if options != nil && options.Notifications {
+		mq, err := setupNotifications(ctx, job)
+		if err != nil {
+			return nil, err
+		}
+		job.mq = mq
+	}
+
+	return job, nil
+}
+
+// helper function to setup notifications for creating/opening a job object
+func setupNotifications(ctx context.Context, job *JobObject) (*queue.MessageQueue, error) {
+	job.handleLock.RLock()
+	defer job.handleLock.RUnlock()
+
+	if job.handle == 0 {
+		return nil, ErrAlreadyClosed
+	}
+
+	ioInitOnce.Do(func() {
+		h, err := windows.CreateIoCompletionPort(windows.InvalidHandle, 0, 0, 0xffffffff)
+		if err != nil {
+			initIOErr = err
+			return
+		}
+		ioCompletionPort = h
+		go pollIOCP(ctx, h)
+	})
+
+	if initIOErr != nil {
+		return nil, initIOErr
+	}
+
+	mq := queue.NewMessageQueue()
+	jobMap.Store(uintptr(job.handle), mq)
+	if err := attachIOCP(job.handle, ioCompletionPort); err != nil {
+		jobMap.Delete(uintptr(job.handle))
+		return nil, fmt.Errorf("failed to attach job to IO completion port: %w", err)
+	}
+	return mq, nil
+}
+
+// PollNotification will poll for a job object notification. This call should only be called once
+// per job (ideally in a goroutine loop) and will block if there is not a notification ready.
+// This call will return immediately with error `ErrNotRegistered` if the job was not registered
+// to receive notifications during `Create`. Internally, messages will be queued and there
+// is no worry of messages being dropped.
+func (job *JobObject) PollNotification() (interface{}, error) {
+	if job.mq == nil {
+		return nil, ErrNotRegistered
+	}
+	return job.mq.ReadOrWait()
+}
+
+// UpdateProcThreadAttribute updates the passed in ProcThreadAttributeList to contain what is necessary to
+// launch a process in a job at creation time. This can be used to avoid having to call Assign() after a process
+// has already started running.
+func (job *JobObject) UpdateProcThreadAttribute(attrList *windows.ProcThreadAttributeListContainer) error {
+	job.handleLock.RLock()
+	defer job.handleLock.RUnlock()
+
+	if job.handle == 0 {
+		return ErrAlreadyClosed
+	}
+
+	if err := attrList.Update(
+		winapi.PROC_THREAD_ATTRIBUTE_JOB_LIST,
+		unsafe.Pointer(&job.handle),
+		unsafe.Sizeof(job.handle),
+	); err != nil {
+		return fmt.Errorf("failed to update proc thread attributes for job object: %w", err)
+	}
+
+	return nil
+}
+
+// Close closes the job object handle.
+func (job *JobObject) Close() error {
+	job.handleLock.Lock()
+	defer job.handleLock.Unlock()
+
+	if job.handle == 0 {
+		return ErrAlreadyClosed
+	}
+
+	if err := windows.Close(job.handle); err != nil {
+		return err
+	}
+
+	if job.mq != nil {
+		job.mq.Close()
+	}
+	// Handles now invalid so if the map entry to receive notifications for this job still
+	// exists remove it so we can stop receiving notifications.
+	if _, ok := jobMap.Load(uintptr(job.handle)); ok {
+		jobMap.Delete(uintptr(job.handle))
+	}
+
+	job.handle = 0
+	return nil
+}
+
+// Assign assigns a process to the job object.
+func (job *JobObject) Assign(pid uint32) error {
+	job.handleLock.RLock()
+	defer job.handleLock.RUnlock()
+
+	if job.handle == 0 {
+		return ErrAlreadyClosed
+	}
+
+	if pid == 0 {
+		return errors.New("invalid pid: 0")
+	}
+	hProc, err := windows.OpenProcess(winapi.PROCESS_ALL_ACCESS, true, pid)
+	if err != nil {
+		return err
+	}
+	defer windows.Close(hProc)
+	return windows.AssignProcessToJobObject(job.handle, hProc)
+}
+
+// Terminate terminates the job, essentially calls TerminateProcess on every process in the
+// job.
+func (job *JobObject) Terminate(exitCode uint32) error {
+	job.handleLock.RLock()
+	defer job.handleLock.RUnlock()
+	if job.handle == 0 {
+		return ErrAlreadyClosed
+	}
+	return windows.TerminateJobObject(job.handle, exitCode)
+}
+
+// Pids returns all of the process IDs in the job object.
+func (job *JobObject) Pids() ([]uint32, error) {
+	job.handleLock.RLock()
+	defer job.handleLock.RUnlock()
+
+	if job.handle == 0 {
+		return nil, ErrAlreadyClosed
+	}
+
+	info := winapi.JOBOBJECT_BASIC_PROCESS_ID_LIST{}
+	err := winapi.QueryInformationJobObject(
+		job.handle,
+		winapi.JobObjectBasicProcessIdList,
+		uintptr(unsafe.Pointer(&info)),
+		uint32(unsafe.Sizeof(info)),
+		nil,
+	)
+
+	// This is either the case where there is only one process or no processes in
+	// the job. Any other case will result in ERROR_MORE_DATA. Check if info.NumberOfProcessIdsInList
+	// is 1 and just return this, otherwise return an empty slice.
+	if err == nil {
+		if info.NumberOfProcessIdsInList == 1 {
+			return []uint32{uint32(info.ProcessIdList[0])}, nil
+		}
+		// Return empty slice instead of nil to play well with the caller of this.
+		// Do not return an error if no processes are running inside the job
+		return []uint32{}, nil
+	}
+
+	if err != winapi.ERROR_MORE_DATA {
+		return nil, fmt.Errorf("failed initial query for PIDs in job object: %w", err)
+	}
+
+	jobBasicProcessIDListSize := unsafe.Sizeof(info) + (unsafe.Sizeof(info.ProcessIdList[0]) * uintptr(info.NumberOfAssignedProcesses-1))
+	buf := make([]byte, jobBasicProcessIDListSize)
+	if err = winapi.QueryInformationJobObject(
+		job.handle,
+		winapi.JobObjectBasicProcessIdList,
+		uintptr(unsafe.Pointer(&buf[0])),
+		uint32(len(buf)),
+		nil,
+	); err != nil {
+		return nil, fmt.Errorf("failed to query for PIDs in job object: %w", err)
+	}
+
+	bufInfo := (*winapi.JOBOBJECT_BASIC_PROCESS_ID_LIST)(unsafe.Pointer(&buf[0]))
+	pids := make([]uint32, bufInfo.NumberOfProcessIdsInList)
+	for i, bufPid := range bufInfo.AllPids() {
+		pids[i] = uint32(bufPid)
+	}
+	return pids, nil
+}
+
+// QueryMemoryStats gets the memory stats for the job object.
+func (job *JobObject) QueryMemoryStats() (*winapi.JOBOBJECT_MEMORY_USAGE_INFORMATION, error) {
+	job.handleLock.RLock()
+	defer job.handleLock.RUnlock()
+
+	if job.handle == 0 {
+		return nil, ErrAlreadyClosed
+	}
+
+	info := winapi.JOBOBJECT_MEMORY_USAGE_INFORMATION{}
+	if err := winapi.QueryInformationJobObject(
+		job.handle,
+		winapi.JobObjectMemoryUsageInformation,
+		uintptr(unsafe.Pointer(&info)),
+		uint32(unsafe.Sizeof(info)),
+		nil,
+	); err != nil {
+		return nil, fmt.Errorf("failed to query for job object memory stats: %w", err)
+	}
+	return &info, nil
+}
+
+// QueryProcessorStats gets the processor stats for the job object.
+func (job *JobObject) QueryProcessorStats() (*winapi.JOBOBJECT_BASIC_ACCOUNTING_INFORMATION, error) {
+	job.handleLock.RLock()
+	defer job.handleLock.RUnlock()
+
+	if job.handle == 0 {
+		return nil, ErrAlreadyClosed
+	}
+
+	info := winapi.JOBOBJECT_BASIC_ACCOUNTING_INFORMATION{}
+	if err := winapi.QueryInformationJobObject(
+		job.handle,
+		winapi.JobObjectBasicAccountingInformation,
+		uintptr(unsafe.Pointer(&info)),
+		uint32(unsafe.Sizeof(info)),
+		nil,
+	); err != nil {
+		return nil, fmt.Errorf("failed to query for job object process stats: %w", err)
+	}
+	return &info, nil
+}
+
+// QueryStorageStats gets the storage (I/O) stats for the job object.
+func (job *JobObject) QueryStorageStats() (*winapi.JOBOBJECT_IO_ATTRIBUTION_INFORMATION, error) {
+	job.handleLock.RLock()
+	defer job.handleLock.RUnlock()
+
+	if job.handle == 0 {
+		return nil, ErrAlreadyClosed
+	}
+
+	info := winapi.JOBOBJECT_IO_ATTRIBUTION_INFORMATION{
+		ControlFlags: winapi.JOBOBJECT_IO_ATTRIBUTION_CONTROL_ENABLE,
+	}
+	if err := winapi.QueryInformationJobObject(
+		job.handle,
+		winapi.JobObjectIoAttribution,
+		uintptr(unsafe.Pointer(&info)),
+		uint32(unsafe.Sizeof(info)),
+		nil,
+	); err != nil {
+		return nil, fmt.Errorf("failed to query for job object storage stats: %w", err)
+	}
+	return &info, nil
+}
+
+// QueryPrivateWorkingSet returns the private working set size for the job. This is calculated by adding up the
+// private working set for every process running in the job.
+func (job *JobObject) QueryPrivateWorkingSet() (uint64, error) {
+	pids, err := job.Pids()
+	if err != nil {
+		return 0, err
+	}
+
+	openAndQueryWorkingSet := func(pid uint32) (uint64, error) {
+		h, err := windows.OpenProcess(windows.PROCESS_QUERY_LIMITED_INFORMATION, false, pid)
+		if err != nil {
+			// Continue to the next if OpenProcess doesn't return a valid handle (fails). Handles a
+			// case where one of the pids in the job exited before we open.
+			return 0, nil
+		}
+		defer func() {
+			_ = windows.Close(h)
+		}()
+		// Check if the process is actually running in the job still. There's a small chance
+		// that the process could have exited and had its pid re-used between grabbing the pids
+		// in the job and opening the handle to it above.
+		var inJob int32
+		if err := winapi.IsProcessInJob(h, job.handle, &inJob); err != nil {
+			// This shouldn't fail unless we have incorrect access rights which we control
+			// here so probably best to error out if this failed.
+			return 0, err
+		}
+		// Don't report stats for this process as it's not running in the job. This shouldn't be
+		// an error condition though.
+		if inJob == 0 {
+			return 0, nil
+		}
+
+		var vmCounters winapi.VM_COUNTERS_EX2
+		status := winapi.NtQueryInformationProcess(
+			h,
+			winapi.ProcessVmCounters,
+			uintptr(unsafe.Pointer(&vmCounters)),
+			uint32(unsafe.Sizeof(vmCounters)),
+			nil,
+		)
+		if !winapi.NTSuccess(status) {
+			return 0, fmt.Errorf("failed to query information for process: %w", winapi.RtlNtStatusToDosError(status))
+		}
+		return uint64(vmCounters.PrivateWorkingSetSize), nil
+	}
+
+	var jobWorkingSetSize uint64
+	for _, pid := range pids {
+		workingSet, err := openAndQueryWorkingSet(pid)
+		if err != nil {
+			return 0, err
+		}
+		jobWorkingSetSize += workingSet
+	}
+
+	return jobWorkingSetSize, nil
+}

+ 315 - 0
vendor/github.com/Microsoft/hcsshim/internal/jobobject/limits.go

@@ -0,0 +1,315 @@
+package jobobject
+
+import (
+	"errors"
+	"fmt"
+	"unsafe"
+
+	"github.com/Microsoft/hcsshim/internal/winapi"
+	"golang.org/x/sys/windows"
+)
+
+const (
+	memoryLimitMax uint64 = 0xffffffffffffffff
+)
+
+func isFlagSet(flag, controlFlags uint32) bool {
+	return (flag & controlFlags) == flag
+}
+
+// SetResourceLimits sets resource limits on the job object (cpu, memory, storage).
+func (job *JobObject) SetResourceLimits(limits *JobLimits) error {
+	// Go through and check what limits were specified and apply them to the job.
+	if limits.MemoryLimitInBytes != 0 {
+		if err := job.SetMemoryLimit(limits.MemoryLimitInBytes); err != nil {
+			return fmt.Errorf("failed to set job object memory limit: %w", err)
+		}
+	}
+
+	if limits.CPULimit != 0 {
+		if err := job.SetCPULimit(RateBased, limits.CPULimit); err != nil {
+			return fmt.Errorf("failed to set job object cpu limit: %w", err)
+		}
+	} else if limits.CPUWeight != 0 {
+		if err := job.SetCPULimit(WeightBased, limits.CPUWeight); err != nil {
+			return fmt.Errorf("failed to set job object cpu limit: %w", err)
+		}
+	}
+
+	if limits.MaxBandwidth != 0 || limits.MaxIOPS != 0 {
+		if err := job.SetIOLimit(limits.MaxBandwidth, limits.MaxIOPS); err != nil {
+			return fmt.Errorf("failed to set io limit on job object: %w", err)
+		}
+	}
+	return nil
+}
+
+// SetTerminateOnLastHandleClose sets the job object flag that specifies that the job should terminate
+// all processes in the job on the last open handle being closed.
+func (job *JobObject) SetTerminateOnLastHandleClose() error {
+	info, err := job.getExtendedInformation()
+	if err != nil {
+		return err
+	}
+	info.BasicLimitInformation.LimitFlags |= windows.JOB_OBJECT_LIMIT_KILL_ON_JOB_CLOSE
+	return job.setExtendedInformation(info)
+}
+
+// SetMemoryLimit sets the memory limit of the job object based on the given `memoryLimitInBytes`.
+func (job *JobObject) SetMemoryLimit(memoryLimitInBytes uint64) error {
+	if memoryLimitInBytes >= memoryLimitMax {
+		return errors.New("memory limit specified exceeds the max size")
+	}
+
+	info, err := job.getExtendedInformation()
+	if err != nil {
+		return err
+	}
+
+	info.JobMemoryLimit = uintptr(memoryLimitInBytes)
+	info.BasicLimitInformation.LimitFlags |= windows.JOB_OBJECT_LIMIT_JOB_MEMORY
+	return job.setExtendedInformation(info)
+}
+
+// GetMemoryLimit gets the memory limit in bytes of the job object.
+func (job *JobObject) GetMemoryLimit() (uint64, error) {
+	info, err := job.getExtendedInformation()
+	if err != nil {
+		return 0, err
+	}
+	return uint64(info.JobMemoryLimit), nil
+}
+
+// SetCPULimit sets the CPU limit depending on the specified `CPURateControlType` to
+// `rateControlValue` for the job object.
+func (job *JobObject) SetCPULimit(rateControlType CPURateControlType, rateControlValue uint32) error {
+	cpuInfo, err := job.getCPURateControlInformation()
+	if err != nil {
+		return err
+	}
+	switch rateControlType {
+	case WeightBased:
+		if rateControlValue < cpuWeightMin || rateControlValue > cpuWeightMax {
+			return fmt.Errorf("processor weight value of `%d` is invalid", rateControlValue)
+		}
+		cpuInfo.ControlFlags |= winapi.JOB_OBJECT_CPU_RATE_CONTROL_ENABLE | winapi.JOB_OBJECT_CPU_RATE_CONTROL_WEIGHT_BASED
+		cpuInfo.Value = rateControlValue
+	case RateBased:
+		if rateControlValue < cpuLimitMin || rateControlValue > cpuLimitMax {
+			return fmt.Errorf("processor rate of `%d` is invalid", rateControlValue)
+		}
+		cpuInfo.ControlFlags |= winapi.JOB_OBJECT_CPU_RATE_CONTROL_ENABLE | winapi.JOB_OBJECT_CPU_RATE_CONTROL_HARD_CAP
+		cpuInfo.Value = rateControlValue
+	default:
+		return errors.New("invalid job object cpu rate control type")
+	}
+	return job.setCPURateControlInfo(cpuInfo)
+}
+
+// GetCPULimit gets the cpu limits for the job object.
+// `rateControlType` is used to indicate what type of cpu limit to query for.
+func (job *JobObject) GetCPULimit(rateControlType CPURateControlType) (uint32, error) {
+	info, err := job.getCPURateControlInformation()
+	if err != nil {
+		return 0, err
+	}
+
+	if !isFlagSet(winapi.JOB_OBJECT_CPU_RATE_CONTROL_ENABLE, info.ControlFlags) {
+		return 0, errors.New("the job does not have cpu rate control enabled")
+	}
+
+	switch rateControlType {
+	case WeightBased:
+		if !isFlagSet(winapi.JOB_OBJECT_CPU_RATE_CONTROL_WEIGHT_BASED, info.ControlFlags) {
+			return 0, errors.New("cannot get cpu weight for job object without cpu weight option set")
+		}
+	case RateBased:
+		if !isFlagSet(winapi.JOB_OBJECT_CPU_RATE_CONTROL_HARD_CAP, info.ControlFlags) {
+			return 0, errors.New("cannot get cpu rate hard cap for job object without cpu rate hard cap option set")
+		}
+	default:
+		return 0, errors.New("invalid job object cpu rate control type")
+	}
+	return info.Value, nil
+}
+
+// SetCPUAffinity sets the processor affinity for the job object.
+// The affinity is passed in as a bitmask.
+func (job *JobObject) SetCPUAffinity(affinityBitMask uint64) error {
+	info, err := job.getExtendedInformation()
+	if err != nil {
+		return err
+	}
+	info.BasicLimitInformation.LimitFlags |= uint32(windows.JOB_OBJECT_LIMIT_AFFINITY)
+	info.BasicLimitInformation.Affinity = uintptr(affinityBitMask)
+	return job.setExtendedInformation(info)
+}
+
+// GetCPUAffinity gets the processor affinity for the job object.
+// The returned affinity is a bitmask.
+func (job *JobObject) GetCPUAffinity() (uint64, error) {
+	info, err := job.getExtendedInformation()
+	if err != nil {
+		return 0, err
+	}
+	return uint64(info.BasicLimitInformation.Affinity), nil
+}
+
+// SetIOLimit sets the IO limits specified on the job object.
+func (job *JobObject) SetIOLimit(maxBandwidth, maxIOPS int64) error {
+	ioInfo, err := job.getIOLimit()
+	if err != nil {
+		return err
+	}
+	ioInfo.ControlFlags |= winapi.JOB_OBJECT_IO_RATE_CONTROL_ENABLE
+	if maxBandwidth != 0 {
+		ioInfo.MaxBandwidth = maxBandwidth
+	}
+	if maxIOPS != 0 {
+		ioInfo.MaxIops = maxIOPS
+	}
+	return job.setIORateControlInfo(ioInfo)
+}
+
+// GetIOMaxBandwidthLimit gets the max bandwidth for the job object.
+func (job *JobObject) GetIOMaxBandwidthLimit() (int64, error) {
+	info, err := job.getIOLimit()
+	if err != nil {
+		return 0, err
+	}
+	return info.MaxBandwidth, nil
+}
+
+// GetIOMaxIopsLimit gets the max iops for the job object.
+func (job *JobObject) GetIOMaxIopsLimit() (int64, error) {
+	info, err := job.getIOLimit()
+	if err != nil {
+		return 0, err
+	}
+	return info.MaxIops, nil
+}
+
+// Helper function for getting a job object's extended information.
+func (job *JobObject) getExtendedInformation() (*windows.JOBOBJECT_EXTENDED_LIMIT_INFORMATION, error) {
+	job.handleLock.RLock()
+	defer job.handleLock.RUnlock()
+
+	if job.handle == 0 {
+		return nil, ErrAlreadyClosed
+	}
+
+	info := windows.JOBOBJECT_EXTENDED_LIMIT_INFORMATION{}
+	if err := winapi.QueryInformationJobObject(
+		job.handle,
+		windows.JobObjectExtendedLimitInformation,
+		uintptr(unsafe.Pointer(&info)),
+		uint32(unsafe.Sizeof(info)),
+		nil,
+	); err != nil {
+		return nil, fmt.Errorf("query %v returned error: %w", info, err)
+	}
+	return &info, nil
+}
+
+// Helper function for getting a job object's CPU rate control information.
+func (job *JobObject) getCPURateControlInformation() (*winapi.JOBOBJECT_CPU_RATE_CONTROL_INFORMATION, error) {
+	job.handleLock.RLock()
+	defer job.handleLock.RUnlock()
+
+	if job.handle == 0 {
+		return nil, ErrAlreadyClosed
+	}
+
+	info := winapi.JOBOBJECT_CPU_RATE_CONTROL_INFORMATION{}
+	if err := winapi.QueryInformationJobObject(
+		job.handle,
+		windows.JobObjectCpuRateControlInformation,
+		uintptr(unsafe.Pointer(&info)),
+		uint32(unsafe.Sizeof(info)),
+		nil,
+	); err != nil {
+		return nil, fmt.Errorf("query %v returned error: %w", info, err)
+	}
+	return &info, nil
+}
+
+// Helper function for setting a job object's extended information.
+func (job *JobObject) setExtendedInformation(info *windows.JOBOBJECT_EXTENDED_LIMIT_INFORMATION) error {
+	job.handleLock.RLock()
+	defer job.handleLock.RUnlock()
+
+	if job.handle == 0 {
+		return ErrAlreadyClosed
+	}
+
+	if _, err := windows.SetInformationJobObject(
+		job.handle,
+		windows.JobObjectExtendedLimitInformation,
+		uintptr(unsafe.Pointer(info)),
+		uint32(unsafe.Sizeof(*info)),
+	); err != nil {
+		return fmt.Errorf("failed to set Extended info %v on job object: %w", info, err)
+	}
+	return nil
+}
+
+// Helper function for querying job handle for IO limit information.
+func (job *JobObject) getIOLimit() (*winapi.JOBOBJECT_IO_RATE_CONTROL_INFORMATION, error) {
+	job.handleLock.RLock()
+	defer job.handleLock.RUnlock()
+
+	if job.handle == 0 {
+		return nil, ErrAlreadyClosed
+	}
+
+	ioInfo := &winapi.JOBOBJECT_IO_RATE_CONTROL_INFORMATION{}
+	var blockCount uint32 = 1
+
+	if _, err := winapi.QueryIoRateControlInformationJobObject(
+		job.handle,
+		nil,
+		&ioInfo,
+		&blockCount,
+	); err != nil {
+		return nil, fmt.Errorf("query %v returned error: %w", ioInfo, err)
+	}
+
+	if !isFlagSet(winapi.JOB_OBJECT_IO_RATE_CONTROL_ENABLE, ioInfo.ControlFlags) {
+		return nil, fmt.Errorf("query %v cannot get IO limits for job object without IO rate control option set", ioInfo)
+	}
+	return ioInfo, nil
+}
+
+// Helper function for setting a job object's IO rate control information.
+func (job *JobObject) setIORateControlInfo(ioInfo *winapi.JOBOBJECT_IO_RATE_CONTROL_INFORMATION) error {
+	job.handleLock.RLock()
+	defer job.handleLock.RUnlock()
+
+	if job.handle == 0 {
+		return ErrAlreadyClosed
+	}
+
+	if _, err := winapi.SetIoRateControlInformationJobObject(job.handle, ioInfo); err != nil {
+		return fmt.Errorf("failed to set IO limit info %v on job object: %w", ioInfo, err)
+	}
+	return nil
+}
+
+// Helper function for setting a job object's CPU rate control information.
+func (job *JobObject) setCPURateControlInfo(cpuInfo *winapi.JOBOBJECT_CPU_RATE_CONTROL_INFORMATION) error {
+	job.handleLock.RLock()
+	defer job.handleLock.RUnlock()
+
+	if job.handle == 0 {
+		return ErrAlreadyClosed
+	}
+	if _, err := windows.SetInformationJobObject(
+		job.handle,
+		windows.JobObjectCpuRateControlInformation,
+		uintptr(unsafe.Pointer(cpuInfo)),
+		uint32(unsafe.Sizeof(cpuInfo)),
+	); err != nil {
+		return fmt.Errorf("failed to set cpu limit info %v on job object: %w", cpuInfo, err)
+	}
+	return nil
+}

+ 111 - 0
vendor/github.com/Microsoft/hcsshim/internal/queue/mq.go

@@ -0,0 +1,111 @@
+package queue
+
+import (
+	"errors"
+	"sync"
+)
+
+var (
+	ErrQueueClosed = errors.New("the queue is closed for reading and writing")
+	ErrQueueEmpty  = errors.New("the queue is empty")
+)
+
+// MessageQueue represents a threadsafe message queue to be used to retrieve or
+// write messages to.
+type MessageQueue struct {
+	m        *sync.RWMutex
+	c        *sync.Cond
+	messages []interface{}
+	closed   bool
+}
+
+// NewMessageQueue returns a new MessageQueue.
+func NewMessageQueue() *MessageQueue {
+	m := &sync.RWMutex{}
+	return &MessageQueue{
+		m:        m,
+		c:        sync.NewCond(m),
+		messages: []interface{}{},
+	}
+}
+
+// Write writes `msg` to the queue.
+func (mq *MessageQueue) Write(msg interface{}) error {
+	mq.m.Lock()
+	defer mq.m.Unlock()
+
+	if mq.closed {
+		return ErrQueueClosed
+	}
+	mq.messages = append(mq.messages, msg)
+	// Signal a waiter that there is now a value available in the queue.
+	mq.c.Signal()
+	return nil
+}
+
+// Read will read a value from the queue if available, otherwise return an error.
+func (mq *MessageQueue) Read() (interface{}, error) {
+	mq.m.Lock()
+	defer mq.m.Unlock()
+	if mq.closed {
+		return nil, ErrQueueClosed
+	}
+	if mq.isEmpty() {
+		return nil, ErrQueueEmpty
+	}
+	val := mq.messages[0]
+	mq.messages[0] = nil
+	mq.messages = mq.messages[1:]
+	return val, nil
+}
+
+// ReadOrWait will read a value from the queue if available, else it will wait for a
+// value to become available. This will block forever if nothing gets written or until
+// the queue gets closed.
+func (mq *MessageQueue) ReadOrWait() (interface{}, error) {
+	mq.m.Lock()
+	if mq.closed {
+		mq.m.Unlock()
+		return nil, ErrQueueClosed
+	}
+	if mq.isEmpty() {
+		for !mq.closed && mq.isEmpty() {
+			mq.c.Wait()
+		}
+		mq.m.Unlock()
+		return mq.Read()
+	}
+	val := mq.messages[0]
+	mq.messages[0] = nil
+	mq.messages = mq.messages[1:]
+	mq.m.Unlock()
+	return val, nil
+}
+
+// IsEmpty returns if the queue is empty
+func (mq *MessageQueue) IsEmpty() bool {
+	mq.m.RLock()
+	defer mq.m.RUnlock()
+	return len(mq.messages) == 0
+}
+
+// Nonexported empty check that doesn't lock so we can call this in Read and Write.
+func (mq *MessageQueue) isEmpty() bool {
+	return len(mq.messages) == 0
+}
+
+// Close closes the queue for future writes or reads. Any attempts to read or write from the
+// queue after close will return ErrQueueClosed. This is safe to call multiple times.
+func (mq *MessageQueue) Close() {
+	mq.m.Lock()
+	defer mq.m.Unlock()
+	// Already closed
+	if mq.closed {
+		return
+	}
+	mq.messages = nil
+	mq.closed = true
+	// If there's anybody currently waiting on a value from ReadOrWait, we need to
+	// broadcast so the read(s) can return ErrQueueClosed.
+	mq.c.Broadcast()
+}

+ 0 - 3
vendor/github.com/Microsoft/hcsshim/internal/winapi/iocp.go

@@ -1,3 +0,0 @@
-package winapi
-
-//sys GetQueuedCompletionStatus(cphandle windows.Handle, qty *uint32, key *uintptr, overlapped **windows.Overlapped, timeout uint32) (err error)

+ 6 - 3
vendor/github.com/Microsoft/hcsshim/internal/winapi/jobobject.go

@@ -24,7 +24,10 @@ const (
 // Access rights for creating or opening job objects.
 //
 // https://docs.microsoft.com/en-us/windows/win32/procthread/job-object-security-and-access-rights
-const JOB_OBJECT_ALL_ACCESS = 0x1F001F
+const (
+	JOB_OBJECT_QUERY      = 0x0004
+	JOB_OBJECT_ALL_ACCESS = 0x1F001F
+)
 
 // IO limit flags
 //
@@ -93,7 +96,7 @@ type JOBOBJECT_BASIC_PROCESS_ID_LIST struct {
 
 // AllPids returns all the process Ids in the job object.
 func (p *JOBOBJECT_BASIC_PROCESS_ID_LIST) AllPids() []uintptr {
-	return (*[(1 << 27) - 1]uintptr)(unsafe.Pointer(&p.ProcessIdList[0]))[:p.NumberOfProcessIdsInList]
+	return (*[(1 << 27) - 1]uintptr)(unsafe.Pointer(&p.ProcessIdList[0]))[:p.NumberOfProcessIdsInList:p.NumberOfProcessIdsInList]
 }
 
 // https://docs.microsoft.com/en-us/windows/win32/api/winnt/ns-winnt-jobobject_basic_accounting_information
@@ -162,7 +165,7 @@ type JOBOBJECT_ASSOCIATE_COMPLETION_PORT struct {
 // 		PBOOL  Result
 // );
 //
-//sys IsProcessInJob(procHandle windows.Handle, jobHandle windows.Handle, result *bool) (err error) = kernel32.IsProcessInJob
+//sys IsProcessInJob(procHandle windows.Handle, jobHandle windows.Handle, result *int32) (err error) = kernel32.IsProcessInJob
 
 // BOOL QueryInformationJobObject(
 //		HANDLE             hJob,

+ 57 - 0
vendor/github.com/Microsoft/hcsshim/internal/winapi/process.go

@@ -6,3 +6,60 @@ const (
 	PROC_THREAD_ATTRIBUTE_PSEUDOCONSOLE = 0x20016
 	PROC_THREAD_ATTRIBUTE_JOB_LIST      = 0x2000D
 )
+
+// ProcessVmCounters corresponds to the _VM_COUNTERS_EX and _VM_COUNTERS_EX2 structures.
+const ProcessVmCounters = 3
+
+// __kernel_entry NTSTATUS NtQueryInformationProcess(
+// 	[in]            HANDLE           ProcessHandle,
+// 	[in]            PROCESSINFOCLASS ProcessInformationClass,
+// 	[out]           PVOID            ProcessInformation,
+// 	[in]            ULONG            ProcessInformationLength,
+// 	[out, optional] PULONG           ReturnLength
+// );
+//
+//sys NtQueryInformationProcess(processHandle windows.Handle, processInfoClass uint32, processInfo uintptr, processInfoLength uint32, returnLength *uint32) (status uint32) = ntdll.NtQueryInformationProcess
+
+// typedef struct _VM_COUNTERS_EX
+// {
+//    SIZE_T PeakVirtualSize;
+//    SIZE_T VirtualSize;
+//    ULONG PageFaultCount;
+//    SIZE_T PeakWorkingSetSize;
+//    SIZE_T WorkingSetSize;
+//    SIZE_T QuotaPeakPagedPoolUsage;
+//    SIZE_T QuotaPagedPoolUsage;
+//    SIZE_T QuotaPeakNonPagedPoolUsage;
+//    SIZE_T QuotaNonPagedPoolUsage;
+//    SIZE_T PagefileUsage;
+//    SIZE_T PeakPagefileUsage;
+//    SIZE_T PrivateUsage;
+// } VM_COUNTERS_EX, *PVM_COUNTERS_EX;
+//
+type VM_COUNTERS_EX struct {
+	PeakVirtualSize            uintptr
+	VirtualSize                uintptr
+	PageFaultCount             uint32
+	PeakWorkingSetSize         uintptr
+	WorkingSetSize             uintptr
+	QuotaPeakPagedPoolUsage    uintptr
+	QuotaPagedPoolUsage        uintptr
+	QuotaPeakNonPagedPoolUsage uintptr
+	QuotaNonPagedPoolUsage     uintptr
+	PagefileUsage              uintptr
+	PeakPagefileUsage          uintptr
+	PrivateUsage               uintptr
+}
+
+// typedef struct _VM_COUNTERS_EX2
+// {
+//    VM_COUNTERS_EX CountersEx;
+//    SIZE_T PrivateWorkingSetSize;
+//    SIZE_T SharedCommitUsage;
+// } VM_COUNTERS_EX2, *PVM_COUNTERS_EX2;
+//
+type VM_COUNTERS_EX2 struct {
+	CountersEx            VM_COUNTERS_EX
+	PrivateWorkingSetSize uintptr
+	SharedCommitUsage     uintptr
+}

+ 1 - 1
vendor/github.com/Microsoft/hcsshim/internal/winapi/winapi.go

@@ -2,4 +2,4 @@
 // be thought of as an extension to golang.org/x/sys/windows.
 package winapi
 
-//go:generate go run ..\..\mksyscall_windows.go -output zsyscall_windows.go console.go system.go net.go path.go thread.go iocp.go jobobject.go logon.go memory.go process.go processor.go devices.go filesystem.go errors.go
+//go:generate go run ..\..\mksyscall_windows.go -output zsyscall_windows.go user.go console.go system.go net.go path.go thread.go jobobject.go logon.go memory.go process.go processor.go devices.go filesystem.go errors.go

+ 8 - 14
vendor/github.com/Microsoft/hcsshim/internal/winapi/zsyscall_windows.go

@@ -50,7 +50,6 @@ var (
 	procSetJobCompartmentId                    = modiphlpapi.NewProc("SetJobCompartmentId")
 	procSearchPathW                            = modkernel32.NewProc("SearchPathW")
 	procCreateRemoteThread                     = modkernel32.NewProc("CreateRemoteThread")
-	procGetQueuedCompletionStatus              = modkernel32.NewProc("GetQueuedCompletionStatus")
 	procIsProcessInJob                         = modkernel32.NewProc("IsProcessInJob")
 	procQueryInformationJobObject              = modkernel32.NewProc("QueryInformationJobObject")
 	procOpenJobObjectW                         = modkernel32.NewProc("OpenJobObjectW")
@@ -61,6 +60,7 @@ var (
 	procLogonUserW                             = modadvapi32.NewProc("LogonUserW")
 	procLocalAlloc                             = modkernel32.NewProc("LocalAlloc")
 	procLocalFree                              = modkernel32.NewProc("LocalFree")
+	procNtQueryInformationProcess              = modntdll.NewProc("NtQueryInformationProcess")
 	procGetActiveProcessorCount                = modkernel32.NewProc("GetActiveProcessorCount")
 	procCM_Get_Device_ID_List_SizeA            = modcfgmgr32.NewProc("CM_Get_Device_ID_List_SizeA")
 	procCM_Get_Device_ID_ListA                 = modcfgmgr32.NewProc("CM_Get_Device_ID_ListA")
@@ -140,19 +140,7 @@ func CreateRemoteThread(process windows.Handle, sa *windows.SecurityAttributes,
 	return
 }
 
-func GetQueuedCompletionStatus(cphandle windows.Handle, qty *uint32, key *uintptr, overlapped **windows.Overlapped, timeout uint32) (err error) {
-	r1, _, e1 := syscall.Syscall6(procGetQueuedCompletionStatus.Addr(), 5, uintptr(cphandle), uintptr(unsafe.Pointer(qty)), uintptr(unsafe.Pointer(key)), uintptr(unsafe.Pointer(overlapped)), uintptr(timeout), 0)
-	if r1 == 0 {
-		if e1 != 0 {
-			err = errnoErr(e1)
-		} else {
-			err = syscall.EINVAL
-		}
-	}
-	return
-}
-
-func IsProcessInJob(procHandle windows.Handle, jobHandle windows.Handle, result *bool) (err error) {
+func IsProcessInJob(procHandle windows.Handle, jobHandle windows.Handle, result *int32) (err error) {
 	r1, _, e1 := syscall.Syscall(procIsProcessInJob.Addr(), 3, uintptr(procHandle), uintptr(jobHandle), uintptr(unsafe.Pointer(result)))
 	if r1 == 0 {
 		if e1 != 0 {
@@ -256,6 +244,12 @@ func LocalFree(ptr uintptr) {
 	return
 }
 
+func NtQueryInformationProcess(processHandle windows.Handle, processInfoClass uint32, processInfo uintptr, processInfoLength uint32, returnLength *uint32) (status uint32) {
+	r0, _, _ := syscall.Syscall6(procNtQueryInformationProcess.Addr(), 5, uintptr(processHandle), uintptr(processInfoClass), uintptr(processInfo), uintptr(processInfoLength), uintptr(unsafe.Pointer(returnLength)), 0)
+	status = uint32(r0)
+	return
+}
+
 func GetActiveProcessorCount(groupNumber uint16) (amount uint32) {
 	r0, _, _ := syscall.Syscall(procGetActiveProcessorCount.Addr(), 1, uintptr(groupNumber), 0, 0)
 	amount = uint32(r0)

+ 3 - 1
vendor/modules.txt

@@ -27,7 +27,7 @@ github.com/Microsoft/go-winio/pkg/etwlogrus
 github.com/Microsoft/go-winio/pkg/guid
 github.com/Microsoft/go-winio/pkg/security
 github.com/Microsoft/go-winio/vhd
-# github.com/Microsoft/hcsshim v0.9.2
+# github.com/Microsoft/hcsshim v0.9.3
 ## explicit; go 1.13
 github.com/Microsoft/hcsshim
 github.com/Microsoft/hcsshim/cmd/containerd-shim-runhcs-v1/options
@@ -39,11 +39,13 @@ github.com/Microsoft/hcsshim/internal/hcs/schema2
 github.com/Microsoft/hcsshim/internal/hcserror
 github.com/Microsoft/hcsshim/internal/hns
 github.com/Microsoft/hcsshim/internal/interop
+github.com/Microsoft/hcsshim/internal/jobobject
 github.com/Microsoft/hcsshim/internal/log
 github.com/Microsoft/hcsshim/internal/logfields
 github.com/Microsoft/hcsshim/internal/longpath
 github.com/Microsoft/hcsshim/internal/mergemaps
 github.com/Microsoft/hcsshim/internal/oc
+github.com/Microsoft/hcsshim/internal/queue
 github.com/Microsoft/hcsshim/internal/safefile
 github.com/Microsoft/hcsshim/internal/timeout
 github.com/Microsoft/hcsshim/internal/vmcompute