123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105 |
- package daemon
- import (
- "os"
- "os/exec"
- "strconv"
- "strings"
- "github.com/containerd/containerd/contrib/nvidia"
- "github.com/docker/docker/pkg/capabilities"
- specs "github.com/opencontainers/runtime-spec/specs-go"
- "github.com/pkg/errors"
- )
- // TODO: nvidia should not be hard-coded, and should be a device plugin instead on the daemon object.
- // TODO: add list of device capabilities in daemon/node info
- var errConflictCountDeviceIDs = errors.New("cannot set both Count and DeviceIDs on device request")
- const nvidiaHook = "nvidia-container-runtime-hook"
- // These are NVIDIA-specific capabilities stolen from github.com/containerd/containerd/contrib/nvidia.allCaps
- var allNvidiaCaps = map[nvidia.Capability]struct{}{
- nvidia.Compute: {},
- nvidia.Compat32: {},
- nvidia.Graphics: {},
- nvidia.Utility: {},
- nvidia.Video: {},
- nvidia.Display: {},
- }
- func init() {
- if _, err := exec.LookPath(nvidiaHook); err != nil {
- // do not register Nvidia driver if helper binary is not present.
- return
- }
- capset := capabilities.Set{"gpu": struct{}{}, "nvidia": struct{}{}}
- nvidiaDriver := &deviceDriver{
- capset: capset,
- updateSpec: setNvidiaGPUs,
- }
- for c := range allNvidiaCaps {
- nvidiaDriver.capset[string(c)] = struct{}{}
- }
- registerDeviceDriver("nvidia", nvidiaDriver)
- }
- func setNvidiaGPUs(s *specs.Spec, dev *deviceInstance) error {
- req := dev.req
- if req.Count != 0 && len(req.DeviceIDs) > 0 {
- return errConflictCountDeviceIDs
- }
- if len(req.DeviceIDs) > 0 {
- s.Process.Env = append(s.Process.Env, "NVIDIA_VISIBLE_DEVICES="+strings.Join(req.DeviceIDs, ","))
- } else if req.Count > 0 {
- s.Process.Env = append(s.Process.Env, "NVIDIA_VISIBLE_DEVICES="+countToDevices(req.Count))
- } else if req.Count < 0 {
- s.Process.Env = append(s.Process.Env, "NVIDIA_VISIBLE_DEVICES=all")
- }
- var nvidiaCaps []string
- // req.Capabilities contains device capabilities, some but not all are NVIDIA driver capabilities.
- for _, c := range dev.selectedCaps {
- nvcap := nvidia.Capability(c)
- if _, isNvidiaCap := allNvidiaCaps[nvcap]; isNvidiaCap {
- nvidiaCaps = append(nvidiaCaps, c)
- continue
- }
- // TODO: nvidia.WithRequiredCUDAVersion
- // for now we let the prestart hook verify cuda versions but errors are not pretty.
- }
- if nvidiaCaps != nil {
- s.Process.Env = append(s.Process.Env, "NVIDIA_DRIVER_CAPABILITIES="+strings.Join(nvidiaCaps, ","))
- }
- path, err := exec.LookPath(nvidiaHook)
- if err != nil {
- return err
- }
- if s.Hooks == nil {
- s.Hooks = &specs.Hooks{}
- }
- s.Hooks.Prestart = append(s.Hooks.Prestart, specs.Hook{
- Path: path,
- Args: []string{
- nvidiaHook,
- "prestart",
- },
- Env: os.Environ(),
- })
- return nil
- }
// countToDevices returns the comma-separated list "0,1,...,count-1" of device
// IDs. A count of zero or less yields the empty string instead of panicking
// on a negative slice length.
func countToDevices(count int) string {
	var b strings.Builder
	for i := 0; i < count; i++ {
		if i > 0 {
			b.WriteByte(',')
		}
		b.WriteString(strconv.Itoa(i))
	}
	return b.String()
}
|