2019-03-01 00:32:08 +00:00
|
|
|
package daemon
|
|
|
|
|
|
|
|
import (
|
2019-03-26 07:56:17 +00:00
|
|
|
"os"
|
2019-03-01 00:32:08 +00:00
|
|
|
"os/exec"
|
|
|
|
"strconv"
|
2019-03-26 07:56:17 +00:00
|
|
|
"strings"
|
2019-03-01 00:32:08 +00:00
|
|
|
|
|
|
|
"github.com/containerd/containerd/contrib/nvidia"
|
|
|
|
"github.com/docker/docker/pkg/capabilities"
|
2019-08-05 14:37:47 +00:00
|
|
|
specs "github.com/opencontainers/runtime-spec/specs-go"
|
2019-03-01 00:32:08 +00:00
|
|
|
"github.com/pkg/errors"
|
|
|
|
)
|
|
|
|
|
|
|
|
// TODO: nvidia should not be hard-coded; it should be a device plugin on the daemon object instead.
|
|
|
|
// TODO: add list of device capabilities in daemon/node info
|
|
|
|
|
|
|
|
// errConflictCountDeviceIDs is returned by setNvidiaGPUs when a device request
// has a non-zero Count and a non-empty DeviceIDs list at the same time.
var errConflictCountDeviceIDs = errors.New("cannot set both Count and DeviceIDs on device request")
|
|
|
|
|
2019-03-26 07:56:17 +00:00
|
|
|
// nvidiaHook is the name of the helper binary that is resolved with
// exec.LookPath; it gates registration of the driver in init and is installed
// as a prestart hook by setNvidiaGPUs.
const nvidiaHook = "nvidia-container-runtime-hook"
|
2019-03-01 00:32:08 +00:00
|
|
|
|
|
|
|
// These are NVIDIA-specific capabilities stolen from github.com/containerd/containerd/contrib/nvidia.allCaps
|
|
|
|
var allNvidiaCaps = map[nvidia.Capability]struct{}{
|
|
|
|
nvidia.Compute: {},
|
|
|
|
nvidia.Compat32: {},
|
|
|
|
nvidia.Graphics: {},
|
|
|
|
nvidia.Utility: {},
|
|
|
|
nvidia.Video: {},
|
|
|
|
nvidia.Display: {},
|
|
|
|
}
|
|
|
|
|
|
|
|
func init() {
|
2019-03-26 07:56:17 +00:00
|
|
|
if _, err := exec.LookPath(nvidiaHook); err != nil {
|
2019-03-01 00:32:08 +00:00
|
|
|
// do not register Nvidia driver if helper binary is not present.
|
|
|
|
return
|
|
|
|
}
|
|
|
|
capset := capabilities.Set{"gpu": struct{}{}, "nvidia": struct{}{}}
|
|
|
|
nvidiaDriver := &deviceDriver{
|
|
|
|
capset: capset,
|
|
|
|
updateSpec: setNvidiaGPUs,
|
|
|
|
}
|
2019-03-25 22:40:19 +00:00
|
|
|
for c := range allNvidiaCaps {
|
|
|
|
nvidiaDriver.capset[string(c)] = struct{}{}
|
2019-03-01 00:32:08 +00:00
|
|
|
}
|
|
|
|
registerDeviceDriver("nvidia", nvidiaDriver)
|
|
|
|
}
|
|
|
|
|
|
|
|
func setNvidiaGPUs(s *specs.Spec, dev *deviceInstance) error {
|
|
|
|
req := dev.req
|
|
|
|
if req.Count != 0 && len(req.DeviceIDs) > 0 {
|
|
|
|
return errConflictCountDeviceIDs
|
|
|
|
}
|
|
|
|
|
|
|
|
if len(req.DeviceIDs) > 0 {
|
2019-03-26 07:56:17 +00:00
|
|
|
s.Process.Env = append(s.Process.Env, "NVIDIA_VISIBLE_DEVICES="+strings.Join(req.DeviceIDs, ","))
|
2019-03-01 00:32:08 +00:00
|
|
|
} else if req.Count > 0 {
|
2019-03-26 07:56:17 +00:00
|
|
|
s.Process.Env = append(s.Process.Env, "NVIDIA_VISIBLE_DEVICES="+countToDevices(req.Count))
|
|
|
|
} else if req.Count < 0 {
|
|
|
|
s.Process.Env = append(s.Process.Env, "NVIDIA_VISIBLE_DEVICES=all")
|
2019-03-01 00:32:08 +00:00
|
|
|
}
|
|
|
|
|
2019-03-26 07:56:17 +00:00
|
|
|
var nvidiaCaps []string
|
2019-03-01 00:32:08 +00:00
|
|
|
// req.Capabilities contains device capabilities, some but not all are NVIDIA driver capabilities.
|
|
|
|
for _, c := range dev.selectedCaps {
|
|
|
|
nvcap := nvidia.Capability(c)
|
|
|
|
if _, isNvidiaCap := allNvidiaCaps[nvcap]; isNvidiaCap {
|
2019-03-26 07:56:17 +00:00
|
|
|
nvidiaCaps = append(nvidiaCaps, c)
|
2019-03-01 00:32:08 +00:00
|
|
|
continue
|
|
|
|
}
|
|
|
|
// TODO: nvidia.WithRequiredCUDAVersion
|
|
|
|
// for now we let the prestart hook verify cuda versions but errors are not pretty.
|
|
|
|
}
|
|
|
|
|
|
|
|
if nvidiaCaps != nil {
|
2019-03-26 07:56:17 +00:00
|
|
|
s.Process.Env = append(s.Process.Env, "NVIDIA_DRIVER_CAPABILITIES="+strings.Join(nvidiaCaps, ","))
|
|
|
|
}
|
|
|
|
|
|
|
|
path, err := exec.LookPath(nvidiaHook)
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
|
|
|
if s.Hooks == nil {
|
|
|
|
s.Hooks = &specs.Hooks{}
|
2019-03-01 00:32:08 +00:00
|
|
|
}
|
daemon: add nolint-comments for deprecated kernel-memory options, hooks
This adds some nolint-comments for the deprecated kernel-memory options; we
deprecated these, but they could technically still be accepted by alternative
runtimes.
daemon/daemon_unix.go:108:3: SA1019: memory.Kernel is deprecated: kernel-memory limits are not supported in cgroups v2, and were obsoleted in [kernel v5.4]. This field should no longer be used, as it may be ignored by runtimes. (staticcheck)
memory.Kernel = &config.KernelMemory
^
daemon/update_linux.go:63:3: SA1019: memory.Kernel is deprecated: kernel-memory limits are not supported in cgroups v2, and were obsoleted in [kernel v5.4]. This field should no longer be used, as it may be ignored by runtimes. (staticcheck)
memory.Kernel = &resources.KernelMemory
^
Prestart hooks are deprecated, and more granular hooks should be used instead.
CreateRuntime are the closest equivalent, and executed in the same locations
as Prestart-hooks, but depending on what these hooks do, possibly one of the
other hooks could be used instead (such as CreateContainer or StartContainer).
As these hooks are still supported, this patch adds nolint comments, but adds
some TODOs to consider migrating to something else;
daemon/nvidia_linux.go:86:2: SA1019: s.Hooks.Prestart is deprecated: use [Hooks.CreateRuntime], [Hooks.CreateContainer], and [Hooks.StartContainer] instead, which allow more granular hook control during the create and start phase. (staticcheck)
s.Hooks.Prestart = append(s.Hooks.Prestart, specs.Hook{
^
daemon/oci_linux.go:76:5: SA1019: s.Hooks.Prestart is deprecated: use [Hooks.CreateRuntime], [Hooks.CreateContainer], and [Hooks.StartContainer] instead, which allow more granular hook control during the create and start phase. (staticcheck)
s.Hooks.Prestart = append(s.Hooks.Prestart, specs.Hook{
^
Signed-off-by: Sebastiaan van Stijn <github@gone.nl>
2024-04-15 13:53:55 +00:00
|
|
|
|
|
|
|
// This implementation uses prestart hooks, which are deprecated.
|
|
|
|
// CreateRuntime is the closest equivalent, and executed in the same
|
|
|
|
// locations as prestart-hooks, but depending on what these hooks do,
|
|
|
|
// possibly one of the other hooks could be used instead (such as
|
|
|
|
// CreateContainer or StartContainer).
|
|
|
|
s.Hooks.Prestart = append(s.Hooks.Prestart, specs.Hook{ //nolint:staticcheck // FIXME(thaJeztah); replace prestart hook with a non-deprecated one.
|
2019-03-26 07:56:17 +00:00
|
|
|
Path: path,
|
|
|
|
Args: []string{
|
|
|
|
nvidiaHook,
|
|
|
|
"prestart",
|
|
|
|
},
|
|
|
|
Env: os.Environ(),
|
|
|
|
})
|
2019-03-01 00:32:08 +00:00
|
|
|
|
2019-03-26 07:56:17 +00:00
|
|
|
return nil
|
2019-03-01 00:32:08 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// countToDevices returns the comma-separated list "0,1,...,count-1" of device
// IDs. It returns the empty string when count is zero or negative.
func countToDevices(count int) string {
	if count <= 0 {
		// make panics on a negative length; there are no devices to list.
		return ""
	}
	devices := make([]string, count)
	for i := range devices {
		devices[i] = strconv.Itoa(i)
	}
	return strings.Join(devices, ",")
}
|