gpu_util.go 2.7 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283
  1. /*
  2. Copyright 2017 The Kubernetes Authors.
  3. Licensed under the Apache License, Version 2.0 (the "License");
  4. you may not use this file except in compliance with the License.
  5. You may obtain a copy of the License at
  6. http://www.apache.org/licenses/LICENSE-2.0
  7. Unless required by applicable law or agreed to in writing, software
  8. distributed under the License is distributed on an "AS IS" BASIS,
  9. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. See the License for the specific language governing permissions and
  11. limitations under the License.
  12. */
  13. package gpu
  14. import (
  15. v1 "k8s.io/api/core/v1"
  16. metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
  17. "k8s.io/apimachinery/pkg/util/uuid"
  18. "k8s.io/klog"
  19. "k8s.io/kubernetes/test/e2e/framework"
  20. )
  21. const (
  22. // NVIDIAGPUResourceName is the extended name of the GPU resource since v1.8
  23. // this uses the device plugin mechanism
  24. NVIDIAGPUResourceName = "nvidia.com/gpu"
  25. // GPUDevicePluginDSYAML is the official Google Device Plugin Daemonset NVIDIA GPU manifest for GKE
  26. // TODO: Parametrize it by making it a feature in TestFramework.
  27. // so we can override the daemonset in other setups (non COS).
  28. GPUDevicePluginDSYAML = "https://raw.githubusercontent.com/kubernetes/kubernetes/master/cluster/addons/device-plugins/nvidia-gpu/daemonset.yaml"
  29. )
  30. // NumberOfNVIDIAGPUs returns the number of GPUs advertised by a node
  31. // This is based on the Device Plugin system and expected to run on a COS based node
  32. // After the NVIDIA drivers were installed
  33. // TODO make this generic and not linked to COS only
  34. func NumberOfNVIDIAGPUs(node *v1.Node) int64 {
  35. val, ok := node.Status.Capacity[NVIDIAGPUResourceName]
  36. if !ok {
  37. return 0
  38. }
  39. return val.Value()
  40. }
  41. // NVIDIADevicePlugin returns the official Google Device Plugin pod for NVIDIA GPU in GKE
  42. func NVIDIADevicePlugin() *v1.Pod {
  43. ds, err := framework.DsFromManifest(GPUDevicePluginDSYAML)
  44. framework.ExpectNoError(err)
  45. p := &v1.Pod{
  46. ObjectMeta: metav1.ObjectMeta{
  47. Name: "device-plugin-nvidia-gpu-" + string(uuid.NewUUID()),
  48. Namespace: metav1.NamespaceSystem,
  49. },
  50. Spec: ds.Spec.Template.Spec,
  51. }
  52. // Remove node affinity
  53. p.Spec.Affinity = nil
  54. return p
  55. }
  56. // GetGPUDevicePluginImage returns the image of GPU device plugin.
  57. func GetGPUDevicePluginImage() string {
  58. ds, err := framework.DsFromManifest(GPUDevicePluginDSYAML)
  59. if err != nil {
  60. klog.Errorf("Failed to parse the device plugin image: %v", err)
  61. return ""
  62. }
  63. if ds == nil {
  64. klog.Errorf("Failed to parse the device plugin image: the extracted DaemonSet is nil")
  65. return ""
  66. }
  67. if len(ds.Spec.Template.Spec.Containers) < 1 {
  68. klog.Errorf("Failed to parse the device plugin image: cannot extract the container from YAML")
  69. return ""
  70. }
  71. return ds.Spec.Template.Spec.Containers[0].Image
  72. }