tzeneto
/
custom-kube-scheduler


			
				
					
						
						
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432
							/*
Copyright 2017 Google Inc.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package gonvml

// #cgo LDFLAGS: -ldl
/*
#include <stddef.h>
#include <dlfcn.h>
#include <stdlib.h>

#include "nvml.h"

// nvmlHandle is the handle for dynamically loaded libnvidia-ml.so
void *nvmlHandle;

// nvmlInitFunc holds the address of the library's "nvmlInit_v2" symbol once
// nvmlInit_dl has resolved it.
nvmlReturn_t (*nvmlInitFunc)(void);

// nvmlShutdownFunc holds the address of the library's "nvmlShutdown" symbol
// once nvmlInit_dl has resolved it.
nvmlReturn_t (*nvmlShutdownFunc)(void);

// Filled in by nvmlInit_dl with the address of the "nvmlErrorString" symbol.
const char* (*nvmlErrorStringFunc)(nvmlReturn_t result);

// Returns the library's description of `result`, or a fixed fallback message
// while the symbol has not been resolved.
const char* nvmlErrorString(nvmlReturn_t result) {
  if (nvmlErrorStringFunc != NULL) {
    return nvmlErrorStringFunc(result);
  }
  return "nvmlErrorString Function Not Found";
}

// Filled in by nvmlInit_dl with the address of the
// "nvmlSystemGetDriverVersion" symbol.
nvmlReturn_t (*nvmlSystemGetDriverVersionFunc)(char *version, unsigned int length);

// Forwards to the dynamically loaded implementation; reports
// NVML_ERROR_FUNCTION_NOT_FOUND while the symbol has not been resolved.
nvmlReturn_t nvmlSystemGetDriverVersion(char *version, unsigned int length) {
  if (nvmlSystemGetDriverVersionFunc != NULL) {
    return nvmlSystemGetDriverVersionFunc(version, length);
  }
  return NVML_ERROR_FUNCTION_NOT_FOUND;
}

// Filled in by nvmlInit_dl with the address of the "nvmlDeviceGetCount_v2"
// symbol.
nvmlReturn_t (*nvmlDeviceGetCountFunc)(unsigned int *deviceCount);

// Forwards to the dynamically loaded implementation; reports
// NVML_ERROR_FUNCTION_NOT_FOUND while the symbol has not been resolved.
nvmlReturn_t nvmlDeviceGetCount(unsigned int *deviceCount) {
  if (nvmlDeviceGetCountFunc != NULL) {
    return nvmlDeviceGetCountFunc(deviceCount);
  }
  return NVML_ERROR_FUNCTION_NOT_FOUND;
}

// Filled in by nvmlInit_dl with the address of the
// "nvmlDeviceGetHandleByIndex_v2" symbol.
nvmlReturn_t (*nvmlDeviceGetHandleByIndexFunc)(unsigned int index, nvmlDevice_t *device);

// Forwards to the dynamically loaded implementation; reports
// NVML_ERROR_FUNCTION_NOT_FOUND while the symbol has not been resolved.
nvmlReturn_t nvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t *device) {
  if (nvmlDeviceGetHandleByIndexFunc != NULL) {
    return nvmlDeviceGetHandleByIndexFunc(index, device);
  }
  return NVML_ERROR_FUNCTION_NOT_FOUND;
}

// Filled in by nvmlInit_dl with the address of the "nvmlDeviceGetMinorNumber"
// symbol.
nvmlReturn_t (*nvmlDeviceGetMinorNumberFunc)(nvmlDevice_t device, unsigned int *minorNumber);

// Forwards to the dynamically loaded implementation; reports
// NVML_ERROR_FUNCTION_NOT_FOUND while the symbol has not been resolved.
nvmlReturn_t nvmlDeviceGetMinorNumber(nvmlDevice_t device, unsigned int *minorNumber) {
  if (nvmlDeviceGetMinorNumberFunc != NULL) {
    return nvmlDeviceGetMinorNumberFunc(device, minorNumber);
  }
  return NVML_ERROR_FUNCTION_NOT_FOUND;
}

// Filled in by nvmlInit_dl with the address of the "nvmlDeviceGetUUID" symbol.
nvmlReturn_t (*nvmlDeviceGetUUIDFunc)(nvmlDevice_t device, char *uuid, unsigned int length);

// Forwards to the dynamically loaded implementation; reports
// NVML_ERROR_FUNCTION_NOT_FOUND while the symbol has not been resolved.
nvmlReturn_t nvmlDeviceGetUUID(nvmlDevice_t device, char *uuid, unsigned int length) {
  if (nvmlDeviceGetUUIDFunc != NULL) {
    return nvmlDeviceGetUUIDFunc(device, uuid, length);
  }
  return NVML_ERROR_FUNCTION_NOT_FOUND;
}

// Filled in by nvmlInit_dl with the address of the "nvmlDeviceGetName" symbol.
nvmlReturn_t (*nvmlDeviceGetNameFunc)(nvmlDevice_t device, char *name, unsigned int length);

// Forwards to the dynamically loaded implementation; reports
// NVML_ERROR_FUNCTION_NOT_FOUND while the symbol has not been resolved.
nvmlReturn_t nvmlDeviceGetName(nvmlDevice_t device, char *name, unsigned int length) {
  if (nvmlDeviceGetNameFunc != NULL) {
    return nvmlDeviceGetNameFunc(device, name, length);
  }
  return NVML_ERROR_FUNCTION_NOT_FOUND;
}

// Filled in by nvmlInit_dl with the address of the "nvmlDeviceGetMemoryInfo"
// symbol.
nvmlReturn_t (*nvmlDeviceGetMemoryInfoFunc)(nvmlDevice_t device, nvmlMemory_t *memory);

// Forwards to the dynamically loaded implementation; reports
// NVML_ERROR_FUNCTION_NOT_FOUND while the symbol has not been resolved.
nvmlReturn_t nvmlDeviceGetMemoryInfo(nvmlDevice_t device, nvmlMemory_t *memory) {
  if (nvmlDeviceGetMemoryInfoFunc != NULL) {
    return nvmlDeviceGetMemoryInfoFunc(device, memory);
  }
  return NVML_ERROR_FUNCTION_NOT_FOUND;
}

// Filled in by nvmlInit_dl with the address of the
// "nvmlDeviceGetUtilizationRates" symbol.
nvmlReturn_t (*nvmlDeviceGetUtilizationRatesFunc)(nvmlDevice_t device, nvmlUtilization_t *utilization);

// Forwards to the dynamically loaded implementation; reports
// NVML_ERROR_FUNCTION_NOT_FOUND while the symbol has not been resolved.
nvmlReturn_t nvmlDeviceGetUtilizationRates(nvmlDevice_t device, nvmlUtilization_t *utilization) {
  if (nvmlDeviceGetUtilizationRatesFunc != NULL) {
    return nvmlDeviceGetUtilizationRatesFunc(device, utilization);
  }
  return NVML_ERROR_FUNCTION_NOT_FOUND;
}

// Filled in by nvmlInit_dl with the address of the "nvmlDeviceGetPowerUsage"
// symbol.
nvmlReturn_t (*nvmlDeviceGetPowerUsageFunc)(nvmlDevice_t device, unsigned int *power);

// Forwards to the dynamically loaded implementation; reports
// NVML_ERROR_FUNCTION_NOT_FOUND while the symbol has not been resolved.
nvmlReturn_t nvmlDeviceGetPowerUsage(nvmlDevice_t device, unsigned int *power) {
  if (nvmlDeviceGetPowerUsageFunc != NULL) {
    return nvmlDeviceGetPowerUsageFunc(device, power);
  }
  return NVML_ERROR_FUNCTION_NOT_FOUND;
}

// nvmlDeviceGetSamplesFunc holds the address of the library's
// "nvmlDeviceGetSamples" symbol once nvmlInit_dl has resolved it. It has no
// direct wrapper; nvmlDeviceGetAverageUsage is its only caller in this file.
nvmlReturn_t (*nvmlDeviceGetSamplesFunc)(nvmlDevice_t device, nvmlSamplingType_t type, unsigned long long lastSeenTimeStamp, nvmlValueType_t *sampleValType, unsigned int *sampleCount, nvmlSample_t *samples);

// Undoes a partially completed nvmlInit_dl: closes the library handle (if
// open), resets nvmlHandle to NULL and forgets every symbol resolved so far,
// so that a failed initialization neither leaks the handle nor leaves
// pointers into a library that may have been unloaded.
// Returns `result` unchanged so callers can write
// `return nvmlInitAbort_dl(code);`.
static nvmlReturn_t nvmlInitAbort_dl(nvmlReturn_t result) {
  if (nvmlHandle != NULL) {
    dlclose(nvmlHandle);
    nvmlHandle = NULL;
  }
  nvmlInitFunc = NULL;
  nvmlShutdownFunc = NULL;
  nvmlErrorStringFunc = NULL;
  nvmlSystemGetDriverVersionFunc = NULL;
  nvmlDeviceGetCountFunc = NULL;
  nvmlDeviceGetHandleByIndexFunc = NULL;
  nvmlDeviceGetMinorNumberFunc = NULL;
  nvmlDeviceGetUUIDFunc = NULL;
  nvmlDeviceGetNameFunc = NULL;
  nvmlDeviceGetMemoryInfoFunc = NULL;
  nvmlDeviceGetUtilizationRatesFunc = NULL;
  nvmlDeviceGetPowerUsageFunc = NULL;
  nvmlDeviceGetSamplesFunc = NULL;
  return result;
}

// Loads the "libnvidia-ml.so.1" shared library.
// Loads all symbols needed and initializes NVML.
// Call this before calling any other methods.
//
// Returns:
//   NVML_ERROR_LIBRARY_NOT_FOUND   if the shared library cannot be opened.
//   NVML_ERROR_FUNCTION_NOT_FOUND  if any required symbol is missing.
//   the result of nvmlInit_v2      otherwise (NVML_SUCCESS on success).
//
// On every failure the library handle is closed and nvmlHandle is left NULL,
// so the "library loaded" checks made by callers stay truthful.
nvmlReturn_t nvmlInit_dl(void) {
  nvmlHandle = dlopen("libnvidia-ml.so.1", RTLD_LAZY);
  if (nvmlHandle == NULL) {
    return NVML_ERROR_LIBRARY_NOT_FOUND;
  }
  nvmlInitFunc = dlsym(nvmlHandle, "nvmlInit_v2");
  if (nvmlInitFunc == NULL) {
    return nvmlInitAbort_dl(NVML_ERROR_FUNCTION_NOT_FOUND);
  }
  nvmlShutdownFunc = dlsym(nvmlHandle, "nvmlShutdown");
  if (nvmlShutdownFunc == NULL) {
    return nvmlInitAbort_dl(NVML_ERROR_FUNCTION_NOT_FOUND);
  }
  nvmlErrorStringFunc = dlsym(nvmlHandle, "nvmlErrorString");
  if (nvmlErrorStringFunc == NULL) {
    return nvmlInitAbort_dl(NVML_ERROR_FUNCTION_NOT_FOUND);
  }
  nvmlSystemGetDriverVersionFunc = dlsym(nvmlHandle, "nvmlSystemGetDriverVersion");
  if (nvmlSystemGetDriverVersionFunc == NULL) {
    return nvmlInitAbort_dl(NVML_ERROR_FUNCTION_NOT_FOUND);
  }
  nvmlDeviceGetCountFunc = dlsym(nvmlHandle, "nvmlDeviceGetCount_v2");
  if (nvmlDeviceGetCountFunc == NULL) {
    return nvmlInitAbort_dl(NVML_ERROR_FUNCTION_NOT_FOUND);
  }
  nvmlDeviceGetHandleByIndexFunc = dlsym(nvmlHandle, "nvmlDeviceGetHandleByIndex_v2");
  if (nvmlDeviceGetHandleByIndexFunc == NULL) {
    return nvmlInitAbort_dl(NVML_ERROR_FUNCTION_NOT_FOUND);
  }
  nvmlDeviceGetMinorNumberFunc = dlsym(nvmlHandle, "nvmlDeviceGetMinorNumber");
  if (nvmlDeviceGetMinorNumberFunc == NULL) {
    return nvmlInitAbort_dl(NVML_ERROR_FUNCTION_NOT_FOUND);
  }
  nvmlDeviceGetUUIDFunc = dlsym(nvmlHandle, "nvmlDeviceGetUUID");
  if (nvmlDeviceGetUUIDFunc == NULL) {
    return nvmlInitAbort_dl(NVML_ERROR_FUNCTION_NOT_FOUND);
  }
  nvmlDeviceGetNameFunc = dlsym(nvmlHandle, "nvmlDeviceGetName");
  if (nvmlDeviceGetNameFunc == NULL) {
    return nvmlInitAbort_dl(NVML_ERROR_FUNCTION_NOT_FOUND);
  }
  nvmlDeviceGetMemoryInfoFunc = dlsym(nvmlHandle, "nvmlDeviceGetMemoryInfo");
  if (nvmlDeviceGetMemoryInfoFunc == NULL) {
    return nvmlInitAbort_dl(NVML_ERROR_FUNCTION_NOT_FOUND);
  }
  nvmlDeviceGetUtilizationRatesFunc = dlsym(nvmlHandle, "nvmlDeviceGetUtilizationRates");
  if (nvmlDeviceGetUtilizationRatesFunc == NULL) {
    return nvmlInitAbort_dl(NVML_ERROR_FUNCTION_NOT_FOUND);
  }
  nvmlDeviceGetPowerUsageFunc = dlsym(nvmlHandle, "nvmlDeviceGetPowerUsage");
  if (nvmlDeviceGetPowerUsageFunc == NULL) {
    return nvmlInitAbort_dl(NVML_ERROR_FUNCTION_NOT_FOUND);
  }
  nvmlDeviceGetSamplesFunc = dlsym(nvmlHandle, "nvmlDeviceGetSamples");
  if (nvmlDeviceGetSamplesFunc == NULL) {
    return nvmlInitAbort_dl(NVML_ERROR_FUNCTION_NOT_FOUND);
  }
  nvmlReturn_t result = nvmlInitFunc();
  if (result != NVML_SUCCESS) {
    return nvmlInitAbort_dl(result);
  }
  return NVML_SUCCESS;
}

// Shuts NVML down and then releases one reference on the dynamically loaded
// "libnvidia-ml.so.1" library. Call it once NVML is no longer needed.
//
// Returns NVML_SUCCESS when the library was never loaded or when both the
// NVML shutdown and dlclose succeed, NVML_ERROR_FUNCTION_NOT_FOUND when the
// shutdown symbol is unresolved, the NVML error when shutdown fails (the
// library is then left open), and NVML_ERROR_UNKNOWN when dlclose fails.
//
// NOTE(review): nvmlHandle is not reset to NULL here, so later
// "is the library loaded" checks still pass after a successful shutdown.
// Confirm whether callers depend on that before changing it.
nvmlReturn_t nvmlShutdown_dl(void) {
  if (nvmlHandle == NULL) {
    return NVML_SUCCESS;
  }
  if (nvmlShutdownFunc == NULL) {
    return NVML_ERROR_FUNCTION_NOT_FOUND;
  }
  nvmlReturn_t status = nvmlShutdownFunc();
  if (status != NVML_SUCCESS) {
    return status;
  }
  if (dlclose(nvmlHandle) != 0) {
    return NVML_ERROR_UNKNOWN;
  }
  return NVML_SUCCESS;
}

// This function is here because the API provided by NVML is not very user
// friendly. This function can be used to get average utilization.gpu or
// power.draw.
//
// `device`: The identifier of the target device.
// `type`: Type of sampling event. Only NVML_TOTAL_POWER_SAMPLES and NVML_GPU_UTILIZATION_SAMPLES are supported.
// `lastSeenTimeStamp`: Return average using samples with timestamp greater than this timestamp. Unix epoch in micro seconds.
// `averageUsage`: Reference in which average is returned. Must not be NULL.
//
// Returns:
//   NVML_ERROR_LIBRARY_NOT_FOUND   if the library has not been loaded.
//   NVML_ERROR_FUNCTION_NOT_FOUND  if nvmlDeviceGetSamples was not resolved.
//   NVML_ERROR_INVALID_ARGUMENT    if `averageUsage` is NULL.
//   NVML_ERROR_NOT_FOUND           if NVML reports zero samples.
//   NVML_ERROR_UNKNOWN             if the sample buffer cannot be allocated.
//   any error from nvmlDeviceGetSamples, otherwise NVML_SUCCESS.
// `*averageUsage` is only written on NVML_SUCCESS.
//
// In my experiments, I found that NVML_GPU_UTILIZATION_SAMPLES buffer stores
// 100 samples that are uniformly spread with ~6 samples per second. So the
// buffer stores last ~16s of data.
// NVML_TOTAL_POWER_SAMPLES buffer stores 120 samples, but in different runs I
// noticed them to be non-uniformly separated. Sometimes 120 samples only
// consisted of 10s of data and sometimes they were spread over 60s.
//
nvmlReturn_t nvmlDeviceGetAverageUsage(nvmlDevice_t device, nvmlSamplingType_t type, unsigned long long lastSeenTimeStamp, unsigned int* averageUsage) {
  if (nvmlHandle == NULL) {
    return NVML_ERROR_LIBRARY_NOT_FOUND;
  }
  if (nvmlDeviceGetSamplesFunc == NULL) {
    return NVML_ERROR_FUNCTION_NOT_FOUND;
  }
  if (averageUsage == NULL) {
    return NVML_ERROR_INVALID_ARGUMENT;
  }

  // We don't really use this because both the metrics we support
  // averagePowerUsage and averageGPUUtilization are unsigned int.
  nvmlValueType_t sampleValType;

  // This will be set to the number of samples that can be queried. We would
  // need to allocate an array of this size to store the samples.
  unsigned int sampleCount = 0;

  // Invoking this method with `samples` set to NULL sets the sampleCount.
  nvmlReturn_t r = nvmlDeviceGetSamplesFunc(device, type, lastSeenTimeStamp, &sampleValType, &sampleCount, NULL);
  if (r != NVML_SUCCESS) {
    return r;
  }
  // Nothing to average; bail out before malloc(0) and the division below.
  if (sampleCount == 0) {
    return NVML_ERROR_NOT_FOUND;
  }

  // Allocate memory to store sampleCount samples.
  // In my experiments, the sampleCount at this stage was always 120 for
  // NVML_TOTAL_POWER_SAMPLES and 100 for NVML_GPU_UTILIZATION_SAMPLES
  nvmlSample_t* samples = (nvmlSample_t*) malloc(sampleCount * sizeof(nvmlSample_t));
  if (samples == NULL) {
    return NVML_ERROR_UNKNOWN;
  }

  // The second call fills the buffer and rewrites sampleCount, which is an
  // in/out parameter, so it has to be re-validated before dividing by it.
  r = nvmlDeviceGetSamplesFunc(device, type, lastSeenTimeStamp, &sampleValType, &sampleCount, samples);
  if (r != NVML_SUCCESS) {
    free(samples);
    return r;
  }
  if (sampleCount == 0) {
    free(samples);
    return NVML_ERROR_NOT_FOUND;
  }

  // Accumulate in 64 bits so that a long run of large readings cannot wrap;
  // the mean itself never exceeds the largest sample and so fits the result.
  unsigned long long sum = 0;
  unsigned int i;
  for (i = 0; i < sampleCount; i++) {
    sum += samples[i].sampleValue.uiVal;
  }
  *averageUsage = (unsigned int)(sum / sampleCount);

  free(samples);
  return NVML_SUCCESS;
}
*/
import "C"

import (
	"errors"
	"fmt"
	"time"
)

// Lengths of the fixed-size character buffers this package hands to NVML for
// the driver version, device name and device UUID, taken from nvml.h.
const (
	szDriver = C.NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE
	szName   = C.NVML_DEVICE_NAME_BUFFER_SIZE
	szUUID   = C.NVML_DEVICE_UUID_BUFFER_SIZE
)

// errLibraryNotLoaded is returned when the NVML shared library handle is nil
// or when NVML reports NVML_ERROR_LIBRARY_NOT_FOUND.
var errLibraryNotLoaded = errors.New("could not load NVML library")

// Initialize initializes NVML.
// Call this before calling any other methods.
func Initialize() error {
	return errorString(C.nvmlInit_dl())
}

// Shutdown shuts down NVML.
// Call this once NVML is no longer being used.
func Shutdown() error {
	return errorString(C.nvmlShutdown_dl())
}

// errorString converts an nvmlReturn_t into a Go error.
// NVML_SUCCESS maps to nil. A missing library maps to errLibraryNotLoaded.
// Every other code is rendered through NVML's own error-string function and
// prefixed with "nvml: ".
func errorString(ret C.nvmlReturn_t) error {
	switch {
	case ret == C.NVML_SUCCESS:
		return nil
	case ret == C.NVML_ERROR_LIBRARY_NOT_FOUND, C.nvmlHandle == nil:
		// Handled separately: without the library loaded, nvmlErrorString()
		// cannot produce a message.
		return errLibraryNotLoaded
	}
	msg := C.GoString(C.nvmlErrorString(ret))
	return fmt.Errorf("nvml: %v", msg)
}

// SystemDriverVersion returns the driver version on the system.
// It returns errLibraryNotLoaded when the NVML library has not been loaded.
func SystemDriverVersion() (string, error) {
	if C.nvmlHandle == nil {
		return "", errLibraryNotLoaded
	}
	var buf [szDriver]C.char
	ret := C.nvmlSystemGetDriverVersion(&buf[0], szDriver)
	version := C.GoString(&buf[0])
	return version, errorString(ret)
}

// DeviceCount returns the number of nvidia devices on the system.
// It returns errLibraryNotLoaded when the NVML library has not been loaded.
func DeviceCount() (uint, error) {
	if C.nvmlHandle == nil {
		return 0, errLibraryNotLoaded
	}
	var count C.uint
	ret := C.nvmlDeviceGetCount(&count)
	return uint(count), errorString(ret)
}

// Device is the handle for the device.
// This handle is obtained by calling DeviceHandleByIndex().
type Device struct {
	// dev is the underlying NVML device handle passed to the C wrappers.
	dev C.nvmlDevice_t
}

// DeviceHandleByIndex returns the device handle for a particular index.
// Valid indices run from 0 to DeviceCount()-1. NVML does not guarantee that
// the enumeration order stays the same between reboots.
// It returns errLibraryNotLoaded when the NVML library has not been loaded.
func DeviceHandleByIndex(idx uint) (Device, error) {
	if C.nvmlHandle == nil {
		return Device{}, errLibraryNotLoaded
	}
	var handle C.nvmlDevice_t
	ret := C.nvmlDeviceGetHandleByIndex(C.uint(idx), &handle)
	return Device{dev: handle}, errorString(ret)
}

// MinorNumber returns the minor number for the device.
// The Nvidia device node file for each GPU has the form
// /dev/nvidia[minor number].
// It returns errLibraryNotLoaded when the NVML library has not been loaded.
func (d Device) MinorNumber() (uint, error) {
	if C.nvmlHandle == nil {
		return 0, errLibraryNotLoaded
	}
	var minor C.uint
	ret := C.nvmlDeviceGetMinorNumber(d.dev, &minor)
	return uint(minor), errorString(ret)
}

// UUID returns the globally unique immutable UUID associated with this device.
// It returns errLibraryNotLoaded when the NVML library has not been loaded.
func (d Device) UUID() (string, error) {
	if C.nvmlHandle == nil {
		return "", errLibraryNotLoaded
	}
	var buf [szUUID]C.char
	ret := C.nvmlDeviceGetUUID(d.dev, &buf[0], szUUID)
	id := C.GoString(&buf[0])
	return id, errorString(ret)
}

// Name returns the product name of the device.
// It returns errLibraryNotLoaded when the NVML library has not been loaded.
func (d Device) Name() (string, error) {
	if C.nvmlHandle == nil {
		return "", errLibraryNotLoaded
	}
	var buf [szName]C.char
	ret := C.nvmlDeviceGetName(d.dev, &buf[0], szName)
	product := C.GoString(&buf[0])
	return product, errorString(ret)
}

// MemoryInfo returns the total and used memory (in bytes) of the device, in
// that order.
// It returns errLibraryNotLoaded when the NVML library has not been loaded.
func (d Device) MemoryInfo() (uint64, uint64, error) {
	if C.nvmlHandle == nil {
		return 0, 0, errLibraryNotLoaded
	}
	var mem C.nvmlMemory_t
	ret := C.nvmlDeviceGetMemoryInfo(d.dev, &mem)
	total, used := uint64(mem.total), uint64(mem.used)
	return total, used, errorString(ret)
}

// UtilizationRates returns the percent of time over the past sample period during which:
// utilization.gpu: one or more kernels were executing on the GPU.
// utilization.memory: global (device) memory was being read or written.
// The GPU figure is returned first, the memory figure second.
// It returns errLibraryNotLoaded when the NVML library has not been loaded.
func (d Device) UtilizationRates() (uint, uint, error) {
	if C.nvmlHandle == nil {
		return 0, 0, errLibraryNotLoaded
	}
	var rates C.nvmlUtilization_t
	ret := C.nvmlDeviceGetUtilizationRates(d.dev, &rates)
	gpu, memory := uint(rates.gpu), uint(rates.memory)
	return gpu, memory, errorString(ret)
}

// PowerUsage returns the power usage for this GPU and its associated circuitry
// in milliwatts. The reading is accurate to within +/- 5% of current power draw.
// It returns errLibraryNotLoaded when the NVML library has not been loaded.
func (d Device) PowerUsage() (uint, error) {
	if C.nvmlHandle == nil {
		return 0, errLibraryNotLoaded
	}
	var milliwatts C.uint
	ret := C.nvmlDeviceGetPowerUsage(d.dev, &milliwatts)
	return uint(milliwatts), errorString(ret)
}

// AveragePowerUsage returns the power usage for this GPU and its associated
// circuitry in milliwatts, averaged over the samples collected in the last
// `since` duration.
// It returns errLibraryNotLoaded when the NVML library has not been loaded.
func (d Device) AveragePowerUsage(since time.Duration) (uint, error) {
	if C.nvmlHandle == nil {
		return 0, errLibraryNotLoaded
	}
	// The C helper takes its cutoff as microseconds since the Unix epoch.
	cutoff := time.Now().Add(-since)
	lastTs := C.ulonglong(cutoff.UnixNano() / 1000)
	var avg C.uint
	ret := C.nvmlDeviceGetAverageUsage(d.dev, C.NVML_TOTAL_POWER_SAMPLES, lastTs, &avg)
	return uint(avg), errorString(ret)
}

// AverageGPUUtilization returns the utilization.gpu metric (percent of time
// one or more kernels were executing on the GPU) averaged over the samples
// collected in the last `since` duration.
// It returns errLibraryNotLoaded when the NVML library has not been loaded.
func (d Device) AverageGPUUtilization(since time.Duration) (uint, error) {
	if C.nvmlHandle == nil {
		return 0, errLibraryNotLoaded
	}
	// The C helper takes its cutoff as microseconds since the Unix epoch.
	cutoff := time.Now().Add(-since)
	lastTs := C.ulonglong(cutoff.UnixNano() / 1000)
	var avg C.uint
	ret := C.nvmlDeviceGetAverageUsage(d.dev, C.NVML_GPU_UTILIZATION_SAMPLES, lastTs, &avg)
	return uint(avg), errorString(ret)
}