bindings.go 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432
  1. /*
  2. Copyright 2017 Google Inc.
  3. Licensed under the Apache License, Version 2.0 (the "License");
  4. you may not use this file except in compliance with the License.
  5. You may obtain a copy of the License at
  6. http://www.apache.org/licenses/LICENSE-2.0
  7. Unless required by applicable law or agreed to in writing, software
  8. distributed under the License is distributed on an "AS IS" BASIS,
  9. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. See the License for the specific language governing permissions and
  11. limitations under the License.
  12. */
  13. package gonvml
  14. // #cgo LDFLAGS: -ldl
  15. /*
  16. #include <stddef.h>
  17. #include <dlfcn.h>
  18. #include <stdlib.h>
  19. #include "nvml.h"
// nvmlHandle is the handle for dynamically loaded libnvidia-ml.so
// It is assigned by nvmlInit_dl (from dlopen) and compared against NULL
// elsewhere in this file to decide whether the library is loaded.
void *nvmlHandle;
// Function pointers assigned by nvmlInit_dl from dlsym lookups on nvmlHandle.
// nvmlInitFunc is looked up as "nvmlInit_v2".
nvmlReturn_t (*nvmlInitFunc)(void);
// nvmlShutdownFunc is looked up as "nvmlShutdown".
nvmlReturn_t (*nvmlShutdownFunc)(void);
// nvmlErrorStringFunc is looked up as "nvmlErrorString".
const char* (*nvmlErrorStringFunc)(nvmlReturn_t result);
  25. const char* nvmlErrorString(nvmlReturn_t result) {
  26. if (nvmlErrorStringFunc == NULL) {
  27. return "nvmlErrorString Function Not Found";
  28. }
  29. return nvmlErrorStringFunc(result);
  30. }
  31. nvmlReturn_t (*nvmlSystemGetDriverVersionFunc)(char *version, unsigned int length);
  32. nvmlReturn_t nvmlSystemGetDriverVersion(char *version, unsigned int length) {
  33. if (nvmlSystemGetDriverVersionFunc == NULL) {
  34. return NVML_ERROR_FUNCTION_NOT_FOUND;
  35. }
  36. return nvmlSystemGetDriverVersionFunc(version, length);
  37. }
  38. nvmlReturn_t (*nvmlDeviceGetCountFunc)(unsigned int *deviceCount);
  39. nvmlReturn_t nvmlDeviceGetCount(unsigned int *deviceCount) {
  40. if (nvmlDeviceGetCountFunc == NULL) {
  41. return NVML_ERROR_FUNCTION_NOT_FOUND;
  42. }
  43. return nvmlDeviceGetCountFunc(deviceCount);
  44. }
  45. nvmlReturn_t (*nvmlDeviceGetHandleByIndexFunc)(unsigned int index, nvmlDevice_t *device);
  46. nvmlReturn_t nvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t *device) {
  47. if (nvmlDeviceGetHandleByIndexFunc == NULL) {
  48. return NVML_ERROR_FUNCTION_NOT_FOUND;
  49. }
  50. return nvmlDeviceGetHandleByIndexFunc(index, device);
  51. }
  52. nvmlReturn_t (*nvmlDeviceGetMinorNumberFunc)(nvmlDevice_t device, unsigned int *minorNumber);
  53. nvmlReturn_t nvmlDeviceGetMinorNumber(nvmlDevice_t device, unsigned int *minorNumber) {
  54. if (nvmlDeviceGetMinorNumberFunc == NULL) {
  55. return NVML_ERROR_FUNCTION_NOT_FOUND;
  56. }
  57. return nvmlDeviceGetMinorNumberFunc(device, minorNumber);
  58. }
  59. nvmlReturn_t (*nvmlDeviceGetUUIDFunc)(nvmlDevice_t device, char *uuid, unsigned int length);
  60. nvmlReturn_t nvmlDeviceGetUUID(nvmlDevice_t device, char *uuid, unsigned int length) {
  61. if (nvmlDeviceGetUUIDFunc == NULL) {
  62. return NVML_ERROR_FUNCTION_NOT_FOUND;
  63. }
  64. return nvmlDeviceGetUUIDFunc(device, uuid, length);
  65. }
  66. nvmlReturn_t (*nvmlDeviceGetNameFunc)(nvmlDevice_t device, char *name, unsigned int length);
  67. nvmlReturn_t nvmlDeviceGetName(nvmlDevice_t device, char *name, unsigned int length) {
  68. if (nvmlDeviceGetNameFunc == NULL) {
  69. return NVML_ERROR_FUNCTION_NOT_FOUND;
  70. }
  71. return nvmlDeviceGetNameFunc(device, name, length);
  72. }
  73. nvmlReturn_t (*nvmlDeviceGetMemoryInfoFunc)(nvmlDevice_t device, nvmlMemory_t *memory);
  74. nvmlReturn_t nvmlDeviceGetMemoryInfo(nvmlDevice_t device, nvmlMemory_t *memory) {
  75. if (nvmlDeviceGetMemoryInfoFunc == NULL) {
  76. return NVML_ERROR_FUNCTION_NOT_FOUND;
  77. }
  78. return nvmlDeviceGetMemoryInfoFunc(device, memory);
  79. }
  80. nvmlReturn_t (*nvmlDeviceGetUtilizationRatesFunc)(nvmlDevice_t device, nvmlUtilization_t *utilization);
  81. nvmlReturn_t nvmlDeviceGetUtilizationRates(nvmlDevice_t device, nvmlUtilization_t *utilization) {
  82. if (nvmlDeviceGetUtilizationRatesFunc == NULL) {
  83. return NVML_ERROR_FUNCTION_NOT_FOUND;
  84. }
  85. return nvmlDeviceGetUtilizationRatesFunc(device, utilization);
  86. }
  87. nvmlReturn_t (*nvmlDeviceGetPowerUsageFunc)(nvmlDevice_t device, unsigned int *power);
  88. nvmlReturn_t nvmlDeviceGetPowerUsage(nvmlDevice_t device, unsigned int *power) {
  89. if (nvmlDeviceGetPowerUsageFunc == NULL) {
  90. return NVML_ERROR_FUNCTION_NOT_FOUND;
  91. }
  92. return nvmlDeviceGetPowerUsageFunc(device, power);
  93. }
// Bound by nvmlInit_dl to the "nvmlDeviceGetSamples" symbol. It has no direct
// wrapper in this file; nvmlDeviceGetAverageUsage calls it to fetch samples.
nvmlReturn_t (*nvmlDeviceGetSamplesFunc)(nvmlDevice_t device, nvmlSamplingType_t type, unsigned long long lastSeenTimeStamp, nvmlValueType_t *sampleValType, unsigned int *sampleCount, nvmlSample_t *samples);
  95. // Loads the "libnvidia-ml.so.1" shared library.
  96. // Loads all symbols needed and initializes NVML.
  97. // Call this before calling any other methods.
  98. nvmlReturn_t nvmlInit_dl(void) {
  99. nvmlHandle = dlopen("libnvidia-ml.so.1", RTLD_LAZY);
  100. if (nvmlHandle == NULL) {
  101. return NVML_ERROR_LIBRARY_NOT_FOUND;
  102. }
  103. nvmlInitFunc = dlsym(nvmlHandle, "nvmlInit_v2");
  104. if (nvmlInitFunc == NULL) {
  105. return NVML_ERROR_FUNCTION_NOT_FOUND;
  106. }
  107. nvmlShutdownFunc = dlsym(nvmlHandle, "nvmlShutdown");
  108. if (nvmlShutdownFunc == NULL) {
  109. return NVML_ERROR_FUNCTION_NOT_FOUND;
  110. }
  111. nvmlErrorStringFunc = dlsym(nvmlHandle, "nvmlErrorString");
  112. if (nvmlErrorStringFunc == NULL) {
  113. return NVML_ERROR_FUNCTION_NOT_FOUND;
  114. }
  115. nvmlSystemGetDriverVersionFunc = dlsym(nvmlHandle, "nvmlSystemGetDriverVersion");
  116. if (nvmlSystemGetDriverVersionFunc == NULL) {
  117. return NVML_ERROR_FUNCTION_NOT_FOUND;
  118. }
  119. nvmlDeviceGetCountFunc = dlsym(nvmlHandle, "nvmlDeviceGetCount_v2");
  120. if (nvmlDeviceGetCountFunc == NULL) {
  121. return NVML_ERROR_FUNCTION_NOT_FOUND;
  122. }
  123. nvmlDeviceGetHandleByIndexFunc = dlsym(nvmlHandle, "nvmlDeviceGetHandleByIndex_v2");
  124. if (nvmlDeviceGetHandleByIndexFunc == NULL) {
  125. return NVML_ERROR_FUNCTION_NOT_FOUND;
  126. }
  127. nvmlDeviceGetMinorNumberFunc = dlsym(nvmlHandle, "nvmlDeviceGetMinorNumber");
  128. if (nvmlDeviceGetMinorNumberFunc == NULL) {
  129. return NVML_ERROR_FUNCTION_NOT_FOUND;
  130. }
  131. nvmlDeviceGetUUIDFunc = dlsym(nvmlHandle, "nvmlDeviceGetUUID");
  132. if (nvmlDeviceGetUUIDFunc == NULL) {
  133. return NVML_ERROR_FUNCTION_NOT_FOUND;
  134. }
  135. nvmlDeviceGetNameFunc = dlsym(nvmlHandle, "nvmlDeviceGetName");
  136. if (nvmlDeviceGetNameFunc == NULL) {
  137. return NVML_ERROR_FUNCTION_NOT_FOUND;
  138. }
  139. nvmlDeviceGetMemoryInfoFunc = dlsym(nvmlHandle, "nvmlDeviceGetMemoryInfo");
  140. if (nvmlDeviceGetMemoryInfoFunc == NULL) {
  141. return NVML_ERROR_FUNCTION_NOT_FOUND;
  142. }
  143. nvmlDeviceGetUtilizationRatesFunc = dlsym(nvmlHandle, "nvmlDeviceGetUtilizationRates");
  144. if (nvmlDeviceGetUtilizationRatesFunc == NULL) {
  145. return NVML_ERROR_FUNCTION_NOT_FOUND;
  146. }
  147. nvmlDeviceGetPowerUsageFunc = dlsym(nvmlHandle, "nvmlDeviceGetPowerUsage");
  148. if (nvmlDeviceGetPowerUsageFunc == NULL) {
  149. return NVML_ERROR_FUNCTION_NOT_FOUND;
  150. }
  151. nvmlDeviceGetSamplesFunc = dlsym(nvmlHandle, "nvmlDeviceGetSamples");
  152. if (nvmlDeviceGetSamplesFunc == NULL) {
  153. return NVML_ERROR_FUNCTION_NOT_FOUND;
  154. }
  155. nvmlReturn_t result = nvmlInitFunc();
  156. if (result != NVML_SUCCESS) {
  157. dlclose(nvmlHandle);
  158. nvmlHandle = NULL;
  159. return result;
  160. }
  161. return NVML_SUCCESS;
  162. }
  163. // Shuts down NVML and decrements the reference count on the dynamically loaded
  164. // "libnvidia-ml.so.1" library.
  165. // Call this once NVML is no longer being used.
  166. nvmlReturn_t nvmlShutdown_dl(void) {
  167. if (nvmlHandle == NULL) {
  168. return NVML_SUCCESS;
  169. }
  170. if (nvmlShutdownFunc == NULL) {
  171. return NVML_ERROR_FUNCTION_NOT_FOUND;
  172. }
  173. nvmlReturn_t r = nvmlShutdownFunc();
  174. if (r != NVML_SUCCESS) {
  175. return r;
  176. }
  177. return (dlclose(nvmlHandle) ? NVML_ERROR_UNKNOWN : NVML_SUCCESS);
  178. }
  179. // This function is here because the API provided by NVML is not very user
  180. // friendly. This function can be used to get average utilization.gpu or
  181. // power.draw.
  182. //
  183. // `device`: The identifier of the target device.
  184. // `type`: Type of sampling event. Only NVML_TOTAL_POWER_SAMPLES and NVML_GPU_UTILIZATION_SAMPLES are supported.
  185. // `lastSeenTimeStamp`: Return average using samples with timestamp greather than this timestamp. Unix epoch in micro seconds.
  186. // `averageUsage`: Reference in which average is returned.
  187. //
  188. // In my experiments, I found that NVML_GPU_UTILIZATION_SAMPLES buffer stores
  189. // 100 samples that are uniformly spread with ~6 samples per second. So the
  190. // buffer stores last ~16s of data.
  191. // NVML_TOTAL_POWER_SAMPLES buffer stores 120 samples, but in different runs I
  192. // noticed them to be non-uniformly separated. Sometimes 120 samples only
  193. // consisted of 10s of data and sometimes they were spread over 60s.
  194. //
  195. nvmlReturn_t nvmlDeviceGetAverageUsage(nvmlDevice_t device, nvmlSamplingType_t type, unsigned long long lastSeenTimeStamp, unsigned int* averageUsage) {
  196. if (nvmlHandle == NULL) {
  197. return NVML_ERROR_LIBRARY_NOT_FOUND;
  198. }
  199. if (nvmlDeviceGetSamplesFunc == NULL) {
  200. return NVML_ERROR_FUNCTION_NOT_FOUND;
  201. }
  202. // We don't really use this because both the metrics we support
  203. // averagePowerUsage and averageGPUUtilization are unsigned int.
  204. nvmlValueType_t sampleValType;
  205. // This will be set to the number of samples that can be queried. We would
  206. // need to allocate an array of this size to store the samples.
  207. unsigned int sampleCount;
  208. // Invoking this method with `samples` set to NULL sets the sampleCount.
  209. nvmlReturn_t r = nvmlDeviceGetSamplesFunc(device, type, lastSeenTimeStamp, &sampleValType, &sampleCount, NULL);
  210. if (r != NVML_SUCCESS) {
  211. return r;
  212. }
  213. // Allocate memory to store sampleCount samples.
  214. // In my experiments, the sampleCount at this stage was always 120 for
  215. // NVML_TOTAL_POWER_SAMPLES and 100 for NVML_GPU_UTILIZATION_SAMPLES
  216. nvmlSample_t* samples = (nvmlSample_t*) malloc(sampleCount * sizeof(nvmlSample_t));
  217. r = nvmlDeviceGetSamplesFunc(device, type, lastSeenTimeStamp, &sampleValType, &sampleCount, samples);
  218. if (r != NVML_SUCCESS) {
  219. free(samples);
  220. return r;
  221. }
  222. int i = 0;
  223. unsigned int sum = 0;
  224. for (; i < sampleCount; i++) {
  225. sum += samples[i].sampleValue.uiVal;
  226. }
  227. *averageUsage = sum/sampleCount;
  228. free(samples);
  229. return r;
  230. }
  231. */
  232. import "C"
  233. import (
  234. "errors"
  235. "fmt"
  236. "time"
  237. )
// Sizes, taken from NVML header constants, of the fixed-size C character
// arrays this package passes to NVML to receive string results.
const (
	szDriver = C.NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE // buffer used by SystemDriverVersion
	szName   = C.NVML_DEVICE_NAME_BUFFER_SIZE           // buffer used by Device.Name
	szUUID   = C.NVML_DEVICE_UUID_BUFFER_SIZE           // buffer used by Device.UUID
)

// errLibraryNotLoaded is returned when the NVML shared library handle is nil
// or NVML reports NVML_ERROR_LIBRARY_NOT_FOUND.
var errLibraryNotLoaded = errors.New("could not load NVML library")
  244. // Initialize initializes NVML.
  245. // Call this before calling any other methods.
  246. func Initialize() error {
  247. return errorString(C.nvmlInit_dl())
  248. }
  249. // Shutdown shuts down NVML.
  250. // Call this once NVML is no longer being used.
  251. func Shutdown() error {
  252. return errorString(C.nvmlShutdown_dl())
  253. }
  254. // errorString takes a nvmlReturn_t and converts it into a golang error.
  255. // It uses a nvml method to convert to a user friendly error message.
  256. func errorString(ret C.nvmlReturn_t) error {
  257. if ret == C.NVML_SUCCESS {
  258. return nil
  259. }
  260. // We need to special case this because if nvml library is not found
  261. // nvmlErrorString() method will not work.
  262. if ret == C.NVML_ERROR_LIBRARY_NOT_FOUND || C.nvmlHandle == nil {
  263. return errLibraryNotLoaded
  264. }
  265. err := C.GoString(C.nvmlErrorString(ret))
  266. return fmt.Errorf("nvml: %v", err)
  267. }
  268. // SystemDriverVersion returns the the driver version on the system.
  269. func SystemDriverVersion() (string, error) {
  270. if C.nvmlHandle == nil {
  271. return "", errLibraryNotLoaded
  272. }
  273. var driver [szDriver]C.char
  274. r := C.nvmlSystemGetDriverVersion(&driver[0], szDriver)
  275. return C.GoString(&driver[0]), errorString(r)
  276. }
  277. // DeviceCount returns the number of nvidia devices on the system.
  278. func DeviceCount() (uint, error) {
  279. if C.nvmlHandle == nil {
  280. return 0, errLibraryNotLoaded
  281. }
  282. var n C.uint
  283. r := C.nvmlDeviceGetCount(&n)
  284. return uint(n), errorString(r)
  285. }
// Device is the handle for the device.
// This handle is obtained by calling DeviceHandleByIndex().
type Device struct {
	// dev is the underlying NVML device handle passed to every NVML call
	// made by Device's methods.
	dev C.nvmlDevice_t
}
  291. // DeviceHandleByIndex returns the device handle for a particular index.
  292. // The indices range from 0 to DeviceCount()-1. The order in which NVML
  293. // enumerates devices has no guarantees of consistency between reboots.
  294. func DeviceHandleByIndex(idx uint) (Device, error) {
  295. if C.nvmlHandle == nil {
  296. return Device{}, errLibraryNotLoaded
  297. }
  298. var dev C.nvmlDevice_t
  299. r := C.nvmlDeviceGetHandleByIndex(C.uint(idx), &dev)
  300. return Device{dev}, errorString(r)
  301. }
  302. // MinorNumber returns the minor number for the device.
  303. // The minor number for the device is such that the Nvidia device node
  304. // file for each GPU will have the form /dev/nvidia[minor number].
  305. func (d Device) MinorNumber() (uint, error) {
  306. if C.nvmlHandle == nil {
  307. return 0, errLibraryNotLoaded
  308. }
  309. var n C.uint
  310. r := C.nvmlDeviceGetMinorNumber(d.dev, &n)
  311. return uint(n), errorString(r)
  312. }
  313. // UUID returns the globally unique immutable UUID associated with this device.
  314. func (d Device) UUID() (string, error) {
  315. if C.nvmlHandle == nil {
  316. return "", errLibraryNotLoaded
  317. }
  318. var uuid [szUUID]C.char
  319. r := C.nvmlDeviceGetUUID(d.dev, &uuid[0], szUUID)
  320. return C.GoString(&uuid[0]), errorString(r)
  321. }
  322. // Name returns the product name of the device.
  323. func (d Device) Name() (string, error) {
  324. if C.nvmlHandle == nil {
  325. return "", errLibraryNotLoaded
  326. }
  327. var name [szName]C.char
  328. r := C.nvmlDeviceGetName(d.dev, &name[0], szName)
  329. return C.GoString(&name[0]), errorString(r)
  330. }
  331. // MemoryInfo returns the total and used memory (in bytes) of the device.
  332. func (d Device) MemoryInfo() (uint64, uint64, error) {
  333. if C.nvmlHandle == nil {
  334. return 0, 0, errLibraryNotLoaded
  335. }
  336. var memory C.nvmlMemory_t
  337. r := C.nvmlDeviceGetMemoryInfo(d.dev, &memory)
  338. return uint64(memory.total), uint64(memory.used), errorString(r)
  339. }
  340. // UtilizationRates returns the percent of time over the past sample period during which:
  341. // utilization.gpu: one or more kernels were executing on the GPU.
  342. // utilizatoin.memory: global (device) memory was being read or written.
  343. func (d Device) UtilizationRates() (uint, uint, error) {
  344. if C.nvmlHandle == nil {
  345. return 0, 0, errLibraryNotLoaded
  346. }
  347. var utilization C.nvmlUtilization_t
  348. r := C.nvmlDeviceGetUtilizationRates(d.dev, &utilization)
  349. return uint(utilization.gpu), uint(utilization.memory), errorString(r)
  350. }
  351. // PowerUsage returns the power usage for this GPU and its associated circuitry
  352. // in milliwatts. The reading is accurate to within +/- 5% of current power draw.
  353. func (d Device) PowerUsage() (uint, error) {
  354. if C.nvmlHandle == nil {
  355. return 0, errLibraryNotLoaded
  356. }
  357. var n C.uint
  358. r := C.nvmlDeviceGetPowerUsage(d.dev, &n)
  359. return uint(n), errorString(r)
  360. }
  361. // AveragePowerUsage returns the power usage for this GPU and its associated circuitry
  362. // in milliwatts averaged over the samples collected in the last `since` duration.
  363. func (d Device) AveragePowerUsage(since time.Duration) (uint, error) {
  364. if C.nvmlHandle == nil {
  365. return 0, errLibraryNotLoaded
  366. }
  367. lastTs := C.ulonglong(time.Now().Add(-1*since).UnixNano() / 1000)
  368. var n C.uint
  369. r := C.nvmlDeviceGetAverageUsage(d.dev, C.NVML_TOTAL_POWER_SAMPLES, lastTs, &n)
  370. return uint(n), errorString(r)
  371. }
  372. // AverageGPUUtilization returns the utilization.gpu metric (percent of time
  373. // one of more kernels were executing on the GPU) averaged over the samples
  374. // collected in the last `since` duration.
  375. func (d Device) AverageGPUUtilization(since time.Duration) (uint, error) {
  376. if C.nvmlHandle == nil {
  377. return 0, errLibraryNotLoaded
  378. }
  379. lastTs := C.ulonglong(time.Now().Add(-1*since).UnixNano() / 1000)
  380. var n C.uint
  381. r := C.nvmlDeviceGetAverageUsage(d.dev, C.NVML_GPU_UTILIZATION_SAMPLES, lastTs, &n)
  382. return uint(n), errorString(r)
  383. }