perfmodel_bus.c

/* StarPU --- Runtime system for heterogeneous multicore architectures.
 *
 * Copyright (C) 2009, 2010-2011 Université de Bordeaux 1
 * Copyright (C) 2010, 2011 Centre National de la Recherche Scientifique
 *
 * StarPU is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or (at
 * your option) any later version.
 *
 * StarPU is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 *
 * See the GNU Lesser General Public License in COPYING.LGPL for more details.
 */

#ifdef STARPU_USE_CUDA
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#include <sched.h>
#endif

#include <unistd.h>
#include <sys/time.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include <math.h>

#include <starpu.h>
#include <starpu_cuda.h>
#include <starpu_opencl.h>
#include <common/config.h>
#include <core/workers.h>
#include <core/perfmodel/perfmodel.h>

#ifdef STARPU_HAVE_HWLOC
#include <hwloc.h>
#endif

#ifdef STARPU_HAVE_WINDOWS
#include <windows.h>
#endif

#define SIZE	(32*1024*1024*sizeof(char))
#define NITER	128

#define MAXCPUS	32

/* timing is in µs per byte (i.e. slowness, inverse of bandwidth) */
struct dev_timing {
	int cpu_id;
	double timing_htod;
	double timing_dtoh;
};
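
/* Per-(device, CPU) timings are stored in flat arrays indexed as
 * (dev+1)*MAXCPUS + cpu: the +1 offset keeps device indices aligned with
 * memory-node numbering, where node 0 is the host RAM. */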

static double bandwidth_matrix[STARPU_MAXNODES][STARPU_MAXNODES] = {{-1.0}};
static double latency_matrix[STARPU_MAXNODES][STARPU_MAXNODES] = {{-1.0}};
static unsigned was_benchmarked = 0;
static unsigned ncpus = 0;
static int ncuda = 0;
static int nopencl = 0;

/* Benchmarking the performance of the bus */

#ifdef STARPU_USE_CUDA
static int cuda_affinity_matrix[STARPU_MAXCUDADEVS][MAXCPUS];
static double cudadev_timing_htod[STARPU_MAXNODES] = {0.0};
static double cudadev_timing_dtoh[STARPU_MAXNODES] = {0.0};
static double cudadev_timing_dtod[STARPU_MAXNODES][STARPU_MAXNODES] = {{0.0}};
static struct dev_timing cudadev_timing_per_cpu[STARPU_MAXNODES*MAXCPUS];
#endif

#ifdef STARPU_USE_OPENCL
static int opencl_affinity_matrix[STARPU_MAXOPENCLDEVS][MAXCPUS];
static double opencldev_timing_htod[STARPU_MAXNODES] = {0.0};
static double opencldev_timing_dtoh[STARPU_MAXNODES] = {0.0};
static struct dev_timing opencldev_timing_per_cpu[STARPU_MAXNODES*MAXCPUS];
#endif

#if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)

#ifdef STARPU_HAVE_HWLOC
static hwloc_topology_t hwtopology;
#endif

#ifdef STARPU_USE_CUDA
static void measure_bandwidth_between_host_and_dev_on_cpu_with_cuda(int dev, int cpu, struct dev_timing *dev_timing_per_cpu)
{
	struct starpu_machine_config_s *config = _starpu_get_machine_config();
	_starpu_bind_thread_on_cpu(config, cpu);

	size_t size = SIZE;

	/* Initialize the CUDA context on the device */
	cudaSetDevice(dev);
	/* hack to prevent third-party libs from rebinding the thread */
	_starpu_bind_thread_on_cpu(config, cpu);
	/* hack to force the initialization */
	cudaFree(0);
	/* hack to prevent third-party libs from rebinding the thread */
	_starpu_bind_thread_on_cpu(config, cpu);

	/* Get the maximum size which can be allocated on the device */
	struct cudaDeviceProp prop;
	cudaError_t cures;
	cures = cudaGetDeviceProperties(&prop, dev);
	if (STARPU_UNLIKELY(cures)) STARPU_CUDA_REPORT_ERROR(cures);
	if (size > prop.totalGlobalMem/4) size = prop.totalGlobalMem/4;

	/* Allocate a buffer on the device */
	unsigned char *d_buffer;
	cudaMalloc((void **)&d_buffer, size);
	assert(d_buffer);
	/* hack to prevent third-party libs from rebinding the thread */
	_starpu_bind_thread_on_cpu(config, cpu);

	/* Allocate a buffer on the host */
	unsigned char *h_buffer;
	cudaHostAlloc((void **)&h_buffer, size, 0);
	assert(h_buffer);
	/* hack to prevent third-party libs from rebinding the thread */
	_starpu_bind_thread_on_cpu(config, cpu);

	/* Fill them */
	memset(h_buffer, 0, size);
	cudaMemset(d_buffer, 0, size);
	/* hack to prevent third-party libs from rebinding the thread */
	_starpu_bind_thread_on_cpu(config, cpu);

	unsigned iter;
	double timing;
	struct timeval start;
	struct timeval end;

	/* Measure upload bandwidth */
	gettimeofday(&start, NULL);
	for (iter = 0; iter < NITER; iter++)
	{
		cudaMemcpy(d_buffer, h_buffer, size, cudaMemcpyHostToDevice);
		cudaThreadSynchronize();
	}
	gettimeofday(&end, NULL);
	timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
	dev_timing_per_cpu[(dev+1)*MAXCPUS+cpu].timing_htod = timing/NITER/size;

	/* Measure download bandwidth */
	gettimeofday(&start, NULL);
	for (iter = 0; iter < NITER; iter++)
	{
		cudaMemcpy(h_buffer, d_buffer, size, cudaMemcpyDeviceToHost);
		cudaThreadSynchronize();
	}
	gettimeofday(&end, NULL);
	timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
	dev_timing_per_cpu[(dev+1)*MAXCPUS+cpu].timing_dtoh = timing/NITER/size;

	/* Free buffers */
	cudaFreeHost(h_buffer);
	cudaFree(d_buffer);

	cudaThreadExit();
}
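
/* The value stored above is a slowness, not a bandwidth: the total elapsed
 * time in µs is divided by NITER and by the transfer size, yielding µs per
 * byte. For instance (illustrative numbers), if the NITER = 128 copies of a
 * 32 MB buffer take 1.6e6 µs in total, the recorded slowness is
 * 1.6e6 / 128 / (32*1024*1024) ≈ 3.7e-4 µs/byte, i.e. about 2.7 GB/s. */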

#ifdef HAVE_CUDA_MEMCPY_PEER
static void measure_bandwidth_between_dev_and_dev_cuda(int src, int dst)
{
	size_t size = SIZE;

	/* Get the maximum size which can be allocated on both devices */
	struct cudaDeviceProp prop;
	cudaError_t cures;
	cures = cudaGetDeviceProperties(&prop, src);
	if (STARPU_UNLIKELY(cures)) STARPU_CUDA_REPORT_ERROR(cures);
	if (size > prop.totalGlobalMem/4) size = prop.totalGlobalMem/4;
	cures = cudaGetDeviceProperties(&prop, dst);
	if (STARPU_UNLIKELY(cures)) STARPU_CUDA_REPORT_ERROR(cures);
	if (size > prop.totalGlobalMem/4) size = prop.totalGlobalMem/4;

	/* Initialize the CUDA context on the source */
	cudaSetDevice(src);

	/* Allocate a buffer on the source device */
	unsigned char *s_buffer;
	cudaMalloc((void **)&s_buffer, size);
	assert(s_buffer);
	cudaMemset(s_buffer, 0, size);

	/* Initialize the CUDA context on the destination */
	cudaSetDevice(dst);

	/* Allocate a buffer on the destination device */
	unsigned char *d_buffer;
	cudaMalloc((void **)&d_buffer, size);
	assert(d_buffer);
	cudaMemset(d_buffer, 0, size);

	unsigned iter;
	double timing;
	struct timeval start;
	struct timeval end;

	/* Measure the device-to-device bandwidth */
	gettimeofday(&start, NULL);
	for (iter = 0; iter < NITER; iter++)
	{
		cudaMemcpyPeer(d_buffer, dst, s_buffer, src, size);
		cudaThreadSynchronize();
	}
	gettimeofday(&end, NULL);
	timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
	cudadev_timing_dtod[src+1][dst+1] = timing/NITER/size;

	/* Free buffers */
	cudaFree(d_buffer);
	cudaSetDevice(src);
	cudaFree(s_buffer);

	cudaThreadExit();
}
#endif
#endif

#ifdef STARPU_USE_OPENCL
static void measure_bandwidth_between_host_and_dev_on_cpu_with_opencl(int dev, int cpu, struct dev_timing *dev_timing_per_cpu)
{
	cl_context context;
	cl_command_queue queue;
	cl_int err = 0;
	size_t size = SIZE;

	struct starpu_machine_config_s *config = _starpu_get_machine_config();
	_starpu_bind_thread_on_cpu(config, cpu);

	/* Initialize the OpenCL context on the device */
	_starpu_opencl_init_context(dev);
	starpu_opencl_get_context(dev, &context);
	starpu_opencl_get_queue(dev, &queue);

	/* Get the maximum size which can be allocated on the device */
	cl_device_id device;
	cl_ulong maxMemAllocSize;
	starpu_opencl_get_device(dev, &device);
	err = clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(maxMemAllocSize), &maxMemAllocSize, NULL);
	if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
	if (size > (size_t)maxMemAllocSize/4) size = maxMemAllocSize/4;
	/* hack to prevent third-party libs from rebinding the thread */
	_starpu_bind_thread_on_cpu(config, cpu);

	/* Allocate a buffer on the device */
	cl_mem d_buffer;
	d_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, &err);
	if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
	/* hack to prevent third-party libs from rebinding the thread */
	_starpu_bind_thread_on_cpu(config, cpu);

	/* Allocate a buffer on the host */
	unsigned char *h_buffer;
	h_buffer = malloc(size);
	assert(h_buffer);
	/* hack to prevent third-party libs from rebinding the thread */
	_starpu_bind_thread_on_cpu(config, cpu);

	/* Fill them */
	memset(h_buffer, 0, size);
	err = clEnqueueWriteBuffer(queue, d_buffer, CL_TRUE, 0, size, h_buffer, 0, NULL, NULL);
	if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
	/* hack to prevent third-party libs from rebinding the thread */
	_starpu_bind_thread_on_cpu(config, cpu);

	unsigned iter;
	double timing;
	struct timeval start;
	struct timeval end;

	/* Measure upload bandwidth */
	gettimeofday(&start, NULL);
	for (iter = 0; iter < NITER; iter++)
	{
		err = clEnqueueWriteBuffer(queue, d_buffer, CL_TRUE, 0, size, h_buffer, 0, NULL, NULL);
		if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
	}
	gettimeofday(&end, NULL);
	timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
	dev_timing_per_cpu[(dev+1)*MAXCPUS+cpu].timing_htod = timing/NITER/size;

	/* Measure download bandwidth */
	gettimeofday(&start, NULL);
	for (iter = 0; iter < NITER; iter++)
	{
		err = clEnqueueReadBuffer(queue, d_buffer, CL_TRUE, 0, size, h_buffer, 0, NULL, NULL);
		if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
	}
	gettimeofday(&end, NULL);
	timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
	dev_timing_per_cpu[(dev+1)*MAXCPUS+cpu].timing_dtoh = timing/NITER/size;

	/* Free buffers */
	clReleaseMemObject(d_buffer);
	free(h_buffer);

	/* Deinitialize the OpenCL context on the device */
	_starpu_opencl_deinit_context(dev);
}
#endif

/* NB: we want to sort the CPUs by DECREASING bandwidth, i.e. by increasing
 * slowness, so that the best CPU comes first. */
static int compar_dev_timing(const void *left_dev_timing, const void *right_dev_timing)
{
	const struct dev_timing *left = left_dev_timing;
	const struct dev_timing *right = right_dev_timing;

	/* These timings are slownesses (µs per byte), so a smaller sum of
	 * squares means a higher bandwidth. */
	double slowness_sum2_left = left->timing_dtoh*left->timing_dtoh + left->timing_htod*left->timing_htod;
	double slowness_sum2_right = right->timing_dtoh*right->timing_dtoh + right->timing_htod*right->timing_htod;

	/* proper three-way comparison for qsort */
	if (slowness_sum2_left < slowness_sum2_right)
		return -1;
	if (slowness_sum2_left > slowness_sum2_right)
		return 1;
	return 0;
}

#ifdef STARPU_HAVE_HWLOC
static int find_numa_node(hwloc_obj_t obj)
{
	STARPU_ASSERT(obj);
	hwloc_obj_t current = obj;

	while (current->type != HWLOC_OBJ_NODE)
	{
		current = current->parent;

		/* If we don't find a "node" object before the root, this means
		 * hwloc does not know whether there are NUMA nodes or not, so
		 * we should not use a per-node sampling in that case. */
		STARPU_ASSERT(current);
	}

	STARPU_ASSERT(current->type == HWLOC_OBJ_NODE);

	return current->logical_index;
}
#endif

static void measure_bandwidth_between_cpus_and_dev(int dev, struct dev_timing *dev_timing_per_cpu, char type)
{
	/* Either we have hwloc and we measure the bandwidth between each GPU
	 * and each NUMA node, or we don't have such NUMA information and we
	 * measure the bandwidth for each pair of (CPU, GPU), which is slower.
	 */
#ifdef STARPU_HAVE_HWLOC
	int cpu_depth = hwloc_get_type_depth(hwtopology, HWLOC_OBJ_CORE);
	int nnuma_nodes = hwloc_get_nbobjs_by_type(hwtopology, HWLOC_OBJ_NODE);

	/* If no NUMA node was found, we assume that we have a single memory
	 * bank. */
	const unsigned no_node_obj_was_found = (nnuma_nodes == 0);

	unsigned is_available_per_numa_node[nnuma_nodes];
	double dev_timing_htod_per_numa_node[nnuma_nodes];
	double dev_timing_dtoh_per_numa_node[nnuma_nodes];

	memset(is_available_per_numa_node, 0, nnuma_nodes*sizeof(unsigned));
#endif

	unsigned cpu;
	for (cpu = 0; cpu < ncpus; cpu++)
	{
		dev_timing_per_cpu[(dev+1)*MAXCPUS+cpu].cpu_id = cpu;

#ifdef STARPU_HAVE_HWLOC
		int numa_id = 0;

		if (!no_node_obj_was_found)
		{
			hwloc_obj_t obj = hwloc_get_obj_by_depth(hwtopology, cpu_depth, cpu);

			numa_id = find_numa_node(obj);

			if (is_available_per_numa_node[numa_id])
			{
				/* We reuse the previous numbers for that NUMA node */
				dev_timing_per_cpu[(dev+1)*MAXCPUS+cpu].timing_htod =
					dev_timing_htod_per_numa_node[numa_id];
				dev_timing_per_cpu[(dev+1)*MAXCPUS+cpu].timing_dtoh =
					dev_timing_dtoh_per_numa_node[numa_id];
				continue;
			}
		}
#endif

#ifdef STARPU_USE_CUDA
		if (type == 'C')
			measure_bandwidth_between_host_and_dev_on_cpu_with_cuda(dev, cpu, dev_timing_per_cpu);
#endif
#ifdef STARPU_USE_OPENCL
		if (type == 'O')
			measure_bandwidth_between_host_and_dev_on_cpu_with_opencl(dev, cpu, dev_timing_per_cpu);
#endif

#ifdef STARPU_HAVE_HWLOC
		if (!no_node_obj_was_found && !is_available_per_numa_node[numa_id])
		{
			/* Save the results for that NUMA node */
			dev_timing_htod_per_numa_node[numa_id] =
				dev_timing_per_cpu[(dev+1)*MAXCPUS+cpu].timing_htod;
			dev_timing_dtoh_per_numa_node[numa_id] =
				dev_timing_per_cpu[(dev+1)*MAXCPUS+cpu].timing_dtoh;

			is_available_per_numa_node[numa_id] = 1;
		}
#endif
	}
}

static void measure_bandwidth_between_host_and_dev(int dev, double *dev_timing_htod, double *dev_timing_dtoh,
				struct dev_timing *dev_timing_per_cpu, char type)
{
	measure_bandwidth_between_cpus_and_dev(dev, dev_timing_per_cpu, type);

	/* sort the results, best CPU first */
	qsort(&(dev_timing_per_cpu[(dev+1)*MAXCPUS]), ncpus,
	      sizeof(struct dev_timing),
	      compar_dev_timing);

#ifdef STARPU_VERBOSE
	unsigned cpu;
	for (cpu = 0; cpu < ncpus; cpu++)
	{
		unsigned current_cpu = dev_timing_per_cpu[(dev+1)*MAXCPUS+cpu].cpu_id;
		double slowness_dtoh = dev_timing_per_cpu[(dev+1)*MAXCPUS+cpu].timing_dtoh;
		double slowness_htod = dev_timing_per_cpu[(dev+1)*MAXCPUS+cpu].timing_htod;
		double slowness_sum2 = slowness_dtoh*slowness_dtoh + slowness_htod*slowness_htod;

		_STARPU_DISP("BANDWIDTH GPU %d CPU %u - htod %f - dtoh %f - %f\n", dev, current_cpu, slowness_htod, slowness_dtoh, sqrt(slowness_sum2));
	}

	unsigned best_cpu = dev_timing_per_cpu[(dev+1)*MAXCPUS+0].cpu_id;
	_STARPU_DISP("BANDWIDTH GPU %d BEST CPU %u\n", dev, best_cpu);
#endif

	/* The results are sorted so that the best measurement comes first:
	 * entry 0 gives the timings obtained from the best CPU. */
	dev_timing_dtoh[dev+1] = dev_timing_per_cpu[(dev+1)*MAXCPUS+0].timing_dtoh;
	dev_timing_htod[dev+1] = dev_timing_per_cpu[(dev+1)*MAXCPUS+0].timing_htod;
}
#endif /* defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL) */

static void benchmark_all_gpu_devices(void)
{
#if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
	int i, j;

	_STARPU_DEBUG("Benchmarking the speed of the bus\n");

#ifdef STARPU_HAVE_HWLOC
	hwloc_topology_init(&hwtopology);
	hwloc_topology_load(hwtopology);
#endif

	/* TODO: use hwloc */
#ifdef __linux__
	/* Save the current CPU binding */
	cpu_set_t former_process_affinity;
	int ret;
	ret = sched_getaffinity(0, sizeof(former_process_affinity), &former_process_affinity);
	if (ret)
	{
		perror("sched_getaffinity");
		STARPU_ABORT();
	}
#else
#warning Missing binding support, StarPU will not be able to properly benchmark NUMA topology
#endif

	struct starpu_machine_config_s *config = _starpu_get_machine_config();
	ncpus = _starpu_topology_get_nhwcpu(config);

#ifdef STARPU_USE_CUDA
	ncuda = _starpu_get_cuda_device_count();
	for (i = 0; i < ncuda; i++)
	{
		fprintf(stderr, " CUDA %d...", i);
		/* measure the bandwidth between the host and device i */
		measure_bandwidth_between_host_and_dev(i, cudadev_timing_htod, cudadev_timing_dtoh, cudadev_timing_per_cpu, 'C');
	}
#ifdef HAVE_CUDA_MEMCPY_PEER
	for (i = 0; i < ncuda; i++)
		for (j = 0; j < ncuda; j++)
			if (i != j)
			{
				fprintf(stderr, " CUDA %d -> %d...", i, j);
				/* measure the bandwidth between device i and device j */
				measure_bandwidth_between_dev_and_dev_cuda(i, j);
			}
#endif
#endif
#ifdef STARPU_USE_OPENCL
	nopencl = _starpu_opencl_get_device_count();
	for (i = 0; i < nopencl; i++)
	{
		fprintf(stderr, " OpenCL %d...", i);
		/* measure the bandwidth between the host and device i */
		measure_bandwidth_between_host_and_dev(i, opencldev_timing_htod, opencldev_timing_dtoh, opencldev_timing_per_cpu, 'O');
	}
#endif

	/* FIXME: use hwloc */
#ifdef __linux__
	/* Restore the former affinity */
	ret = sched_setaffinity(0, sizeof(former_process_affinity), &former_process_affinity);
	if (ret)
	{
		perror("sched_setaffinity");
		STARPU_ABORT();
	}
#endif

#ifdef STARPU_HAVE_HWLOC
	hwloc_topology_destroy(hwtopology);
#endif

	_STARPU_DEBUG("Benchmarking the speed of the bus is done.\n");
#endif /* defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL) */

	was_benchmarked = 1;
}

static void get_bus_path(const char *type, char *path, size_t maxlen)
{
	_starpu_get_perf_model_dir_bus(path, maxlen);
	/* strncat's bound is the number of characters to append, i.e. the
	 * space remaining in the buffer, not its total size */
	strncat(path, type, maxlen - strlen(path) - 1);

	char hostname[32];
	char *forced_hostname = getenv("STARPU_HOSTNAME");
	if (forced_hostname && forced_hostname[0])
		snprintf(hostname, sizeof(hostname), "%s", forced_hostname);
	else
		gethostname(hostname, sizeof(hostname));
	strncat(path, ".", maxlen - strlen(path) - 1);
	strncat(path, hostname, maxlen - strlen(path) - 1);
}
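
/* The resulting path has the form <bus sampling dir>/<type>.<hostname>
 * (e.g. ".../bus/affinity.mymachine" -- illustrative), so that samples from
 * different machines sharing the same home directory do not collide; the
 * STARPU_HOSTNAME environment variable overrides the actual hostname. */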

/*
 * Affinity
 */

static void get_affinity_path(char *path, size_t maxlen)
{
	get_bus_path("affinity", path, maxlen);
}

static void load_bus_affinity_file_content(void)
{
	FILE *f;

	char path[256];
	get_affinity_path(path, 256);

	f = fopen(path, "r");
	STARPU_ASSERT(f);

#if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
	struct starpu_machine_config_s *config = _starpu_get_machine_config();
	ncpus = _starpu_topology_get_nhwcpu(config);
	int gpu;

#ifdef STARPU_USE_CUDA
	ncuda = _starpu_get_cuda_device_count();
	for (gpu = 0; gpu < ncuda; gpu++)
	{
		int ret;
		int dummy;

		_starpu_drop_comments(f);
		ret = fscanf(f, "%d\t", &dummy);
		STARPU_ASSERT(ret == 1);
		STARPU_ASSERT(dummy == gpu);

		unsigned cpu;
		for (cpu = 0; cpu < ncpus; cpu++)
		{
			ret = fscanf(f, "%d\t", &cuda_affinity_matrix[gpu][cpu]);
			STARPU_ASSERT(ret == 1);
		}

		ret = fscanf(f, "\n");
		STARPU_ASSERT(ret == 0);
	}
#endif
#ifdef STARPU_USE_OPENCL
	nopencl = _starpu_opencl_get_device_count();
	for (gpu = 0; gpu < nopencl; gpu++)
	{
		int ret;
		int dummy;

		_starpu_drop_comments(f);
		ret = fscanf(f, "%d\t", &dummy);
		STARPU_ASSERT(ret == 1);
		STARPU_ASSERT(dummy == gpu);

		unsigned cpu;
		for (cpu = 0; cpu < ncpus; cpu++)
		{
			ret = fscanf(f, "%d\t", &opencl_affinity_matrix[gpu][cpu]);
			STARPU_ASSERT(ret == 1);
		}

		ret = fscanf(f, "\n");
		STARPU_ASSERT(ret == 0);
	}
#endif
#endif

	fclose(f);
}

static void write_bus_affinity_file_content(void)
{
	FILE *f;

	STARPU_ASSERT(was_benchmarked);

	char path[256];
	get_affinity_path(path, 256);

	f = fopen(path, "w+");
	if (!f)
	{
		perror("fopen write_bus_affinity_file_content");
		_STARPU_DISP("path '%s'\n", path);
		fflush(stderr);
		STARPU_ABORT();
	}

#if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
	unsigned cpu;
	int gpu;

	fprintf(f, "# GPU\t");
	for (cpu = 0; cpu < ncpus; cpu++)
		fprintf(f, "CPU%u\t", cpu);
	fprintf(f, "\n");

#ifdef STARPU_USE_CUDA
	for (gpu = 0; gpu < ncuda; gpu++)
	{
		fprintf(f, "%d\t", gpu);

		for (cpu = 0; cpu < ncpus; cpu++)
		{
			fprintf(f, "%d\t", cudadev_timing_per_cpu[(gpu+1)*MAXCPUS+cpu].cpu_id);
		}

		fprintf(f, "\n");
	}
#endif
#ifdef STARPU_USE_OPENCL
	for (gpu = 0; gpu < nopencl; gpu++)
	{
		fprintf(f, "%d\t", gpu);

		for (cpu = 0; cpu < ncpus; cpu++)
		{
			fprintf(f, "%d\t", opencldev_timing_per_cpu[(gpu+1)*MAXCPUS+cpu].cpu_id);
		}

		fprintf(f, "\n");
	}
#endif

	fclose(f);
#endif
}
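
/* The affinity file written above lists, for each GPU, the CPU identifiers
 * sorted from best to worst measured bandwidth, one row per GPU, e.g.
 * (illustrative):
 *
 *   # GPU	CPU0	CPU1	CPU2	CPU3
 *   0	2	3	0	1
 *   1	0	1	2	3
 */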

static void generate_bus_affinity_file(void)
{
	if (!was_benchmarked)
		benchmark_all_gpu_devices();

	write_bus_affinity_file_content();
}

static void load_bus_affinity_file(void)
{
	int res;

	char path[256];
	get_affinity_path(path, 256);

	res = access(path, F_OK);
	if (res)
	{
		/* The file does not exist yet */
		generate_bus_affinity_file();
	}

	load_bus_affinity_file_content();
}

#ifdef STARPU_USE_CUDA
int *_starpu_get_cuda_affinity_vector(unsigned gpuid)
{
	return cuda_affinity_matrix[gpuid];
}
#endif /* STARPU_USE_CUDA */

#ifdef STARPU_USE_OPENCL
int *_starpu_get_opencl_affinity_vector(unsigned gpuid)
{
	return opencl_affinity_matrix[gpuid];
}
#endif /* STARPU_USE_OPENCL */

/*
 * Latency
 */

static void get_latency_path(char *path, size_t maxlen)
{
	get_bus_path("latency", path, maxlen);
}

static int load_bus_latency_file_content(void)
{
	int n;
	unsigned src, dst;
	FILE *f;

	char path[256];
	get_latency_path(path, 256);

	f = fopen(path, "r");
	STARPU_ASSERT(f);

	for (src = 0; src < STARPU_MAXNODES; src++)
	{
		_starpu_drop_comments(f);
		for (dst = 0; dst < STARPU_MAXNODES; dst++)
		{
			double latency;

			n = fscanf(f, "%lf", &latency);
			if (n != 1) {
				fclose(f);
				return 0;
			}
			n = getc(f);
			if (n != '\t') {
				fclose(f);
				return 0;
			}

			latency_matrix[src][dst] = latency;
		}

		n = getc(f);
		if (n != '\n') {
			fclose(f);
			return 0;
		}
	}

	fclose(f);
	return 1;
}
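
/* NB: no actual latency benchmark is run; the file written below contains
 * hard-coded estimates: 0 µs from a node to itself, 500 µs between the host
 * and a device, 2000 µs between two devices, and -1 (by convention) for
 * unused nodes. */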

static void write_bus_latency_file_content(void)
{
	int src, dst, maxnode;
	FILE *f;

	STARPU_ASSERT(was_benchmarked);

	char path[256];
	get_latency_path(path, 256);

	f = fopen(path, "w+");
	if (!f)
	{
		perror("fopen write_bus_latency_file_content");
		_STARPU_DISP("path '%s'\n", path);
		fflush(stderr);
		STARPU_ABORT();
	}

	fprintf(f, "# ");
	for (dst = 0; dst < STARPU_MAXNODES; dst++)
		fprintf(f, "to %d\t\t", dst);
	fprintf(f, "\n");

	maxnode = ncuda;
#ifdef STARPU_USE_OPENCL
	maxnode += nopencl;
#endif
	for (src = 0; src < STARPU_MAXNODES; src++)
	{
		for (dst = 0; dst < STARPU_MAXNODES; dst++)
		{
			double latency;

			if ((src > maxnode) || (dst > maxnode))
			{
				/* convention */
				latency = -1.0;
			}
			else if (src == dst)
			{
				latency = 0.0;
			}
			else {
				/* µs: 2000 between two devices, 500 between
				 * the host and a device */
				latency = ((src && dst)?2000.0:500.0);
			}

			fprintf(f, "%f\t", latency);
		}
		fprintf(f, "\n");
	}

	fclose(f);
}

static void generate_bus_latency_file(void)
{
	if (!was_benchmarked)
		benchmark_all_gpu_devices();

	write_bus_latency_file_content();
}

static void load_bus_latency_file(void)
{
	int res;

	char path[256];
	get_latency_path(path, 256);

	res = access(path, F_OK);
	if (res || !load_bus_latency_file_content())
	{
		/* The file does not exist yet or is bogus */
		generate_bus_latency_file();
	}
}

/*
 * Bandwidth
 */

static void get_bandwidth_path(char *path, size_t maxlen)
{
	get_bus_path("bandwidth", path, maxlen);
}

static int load_bus_bandwidth_file_content(void)
{
	int n;
	unsigned src, dst;
	FILE *f;

	char path[256];
	get_bandwidth_path(path, 256);

	f = fopen(path, "r");
	if (!f)
	{
		perror("fopen load_bus_bandwidth_file_content");
		_STARPU_DISP("path '%s'\n", path);
		fflush(stderr);
		STARPU_ABORT();
	}

	for (src = 0; src < STARPU_MAXNODES; src++)
	{
		_starpu_drop_comments(f);
		for (dst = 0; dst < STARPU_MAXNODES; dst++)
		{
			double bandwidth;

			n = fscanf(f, "%lf", &bandwidth);
			if (n != 1) {
				fprintf(stderr, "didn't get a number\n");
				fclose(f);
				return 0;
			}
			n = getc(f);
			if (n != '\t') {
				fclose(f);
				return 0;
			}

			bandwidth_matrix[src][dst] = bandwidth;
		}

		n = getc(f);
		if (n != '\n') {
			fclose(f);
			return 0;
		}
	}

	fclose(f);
	return 1;
}

static void write_bus_bandwidth_file_content(void)
{
	int src, dst, maxnode;
	FILE *f;

	STARPU_ASSERT(was_benchmarked);

	char path[256];
	get_bandwidth_path(path, 256);

	f = fopen(path, "w+");
	STARPU_ASSERT(f);

	fprintf(f, "# ");
	for (dst = 0; dst < STARPU_MAXNODES; dst++)
		fprintf(f, "to %d\t\t", dst);
	fprintf(f, "\n");

	maxnode = ncuda;
#ifdef STARPU_USE_OPENCL
	maxnode += nopencl;
#endif
	for (src = 0; src < STARPU_MAXNODES; src++)
	{
		for (dst = 0; dst < STARPU_MAXNODES; dst++)
		{
			double bandwidth;

			if ((src > maxnode) || (dst > maxnode))
			{
				bandwidth = -1.0;
			}
#if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
			else if (src != dst)
			{
				double slowness = 0.0;
				/* Total bandwidth is the harmonic mean of bandwidths */
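				/* Going through the host costs the
				 * device-to-host time plus the host-to-device
				 * time, so the slownesses (µs per byte) simply
				 * add up and the resulting bandwidth is
				 * 1/(1/bw_dtoh + 1/bw_htod); e.g.
				 * (illustrative) chaining two 4 GB/s links
				 * yields 2 GB/s overall. */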
#ifdef STARPU_USE_CUDA
#ifdef HAVE_CUDA_MEMCPY_PEER
				if (src && src <= ncuda && dst && dst <= ncuda)
					/* Direct GPU-GPU transfer */
					slowness = cudadev_timing_dtod[src][dst];
				else
#endif
				{
					if (src && src <= ncuda)
						slowness += cudadev_timing_dtoh[src];
					if (dst && dst <= ncuda)
						slowness += cudadev_timing_htod[dst];
				}
#endif
#ifdef STARPU_USE_OPENCL
				if (src > ncuda)
					slowness += opencldev_timing_dtoh[src-ncuda];
				if (dst > ncuda)
					slowness += opencldev_timing_htod[dst-ncuda];
#endif
				bandwidth = 1.0/slowness;
			}
#endif
			else {
				/* convention */
				bandwidth = 0.0;
			}

			fprintf(f, "%f\t", bandwidth);
		}
		fprintf(f, "\n");
	}

	fclose(f);
}

void starpu_print_bus_bandwidth(FILE *f)
{
	int src, dst, maxnode;

	maxnode = ncuda;
#ifdef STARPU_USE_OPENCL
	maxnode += nopencl;
#endif

	fprintf(f, "from\t");
	fprintf(f, "to RAM\t\t");
	for (dst = 0; dst < ncuda; dst++)
		fprintf(f, "to CUDA %d\t", dst);
	for (dst = 0; dst < nopencl; dst++)
		fprintf(f, "to OpenCL %d\t", dst);
	fprintf(f, "\n");

	for (src = 0; src <= maxnode; src++)
	{
		if (!src)
			fprintf(f, "RAM\t");
		else if (src <= ncuda)
			fprintf(f, "CUDA %d\t", src-1);
		else
			fprintf(f, "OpenCL %d\t", src-ncuda-1);

		for (dst = 0; dst <= maxnode; dst++)
			fprintf(f, "%f\t", bandwidth_matrix[src][dst]);

		fprintf(f, "\n");
	}
}
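
/* The table printed above has one row and one column per memory node (RAM,
 * then each CUDA device, then each OpenCL device); entry (i, j) is the
 * measured bandwidth from node i to node j, e.g. (shape only):
 *
 *   from	to RAM		to CUDA 0	to OpenCL 0
 *   RAM	0.000000	...		...
 *   CUDA 0	...		0.000000	...
 */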

static void generate_bus_bandwidth_file(void)
{
	if (!was_benchmarked)
		benchmark_all_gpu_devices();

	write_bus_bandwidth_file_content();
}

static void load_bus_bandwidth_file(void)
{
	int res;

	char path[256];
	get_bandwidth_path(path, 256);

	res = access(path, F_OK);
	if (res || !load_bus_bandwidth_file_content())
	{
		/* The file does not exist yet or is bogus */
		generate_bus_bandwidth_file();
	}
}

/*
 * Config
 */

static void get_config_path(char *path, size_t maxlen)
{
	get_bus_path("config", path, maxlen);
}

static void check_bus_config_file(void)
{
	int res;
	char path[256];

	get_config_path(path, 256);
	res = access(path, F_OK);
	if (res) {
		fprintf(stderr, "No performance model for the bus, calibrating...");
		starpu_force_bus_sampling();
		fprintf(stderr, "done\n");
	}
	else {
		FILE *f;
		int ret, read_cuda, read_opencl;
		unsigned read_cpus;
		struct starpu_machine_config_s *config = _starpu_get_machine_config();

		/* Load the stored configuration from the file */
		f = fopen(path, "r");
		STARPU_ASSERT(f);
		_starpu_drop_comments(f);
		ret = fscanf(f, "%u\t", &read_cpus);
		STARPU_ASSERT(ret == 1);
		_starpu_drop_comments(f);
		ret = fscanf(f, "%d\t", &read_cuda);
		STARPU_ASSERT(ret == 1);
		_starpu_drop_comments(f);
		ret = fscanf(f, "%d\t", &read_opencl);
		STARPU_ASSERT(ret == 1);
		_starpu_drop_comments(f);
		fclose(f);

		/* Detect the current configuration */
		ncpus = _starpu_topology_get_nhwcpu(config);
#ifdef STARPU_USE_CUDA
		ncuda = _starpu_get_cuda_device_count();
#endif
#ifdef STARPU_USE_OPENCL
		nopencl = _starpu_opencl_get_device_count();
#endif

		/* Check whether both configurations match */
		if (read_cpus != ncpus) {
			fprintf(stderr, "Current configuration does not match the bus performance model (CPUS: (stored) %u != (current) %u), recalibrating...", read_cpus, ncpus);
			starpu_force_bus_sampling();
			fprintf(stderr, "done\n");
		}
		else if (read_cuda != ncuda) {
			fprintf(stderr, "Current configuration does not match the bus performance model (CUDA: (stored) %d != (current) %d), recalibrating...", read_cuda, ncuda);
			starpu_force_bus_sampling();
			fprintf(stderr, "done\n");
		}
		else if (read_opencl != nopencl) {
			fprintf(stderr, "Current configuration does not match the bus performance model (OpenCL: (stored) %d != (current) %d), recalibrating...", read_opencl, nopencl);
			starpu_force_bus_sampling();
			fprintf(stderr, "done\n");
		}
	}
}

static void write_bus_config_file_content(void)
{
	FILE *f;
	char path[256];

	STARPU_ASSERT(was_benchmarked);

	get_config_path(path, 256);
	f = fopen(path, "w+");
	STARPU_ASSERT(f);

	fprintf(f, "# Current configuration\n");
	fprintf(f, "%u # Number of CPUs\n", ncpus);
	fprintf(f, "%d # Number of CUDA devices\n", ncuda);
	fprintf(f, "%d # Number of OpenCL devices\n", nopencl);

	fclose(f);
}

static void generate_bus_config_file(void)
{
	if (!was_benchmarked)
		benchmark_all_gpu_devices();

	write_bus_config_file_content();
}

/*
 * Generic
 */

void starpu_force_bus_sampling(void)
{
	_starpu_create_sampling_directory_if_needed();

	generate_bus_affinity_file();
	generate_bus_latency_file();
	generate_bus_bandwidth_file();
	generate_bus_config_file();
}

void _starpu_load_bus_performance_files(void)
{
	_starpu_create_sampling_directory_if_needed();

	check_bus_config_file();
	load_bus_affinity_file();
	load_bus_latency_file();
	load_bus_bandwidth_file();
}
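
/* The prediction below combines the measured latency (µs) with the transfer
 * time size/bandwidth (µs, since bandwidths are stored as the reciprocal of
 * a µs-per-byte slowness). The extra 2*(ncudagpus+nopenclgpus) factor is,
 * presumably, a pessimistic correction for the bus being shared by all
 * devices transferring at the same time. */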
/* (in µs) */
double _starpu_predict_transfer_time(unsigned src_node, unsigned dst_node, size_t size)
{
	double bandwidth = bandwidth_matrix[src_node][dst_node];
	double latency = latency_matrix[src_node][dst_node];
	struct starpu_machine_topology_s *topology = &_starpu_get_machine_config()->topology;

	return latency + (size/bandwidth)*2*(topology->ncudagpus+topology->nopenclgpus);
}