perfmodel_bus.c 39 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496
  1. /* StarPU --- Runtime system for heterogeneous multicore architectures.
  2. *
  3. * Copyright (C) 2009-2012 Université de Bordeaux 1
  4. * Copyright (C) 2010, 2011, 2012 Centre National de la Recherche Scientifique
  5. *
  6. * StarPU is free software; you can redistribute it and/or modify
  7. * it under the terms of the GNU Lesser General Public License as published by
  8. * the Free Software Foundation; either version 2.1 of the License, or (at
  9. * your option) any later version.
  10. *
  11. * StarPU is distributed in the hope that it will be useful, but
  12. * WITHOUT ANY WARRANTY; without even the implied warranty of
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  14. *
  15. * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  16. */
  17. #ifdef STARPU_USE_CUDA
  18. #ifndef _GNU_SOURCE
  19. #define _GNU_SOURCE
  20. #endif
  21. #include <sched.h>
  22. #endif
  23. #include <unistd.h>
  24. #include <sys/time.h>
  25. #include <stdlib.h>
  26. #include <math.h>
  27. #include <starpu.h>
  28. #include <starpu_cuda.h>
  29. #include <starpu_opencl.h>
  30. #include <common/config.h>
  31. #include <core/workers.h>
  32. #include <core/perfmodel/perfmodel.h>
  33. #ifdef STARPU_USE_OPENCL
  34. #include <starpu_opencl.h>
  35. #endif
  36. #ifdef STARPU_HAVE_WINDOWS
  37. #include <windows.h>
  38. #endif
/* Size of the benchmark transfer buffer and number of iterations averaged
 * over for each measurement. */
#define SIZE	(32*1024*1024*sizeof(char))
#define NITER	128

#ifndef STARPU_SIMGRID
static void starpu_force_bus_sampling(void);
#endif

/* timing is in µs per byte (i.e. slowness, inverse of bandwidth) */
struct dev_timing
{
	int cpu_id;		/* CPU the measurement was taken from */
	double timing_htod;	/* host-to-device slowness (µs/byte) */
	double latency_htod;	/* host-to-device latency (µs) */
	double timing_dtoh;	/* device-to-host slowness (µs/byte) */
	double latency_dtoh;	/* device-to-host latency (µs) */
};

/* TODO: measure latency */
/* Per-pair-of-memory-nodes results filled in by the benchmarks below. */
static double bandwidth_matrix[STARPU_MAXNODES][STARPU_MAXNODES];
static double latency_matrix[STARPU_MAXNODES][STARPU_MAXNODES];
static unsigned was_benchmarked = 0;	/* set once benchmark_all_gpu_devices() completed */
static unsigned ncpus = 0;
static int ncuda = 0;
static int nopencl = 0;

/* Benchmarking the performance of the bus */

#ifdef STARPU_USE_CUDA
/* Preferred-CPU ordering per CUDA device, loaded/saved in the affinity file. */
static int cuda_affinity_matrix[STARPU_MAXCUDADEVS][STARPU_MAXCPUS];
/* Best per-device results; entries are indexed dev+1 throughout this file
 * (presumably slot 0 is the host node — see the dev+1 uses below). */
static double cudadev_timing_htod[STARPU_MAXNODES] = {0.0};
static double cudadev_latency_htod[STARPU_MAXNODES] = {0.0};
static double cudadev_timing_dtoh[STARPU_MAXNODES] = {0.0};
static double cudadev_latency_dtoh[STARPU_MAXNODES] = {0.0};
#ifdef HAVE_CUDA_MEMCPY_PEER
/* Direct device-to-device transfer results, indexed [src+1][dst+1]. */
static double cudadev_timing_dtod[STARPU_MAXNODES][STARPU_MAXNODES] = {{0.0}};
static double cudadev_latency_dtod[STARPU_MAXNODES][STARPU_MAXNODES] = {{0.0}};
#endif
static struct dev_timing cudadev_timing_per_cpu[STARPU_MAXNODES*STARPU_MAXCPUS];
#endif

#ifdef STARPU_USE_OPENCL
static int opencl_affinity_matrix[STARPU_MAXOPENCLDEVS][STARPU_MAXCPUS];
static double opencldev_timing_htod[STARPU_MAXNODES] = {0.0};
static double opencldev_latency_htod[STARPU_MAXNODES] = {0.0};
static double opencldev_timing_dtoh[STARPU_MAXNODES] = {0.0};
static double opencldev_latency_dtoh[STARPU_MAXNODES] = {0.0};
static struct dev_timing opencldev_timing_per_cpu[STARPU_MAXNODES*STARPU_MAXCPUS];
#endif

#ifdef STARPU_HAVE_HWLOC
static hwloc_topology_t hwtopology;
#endif
  84. #if (defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)) && !defined(STARPU_SIMGRID)
  85. #ifdef STARPU_USE_CUDA
  86. static void measure_bandwidth_between_host_and_dev_on_cpu_with_cuda(int dev, int cpu, struct dev_timing *dev_timing_per_cpu)
  87. {
  88. struct _starpu_machine_config *config = _starpu_get_machine_config();
  89. _starpu_bind_thread_on_cpu(config, cpu);
  90. size_t size = SIZE;
  91. /* Initialize CUDA context on the device */
  92. /* We do not need to enable OpenGL interoperability at this point,
  93. * since we cleanly shutdown CUDA before returning. */
  94. cudaSetDevice(dev);
  95. /* hack to avoid third party libs to rebind threads */
  96. _starpu_bind_thread_on_cpu(config, cpu);
  97. /* hack to force the initialization */
  98. cudaFree(0);
  99. /* hack to avoid third party libs to rebind threads */
  100. _starpu_bind_thread_on_cpu(config, cpu);
  101. /* Get the maximum size which can be allocated on the device */
  102. struct cudaDeviceProp prop;
  103. cudaError_t cures;
  104. cures = cudaGetDeviceProperties(&prop, dev);
  105. if (STARPU_UNLIKELY(cures)) STARPU_CUDA_REPORT_ERROR(cures);
  106. if (size > prop.totalGlobalMem/4) size = prop.totalGlobalMem/4;
  107. /* Allocate a buffer on the device */
  108. unsigned char *d_buffer;
  109. cudaMalloc((void **)&d_buffer, size);
  110. STARPU_ASSERT(d_buffer);
  111. /* hack to avoid third party libs to rebind threads */
  112. _starpu_bind_thread_on_cpu(config, cpu);
  113. /* Allocate a buffer on the host */
  114. unsigned char *h_buffer;
  115. cures = cudaHostAlloc((void **)&h_buffer, size, 0);
  116. STARPU_ASSERT(cures == cudaSuccess);
  117. /* hack to avoid third party libs to rebind threads */
  118. _starpu_bind_thread_on_cpu(config, cpu);
  119. /* Fill them */
  120. memset(h_buffer, 0, size);
  121. cudaMemset(d_buffer, 0, size);
  122. /* hack to avoid third party libs to rebind threads */
  123. _starpu_bind_thread_on_cpu(config, cpu);
  124. unsigned iter;
  125. double timing;
  126. struct timeval start;
  127. struct timeval end;
  128. /* Measure upload bandwidth */
  129. gettimeofday(&start, NULL);
  130. for (iter = 0; iter < NITER; iter++)
  131. {
  132. cudaMemcpy(d_buffer, h_buffer, size, cudaMemcpyHostToDevice);
  133. cudaThreadSynchronize();
  134. }
  135. gettimeofday(&end, NULL);
  136. timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
  137. dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+cpu].timing_htod = timing/NITER/size;
  138. /* Measure download bandwidth */
  139. gettimeofday(&start, NULL);
  140. for (iter = 0; iter < NITER; iter++)
  141. {
  142. cudaMemcpy(h_buffer, d_buffer, size, cudaMemcpyDeviceToHost);
  143. cudaThreadSynchronize();
  144. }
  145. gettimeofday(&end, NULL);
  146. timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
  147. dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+cpu].timing_dtoh = timing/NITER/size;
  148. /* Measure upload latency */
  149. gettimeofday(&start, NULL);
  150. for (iter = 0; iter < NITER; iter++)
  151. {
  152. cudaMemcpy(d_buffer, h_buffer, 1, cudaMemcpyHostToDevice);
  153. cudaThreadSynchronize();
  154. }
  155. gettimeofday(&end, NULL);
  156. timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
  157. dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+cpu].latency_htod = timing/NITER;
  158. /* Measure download latency */
  159. gettimeofday(&start, NULL);
  160. for (iter = 0; iter < NITER; iter++)
  161. {
  162. cudaMemcpy(d_buffer, h_buffer, 1, cudaMemcpyHostToDevice);
  163. cudaThreadSynchronize();
  164. }
  165. gettimeofday(&end, NULL);
  166. timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
  167. dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+cpu].latency_dtoh = timing/NITER;
  168. /* Free buffers */
  169. cudaFreeHost(h_buffer);
  170. cudaFree(d_buffer);
  171. cudaThreadExit();
  172. }
  173. #ifdef HAVE_CUDA_MEMCPY_PEER
/* Measure bandwidth (µs/byte) and latency (µs) of a direct CUDA
 * device-to-device copy from `src' to `dst', storing the results in
 * cudadev_timing_dtod[src+1][dst+1] and cudadev_latency_dtod[src+1][dst+1].
 * Peer (GPU-Direct) access is enabled in both directions when supported,
 * unless disabled through the STARPU_DISABLE_CUDA_GPU_GPU_DIRECT
 * environment variable. */
static void measure_bandwidth_between_dev_and_dev_cuda(int src, int dst)
{
	size_t size = SIZE;
	int can;

	/* Get the maximum size which can be allocated on the device:
	 * clamp the transfer size to a quarter of each device's memory */
	struct cudaDeviceProp prop;
	cudaError_t cures;
	cures = cudaGetDeviceProperties(&prop, src);
	if (STARPU_UNLIKELY(cures)) STARPU_CUDA_REPORT_ERROR(cures);
	if (size > prop.totalGlobalMem/4) size = prop.totalGlobalMem/4;
	cures = cudaGetDeviceProperties(&prop, dst);
	if (STARPU_UNLIKELY(cures)) STARPU_CUDA_REPORT_ERROR(cures);
	if (size > prop.totalGlobalMem/4) size = prop.totalGlobalMem/4;

	/* Initialize CUDA context on the source */
	/* We do not need to enable OpenGL interoperability at this point,
	 * since we cleanly shutdown CUDA before returning. */
	cudaSetDevice(src);

	/* Try to let src access dst's memory directly */
	if (starpu_get_env_number("STARPU_DISABLE_CUDA_GPU_GPU_DIRECT") <= 0) {
		cures = cudaDeviceCanAccessPeer(&can, src, dst);
		if (!cures && can) {
			cures = cudaDeviceEnablePeerAccess(dst, 0);
			if (!cures)
				_STARPU_DISP("GPU-Direct %d -> %d\n", dst, src);
		}
	}

	/* Allocate a buffer on the device */
	unsigned char *s_buffer;
	cudaMalloc((void **)&s_buffer, size);
	STARPU_ASSERT(s_buffer);
	cudaMemset(s_buffer, 0, size);

	/* Initialize CUDA context on the destination */
	/* We do not need to enable OpenGL interoperability at this point,
	 * since we cleanly shutdown CUDA before returning. */
	cudaSetDevice(dst);

	/* Try to let dst access src's memory directly */
	if (starpu_get_env_number("STARPU_DISABLE_CUDA_GPU_GPU_DIRECT") <= 0) {
		cures = cudaDeviceCanAccessPeer(&can, dst, src);
		if (!cures && can) {
			cures = cudaDeviceEnablePeerAccess(src, 0);
			if (!cures)
				_STARPU_DISP("GPU-Direct %d -> %d\n", src, dst);
		}
	}

	/* Allocate a buffer on the device */
	unsigned char *d_buffer;
	cudaMalloc((void **)&d_buffer, size);
	STARPU_ASSERT(d_buffer);
	cudaMemset(d_buffer, 0, size);

	unsigned iter;
	double timing;
	struct timeval start;
	struct timeval end;

	/* Measure device-to-device bandwidth */
	gettimeofday(&start, NULL);
	for (iter = 0; iter < NITER; iter++)
	{
		cudaMemcpyPeer(d_buffer, dst, s_buffer, src, size);
		cudaThreadSynchronize();
	}
	gettimeofday(&end, NULL);
	timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
	cudadev_timing_dtod[src+1][dst+1] = timing/NITER/size;

	/* Measure device-to-device latency: time a 1-byte transfer */
	gettimeofday(&start, NULL);
	for (iter = 0; iter < NITER; iter++)
	{
		cudaMemcpyPeer(d_buffer, dst, s_buffer, src, 1);
		cudaThreadSynchronize();
	}
	gettimeofday(&end, NULL);
	timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
	cudadev_latency_dtod[src+1][dst+1] = timing/NITER;

	/* Free buffers (free s_buffer in the context it was allocated in) */
	cudaFree(d_buffer);
	cudaSetDevice(src);
	cudaFree(s_buffer);

	cudaThreadExit();
}
  251. #endif
  252. #endif
  253. #ifdef STARPU_USE_OPENCL
/* Measure host<->device bandwidth (µs/byte) and latency (µs) for OpenCL
 * device `dev' as seen from CPU `cpu', storing the results in
 * dev_timing_per_cpu at index (dev+1)*STARPU_MAXCPUS+cpu.
 * All transfers use blocking (CL_TRUE) enqueues, so no explicit finish
 * is needed between iterations. */
static void measure_bandwidth_between_host_and_dev_on_cpu_with_opencl(int dev, int cpu, struct dev_timing *dev_timing_per_cpu)
{
	cl_context context;
	cl_command_queue queue;
	cl_int err=0;
	size_t size = SIZE;
	int not_initialized;

	struct _starpu_machine_config *config = _starpu_get_machine_config();
	_starpu_bind_thread_on_cpu(config, cpu);

	/* Is the context already initialised ? */
	starpu_opencl_get_context(dev, &context);
	not_initialized = (context == NULL);
	if (not_initialized == 1)
		_starpu_opencl_init_context(dev);

	/* Get context and queue */
	starpu_opencl_get_context(dev, &context);
	starpu_opencl_get_queue(dev, &queue);

	/* Get the maximum size which can be allocated on the device:
	 * clamp to a quarter of the device's max allocation size */
	cl_device_id device;
	cl_ulong maxMemAllocSize;
	starpu_opencl_get_device(dev, &device);
	err = clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(maxMemAllocSize), &maxMemAllocSize, NULL);
	if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
	if (size > (size_t)maxMemAllocSize/4) size = maxMemAllocSize/4;

	if (_starpu_opencl_get_device_type(dev) == CL_DEVICE_TYPE_CPU)
	{
		/* Let's not use too much RAM when running OpenCL on a CPU: it
		 * would make the OS swap like crazy. */
		size /= 2;
	}

	/* hack to avoid third party libs to rebind threads */
	_starpu_bind_thread_on_cpu(config, cpu);

	/* Allocate a buffer on the device */
	cl_mem d_buffer;
	d_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, &err);
	if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);

	/* hack to avoid third party libs to rebind threads */
	_starpu_bind_thread_on_cpu(config, cpu);

	/* Allocate a buffer on the host */
	unsigned char *h_buffer;
	h_buffer = (unsigned char *)malloc(size);
	STARPU_ASSERT(h_buffer);

	/* hack to avoid third party libs to rebind threads */
	_starpu_bind_thread_on_cpu(config, cpu);

	/* Fill them */
	memset(h_buffer, 0, size);
	err = clEnqueueWriteBuffer(queue, d_buffer, CL_TRUE, 0, size, h_buffer, 0, NULL, NULL);
	if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);

	/* hack to avoid third party libs to rebind threads */
	_starpu_bind_thread_on_cpu(config, cpu);

	unsigned iter;
	double timing;
	struct timeval start;
	struct timeval end;

	/* Measure upload bandwidth */
	gettimeofday(&start, NULL);
	for (iter = 0; iter < NITER; iter++)
	{
		err = clEnqueueWriteBuffer(queue, d_buffer, CL_TRUE, 0, size, h_buffer, 0, NULL, NULL);
		if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
	}
	gettimeofday(&end, NULL);
	timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
	dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+cpu].timing_htod = timing/NITER/size;

	/* Measure download bandwidth */
	gettimeofday(&start, NULL);
	for (iter = 0; iter < NITER; iter++)
	{
		err = clEnqueueReadBuffer(queue, d_buffer, CL_TRUE, 0, size, h_buffer, 0, NULL, NULL);
		if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
	}
	gettimeofday(&end, NULL);
	timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
	dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+cpu].timing_dtoh = timing/NITER/size;

	/* Measure upload latency: time a 1-byte transfer */
	gettimeofday(&start, NULL);
	for (iter = 0; iter < NITER; iter++)
	{
		err = clEnqueueWriteBuffer(queue, d_buffer, CL_TRUE, 0, 1, h_buffer, 0, NULL, NULL);
		if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
	}
	gettimeofday(&end, NULL);
	timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
	dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+cpu].latency_htod = timing/NITER;

	/* Measure download latency: time a 1-byte transfer */
	gettimeofday(&start, NULL);
	for (iter = 0; iter < NITER; iter++)
	{
		err = clEnqueueReadBuffer(queue, d_buffer, CL_TRUE, 0, 1, h_buffer, 0, NULL, NULL);
		if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
	}
	gettimeofday(&end, NULL);
	timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
	dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+cpu].latency_dtoh = timing/NITER;

	/* Free buffers */
	err = clReleaseMemObject(d_buffer);
	if (STARPU_UNLIKELY(err != CL_SUCCESS))
		STARPU_OPENCL_REPORT_ERROR(err);

	free(h_buffer);

	/* Uninitialize the OpenCL context on the device only if we
	 * initialized it ourselves above */
	if (not_initialized == 1)
		_starpu_opencl_deinit_context(dev);
}
  357. #endif
  358. /* NB: we want to sort the bandwidth by DECREASING order */
  359. static int compar_dev_timing(const void *left_dev_timing, const void *right_dev_timing)
  360. {
  361. const struct dev_timing *left = (const struct dev_timing *)left_dev_timing;
  362. const struct dev_timing *right = (const struct dev_timing *)right_dev_timing;
  363. double left_dtoh = left->timing_dtoh;
  364. double left_htod = left->timing_htod;
  365. double right_dtoh = right->timing_dtoh;
  366. double right_htod = right->timing_htod;
  367. double timing_sum2_left = left_dtoh*left_dtoh + left_htod*left_htod;
  368. double timing_sum2_right = right_dtoh*right_dtoh + right_htod*right_htod;
  369. /* it's for a decreasing sorting */
  370. return (timing_sum2_left > timing_sum2_right);
  371. }
  372. #ifdef STARPU_HAVE_HWLOC
  373. static int find_numa_node(hwloc_obj_t obj)
  374. {
  375. STARPU_ASSERT(obj);
  376. hwloc_obj_t current = obj;
  377. while (current->depth != HWLOC_OBJ_NODE)
  378. {
  379. current = current->parent;
  380. /* If we don't find a "node" obj before the root, this means
  381. * hwloc does not know whether there are numa nodes or not, so
  382. * we should not use a per-node sampling in that case. */
  383. STARPU_ASSERT(current);
  384. }
  385. STARPU_ASSERT(current->depth == HWLOC_OBJ_NODE);
  386. return current->logical_index;
  387. }
  388. #endif
/* Measure host<->device bandwidth/latency for device `dev' (of kind `type',
 * "CUDA" or "OpenCL") from every CPU, filling dev_timing_per_cpu.
 * With hwloc, measurements are taken once per NUMA node and reused for all
 * CPUs of the same node; without NUMA information, every (CPU, GPU) pair
 * is measured individually. */
static void measure_bandwidth_between_cpus_and_dev(int dev, struct dev_timing *dev_timing_per_cpu, char *type)
{
	/* Either we have hwloc and we measure the bandwidth between each GPU
	 * and each NUMA node, or we don't have such NUMA information and we
	 * measure the bandwidth for each pair of (CPU, GPU), which is slower.
	 * */
#ifdef STARPU_HAVE_HWLOC
	int cpu_depth = hwloc_get_type_depth(hwtopology, HWLOC_OBJ_CORE);
	/* NOTE(review): HWLOC_OBJ_NODE is an object-*type* constant being
	 * passed to a by-*depth* lookup; hwloc_get_nbobjs_by_type would be
	 * the type-safe call — verify against the hwloc version in use. */
	int nnuma_nodes = hwloc_get_nbobjs_by_depth(hwtopology, HWLOC_OBJ_NODE);

	/* If no NUMA node was found, we assume that we have a single memory
	 * bank. */
	const unsigned no_node_obj_was_found = (nnuma_nodes == 0);

	/* Per-NUMA-node caches of the first measurement taken on that node. */
	unsigned *is_available_per_numa_node = NULL;
	double *dev_timing_htod_per_numa_node = NULL;
	double *dev_latency_htod_per_numa_node = NULL;
	double *dev_timing_dtoh_per_numa_node = NULL;
	double *dev_latency_dtoh_per_numa_node = NULL;

	if (!no_node_obj_was_found)
	{
		is_available_per_numa_node = (unsigned *)malloc(nnuma_nodes * sizeof(unsigned));
		STARPU_ASSERT(is_available_per_numa_node);

		dev_timing_htod_per_numa_node = (double *)malloc(nnuma_nodes * sizeof(double));
		STARPU_ASSERT(dev_timing_htod_per_numa_node);
		dev_latency_htod_per_numa_node = (double *)malloc(nnuma_nodes * sizeof(double));
		STARPU_ASSERT(dev_latency_htod_per_numa_node);

		dev_timing_dtoh_per_numa_node = (double *)malloc(nnuma_nodes * sizeof(double));
		STARPU_ASSERT(dev_timing_dtoh_per_numa_node);
		dev_latency_dtoh_per_numa_node = (double *)malloc(nnuma_nodes * sizeof(double));
		STARPU_ASSERT(dev_latency_dtoh_per_numa_node);

		memset(is_available_per_numa_node, 0, nnuma_nodes*sizeof(unsigned));
	}
#endif

	unsigned cpu;
	for (cpu = 0; cpu < ncpus; cpu++)
	{
		dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+cpu].cpu_id = cpu;

#ifdef STARPU_HAVE_HWLOC
		int numa_id = 0;

		if (!no_node_obj_was_found)
		{
			hwloc_obj_t obj = hwloc_get_obj_by_depth(hwtopology, cpu_depth, cpu);

			numa_id = find_numa_node(obj);

			if (is_available_per_numa_node[numa_id])
			{
				/* We reuse the previous numbers for that NUMA node */
				dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+cpu].timing_htod =
					dev_timing_htod_per_numa_node[numa_id];
				dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+cpu].latency_htod =
					dev_latency_htod_per_numa_node[numa_id];

				dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+cpu].timing_dtoh =
					dev_timing_dtoh_per_numa_node[numa_id];
				dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+cpu].latency_dtoh =
					dev_latency_dtoh_per_numa_node[numa_id];
				continue;
			}
		}
#endif

		/* Run the actual measurement for this (CPU, GPU) pair */
#ifdef STARPU_USE_CUDA
		if (strncmp(type, "CUDA", 4) == 0)
			measure_bandwidth_between_host_and_dev_on_cpu_with_cuda(dev, cpu, dev_timing_per_cpu);
#endif
#ifdef STARPU_USE_OPENCL
		if (strncmp(type, "OpenCL", 6) == 0)
			measure_bandwidth_between_host_and_dev_on_cpu_with_opencl(dev, cpu, dev_timing_per_cpu);
#endif

#ifdef STARPU_HAVE_HWLOC
		if (!no_node_obj_was_found && !is_available_per_numa_node[numa_id])
		{
			/* Save the results for that NUMA node */
			dev_timing_htod_per_numa_node[numa_id] =
				dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+cpu].timing_htod;
			dev_latency_htod_per_numa_node[numa_id] =
				dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+cpu].latency_htod;

			dev_timing_dtoh_per_numa_node[numa_id] =
				dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+cpu].timing_dtoh;
			dev_latency_dtoh_per_numa_node[numa_id] =
				dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+cpu].latency_dtoh;

			is_available_per_numa_node[numa_id] = 1;
		}
#endif
	}

#ifdef STARPU_HAVE_HWLOC
	if (!no_node_obj_was_found)
	{
		free(is_available_per_numa_node);

		free(dev_timing_htod_per_numa_node);
		free(dev_latency_htod_per_numa_node);

		free(dev_timing_dtoh_per_numa_node);
		free(dev_latency_dtoh_per_numa_node);
	}
#endif /* STARPU_HAVE_HWLOC */
}
/* Benchmark host<->device transfers for `dev' from every CPU, sort the
 * per-CPU results by increasing slowness (i.e. decreasing bandwidth), and
 * record the best measurement into the per-device output arrays (all
 * indexed with dev+1, consistently with the rest of this file). */
static void measure_bandwidth_between_host_and_dev(int dev, double *dev_timing_htod, double *dev_latency_htod,
						   double *dev_timing_dtoh, double *dev_latency_dtoh,
						   struct dev_timing *dev_timing_per_cpu, char *type)
{
	measure_bandwidth_between_cpus_and_dev(dev, dev_timing_per_cpu, type);

	/* sort the results */
	qsort(&(dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS]), ncpus,
	      sizeof(struct dev_timing),
	      compar_dev_timing);

#ifdef STARPU_VERBOSE
	unsigned cpu;
	for (cpu = 0; cpu < ncpus; cpu++)
	{
		unsigned current_cpu = dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+cpu].cpu_id;
		double bandwidth_dtoh = dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+cpu].timing_dtoh;
		double bandwidth_htod = dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+cpu].timing_htod;

		double bandwidth_sum2 = bandwidth_dtoh*bandwidth_dtoh + bandwidth_htod*bandwidth_htod;

		_STARPU_DISP("(%10s) BANDWIDTH GPU %d CPU %u - htod %f - dtoh %f - %f\n", type, dev, current_cpu, bandwidth_htod, bandwidth_dtoh, sqrt(bandwidth_sum2));
	}

	unsigned best_cpu = dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+0].cpu_id;

	_STARPU_DISP("(%10s) BANDWIDTH GPU %d BEST CPU %u\n", type, dev, best_cpu);
#endif

	/* The results are sorted in a decreasing order, so that the best
	 * measurement is currently the first entry. */
	dev_timing_dtoh[dev+1] = dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+0].timing_dtoh;
	dev_latency_dtoh[dev+1] = dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+0].latency_dtoh;
	dev_timing_htod[dev+1] = dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+0].timing_htod;
	dev_latency_htod[dev+1] = dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+0].latency_htod;
}
  510. #endif /* defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL) */
  511. static void benchmark_all_gpu_devices(void)
  512. {
  513. #ifdef STARPU_SIMGRID
  514. _STARPU_DISP("can not measure bus in simgrid mode, please run starpu_calibrate_bus in non-simgrid mode to make sure the bus performance model was calibrated\n");
  515. STARPU_ABORT();
  516. #else /* !SIMGRID */
  517. #if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
  518. int i;
  519. #endif
  520. #ifdef HAVE_CUDA_MEMCPY_PEER
  521. int j;
  522. #endif
  523. _STARPU_DEBUG("Benchmarking the speed of the bus\n");
  524. #ifdef STARPU_HAVE_HWLOC
  525. hwloc_topology_init(&hwtopology);
  526. hwloc_topology_load(hwtopology);
  527. #endif
  528. #ifdef STARPU_HAVE_HWLOC
  529. hwloc_bitmap_t former_cpuset = hwloc_bitmap_alloc();
  530. hwloc_get_cpubind(hwtopology, former_cpuset, HWLOC_CPUBIND_THREAD);
  531. #elif __linux__
  532. /* Save the current cpu binding */
  533. cpu_set_t former_process_affinity;
  534. int ret;
  535. ret = sched_getaffinity(0, sizeof(former_process_affinity), &former_process_affinity);
  536. if (ret)
  537. {
  538. perror("sched_getaffinity");
  539. STARPU_ABORT();
  540. }
  541. #else
  542. #warning Missing binding support, StarPU will not be able to properly benchmark NUMA topology
  543. #endif
  544. struct _starpu_machine_config *config = _starpu_get_machine_config();
  545. ncpus = _starpu_topology_get_nhwcpu(config);
  546. #ifdef STARPU_USE_CUDA
  547. ncuda = _starpu_get_cuda_device_count();
  548. for (i = 0; i < ncuda; i++)
  549. {
  550. _STARPU_DISP("CUDA %d...\n", i);
  551. /* measure bandwidth between Host and Device i */
  552. measure_bandwidth_between_host_and_dev(i, cudadev_timing_htod, cudadev_latency_htod, cudadev_timing_dtoh, cudadev_latency_dtoh, cudadev_timing_per_cpu, "CUDA");
  553. }
  554. #ifdef HAVE_CUDA_MEMCPY_PEER
  555. for (i = 0; i < ncuda; i++)
  556. for (j = 0; j < ncuda; j++)
  557. if (i != j)
  558. {
  559. _STARPU_DISP("CUDA %d -> %d...\n", i, j);
  560. /* measure bandwidth between Host and Device i */
  561. measure_bandwidth_between_dev_and_dev_cuda(i, j);
  562. }
  563. #endif
  564. #endif
  565. #ifdef STARPU_USE_OPENCL
  566. nopencl = _starpu_opencl_get_device_count();
  567. for (i = 0; i < nopencl; i++)
  568. {
  569. _STARPU_DISP("OpenCL %d...\n", i);
  570. /* measure bandwith between Host and Device i */
  571. measure_bandwidth_between_host_and_dev(i, opencldev_timing_htod, opencldev_latency_htod, opencldev_timing_dtoh, opencldev_latency_dtoh, opencldev_timing_per_cpu, "OpenCL");
  572. }
  573. #endif
  574. #ifdef STARPU_HAVE_HWLOC
  575. hwloc_set_cpubind(hwtopology, former_cpuset, HWLOC_CPUBIND_THREAD);
  576. #elif __linux__
  577. /* Restore the former affinity */
  578. ret = sched_setaffinity(0, sizeof(former_process_affinity), &former_process_affinity);
  579. if (ret)
  580. {
  581. perror("sched_setaffinity");
  582. STARPU_ABORT();
  583. }
  584. #endif
  585. #ifdef STARPU_HAVE_HWLOC
  586. hwloc_topology_destroy(hwtopology);
  587. #endif
  588. _STARPU_DEBUG("Benchmarking the speed of the bus is done.\n");
  589. was_benchmarked = 1;
  590. #endif /* !SIMGRID */
  591. }
  592. static void get_bus_path(const char *type, char *path, size_t maxlen)
  593. {
  594. _starpu_get_perf_model_dir_bus(path, maxlen);
  595. char hostname[32];
  596. char *forced_hostname = getenv("STARPU_HOSTNAME");
  597. if (forced_hostname && forced_hostname[0])
  598. snprintf(hostname, sizeof(hostname), "%s", forced_hostname);
  599. else
  600. gethostname(hostname, sizeof(hostname));
  601. strncat(path, hostname, maxlen);
  602. strncat(path, ".", maxlen);
  603. strncat(path, type, maxlen);
  604. }
  605. /*
  606. * Affinity
  607. */
  608. #ifndef STARPU_SIMGRID
/* Build the path of the GPU/CPU affinity sampling file into `path'. */
static void get_affinity_path(char *path, size_t maxlen)
{
	get_bus_path("affinity", path, maxlen);
}
/* Load the per-GPU preferred-CPU orderings from the affinity sampling file
 * into cuda_affinity_matrix / opencl_affinity_matrix.  The file is expected
 * to exist and be well-formed (one line per GPU: the GPU id followed by
 * ncpus tab-separated CPU ids); any deviation trips an assertion. */
static void load_bus_affinity_file_content(void)
{
#if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
	FILE *f;

	char path[256];
	get_affinity_path(path, 256);

	f = fopen(path, "r");
	STARPU_ASSERT(f);

	struct _starpu_machine_config *config = _starpu_get_machine_config();
	ncpus = _starpu_topology_get_nhwcpu(config);
	int gpu;

#ifdef STARPU_USE_CUDA
	ncuda = _starpu_get_cuda_device_count();
	for (gpu = 0; gpu < ncuda; gpu++)
	{
		int ret;

		int dummy;

		_starpu_drop_comments(f);
		/* each line starts with the GPU id itself, as a sanity check */
		ret = fscanf(f, "%d\t", &dummy);
		STARPU_ASSERT(ret == 1);
		STARPU_ASSERT(dummy == gpu);

		unsigned cpu;
		for (cpu = 0; cpu < ncpus; cpu++)
		{
			ret = fscanf(f, "%d\t", &cuda_affinity_matrix[gpu][cpu]);
			STARPU_ASSERT(ret == 1);
		}

		ret = fscanf(f, "\n");
		STARPU_ASSERT(ret == 0);
	}
#endif /* STARPU_USE_CUDA */

#ifdef STARPU_USE_OPENCL
	nopencl = _starpu_opencl_get_device_count();
	for (gpu = 0; gpu < nopencl; gpu++)
	{
		int ret;

		int dummy;

		_starpu_drop_comments(f);
		/* each line starts with the GPU id itself, as a sanity check */
		ret = fscanf(f, "%d\t", &dummy);
		STARPU_ASSERT(ret == 1);
		STARPU_ASSERT(dummy == gpu);

		unsigned cpu;
		for (cpu = 0; cpu < ncpus; cpu++)
		{
			ret = fscanf(f, "%d\t", &opencl_affinity_matrix[gpu][cpu]);
			STARPU_ASSERT(ret == 1);
		}

		ret = fscanf(f, "\n");
		STARPU_ASSERT(ret == 0);
	}
#endif /* STARPU_USE_OPENCL */

	fclose(f);
#endif /* STARPU_USE_CUDA || STARPU_USE_OPENCL */
}
/* Dump the benchmarked CPU preference order of each GPU to the affinity
 * sampling file, one row per GPU (CUDA rows first, then OpenCL), in the
 * format parsed back by load_bus_affinity_file_content(). */
static void write_bus_affinity_file_content(void)
{
	STARPU_ASSERT(was_benchmarked);
#if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
	FILE *f;
	char path[256];
	get_affinity_path(path, 256);
	f = fopen(path, "w+");
	if (!f)
	{
		perror("fopen write_buf_affinity_file_content");
		_STARPU_DISP("path '%s'\n", path);
		fflush(stderr);
		STARPU_ABORT();
	}
	unsigned cpu;
	int gpu;
	fprintf(f, "# GPU\t");
	for (cpu = 0; cpu < ncpus; cpu++)
		fprintf(f, "CPU%u\t", cpu);
	fprintf(f, "\n");
#ifdef STARPU_USE_CUDA
	for (gpu = 0; gpu < ncuda; gpu++)
	{
		fprintf(f, "%d\t", gpu);
		for (cpu = 0; cpu < ncpus; cpu++)
		{
			/* NOTE(review): timing rows are indexed from 1 — slot 0
			 * appears reserved for the host (the print routines index
			 * by src starting at 1 as well); confirm against the
			 * benchmark code that fills cudadev_timing_per_cpu. */
			fprintf(f, "%d\t", cudadev_timing_per_cpu[(gpu+1)*STARPU_MAXCPUS+cpu].cpu_id);
		}
		fprintf(f, "\n");
	}
#endif
#ifdef STARPU_USE_OPENCL
	for (gpu = 0; gpu < nopencl; gpu++)
	{
		fprintf(f, "%d\t", gpu);
		for (cpu = 0; cpu < ncpus; cpu++)
		{
			fprintf(f, "%d\t", opencldev_timing_per_cpu[(gpu+1)*STARPU_MAXCPUS+cpu].cpu_id);
		}
		fprintf(f, "\n");
	}
#endif
	fclose(f);
#endif
}
  713. static void generate_bus_affinity_file(void)
  714. {
  715. if (!was_benchmarked)
  716. benchmark_all_gpu_devices();
  717. write_bus_affinity_file_content();
  718. }
  719. static void load_bus_affinity_file(void)
  720. {
  721. int res;
  722. char path[256];
  723. get_affinity_path(path, 256);
  724. res = access(path, F_OK);
  725. if (res)
  726. {
  727. /* File does not exist yet */
  728. generate_bus_affinity_file();
  729. }
  730. load_bus_affinity_file_content();
  731. }
  732. #endif /* !SIMGRID */
  733. #ifdef STARPU_USE_CUDA
/* Return the CPU preference order (logical indices) for CUDA device gpuid. */
int *_starpu_get_cuda_affinity_vector(unsigned gpuid)
{
	return cuda_affinity_matrix[gpuid];
}
  738. #endif /* STARPU_USE_CUDA */
  739. #ifdef STARPU_USE_OPENCL
/* Return the CPU preference order (logical indices) for OpenCL device gpuid. */
int *_starpu_get_opencl_affinity_vector(unsigned gpuid)
{
	return opencl_affinity_matrix[gpuid];
}
  744. #endif /* STARPU_USE_OPENCL */
/* Print the per-GPU CPU preference order (one row per GPU, CUDA first then
 * OpenCL) to stream f, in human-readable form. */
void starpu_bus_print_affinity(FILE *f)
{
#if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
	unsigned cpu;
	int gpu;
#endif
	fprintf(f, "# GPU\tCPU in preference order (logical index)\n");
#ifdef STARPU_USE_CUDA
	fprintf(f, "# CUDA\n");
	for(gpu = 0 ; gpu<ncuda ; gpu++)
	{
		fprintf(f, "%d\t", gpu);
		for (cpu = 0; cpu < ncpus; cpu++)
		{
			fprintf(f, "%d\t", cuda_affinity_matrix[gpu][cpu]);
		}
		fprintf(f, "\n");
	}
#endif
#ifdef STARPU_USE_OPENCL
	fprintf(f, "# OpenCL\n");
	for(gpu = 0 ; gpu<nopencl ; gpu++)
	{
		fprintf(f, "%d\t", gpu);
		for (cpu = 0; cpu < ncpus; cpu++)
		{
			fprintf(f, "%d\t", opencl_affinity_matrix[gpu][cpu]);
		}
		fprintf(f, "\n");
	}
#endif
}
  777. /*
  778. * Latency
  779. */
/* Build the path of the per-host latency sampling file. */
static void get_latency_path(char *path, size_t maxlen)
{
	get_bus_path("latency", path, maxlen);
}
  784. static int load_bus_latency_file_content(void)
  785. {
  786. int n;
  787. unsigned src, dst;
  788. FILE *f;
  789. char path[256];
  790. get_latency_path(path, 256);
  791. f = fopen(path, "r");
  792. STARPU_ASSERT(f);
  793. for (src = 0; src < STARPU_MAXNODES; src++)
  794. {
  795. _starpu_drop_comments(f);
  796. for (dst = 0; dst < STARPU_MAXNODES; dst++)
  797. {
  798. double latency;
  799. n = fscanf(f, "%lf", &latency);
  800. if (n != 1)
  801. {
  802. fclose(f);
  803. return 0;
  804. }
  805. n = getc(f);
  806. if (n != '\t')
  807. {
  808. fclose(f);
  809. return 0;
  810. }
  811. latency_matrix[src][dst] = latency;
  812. }
  813. n = getc(f);
  814. if (n != '\n')
  815. {
  816. fclose(f);
  817. return 0;
  818. }
  819. }
  820. fclose(f);
  821. return 1;
  822. }
  823. static void write_bus_latency_file_content(void)
  824. {
  825. int src, dst, maxnode;
  826. FILE *f;
  827. STARPU_ASSERT(was_benchmarked);
  828. char path[256];
  829. get_latency_path(path, 256);
  830. f = fopen(path, "w+");
  831. if (!f)
  832. {
  833. perror("fopen write_bus_latency_file_content");
  834. _STARPU_DISP("path '%s'\n", path);
  835. fflush(stderr);
  836. STARPU_ABORT();
  837. }
  838. fprintf(f, "# ");
  839. for (dst = 0; dst < STARPU_MAXNODES; dst++)
  840. fprintf(f, "to %d\t\t", dst);
  841. fprintf(f, "\n");
  842. maxnode = ncuda;
  843. #ifdef STARPU_USE_OPENCL
  844. maxnode += nopencl;
  845. #endif
  846. for (src = 0; src < STARPU_MAXNODES; src++)
  847. {
  848. for (dst = 0; dst < STARPU_MAXNODES; dst++)
  849. {
  850. double latency = 0.0;
  851. if ((src > maxnode) || (dst > maxnode))
  852. {
  853. /* convention */
  854. latency = NAN;
  855. }
  856. else if (src == dst)
  857. {
  858. latency = 0.0;
  859. }
  860. else
  861. {
  862. /* µs */
  863. #ifdef STARPU_USE_CUDA
  864. #ifdef HAVE_CUDA_MEMCPY_PEER
  865. if (src && src < ncuda && dst && dst <= ncuda)
  866. latency = cudadev_latency_dtod[src][dst];
  867. else
  868. #endif
  869. {
  870. if (src && src <= ncuda)
  871. latency += cudadev_latency_dtoh[src];
  872. if (dst && dst <= ncuda)
  873. latency += cudadev_latency_htod[dst];
  874. }
  875. #endif
  876. #ifdef STARPU_USE_OPENCL
  877. if (src > ncuda)
  878. latency += opencldev_latency_dtoh[src-ncuda];
  879. if (dst > ncuda)
  880. latency += opencldev_latency_htod[dst-ncuda];
  881. #endif
  882. }
  883. fprintf(f, "%f\t", latency);
  884. }
  885. fprintf(f, "\n");
  886. }
  887. fclose(f);
  888. }
  889. static void generate_bus_latency_file(void)
  890. {
  891. if (!was_benchmarked)
  892. benchmark_all_gpu_devices();
  893. write_bus_latency_file_content();
  894. }
  895. static void load_bus_latency_file(void)
  896. {
  897. int res;
  898. char path[256];
  899. get_latency_path(path, 256);
  900. res = access(path, F_OK);
  901. if (res || !load_bus_latency_file_content())
  902. {
  903. /* File does not exist yet or is bogus */
  904. generate_bus_latency_file();
  905. }
  906. }
  907. /*
  908. * Bandwidth
  909. */
/* Build the path of the per-host bandwidth sampling file. */
static void get_bandwidth_path(char *path, size_t maxlen)
{
	get_bus_path("bandwidth", path, maxlen);
}
  914. static int load_bus_bandwidth_file_content(void)
  915. {
  916. int n;
  917. unsigned src, dst;
  918. FILE *f;
  919. char path[256];
  920. get_bandwidth_path(path, 256);
  921. f = fopen(path, "r");
  922. if (!f)
  923. {
  924. perror("fopen load_bus_bandwidth_file_content");
  925. _STARPU_DISP("path '%s'\n", path);
  926. fflush(stderr);
  927. STARPU_ABORT();
  928. }
  929. for (src = 0; src < STARPU_MAXNODES; src++)
  930. {
  931. _starpu_drop_comments(f);
  932. for (dst = 0; dst < STARPU_MAXNODES; dst++)
  933. {
  934. double bandwidth;
  935. n = fscanf(f, "%lf", &bandwidth);
  936. if (n != 1)
  937. {
  938. _STARPU_DISP("Error while reading sampling file <%s>. Expected a number\n", path);
  939. fclose(f);
  940. return 0;
  941. }
  942. n = getc(f);
  943. if (n != '\t')
  944. {
  945. fclose(f);
  946. return 0;
  947. }
  948. bandwidth_matrix[src][dst] = bandwidth;
  949. }
  950. n = getc(f);
  951. if (n != '\n')
  952. {
  953. fclose(f);
  954. return 0;
  955. }
  956. }
  957. fclose(f);
  958. return 1;
  959. }
/* Dump the STARPU_MAXNODES x STARPU_MAXNODES bandwidth matrix computed from
 * the benchmark timings.  Bandwidth is derived as the inverse of the total
 * transfer "slowness" (time per byte); absent nodes get NAN, the diagonal
 * gets 0.0 by convention. */
static void write_bus_bandwidth_file_content(void)
{
	int src, dst, maxnode;
	FILE *f;
	STARPU_ASSERT(was_benchmarked);
	char path[256];
	get_bandwidth_path(path, 256);
	f = fopen(path, "w+");
	STARPU_ASSERT(f);
	fprintf(f, "# ");
	for (dst = 0; dst < STARPU_MAXNODES; dst++)
		fprintf(f, "to %d\t\t", dst);
	fprintf(f, "\n");
	maxnode = ncuda;
#ifdef STARPU_USE_OPENCL
	maxnode += nopencl;
#endif
	for (src = 0; src < STARPU_MAXNODES; src++)
	{
		for (dst = 0; dst < STARPU_MAXNODES; dst++)
		{
			double bandwidth;
			if ((src > maxnode) || (dst > maxnode))
			{
				/* No such node present: NAN by convention. */
				bandwidth = NAN;
			}
#if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
			else if (src != dst)
			{
				double slowness = 0.0;
				/* Total bandwidth is the harmonic mean of bandwidths */
#ifdef STARPU_USE_CUDA
#ifdef HAVE_CUDA_MEMCPY_PEER
				if (src && src <= ncuda && dst && dst <= ncuda)
					/* Direct GPU-GPU transfert */
					slowness = cudadev_timing_dtod[src][dst];
				else
#endif
				{
					/* Through the host: add both legs' time-per-byte. */
					if (src && src <= ncuda)
						slowness += cudadev_timing_dtoh[src];
					if (dst && dst <= ncuda)
						slowness += cudadev_timing_htod[dst];
				}
#endif
#ifdef STARPU_USE_OPENCL
				if (src > ncuda)
					slowness += opencldev_timing_dtoh[src-ncuda];
				if (dst > ncuda)
					slowness += opencldev_timing_htod[dst-ncuda];
#endif
				bandwidth = 1.0/slowness;
			}
#endif
			else
			{
				/* convention */
				bandwidth = 0.0;
			}
			fprintf(f, "%f\t", bandwidth);
		}
		fprintf(f, "\n");
	}
	fclose(f);
}
  1025. void starpu_bus_print_bandwidth(FILE *f)
  1026. {
  1027. int src, dst, maxnode;
  1028. maxnode = ncuda;
  1029. #ifdef STARPU_USE_OPENCL
  1030. maxnode += nopencl;
  1031. #endif
  1032. fprintf(f, "from/to\t");
  1033. fprintf(f, "RAM\t");
  1034. for (dst = 0; dst < ncuda; dst++)
  1035. fprintf(f, "CUDA %d\t", dst);
  1036. for (dst = 0; dst < nopencl; dst++)
  1037. fprintf(f, "OpenCL%d\t", dst);
  1038. fprintf(f, "\n");
  1039. for (src = 0; src <= maxnode; src++)
  1040. {
  1041. if (!src)
  1042. fprintf(f, "RAM\t");
  1043. else if (src <= ncuda)
  1044. fprintf(f, "CUDA %d\t", src-1);
  1045. else
  1046. fprintf(f, "OpenCL%d\t", src-ncuda-1);
  1047. for (dst = 0; dst <= maxnode; dst++)
  1048. fprintf(f, "%.0f\t", bandwidth_matrix[src][dst]);
  1049. fprintf(f, "\n");
  1050. }
  1051. fprintf(f, "\n");
  1052. for (src = 0; src <= maxnode; src++)
  1053. {
  1054. if (!src)
  1055. fprintf(f, "RAM\t");
  1056. else if (src <= ncuda)
  1057. fprintf(f, "CUDA %d\t", src-1);
  1058. else
  1059. fprintf(f, "OpenCL%d\t", src-ncuda-1);
  1060. for (dst = 0; dst <= maxnode; dst++)
  1061. fprintf(f, "%.0f\t", latency_matrix[src][dst]);
  1062. fprintf(f, "\n");
  1063. }
  1064. #if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
  1065. if (ncuda != 0 || nopencl != 0)
  1066. fprintf(f, "\nGPU\tCPU in preference order (logical index), host-to-device, device-to-host\n");
  1067. for (src = 1; src <= maxnode; src++)
  1068. {
  1069. struct dev_timing *timing;
  1070. struct _starpu_machine_config *config = _starpu_get_machine_config();
  1071. int ncpus = _starpu_topology_get_nhwcpu(config);
  1072. int cpu;
  1073. #ifdef STARPU_USE_CUDA
  1074. if (src <= ncuda)
  1075. {
  1076. fprintf(f, "CUDA %d\t", src-1);
  1077. for (cpu = 0; cpu < ncpus; cpu++)
  1078. {
  1079. timing = &cudadev_timing_per_cpu[src*STARPU_MAXCPUS+cpu];
  1080. if (timing->timing_htod)
  1081. fprintf(f, "%2d %.0f %.0f\t", timing->cpu_id, 1/timing->timing_htod, 1/timing->timing_dtoh);
  1082. else
  1083. fprintf(f, "%2d\t", cuda_affinity_matrix[src-1][cpu]);
  1084. }
  1085. }
  1086. #ifdef STARPU_USE_OPENCL
  1087. else
  1088. #endif
  1089. #endif
  1090. #ifdef STARPU_USE_OPENCL
  1091. {
  1092. fprintf(f, "OpenCL%d\t", src-ncuda-1);
  1093. for (cpu = 0; cpu < ncpus; cpu++)
  1094. {
  1095. timing = &opencldev_timing_per_cpu[(src-ncuda)*STARPU_MAXCPUS+cpu];
  1096. if (timing->timing_htod)
  1097. fprintf(f, "%2d %.0f %.0f\t", timing->cpu_id, 1/timing->timing_htod, 1/timing->timing_dtoh);
  1098. else
  1099. fprintf(f, "%2d\t", opencl_affinity_matrix[src-1][cpu]);
  1100. }
  1101. }
  1102. #endif
  1103. fprintf(f, "\n");
  1104. }
  1105. #endif
  1106. }
  1107. static void generate_bus_bandwidth_file(void)
  1108. {
  1109. if (!was_benchmarked)
  1110. benchmark_all_gpu_devices();
  1111. write_bus_bandwidth_file_content();
  1112. }
  1113. static void load_bus_bandwidth_file(void)
  1114. {
  1115. int res;
  1116. char path[256];
  1117. get_bandwidth_path(path, 256);
  1118. res = access(path, F_OK);
  1119. if (res || !load_bus_bandwidth_file_content())
  1120. {
  1121. /* File does not exist yet or is bogus */
  1122. generate_bus_bandwidth_file();
  1123. }
  1124. }
  1125. #ifndef STARPU_SIMGRID
  1126. /*
  1127. * Config
  1128. */
/* Build the path of the per-host bus configuration file. */
static void get_config_path(char *path, size_t maxlen)
{
	get_bus_path("config", path, maxlen);
}
/* Decide whether the stored bus performance model is still valid.
 * Recalibrate (starpu_force_bus_sampling) when: the config file is missing,
 * recalibration was explicitly requested (bus_calibrate), or the stored
 * CPU/CUDA/OpenCL device counts differ from the current machine. */
static void check_bus_config_file(void)
{
	int res;
	char path[256];
	struct _starpu_machine_config *config = _starpu_get_machine_config();
	get_config_path(path, 256);
	res = access(path, F_OK);
	if (res || config->conf->bus_calibrate > 0)
	{
		if (res)
			_STARPU_DISP("No performance model for the bus, calibrating...\n");
		starpu_force_bus_sampling();
		if (res)
			_STARPU_DISP("... done\n");
	}
	else
	{
		FILE *f;
		/* -1 sentinels: guaranteed to mismatch any real count if a
		 * field is somehow left unread (fscanf results are asserted
		 * below anyway).  On unsigned, -1 wraps to UINT_MAX. */
		int ret, read_cuda = -1, read_opencl = -1;
		unsigned read_cpus = -1;
		// Loading configuration from file
		f = fopen(path, "r");
		STARPU_ASSERT(f);
		_starpu_drop_comments(f);
		ret = fscanf(f, "%u\t", &read_cpus);
		STARPU_ASSERT(ret == 1);
		_starpu_drop_comments(f);
		ret = fscanf(f, "%d\t", &read_cuda);
		STARPU_ASSERT(ret == 1);
		_starpu_drop_comments(f);
		ret = fscanf(f, "%d\t", &read_opencl);
		STARPU_ASSERT(ret == 1);
		_starpu_drop_comments(f);
		fclose(f);
		// Loading current configuration
		ncpus = _starpu_topology_get_nhwcpu(config);
#ifdef STARPU_USE_CUDA
		ncuda = _starpu_get_cuda_device_count();
#endif
#ifdef STARPU_USE_OPENCL
		nopencl = _starpu_opencl_get_device_count();
#endif
		// Checking if both configurations match
		if (read_cpus != ncpus)
		{
			_STARPU_DISP("Current configuration does not match the bus performance model (CPUS: (stored) %u != (current) %u), recalibrating...\n", read_cpus, ncpus);
			starpu_force_bus_sampling();
			_STARPU_DISP("... done\n");
		}
		else if (read_cuda != ncuda)
		{
			_STARPU_DISP("Current configuration does not match the bus performance model (CUDA: (stored) %d != (current) %d), recalibrating...\n", read_cuda, ncuda);
			starpu_force_bus_sampling();
			_STARPU_DISP("... done\n");
		}
		else if (read_opencl != nopencl)
		{
			_STARPU_DISP("Current configuration does not match the bus performance model (OpenCL: (stored) %d != (current) %d), recalibrating...\n", read_opencl, nopencl);
			starpu_force_bus_sampling();
			_STARPU_DISP("... done\n");
		}
	}
}
  1196. static void write_bus_config_file_content(void)
  1197. {
  1198. FILE *f;
  1199. char path[256];
  1200. STARPU_ASSERT(was_benchmarked);
  1201. get_config_path(path, 256);
  1202. f = fopen(path, "w+");
  1203. STARPU_ASSERT(f);
  1204. fprintf(f, "# Current configuration\n");
  1205. fprintf(f, "%u # Number of CPUs\n", ncpus);
  1206. fprintf(f, "%d # Number of CUDA devices\n", ncuda);
  1207. fprintf(f, "%d # Number of OpenCL devices\n", nopencl);
  1208. fclose(f);
  1209. }
  1210. static void generate_bus_config_file(void)
  1211. {
  1212. if (!was_benchmarked)
  1213. benchmark_all_gpu_devices();
  1214. write_bus_config_file_content();
  1215. }
  1216. /*
  1217. * Generic
  1218. */
/* Force a full (re)calibration of the bus: run the benchmarks and rewrite
 * all four sampling files (affinity, latency, bandwidth, config).  The
 * benchmarks themselves run at most once, guarded by was_benchmarked. */
static void starpu_force_bus_sampling(void)
{
	_STARPU_DEBUG("Force bus sampling ...\n");
	_starpu_create_sampling_directory_if_needed();
	generate_bus_affinity_file();
	generate_bus_latency_file();
	generate_bus_bandwidth_file();
	generate_bus_config_file();
}
  1228. #endif /* !SIMGRID */
/* Entry point: load (generating/recalibrating if necessary) the bus
 * performance model.  Under SimGrid the config/affinity steps are skipped:
 * only the latency and bandwidth matrices are needed for simulation. */
void _starpu_load_bus_performance_files(void)
{
	_starpu_create_sampling_directory_if_needed();
#ifndef STARPU_SIMGRID
	check_bus_config_file();
	load_bus_affinity_file();
#endif
	load_bus_latency_file();
	load_bus_bandwidth_file();
}
  1239. /* (in MB/s) */
/* Return the measured bandwidth (in MB/s) between two memory nodes. */
double _starpu_transfer_bandwidth(unsigned src_node, unsigned dst_node)
{
	return bandwidth_matrix[src_node][dst_node];
}
  1244. /* (in µs) */
/* Return the measured latency (in µs) between two memory nodes. */
double _starpu_transfer_latency(unsigned src_node, unsigned dst_node)
{
	return latency_matrix[src_node][dst_node];
}
  1249. /* (in µs) */
/* Predict the time (in µs) to transfer `size' bytes from src_node to
 * dst_node: measured latency plus size/bandwidth, scaled by
 * 2*(number of GPUs).  NOTE(review): the scaling looks like a crude bus
 * contention penalty (all GPUs potentially sharing the bus in both
 * directions) — confirm the intended model before changing it. */
double _starpu_predict_transfer_time(unsigned src_node, unsigned dst_node, size_t size)
{
	double bandwidth = bandwidth_matrix[src_node][dst_node];
	double latency = latency_matrix[src_node][dst_node];
	struct starpu_machine_topology *topology = &_starpu_get_machine_config()->topology;
	return latency + (size/bandwidth)*2*(topology->ncudagpus+topology->nopenclgpus);
}