perfmodel_bus.c 50 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920
  1. /* StarPU --- Runtime system for heterogeneous multicore architectures.
  2. *
  3. * Copyright (C) 2009-2013 Université de Bordeaux 1
  4. * Copyright (C) 2010, 2011, 2012, 2013 Centre National de la Recherche Scientifique
  5. * Copyright (C) 2013 Corentin Salingue
  6. *
  7. * StarPU is free software; you can redistribute it and/or modify
  8. * it under the terms of the GNU Lesser General Public License as published by
  9. * the Free Software Foundation; either version 2.1 of the License, or (at
  10. * your option) any later version.
  11. *
  12. * StarPU is distributed in the hope that it will be useful, but
  13. * WITHOUT ANY WARRANTY; without even the implied warranty of
  14. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  15. *
  16. * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  17. */
  18. #ifdef STARPU_USE_CUDA
  19. #ifndef _GNU_SOURCE
  20. #define _GNU_SOURCE
  21. #endif
  22. #include <sched.h>
  23. #endif
  24. #include <unistd.h>
  25. #include <sys/time.h>
  26. #include <stdlib.h>
  27. #include <math.h>
  28. #include <starpu.h>
  29. #include <starpu_cuda.h>
  30. #include <starpu_opencl.h>
  31. #include <common/config.h>
  32. #include <core/workers.h>
  33. #include <core/perfmodel/perfmodel.h>
  34. #include <core/simgrid.h>
  35. #include <common/utils.h>
  36. #ifdef STARPU_USE_OPENCL
  37. #include <starpu_opencl.h>
  38. #endif
  39. #ifdef STARPU_HAVE_WINDOWS
  40. #include <windows.h>
  41. #endif
  42. #define SIZE (32*1024*1024*sizeof(char))
  43. #define NITER 128
  44. #ifndef STARPU_SIMGRID
  45. static void _starpu_bus_force_sampling(void);
  46. #endif
  47. /* timing is in µs per byte (i.e. slowness, inverse of bandwidth) */
  48. struct dev_timing
  49. {
  50. int cpu_id;
  51. double timing_htod;
  52. double latency_htod;
  53. double timing_dtoh;
  54. double latency_dtoh;
  55. };
  56. /* TODO: measure latency */
  57. static double bandwidth_matrix[STARPU_MAXNODES][STARPU_MAXNODES];
  58. static double latency_matrix[STARPU_MAXNODES][STARPU_MAXNODES];
  59. static unsigned was_benchmarked = 0;
  60. static unsigned ncpus = 0;
  61. static unsigned ncuda = 0;
  62. static unsigned nopencl = 0;
  63. /* Benchmarking the performance of the bus */
  64. #ifdef STARPU_USE_CUDA
  65. static int cuda_affinity_matrix[STARPU_MAXCUDADEVS][STARPU_MAXCPUS];
  66. static double cudadev_timing_htod[STARPU_MAXNODES] = {0.0};
  67. static double cudadev_latency_htod[STARPU_MAXNODES] = {0.0};
  68. static double cudadev_timing_dtoh[STARPU_MAXNODES] = {0.0};
  69. static double cudadev_latency_dtoh[STARPU_MAXNODES] = {0.0};
  70. #ifdef HAVE_CUDA_MEMCPY_PEER
  71. static double cudadev_timing_dtod[STARPU_MAXNODES][STARPU_MAXNODES] = {{0.0}};
  72. static double cudadev_latency_dtod[STARPU_MAXNODES][STARPU_MAXNODES] = {{0.0}};
  73. #endif
  74. static struct dev_timing cudadev_timing_per_cpu[STARPU_MAXNODES*STARPU_MAXCPUS];
  75. #endif
  76. #ifdef STARPU_USE_OPENCL
  77. static int opencl_affinity_matrix[STARPU_MAXOPENCLDEVS][STARPU_MAXCPUS];
  78. static double opencldev_timing_htod[STARPU_MAXNODES] = {0.0};
  79. static double opencldev_latency_htod[STARPU_MAXNODES] = {0.0};
  80. static double opencldev_timing_dtoh[STARPU_MAXNODES] = {0.0};
  81. static double opencldev_latency_dtoh[STARPU_MAXNODES] = {0.0};
  82. static struct dev_timing opencldev_timing_per_cpu[STARPU_MAXNODES*STARPU_MAXCPUS];
  83. #endif
  84. #ifdef STARPU_HAVE_HWLOC
  85. static hwloc_topology_t hwtopology;
  86. #endif
  87. #if (defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)) && !defined(STARPU_SIMGRID)
  88. #ifdef STARPU_USE_CUDA
  89. static void measure_bandwidth_between_host_and_dev_on_cpu_with_cuda(int dev, int cpu, struct dev_timing *dev_timing_per_cpu)
  90. {
  91. struct _starpu_machine_config *config = _starpu_get_machine_config();
  92. _starpu_bind_thread_on_cpu(config, cpu);
  93. size_t size = SIZE;
  94. /* Initialize CUDA context on the device */
  95. /* We do not need to enable OpenGL interoperability at this point,
  96. * since we cleanly shutdown CUDA before returning. */
  97. cudaSetDevice(dev);
  98. /* hack to avoid third party libs to rebind threads */
  99. _starpu_bind_thread_on_cpu(config, cpu);
  100. /* hack to force the initialization */
  101. cudaFree(0);
  102. /* hack to avoid third party libs to rebind threads */
  103. _starpu_bind_thread_on_cpu(config, cpu);
  104. /* Get the maximum size which can be allocated on the device */
  105. struct cudaDeviceProp prop;
  106. cudaError_t cures;
  107. cures = cudaGetDeviceProperties(&prop, dev);
  108. if (STARPU_UNLIKELY(cures)) STARPU_CUDA_REPORT_ERROR(cures);
  109. if (size > prop.totalGlobalMem/4) size = prop.totalGlobalMem/4;
  110. /* Allocate a buffer on the device */
  111. unsigned char *d_buffer;
  112. cures = cudaMalloc((void **)&d_buffer, size);
  113. STARPU_ASSERT(cures == cudaSuccess);
  114. /* hack to avoid third party libs to rebind threads */
  115. _starpu_bind_thread_on_cpu(config, cpu);
  116. /* Allocate a buffer on the host */
  117. unsigned char *h_buffer;
  118. cures = cudaHostAlloc((void **)&h_buffer, size, 0);
  119. STARPU_ASSERT(cures == cudaSuccess);
  120. /* hack to avoid third party libs to rebind threads */
  121. _starpu_bind_thread_on_cpu(config, cpu);
  122. /* Fill them */
  123. memset(h_buffer, 0, size);
  124. cudaMemset(d_buffer, 0, size);
  125. /* hack to avoid third party libs to rebind threads */
  126. _starpu_bind_thread_on_cpu(config, cpu);
  127. unsigned iter;
  128. double timing;
  129. struct timeval start;
  130. struct timeval end;
  131. /* Measure upload bandwidth */
  132. gettimeofday(&start, NULL);
  133. for (iter = 0; iter < NITER; iter++)
  134. {
  135. cudaMemcpy(d_buffer, h_buffer, size, cudaMemcpyHostToDevice);
  136. cudaThreadSynchronize();
  137. }
  138. gettimeofday(&end, NULL);
  139. timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
  140. dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+cpu].timing_htod = timing/NITER/size;
  141. /* Measure download bandwidth */
  142. gettimeofday(&start, NULL);
  143. for (iter = 0; iter < NITER; iter++)
  144. {
  145. cudaMemcpy(h_buffer, d_buffer, size, cudaMemcpyDeviceToHost);
  146. cudaThreadSynchronize();
  147. }
  148. gettimeofday(&end, NULL);
  149. timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
  150. dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+cpu].timing_dtoh = timing/NITER/size;
  151. /* Measure upload latency */
  152. gettimeofday(&start, NULL);
  153. for (iter = 0; iter < NITER; iter++)
  154. {
  155. cudaMemcpy(d_buffer, h_buffer, 1, cudaMemcpyHostToDevice);
  156. cudaThreadSynchronize();
  157. }
  158. gettimeofday(&end, NULL);
  159. timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
  160. dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+cpu].latency_htod = timing/NITER;
  161. /* Measure download latency */
  162. gettimeofday(&start, NULL);
  163. for (iter = 0; iter < NITER; iter++)
  164. {
  165. cudaMemcpy(d_buffer, h_buffer, 1, cudaMemcpyHostToDevice);
  166. cudaThreadSynchronize();
  167. }
  168. gettimeofday(&end, NULL);
  169. timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
  170. dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+cpu].latency_dtoh = timing/NITER;
  171. /* Free buffers */
  172. cudaFreeHost(h_buffer);
  173. cudaFree(d_buffer);
  174. cudaThreadExit();
  175. }
  176. #ifdef HAVE_CUDA_MEMCPY_PEER
  177. static void measure_bandwidth_between_dev_and_dev_cuda(int src, int dst)
  178. {
  179. size_t size = SIZE;
  180. int can;
  181. /* Get the maximum size which can be allocated on the device */
  182. struct cudaDeviceProp prop;
  183. cudaError_t cures;
  184. cures = cudaGetDeviceProperties(&prop, src);
  185. if (STARPU_UNLIKELY(cures)) STARPU_CUDA_REPORT_ERROR(cures);
  186. if (size > prop.totalGlobalMem/4) size = prop.totalGlobalMem/4;
  187. cures = cudaGetDeviceProperties(&prop, dst);
  188. if (STARPU_UNLIKELY(cures)) STARPU_CUDA_REPORT_ERROR(cures);
  189. if (size > prop.totalGlobalMem/4) size = prop.totalGlobalMem/4;
  190. /* Initialize CUDA context on the source */
  191. /* We do not need to enable OpenGL interoperability at this point,
  192. * since we cleanly shutdown CUDA before returning. */
  193. cudaSetDevice(src);
  194. if (starpu_get_env_number("STARPU_DISABLE_CUDA_GPU_GPU_DIRECT") <= 0)
  195. {
  196. cures = cudaDeviceCanAccessPeer(&can, src, dst);
  197. if (!cures && can)
  198. {
  199. cures = cudaDeviceEnablePeerAccess(dst, 0);
  200. if (!cures)
  201. _STARPU_DISP("GPU-Direct %d -> %d\n", dst, src);
  202. }
  203. }
  204. /* Allocate a buffer on the device */
  205. unsigned char *s_buffer;
  206. cures = cudaMalloc((void **)&s_buffer, size);
  207. STARPU_ASSERT(cures == cudaSuccess);
  208. cudaMemset(s_buffer, 0, size);
  209. /* Initialize CUDA context on the destination */
  210. /* We do not need to enable OpenGL interoperability at this point,
  211. * since we cleanly shutdown CUDA before returning. */
  212. cudaSetDevice(dst);
  213. if (starpu_get_env_number("STARPU_DISABLE_CUDA_GPU_GPU_DIRECT") <= 0)
  214. {
  215. cures = cudaDeviceCanAccessPeer(&can, dst, src);
  216. if (!cures && can)
  217. {
  218. cures = cudaDeviceEnablePeerAccess(src, 0);
  219. if (!cures)
  220. _STARPU_DISP("GPU-Direct %d -> %d\n", src, dst);
  221. }
  222. }
  223. /* Allocate a buffer on the device */
  224. unsigned char *d_buffer;
  225. cures = cudaMalloc((void **)&d_buffer, size);
  226. STARPU_ASSERT(cures == cudaSuccess);
  227. cudaMemset(d_buffer, 0, size);
  228. unsigned iter;
  229. double timing;
  230. struct timeval start;
  231. struct timeval end;
  232. /* Measure upload bandwidth */
  233. gettimeofday(&start, NULL);
  234. for (iter = 0; iter < NITER; iter++)
  235. {
  236. cudaMemcpyPeer(d_buffer, dst, s_buffer, src, size);
  237. cudaThreadSynchronize();
  238. }
  239. gettimeofday(&end, NULL);
  240. timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
  241. cudadev_timing_dtod[src+1][dst+1] = timing/NITER/size;
  242. /* Measure upload latency */
  243. gettimeofday(&start, NULL);
  244. for (iter = 0; iter < NITER; iter++)
  245. {
  246. cudaMemcpyPeer(d_buffer, dst, s_buffer, src, 1);
  247. cudaThreadSynchronize();
  248. }
  249. gettimeofday(&end, NULL);
  250. timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
  251. cudadev_latency_dtod[src+1][dst+1] = timing/NITER;
  252. /* Free buffers */
  253. cudaFree(d_buffer);
  254. cudaSetDevice(src);
  255. cudaFree(s_buffer);
  256. cudaThreadExit();
  257. }
  258. #endif
  259. #endif
  260. #ifdef STARPU_USE_OPENCL
  261. static void measure_bandwidth_between_host_and_dev_on_cpu_with_opencl(int dev, int cpu, struct dev_timing *dev_timing_per_cpu)
  262. {
  263. cl_context context;
  264. cl_command_queue queue;
  265. cl_int err=0;
  266. size_t size = SIZE;
  267. int not_initialized;
  268. struct _starpu_machine_config *config = _starpu_get_machine_config();
  269. _starpu_bind_thread_on_cpu(config, cpu);
  270. /* Is the context already initialised ? */
  271. starpu_opencl_get_context(dev, &context);
  272. not_initialized = (context == NULL);
  273. if (not_initialized == 1)
  274. _starpu_opencl_init_context(dev);
  275. /* Get context and queue */
  276. starpu_opencl_get_context(dev, &context);
  277. starpu_opencl_get_queue(dev, &queue);
  278. /* Get the maximum size which can be allocated on the device */
  279. cl_device_id device;
  280. cl_ulong maxMemAllocSize;
  281. starpu_opencl_get_device(dev, &device);
  282. err = clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(maxMemAllocSize), &maxMemAllocSize, NULL);
  283. if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
  284. if (size > (size_t)maxMemAllocSize/4) size = maxMemAllocSize/4;
  285. if (_starpu_opencl_get_device_type(dev) == CL_DEVICE_TYPE_CPU)
  286. {
  287. /* Let's not use too much RAM when running OpenCL on a CPU: it
  288. * would make the OS swap like crazy. */
  289. size /= 2;
  290. }
  291. /* hack to avoid third party libs to rebind threads */
  292. _starpu_bind_thread_on_cpu(config, cpu);
  293. /* Allocate a buffer on the device */
  294. cl_mem d_buffer;
  295. d_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, &err);
  296. if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
  297. /* hack to avoid third party libs to rebind threads */
  298. _starpu_bind_thread_on_cpu(config, cpu);
  299. /* Allocate a buffer on the host */
  300. unsigned char *h_buffer;
  301. h_buffer = (unsigned char *)malloc(size);
  302. STARPU_ASSERT(h_buffer);
  303. /* hack to avoid third party libs to rebind threads */
  304. _starpu_bind_thread_on_cpu(config, cpu);
  305. /* Fill them */
  306. memset(h_buffer, 0, size);
  307. err = clEnqueueWriteBuffer(queue, d_buffer, CL_TRUE, 0, size, h_buffer, 0, NULL, NULL);
  308. if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
  309. clFinish(queue);
  310. /* hack to avoid third party libs to rebind threads */
  311. _starpu_bind_thread_on_cpu(config, cpu);
  312. unsigned iter;
  313. double timing;
  314. struct timeval start;
  315. struct timeval end;
  316. /* Measure upload bandwidth */
  317. gettimeofday(&start, NULL);
  318. for (iter = 0; iter < NITER; iter++)
  319. {
  320. err = clEnqueueWriteBuffer(queue, d_buffer, CL_TRUE, 0, size, h_buffer, 0, NULL, NULL);
  321. if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
  322. clFinish(queue);
  323. }
  324. gettimeofday(&end, NULL);
  325. timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
  326. dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+cpu].timing_htod = timing/NITER/size;
  327. /* Measure download bandwidth */
  328. gettimeofday(&start, NULL);
  329. for (iter = 0; iter < NITER; iter++)
  330. {
  331. err = clEnqueueReadBuffer(queue, d_buffer, CL_TRUE, 0, size, h_buffer, 0, NULL, NULL);
  332. if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
  333. clFinish(queue);
  334. }
  335. gettimeofday(&end, NULL);
  336. timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
  337. dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+cpu].timing_dtoh = timing/NITER/size;
  338. /* Measure upload latency */
  339. gettimeofday(&start, NULL);
  340. for (iter = 0; iter < NITER; iter++)
  341. {
  342. err = clEnqueueWriteBuffer(queue, d_buffer, CL_TRUE, 0, 1, h_buffer, 0, NULL, NULL);
  343. if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
  344. clFinish(queue);
  345. }
  346. gettimeofday(&end, NULL);
  347. timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
  348. dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+cpu].latency_htod = timing/NITER;
  349. /* Measure download latency */
  350. gettimeofday(&start, NULL);
  351. for (iter = 0; iter < NITER; iter++)
  352. {
  353. err = clEnqueueReadBuffer(queue, d_buffer, CL_TRUE, 0, 1, h_buffer, 0, NULL, NULL);
  354. if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
  355. clFinish(queue);
  356. }
  357. gettimeofday(&end, NULL);
  358. timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
  359. dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+cpu].latency_dtoh = timing/NITER;
  360. /* Free buffers */
  361. err = clReleaseMemObject(d_buffer);
  362. if (STARPU_UNLIKELY(err != CL_SUCCESS))
  363. STARPU_OPENCL_REPORT_ERROR(err);
  364. free(h_buffer);
  365. /* Uninitiliaze OpenCL context on the device */
  366. if (not_initialized == 1)
  367. _starpu_opencl_deinit_context(dev);
  368. }
  369. #endif
  370. /* NB: we want to sort the bandwidth by DECREASING order */
  371. static int compar_dev_timing(const void *left_dev_timing, const void *right_dev_timing)
  372. {
  373. const struct dev_timing *left = (const struct dev_timing *)left_dev_timing;
  374. const struct dev_timing *right = (const struct dev_timing *)right_dev_timing;
  375. double left_dtoh = left->timing_dtoh;
  376. double left_htod = left->timing_htod;
  377. double right_dtoh = right->timing_dtoh;
  378. double right_htod = right->timing_htod;
  379. double timing_sum2_left = left_dtoh*left_dtoh + left_htod*left_htod;
  380. double timing_sum2_right = right_dtoh*right_dtoh + right_htod*right_htod;
  381. /* it's for a decreasing sorting */
  382. return (timing_sum2_left > timing_sum2_right);
  383. }
  384. #ifdef STARPU_HAVE_HWLOC
  385. static int find_numa_node(hwloc_obj_t obj)
  386. {
  387. STARPU_ASSERT(obj);
  388. hwloc_obj_t current = obj;
  389. while (current->depth != HWLOC_OBJ_NODE)
  390. {
  391. current = current->parent;
  392. /* If we don't find a "node" obj before the root, this means
  393. * hwloc does not know whether there are numa nodes or not, so
  394. * we should not use a per-node sampling in that case. */
  395. STARPU_ASSERT(current);
  396. }
  397. STARPU_ASSERT(current->depth == HWLOC_OBJ_NODE);
  398. return current->logical_index;
  399. }
  400. #endif
  401. static void measure_bandwidth_between_cpus_and_dev(int dev, struct dev_timing *dev_timing_per_cpu, char *type)
  402. {
  403. /* Either we have hwloc and we measure the bandwith between each GPU
  404. * and each NUMA node, or we don't have such NUMA information and we
  405. * measure the bandwith for each pair of (CPU, GPU), which is slower.
  406. * */
  407. #ifdef STARPU_HAVE_HWLOC
  408. int cpu_depth = hwloc_get_type_depth(hwtopology, HWLOC_OBJ_CORE);
  409. int nnuma_nodes = hwloc_get_nbobjs_by_depth(hwtopology, HWLOC_OBJ_NODE);
  410. /* If no NUMA node was found, we assume that we have a single memory
  411. * bank. */
  412. const unsigned no_node_obj_was_found = (nnuma_nodes == 0);
  413. unsigned *is_available_per_numa_node = NULL;
  414. double *dev_timing_htod_per_numa_node = NULL;
  415. double *dev_latency_htod_per_numa_node = NULL;
  416. double *dev_timing_dtoh_per_numa_node = NULL;
  417. double *dev_latency_dtoh_per_numa_node = NULL;
  418. if (!no_node_obj_was_found)
  419. {
  420. is_available_per_numa_node = (unsigned *)malloc(nnuma_nodes * sizeof(unsigned));
  421. STARPU_ASSERT(is_available_per_numa_node);
  422. dev_timing_htod_per_numa_node = (double *)malloc(nnuma_nodes * sizeof(double));
  423. STARPU_ASSERT(dev_timing_htod_per_numa_node);
  424. dev_latency_htod_per_numa_node = (double *)malloc(nnuma_nodes * sizeof(double));
  425. STARPU_ASSERT(dev_latency_htod_per_numa_node);
  426. dev_timing_dtoh_per_numa_node = (double *)malloc(nnuma_nodes * sizeof(double));
  427. STARPU_ASSERT(dev_timing_dtoh_per_numa_node);
  428. dev_latency_dtoh_per_numa_node = (double *)malloc(nnuma_nodes * sizeof(double));
  429. STARPU_ASSERT(dev_latency_dtoh_per_numa_node);
  430. memset(is_available_per_numa_node, 0, nnuma_nodes*sizeof(unsigned));
  431. }
  432. #endif
  433. unsigned cpu;
  434. for (cpu = 0; cpu < ncpus; cpu++)
  435. {
  436. dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+cpu].cpu_id = cpu;
  437. #ifdef STARPU_HAVE_HWLOC
  438. int numa_id = 0;
  439. if (!no_node_obj_was_found)
  440. {
  441. hwloc_obj_t obj = hwloc_get_obj_by_depth(hwtopology, cpu_depth, cpu);
  442. numa_id = find_numa_node(obj);
  443. if (is_available_per_numa_node[numa_id])
  444. {
  445. /* We reuse the previous numbers for that NUMA node */
  446. dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+cpu].timing_htod =
  447. dev_timing_htod_per_numa_node[numa_id];
  448. dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+cpu].latency_htod =
  449. dev_latency_htod_per_numa_node[numa_id];
  450. dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+cpu].timing_dtoh =
  451. dev_timing_dtoh_per_numa_node[numa_id];
  452. dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+cpu].latency_dtoh =
  453. dev_latency_dtoh_per_numa_node[numa_id];
  454. continue;
  455. }
  456. }
  457. #endif
  458. #ifdef STARPU_USE_CUDA
  459. if (strncmp(type, "CUDA", 4) == 0)
  460. measure_bandwidth_between_host_and_dev_on_cpu_with_cuda(dev, cpu, dev_timing_per_cpu);
  461. #endif
  462. #ifdef STARPU_USE_OPENCL
  463. if (strncmp(type, "OpenCL", 6) == 0)
  464. measure_bandwidth_between_host_and_dev_on_cpu_with_opencl(dev, cpu, dev_timing_per_cpu);
  465. #endif
  466. #ifdef STARPU_HAVE_HWLOC
  467. if (!no_node_obj_was_found && !is_available_per_numa_node[numa_id])
  468. {
  469. /* Save the results for that NUMA node */
  470. dev_timing_htod_per_numa_node[numa_id] =
  471. dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+cpu].timing_htod;
  472. dev_latency_htod_per_numa_node[numa_id] =
  473. dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+cpu].latency_htod;
  474. dev_timing_dtoh_per_numa_node[numa_id] =
  475. dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+cpu].timing_dtoh;
  476. dev_latency_dtoh_per_numa_node[numa_id] =
  477. dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+cpu].latency_dtoh;
  478. is_available_per_numa_node[numa_id] = 1;
  479. }
  480. #endif
  481. }
  482. #ifdef STARPU_HAVE_HWLOC
  483. if (!no_node_obj_was_found)
  484. {
  485. free(is_available_per_numa_node);
  486. free(dev_timing_htod_per_numa_node);
  487. free(dev_latency_htod_per_numa_node);
  488. free(dev_timing_dtoh_per_numa_node);
  489. free(dev_latency_dtoh_per_numa_node);
  490. }
  491. #endif /* STARPU_HAVE_HWLOC */
  492. }
  493. static void measure_bandwidth_between_host_and_dev(int dev, double *dev_timing_htod, double *dev_latency_htod,
  494. double *dev_timing_dtoh, double *dev_latency_dtoh,
  495. struct dev_timing *dev_timing_per_cpu, char *type)
  496. {
  497. measure_bandwidth_between_cpus_and_dev(dev, dev_timing_per_cpu, type);
  498. /* sort the results */
  499. qsort(&(dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS]), ncpus,
  500. sizeof(struct dev_timing),
  501. compar_dev_timing);
  502. #ifdef STARPU_VERBOSE
  503. unsigned cpu;
  504. for (cpu = 0; cpu < ncpus; cpu++)
  505. {
  506. unsigned current_cpu = dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+cpu].cpu_id;
  507. double bandwidth_dtoh = dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+cpu].timing_dtoh;
  508. double bandwidth_htod = dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+cpu].timing_htod;
  509. double bandwidth_sum2 = bandwidth_dtoh*bandwidth_dtoh + bandwidth_htod*bandwidth_htod;
  510. _STARPU_DISP("(%10s) BANDWIDTH GPU %d CPU %u - htod %f - dtoh %f - %f\n", type, dev, current_cpu, bandwidth_htod, bandwidth_dtoh, sqrt(bandwidth_sum2));
  511. }
  512. unsigned best_cpu = dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+0].cpu_id;
  513. _STARPU_DISP("(%10s) BANDWIDTH GPU %d BEST CPU %u\n", type, dev, best_cpu);
  514. #endif
  515. /* The results are sorted in a decreasing order, so that the best
  516. * measurement is currently the first entry. */
  517. dev_timing_dtoh[dev+1] = dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+0].timing_dtoh;
  518. dev_latency_dtoh[dev+1] = dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+0].latency_dtoh;
  519. dev_timing_htod[dev+1] = dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+0].timing_htod;
  520. dev_latency_htod[dev+1] = dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+0].latency_htod;
  521. }
  522. #endif /* defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL) */
  523. static void benchmark_all_gpu_devices(void)
  524. {
  525. #ifdef STARPU_SIMGRID
  526. _STARPU_DISP("can not measure bus in simgrid mode, please run starpu_calibrate_bus in non-simgrid mode to make sure the bus performance model was calibrated\n");
  527. STARPU_ABORT();
  528. #else /* !SIMGRID */
  529. #if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
  530. unsigned i;
  531. #endif
  532. #ifdef HAVE_CUDA_MEMCPY_PEER
  533. unsigned j;
  534. #endif
  535. _STARPU_DEBUG("Benchmarking the speed of the bus\n");
  536. #ifdef STARPU_HAVE_HWLOC
  537. hwloc_topology_init(&hwtopology);
  538. hwloc_topology_load(hwtopology);
  539. #endif
  540. #ifdef STARPU_HAVE_HWLOC
  541. hwloc_bitmap_t former_cpuset = hwloc_bitmap_alloc();
  542. hwloc_get_cpubind(hwtopology, former_cpuset, HWLOC_CPUBIND_THREAD);
  543. #elif __linux__
  544. /* Save the current cpu binding */
  545. cpu_set_t former_process_affinity;
  546. int ret;
  547. ret = sched_getaffinity(0, sizeof(former_process_affinity), &former_process_affinity);
  548. if (ret)
  549. {
  550. perror("sched_getaffinity");
  551. STARPU_ABORT();
  552. }
  553. #else
  554. #warning Missing binding support, StarPU will not be able to properly benchmark NUMA topology
  555. #endif
  556. struct _starpu_machine_config *config = _starpu_get_machine_config();
  557. ncpus = _starpu_topology_get_nhwcpu(config);
  558. #ifdef STARPU_USE_CUDA
  559. ncuda = _starpu_get_cuda_device_count();
  560. for (i = 0; i < ncuda; i++)
  561. {
  562. _STARPU_DISP("CUDA %d...\n", i);
  563. /* measure bandwidth between Host and Device i */
  564. measure_bandwidth_between_host_and_dev(i, cudadev_timing_htod, cudadev_latency_htod, cudadev_timing_dtoh, cudadev_latency_dtoh, cudadev_timing_per_cpu, "CUDA");
  565. }
  566. #ifdef HAVE_CUDA_MEMCPY_PEER
  567. for (i = 0; i < ncuda; i++)
  568. for (j = 0; j < ncuda; j++)
  569. if (i != j)
  570. {
  571. _STARPU_DISP("CUDA %d -> %d...\n", i, j);
  572. /* measure bandwidth between Host and Device i */
  573. measure_bandwidth_between_dev_and_dev_cuda(i, j);
  574. }
  575. #endif
  576. #endif
  577. #ifdef STARPU_USE_OPENCL
  578. nopencl = _starpu_opencl_get_device_count();
  579. for (i = 0; i < nopencl; i++)
  580. {
  581. _STARPU_DISP("OpenCL %d...\n", i);
  582. /* measure bandwith between Host and Device i */
  583. measure_bandwidth_between_host_and_dev(i, opencldev_timing_htod, opencldev_latency_htod, opencldev_timing_dtoh, opencldev_latency_dtoh, opencldev_timing_per_cpu, "OpenCL");
  584. }
  585. #endif
  586. #ifdef STARPU_HAVE_HWLOC
  587. hwloc_set_cpubind(hwtopology, former_cpuset, HWLOC_CPUBIND_THREAD);
  588. #elif __linux__
  589. /* Restore the former affinity */
  590. ret = sched_setaffinity(0, sizeof(former_process_affinity), &former_process_affinity);
  591. if (ret)
  592. {
  593. perror("sched_setaffinity");
  594. STARPU_ABORT();
  595. }
  596. #endif
  597. #ifdef STARPU_HAVE_HWLOC
  598. hwloc_topology_destroy(hwtopology);
  599. #endif
  600. _STARPU_DEBUG("Benchmarking the speed of the bus is done.\n");
  601. was_benchmarked = 1;
  602. #endif /* !SIMGRID */
  603. }
  604. static void get_bus_path(const char *type, char *path, size_t maxlen)
  605. {
  606. _starpu_get_perf_model_dir_bus(path, maxlen);
  607. char hostname[65];
  608. _starpu_gethostname(hostname, sizeof(hostname));
  609. strncat(path, hostname, maxlen);
  610. strncat(path, ".", maxlen);
  611. strncat(path, type, maxlen);
  612. }
  613. /*
  614. * Affinity
  615. */
  616. #ifndef STARPU_SIMGRID
  617. static void get_affinity_path(char *path, size_t maxlen)
  618. {
  619. get_bus_path("affinity", path, maxlen);
  620. }
  621. static void load_bus_affinity_file_content(void)
  622. {
  623. #if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
  624. FILE *f;
  625. char path[256];
  626. get_affinity_path(path, sizeof(path));
  627. _STARPU_DEBUG("loading affinities from %s\n", path);
  628. f = fopen(path, "r");
  629. STARPU_ASSERT(f);
  630. struct _starpu_machine_config *config = _starpu_get_machine_config();
  631. ncpus = _starpu_topology_get_nhwcpu(config);
  632. unsigned gpu;
  633. #ifdef STARPU_USE_CUDA
  634. ncuda = _starpu_get_cuda_device_count();
  635. for (gpu = 0; gpu < ncuda; gpu++)
  636. {
  637. int ret;
  638. unsigned dummy;
  639. _starpu_drop_comments(f);
  640. ret = fscanf(f, "%d\t", &dummy);
  641. STARPU_ASSERT(ret == 1);
  642. STARPU_ASSERT(dummy == gpu);
  643. unsigned cpu;
  644. for (cpu = 0; cpu < ncpus; cpu++)
  645. {
  646. ret = fscanf(f, "%d\t", &cuda_affinity_matrix[gpu][cpu]);
  647. STARPU_ASSERT(ret == 1);
  648. }
  649. ret = fscanf(f, "\n");
  650. STARPU_ASSERT(ret == 0);
  651. }
  652. #endif /* !STARPU_USE_CUDA */
  653. #ifdef STARPU_USE_OPENCL
  654. nopencl = _starpu_opencl_get_device_count();
  655. for (gpu = 0; gpu < nopencl; gpu++)
  656. {
  657. int ret;
  658. unsigned dummy;
  659. _starpu_drop_comments(f);
  660. ret = fscanf(f, "%d\t", &dummy);
  661. STARPU_ASSERT(ret == 1);
  662. STARPU_ASSERT(dummy == gpu);
  663. unsigned cpu;
  664. for (cpu = 0; cpu < ncpus; cpu++)
  665. {
  666. ret = fscanf(f, "%d\t", &opencl_affinity_matrix[gpu][cpu]);
  667. STARPU_ASSERT(ret == 1);
  668. }
  669. ret = fscanf(f, "\n");
  670. STARPU_ASSERT(ret == 0);
  671. }
  672. #endif /* !STARPU_USE_OPENCL */
  673. fclose(f);
  674. #endif /* !(STARPU_USE_CUDA_ || STARPU_USE_OPENCL */
  675. }
  676. #ifndef STARPU_SIMGRID
  677. static void write_bus_affinity_file_content(void)
  678. {
  679. STARPU_ASSERT(was_benchmarked);
  680. #if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
  681. FILE *f;
  682. char path[256];
  683. get_affinity_path(path, sizeof(path));
  684. _STARPU_DEBUG("writing affinities to %s\n", path);
  685. f = fopen(path, "w+");
  686. if (!f)
  687. {
  688. perror("fopen write_buf_affinity_file_content");
  689. _STARPU_DISP("path '%s'\n", path);
  690. fflush(stderr);
  691. STARPU_ABORT();
  692. }
  693. unsigned cpu;
  694. unsigned gpu;
  695. fprintf(f, "# GPU\t");
  696. for (cpu = 0; cpu < ncpus; cpu++)
  697. fprintf(f, "CPU%u\t", cpu);
  698. fprintf(f, "\n");
  699. #ifdef STARPU_USE_CUDA
  700. for (gpu = 0; gpu < ncuda; gpu++)
  701. {
  702. fprintf(f, "%d\t", gpu);
  703. for (cpu = 0; cpu < ncpus; cpu++)
  704. {
  705. fprintf(f, "%d\t", cudadev_timing_per_cpu[(gpu+1)*STARPU_MAXCPUS+cpu].cpu_id);
  706. }
  707. fprintf(f, "\n");
  708. }
  709. #endif
  710. #ifdef STARPU_USE_OPENCL
  711. for (gpu = 0; gpu < nopencl; gpu++)
  712. {
  713. fprintf(f, "%d\t", gpu);
  714. for (cpu = 0; cpu < ncpus; cpu++)
  715. {
  716. fprintf(f, "%d\t", opencldev_timing_per_cpu[(gpu+1)*STARPU_MAXCPUS+cpu].cpu_id);
  717. }
  718. fprintf(f, "\n");
  719. }
  720. #endif
  721. fclose(f);
  722. #endif
  723. }
  724. #endif /* STARPU_SIMGRID */
  725. static void generate_bus_affinity_file(void)
  726. {
  727. if (!was_benchmarked)
  728. benchmark_all_gpu_devices();
  729. write_bus_affinity_file_content();
  730. }
  731. static void load_bus_affinity_file(void)
  732. {
  733. int res;
  734. char path[256];
  735. get_affinity_path(path, sizeof(path));
  736. res = access(path, F_OK);
  737. if (res)
  738. {
  739. /* File does not exist yet */
  740. generate_bus_affinity_file();
  741. }
  742. load_bus_affinity_file_content();
  743. }
  744. #ifdef STARPU_USE_CUDA
  745. int *_starpu_get_cuda_affinity_vector(unsigned gpuid)
  746. {
  747. return cuda_affinity_matrix[gpuid];
  748. }
  749. #endif /* STARPU_USE_CUDA */
  750. #ifdef STARPU_USE_OPENCL
  751. int *_starpu_get_opencl_affinity_vector(unsigned gpuid)
  752. {
  753. return opencl_affinity_matrix[gpuid];
  754. }
  755. #endif /* STARPU_USE_OPENCL */
  756. void starpu_bus_print_affinity(FILE *f)
  757. {
  758. #if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
  759. unsigned cpu;
  760. unsigned gpu;
  761. #endif
  762. fprintf(f, "# GPU\tCPU in preference order (logical index)\n");
  763. #ifdef STARPU_USE_CUDA
  764. fprintf(f, "# CUDA\n");
  765. for(gpu = 0 ; gpu<ncuda ; gpu++)
  766. {
  767. fprintf(f, "%d\t", gpu);
  768. for (cpu = 0; cpu < ncpus; cpu++)
  769. {
  770. fprintf(f, "%d\t", cuda_affinity_matrix[gpu][cpu]);
  771. }
  772. fprintf(f, "\n");
  773. }
  774. #endif
  775. #ifdef STARPU_USE_OPENCL
  776. fprintf(f, "# OpenCL\n");
  777. for(gpu = 0 ; gpu<nopencl ; gpu++)
  778. {
  779. fprintf(f, "%d\t", gpu);
  780. for (cpu = 0; cpu < ncpus; cpu++)
  781. {
  782. fprintf(f, "%d\t", opencl_affinity_matrix[gpu][cpu]);
  783. }
  784. fprintf(f, "\n");
  785. }
  786. #endif
  787. }
  788. #endif /* STARPU_SIMGRID */
  789. /*
  790. * Latency
  791. */
  792. static void get_latency_path(char *path, size_t maxlen)
  793. {
  794. get_bus_path("latency", path, maxlen);
  795. }
  796. static int load_bus_latency_file_content(void)
  797. {
  798. int n;
  799. unsigned src, dst;
  800. FILE *f;
  801. double latency;
  802. char path[256];
  803. get_latency_path(path, sizeof(path));
  804. _STARPU_DEBUG("loading latencies from %s\n", path);
  805. f = fopen(path, "r");
  806. if (!f)
  807. {
  808. perror("fopen load_bus_latency_file_content");
  809. _STARPU_DISP("path '%s'\n", path);
  810. fflush(stderr);
  811. STARPU_ABORT();
  812. }
  813. for (src = 0; src < STARPU_MAXNODES; src++)
  814. {
  815. _starpu_drop_comments(f);
  816. for (dst = 0; dst < STARPU_MAXNODES; dst++)
  817. {
  818. n = fscanf(f, "%lf", &latency);
  819. if (n != 1)
  820. {
  821. _STARPU_DISP("Error while reading latency file <%s>. Expected a number\n", path);
  822. fclose(f);
  823. return 0;
  824. }
  825. n = getc(f);
  826. if (n == '\n')
  827. break;
  828. if (n != '\t')
  829. {
  830. _STARPU_DISP("bogus character %c in latency file %s\n", n, path);
  831. fclose(f);
  832. return 0;
  833. }
  834. latency_matrix[src][dst] = latency;
  835. /* Look out for \t\n */
  836. n = getc(f);
  837. if (n == '\n')
  838. break;
  839. ungetc(n, f);
  840. n = '\t';
  841. }
  842. /* No more values, take NAN */
  843. for ( ; dst < STARPU_MAXNODES; dst++)
  844. latency_matrix[src][dst] = NAN;
  845. while (n == '\t')
  846. {
  847. /* Look out for \t\n */
  848. n = getc(f);
  849. if (n == '\n')
  850. break;
  851. ungetc(n, f);
  852. n = fscanf(f, "%lf", &latency);
  853. if (n && !isnan(latency))
  854. {
  855. _STARPU_DISP("Too many nodes in latency file %s for this configuration (%d)\n", path, STARPU_MAXNODES);
  856. fclose(f);
  857. return 0;
  858. }
  859. n = getc(f);
  860. }
  861. if (n != '\n')
  862. {
  863. _STARPU_DISP("Bogus character %c in latency file %s\n", n, path);
  864. fclose(f);
  865. return 0;
  866. }
  867. /* Look out for EOF */
  868. n = getc(f);
  869. if (n == EOF)
  870. break;
  871. ungetc(n, f);
  872. }
  873. /* No more values, take NAN */
  874. for ( ; src < STARPU_MAXNODES; src++)
  875. for (dst = 0; dst < STARPU_MAXNODES; dst++)
  876. latency_matrix[src][dst] = NAN;
  877. fclose(f);
  878. return 1;
  879. }
  880. #ifndef STARPU_SIMGRID
  881. static void write_bus_latency_file_content(void)
  882. {
  883. unsigned src, dst, maxnode;
  884. FILE *f;
  885. STARPU_ASSERT(was_benchmarked);
  886. char path[256];
  887. get_latency_path(path, sizeof(path));
  888. _STARPU_DEBUG("writing latencies to %s\n", path);
  889. f = fopen(path, "w+");
  890. if (!f)
  891. {
  892. perror("fopen write_bus_latency_file_content");
  893. _STARPU_DISP("path '%s'\n", path);
  894. fflush(stderr);
  895. STARPU_ABORT();
  896. }
  897. fprintf(f, "# ");
  898. for (dst = 0; dst < STARPU_MAXNODES; dst++)
  899. fprintf(f, "to %d\t\t", dst);
  900. fprintf(f, "\n");
  901. maxnode = ncuda;
  902. #ifdef STARPU_USE_OPENCL
  903. maxnode += nopencl;
  904. #endif
  905. for (src = 0; src < STARPU_MAXNODES; src++)
  906. {
  907. for (dst = 0; dst < STARPU_MAXNODES; dst++)
  908. {
  909. double latency = 0.0;
  910. if ((src > maxnode) || (dst > maxnode))
  911. {
  912. /* convention */
  913. latency = NAN;
  914. }
  915. else if (src == dst)
  916. {
  917. latency = 0.0;
  918. }
  919. else
  920. {
  921. /* µs */
  922. #ifdef STARPU_USE_CUDA
  923. #ifdef HAVE_CUDA_MEMCPY_PEER
  924. if (src && src < ncuda && dst && dst <= ncuda)
  925. latency = cudadev_latency_dtod[src][dst];
  926. else
  927. #endif
  928. {
  929. if (src && src <= ncuda)
  930. latency += cudadev_latency_dtoh[src];
  931. if (dst && dst <= ncuda)
  932. latency += cudadev_latency_htod[dst];
  933. }
  934. #endif
  935. #ifdef STARPU_USE_OPENCL
  936. if (src > ncuda)
  937. latency += opencldev_latency_dtoh[src-ncuda];
  938. if (dst > ncuda)
  939. latency += opencldev_latency_htod[dst-ncuda];
  940. #endif
  941. }
  942. if (dst)
  943. fputc('\t', f);
  944. fprintf(f, "%f", latency);
  945. }
  946. fprintf(f, "\n");
  947. }
  948. fclose(f);
  949. }
  950. #endif
  951. static void generate_bus_latency_file(void)
  952. {
  953. if (!was_benchmarked)
  954. benchmark_all_gpu_devices();
  955. #ifndef STARPU_SIMGRID
  956. write_bus_latency_file_content();
  957. #endif
  958. }
  959. static void load_bus_latency_file(void)
  960. {
  961. int res;
  962. char path[256];
  963. get_latency_path(path, sizeof(path));
  964. res = access(path, F_OK);
  965. if (res || !load_bus_latency_file_content())
  966. {
  967. /* File does not exist yet or is bogus */
  968. generate_bus_latency_file();
  969. }
  970. }
  971. /*
  972. * Bandwidth
  973. */
  974. static void get_bandwidth_path(char *path, size_t maxlen)
  975. {
  976. get_bus_path("bandwidth", path, maxlen);
  977. }
  978. static int load_bus_bandwidth_file_content(void)
  979. {
  980. int n;
  981. unsigned src, dst;
  982. FILE *f;
  983. double bandwidth;
  984. char path[256];
  985. get_bandwidth_path(path, sizeof(path));
  986. _STARPU_DEBUG("loading bandwidth from %s\n", path);
  987. f = fopen(path, "r");
  988. if (!f)
  989. {
  990. perror("fopen load_bus_bandwidth_file_content");
  991. _STARPU_DISP("path '%s'\n", path);
  992. fflush(stderr);
  993. STARPU_ABORT();
  994. }
  995. for (src = 0; src < STARPU_MAXNODES; src++)
  996. {
  997. _starpu_drop_comments(f);
  998. for (dst = 0; dst < STARPU_MAXNODES; dst++)
  999. {
  1000. n = fscanf(f, "%lf", &bandwidth);
  1001. if (n != 1)
  1002. {
  1003. _STARPU_DISP("Error while reading bandwidth file <%s>. Expected a number\n", path);
  1004. fclose(f);
  1005. return 0;
  1006. }
  1007. n = getc(f);
  1008. if (n == '\n')
  1009. break;
  1010. if (n != '\t')
  1011. {
  1012. _STARPU_DISP("bogus character %c in bandwidth file %s\n", n, path);
  1013. fclose(f);
  1014. return 0;
  1015. }
  1016. bandwidth_matrix[src][dst] = bandwidth;
  1017. /* Look out for \t\n */
  1018. n = getc(f);
  1019. if (n == '\n')
  1020. break;
  1021. ungetc(n, f);
  1022. n = '\t';
  1023. }
  1024. /* No more values, take NAN */
  1025. for ( ; dst < STARPU_MAXNODES; dst++)
  1026. bandwidth_matrix[src][dst] = NAN;
  1027. while (n == '\t')
  1028. {
  1029. /* Look out for \t\n */
  1030. n = getc(f);
  1031. if (n == '\n')
  1032. break;
  1033. ungetc(n, f);
  1034. n = fscanf(f, "%lf", &bandwidth);
  1035. if (n && !isnan(bandwidth))
  1036. {
  1037. _STARPU_DISP("Too many nodes in bandwidth file %s for this configuration (%d)\n", path, STARPU_MAXNODES);
  1038. fclose(f);
  1039. return 0;
  1040. }
  1041. n = getc(f);
  1042. }
  1043. if (n != '\n')
  1044. {
  1045. _STARPU_DISP("Bogus character %c in bandwidth file %s\n", n, path);
  1046. fclose(f);
  1047. return 0;
  1048. }
  1049. /* Look out for EOF */
  1050. n = getc(f);
  1051. if (n == EOF)
  1052. break;
  1053. ungetc(n, f);
  1054. }
  1055. /* No more values, take NAN */
  1056. for ( ; src < STARPU_MAXNODES; src++)
  1057. for (dst = 0; dst < STARPU_MAXNODES; dst++)
  1058. latency_matrix[src][dst] = NAN;
  1059. fclose(f);
  1060. return 1;
  1061. }
  1062. #ifndef STARPU_SIMGRID
  1063. static void write_bus_bandwidth_file_content(void)
  1064. {
  1065. unsigned src, dst, maxnode;
  1066. FILE *f;
  1067. STARPU_ASSERT(was_benchmarked);
  1068. char path[256];
  1069. get_bandwidth_path(path, sizeof(path));
  1070. _STARPU_DEBUG("writing bandwidth to %s\n", path);
  1071. f = fopen(path, "w+");
  1072. STARPU_ASSERT(f);
  1073. fprintf(f, "# ");
  1074. for (dst = 0; dst < STARPU_MAXNODES; dst++)
  1075. fprintf(f, "to %d\t\t", dst);
  1076. fprintf(f, "\n");
  1077. maxnode = ncuda;
  1078. #ifdef STARPU_USE_OPENCL
  1079. maxnode += nopencl;
  1080. #endif
  1081. for (src = 0; src < STARPU_MAXNODES; src++)
  1082. {
  1083. for (dst = 0; dst < STARPU_MAXNODES; dst++)
  1084. {
  1085. double bandwidth;
  1086. if ((src > maxnode) || (dst > maxnode))
  1087. {
  1088. bandwidth = NAN;
  1089. }
  1090. #if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
  1091. else if (src != dst)
  1092. {
  1093. double slowness = 0.0;
  1094. /* Total bandwidth is the harmonic mean of bandwidths */
  1095. #ifdef STARPU_USE_CUDA
  1096. #ifdef HAVE_CUDA_MEMCPY_PEER
  1097. if (src && src <= ncuda && dst && dst <= ncuda)
  1098. /* Direct GPU-GPU transfert */
  1099. slowness = cudadev_timing_dtod[src][dst];
  1100. else
  1101. #endif
  1102. {
  1103. if (src && src <= ncuda)
  1104. slowness += cudadev_timing_dtoh[src];
  1105. if (dst && dst <= ncuda)
  1106. slowness += cudadev_timing_htod[dst];
  1107. }
  1108. #endif
  1109. #ifdef STARPU_USE_OPENCL
  1110. if (src > ncuda)
  1111. slowness += opencldev_timing_dtoh[src-ncuda];
  1112. if (dst > ncuda)
  1113. slowness += opencldev_timing_htod[dst-ncuda];
  1114. #endif
  1115. bandwidth = 1.0/slowness;
  1116. }
  1117. #endif
  1118. else
  1119. {
  1120. /* convention */
  1121. bandwidth = 0.0;
  1122. }
  1123. if (dst)
  1124. fputc('\t', f);
  1125. fprintf(f, "%f", bandwidth);
  1126. }
  1127. fprintf(f, "\n");
  1128. }
  1129. fclose(f);
  1130. }
  1131. #endif /* STARPU_SIMGRID */
  1132. double starpu_get_bandwidth_RAM_CUDA(unsigned cudadev)
  1133. {
  1134. return bandwidth_matrix[0][cudadev+1];
  1135. }
  1136. double starpu_get_latency_RAM_CUDA(unsigned cudadev)
  1137. {
  1138. return latency_matrix[0][cudadev+1];
  1139. }
  1140. void starpu_bus_print_bandwidth(FILE *f)
  1141. {
  1142. unsigned src, dst, maxnode;
  1143. maxnode = ncuda;
  1144. #ifdef STARPU_USE_OPENCL
  1145. maxnode += nopencl;
  1146. #endif
  1147. fprintf(f, "from/to\t");
  1148. fprintf(f, "RAM\t");
  1149. for (dst = 0; dst < ncuda; dst++)
  1150. fprintf(f, "CUDA %d\t", dst);
  1151. for (dst = 0; dst < nopencl; dst++)
  1152. fprintf(f, "OpenCL%d\t", dst);
  1153. fprintf(f, "\n");
  1154. for (src = 0; src <= maxnode; src++)
  1155. {
  1156. if (!src)
  1157. fprintf(f, "RAM\t");
  1158. else if (src <= ncuda)
  1159. fprintf(f, "CUDA %d\t", src-1);
  1160. else
  1161. fprintf(f, "OpenCL%d\t", src-ncuda-1);
  1162. for (dst = 0; dst <= maxnode; dst++)
  1163. fprintf(f, "%.0f\t", bandwidth_matrix[src][dst]);
  1164. fprintf(f, "\n");
  1165. }
  1166. fprintf(f, "\n");
  1167. for (src = 0; src <= maxnode; src++)
  1168. {
  1169. if (!src)
  1170. fprintf(f, "RAM\t");
  1171. else if (src <= ncuda)
  1172. fprintf(f, "CUDA %d\t", src-1);
  1173. else
  1174. fprintf(f, "OpenCL%d\t", src-ncuda-1);
  1175. for (dst = 0; dst <= maxnode; dst++)
  1176. fprintf(f, "%.0f\t", latency_matrix[src][dst]);
  1177. fprintf(f, "\n");
  1178. }
  1179. #if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
  1180. if (ncuda != 0 || nopencl != 0)
  1181. fprintf(f, "\nGPU\tCPU in preference order (logical index), host-to-device, device-to-host\n");
  1182. for (src = 1; src <= maxnode; src++)
  1183. {
  1184. struct dev_timing *timing;
  1185. struct _starpu_machine_config *config = _starpu_get_machine_config();
  1186. unsigned config_ncpus = _starpu_topology_get_nhwcpu(config);
  1187. unsigned cpu;
  1188. #ifdef STARPU_USE_CUDA
  1189. if (src <= ncuda)
  1190. {
  1191. fprintf(f, "CUDA %d\t", src-1);
  1192. for (cpu = 0; cpu < config_ncpus; cpu++)
  1193. {
  1194. timing = &cudadev_timing_per_cpu[src*STARPU_MAXCPUS+cpu];
  1195. if (timing->timing_htod)
  1196. fprintf(f, "%2d %.0f %.0f\t", timing->cpu_id, 1/timing->timing_htod, 1/timing->timing_dtoh);
  1197. else
  1198. fprintf(f, "%2d\t", cuda_affinity_matrix[src-1][cpu]);
  1199. }
  1200. }
  1201. #ifdef STARPU_USE_OPENCL
  1202. else
  1203. #endif
  1204. #endif
  1205. #ifdef STARPU_USE_OPENCL
  1206. {
  1207. fprintf(f, "OpenCL%d\t", src-ncuda-1);
  1208. for (cpu = 0; cpu < config_ncpus; cpu++)
  1209. {
  1210. timing = &opencldev_timing_per_cpu[(src-ncuda)*STARPU_MAXCPUS+cpu];
  1211. if (timing->timing_htod)
  1212. fprintf(f, "%2d %.0f %.0f\t", timing->cpu_id, 1/timing->timing_htod, 1/timing->timing_dtoh);
  1213. else
  1214. fprintf(f, "%2d\t", opencl_affinity_matrix[src-1][cpu]);
  1215. }
  1216. }
  1217. #endif
  1218. fprintf(f, "\n");
  1219. }
  1220. #endif
  1221. }
  1222. static void generate_bus_bandwidth_file(void)
  1223. {
  1224. if (!was_benchmarked)
  1225. benchmark_all_gpu_devices();
  1226. #ifndef STARPU_SIMGRID
  1227. write_bus_bandwidth_file_content();
  1228. #endif
  1229. }
  1230. static void load_bus_bandwidth_file(void)
  1231. {
  1232. int res;
  1233. char path[256];
  1234. get_bandwidth_path(path, sizeof(path));
  1235. res = access(path, F_OK);
  1236. if (res || !load_bus_bandwidth_file_content())
  1237. {
  1238. /* File does not exist yet or is bogus */
  1239. generate_bus_bandwidth_file();
  1240. }
  1241. }
  1242. #ifndef STARPU_SIMGRID
  1243. /*
  1244. * Config
  1245. */
  1246. static void get_config_path(char *path, size_t maxlen)
  1247. {
  1248. get_bus_path("config", path, maxlen);
  1249. }
  1250. static void check_bus_config_file(void)
  1251. {
  1252. int res;
  1253. char path[256];
  1254. struct _starpu_machine_config *config = _starpu_get_machine_config();
  1255. get_config_path(path, sizeof(path));
  1256. res = access(path, F_OK);
  1257. if (res || config->conf->bus_calibrate > 0)
  1258. {
  1259. if (res)
  1260. _STARPU_DISP("No performance model for the bus, calibrating...\n");
  1261. _starpu_bus_force_sampling();
  1262. if (res)
  1263. _STARPU_DISP("... done\n");
  1264. }
  1265. else
  1266. {
  1267. FILE *f;
  1268. int ret;
  1269. unsigned read_cuda = -1, read_opencl = -1;
  1270. unsigned read_cpus = -1;
  1271. // Loading configuration from file
  1272. f = fopen(path, "r");
  1273. STARPU_ASSERT(f);
  1274. _starpu_drop_comments(f);
  1275. ret = fscanf(f, "%u\t", &read_cpus);
  1276. STARPU_ASSERT(ret == 1);
  1277. _starpu_drop_comments(f);
  1278. ret = fscanf(f, "%d\t", &read_cuda);
  1279. STARPU_ASSERT(ret == 1);
  1280. _starpu_drop_comments(f);
  1281. ret = fscanf(f, "%d\t", &read_opencl);
  1282. STARPU_ASSERT(ret == 1);
  1283. _starpu_drop_comments(f);
  1284. fclose(f);
  1285. // Loading current configuration
  1286. ncpus = _starpu_topology_get_nhwcpu(config);
  1287. #ifdef STARPU_USE_CUDA
  1288. ncuda = _starpu_get_cuda_device_count();
  1289. #endif
  1290. #ifdef STARPU_USE_OPENCL
  1291. nopencl = _starpu_opencl_get_device_count();
  1292. #endif
  1293. // Checking if both configurations match
  1294. if (read_cpus != ncpus)
  1295. {
  1296. _STARPU_DISP("Current configuration does not match the bus performance model (CPUS: (stored) %u != (current) %u), recalibrating...\n", read_cpus, ncpus);
  1297. _starpu_bus_force_sampling();
  1298. _STARPU_DISP("... done\n");
  1299. }
  1300. else if (read_cuda != ncuda)
  1301. {
  1302. _STARPU_DISP("Current configuration does not match the bus performance model (CUDA: (stored) %d != (current) %d), recalibrating...\n", read_cuda, ncuda);
  1303. _starpu_bus_force_sampling();
  1304. _STARPU_DISP("... done\n");
  1305. }
  1306. else if (read_opencl != nopencl)
  1307. {
  1308. _STARPU_DISP("Current configuration does not match the bus performance model (OpenCL: (stored) %d != (current) %d), recalibrating...\n", read_opencl, nopencl);
  1309. _starpu_bus_force_sampling();
  1310. _STARPU_DISP("... done\n");
  1311. }
  1312. }
  1313. }
  1314. static void write_bus_config_file_content(void)
  1315. {
  1316. FILE *f;
  1317. char path[256];
  1318. STARPU_ASSERT(was_benchmarked);
  1319. get_config_path(path, sizeof(path));
  1320. _STARPU_DEBUG("writing config to %s\n", path);
  1321. f = fopen(path, "w+");
  1322. STARPU_ASSERT(f);
  1323. fprintf(f, "# Current configuration\n");
  1324. fprintf(f, "%u # Number of CPUs\n", ncpus);
  1325. fprintf(f, "%d # Number of CUDA devices\n", ncuda);
  1326. fprintf(f, "%d # Number of OpenCL devices\n", nopencl);
  1327. fclose(f);
  1328. }
  1329. static void generate_bus_config_file(void)
  1330. {
  1331. if (!was_benchmarked)
  1332. benchmark_all_gpu_devices();
  1333. write_bus_config_file_content();
  1334. }
  1335. #endif /* !SIMGRID */
  1336. void _starpu_simgrid_get_platform_path(char *path, size_t maxlen)
  1337. {
  1338. get_bus_path("platform.xml", path, maxlen);
  1339. }
  1340. #ifndef STARPU_SIMGRID
  1341. static void write_bus_platform_file_content(void)
  1342. {
  1343. FILE *f;
  1344. char path[256];
  1345. unsigned i;
  1346. STARPU_ASSERT(was_benchmarked);
  1347. _starpu_simgrid_get_platform_path(path, sizeof(path));
  1348. _STARPU_DEBUG("writing platform to %s\n", path);
  1349. f = fopen(path, "w+");
  1350. if (!f)
  1351. {
  1352. perror("fopen write_bus_platform_file_content");
  1353. _STARPU_DISP("path '%s'\n", path);
  1354. fflush(stderr);
  1355. STARPU_ABORT();
  1356. }
  1357. fprintf(f,
  1358. "<?xml version='1.0'?>\n"
  1359. " <!DOCTYPE platform SYSTEM 'http://simgrid.gforge.inria.fr/simgrid.dtd'>\n"
  1360. " <platform version='3'>\n"
  1361. " <config id='General'>\n"
  1362. " <prop id='network/TCP_gamma' value='-1'></prop>\n"
  1363. " <prop id='network/latency_factor' value='1'></prop>\n"
  1364. " <prop id='network/bandwidth_factor' value='1'></prop>\n"
  1365. " </config>\n"
  1366. " <AS id='AS0' routing='Full'>\n"
  1367. " <host id='MAIN' power='1'/>\n"
  1368. );
  1369. for (i = 0; i < ncpus; i++)
  1370. fprintf(f, " <host id='CPU%d' power='2000000000'/>\n", i);
  1371. for (i = 0; i < ncuda; i++)
  1372. fprintf(f, " <host id='CUDA%d' power='2000000000'/>\n", i);
  1373. for (i = 0; i < nopencl; i++)
  1374. fprintf(f, " <host id='OpenCL%d' power='2000000000'/>\n", i);
  1375. fprintf(f, "\n <host id='RAM' power='1'/>\n");
  1376. /* Compute maximum bandwidth, taken as machine bandwidth */
  1377. double max_bandwidth = 0;
  1378. #ifdef STARPU_USE_CUDA
  1379. for (i = 0; i < ncuda; i++)
  1380. {
  1381. double down_bw = 1.0 / cudadev_timing_dtoh[1+i];
  1382. double up_bw = 1.0 / cudadev_timing_htod[1+i];
  1383. if (max_bandwidth < down_bw)
  1384. max_bandwidth = down_bw;
  1385. if (max_bandwidth < up_bw)
  1386. max_bandwidth = up_bw;
  1387. }
  1388. #endif
  1389. #ifdef STARPU_USE_OPENCL
  1390. for (i = 0; i < nopencl; i++)
  1391. {
  1392. double down_bw = 1.0 / opencldev_timing_dtoh[1+i];
  1393. double up_bw = 1.0 / opencldev_timing_htod[1+i];
  1394. if (max_bandwidth < down_bw)
  1395. max_bandwidth = down_bw;
  1396. if (max_bandwidth < up_bw)
  1397. max_bandwidth = up_bw;
  1398. }
  1399. #endif
  1400. fprintf(f, "\n <link id='Share' bandwidth='%f' latency='0.000000'/>\n\n", max_bandwidth*1000000);
  1401. /* Write bandwidths & latencies */
  1402. #ifdef STARPU_USE_CUDA
  1403. for (i = 0; i < ncuda; i++)
  1404. {
  1405. char i_name[16];
  1406. snprintf(i_name, sizeof(i_name), "CUDA%d", i);
  1407. fprintf(f, " <link id='RAM-%s' bandwidth='%f' latency='%f'/>\n",
  1408. i_name,
  1409. 1000000. / cudadev_timing_htod[1+i],
  1410. cudadev_latency_htod[1+i]/1000000.);
  1411. fprintf(f, " <link id='%s-RAM' bandwidth='%f' latency='%f'/>\n",
  1412. i_name,
  1413. 1000000. / cudadev_timing_dtoh[1+i],
  1414. cudadev_latency_dtoh[1+i]/1000000.);
  1415. }
  1416. #ifdef HAVE_CUDA_MEMCPY_PEER
  1417. for (i = 0; i < ncuda; i++)
  1418. {
  1419. unsigned j;
  1420. char i_name[16];
  1421. snprintf(i_name, sizeof(i_name), "CUDA%d", i);
  1422. for (j = 0; j < ncuda; j++)
  1423. {
  1424. char j_name[16];
  1425. if (j == i)
  1426. continue;
  1427. snprintf(j_name, sizeof(j_name), "CUDA%d", j);
  1428. fprintf(f, " <link id='%s-%s' bandwidth='%f' latency='%f'/>\n",
  1429. i_name, j_name,
  1430. 1000000. / cudadev_timing_dtod[1+i][1+j],
  1431. cudadev_latency_dtod[1+i][1+j]/1000000.);
  1432. }
  1433. }
  1434. #endif
  1435. #endif
  1436. #ifdef STARPU_USE_OPENCL
  1437. for (i = 0; i < nopencl; i++)
  1438. {
  1439. char i_name[16];
  1440. snprintf(i_name, sizeof(i_name), "OpenCL%d", i);
  1441. fprintf(f, " <link id='RAM-%s' bandwidth='%f' latency='%f'/>\n",
  1442. i_name,
  1443. 1000000 / opencldev_timing_htod[1+i],
  1444. opencldev_latency_htod[1+i]/1000000.);
  1445. fprintf(f, " <link id='%s-RAM' bandwidth='%f' latency='%f'/>\n",
  1446. i_name,
  1447. 1000000 / opencldev_timing_dtoh[1+i],
  1448. opencldev_latency_dtoh[1+i]/1000000.);
  1449. }
  1450. #endif
  1451. /* Write routes */
  1452. #ifdef STARPU_USE_CUDA
  1453. for (i = 0; i < ncuda; i++)
  1454. {
  1455. char i_name[16];
  1456. snprintf(i_name, sizeof(i_name), "CUDA%d", i);
  1457. fprintf(f, " <route src='RAM' dst='%s' symmetrical='NO'><link_ctn id='RAM-%s'/><link_ctn id='Share'/></route>\n", i_name, i_name);
  1458. fprintf(f, " <route src='%s' dst='RAM' symmetrical='NO'><link_ctn id='%s-RAM'/><link_ctn id='Share'/></route>\n", i_name, i_name);
  1459. }
  1460. #ifdef HAVE_CUDA_MEMCPY_PEER
  1461. for (i = 0; i < ncuda; i++)
  1462. {
  1463. unsigned j;
  1464. char i_name[16];
  1465. snprintf(i_name, sizeof(i_name), "CUDA%d", i);
  1466. for (j = 0; j < ncuda; j++)
  1467. {
  1468. char j_name[16];
  1469. if (j == i)
  1470. continue;
  1471. snprintf(j_name, sizeof(j_name), "CUDA%d", j);
  1472. fprintf(f, " <route src='%s' dst='%s' symmetrical='NO'><link_ctn id='%s-%s'/><link_ctn id='Share'/></route>\n", i_name, j_name, i_name, j_name);
  1473. }
  1474. }
  1475. #endif
  1476. #endif
  1477. #ifdef STARPU_USE_OPENCL
  1478. for (i = 0; i < nopencl; i++)
  1479. {
  1480. char i_name[16];
  1481. snprintf(i_name, sizeof(i_name), "OpenCL%d", i);
  1482. fprintf(f, " <route src='RAM' dst='%s' symmetrical='NO'><link_ctn id='RAM-%s'/><link_ctn id='Share'/></route>\n", i_name, i_name);
  1483. fprintf(f, " <route src='%s' dst='RAM' symmetrical='NO'><link_ctn id='%s-RAM'/><link_ctn id='Share'/></route>\n", i_name, i_name);
  1484. }
  1485. #endif
  1486. fprintf(f,
  1487. " </AS>\n"
  1488. " </platform>\n"
  1489. );
  1490. fclose(f);
  1491. }
  1492. static void generate_bus_platform_file(void)
  1493. {
  1494. if (!was_benchmarked)
  1495. benchmark_all_gpu_devices();
  1496. write_bus_platform_file_content();
  1497. }
  1498. static void check_bus_platform_file(void)
  1499. {
  1500. int res;
  1501. char path[256];
  1502. _starpu_simgrid_get_platform_path(path, sizeof(path));
  1503. res = access(path, F_OK);
  1504. if (res)
  1505. {
  1506. /* File does not exist yet */
  1507. generate_bus_platform_file();
  1508. }
  1509. }
  1510. /*
  1511. * Generic
  1512. */
  1513. static void _starpu_bus_force_sampling(void)
  1514. {
  1515. _STARPU_DEBUG("Force bus sampling ...\n");
  1516. _starpu_create_sampling_directory_if_needed();
  1517. generate_bus_affinity_file();
  1518. generate_bus_latency_file();
  1519. generate_bus_bandwidth_file();
  1520. generate_bus_config_file();
  1521. generate_bus_platform_file();
  1522. }
  1523. #endif /* !SIMGRID */
  1524. void _starpu_load_bus_performance_files(void)
  1525. {
  1526. _starpu_create_sampling_directory_if_needed();
  1527. #if defined(STARPU_USE_CUDA) || defined(STARPU_USE_SIMGRID)
  1528. ncuda = _starpu_get_cuda_device_count();
  1529. #endif
  1530. #if defined(STARPU_USE_OPENCL) || defined(STARPU_USE_SIMGRID)
  1531. nopencl = _starpu_opencl_get_device_count();
  1532. #endif
  1533. #ifndef STARPU_SIMGRID
  1534. check_bus_config_file();
  1535. load_bus_affinity_file();
  1536. #endif
  1537. load_bus_latency_file();
  1538. load_bus_bandwidth_file();
  1539. #ifndef STARPU_SIMGRID
  1540. check_bus_platform_file();
  1541. #endif
  1542. }
  1543. /* (in MB/s) */
  1544. double _starpu_transfer_bandwidth(unsigned src_node, unsigned dst_node)
  1545. {
  1546. return bandwidth_matrix[src_node][dst_node];
  1547. }
  1548. /* (in µs) */
  1549. double _starpu_transfer_latency(unsigned src_node, unsigned dst_node)
  1550. {
  1551. return latency_matrix[src_node][dst_node];
  1552. }
  1553. /* (in µs) */
  1554. double _starpu_predict_transfer_time(unsigned src_node, unsigned dst_node, size_t size)
  1555. {
  1556. double bandwidth = bandwidth_matrix[src_node][dst_node];
  1557. double latency = latency_matrix[src_node][dst_node];
  1558. struct starpu_machine_topology *topology = &_starpu_get_machine_config()->topology;
  1559. return latency + (size/bandwidth)*2*(topology->ncudagpus+topology->nopenclgpus);
  1560. }
  1561. /* calculate save bandwidth and latency */
  1562. /* bandwidth in MB/s - latency in µs */
  1563. void _starpu_save_bandwidth_and_latency_disk(double bandwidth_write, double bandwidth_read, double latency_write, double latency_read, unsigned node)
  1564. {
  1565. unsigned int i, j;
  1566. double slowness_disk_between_main_ram, slowness_main_ram_between_node;
  1567. /* save bandwith */
  1568. for(i = 0; i < STARPU_MAXNODES; ++i)
  1569. {
  1570. for(j = 0; j < STARPU_MAXNODES; ++j)
  1571. {
  1572. if (i == j && j == node) /* source == destination == node */
  1573. {
  1574. bandwidth_matrix[i][j] = 0;
  1575. }
  1576. else if (i == node) /* source == disk */
  1577. {
  1578. /* convert in slowness */
  1579. if(bandwidth_read != 0)
  1580. slowness_disk_between_main_ram = 1/bandwidth_read;
  1581. else
  1582. slowness_disk_between_main_ram = 0;
  1583. if(bandwidth_matrix[STARPU_MAIN_RAM][j] != 0)
  1584. slowness_main_ram_between_node = 1/bandwidth_matrix[STARPU_MAIN_RAM][j];
  1585. else
  1586. slowness_main_ram_between_node = 0;
  1587. bandwidth_matrix[i][j] = 1/(slowness_disk_between_main_ram+slowness_main_ram_between_node);
  1588. }
  1589. else if (j == node) /* destination == disk */
  1590. {
  1591. /* convert in slowness */
  1592. if(bandwidth_write != 0)
  1593. slowness_disk_between_main_ram = 1/bandwidth_write;
  1594. else
  1595. slowness_disk_between_main_ram = 0;
  1596. if(bandwidth_matrix[i][STARPU_MAIN_RAM] != 0)
  1597. slowness_main_ram_between_node = 1/bandwidth_matrix[i][STARPU_MAIN_RAM];
  1598. else
  1599. slowness_main_ram_between_node = 0;
  1600. bandwidth_matrix[i][j] = 1/(slowness_disk_between_main_ram+slowness_main_ram_between_node);
  1601. }
  1602. else if (j > node || i > node) /* not affected by the node */
  1603. {
  1604. bandwidth_matrix[i][j] = NAN;
  1605. }
  1606. }
  1607. }
  1608. /* save latency */
  1609. for(i = 0; i < STARPU_MAXNODES; ++i)
  1610. {
  1611. for(j = 0; j < STARPU_MAXNODES; ++j)
  1612. {
  1613. if (i == j && j == node) /* source == destination == node */
  1614. {
  1615. latency_matrix[i][j] = 0;
  1616. }
  1617. else if (i == node) /* source == disk */
  1618. {
  1619. latency_matrix[i][j] = (latency_write+latency_matrix[STARPU_MAIN_RAM][j]);
  1620. }
  1621. else if (j == node) /* destination == disk */
  1622. {
  1623. latency_matrix[i][j] = (latency_read+latency_matrix[i][STARPU_MAIN_RAM]);
  1624. }
  1625. else if (j > node || i > node) /* not affected by the node */
  1626. {
  1627. latency_matrix[i][j] = NAN;
  1628. }
  1629. }
  1630. }
  1631. }