perfmodel_bus.c 52 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971
  1. /* StarPU --- Runtime system for heterogeneous multicore architectures.
  2. *
  3. * Copyright (C) 2009-2014 Université de Bordeaux 1
  4. * Copyright (C) 2010, 2011, 2012, 2013 Centre National de la Recherche Scientifique
  5. * Copyright (C) 2013 Corentin Salingue
  6. *
  7. * StarPU is free software; you can redistribute it and/or modify
  8. * it under the terms of the GNU Lesser General Public License as published by
  9. * the Free Software Foundation; either version 2.1 of the License, or (at
  10. * your option) any later version.
  11. *
  12. * StarPU is distributed in the hope that it will be useful, but
  13. * WITHOUT ANY WARRANTY; without even the implied warranty of
  14. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  15. *
  16. * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  17. */
  18. #ifdef STARPU_USE_CUDA
  19. #ifndef _GNU_SOURCE
  20. #define _GNU_SOURCE
  21. #endif
  22. #include <sched.h>
  23. #endif
  24. #include <unistd.h>
  25. #include <sys/time.h>
  26. #include <stdlib.h>
  27. #include <math.h>
  28. #include <starpu.h>
  29. #include <starpu_cuda.h>
  30. #include <starpu_opencl.h>
  31. #include <common/config.h>
  32. #include <core/workers.h>
  33. #include <core/perfmodel/perfmodel.h>
  34. #include <core/simgrid.h>
  35. #include <common/utils.h>
  36. #ifdef STARPU_USE_OPENCL
  37. #include <starpu_opencl.h>
  38. #endif
  39. #ifdef STARPU_HAVE_WINDOWS
  40. #include <windows.h>
  41. #endif
  42. #define SIZE (32*1024*1024*sizeof(char))
  43. #define NITER 128
  44. #ifndef STARPU_SIMGRID
  45. static void _starpu_bus_force_sampling(void);
  46. #endif
  47. /* timing is in µs per byte (i.e. slowness, inverse of bandwidth) */
  48. struct dev_timing
  49. {
  50. int cpu_id;
  51. double timing_htod;
  52. double latency_htod;
  53. double timing_dtoh;
  54. double latency_dtoh;
  55. };
  56. /* TODO: measure latency */
  57. static double bandwidth_matrix[STARPU_MAXNODES][STARPU_MAXNODES];
  58. static double latency_matrix[STARPU_MAXNODES][STARPU_MAXNODES];
  59. static unsigned was_benchmarked = 0;
  60. static unsigned ncpus = 0;
  61. static unsigned ncuda = 0;
  62. static unsigned nopencl = 0;
  63. static unsigned nmic = 0;
  64. /* Benchmarking the performance of the bus */
  65. #ifdef STARPU_USE_CUDA
  66. static uint64_t cuda_size[STARPU_MAXCUDADEVS];
  67. static int cuda_affinity_matrix[STARPU_MAXCUDADEVS][STARPU_MAXCPUS];
  68. static double cudadev_timing_htod[STARPU_MAXNODES] = {0.0};
  69. static double cudadev_latency_htod[STARPU_MAXNODES] = {0.0};
  70. static double cudadev_timing_dtoh[STARPU_MAXNODES] = {0.0};
  71. static double cudadev_latency_dtoh[STARPU_MAXNODES] = {0.0};
  72. #ifdef HAVE_CUDA_MEMCPY_PEER
  73. static double cudadev_timing_dtod[STARPU_MAXNODES][STARPU_MAXNODES] = {{0.0}};
  74. static double cudadev_latency_dtod[STARPU_MAXNODES][STARPU_MAXNODES] = {{0.0}};
  75. #endif
  76. static struct dev_timing cudadev_timing_per_cpu[STARPU_MAXNODES*STARPU_MAXCPUS];
  77. #endif
  78. #ifdef STARPU_USE_OPENCL
  79. static uint64_t opencl_size[STARPU_MAXCUDADEVS];
  80. static int opencl_affinity_matrix[STARPU_MAXOPENCLDEVS][STARPU_MAXCPUS];
  81. static double opencldev_timing_htod[STARPU_MAXNODES] = {0.0};
  82. static double opencldev_latency_htod[STARPU_MAXNODES] = {0.0};
  83. static double opencldev_timing_dtoh[STARPU_MAXNODES] = {0.0};
  84. static double opencldev_latency_dtoh[STARPU_MAXNODES] = {0.0};
  85. static struct dev_timing opencldev_timing_per_cpu[STARPU_MAXNODES*STARPU_MAXCPUS];
  86. #endif
  87. #ifdef STARPU_USE_MIC
  88. static double mic_time_host_to_device[STARPU_MAXNODES] = {0.0};
  89. static double mic_time_device_to_host[STARPU_MAXNODES] = {0.0};
  90. #endif /* STARPU_USE_MIC */
  91. #ifdef STARPU_HAVE_HWLOC
  92. static hwloc_topology_t hwtopology;
  93. #endif
  94. #if (defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)) && !defined(STARPU_SIMGRID)
  95. #ifdef STARPU_USE_CUDA
  96. static void measure_bandwidth_between_host_and_dev_on_cpu_with_cuda(int dev, int cpu, struct dev_timing *dev_timing_per_cpu)
  97. {
  98. struct _starpu_machine_config *config = _starpu_get_machine_config();
  99. _starpu_bind_thread_on_cpu(config, cpu);
  100. size_t size = SIZE;
  101. /* Initialize CUDA context on the device */
  102. /* We do not need to enable OpenGL interoperability at this point,
  103. * since we cleanly shutdown CUDA before returning. */
  104. cudaSetDevice(dev);
  105. /* hack to avoid third party libs to rebind threads */
  106. _starpu_bind_thread_on_cpu(config, cpu);
  107. /* hack to force the initialization */
  108. cudaFree(0);
  109. /* hack to avoid third party libs to rebind threads */
  110. _starpu_bind_thread_on_cpu(config, cpu);
  111. /* Get the maximum size which can be allocated on the device */
  112. struct cudaDeviceProp prop;
  113. cudaError_t cures;
  114. cures = cudaGetDeviceProperties(&prop, dev);
  115. if (STARPU_UNLIKELY(cures)) STARPU_CUDA_REPORT_ERROR(cures);
  116. cuda_size[dev] = prop.totalGlobalMem;
  117. if (size > prop.totalGlobalMem/4) size = prop.totalGlobalMem/4;
  118. /* Allocate a buffer on the device */
  119. unsigned char *d_buffer;
  120. cures = cudaMalloc((void **)&d_buffer, size);
  121. STARPU_ASSERT(cures == cudaSuccess);
  122. /* hack to avoid third party libs to rebind threads */
  123. _starpu_bind_thread_on_cpu(config, cpu);
  124. /* Allocate a buffer on the host */
  125. unsigned char *h_buffer;
  126. cures = cudaHostAlloc((void **)&h_buffer, size, 0);
  127. STARPU_ASSERT(cures == cudaSuccess);
  128. /* hack to avoid third party libs to rebind threads */
  129. _starpu_bind_thread_on_cpu(config, cpu);
  130. /* Fill them */
  131. memset(h_buffer, 0, size);
  132. cudaMemset(d_buffer, 0, size);
  133. /* hack to avoid third party libs to rebind threads */
  134. _starpu_bind_thread_on_cpu(config, cpu);
  135. unsigned iter;
  136. double timing;
  137. struct timeval start;
  138. struct timeval end;
  139. /* Measure upload bandwidth */
  140. gettimeofday(&start, NULL);
  141. for (iter = 0; iter < NITER; iter++)
  142. {
  143. cudaMemcpy(d_buffer, h_buffer, size, cudaMemcpyHostToDevice);
  144. cudaThreadSynchronize();
  145. }
  146. gettimeofday(&end, NULL);
  147. timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
  148. dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+cpu].timing_htod = timing/NITER/size;
  149. /* Measure download bandwidth */
  150. gettimeofday(&start, NULL);
  151. for (iter = 0; iter < NITER; iter++)
  152. {
  153. cudaMemcpy(h_buffer, d_buffer, size, cudaMemcpyDeviceToHost);
  154. cudaThreadSynchronize();
  155. }
  156. gettimeofday(&end, NULL);
  157. timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
  158. dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+cpu].timing_dtoh = timing/NITER/size;
  159. /* Measure upload latency */
  160. gettimeofday(&start, NULL);
  161. for (iter = 0; iter < NITER; iter++)
  162. {
  163. cudaMemcpy(d_buffer, h_buffer, 1, cudaMemcpyHostToDevice);
  164. cudaThreadSynchronize();
  165. }
  166. gettimeofday(&end, NULL);
  167. timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
  168. dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+cpu].latency_htod = timing/NITER;
  169. /* Measure download latency */
  170. gettimeofday(&start, NULL);
  171. for (iter = 0; iter < NITER; iter++)
  172. {
  173. cudaMemcpy(d_buffer, h_buffer, 1, cudaMemcpyHostToDevice);
  174. cudaThreadSynchronize();
  175. }
  176. gettimeofday(&end, NULL);
  177. timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
  178. dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+cpu].latency_dtoh = timing/NITER;
  179. /* Free buffers */
  180. cudaFreeHost(h_buffer);
  181. cudaFree(d_buffer);
  182. cudaThreadExit();
  183. }
  184. #ifdef HAVE_CUDA_MEMCPY_PEER
  185. static void measure_bandwidth_between_dev_and_dev_cuda(int src, int dst)
  186. {
  187. size_t size = SIZE;
  188. int can;
  189. /* Get the maximum size which can be allocated on the device */
  190. struct cudaDeviceProp prop;
  191. cudaError_t cures;
  192. cures = cudaGetDeviceProperties(&prop, src);
  193. if (STARPU_UNLIKELY(cures)) STARPU_CUDA_REPORT_ERROR(cures);
  194. if (size > prop.totalGlobalMem/4) size = prop.totalGlobalMem/4;
  195. cures = cudaGetDeviceProperties(&prop, dst);
  196. if (STARPU_UNLIKELY(cures)) STARPU_CUDA_REPORT_ERROR(cures);
  197. if (size > prop.totalGlobalMem/4) size = prop.totalGlobalMem/4;
  198. /* Initialize CUDA context on the source */
  199. /* We do not need to enable OpenGL interoperability at this point,
  200. * since we cleanly shutdown CUDA before returning. */
  201. cudaSetDevice(src);
  202. if (starpu_get_env_number("STARPU_ENABLE_CUDA_GPU_GPU_DIRECT") != 0)
  203. {
  204. cures = cudaDeviceCanAccessPeer(&can, src, dst);
  205. if (!cures && can)
  206. {
  207. cures = cudaDeviceEnablePeerAccess(dst, 0);
  208. if (!cures)
  209. _STARPU_DISP("GPU-Direct %d -> %d\n", dst, src);
  210. }
  211. }
  212. /* Allocate a buffer on the device */
  213. unsigned char *s_buffer;
  214. cures = cudaMalloc((void **)&s_buffer, size);
  215. STARPU_ASSERT(cures == cudaSuccess);
  216. cudaMemset(s_buffer, 0, size);
  217. /* Initialize CUDA context on the destination */
  218. /* We do not need to enable OpenGL interoperability at this point,
  219. * since we cleanly shutdown CUDA before returning. */
  220. cudaSetDevice(dst);
  221. if (starpu_get_env_number("STARPU_ENABLE_CUDA_GPU_GPU_DIRECT") != 0)
  222. {
  223. cures = cudaDeviceCanAccessPeer(&can, dst, src);
  224. if (!cures && can)
  225. {
  226. cures = cudaDeviceEnablePeerAccess(src, 0);
  227. if (!cures)
  228. _STARPU_DISP("GPU-Direct %d -> %d\n", src, dst);
  229. }
  230. }
  231. /* Allocate a buffer on the device */
  232. unsigned char *d_buffer;
  233. cures = cudaMalloc((void **)&d_buffer, size);
  234. STARPU_ASSERT(cures == cudaSuccess);
  235. cudaMemset(d_buffer, 0, size);
  236. unsigned iter;
  237. double timing;
  238. struct timeval start;
  239. struct timeval end;
  240. /* Measure upload bandwidth */
  241. gettimeofday(&start, NULL);
  242. for (iter = 0; iter < NITER; iter++)
  243. {
  244. cudaMemcpyPeer(d_buffer, dst, s_buffer, src, size);
  245. cudaThreadSynchronize();
  246. }
  247. gettimeofday(&end, NULL);
  248. timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
  249. cudadev_timing_dtod[src+1][dst+1] = timing/NITER/size;
  250. /* Measure upload latency */
  251. gettimeofday(&start, NULL);
  252. for (iter = 0; iter < NITER; iter++)
  253. {
  254. cudaMemcpyPeer(d_buffer, dst, s_buffer, src, 1);
  255. cudaThreadSynchronize();
  256. }
  257. gettimeofday(&end, NULL);
  258. timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
  259. cudadev_latency_dtod[src+1][dst+1] = timing/NITER;
  260. /* Free buffers */
  261. cudaFree(d_buffer);
  262. cudaSetDevice(src);
  263. cudaFree(s_buffer);
  264. cudaThreadExit();
  265. }
  266. #endif
  267. #endif
  268. #ifdef STARPU_USE_OPENCL
  269. static void measure_bandwidth_between_host_and_dev_on_cpu_with_opencl(int dev, int cpu, struct dev_timing *dev_timing_per_cpu)
  270. {
  271. cl_context context;
  272. cl_command_queue queue;
  273. cl_int err=0;
  274. size_t size = SIZE;
  275. int not_initialized;
  276. struct _starpu_machine_config *config = _starpu_get_machine_config();
  277. _starpu_bind_thread_on_cpu(config, cpu);
  278. /* Is the context already initialised ? */
  279. starpu_opencl_get_context(dev, &context);
  280. not_initialized = (context == NULL);
  281. if (not_initialized == 1)
  282. _starpu_opencl_init_context(dev);
  283. /* Get context and queue */
  284. starpu_opencl_get_context(dev, &context);
  285. starpu_opencl_get_queue(dev, &queue);
  286. /* Get the maximum size which can be allocated on the device */
  287. cl_device_id device;
  288. cl_ulong maxMemAllocSize;
  289. starpu_opencl_get_device(dev, &device);
  290. err = clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(maxMemAllocSize), &maxMemAllocSize, NULL);
  291. if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
  292. opencl_size[dev] = maxMemAllocSize;
  293. if (size > (size_t)maxMemAllocSize/4) size = maxMemAllocSize/4;
  294. if (_starpu_opencl_get_device_type(dev) == CL_DEVICE_TYPE_CPU)
  295. {
  296. /* Let's not use too much RAM when running OpenCL on a CPU: it
  297. * would make the OS swap like crazy. */
  298. size /= 2;
  299. }
  300. /* hack to avoid third party libs to rebind threads */
  301. _starpu_bind_thread_on_cpu(config, cpu);
  302. /* Allocate a buffer on the device */
  303. cl_mem d_buffer;
  304. d_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, &err);
  305. if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
  306. /* hack to avoid third party libs to rebind threads */
  307. _starpu_bind_thread_on_cpu(config, cpu);
  308. /* Allocate a buffer on the host */
  309. unsigned char *h_buffer;
  310. h_buffer = (unsigned char *)malloc(size);
  311. STARPU_ASSERT(h_buffer);
  312. /* hack to avoid third party libs to rebind threads */
  313. _starpu_bind_thread_on_cpu(config, cpu);
  314. /* Fill them */
  315. memset(h_buffer, 0, size);
  316. err = clEnqueueWriteBuffer(queue, d_buffer, CL_TRUE, 0, size, h_buffer, 0, NULL, NULL);
  317. if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
  318. clFinish(queue);
  319. /* hack to avoid third party libs to rebind threads */
  320. _starpu_bind_thread_on_cpu(config, cpu);
  321. unsigned iter;
  322. double timing;
  323. struct timeval start;
  324. struct timeval end;
  325. /* Measure upload bandwidth */
  326. gettimeofday(&start, NULL);
  327. for (iter = 0; iter < NITER; iter++)
  328. {
  329. err = clEnqueueWriteBuffer(queue, d_buffer, CL_TRUE, 0, size, h_buffer, 0, NULL, NULL);
  330. if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
  331. clFinish(queue);
  332. }
  333. gettimeofday(&end, NULL);
  334. timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
  335. dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+cpu].timing_htod = timing/NITER/size;
  336. /* Measure download bandwidth */
  337. gettimeofday(&start, NULL);
  338. for (iter = 0; iter < NITER; iter++)
  339. {
  340. err = clEnqueueReadBuffer(queue, d_buffer, CL_TRUE, 0, size, h_buffer, 0, NULL, NULL);
  341. if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
  342. clFinish(queue);
  343. }
  344. gettimeofday(&end, NULL);
  345. timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
  346. dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+cpu].timing_dtoh = timing/NITER/size;
  347. /* Measure upload latency */
  348. gettimeofday(&start, NULL);
  349. for (iter = 0; iter < NITER; iter++)
  350. {
  351. err = clEnqueueWriteBuffer(queue, d_buffer, CL_TRUE, 0, 1, h_buffer, 0, NULL, NULL);
  352. if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
  353. clFinish(queue);
  354. }
  355. gettimeofday(&end, NULL);
  356. timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
  357. dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+cpu].latency_htod = timing/NITER;
  358. /* Measure download latency */
  359. gettimeofday(&start, NULL);
  360. for (iter = 0; iter < NITER; iter++)
  361. {
  362. err = clEnqueueReadBuffer(queue, d_buffer, CL_TRUE, 0, 1, h_buffer, 0, NULL, NULL);
  363. if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
  364. clFinish(queue);
  365. }
  366. gettimeofday(&end, NULL);
  367. timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
  368. dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+cpu].latency_dtoh = timing/NITER;
  369. /* Free buffers */
  370. err = clReleaseMemObject(d_buffer);
  371. if (STARPU_UNLIKELY(err != CL_SUCCESS))
  372. STARPU_OPENCL_REPORT_ERROR(err);
  373. free(h_buffer);
  374. /* Uninitiliaze OpenCL context on the device */
  375. if (not_initialized == 1)
  376. _starpu_opencl_deinit_context(dev);
  377. }
  378. #endif
  379. /* NB: we want to sort the bandwidth by DECREASING order */
  380. static int compar_dev_timing(const void *left_dev_timing, const void *right_dev_timing)
  381. {
  382. const struct dev_timing *left = (const struct dev_timing *)left_dev_timing;
  383. const struct dev_timing *right = (const struct dev_timing *)right_dev_timing;
  384. double left_dtoh = left->timing_dtoh;
  385. double left_htod = left->timing_htod;
  386. double right_dtoh = right->timing_dtoh;
  387. double right_htod = right->timing_htod;
  388. double timing_sum2_left = left_dtoh*left_dtoh + left_htod*left_htod;
  389. double timing_sum2_right = right_dtoh*right_dtoh + right_htod*right_htod;
  390. /* it's for a decreasing sorting */
  391. return (timing_sum2_left > timing_sum2_right);
  392. }
  393. #ifdef STARPU_HAVE_HWLOC
  394. static int find_numa_node(hwloc_obj_t obj)
  395. {
  396. STARPU_ASSERT(obj);
  397. hwloc_obj_t current = obj;
  398. while (current->depth != HWLOC_OBJ_NODE)
  399. {
  400. current = current->parent;
  401. /* If we don't find a "node" obj before the root, this means
  402. * hwloc does not know whether there are numa nodes or not, so
  403. * we should not use a per-node sampling in that case. */
  404. STARPU_ASSERT(current);
  405. }
  406. STARPU_ASSERT(current->depth == HWLOC_OBJ_NODE);
  407. return current->logical_index;
  408. }
  409. #endif
  410. static void measure_bandwidth_between_cpus_and_dev(int dev, struct dev_timing *dev_timing_per_cpu, char *type)
  411. {
  412. /* Either we have hwloc and we measure the bandwith between each GPU
  413. * and each NUMA node, or we don't have such NUMA information and we
  414. * measure the bandwith for each pair of (CPU, GPU), which is slower.
  415. * */
  416. #ifdef STARPU_HAVE_HWLOC
  417. int cpu_depth = hwloc_get_type_depth(hwtopology, HWLOC_OBJ_CORE);
  418. int nnuma_nodes = hwloc_get_nbobjs_by_depth(hwtopology, HWLOC_OBJ_NODE);
  419. /* If no NUMA node was found, we assume that we have a single memory
  420. * bank. */
  421. const unsigned no_node_obj_was_found = (nnuma_nodes == 0);
  422. unsigned *is_available_per_numa_node = NULL;
  423. double *dev_timing_htod_per_numa_node = NULL;
  424. double *dev_latency_htod_per_numa_node = NULL;
  425. double *dev_timing_dtoh_per_numa_node = NULL;
  426. double *dev_latency_dtoh_per_numa_node = NULL;
  427. if (!no_node_obj_was_found)
  428. {
  429. is_available_per_numa_node = (unsigned *)malloc(nnuma_nodes * sizeof(unsigned));
  430. STARPU_ASSERT(is_available_per_numa_node);
  431. dev_timing_htod_per_numa_node = (double *)malloc(nnuma_nodes * sizeof(double));
  432. STARPU_ASSERT(dev_timing_htod_per_numa_node);
  433. dev_latency_htod_per_numa_node = (double *)malloc(nnuma_nodes * sizeof(double));
  434. STARPU_ASSERT(dev_latency_htod_per_numa_node);
  435. dev_timing_dtoh_per_numa_node = (double *)malloc(nnuma_nodes * sizeof(double));
  436. STARPU_ASSERT(dev_timing_dtoh_per_numa_node);
  437. dev_latency_dtoh_per_numa_node = (double *)malloc(nnuma_nodes * sizeof(double));
  438. STARPU_ASSERT(dev_latency_dtoh_per_numa_node);
  439. memset(is_available_per_numa_node, 0, nnuma_nodes*sizeof(unsigned));
  440. }
  441. #endif
  442. unsigned cpu;
  443. for (cpu = 0; cpu < ncpus; cpu++)
  444. {
  445. dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+cpu].cpu_id = cpu;
  446. #ifdef STARPU_HAVE_HWLOC
  447. int numa_id = 0;
  448. if (!no_node_obj_was_found)
  449. {
  450. hwloc_obj_t obj = hwloc_get_obj_by_depth(hwtopology, cpu_depth, cpu);
  451. numa_id = find_numa_node(obj);
  452. if (is_available_per_numa_node[numa_id])
  453. {
  454. /* We reuse the previous numbers for that NUMA node */
  455. dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+cpu].timing_htod =
  456. dev_timing_htod_per_numa_node[numa_id];
  457. dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+cpu].latency_htod =
  458. dev_latency_htod_per_numa_node[numa_id];
  459. dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+cpu].timing_dtoh =
  460. dev_timing_dtoh_per_numa_node[numa_id];
  461. dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+cpu].latency_dtoh =
  462. dev_latency_dtoh_per_numa_node[numa_id];
  463. continue;
  464. }
  465. }
  466. #endif
  467. #ifdef STARPU_USE_CUDA
  468. if (strncmp(type, "CUDA", 4) == 0)
  469. measure_bandwidth_between_host_and_dev_on_cpu_with_cuda(dev, cpu, dev_timing_per_cpu);
  470. #endif
  471. #ifdef STARPU_USE_OPENCL
  472. if (strncmp(type, "OpenCL", 6) == 0)
  473. measure_bandwidth_between_host_and_dev_on_cpu_with_opencl(dev, cpu, dev_timing_per_cpu);
  474. #endif
  475. #ifdef STARPU_HAVE_HWLOC
  476. if (!no_node_obj_was_found && !is_available_per_numa_node[numa_id])
  477. {
  478. /* Save the results for that NUMA node */
  479. dev_timing_htod_per_numa_node[numa_id] =
  480. dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+cpu].timing_htod;
  481. dev_latency_htod_per_numa_node[numa_id] =
  482. dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+cpu].latency_htod;
  483. dev_timing_dtoh_per_numa_node[numa_id] =
  484. dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+cpu].timing_dtoh;
  485. dev_latency_dtoh_per_numa_node[numa_id] =
  486. dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+cpu].latency_dtoh;
  487. is_available_per_numa_node[numa_id] = 1;
  488. }
  489. #endif
  490. }
  491. #ifdef STARPU_HAVE_HWLOC
  492. if (!no_node_obj_was_found)
  493. {
  494. free(is_available_per_numa_node);
  495. free(dev_timing_htod_per_numa_node);
  496. free(dev_latency_htod_per_numa_node);
  497. free(dev_timing_dtoh_per_numa_node);
  498. free(dev_latency_dtoh_per_numa_node);
  499. }
  500. #endif /* STARPU_HAVE_HWLOC */
  501. }
  502. static void measure_bandwidth_between_host_and_dev(int dev, double *dev_timing_htod, double *dev_latency_htod,
  503. double *dev_timing_dtoh, double *dev_latency_dtoh,
  504. struct dev_timing *dev_timing_per_cpu, char *type)
  505. {
  506. measure_bandwidth_between_cpus_and_dev(dev, dev_timing_per_cpu, type);
  507. /* sort the results */
  508. qsort(&(dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS]), ncpus,
  509. sizeof(struct dev_timing),
  510. compar_dev_timing);
  511. #ifdef STARPU_VERBOSE
  512. unsigned cpu;
  513. for (cpu = 0; cpu < ncpus; cpu++)
  514. {
  515. unsigned current_cpu = dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+cpu].cpu_id;
  516. double bandwidth_dtoh = dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+cpu].timing_dtoh;
  517. double bandwidth_htod = dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+cpu].timing_htod;
  518. double bandwidth_sum2 = bandwidth_dtoh*bandwidth_dtoh + bandwidth_htod*bandwidth_htod;
  519. _STARPU_DISP("(%10s) BANDWIDTH GPU %d CPU %u - htod %f - dtoh %f - %f\n", type, dev, current_cpu, bandwidth_htod, bandwidth_dtoh, sqrt(bandwidth_sum2));
  520. }
  521. unsigned best_cpu = dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+0].cpu_id;
  522. _STARPU_DISP("(%10s) BANDWIDTH GPU %d BEST CPU %u\n", type, dev, best_cpu);
  523. #endif
  524. /* The results are sorted in a decreasing order, so that the best
  525. * measurement is currently the first entry. */
  526. dev_timing_dtoh[dev+1] = dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+0].timing_dtoh;
  527. dev_latency_dtoh[dev+1] = dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+0].latency_dtoh;
  528. dev_timing_htod[dev+1] = dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+0].timing_htod;
  529. dev_latency_htod[dev+1] = dev_timing_per_cpu[(dev+1)*STARPU_MAXCPUS+0].latency_htod;
  530. }
  531. #endif /* defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL) */
  532. static void benchmark_all_gpu_devices(void)
  533. {
  534. #ifdef STARPU_SIMGRID
  535. _STARPU_DISP("can not measure bus in simgrid mode, please run starpu_calibrate_bus in non-simgrid mode to make sure the bus performance model was calibrated\n");
  536. STARPU_ABORT();
  537. #else /* !SIMGRID */
  538. #if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL) || defined(STARPU_USE_MIC)
  539. unsigned i;
  540. #endif
  541. #ifdef HAVE_CUDA_MEMCPY_PEER
  542. unsigned j;
  543. #endif
  544. _STARPU_DEBUG("Benchmarking the speed of the bus\n");
  545. #ifdef STARPU_HAVE_HWLOC
  546. hwloc_topology_init(&hwtopology);
  547. hwloc_topology_load(hwtopology);
  548. #endif
  549. #ifdef STARPU_HAVE_HWLOC
  550. hwloc_bitmap_t former_cpuset = hwloc_bitmap_alloc();
  551. hwloc_get_cpubind(hwtopology, former_cpuset, HWLOC_CPUBIND_THREAD);
  552. #elif __linux__
  553. /* Save the current cpu binding */
  554. cpu_set_t former_process_affinity;
  555. int ret;
  556. ret = sched_getaffinity(0, sizeof(former_process_affinity), &former_process_affinity);
  557. if (ret)
  558. {
  559. perror("sched_getaffinity");
  560. STARPU_ABORT();
  561. }
  562. #else
  563. #warning Missing binding support, StarPU will not be able to properly benchmark NUMA topology
  564. #endif
  565. struct _starpu_machine_config *config = _starpu_get_machine_config();
  566. ncpus = _starpu_topology_get_nhwcpu(config);
  567. #ifdef STARPU_USE_CUDA
  568. ncuda = _starpu_get_cuda_device_count();
  569. for (i = 0; i < ncuda; i++)
  570. {
  571. _STARPU_DISP("CUDA %d...\n", i);
  572. /* measure bandwidth between Host and Device i */
  573. measure_bandwidth_between_host_and_dev(i, cudadev_timing_htod, cudadev_latency_htod, cudadev_timing_dtoh, cudadev_latency_dtoh, cudadev_timing_per_cpu, "CUDA");
  574. }
  575. #ifdef HAVE_CUDA_MEMCPY_PEER
  576. for (i = 0; i < ncuda; i++)
  577. for (j = 0; j < ncuda; j++)
  578. if (i != j)
  579. {
  580. _STARPU_DISP("CUDA %d -> %d...\n", i, j);
  581. /* measure bandwidth between Host and Device i */
  582. measure_bandwidth_between_dev_and_dev_cuda(i, j);
  583. }
  584. #endif
  585. #endif
  586. #ifdef STARPU_USE_OPENCL
  587. nopencl = _starpu_opencl_get_device_count();
  588. for (i = 0; i < nopencl; i++)
  589. {
  590. _STARPU_DISP("OpenCL %d...\n", i);
  591. /* measure bandwith between Host and Device i */
  592. measure_bandwidth_between_host_and_dev(i, opencldev_timing_htod, opencldev_latency_htod, opencldev_timing_dtoh, opencldev_latency_dtoh, opencldev_timing_per_cpu, "OpenCL");
  593. }
  594. #endif
  595. #ifdef STARPU_USE_MIC
  596. /* TODO: implement real calibration ! For now we only put an arbitrary
  597. * value for each device during at the declaration as a bug fix, else
  598. * we get problems on heft scheduler */
  599. nmic = _starpu_mic_src_get_device_count();
  600. for (i = 0; i < STARPU_MAXNODES; i++)
  601. {
  602. mic_time_host_to_device[i] = 0.1;
  603. mic_time_device_to_host[i] = 0.1;
  604. }
  605. #endif /* STARPU_USE_MIC */
  606. #ifdef STARPU_HAVE_HWLOC
  607. hwloc_set_cpubind(hwtopology, former_cpuset, HWLOC_CPUBIND_THREAD);
  608. hwloc_bitmap_free(former_cpuset);
  609. #elif __linux__
  610. /* Restore the former affinity */
  611. ret = sched_setaffinity(0, sizeof(former_process_affinity), &former_process_affinity);
  612. if (ret)
  613. {
  614. perror("sched_setaffinity");
  615. STARPU_ABORT();
  616. }
  617. #endif
  618. #ifdef STARPU_HAVE_HWLOC
  619. hwloc_topology_destroy(hwtopology);
  620. #endif
  621. _STARPU_DEBUG("Benchmarking the speed of the bus is done.\n");
  622. was_benchmarked = 1;
  623. #endif /* !SIMGRID */
  624. }
  625. static void get_bus_path(const char *type, char *path, size_t maxlen)
  626. {
  627. _starpu_get_perf_model_dir_bus(path, maxlen);
  628. char hostname[65];
  629. _starpu_gethostname(hostname, sizeof(hostname));
  630. strncat(path, hostname, maxlen);
  631. strncat(path, ".", maxlen);
  632. strncat(path, type, maxlen);
  633. }
  634. /*
  635. * Affinity
  636. */
  637. #ifndef STARPU_SIMGRID
  638. static void get_affinity_path(char *path, size_t maxlen)
  639. {
  640. get_bus_path("affinity", path, maxlen);
  641. }
  642. static void load_bus_affinity_file_content(void)
  643. {
  644. #if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
  645. FILE *f;
  646. char path[256];
  647. get_affinity_path(path, sizeof(path));
  648. _STARPU_DEBUG("loading affinities from %s\n", path);
  649. f = fopen(path, "r");
  650. STARPU_ASSERT(f);
  651. struct _starpu_machine_config *config = _starpu_get_machine_config();
  652. ncpus = _starpu_topology_get_nhwcpu(config);
  653. unsigned gpu;
  654. #ifdef STARPU_USE_CUDA
  655. ncuda = _starpu_get_cuda_device_count();
  656. for (gpu = 0; gpu < ncuda; gpu++)
  657. {
  658. int ret;
  659. unsigned dummy;
  660. _starpu_drop_comments(f);
  661. ret = fscanf(f, "%d\t", &dummy);
  662. STARPU_ASSERT(ret == 1);
  663. STARPU_ASSERT(dummy == gpu);
  664. unsigned cpu;
  665. for (cpu = 0; cpu < ncpus; cpu++)
  666. {
  667. ret = fscanf(f, "%d\t", &cuda_affinity_matrix[gpu][cpu]);
  668. STARPU_ASSERT(ret == 1);
  669. }
  670. ret = fscanf(f, "\n");
  671. STARPU_ASSERT(ret == 0);
  672. }
  673. #endif /* !STARPU_USE_CUDA */
  674. #ifdef STARPU_USE_OPENCL
  675. nopencl = _starpu_opencl_get_device_count();
  676. for (gpu = 0; gpu < nopencl; gpu++)
  677. {
  678. int ret;
  679. unsigned dummy;
  680. _starpu_drop_comments(f);
  681. ret = fscanf(f, "%d\t", &dummy);
  682. STARPU_ASSERT(ret == 1);
  683. STARPU_ASSERT(dummy == gpu);
  684. unsigned cpu;
  685. for (cpu = 0; cpu < ncpus; cpu++)
  686. {
  687. ret = fscanf(f, "%d\t", &opencl_affinity_matrix[gpu][cpu]);
  688. STARPU_ASSERT(ret == 1);
  689. }
  690. ret = fscanf(f, "\n");
  691. STARPU_ASSERT(ret == 0);
  692. }
  693. #endif /* !STARPU_USE_OPENCL */
  694. fclose(f);
  695. #endif /* !(STARPU_USE_CUDA_ || STARPU_USE_OPENCL */
  696. }
  697. #ifndef STARPU_SIMGRID
  698. static void write_bus_affinity_file_content(void)
  699. {
  700. STARPU_ASSERT(was_benchmarked);
  701. #if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
  702. FILE *f;
  703. char path[256];
  704. get_affinity_path(path, sizeof(path));
  705. _STARPU_DEBUG("writing affinities to %s\n", path);
  706. f = fopen(path, "w+");
  707. if (!f)
  708. {
  709. perror("fopen write_buf_affinity_file_content");
  710. _STARPU_DISP("path '%s'\n", path);
  711. fflush(stderr);
  712. STARPU_ABORT();
  713. }
  714. unsigned cpu;
  715. unsigned gpu;
  716. fprintf(f, "# GPU\t");
  717. for (cpu = 0; cpu < ncpus; cpu++)
  718. fprintf(f, "CPU%u\t", cpu);
  719. fprintf(f, "\n");
  720. #ifdef STARPU_USE_CUDA
  721. for (gpu = 0; gpu < ncuda; gpu++)
  722. {
  723. fprintf(f, "%d\t", gpu);
  724. for (cpu = 0; cpu < ncpus; cpu++)
  725. {
  726. fprintf(f, "%d\t", cudadev_timing_per_cpu[(gpu+1)*STARPU_MAXCPUS+cpu].cpu_id);
  727. }
  728. fprintf(f, "\n");
  729. }
  730. #endif
  731. #ifdef STARPU_USE_OPENCL
  732. for (gpu = 0; gpu < nopencl; gpu++)
  733. {
  734. fprintf(f, "%d\t", gpu);
  735. for (cpu = 0; cpu < ncpus; cpu++)
  736. {
  737. fprintf(f, "%d\t", opencldev_timing_per_cpu[(gpu+1)*STARPU_MAXCPUS+cpu].cpu_id);
  738. }
  739. fprintf(f, "\n");
  740. }
  741. #endif
  742. fclose(f);
  743. #endif
  744. }
  745. #endif /* STARPU_SIMGRID */
  746. static void generate_bus_affinity_file(void)
  747. {
  748. if (!was_benchmarked)
  749. benchmark_all_gpu_devices();
  750. write_bus_affinity_file_content();
  751. }
  752. static void load_bus_affinity_file(void)
  753. {
  754. int res;
  755. char path[256];
  756. get_affinity_path(path, sizeof(path));
  757. res = access(path, F_OK);
  758. if (res)
  759. {
  760. /* File does not exist yet */
  761. generate_bus_affinity_file();
  762. }
  763. load_bus_affinity_file_content();
  764. }
  765. #ifdef STARPU_USE_CUDA
  766. int *_starpu_get_cuda_affinity_vector(unsigned gpuid)
  767. {
  768. return cuda_affinity_matrix[gpuid];
  769. }
  770. #endif /* STARPU_USE_CUDA */
  771. #ifdef STARPU_USE_OPENCL
  772. int *_starpu_get_opencl_affinity_vector(unsigned gpuid)
  773. {
  774. return opencl_affinity_matrix[gpuid];
  775. }
  776. #endif /* STARPU_USE_OPENCL */
  777. void starpu_bus_print_affinity(FILE *f)
  778. {
  779. #if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
  780. unsigned cpu;
  781. unsigned gpu;
  782. #endif
  783. fprintf(f, "# GPU\tCPU in preference order (logical index)\n");
  784. #ifdef STARPU_USE_CUDA
  785. fprintf(f, "# CUDA\n");
  786. for(gpu = 0 ; gpu<ncuda ; gpu++)
  787. {
  788. fprintf(f, "%d\t", gpu);
  789. for (cpu = 0; cpu < ncpus; cpu++)
  790. {
  791. fprintf(f, "%d\t", cuda_affinity_matrix[gpu][cpu]);
  792. }
  793. fprintf(f, "\n");
  794. }
  795. #endif
  796. #ifdef STARPU_USE_OPENCL
  797. fprintf(f, "# OpenCL\n");
  798. for(gpu = 0 ; gpu<nopencl ; gpu++)
  799. {
  800. fprintf(f, "%d\t", gpu);
  801. for (cpu = 0; cpu < ncpus; cpu++)
  802. {
  803. fprintf(f, "%d\t", opencl_affinity_matrix[gpu][cpu]);
  804. }
  805. fprintf(f, "\n");
  806. }
  807. #endif
  808. }
  809. #endif /* STARPU_SIMGRID */
  810. /*
  811. * Latency
  812. */
  813. static void get_latency_path(char *path, size_t maxlen)
  814. {
  815. get_bus_path("latency", path, maxlen);
  816. }
  817. static int load_bus_latency_file_content(void)
  818. {
  819. int n;
  820. unsigned src, dst;
  821. FILE *f;
  822. double latency;
  823. char path[256];
  824. get_latency_path(path, sizeof(path));
  825. _STARPU_DEBUG("loading latencies from %s\n", path);
  826. f = fopen(path, "r");
  827. if (!f)
  828. {
  829. perror("fopen load_bus_latency_file_content");
  830. _STARPU_DISP("path '%s'\n", path);
  831. fflush(stderr);
  832. STARPU_ABORT();
  833. }
  834. for (src = 0; src < STARPU_MAXNODES; src++)
  835. {
  836. _starpu_drop_comments(f);
  837. for (dst = 0; dst < STARPU_MAXNODES; dst++)
  838. {
  839. n = _starpu_read_double(f, "%lf", &latency);
  840. if (n != 1)
  841. {
  842. _STARPU_DISP("Error while reading latency file <%s>. Expected a number\n", path);
  843. fclose(f);
  844. return 0;
  845. }
  846. n = getc(f);
  847. if (n == '\n')
  848. break;
  849. if (n != '\t')
  850. {
  851. _STARPU_DISP("bogus character %c in latency file %s\n", n, path);
  852. fclose(f);
  853. return 0;
  854. }
  855. latency_matrix[src][dst] = latency;
  856. /* Look out for \t\n */
  857. n = getc(f);
  858. if (n == '\n')
  859. break;
  860. ungetc(n, f);
  861. n = '\t';
  862. }
  863. /* No more values, take NAN */
  864. for ( ; dst < STARPU_MAXNODES; dst++)
  865. latency_matrix[src][dst] = NAN;
  866. while (n == '\t')
  867. {
  868. /* Look out for \t\n */
  869. n = getc(f);
  870. if (n == '\n')
  871. break;
  872. ungetc(n, f);
  873. n = _starpu_read_double(f, "%lf", &latency);
  874. if (n && !isnan(latency))
  875. {
  876. _STARPU_DISP("Too many nodes in latency file %s for this configuration (%d)\n", path, STARPU_MAXNODES);
  877. fclose(f);
  878. return 0;
  879. }
  880. n = getc(f);
  881. }
  882. if (n != '\n')
  883. {
  884. _STARPU_DISP("Bogus character %c in latency file %s\n", n, path);
  885. fclose(f);
  886. return 0;
  887. }
  888. /* Look out for EOF */
  889. n = getc(f);
  890. if (n == EOF)
  891. break;
  892. ungetc(n, f);
  893. }
  894. /* No more values, take NAN */
  895. for ( ; src < STARPU_MAXNODES; src++)
  896. for (dst = 0; dst < STARPU_MAXNODES; dst++)
  897. latency_matrix[src][dst] = NAN;
  898. fclose(f);
  899. return 1;
  900. }
  901. #ifndef STARPU_SIMGRID
  902. static void write_bus_latency_file_content(void)
  903. {
  904. unsigned src, dst, maxnode;
  905. FILE *f;
  906. STARPU_ASSERT(was_benchmarked);
  907. char path[256];
  908. get_latency_path(path, sizeof(path));
  909. _STARPU_DEBUG("writing latencies to %s\n", path);
  910. f = fopen(path, "w+");
  911. if (!f)
  912. {
  913. perror("fopen write_bus_latency_file_content");
  914. _STARPU_DISP("path '%s'\n", path);
  915. fflush(stderr);
  916. STARPU_ABORT();
  917. }
  918. fprintf(f, "# ");
  919. for (dst = 0; dst < STARPU_MAXNODES; dst++)
  920. fprintf(f, "to %d\t\t", dst);
  921. fprintf(f, "\n");
  922. maxnode = ncuda;
  923. #ifdef STARPU_USE_OPENCL
  924. maxnode += nopencl;
  925. #endif
  926. #ifdef STARPU_USE_MIC
  927. maxnode += nmic;
  928. #endif
  929. for (src = 0; src < STARPU_MAXNODES; src++)
  930. {
  931. for (dst = 0; dst < STARPU_MAXNODES; dst++)
  932. {
  933. double latency = 0.0;
  934. if ((src > maxnode) || (dst > maxnode))
  935. {
  936. /* convention */
  937. latency = NAN;
  938. }
  939. else if (src == dst)
  940. {
  941. latency = 0.0;
  942. }
  943. else
  944. {
  945. /* µs */
  946. #ifdef STARPU_USE_CUDA
  947. #ifdef HAVE_CUDA_MEMCPY_PEER
  948. if (src && src < ncuda && dst && dst <= ncuda)
  949. latency = cudadev_latency_dtod[src][dst];
  950. else
  951. #endif
  952. {
  953. if (src && src <= ncuda)
  954. latency += cudadev_latency_dtoh[src];
  955. if (dst && dst <= ncuda)
  956. latency += cudadev_latency_htod[dst];
  957. }
  958. #endif
  959. #ifdef STARPU_USE_OPENCL
  960. if (src > ncuda)
  961. latency += opencldev_latency_dtoh[src-ncuda];
  962. if (dst > ncuda)
  963. latency += opencldev_latency_htod[dst-ncuda];
  964. #endif
  965. }
  966. if (dst)
  967. fputc('\t', f);
  968. fprintf(f, "%lf", latency);
  969. }
  970. fprintf(f, "\n");
  971. }
  972. fclose(f);
  973. }
  974. #endif
  975. static void generate_bus_latency_file(void)
  976. {
  977. if (!was_benchmarked)
  978. benchmark_all_gpu_devices();
  979. #ifndef STARPU_SIMGRID
  980. write_bus_latency_file_content();
  981. #endif
  982. }
  983. static void load_bus_latency_file(void)
  984. {
  985. int res;
  986. char path[256];
  987. get_latency_path(path, sizeof(path));
  988. res = access(path, F_OK);
  989. if (res || !load_bus_latency_file_content())
  990. {
  991. /* File does not exist yet or is bogus */
  992. generate_bus_latency_file();
  993. }
  994. }
  995. /*
  996. * Bandwidth
  997. */
  998. static void get_bandwidth_path(char *path, size_t maxlen)
  999. {
  1000. get_bus_path("bandwidth", path, maxlen);
  1001. }
  1002. static int load_bus_bandwidth_file_content(void)
  1003. {
  1004. int n;
  1005. unsigned src, dst;
  1006. FILE *f;
  1007. double bandwidth;
  1008. char path[256];
  1009. get_bandwidth_path(path, sizeof(path));
  1010. _STARPU_DEBUG("loading bandwidth from %s\n", path);
  1011. f = fopen(path, "r");
  1012. if (!f)
  1013. {
  1014. perror("fopen load_bus_bandwidth_file_content");
  1015. _STARPU_DISP("path '%s'\n", path);
  1016. fflush(stderr);
  1017. STARPU_ABORT();
  1018. }
  1019. for (src = 0; src < STARPU_MAXNODES; src++)
  1020. {
  1021. _starpu_drop_comments(f);
  1022. for (dst = 0; dst < STARPU_MAXNODES; dst++)
  1023. {
  1024. n = _starpu_read_double(f, "%lf", &bandwidth);
  1025. if (n != 1)
  1026. {
  1027. _STARPU_DISP("Error while reading bandwidth file <%s>. Expected a number\n", path);
  1028. fclose(f);
  1029. return 0;
  1030. }
  1031. n = getc(f);
  1032. if (n == '\n')
  1033. break;
  1034. if (n != '\t')
  1035. {
  1036. _STARPU_DISP("bogus character %c in bandwidth file %s\n", n, path);
  1037. fclose(f);
  1038. return 0;
  1039. }
  1040. bandwidth_matrix[src][dst] = bandwidth;
  1041. /* Look out for \t\n */
  1042. n = getc(f);
  1043. if (n == '\n')
  1044. break;
  1045. ungetc(n, f);
  1046. n = '\t';
  1047. }
  1048. /* No more values, take NAN */
  1049. for ( ; dst < STARPU_MAXNODES; dst++)
  1050. bandwidth_matrix[src][dst] = NAN;
  1051. while (n == '\t')
  1052. {
  1053. /* Look out for \t\n */
  1054. n = getc(f);
  1055. if (n == '\n')
  1056. break;
  1057. ungetc(n, f);
  1058. n = _starpu_read_double(f, "%lf", &bandwidth);
  1059. if (n && !isnan(bandwidth))
  1060. {
  1061. _STARPU_DISP("Too many nodes in bandwidth file %s for this configuration (%d)\n", path, STARPU_MAXNODES);
  1062. fclose(f);
  1063. return 0;
  1064. }
  1065. n = getc(f);
  1066. }
  1067. if (n != '\n')
  1068. {
  1069. _STARPU_DISP("Bogus character %c in bandwidth file %s\n", n, path);
  1070. fclose(f);
  1071. return 0;
  1072. }
  1073. /* Look out for EOF */
  1074. n = getc(f);
  1075. if (n == EOF)
  1076. break;
  1077. ungetc(n, f);
  1078. }
  1079. /* No more values, take NAN */
  1080. for ( ; src < STARPU_MAXNODES; src++)
  1081. for (dst = 0; dst < STARPU_MAXNODES; dst++)
  1082. latency_matrix[src][dst] = NAN;
  1083. fclose(f);
  1084. return 1;
  1085. }
  1086. #ifndef STARPU_SIMGRID
  1087. static void write_bus_bandwidth_file_content(void)
  1088. {
  1089. unsigned src, dst, maxnode;
  1090. FILE *f;
  1091. STARPU_ASSERT(was_benchmarked);
  1092. char path[256];
  1093. get_bandwidth_path(path, sizeof(path));
  1094. _STARPU_DEBUG("writing bandwidth to %s\n", path);
  1095. f = fopen(path, "w+");
  1096. STARPU_ASSERT(f);
  1097. fprintf(f, "# ");
  1098. for (dst = 0; dst < STARPU_MAXNODES; dst++)
  1099. fprintf(f, "to %d\t\t", dst);
  1100. fprintf(f, "\n");
  1101. maxnode = ncuda;
  1102. #ifdef STARPU_USE_OPENCL
  1103. maxnode += nopencl;
  1104. #endif
  1105. #ifdef STARPU_USE_MIC
  1106. maxnode += nmic;
  1107. #endif
  1108. for (src = 0; src < STARPU_MAXNODES; src++)
  1109. {
  1110. for (dst = 0; dst < STARPU_MAXNODES; dst++)
  1111. {
  1112. double bandwidth;
  1113. if ((src > maxnode) || (dst > maxnode))
  1114. {
  1115. bandwidth = NAN;
  1116. }
  1117. #if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL) || defined(STARPU_USE_MIC)
  1118. else if (src != dst)
  1119. {
  1120. double slowness = 0.0;
  1121. /* Total bandwidth is the harmonic mean of bandwidths */
  1122. #ifdef STARPU_USE_CUDA
  1123. #ifdef HAVE_CUDA_MEMCPY_PEER
  1124. if (src && src <= ncuda && dst && dst <= ncuda)
  1125. /* Direct GPU-GPU transfert */
  1126. slowness = cudadev_timing_dtod[src][dst];
  1127. else
  1128. #endif
  1129. {
  1130. if (src && src <= ncuda)
  1131. slowness += cudadev_timing_dtoh[src];
  1132. if (dst && dst <= ncuda)
  1133. slowness += cudadev_timing_htod[dst];
  1134. }
  1135. #endif
  1136. /* TODO: generalize computation */
  1137. #ifdef STARPU_USE_OPENCL
  1138. if (src > ncuda && src <= ncuda + nopencl)
  1139. slowness += opencldev_timing_dtoh[src-ncuda];
  1140. if (dst > ncuda && dst <= ncuda + nopencl)
  1141. slowness += opencldev_timing_htod[dst-ncuda];
  1142. #endif
  1143. #ifdef STARPU_USE_MIC
  1144. if (src > ncuda + nopencl)
  1145. slowness += mic_time_device_to_host[src - (ncuda + nopencl)];
  1146. if (dst > ncuda + nopencl)
  1147. slowness += mic_time_host_to_device[dst - (ncuda + nopencl)];
  1148. #endif
  1149. bandwidth = 1.0/slowness;
  1150. }
  1151. #endif
  1152. else
  1153. {
  1154. /* convention */
  1155. bandwidth = 0.0;
  1156. }
  1157. if (dst)
  1158. fputc('\t', f);
  1159. fprintf(f, "%f", bandwidth);
  1160. }
  1161. fprintf(f, "\n");
  1162. }
  1163. fclose(f);
  1164. }
  1165. #endif /* STARPU_SIMGRID */
  1166. void starpu_bus_print_bandwidth(FILE *f)
  1167. {
  1168. unsigned src, dst, maxnode;
  1169. maxnode = ncuda;
  1170. #ifdef STARPU_USE_OPENCL
  1171. maxnode += nopencl;
  1172. #endif
  1173. #ifdef STARPU_USE_MIC
  1174. maxnode += nmic;
  1175. #endif
  1176. fprintf(f, "from/to\t");
  1177. fprintf(f, "RAM\t");
  1178. for (dst = 0; dst < ncuda; dst++)
  1179. fprintf(f, "CUDA %d\t", dst);
  1180. for (dst = 0; dst < nopencl; dst++)
  1181. fprintf(f, "OpenCL%d\t", dst);
  1182. for (dst = 0; dst < nmic; dst++)
  1183. fprintf(f, "MIC%d\t", dst);
  1184. fprintf(f, "\n");
  1185. for (src = 0; src <= maxnode; src++)
  1186. {
  1187. if (!src)
  1188. fprintf(f, "RAM\t");
  1189. else if (src <= ncuda)
  1190. fprintf(f, "CUDA %d\t", src-1);
  1191. else if (src <= ncuda + nopencl)
  1192. fprintf(f, "OpenCL%d\t", src-ncuda-1);
  1193. else
  1194. fprintf(f, "MIC%d\t", src-ncuda-nopencl-1);
  1195. for (dst = 0; dst <= maxnode; dst++)
  1196. fprintf(f, "%.0f\t", bandwidth_matrix[src][dst]);
  1197. fprintf(f, "\n");
  1198. }
  1199. fprintf(f, "\n");
  1200. for (src = 0; src <= maxnode; src++)
  1201. {
  1202. if (!src)
  1203. fprintf(f, "RAM\t");
  1204. else if (src <= ncuda)
  1205. fprintf(f, "CUDA %d\t", src-1);
  1206. else if (src <= ncuda + nopencl)
  1207. fprintf(f, "OpenCL%d\t", src-ncuda-1);
  1208. else
  1209. fprintf(f, "MIC%d\t", src-ncuda-nopencl-1);
  1210. for (dst = 0; dst <= maxnode; dst++)
  1211. fprintf(f, "%.0f\t", latency_matrix[src][dst]);
  1212. fprintf(f, "\n");
  1213. }
  1214. #if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
  1215. if (ncuda != 0 || nopencl != 0)
  1216. fprintf(f, "\nGPU\tCPU in preference order (logical index), host-to-device, device-to-host\n");
  1217. for (src = 1; src <= ncuda + nopencl; src++)
  1218. {
  1219. struct dev_timing *timing;
  1220. struct _starpu_machine_config *config = _starpu_get_machine_config();
  1221. unsigned config_ncpus = _starpu_topology_get_nhwcpu(config);
  1222. unsigned cpu;
  1223. #ifdef STARPU_USE_CUDA
  1224. if (src <= ncuda)
  1225. {
  1226. fprintf(f, "CUDA %d\t", src-1);
  1227. for (cpu = 0; cpu < config_ncpus; cpu++)
  1228. {
  1229. timing = &cudadev_timing_per_cpu[src*STARPU_MAXCPUS+cpu];
  1230. if (timing->timing_htod)
  1231. fprintf(f, "%2d %.0f %.0f\t", timing->cpu_id, 1/timing->timing_htod, 1/timing->timing_dtoh);
  1232. else
  1233. fprintf(f, "%2d\t", cuda_affinity_matrix[src-1][cpu]);
  1234. }
  1235. }
  1236. #ifdef STARPU_USE_OPENCL
  1237. else
  1238. #endif
  1239. #endif
  1240. #ifdef STARPU_USE_OPENCL
  1241. {
  1242. fprintf(f, "OpenCL%d\t", src-ncuda-1);
  1243. for (cpu = 0; cpu < config_ncpus; cpu++)
  1244. {
  1245. timing = &opencldev_timing_per_cpu[(src-ncuda)*STARPU_MAXCPUS+cpu];
  1246. if (timing->timing_htod)
  1247. fprintf(f, "%2d %.0f %.0f\t", timing->cpu_id, 1/timing->timing_htod, 1/timing->timing_dtoh);
  1248. else
  1249. fprintf(f, "%2d\t", opencl_affinity_matrix[src-1][cpu]);
  1250. }
  1251. }
  1252. #endif
  1253. fprintf(f, "\n");
  1254. }
  1255. #endif
  1256. }
  1257. static void generate_bus_bandwidth_file(void)
  1258. {
  1259. if (!was_benchmarked)
  1260. benchmark_all_gpu_devices();
  1261. #ifndef STARPU_SIMGRID
  1262. write_bus_bandwidth_file_content();
  1263. #endif
  1264. }
  1265. static void load_bus_bandwidth_file(void)
  1266. {
  1267. int res;
  1268. char path[256];
  1269. get_bandwidth_path(path, sizeof(path));
  1270. res = access(path, F_OK);
  1271. if (res || !load_bus_bandwidth_file_content())
  1272. {
  1273. /* File does not exist yet or is bogus */
  1274. generate_bus_bandwidth_file();
  1275. }
  1276. }
  1277. #ifndef STARPU_SIMGRID
  1278. /*
  1279. * Config
  1280. */
  1281. static void get_config_path(char *path, size_t maxlen)
  1282. {
  1283. get_bus_path("config", path, maxlen);
  1284. }
  1285. static void check_bus_config_file(void)
  1286. {
  1287. int res;
  1288. char path[256];
  1289. struct _starpu_machine_config *config = _starpu_get_machine_config();
  1290. get_config_path(path, sizeof(path));
  1291. res = access(path, F_OK);
  1292. if (res || config->conf->bus_calibrate > 0)
  1293. {
  1294. if (res)
  1295. _STARPU_DISP("No performance model for the bus, calibrating...\n");
  1296. _starpu_bus_force_sampling();
  1297. if (res)
  1298. _STARPU_DISP("... done\n");
  1299. }
  1300. else
  1301. {
  1302. FILE *f;
  1303. int ret;
  1304. unsigned read_cuda = -1, read_opencl = -1, read_mic = -1;
  1305. unsigned read_cpus = -1;
  1306. // Loading configuration from file
  1307. f = fopen(path, "r");
  1308. STARPU_ASSERT(f);
  1309. _starpu_drop_comments(f);
  1310. ret = fscanf(f, "%u\t", &read_cpus);
  1311. STARPU_ASSERT(ret == 1);
  1312. _starpu_drop_comments(f);
  1313. ret = fscanf(f, "%d\t", &read_cuda);
  1314. STARPU_ASSERT(ret == 1);
  1315. _starpu_drop_comments(f);
  1316. ret = fscanf(f, "%d\t", &read_opencl);
  1317. STARPU_ASSERT(ret == 1);
  1318. _starpu_drop_comments(f);
  1319. ret = fscanf(f, "%d\t", &read_mic);
  1320. if (ret == 0)
  1321. read_mic = 0;
  1322. _starpu_drop_comments(f);
  1323. fclose(f);
  1324. // Loading current configuration
  1325. ncpus = _starpu_topology_get_nhwcpu(config);
  1326. #ifdef STARPU_USE_CUDA
  1327. ncuda = _starpu_get_cuda_device_count();
  1328. #endif
  1329. #ifdef STARPU_USE_OPENCL
  1330. nopencl = _starpu_opencl_get_device_count();
  1331. #endif
  1332. #ifdef STARPU_USE_MIC
  1333. nmic = _starpu_mic_src_get_device_count();
  1334. #endif /* STARPU_USE_MIC */
  1335. // Checking if both configurations match
  1336. if (read_cpus != ncpus)
  1337. {
  1338. _STARPU_DISP("Current configuration does not match the bus performance model (CPUS: (stored) %u != (current) %u), recalibrating...\n", read_cpus, ncpus);
  1339. _starpu_bus_force_sampling();
  1340. _STARPU_DISP("... done\n");
  1341. }
  1342. else if (read_cuda != ncuda)
  1343. {
  1344. _STARPU_DISP("Current configuration does not match the bus performance model (CUDA: (stored) %d != (current) %d), recalibrating...\n", read_cuda, ncuda);
  1345. _starpu_bus_force_sampling();
  1346. _STARPU_DISP("... done\n");
  1347. }
  1348. else if (read_opencl != nopencl)
  1349. {
  1350. _STARPU_DISP("Current configuration does not match the bus performance model (OpenCL: (stored) %d != (current) %d), recalibrating...\n", read_opencl, nopencl);
  1351. _starpu_bus_force_sampling();
  1352. _STARPU_DISP("... done\n");
  1353. }
  1354. else if (read_mic != nmic)
  1355. {
  1356. _STARPU_DISP("Current configuration does not match the bus performance model (MIC: (stored) %d != (current) %d), recalibrating...\n", read_mic, nmic);
  1357. _starpu_bus_force_sampling();
  1358. _STARPU_DISP("... done\n");
  1359. }
  1360. }
  1361. }
  1362. static void write_bus_config_file_content(void)
  1363. {
  1364. FILE *f;
  1365. char path[256];
  1366. STARPU_ASSERT(was_benchmarked);
  1367. get_config_path(path, sizeof(path));
  1368. _STARPU_DEBUG("writing config to %s\n", path);
  1369. f = fopen(path, "w+");
  1370. STARPU_ASSERT(f);
  1371. fprintf(f, "# Current configuration\n");
  1372. fprintf(f, "%u # Number of CPUs\n", ncpus);
  1373. fprintf(f, "%d # Number of CUDA devices\n", ncuda);
  1374. fprintf(f, "%d # Number of OpenCL devices\n", nopencl);
  1375. fprintf(f, "%d # Number of MIC devices\n", nmic);
  1376. fclose(f);
  1377. }
  1378. static void generate_bus_config_file(void)
  1379. {
  1380. if (!was_benchmarked)
  1381. benchmark_all_gpu_devices();
  1382. write_bus_config_file_content();
  1383. }
  1384. #endif /* !SIMGRID */
  1385. void _starpu_simgrid_get_platform_path(char *path, size_t maxlen)
  1386. {
  1387. get_bus_path("platform.xml", path, maxlen);
  1388. }
  1389. #ifndef STARPU_SIMGRID
  1390. static void write_bus_platform_file_content(void)
  1391. {
  1392. FILE *f;
  1393. char path[256];
  1394. unsigned i;
  1395. STARPU_ASSERT(was_benchmarked);
  1396. _starpu_simgrid_get_platform_path(path, sizeof(path));
  1397. _STARPU_DEBUG("writing platform to %s\n", path);
  1398. f = fopen(path, "w+");
  1399. if (!f)
  1400. {
  1401. perror("fopen write_bus_platform_file_content");
  1402. _STARPU_DISP("path '%s'\n", path);
  1403. fflush(stderr);
  1404. STARPU_ABORT();
  1405. }
  1406. fprintf(f,
  1407. "<?xml version='1.0'?>\n"
  1408. " <!DOCTYPE platform SYSTEM 'http://simgrid.gforge.inria.fr/simgrid.dtd'>\n"
  1409. " <platform version='3'>\n"
  1410. " <config id='General'>\n"
  1411. " <prop id='network/TCP_gamma' value='-1'></prop>\n"
  1412. " <prop id='network/latency_factor' value='1'></prop>\n"
  1413. " <prop id='network/bandwidth_factor' value='1'></prop>\n"
  1414. " </config>\n"
  1415. " <AS id='AS0' routing='Full'>\n"
  1416. " <host id='MAIN' power='1'/>\n"
  1417. );
  1418. for (i = 0; i < ncpus; i++)
  1419. /* TODO: host memory for out-of-core simulation */
  1420. fprintf(f, " <host id='CPU%d' power='2000000000'/>\n", i);
  1421. for (i = 0; i < ncuda; i++)
  1422. fprintf(f, " <host id='CUDA%d' power='2000000000'>\n <prop id='memsize' value='%llu'/>\n </host>\n", i, (unsigned long long) cuda_size[i]);
  1423. for (i = 0; i < nopencl; i++)
  1424. fprintf(f, " <host id='OpenCL%d' power='2000000000'>\n <prop id='memsize' value='%llu'/>\n </host>\n", i, (unsigned long long) opencl_size[i]);
  1425. fprintf(f, "\n <host id='RAM' power='1'/>\n");
  1426. /* Compute maximum bandwidth, taken as machine bandwidth */
  1427. double max_bandwidth = 0;
  1428. #ifdef STARPU_USE_CUDA
  1429. for (i = 0; i < ncuda; i++)
  1430. {
  1431. double down_bw = 1.0 / cudadev_timing_dtoh[1+i];
  1432. double up_bw = 1.0 / cudadev_timing_htod[1+i];
  1433. if (max_bandwidth < down_bw)
  1434. max_bandwidth = down_bw;
  1435. if (max_bandwidth < up_bw)
  1436. max_bandwidth = up_bw;
  1437. }
  1438. #endif
  1439. #ifdef STARPU_USE_OPENCL
  1440. for (i = 0; i < nopencl; i++)
  1441. {
  1442. double down_bw = 1.0 / opencldev_timing_dtoh[1+i];
  1443. double up_bw = 1.0 / opencldev_timing_htod[1+i];
  1444. if (max_bandwidth < down_bw)
  1445. max_bandwidth = down_bw;
  1446. if (max_bandwidth < up_bw)
  1447. max_bandwidth = up_bw;
  1448. }
  1449. #endif
  1450. fprintf(f, "\n <link id='Share' bandwidth='%f' latency='0.000000'/>\n\n", max_bandwidth*1000000);
  1451. /* Write bandwidths & latencies */
  1452. #ifdef STARPU_USE_CUDA
  1453. for (i = 0; i < ncuda; i++)
  1454. {
  1455. char i_name[16];
  1456. snprintf(i_name, sizeof(i_name), "CUDA%d", i);
  1457. fprintf(f, " <link id='RAM-%s' bandwidth='%f' latency='%f'/>\n",
  1458. i_name,
  1459. 1000000. / cudadev_timing_htod[1+i],
  1460. cudadev_latency_htod[1+i]/1000000.);
  1461. fprintf(f, " <link id='%s-RAM' bandwidth='%f' latency='%f'/>\n",
  1462. i_name,
  1463. 1000000. / cudadev_timing_dtoh[1+i],
  1464. cudadev_latency_dtoh[1+i]/1000000.);
  1465. }
  1466. #ifdef HAVE_CUDA_MEMCPY_PEER
  1467. for (i = 0; i < ncuda; i++)
  1468. {
  1469. unsigned j;
  1470. char i_name[16];
  1471. snprintf(i_name, sizeof(i_name), "CUDA%d", i);
  1472. for (j = 0; j < ncuda; j++)
  1473. {
  1474. char j_name[16];
  1475. if (j == i)
  1476. continue;
  1477. snprintf(j_name, sizeof(j_name), "CUDA%d", j);
  1478. fprintf(f, " <link id='%s-%s' bandwidth='%f' latency='%f'/>\n",
  1479. i_name, j_name,
  1480. 1000000. / cudadev_timing_dtod[1+i][1+j],
  1481. cudadev_latency_dtod[1+i][1+j]/1000000.);
  1482. }
  1483. }
  1484. #endif
  1485. #endif
  1486. #ifdef STARPU_USE_OPENCL
  1487. for (i = 0; i < nopencl; i++)
  1488. {
  1489. char i_name[16];
  1490. snprintf(i_name, sizeof(i_name), "OpenCL%d", i);
  1491. fprintf(f, " <link id='RAM-%s' bandwidth='%f' latency='%f'/>\n",
  1492. i_name,
  1493. 1000000 / opencldev_timing_htod[1+i],
  1494. opencldev_latency_htod[1+i]/1000000.);
  1495. fprintf(f, " <link id='%s-RAM' bandwidth='%f' latency='%f'/>\n",
  1496. i_name,
  1497. 1000000 / opencldev_timing_dtoh[1+i],
  1498. opencldev_latency_dtoh[1+i]/1000000.);
  1499. }
  1500. #endif
  1501. /* Write routes */
  1502. #ifdef STARPU_USE_CUDA
  1503. for (i = 0; i < ncuda; i++)
  1504. {
  1505. char i_name[16];
  1506. snprintf(i_name, sizeof(i_name), "CUDA%d", i);
  1507. fprintf(f, " <route src='RAM' dst='%s' symmetrical='NO'><link_ctn id='RAM-%s'/><link_ctn id='Share'/></route>\n", i_name, i_name);
  1508. fprintf(f, " <route src='%s' dst='RAM' symmetrical='NO'><link_ctn id='%s-RAM'/><link_ctn id='Share'/></route>\n", i_name, i_name);
  1509. }
  1510. #ifdef HAVE_CUDA_MEMCPY_PEER
  1511. for (i = 0; i < ncuda; i++)
  1512. {
  1513. unsigned j;
  1514. char i_name[16];
  1515. snprintf(i_name, sizeof(i_name), "CUDA%d", i);
  1516. for (j = 0; j < ncuda; j++)
  1517. {
  1518. char j_name[16];
  1519. if (j == i)
  1520. continue;
  1521. snprintf(j_name, sizeof(j_name), "CUDA%d", j);
  1522. fprintf(f, " <route src='%s' dst='%s' symmetrical='NO'><link_ctn id='%s-%s'/><link_ctn id='Share'/></route>\n", i_name, j_name, i_name, j_name);
  1523. }
  1524. }
  1525. #endif
  1526. #endif
  1527. #ifdef STARPU_USE_OPENCL
  1528. for (i = 0; i < nopencl; i++)
  1529. {
  1530. char i_name[16];
  1531. snprintf(i_name, sizeof(i_name), "OpenCL%d", i);
  1532. fprintf(f, " <route src='RAM' dst='%s' symmetrical='NO'><link_ctn id='RAM-%s'/><link_ctn id='Share'/></route>\n", i_name, i_name);
  1533. fprintf(f, " <route src='%s' dst='RAM' symmetrical='NO'><link_ctn id='%s-RAM'/><link_ctn id='Share'/></route>\n", i_name, i_name);
  1534. }
  1535. #endif
  1536. fprintf(f,
  1537. " </AS>\n"
  1538. " </platform>\n"
  1539. );
  1540. fclose(f);
  1541. }
  1542. static void generate_bus_platform_file(void)
  1543. {
  1544. if (!was_benchmarked)
  1545. benchmark_all_gpu_devices();
  1546. write_bus_platform_file_content();
  1547. }
  1548. static void check_bus_platform_file(void)
  1549. {
  1550. int res;
  1551. char path[256];
  1552. _starpu_simgrid_get_platform_path(path, sizeof(path));
  1553. res = access(path, F_OK);
  1554. if (res)
  1555. {
  1556. /* File does not exist yet */
  1557. generate_bus_platform_file();
  1558. }
  1559. }
  1560. /*
  1561. * Generic
  1562. */
  1563. static void _starpu_bus_force_sampling(void)
  1564. {
  1565. _STARPU_DEBUG("Force bus sampling ...\n");
  1566. _starpu_create_sampling_directory_if_needed();
  1567. generate_bus_affinity_file();
  1568. generate_bus_latency_file();
  1569. generate_bus_bandwidth_file();
  1570. generate_bus_config_file();
  1571. generate_bus_platform_file();
  1572. }
  1573. #endif /* !SIMGRID */
  1574. void _starpu_load_bus_performance_files(void)
  1575. {
  1576. _starpu_create_sampling_directory_if_needed();
  1577. #if defined(STARPU_USE_CUDA) || defined(STARPU_USE_SIMGRID)
  1578. ncuda = _starpu_get_cuda_device_count();
  1579. #endif
  1580. #if defined(STARPU_USE_OPENCL) || defined(STARPU_USE_SIMGRID)
  1581. nopencl = _starpu_opencl_get_device_count();
  1582. #endif
  1583. #ifndef STARPU_SIMGRID
  1584. check_bus_config_file();
  1585. load_bus_affinity_file();
  1586. #endif
  1587. load_bus_latency_file();
  1588. load_bus_bandwidth_file();
  1589. #ifndef STARPU_SIMGRID
  1590. check_bus_platform_file();
  1591. #endif
  1592. }
  1593. /* (in MB/s) */
  1594. double starpu_transfer_bandwidth(unsigned src_node, unsigned dst_node)
  1595. {
  1596. return bandwidth_matrix[src_node][dst_node];
  1597. }
  1598. /* (in µs) */
  1599. double starpu_transfer_latency(unsigned src_node, unsigned dst_node)
  1600. {
  1601. return latency_matrix[src_node][dst_node];
  1602. }
  1603. /* (in µs) */
  1604. double starpu_transfer_predict(unsigned src_node, unsigned dst_node, size_t size)
  1605. {
  1606. double bandwidth = bandwidth_matrix[src_node][dst_node];
  1607. double latency = latency_matrix[src_node][dst_node];
  1608. struct _starpu_machine_topology *topology = &_starpu_get_machine_config()->topology;
  1609. return latency + (size/bandwidth)*2*(topology->ncudagpus+topology->nopenclgpus);
  1610. }
  1611. /* calculate save bandwidth and latency */
  1612. /* bandwidth in MB/s - latency in µs */
  1613. void _starpu_save_bandwidth_and_latency_disk(double bandwidth_write, double bandwidth_read, double latency_write, double latency_read, unsigned node)
  1614. {
  1615. unsigned int i, j;
  1616. double slowness_disk_between_main_ram, slowness_main_ram_between_node;
  1617. /* save bandwith */
  1618. for(i = 0; i < STARPU_MAXNODES; ++i)
  1619. {
  1620. for(j = 0; j < STARPU_MAXNODES; ++j)
  1621. {
  1622. if (i == j && j == node) /* source == destination == node */
  1623. {
  1624. bandwidth_matrix[i][j] = 0;
  1625. }
  1626. else if (i == node) /* source == disk */
  1627. {
  1628. /* convert in slowness */
  1629. if(bandwidth_read != 0)
  1630. slowness_disk_between_main_ram = 1/bandwidth_read;
  1631. else
  1632. slowness_disk_between_main_ram = 0;
  1633. if(bandwidth_matrix[STARPU_MAIN_RAM][j] != 0)
  1634. slowness_main_ram_between_node = 1/bandwidth_matrix[STARPU_MAIN_RAM][j];
  1635. else
  1636. slowness_main_ram_between_node = 0;
  1637. bandwidth_matrix[i][j] = 1/(slowness_disk_between_main_ram+slowness_main_ram_between_node);
  1638. }
  1639. else if (j == node) /* destination == disk */
  1640. {
  1641. /* convert in slowness */
  1642. if(bandwidth_write != 0)
  1643. slowness_disk_between_main_ram = 1/bandwidth_write;
  1644. else
  1645. slowness_disk_between_main_ram = 0;
  1646. if(bandwidth_matrix[i][STARPU_MAIN_RAM] != 0)
  1647. slowness_main_ram_between_node = 1/bandwidth_matrix[i][STARPU_MAIN_RAM];
  1648. else
  1649. slowness_main_ram_between_node = 0;
  1650. bandwidth_matrix[i][j] = 1/(slowness_disk_between_main_ram+slowness_main_ram_between_node);
  1651. }
  1652. else if (j > node || i > node) /* not affected by the node */
  1653. {
  1654. bandwidth_matrix[i][j] = NAN;
  1655. }
  1656. }
  1657. }
  1658. /* save latency */
  1659. for(i = 0; i < STARPU_MAXNODES; ++i)
  1660. {
  1661. for(j = 0; j < STARPU_MAXNODES; ++j)
  1662. {
  1663. if (i == j && j == node) /* source == destination == node */
  1664. {
  1665. latency_matrix[i][j] = 0;
  1666. }
  1667. else if (i == node) /* source == disk */
  1668. {
  1669. latency_matrix[i][j] = (latency_write+latency_matrix[STARPU_MAIN_RAM][j]);
  1670. }
  1671. else if (j == node) /* destination == disk */
  1672. {
  1673. latency_matrix[i][j] = (latency_read+latency_matrix[i][STARPU_MAIN_RAM]);
  1674. }
  1675. else if (j > node || i > node) /* not affected by the node */
  1676. {
  1677. latency_matrix[i][j] = NAN;
  1678. }
  1679. }
  1680. }
  1681. }