perfmodel_bus.c

  1. /* StarPU --- Runtime system for heterogeneous multicore architectures.
  2. *
  3. * Copyright (C) 2009-2016 Université de Bordeaux
  4. * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017 CNRS
  5. * Copyright (C) 2017 Inria
  6. * Copyright (C) 2013 Corentin Salingue
  7. *
  8. * StarPU is free software; you can redistribute it and/or modify
  9. * it under the terms of the GNU Lesser General Public License as published by
  10. * the Free Software Foundation; either version 2.1 of the License, or (at
  11. * your option) any later version.
  12. *
  13. * StarPU is distributed in the hope that it will be useful, but
  14. * WITHOUT ANY WARRANTY; without even the implied warranty of
  15. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  16. *
  17. * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  18. */
  19. #ifdef STARPU_USE_CUDA
  20. #ifndef _GNU_SOURCE
  21. #define _GNU_SOURCE
  22. #endif
  23. #include <sched.h>
  24. #endif
  25. #include <stdlib.h>
  26. #include <math.h>
  27. #include <starpu.h>
  28. #include <starpu_cuda.h>
  29. #include <starpu_opencl.h>
  30. #include <common/config.h>
  31. #ifdef HAVE_UNISTD_H
  32. #include <unistd.h>
  33. #endif
  34. #include <core/workers.h>
  35. #include <core/perfmodel/perfmodel.h>
  36. #include <core/simgrid.h>
  37. #include <core/topology.h>
  38. #include <common/utils.h>
  39. #include <drivers/mpi/driver_mpi_common.h>
  40. #ifdef STARPU_USE_OPENCL
  41. #include <starpu_opencl.h>
  42. #endif
  43. #ifdef STARPU_HAVE_WINDOWS
  44. #include <windows.h>
  45. #endif
  46. #if defined(HAVE_DECL_HWLOC_CUDA_GET_DEVICE_OSDEV_BY_INDEX) && HAVE_DECL_HWLOC_CUDA_GET_DEVICE_OSDEV_BY_INDEX
  47. #include <hwloc/cuda.h>
  48. #endif
  49. #define SIZE (32*1024*1024*sizeof(char))
  50. #define NITER 32
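/* Benchmark protocol (as implemented below): each bandwidth measurement copies
 * a buffer of up to SIZE bytes NITER times and stores the average time per
 * byte (a "slowness" in µs/byte); each latency measurement copies a single
 * byte NITER times and stores the average time per transfer (in µs). */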
  51. #ifndef STARPU_SIMGRID
  52. static void _starpu_bus_force_sampling(void);
  53. #endif
  54. /* timing is in µs per byte (i.e. slowness, inverse of bandwidth) */
  55. struct dev_timing
  56. {
  57. int numa_id;
  58. double timing_htod;
  59. double latency_htod;
  60. double timing_dtoh;
  61. double latency_dtoh;
  62. };
  63. /* TODO: measure latency */
  64. static double bandwidth_matrix[STARPU_MAXNODES][STARPU_MAXNODES];
  65. static double latency_matrix[STARPU_MAXNODES][STARPU_MAXNODES];
  66. static unsigned was_benchmarked = 0;
  67. #ifndef STARPU_SIMGRID
  68. static unsigned ncpus = 0;
  69. #endif
  70. static unsigned nnumas = 0;
  71. static unsigned ncuda = 0;
  72. static unsigned nopencl = 0;
  73. static unsigned nmic = 0;
  74. static unsigned nmpi_ms = 0;
  75. /* Benchmarking the performance of the bus */
  76. static double numa_latency[STARPU_MAXNUMANODES][STARPU_MAXNUMANODES];
  77. static double numa_timing[STARPU_MAXNUMANODES][STARPU_MAXNUMANODES];
  78. #ifndef STARPU_SIMGRID
  79. static uint64_t cuda_size[STARPU_MAXCUDADEVS];
  80. #endif
  81. #ifdef STARPU_USE_CUDA
  82. /* preference order of cores (logical indexes) */
  83. static int cuda_affinity_matrix[STARPU_MAXCUDADEVS][STARPU_MAXNUMANODES];
  84. #ifndef STARPU_SIMGRID
  85. #ifdef HAVE_CUDA_MEMCPY_PEER
  86. static double cudadev_timing_dtod[STARPU_MAXNODES][STARPU_MAXNODES] = {{0.0}};
  87. static double cudadev_latency_dtod[STARPU_MAXNODES][STARPU_MAXNODES] = {{0.0}};
  88. #endif
  89. #endif
  90. static struct dev_timing cudadev_timing_per_numa[STARPU_MAXCUDADEVS*STARPU_MAXNUMANODES];
  91. static char cudadev_direct[STARPU_MAXNODES][STARPU_MAXNODES];
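/* The per-NUMA timing arrays are flattened: the entry for CUDA device `dev`
 * and NUMA node `numa` lives at index dev*STARPU_MAXNUMANODES + numa (see the
 * timing_numa_index computations below). cudadev_direct[src][dst] records
 * whether GPU-GPU peer access (GPU-Direct) could be enabled between the two
 * devices. */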
  92. #endif
  93. #ifndef STARPU_SIMGRID
static uint64_t opencl_size[STARPU_MAXOPENCLDEVS];
  95. #endif
  96. #ifdef STARPU_USE_OPENCL
  97. /* preference order of cores (logical indexes) */
  98. static int opencl_affinity_matrix[STARPU_MAXOPENCLDEVS][STARPU_MAXNUMANODES];
  99. static struct dev_timing opencldev_timing_per_numa[STARPU_MAXOPENCLDEVS*STARPU_MAXNUMANODES];
  100. #endif
  101. #ifdef STARPU_USE_MIC
  102. static double mic_time_host_to_device[STARPU_MAXNODES] = {0.0};
  103. static double mic_time_device_to_host[STARPU_MAXNODES] = {0.0};
  104. #endif /* STARPU_USE_MIC */
  105. #ifdef STARPU_USE_MPI_MASTER_SLAVE
  106. static double mpi_time_device_to_device[STARPU_MAXMPIDEVS][STARPU_MAXMPIDEVS] = {{0.0}};
  107. static double mpi_latency_device_to_device[STARPU_MAXMPIDEVS][STARPU_MAXMPIDEVS] = {{0.0}};
  108. #endif
  109. #ifdef STARPU_HAVE_HWLOC
  110. static hwloc_topology_t hwtopology;
  111. hwloc_topology_t _starpu_perfmodel_get_hwtopology()
  112. {
  113. return hwtopology;
  114. }
  115. #endif
  116. #if (defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)) && !defined(STARPU_SIMGRID)
  117. #ifdef STARPU_USE_CUDA
  118. static void measure_bandwidth_between_host_and_dev_on_numa_with_cuda(int dev, int numa, int cpu, struct dev_timing *dev_timing_per_cpu)
  119. {
  120. struct _starpu_machine_config *config = _starpu_get_machine_config();
  121. _starpu_bind_thread_on_cpu(config, cpu, STARPU_NOWORKERID);
  122. size_t size = SIZE;
  123. /* Initialize CUDA context on the device */
/* We do not need to enable OpenGL interoperability at this point,
* since we cleanly shut down CUDA before returning. */
  126. cudaSetDevice(dev);
/* hack to prevent third-party libraries from rebinding the thread */
  128. _starpu_bind_thread_on_cpu(config, cpu, STARPU_NOWORKERID);
  129. /* hack to force the initialization */
  130. cudaFree(0);
/* hack to prevent third-party libraries from rebinding the thread */
  132. _starpu_bind_thread_on_cpu(config, cpu, STARPU_NOWORKERID);
  133. /* Get the maximum size which can be allocated on the device */
  134. struct cudaDeviceProp prop;
  135. cudaError_t cures;
  136. cures = cudaGetDeviceProperties(&prop, dev);
  137. if (STARPU_UNLIKELY(cures)) STARPU_CUDA_REPORT_ERROR(cures);
  138. cuda_size[dev] = prop.totalGlobalMem;
  139. if (size > prop.totalGlobalMem/4) size = prop.totalGlobalMem/4;
  140. /* Allocate a buffer on the device */
  141. unsigned char *d_buffer;
  142. cures = cudaMalloc((void **)&d_buffer, size);
  143. STARPU_ASSERT(cures == cudaSuccess);
/* hack to prevent third-party libraries from rebinding the thread */
  145. _starpu_bind_thread_on_cpu(config, cpu, STARPU_NOWORKERID);
  146. /* Allocate a buffer on the host */
  147. unsigned char *h_buffer;
  148. #if defined(STARPU_HAVE_HWLOC)
  149. if (nnumas > 1)
  150. {
  151. /* NUMA mode activated */
  152. hwloc_obj_t obj = hwloc_get_obj_by_type(hwtopology, HWLOC_OBJ_NODE, numa);
  153. h_buffer = hwloc_alloc_membind_nodeset(hwtopology, size, obj->nodeset, HWLOC_MEMBIND_BIND, 0);
  154. }
  155. else
  156. #endif
{
/* we use STARPU_MAIN_RAM */
_STARPU_MALLOC(h_buffer, size);
cures = cudaHostRegister((void *)h_buffer, size, 0);
STARPU_ASSERT(cures == cudaSuccess);
}
/* hack to prevent third-party libraries from rebinding the thread */
  164. _starpu_bind_thread_on_cpu(config, cpu, STARPU_NOWORKERID);
  165. /* Fill them */
  166. memset(h_buffer, 0, size);
  167. cudaMemset(d_buffer, 0, size);
/* hack to prevent third-party libraries from rebinding the thread */
  169. _starpu_bind_thread_on_cpu(config, cpu, STARPU_NOWORKERID);
  170. const unsigned timing_numa_index = dev*STARPU_MAXNUMANODES + numa;
  171. unsigned iter;
  172. double timing;
  173. double start;
  174. double end;
  175. /* Measure upload bandwidth */
  176. start = starpu_timing_now();
  177. for (iter = 0; iter < NITER; iter++)
  178. {
  179. cudaMemcpy(d_buffer, h_buffer, size, cudaMemcpyHostToDevice);
  180. cudaThreadSynchronize();
  181. }
  182. end = starpu_timing_now();
  183. timing = end - start;
  184. dev_timing_per_cpu[timing_numa_index].timing_htod = timing/NITER/size;
  185. /* Measure download bandwidth */
  186. start = starpu_timing_now();
  187. for (iter = 0; iter < NITER; iter++)
  188. {
  189. cudaMemcpy(h_buffer, d_buffer, size, cudaMemcpyDeviceToHost);
  190. cudaThreadSynchronize();
  191. }
  192. end = starpu_timing_now();
  193. timing = end - start;
  194. dev_timing_per_cpu[timing_numa_index].timing_dtoh = timing/NITER/size;
  195. /* Measure upload latency */
  196. start = starpu_timing_now();
  197. for (iter = 0; iter < NITER; iter++)
  198. {
  199. cudaMemcpy(d_buffer, h_buffer, 1, cudaMemcpyHostToDevice);
  200. cudaThreadSynchronize();
  201. }
  202. end = starpu_timing_now();
  203. timing = end - start;
  204. dev_timing_per_cpu[timing_numa_index].latency_htod = timing/NITER;
  205. /* Measure download latency */
  206. start = starpu_timing_now();
  207. for (iter = 0; iter < NITER; iter++)
  208. {
  209. cudaMemcpy(h_buffer, d_buffer, 1, cudaMemcpyDeviceToHost);
  210. cudaThreadSynchronize();
  211. }
  212. end = starpu_timing_now();
  213. timing = end - start;
  214. dev_timing_per_cpu[timing_numa_index].latency_dtoh = timing/NITER;
  215. /* Free buffers */
  216. cudaHostUnregister(h_buffer);
  217. #if defined(STARPU_HAVE_HWLOC)
  218. if (nnumas > 1)
  219. {
  220. /* NUMA mode activated */
  221. hwloc_free(hwtopology, h_buffer, size);
  222. }
  223. else
  224. #endif
  225. {
  226. free(h_buffer);
  227. }
  228. cudaFree(d_buffer);
  229. cudaThreadExit();
  230. }
  231. #ifdef HAVE_CUDA_MEMCPY_PEER
  232. static void measure_bandwidth_between_dev_and_dev_cuda(int src, int dst)
  233. {
  234. size_t size = SIZE;
  235. int can;
  236. /* Get the maximum size which can be allocated on the device */
  237. struct cudaDeviceProp prop;
  238. cudaError_t cures;
  239. cures = cudaGetDeviceProperties(&prop, src);
  240. if (STARPU_UNLIKELY(cures)) STARPU_CUDA_REPORT_ERROR(cures);
  241. if (size > prop.totalGlobalMem/4) size = prop.totalGlobalMem/4;
  242. cures = cudaGetDeviceProperties(&prop, dst);
  243. if (STARPU_UNLIKELY(cures)) STARPU_CUDA_REPORT_ERROR(cures);
  244. if (size > prop.totalGlobalMem/4) size = prop.totalGlobalMem/4;
  245. /* Initialize CUDA context on the source */
/* We do not need to enable OpenGL interoperability at this point,
* since we cleanly shut down CUDA before returning. */
  248. cudaSetDevice(src);
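/* GPU-GPU direct transfers are attempted unless STARPU_ENABLE_CUDA_GPU_GPU_DIRECT
 * is explicitly set to 0: starpu_get_env_number() returns -1 when the variable
 * is not set, so the "!= 0" test below enables the attempt by default. */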
  249. if (starpu_get_env_number("STARPU_ENABLE_CUDA_GPU_GPU_DIRECT") != 0)
  250. {
  251. cures = cudaDeviceCanAccessPeer(&can, src, dst);
  252. if (!cures && can)
  253. {
  254. cures = cudaDeviceEnablePeerAccess(dst, 0);
  255. if (!cures)
  256. {
  257. _STARPU_DISP("GPU-Direct %d -> %d\n", dst, src);
  258. cudadev_direct[src][dst] = 1;
  259. }
  260. }
  261. }
  262. /* Allocate a buffer on the device */
  263. unsigned char *s_buffer;
  264. cures = cudaMalloc((void **)&s_buffer, size);
  265. STARPU_ASSERT(cures == cudaSuccess);
  266. cudaMemset(s_buffer, 0, size);
  267. /* Initialize CUDA context on the destination */
/* We do not need to enable OpenGL interoperability at this point,
* since we cleanly shut down CUDA before returning. */
  270. cudaSetDevice(dst);
  271. if (starpu_get_env_number("STARPU_ENABLE_CUDA_GPU_GPU_DIRECT") != 0)
  272. {
  273. cures = cudaDeviceCanAccessPeer(&can, dst, src);
  274. if (!cures && can)
  275. {
  276. cures = cudaDeviceEnablePeerAccess(src, 0);
  277. if (!cures)
  278. {
  279. _STARPU_DISP("GPU-Direct %d -> %d\n", src, dst);
  280. cudadev_direct[dst][src] = 1;
  281. }
  282. }
  283. }
  284. /* Allocate a buffer on the device */
  285. unsigned char *d_buffer;
  286. cures = cudaMalloc((void **)&d_buffer, size);
  287. STARPU_ASSERT(cures == cudaSuccess);
  288. cudaMemset(d_buffer, 0, size);
  289. unsigned iter;
  290. double timing;
  291. double start;
  292. double end;
  293. /* Measure upload bandwidth */
  294. start = starpu_timing_now();
  295. for (iter = 0; iter < NITER; iter++)
  296. {
  297. cudaMemcpyPeer(d_buffer, dst, s_buffer, src, size);
  298. cudaThreadSynchronize();
  299. }
  300. end = starpu_timing_now();
  301. timing = end - start;
  302. cudadev_timing_dtod[src][dst] = timing/NITER/size;
  303. /* Measure upload latency */
  304. start = starpu_timing_now();
  305. for (iter = 0; iter < NITER; iter++)
  306. {
  307. cudaMemcpyPeer(d_buffer, dst, s_buffer, src, 1);
  308. cudaThreadSynchronize();
  309. }
  310. end = starpu_timing_now();
  311. timing = end - start;
  312. cudadev_latency_dtod[src][dst] = timing/NITER;
  313. /* Free buffers */
  314. cudaFree(d_buffer);
  315. cudaSetDevice(src);
  316. cudaFree(s_buffer);
  317. cudaThreadExit();
  318. }
  319. #endif
  320. #endif
  321. #ifdef STARPU_USE_OPENCL
  322. static void measure_bandwidth_between_host_and_dev_on_numa_with_opencl(int dev, int numa, int cpu, struct dev_timing *dev_timing_per_cpu)
  323. {
  324. cl_context context;
  325. cl_command_queue queue;
  326. cl_int err=0;
  327. size_t size = SIZE;
  328. int not_initialized;
  329. struct _starpu_machine_config *config = _starpu_get_machine_config();
  330. _starpu_bind_thread_on_cpu(config, cpu, STARPU_NOWORKERID);
  331. /* Is the context already initialised ? */
  332. starpu_opencl_get_context(dev, &context);
  333. not_initialized = (context == NULL);
  334. if (not_initialized == 1)
  335. _starpu_opencl_init_context(dev);
  336. /* Get context and queue */
  337. starpu_opencl_get_context(dev, &context);
  338. starpu_opencl_get_queue(dev, &queue);
  339. /* Get the maximum size which can be allocated on the device */
  340. cl_device_id device;
  341. cl_ulong maxMemAllocSize, totalGlobalMem;
  342. starpu_opencl_get_device(dev, &device);
  343. err = clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(maxMemAllocSize), &maxMemAllocSize, NULL);
  344. if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
  345. if (size > (size_t)maxMemAllocSize/4) size = maxMemAllocSize/4;
  346. err = clGetDeviceInfo(device, CL_DEVICE_GLOBAL_MEM_SIZE , sizeof(totalGlobalMem), &totalGlobalMem, NULL);
  347. if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
  348. opencl_size[dev] = totalGlobalMem;
  349. if (_starpu_opencl_get_device_type(dev) == CL_DEVICE_TYPE_CPU)
  350. {
  351. /* Let's not use too much RAM when running OpenCL on a CPU: it
  352. * would make the OS swap like crazy. */
  353. size /= 2;
  354. }
/* hack to prevent third-party libraries from rebinding the thread */
  356. _starpu_bind_thread_on_cpu(config, cpu, STARPU_NOWORKERID);
  357. /* Allocate a buffer on the device */
  358. cl_mem d_buffer;
  359. d_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, &err);
  360. if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
/* hack to prevent third-party libraries from rebinding the thread */
  362. _starpu_bind_thread_on_cpu(config, cpu, STARPU_NOWORKERID);
  363. /* Allocate a buffer on the host */
  364. unsigned char *h_buffer;
  365. #if defined(STARPU_HAVE_HWLOC)
  366. if (nnumas > 1)
  367. {
  368. /* NUMA mode activated */
  369. hwloc_obj_t obj = hwloc_get_obj_by_type(hwtopology, HWLOC_OBJ_NODE, numa);
  370. h_buffer = hwloc_alloc_membind_nodeset(hwtopology, size, obj->nodeset, HWLOC_MEMBIND_BIND, 0);
  371. }
  372. else
  373. #endif
  374. {
  375. /* we use STARPU_MAIN_RAM */
  376. _STARPU_MALLOC(h_buffer, size);
  377. }
/* hack to prevent third-party libraries from rebinding the thread */
  379. _starpu_bind_thread_on_cpu(config, cpu, STARPU_NOWORKERID);
  380. /* Fill them */
  381. memset(h_buffer, 0, size);
  382. err = clEnqueueWriteBuffer(queue, d_buffer, CL_TRUE, 0, size, h_buffer, 0, NULL, NULL);
  383. if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
  384. clFinish(queue);
/* hack to prevent third-party libraries from rebinding the thread */
  386. _starpu_bind_thread_on_cpu(config, cpu, STARPU_NOWORKERID);
  387. const unsigned timing_numa_index = dev*STARPU_MAXNUMANODES + numa;
  388. unsigned iter;
  389. double timing;
  390. double start;
  391. double end;
  392. /* Measure upload bandwidth */
  393. start = starpu_timing_now();
  394. for (iter = 0; iter < NITER; iter++)
  395. {
  396. err = clEnqueueWriteBuffer(queue, d_buffer, CL_TRUE, 0, size, h_buffer, 0, NULL, NULL);
  397. if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
  398. clFinish(queue);
  399. }
  400. end = starpu_timing_now();
  401. timing = end - start;
  402. dev_timing_per_cpu[timing_numa_index].timing_htod = timing/NITER/size;
  403. /* Measure download bandwidth */
  404. start = starpu_timing_now();
  405. for (iter = 0; iter < NITER; iter++)
  406. {
  407. err = clEnqueueReadBuffer(queue, d_buffer, CL_TRUE, 0, size, h_buffer, 0, NULL, NULL);
  408. if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
  409. clFinish(queue);
  410. }
  411. end = starpu_timing_now();
  412. timing = end - start;
  413. dev_timing_per_cpu[timing_numa_index].timing_dtoh = timing/NITER/size;
  414. /* Measure upload latency */
  415. start = starpu_timing_now();
  416. for (iter = 0; iter < NITER; iter++)
  417. {
  418. err = clEnqueueWriteBuffer(queue, d_buffer, CL_TRUE, 0, 1, h_buffer, 0, NULL, NULL);
  419. if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
  420. clFinish(queue);
  421. }
  422. end = starpu_timing_now();
  423. timing = end - start;
  424. dev_timing_per_cpu[timing_numa_index].latency_htod = timing/NITER;
  425. /* Measure download latency */
  426. start = starpu_timing_now();
  427. for (iter = 0; iter < NITER; iter++)
  428. {
  429. err = clEnqueueReadBuffer(queue, d_buffer, CL_TRUE, 0, 1, h_buffer, 0, NULL, NULL);
  430. if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
  431. clFinish(queue);
  432. }
  433. end = starpu_timing_now();
  434. timing = end - start;
  435. dev_timing_per_cpu[timing_numa_index].latency_dtoh = timing/NITER;
  436. /* Free buffers */
  437. err = clReleaseMemObject(d_buffer);
  438. if (STARPU_UNLIKELY(err != CL_SUCCESS))
  439. STARPU_OPENCL_REPORT_ERROR(err);
  440. #if defined(STARPU_HAVE_HWLOC)
  441. if (nnumas > 1)
  442. {
  443. /* NUMA mode activated */
  444. hwloc_free(hwtopology, h_buffer, size);
  445. }
  446. else
  447. #endif
  448. {
  449. free(h_buffer);
  450. }
/* Uninitialize the OpenCL context on the device */
  452. if (not_initialized == 1)
  453. _starpu_opencl_deinit_context(dev);
  454. }
  455. #endif
  456. /* NB: we want to sort the bandwidth by DECREASING order */
  457. static int compar_dev_timing(const void *left_dev_timing, const void *right_dev_timing)
  458. {
  459. const struct dev_timing *left = (const struct dev_timing *)left_dev_timing;
  460. const struct dev_timing *right = (const struct dev_timing *)right_dev_timing;
  461. double left_dtoh = left->timing_dtoh;
  462. double left_htod = left->timing_htod;
  463. double right_dtoh = right->timing_dtoh;
  464. double right_htod = right->timing_htod;
  465. double timing_sum2_left = left_dtoh*left_dtoh + left_htod*left_htod;
  466. double timing_sum2_right = right_dtoh*right_dtoh + right_htod*right_htod;
/* sort by increasing slowness, i.e. decreasing bandwidth; return a proper
* negative/zero/positive value so qsort() gets a consistent comparator */
if (timing_sum2_left < timing_sum2_right)
return -1;
if (timing_sum2_left > timing_sum2_right)
return 1;
return 0;
  469. }
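/* compar_dev_timing() is used with qsort() below to order, for each GPU, the
 * nnumas per-NUMA-node entries of the *_timing_per_numa arrays from fastest to
 * slowest combined transfer time; the resulting NUMA order is what gets
 * written to the affinity file. */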
  470. #ifdef STARPU_HAVE_HWLOC
  471. #if 0
  472. static int find_numa_node(hwloc_obj_t obj)
  473. {
  474. STARPU_ASSERT(obj);
  475. hwloc_obj_t current = obj;
while (current->type != HWLOC_OBJ_NODE)
  477. {
  478. current = current->parent;
  479. /* If we don't find a "node" obj before the root, this means
  480. * hwloc does not know whether there are numa nodes or not, so
  481. * we should not use a per-node sampling in that case. */
  482. STARPU_ASSERT(current);
  483. }
STARPU_ASSERT(current->type == HWLOC_OBJ_NODE);
  485. return current->logical_index;
  486. }
  487. #endif
  488. static int find_cpu_from_numa_node(hwloc_obj_t obj)
  489. {
  490. STARPU_ASSERT(obj);
  491. hwloc_obj_t current = obj;
while (current->type != HWLOC_OBJ_PU)
{
current = current->first_child;
/* If we don't find a "PU" object before reaching the leaves, this means
* hwloc does not know whether there are any CPUs or not. */
STARPU_ASSERT(current);
}
STARPU_ASSERT(current->type == HWLOC_OBJ_PU);
  500. return current->logical_index;
  501. }
  502. #endif
  503. static void measure_bandwidth_between_numa_nodes_and_dev(int dev, struct dev_timing *dev_timing_per_numanode, char *type)
  504. {
/* We measure the bandwidth between each GPU and each NUMA node */
  506. struct _starpu_machine_config * config = _starpu_get_machine_config();
  507. const unsigned nnuma_nodes = _starpu_topology_get_nnumanodes(config);
  508. unsigned numa_id;
  509. for (numa_id = 0; numa_id < nnuma_nodes; numa_id++)
  510. {
  511. /* Store results by starpu id */
  512. const unsigned timing_numa_index = dev*STARPU_MAXNUMANODES + numa_id;
  513. /* Store STARPU_memnode for later */
  514. dev_timing_per_numanode[timing_numa_index].numa_id = numa_id;
/* Choose one CPU connected to this NUMA node */
  516. unsigned cpu_id = 0;
  517. #ifdef STARPU_HAVE_HWLOC
  518. hwloc_obj_t obj = hwloc_get_obj_by_type(hwtopology, HWLOC_OBJ_NODE, numa_id);
  519. cpu_id = find_cpu_from_numa_node(obj);
  520. #endif
  521. #ifdef STARPU_USE_CUDA
  522. if (strncmp(type, "CUDA", 4) == 0)
  523. measure_bandwidth_between_host_and_dev_on_numa_with_cuda(dev, numa_id, cpu_id, dev_timing_per_numanode);
  524. #endif
  525. #ifdef STARPU_USE_OPENCL
  526. if (strncmp(type, "OpenCL", 6) == 0)
  527. measure_bandwidth_between_host_and_dev_on_numa_with_opencl(dev, numa_id, cpu_id, dev_timing_per_numanode);
  528. #endif
  529. }
  530. }
  531. static void measure_bandwidth_between_host_and_dev(int dev, struct dev_timing *dev_timing_per_numa, char *type)
  532. {
  533. measure_bandwidth_between_numa_nodes_and_dev(dev, dev_timing_per_numa, type);
  534. #ifdef STARPU_VERBOSE
  535. struct _starpu_machine_config * config = _starpu_get_machine_config();
  536. const unsigned nnuma_nodes = _starpu_topology_get_nnumanodes(config);
  537. unsigned numa_id;
  538. for (numa_id = 0; numa_id < nnuma_nodes; numa_id++)
  539. {
  540. const unsigned timing_numa_index = dev*STARPU_MAXNUMANODES + numa_id;
  541. double bandwidth_dtoh = dev_timing_per_numa[timing_numa_index].timing_dtoh;
  542. double bandwidth_htod = dev_timing_per_numa[timing_numa_index].timing_htod;
  543. double bandwidth_sum2 = bandwidth_dtoh*bandwidth_dtoh + bandwidth_htod*bandwidth_htod;
  544. _STARPU_DISP("(%10s) BANDWIDTH GPU %d NUMA %u - htod %f - dtoh %f - %f\n", type, dev, numa_id, bandwidth_htod, bandwidth_dtoh, sqrt(bandwidth_sum2));
  545. }
  546. #endif
  547. }
  548. #endif /* defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL) */
  549. static void measure_bandwidth_latency_between_numa(int numa_src, int numa_dst)
  550. {
  551. #if defined(STARPU_HAVE_HWLOC)
  552. if (nnumas > 1)
  553. {
  554. /* NUMA mode activated */
  555. double start, end, timing;
  556. unsigned iter;
  557. unsigned char *h_buffer;
  558. hwloc_obj_t obj_src = hwloc_get_obj_by_type(hwtopology, HWLOC_OBJ_NODE, numa_src);
  559. h_buffer = hwloc_alloc_membind_nodeset(hwtopology, SIZE, obj_src->nodeset, HWLOC_MEMBIND_BIND, 0);
  560. unsigned char *d_buffer;
  561. hwloc_obj_t obj_dst = hwloc_get_obj_by_type(hwtopology, HWLOC_OBJ_NODE, numa_dst);
  562. d_buffer = hwloc_alloc_membind_nodeset(hwtopology, SIZE, obj_dst->nodeset, HWLOC_MEMBIND_BIND, 0);
  563. memset(h_buffer, 0, SIZE);
  564. start = starpu_timing_now();
  565. for (iter = 0; iter < NITER; iter++)
  566. {
  567. memcpy(d_buffer, h_buffer, SIZE);
  568. }
  569. end = starpu_timing_now();
  570. timing = end - start;
  571. numa_timing[numa_src][numa_dst] = timing/NITER/SIZE;
  572. start = starpu_timing_now();
  573. for (iter = 0; iter < NITER; iter++)
  574. {
  575. memcpy(d_buffer, h_buffer, 1);
  576. }
  577. end = starpu_timing_now();
  578. timing = end - start;
  579. numa_latency[numa_src][numa_dst] = timing/NITER;
  580. hwloc_free(hwtopology, h_buffer, SIZE);
  581. hwloc_free(hwtopology, d_buffer, SIZE);
  582. }
  583. else
  584. #endif
  585. {
  586. /* Cannot make a real calibration */
  587. numa_timing[numa_src][numa_dst] = 0.01;
  588. numa_latency[numa_src][numa_dst] = 0;
  589. }
  590. }
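/* numa_timing[src][dst] thus holds a slowness in µs per byte and
 * numa_latency[src][dst] a latency in µs, matching the units used for the GPU
 * measurements above. When hwloc is not available (or there is a single NUMA
 * node), an arbitrary default slowness of 0.01 µs/byte and zero latency are
 * used instead. */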
  591. static void benchmark_all_gpu_devices(void)
  592. {
  593. #ifdef STARPU_SIMGRID
_STARPU_DISP("Cannot measure the bus in SimGrid mode; please run starpu_calibrate_bus in non-SimGrid mode to make sure the bus performance model is calibrated\n");
  595. STARPU_ABORT();
  596. #else /* !SIMGRID */
  597. unsigned i, j;
  598. _STARPU_DEBUG("Benchmarking the speed of the bus\n");
  599. #ifdef STARPU_HAVE_HWLOC
  600. hwloc_topology_init(&hwtopology);
  601. hwloc_topology_load(hwtopology);
  602. #endif
  603. #ifdef STARPU_HAVE_HWLOC
  604. hwloc_bitmap_t former_cpuset = hwloc_bitmap_alloc();
  605. hwloc_get_cpubind(hwtopology, former_cpuset, HWLOC_CPUBIND_THREAD);
  606. #elif __linux__
  607. /* Save the current cpu binding */
  608. cpu_set_t former_process_affinity;
  609. int ret;
  610. ret = sched_getaffinity(0, sizeof(former_process_affinity), &former_process_affinity);
  611. if (ret)
  612. {
  613. perror("sched_getaffinity");
  614. STARPU_ABORT();
  615. }
  616. #else
  617. #warning Missing binding support, StarPU will not be able to properly benchmark NUMA topology
  618. #endif
  619. struct _starpu_machine_config *config = _starpu_get_machine_config();
  620. ncpus = _starpu_topology_get_nhwcpu(config);
  621. nnumas = _starpu_topology_get_nnumanodes(config);
  622. for (i = 0; i < nnumas; i++)
  623. for (j = 0; j < nnumas; j++)
  624. if (i != j)
  625. {
_STARPU_DISP("NUMA %u -> %u...\n", i, j);
  627. measure_bandwidth_latency_between_numa(i, j);
  628. }
  629. #ifdef STARPU_USE_CUDA
  630. ncuda = _starpu_get_cuda_device_count();
  631. for (i = 0; i < ncuda; i++)
  632. {
  633. _STARPU_DISP("CUDA %u...\n", i);
  634. /* measure bandwidth between Host and Device i */
  635. measure_bandwidth_between_host_and_dev(i, cudadev_timing_per_numa, "CUDA");
  636. }
  637. #ifdef HAVE_CUDA_MEMCPY_PEER
  638. for (i = 0; i < ncuda; i++)
  639. {
  640. unsigned j;
  641. for (j = 0; j < ncuda; j++)
  642. if (i != j)
  643. {
  644. _STARPU_DISP("CUDA %u -> %u...\n", i, j);
/* measure bandwidth between Device i and Device j */
  646. measure_bandwidth_between_dev_and_dev_cuda(i, j);
  647. }
  648. }
  649. #endif
  650. #endif
  651. #ifdef STARPU_USE_OPENCL
  652. nopencl = _starpu_opencl_get_device_count();
  653. for (i = 0; i < nopencl; i++)
  654. {
  655. _STARPU_DISP("OpenCL %u...\n", i);
/* measure bandwidth between Host and Device i */
  657. measure_bandwidth_between_host_and_dev(i, opencldev_timing_per_numa, "OpenCL");
  658. }
  659. #endif
  660. #ifdef STARPU_USE_MIC
/* TODO: implement real calibration! For now we only set an arbitrary
* value for each device at declaration time as a workaround, otherwise
* the heft scheduler runs into problems. */
  664. nmic = _starpu_mic_src_get_device_count();
  665. for (i = 0; i < STARPU_MAXNODES; i++)
  666. {
  667. mic_time_host_to_device[i] = 0.1;
  668. mic_time_device_to_host[i] = 0.1;
  669. }
  670. #endif /* STARPU_USE_MIC */
  671. #ifdef STARPU_USE_MPI_MASTER_SLAVE
  672. _starpu_mpi_common_measure_bandwidth_latency(mpi_time_device_to_device, mpi_latency_device_to_device);
  673. #endif /* STARPU_USE_MPI_MASTER_SLAVE */
  674. #ifdef STARPU_HAVE_HWLOC
  675. hwloc_set_cpubind(hwtopology, former_cpuset, HWLOC_CPUBIND_THREAD);
  676. hwloc_bitmap_free(former_cpuset);
  677. #elif __linux__
  678. /* Restore the former affinity */
  679. ret = sched_setaffinity(0, sizeof(former_process_affinity), &former_process_affinity);
  680. if (ret)
  681. {
  682. perror("sched_setaffinity");
  683. STARPU_ABORT();
  684. }
  685. #endif
  686. #ifdef STARPU_HAVE_HWLOC
  687. hwloc_topology_destroy(hwtopology);
  688. #endif
  689. _STARPU_DEBUG("Benchmarking the speed of the bus is done.\n");
  690. was_benchmarked = 1;
  691. #endif /* !SIMGRID */
  692. }
  693. static void get_bus_path(const char *type, char *path, size_t maxlen)
  694. {
  695. char hostname[65];
  696. _starpu_gethostname(hostname, sizeof(hostname));
  697. snprintf(path, maxlen, "%s%s.%s", _starpu_get_perf_model_dir_bus(), hostname, type);
  698. }
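/* For example, with the default performance-model directory this typically
 * yields something like <perf_model_dir>/bus/<hostname>.latency or
 * <hostname>.bandwidth -- the exact prefix is whatever
 * _starpu_get_perf_model_dir_bus() returns (often under ~/.starpu/sampling/),
 * so treat the concrete path here as an illustration only. */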
  699. /*
  700. * Affinity
  701. */
  702. static void get_affinity_path(char *path, size_t maxlen)
  703. {
  704. get_bus_path("affinity", path, maxlen);
  705. }
  706. #ifndef STARPU_SIMGRID
  707. static void load_bus_affinity_file_content(void)
  708. {
  709. #if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
  710. FILE *f;
  711. int locked;
  712. char path[256];
  713. get_affinity_path(path, sizeof(path));
  714. _STARPU_DEBUG("loading affinities from %s\n", path);
  715. f = fopen(path, "r");
  716. STARPU_ASSERT(f);
  717. locked = _starpu_frdlock(f) == 0;
  718. unsigned gpu;
  719. #ifdef STARPU_USE_CUDA
  720. ncuda = _starpu_get_cuda_device_count();
  721. for (gpu = 0; gpu < ncuda; gpu++)
  722. {
  723. int ret;
  724. unsigned dummy;
  725. _starpu_drop_comments(f);
  726. ret = fscanf(f, "%u\t", &dummy);
  727. STARPU_ASSERT(ret == 1);
  728. STARPU_ASSERT(dummy == gpu);
  729. unsigned numa;
  730. for (numa = 0; numa < nnumas; numa++)
  731. {
  732. ret = fscanf(f, "%d\t", &cuda_affinity_matrix[gpu][numa]);
  733. STARPU_ASSERT(ret == 1);
  734. }
  735. ret = fscanf(f, "\n");
  736. STARPU_ASSERT(ret == 0);
  737. }
  738. #endif /* !STARPU_USE_CUDA */
  739. #ifdef STARPU_USE_OPENCL
  740. nopencl = _starpu_opencl_get_device_count();
  741. for (gpu = 0; gpu < nopencl; gpu++)
  742. {
  743. int ret;
  744. unsigned dummy;
  745. _starpu_drop_comments(f);
  746. ret = fscanf(f, "%u\t", &dummy);
  747. STARPU_ASSERT(ret == 1);
  748. STARPU_ASSERT(dummy == gpu);
  749. unsigned numa;
  750. for (numa = 0; numa < nnumas; numa++)
  751. {
  752. ret = fscanf(f, "%d\t", &opencl_affinity_matrix[gpu][numa]);
  753. STARPU_ASSERT(ret == 1);
  754. }
  755. ret = fscanf(f, "\n");
  756. STARPU_ASSERT(ret == 0);
  757. }
  758. #endif /* !STARPU_USE_OPENCL */
  759. if (locked)
  760. _starpu_frdunlock(f);
  761. fclose(f);
#endif /* !(STARPU_USE_CUDA || STARPU_USE_OPENCL) */
  763. }
  764. #ifndef STARPU_SIMGRID
  765. static void write_bus_affinity_file_content(void)
  766. {
  767. STARPU_ASSERT(was_benchmarked);
  768. #if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
  769. FILE *f;
  770. char path[256];
  771. int locked;
  772. get_affinity_path(path, sizeof(path));
  773. _STARPU_DEBUG("writing affinities to %s\n", path);
  774. f = fopen(path, "w+");
  775. if (!f)
  776. {
  777. perror("fopen write_buf_affinity_file_content");
  778. _STARPU_DISP("path '%s'\n", path);
  779. fflush(stderr);
  780. STARPU_ABORT();
  781. }
  782. locked = _starpu_frdlock(f) == 0;
  783. unsigned numa;
  784. unsigned gpu;
  785. fprintf(f, "# GPU\t");
  786. for (numa = 0; numa < nnumas; numa++)
  787. fprintf(f, "NUMA%u\t", numa);
  788. fprintf(f, "\n");
  789. #ifdef STARPU_USE_CUDA
  790. {
/* Use another array to sort the bandwidth */
  792. struct dev_timing cudadev_timing_per_numa_sorted[STARPU_MAXCUDADEVS*STARPU_MAXNUMANODES];
  793. memcpy(cudadev_timing_per_numa_sorted, cudadev_timing_per_numa, STARPU_MAXCUDADEVS*STARPU_MAXNUMANODES*sizeof(struct dev_timing));
  794. for (gpu = 0; gpu < ncuda; gpu++)
  795. {
  796. fprintf(f, "%u\t", gpu);
  797. qsort(&(cudadev_timing_per_numa_sorted[gpu*STARPU_MAXNUMANODES]), nnumas, sizeof(struct dev_timing), compar_dev_timing);
  798. for (numa = 0; numa < nnumas; numa++)
  799. {
  800. fprintf(f, "%d\t", cudadev_timing_per_numa_sorted[gpu*STARPU_MAXNUMANODES+numa].numa_id);
  801. }
  802. fprintf(f, "\n");
  803. }
  804. }
  805. #endif
  806. #ifdef STARPU_USE_OPENCL
  807. {
/* Use another array to sort the bandwidth */
  809. struct dev_timing opencldev_timing_per_numa_sorted[STARPU_MAXOPENCLDEVS*STARPU_MAXNUMANODES];
  810. memcpy(opencldev_timing_per_numa_sorted, opencldev_timing_per_numa, STARPU_MAXOPENCLDEVS*STARPU_MAXNUMANODES*sizeof(struct dev_timing));
  811. for (gpu = 0; gpu < nopencl; gpu++)
  812. {
  813. fprintf(f, "%u\t", gpu);
  814. qsort(&(opencldev_timing_per_numa_sorted[gpu*STARPU_MAXNUMANODES]), nnumas, sizeof(struct dev_timing), compar_dev_timing);
  815. for (numa = 0; numa < nnumas; numa++)
  816. {
  817. fprintf(f, "%d\t", opencldev_timing_per_numa_sorted[gpu*STARPU_MAXNUMANODES+numa].numa_id);
  818. }
  819. fprintf(f, "\n");
  820. }
  821. }
  822. #endif
  823. if (locked)
  824. _starpu_frdunlock(f);
  825. fclose(f);
  826. #endif
  827. }
  828. #endif /* STARPU_SIMGRID */
  829. static void generate_bus_affinity_file(void)
  830. {
  831. if (!was_benchmarked)
  832. benchmark_all_gpu_devices();
  833. #ifdef STARPU_USE_MPI_MASTER_SLAVE
  834. /* Slaves don't write files */
  835. if (!_starpu_mpi_common_is_src_node())
  836. return;
  837. #endif
  838. write_bus_affinity_file_content();
  839. }
  840. static int check_bus_affinity_file(void)
  841. {
  842. int ret = 1;
  843. #if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
  844. FILE *f;
  845. int locked;
  846. unsigned dummy;
  847. char path[256];
  848. get_affinity_path(path, sizeof(path));
  849. _STARPU_DEBUG("loading affinities from %s\n", path);
  850. f = fopen(path, "r");
  851. STARPU_ASSERT(f);
  852. locked = _starpu_frdlock(f) == 0;
  853. ret = fscanf(f, "# GPU\t");
  854. STARPU_ASSERT(ret == 0);
  855. ret = fscanf(f, "NUMA%u\t", &dummy);
  856. if (locked)
  857. _starpu_frdunlock(f);
  858. fclose(f);
  859. #endif
  860. return ret == 1;
  861. }
  862. static void load_bus_affinity_file(void)
  863. {
  864. int exist, check = 1;
  865. char path[256];
  866. get_affinity_path(path, sizeof(path));
/* access() returns 0 if the file exists */
exist = access(path, F_OK);
if (exist == 0)
/* check_bus_affinity_file() returns 0 if the file format is not usable */
check = check_bus_affinity_file();
  872. if (check == 0)
_STARPU_DISP("Affinity file is too old for this version of StarPU! Rebuilding it...\n");
  874. if (check == 0 || exist != 0)
  875. {
  876. /* File does not exist yet */
  877. generate_bus_affinity_file();
  878. }
  879. load_bus_affinity_file_content();
  880. }
  881. #ifdef STARPU_USE_CUDA
  882. int *_starpu_get_cuda_affinity_vector(unsigned gpuid)
  883. {
  884. return cuda_affinity_matrix[gpuid];
  885. }
  886. #endif /* STARPU_USE_CUDA */
  887. #ifdef STARPU_USE_OPENCL
  888. int *_starpu_get_opencl_affinity_vector(unsigned gpuid)
  889. {
  890. return opencl_affinity_matrix[gpuid];
  891. }
  892. #endif /* STARPU_USE_OPENCL */
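/* Sketch of how a driver-side caller might use these affinity vectors (purely
 * illustrative, not code from this file): the vector lists NUMA node logical
 * indexes in preference order, so the first entry is the closest memory node
 * for that GPU.
 *
 *   #ifdef STARPU_USE_CUDA
 *   int *pref = _starpu_get_cuda_affinity_vector(gpuid);
 *   int best_numa = pref[0];  // bind the driver thread / allocate near this node
 *   #endif
 */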
  893. void starpu_bus_print_affinity(FILE *f)
  894. {
  895. #if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
  896. unsigned numa;
  897. unsigned gpu;
  898. #endif
  899. fprintf(f, "# GPU\tNUMA in preference order (logical index)\n");
  900. #ifdef STARPU_USE_CUDA
  901. fprintf(f, "# CUDA\n");
  902. for(gpu = 0 ; gpu<ncuda ; gpu++)
  903. {
  904. fprintf(f, "%u\t", gpu);
  905. for (numa = 0; numa < nnumas; numa++)
  906. {
  907. fprintf(f, "%d\t", cuda_affinity_matrix[gpu][numa]);
  908. }
  909. fprintf(f, "\n");
  910. }
  911. #endif
  912. #ifdef STARPU_USE_OPENCL
  913. fprintf(f, "# OpenCL\n");
  914. for(gpu = 0 ; gpu<nopencl ; gpu++)
  915. {
  916. fprintf(f, "%u\t", gpu);
  917. for (numa = 0; numa < nnumas; numa++)
  918. {
  919. fprintf(f, "%d\t", opencl_affinity_matrix[gpu][numa]);
  920. }
  921. fprintf(f, "\n");
  922. }
  923. #endif
  924. }
  925. #endif /* STARPU_SIMGRID */
  926. /*
  927. * Latency
  928. */
  929. static void get_latency_path(char *path, size_t maxlen)
  930. {
  931. get_bus_path("latency", path, maxlen);
  932. }
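/* The latency file is a tab-separated STARPU_MAXNODES x STARPU_MAXNODES
 * matrix written by write_bus_latency_file_content() below: one row per
 * source node, one column per destination node, latencies in µs, and NAN for
 * node slots that are unused in the current configuration. */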
  933. static int load_bus_latency_file_content(void)
  934. {
  935. int n;
  936. unsigned src, dst;
  937. FILE *f;
  938. double latency;
  939. int locked;
  940. char path[256];
  941. get_latency_path(path, sizeof(path));
  942. _STARPU_DEBUG("loading latencies from %s\n", path);
  943. f = fopen(path, "r");
  944. if (!f)
  945. {
  946. perror("fopen load_bus_latency_file_content");
  947. _STARPU_DISP("path '%s'\n", path);
  948. fflush(stderr);
  949. STARPU_ABORT();
  950. }
  951. locked = _starpu_frdlock(f) == 0;
  952. for (src = 0; src < STARPU_MAXNODES; src++)
  953. {
  954. _starpu_drop_comments(f);
  955. for (dst = 0; dst < STARPU_MAXNODES; dst++)
  956. {
  957. n = _starpu_read_double(f, "%le", &latency);
  958. if (n != 1)
  959. {
  960. _STARPU_DISP("Error while reading latency file <%s>. Expected a number. Did you change the maximum number of GPUs at ./configure time?\n", path);
  961. fclose(f);
  962. return 0;
  963. }
  964. n = getc(f);
  965. if (n == '\n')
  966. break;
  967. if (n != '\t')
  968. {
  969. _STARPU_DISP("bogus character '%c' (%d) in latency file %s\n", n, n, path);
  970. fclose(f);
  971. return 0;
  972. }
  973. latency_matrix[src][dst] = latency;
  974. /* Look out for \t\n */
  975. n = getc(f);
  976. if (n == '\n')
  977. break;
  978. ungetc(n, f);
  979. n = '\t';
  980. }
  981. /* No more values, take NAN */
  982. for ( ; dst < STARPU_MAXNODES; dst++)
  983. latency_matrix[src][dst] = NAN;
  984. while (n == '\t')
  985. {
  986. /* Look out for \t\n */
  987. n = getc(f);
  988. if (n == '\n')
  989. break;
  990. ungetc(n, f);
  991. n = _starpu_read_double(f, "%le", &latency);
  992. if (n && !isnan(latency))
  993. {
  994. _STARPU_DISP("Too many nodes in latency file %s for this configuration (%d). Did you change the maximum number of GPUs at ./configure time?\n", path, STARPU_MAXNODES);
  995. fclose(f);
  996. return 0;
  997. }
  998. n = getc(f);
  999. }
  1000. if (n != '\n')
  1001. {
  1002. _STARPU_DISP("Bogus character '%c' (%d) in latency file %s\n", n, n, path);
  1003. fclose(f);
  1004. return 0;
  1005. }
  1006. /* Look out for EOF */
  1007. n = getc(f);
  1008. if (n == EOF)
  1009. break;
  1010. ungetc(n, f);
  1011. }
  1012. if (locked)
  1013. _starpu_frdunlock(f);
  1014. fclose(f);
  1015. /* No more values, take NAN */
  1016. for ( ; src < STARPU_MAXNODES; src++)
  1017. for (dst = 0; dst < STARPU_MAXNODES; dst++)
  1018. latency_matrix[src][dst] = NAN;
  1019. return 1;
  1020. }
  1021. #ifndef STARPU_SIMGRID
  1022. static double search_bus_best_latency(int src, char * type, int htod)
  1023. {
  1024. /* Search the best latency for this node */
  1025. double best = 0.0;
  1026. double actual = 0.0;
  1027. unsigned check = 0;
  1028. unsigned numa;
  1029. for (numa = 0; numa < nnumas; numa++)
  1030. {
  1031. #ifdef STARPU_USE_CUDA
  1032. if (strncmp(type, "CUDA", 4) == 0)
  1033. {
  1034. if (htod)
  1035. actual = cudadev_timing_per_numa[src*STARPU_MAXNUMANODES+numa].latency_htod;
  1036. else
  1037. actual = cudadev_timing_per_numa[src*STARPU_MAXNUMANODES+numa].latency_dtoh;
  1038. }
  1039. #endif
  1040. #ifdef STARPU_USE_OPENCL
  1041. if (strncmp(type, "OpenCL", 6) == 0)
  1042. {
  1043. if (htod)
  1044. actual = opencldev_timing_per_numa[src*STARPU_MAXNUMANODES+numa].latency_htod;
  1045. else
  1046. actual = opencldev_timing_per_numa[src*STARPU_MAXNUMANODES+numa].latency_dtoh;
  1047. }
  1048. #endif
  1049. #if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
  1050. if (!check || actual < best)
  1051. {
  1052. best = actual;
  1053. check = 1;
  1054. }
  1055. #endif
  1056. }
  1057. return best;
  1058. }
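/* In the bus files, memory nodes are numbered in a fixed order: first the
 * NUMA nodes, then (when enabled) the CUDA devices, OpenCL devices, MIC
 * devices and MPI master-slave devices. The b_low/b_up bounds below delimit
 * each of these ranges so that src/dst indexes can be mapped back to a device
 * of the corresponding class. */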
  1059. static void write_bus_latency_file_content(void)
  1060. {
  1061. unsigned src, dst, maxnode;
  1062. /* Boundaries to check if src or dst are inside the interval */
  1063. unsigned b_low, b_up;
  1064. FILE *f;
  1065. int locked;
  1066. STARPU_ASSERT(was_benchmarked);
  1067. char path[256];
  1068. get_latency_path(path, sizeof(path));
  1069. _STARPU_DEBUG("writing latencies to %s\n", path);
  1070. f = fopen(path, "w+");
  1071. if (!f)
  1072. {
  1073. perror("fopen write_bus_latency_file_content");
  1074. _STARPU_DISP("path '%s'\n", path);
  1075. fflush(stderr);
  1076. STARPU_ABORT();
  1077. }
  1078. locked = _starpu_fwrlock(f) == 0;
  1079. _starpu_fftruncate(f, 0);
  1080. fprintf(f, "# ");
  1081. for (dst = 0; dst < STARPU_MAXNODES; dst++)
  1082. fprintf(f, "to %u\t\t", dst);
  1083. fprintf(f, "\n");
  1084. maxnode = nnumas;
  1085. #ifdef STARPU_USE_CUDA
  1086. maxnode += ncuda;
  1087. #endif
  1088. #ifdef STARPU_USE_OPENCL
  1089. maxnode += nopencl;
  1090. #endif
  1091. #ifdef STARPU_USE_MIC
  1092. maxnode += nmic;
  1093. #endif
  1094. #ifdef STARPU_USE_MPI_MASTER_SLAVE
  1095. maxnode += nmpi_ms;
  1096. #endif
  1097. for (src = 0; src < STARPU_MAXNODES; src++)
  1098. {
  1099. for (dst = 0; dst < STARPU_MAXNODES; dst++)
  1100. {
  1101. /* µs */
  1102. double latency = 0.0;
  1103. if ((src >= maxnode) || (dst >= maxnode))
  1104. {
  1105. /* convention */
  1106. latency = NAN;
  1107. }
  1108. else if (src == dst)
  1109. {
  1110. latency = 0.0;
  1111. }
  1112. else
  1113. {
  1114. b_low = b_up = 0;
  1115. /* ---- Begin NUMA ---- */
  1116. b_up += nnumas;
  1117. if (src >= b_low && src < b_up && dst >= b_low && dst < b_up)
  1118. latency += numa_latency[src-b_low][dst-b_low];
  1119. /* copy interval to check numa index later */
  1120. unsigned numa_low = b_low;
  1121. unsigned numa_up = b_up;
  1122. b_low += nnumas;
  1123. /* ---- End NUMA ---- */
  1124. #ifdef STARPU_USE_CUDA
  1125. b_up += ncuda;
  1126. #ifdef HAVE_CUDA_MEMCPY_PEER
  1127. if (src >= b_low && src < b_up && dst >= b_low && dst < b_up)
  1128. latency += cudadev_latency_dtod[src-b_low][dst-b_low];
  1129. else
  1130. #endif
  1131. {
  1132. /* Check if it's CUDA <-> NUMA link */
  1133. if (src >=b_low && src < b_up && dst >= numa_low && dst < numa_up)
  1134. latency += cudadev_timing_per_numa[(src-b_low)*STARPU_MAXNUMANODES+dst-numa_low].latency_dtoh;
if (dst >= b_low && dst < b_up && src >= numa_low && src < numa_up)
  1136. latency += cudadev_timing_per_numa[(dst-b_low)*STARPU_MAXNUMANODES+src-numa_low].latency_htod;
  1137. /* To other devices, take the best latency */
  1138. if (src >= b_low && src < b_up && !(dst >= numa_low && dst < numa_up))
  1139. latency += search_bus_best_latency(src-b_low, "CUDA", 0);
if (dst >= b_low && dst < b_up && !(src >= numa_low && src < numa_up))
  1141. latency += search_bus_best_latency(dst-b_low, "CUDA", 1);
  1142. }
  1143. b_low += ncuda;
  1144. #endif
  1145. #ifdef STARPU_USE_OPENCL
  1146. b_up += nopencl;
  1147. /* Check if it's OpenCL <-> NUMA link */
  1148. if (src >= b_low && src < b_up && dst >= numa_low && dst < numa_up)
  1149. latency += opencldev_timing_per_numa[(src-b_low)*STARPU_MAXNUMANODES+dst-numa_low].latency_dtoh;
if (dst >= b_low && dst < b_up && src >= numa_low && src < numa_up)
  1151. latency += opencldev_timing_per_numa[(dst-b_low)*STARPU_MAXNUMANODES+src-numa_low].latency_htod;
  1152. /* To other devices, take the best latency */
  1153. if (src >= b_low && src < b_up && !(dst >= numa_low && dst < numa_up))
  1154. latency += search_bus_best_latency(src-b_low, "OpenCL", 0);
if (dst >= b_low && dst < b_up && !(src >= numa_low && src < numa_up))
  1156. latency += search_bus_best_latency(dst-b_low, "OpenCL", 1);
  1157. b_low += nopencl;
  1158. #endif
  1159. #ifdef STARPU_USE_MIC
  1160. b_up += nmic;
  1161. /* TODO Latency MIC */
  1162. b_low += nmic;
  1163. #endif
  1164. #ifdef STARPU_USE_MPI_MASTER_SLAVE
  1165. b_up += nmpi_ms;
/* Shift the MPI src and dst indexes past the master node when needed,
* because only the slaves are considered here */
  1168. int mpi_master = _starpu_mpi_common_get_src_node();
  1169. int mpi_src = src - b_low;
  1170. mpi_src = (mpi_master <= mpi_src) ? mpi_src+1 : mpi_src;
  1171. int mpi_dst = dst - b_low;
  1172. mpi_dst = (mpi_master <= mpi_dst) ? mpi_dst+1 : mpi_dst;
  1173. if (src >= b_low && src < b_up && dst >= b_low && dst < b_up)
  1174. latency += mpi_latency_device_to_device[mpi_src][mpi_dst];
  1175. else
  1176. {
  1177. if (src >= b_low && src < b_up)
  1178. latency += mpi_latency_device_to_device[mpi_src][mpi_master];
  1179. if (dst >= b_low && dst < b_up)
  1180. latency += mpi_latency_device_to_device[mpi_master][mpi_dst];
  1181. }
  1182. b_low += nmpi_ms;
  1183. #endif
  1184. }
  1185. if (dst > 0)
  1186. fputc('\t', f);
  1187. _starpu_write_double(f, "%e", latency);
  1188. }
  1189. fprintf(f, "\n");
  1190. }
  1191. if (locked)
  1192. _starpu_fwrunlock(f);
  1193. fclose(f);
  1194. }
  1195. #endif
  1196. static void generate_bus_latency_file(void)
  1197. {
  1198. if (!was_benchmarked)
  1199. benchmark_all_gpu_devices();
  1200. #ifdef STARPU_USE_MPI_MASTER_SLAVE
  1201. /* Slaves don't write files */
  1202. if (!_starpu_mpi_common_is_src_node())
  1203. return;
  1204. #endif
  1205. #ifndef STARPU_SIMGRID
  1206. write_bus_latency_file_content();
  1207. #endif
  1208. }
  1209. static void load_bus_latency_file(void)
  1210. {
  1211. int res;
  1212. char path[256];
  1213. get_latency_path(path, sizeof(path));
  1214. res = access(path, F_OK);
  1215. if (res || !load_bus_latency_file_content())
  1216. {
  1217. /* File does not exist yet or is bogus */
  1218. generate_bus_latency_file();
  1219. }
  1220. }
  1221. /*
  1222. * Bandwidth
  1223. */
  1224. static void get_bandwidth_path(char *path, size_t maxlen)
  1225. {
  1226. get_bus_path("bandwidth", path, maxlen);
  1227. }
  1228. static int load_bus_bandwidth_file_content(void)
  1229. {
  1230. int n;
  1231. unsigned src, dst;
  1232. FILE *f;
  1233. double bandwidth;
  1234. int locked;
  1235. char path[256];
  1236. get_bandwidth_path(path, sizeof(path));
  1237. _STARPU_DEBUG("loading bandwidth from %s\n", path);
  1238. f = fopen(path, "r");
  1239. if (!f)
  1240. {
  1241. perror("fopen load_bus_bandwidth_file_content");
  1242. _STARPU_DISP("path '%s'\n", path);
  1243. fflush(stderr);
  1244. STARPU_ABORT();
  1245. }
  1246. locked = _starpu_frdlock(f) == 0;
  1247. for (src = 0; src < STARPU_MAXNODES; src++)
  1248. {
  1249. _starpu_drop_comments(f);
  1250. for (dst = 0; dst < STARPU_MAXNODES; dst++)
  1251. {
  1252. n = _starpu_read_double(f, "%le", &bandwidth);
  1253. if (n != 1)
  1254. {
  1255. _STARPU_DISP("Error while reading bandwidth file <%s>. Expected a number\n", path);
  1256. fclose(f);
  1257. return 0;
  1258. }
  1259. n = getc(f);
  1260. if (n == '\n')
  1261. break;
  1262. if (n != '\t')
  1263. {
  1264. _STARPU_DISP("bogus character '%c' (%d) in bandwidth file %s\n", n, n, path);
  1265. fclose(f);
  1266. return 0;
  1267. }
  1268. bandwidth_matrix[src][dst] = bandwidth;
  1269. /* Look out for \t\n */
  1270. n = getc(f);
  1271. if (n == '\n')
  1272. break;
  1273. ungetc(n, f);
  1274. n = '\t';
  1275. }
  1276. /* No more values, take NAN */
  1277. for ( ; dst < STARPU_MAXNODES; dst++)
  1278. bandwidth_matrix[src][dst] = NAN;
  1279. while (n == '\t')
  1280. {
  1281. /* Look out for \t\n */
  1282. n = getc(f);
  1283. if (n == '\n')
  1284. break;
  1285. ungetc(n, f);
  1286. n = _starpu_read_double(f, "%le", &bandwidth);
  1287. if (n && !isnan(bandwidth))
  1288. {
  1289. _STARPU_DISP("Too many nodes in bandwidth file %s for this configuration (%d)\n", path, STARPU_MAXNODES);
  1290. fclose(f);
  1291. return 0;
  1292. }
  1293. n = getc(f);
  1294. }
  1295. if (n != '\n')
  1296. {
  1297. _STARPU_DISP("Bogus character '%c' (%d) in bandwidth file %s\n", n, n, path);
  1298. fclose(f);
  1299. return 0;
  1300. }
  1301. /* Look out for EOF */
  1302. n = getc(f);
  1303. if (n == EOF)
  1304. break;
  1305. ungetc(n, f);
  1306. }
  1307. if (locked)
  1308. _starpu_frdunlock(f);
  1309. fclose(f);
  1310. /* No more values, take NAN */
  1311. for ( ; src < STARPU_MAXNODES; src++)
  1312. for (dst = 0; dst < STARPU_MAXNODES; dst++)
bandwidth_matrix[src][dst] = NAN;
  1314. return 1;
  1315. }
  1316. #ifndef STARPU_SIMGRID
  1317. static double search_bus_best_timing(int src, char * type, int htod)
  1318. {
/* Search the best timing for this node */
  1320. double best = 0.0;
  1321. double actual = 0.0;
  1322. unsigned check = 0;
  1323. unsigned numa;
  1324. for (numa = 0; numa < nnumas; numa++)
  1325. {
  1326. #ifdef STARPU_USE_CUDA
  1327. if (strncmp(type, "CUDA", 4) == 0)
  1328. {
  1329. if (htod)
  1330. actual = cudadev_timing_per_numa[src*STARPU_MAXNUMANODES+numa].timing_htod;
  1331. else
actual = cudadev_timing_per_numa[src*STARPU_MAXNUMANODES+numa].timing_dtoh;
  1333. }
  1334. #endif
  1335. #ifdef STARPU_USE_OPENCL
  1336. if (strncmp(type, "OpenCL", 6) == 0)
  1337. {
  1338. if (htod)
  1339. actual = opencldev_timing_per_numa[src*STARPU_MAXNUMANODES+numa].timing_htod;
  1340. else
  1341. actual = opencldev_timing_per_numa[src*STARPU_MAXNUMANODES+numa].timing_dtoh;
  1342. }
  1343. #endif
  1344. #if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
  1345. if (!check || actual < best)
  1346. {
  1347. best = actual;
  1348. check = 1;
  1349. }
  1350. #endif
  1351. }
  1352. return best;
  1353. }
  1354. static void write_bus_bandwidth_file_content(void)
  1355. {
  1356. unsigned src, dst, maxnode;
  1357. unsigned b_low, b_up;
  1358. FILE *f;
  1359. int locked;
  1360. STARPU_ASSERT(was_benchmarked);
  1361. char path[256];
  1362. get_bandwidth_path(path, sizeof(path));
  1363. _STARPU_DEBUG("writing bandwidth to %s\n", path);
  1364. f = fopen(path, "w+");
  1365. STARPU_ASSERT(f);
  1366. locked = _starpu_fwrlock(f) == 0;
  1367. _starpu_fftruncate(f, 0);
  1368. fprintf(f, "# ");
  1369. for (dst = 0; dst < STARPU_MAXNODES; dst++)
  1370. fprintf(f, "to %u\t\t", dst);
  1371. fprintf(f, "\n");
  1372. maxnode = nnumas;
  1373. #ifdef STARPU_USE_CUDA
  1374. maxnode += ncuda;
  1375. #endif
  1376. #ifdef STARPU_USE_OPENCL
  1377. maxnode += nopencl;
  1378. #endif
  1379. #ifdef STARPU_USE_MIC
  1380. maxnode += nmic;
  1381. #endif
  1382. #ifdef STARPU_USE_MPI_MASTER_SLAVE
  1383. maxnode += nmpi_ms;
  1384. #endif
  1385. for (src = 0; src < STARPU_MAXNODES; src++)
  1386. {
  1387. for (dst = 0; dst < STARPU_MAXNODES; dst++)
  1388. {
  1389. double bandwidth;
  1390. if ((src >= maxnode) || (dst >= maxnode))
  1391. {
  1392. bandwidth = NAN;
  1393. }
  1394. else if (src != dst)
  1395. {
  1396. double slowness = 0.0;
  1397. /* Total bandwidth is the harmonic mean of bandwidths */
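/* Illustrative example (made-up figures): a transfer that must cross two links
* sustaining 8 GB/s and 4 GB/s respectively gets 1 / (1/8 + 1/4) ~= 2.7 GB/s:
* the slownesses (inverse bandwidths) accumulated below add up along the path,
* and the resulting bandwidth is their harmonic combination. */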
  1398. b_low = b_up = 0;
  1399. /* Begin NUMA */
  1400. b_up += nnumas;
  1401. if (src >= b_low && src < b_up && dst >= b_low && dst < b_up)
  1402. slowness += numa_timing[src-b_low][dst-b_low];
  1403. /* copy interval to check numa index later */
  1404. unsigned numa_low = b_low;
  1405. unsigned numa_up = b_up;
  1406. b_low += nnumas;
  1407. /* End NUMA */
  1408. #ifdef STARPU_USE_CUDA
  1409. b_up += ncuda;
  1410. #ifdef HAVE_CUDA_MEMCPY_PEER
  1411. if (src >= b_low && src < b_up && dst >= b_low && dst < b_up)
/* Direct GPU-GPU transfer */
  1413. slowness += cudadev_timing_dtod[src-b_low][dst-b_low];
  1414. else
  1415. #endif
  1416. {
  1417. /* Check if it's CUDA <-> NUMA link */
  1418. if (src >= b_low && src < b_up && dst >= numa_low && dst < numa_up)
  1419. slowness += cudadev_timing_per_numa[(src-b_low)*STARPU_MAXNUMANODES+dst-numa_low].timing_dtoh;
if (dst >= b_low && dst < b_up && src >= numa_low && src < numa_up)
  1421. slowness += cudadev_timing_per_numa[(dst-b_low)*STARPU_MAXNUMANODES+src-numa_low].timing_htod;
  1422. /* To other devices, take the best slowness */
  1423. if (src >= b_low && src < b_up && !(dst >= numa_low && dst < numa_up))
  1424. slowness += search_bus_best_timing(src-b_low, "CUDA", 0);
if (dst >= b_low && dst < b_up && !(src >= numa_low && src < numa_up))
  1426. slowness += search_bus_best_timing(dst-b_low, "CUDA", 1);
  1427. }
  1428. b_low += ncuda;
  1429. #endif
  1430. #ifdef STARPU_USE_OPENCL
  1431. b_up += nopencl;
  1432. /* Check if it's OpenCL <-> NUMA link */
  1433. if (src >= b_low && src < b_up && dst >= numa_low && dst < numa_up)
  1434. slowness += opencldev_timing_per_numa[(src-b_low)*STARPU_MAXNUMANODES+dst-numa_low].timing_dtoh;
if (dst >= b_low && dst < b_up && src >= numa_low && src < numa_up)
  1436. slowness += opencldev_timing_per_numa[(dst-b_low)*STARPU_MAXNUMANODES+src-numa_low].timing_htod;
  1437. /* To other devices, take the best slowness */
  1438. if (src >= b_low && src < b_up && !(dst >= numa_low && dst < numa_up))
  1439. slowness += search_bus_best_timing(src-b_low, "OpenCL", 0);
if (dst >= b_low && dst < b_up && !(src >= numa_low && src < numa_up))
  1441. slowness += search_bus_best_timing(dst-b_low, "OpenCL", 1);
  1442. b_low += nopencl;
  1443. #endif
  1444. #ifdef STARPU_USE_MIC
  1445. b_up += nmic;
  1446. if (src >= b_low && src < b_up)
  1447. slowness += mic_time_device_to_host[src-b_low];
  1448. if (dst >= b_low && dst < b_up)
  1449. slowness += mic_time_host_to_device[dst-b_low];
  1450. b_low += nmic;
  1451. #endif
  1452. #ifdef STARPU_USE_MPI_MASTER_SLAVE
  1453. b_up += nmpi_ms;
/* Shift the MPI src and dst indices past the master rank,
* since only the slaves are device nodes here */
  1456. int mpi_master = _starpu_mpi_common_get_src_node();
  1457. int mpi_src = src - b_low;
  1458. mpi_src = (mpi_master <= mpi_src) ? mpi_src+1 : mpi_src;
  1459. int mpi_dst = dst - b_low;
  1460. mpi_dst = (mpi_master <= mpi_dst) ? mpi_dst+1 : mpi_dst;
  1461. if (src >= b_low && src < b_up && dst >= b_low && dst < b_up)
  1462. slowness += mpi_time_device_to_device[mpi_src][mpi_dst];
  1463. else
  1464. {
  1465. if (src >= b_low && src < b_up)
  1466. slowness += mpi_time_device_to_device[mpi_src][mpi_master];
  1467. if (dst >= b_low && dst < b_up)
  1468. slowness += mpi_time_device_to_device[mpi_master][mpi_dst];
  1469. }
  1470. b_low += nmpi_ms;
  1471. #endif
  1472. bandwidth = 1.0/slowness;
  1473. }
  1474. else
  1475. {
  1476. /* convention */
  1477. bandwidth = 0.0;
  1478. }
  1479. if (dst)
  1480. fputc('\t', f);
  1481. _starpu_write_double(f, "%e", bandwidth);
  1482. }
  1483. fprintf(f, "\n");
  1484. }
  1485. if (locked)
  1486. _starpu_fwrunlock(f);
  1487. fclose(f);
  1488. }
  1489. #endif /* STARPU_SIMGRID */
  1490. void starpu_bus_print_filenames(FILE *output)
  1491. {
  1492. char bandwidth_path[256];
  1493. char affinity_path[256];
  1494. char latency_path[256];
  1495. get_bandwidth_path(bandwidth_path, sizeof(bandwidth_path));
  1496. get_affinity_path(affinity_path, sizeof(affinity_path));
  1497. get_latency_path(latency_path, sizeof(latency_path));
  1498. fprintf(output, "bandwidth: <%s>\n", bandwidth_path);
  1499. fprintf(output, " affinity: <%s>\n", affinity_path);
  1500. fprintf(output, " latency: <%s>\n", latency_path);
  1501. }
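/* starpu_bus_print_bandwidth() below dumps the matrices in a human-readable
* table, e.g. (hypothetical machine with one NUMA node and two CUDA devices,
* bandwidths in MB/s):
*
* from/to NUMA_0 CUDA_0 CUDA_1
* RAM_0   0      5000   5200
* CUDA_0  5500   0      9800
* CUDA_1  5600   9700   0
*/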
  1502. void starpu_bus_print_bandwidth(FILE *f)
  1503. {
  1504. unsigned src, dst, maxnode;
  1505. maxnode = nnumas;
  1506. #ifdef STARPU_USE_CUDA
  1507. maxnode += ncuda;
  1508. #endif
  1509. #ifdef STARPU_USE_OPENCL
  1510. maxnode += nopencl;
  1511. #endif
  1512. #ifdef STARPU_USE_MIC
  1513. maxnode += nmic;
  1514. #endif
  1515. #ifdef STARPU_USE_MPI_MASTER_SLAVE
  1516. maxnode += nmpi_ms;
  1517. #endif
  1518. fprintf(f, "from/to\t");
  1519. for (dst = 0; dst < nnumas; dst++)
  1520. fprintf(f, "NUMA_%u\t", dst);
  1521. for (dst = 0; dst < ncuda; dst++)
  1522. fprintf(f, "CUDA_%u\t", dst);
  1523. for (dst = 0; dst < nopencl; dst++)
  1524. fprintf(f, "OpenCL%u\t", dst);
  1525. for (dst = 0; dst < nmic; dst++)
  1526. fprintf(f, "MIC_%u\t", dst);
  1527. for (dst = 0; dst < nmpi_ms; dst++)
  1528. fprintf(f, "MPI_MS%u\t", dst);
  1529. fprintf(f, "\n");
  1530. for (src = 0; src < maxnode; src++)
  1531. {
  1532. if (src < nnumas)
  1533. fprintf(f, "RAM_%u\t", src);
  1534. else if (src < nnumas + ncuda)
  1535. fprintf(f, "CUDA_%u\t", src-nnumas);
  1536. else if (src < nnumas + ncuda + nopencl)
  1537. fprintf(f, "OpenCL%u\t", src-nnumas-ncuda);
  1538. else if (src < nnumas + ncuda + nopencl + nmic)
  1539. fprintf(f, "MIC_%u\t", src-nnumas-ncuda-nopencl);
  1540. else
  1541. fprintf(f, "MPI_MS%u\t", src-nnumas-ncuda-nopencl-nmic);
  1542. for (dst = 0; dst < maxnode; dst++)
  1543. fprintf(f, "%.0f\t", bandwidth_matrix[src][dst]);
  1544. fprintf(f, "\n");
  1545. }
  1546. fprintf(f, "\n");
  1547. for (src = 0; src < maxnode; src++)
  1548. {
  1549. if (src < nnumas)
  1550. fprintf(f, "NUMA_%u\t", src);
  1551. else if (src < nnumas + ncuda)
  1552. fprintf(f, "CUDA_%u\t", src-nnumas);
  1553. else if (src < nnumas + ncuda + nopencl)
  1554. fprintf(f, "OpenCL%u\t", src-nnumas-ncuda);
  1555. else if (src < nnumas + ncuda + nopencl + nmic)
  1556. fprintf(f, "MIC_%u\t", src-nnumas-ncuda-nopencl);
  1557. else
  1558. fprintf(f, "MPI_MS%u\t", src-nnumas-ncuda-nopencl-nmic);
  1559. for (dst = 0; dst < maxnode; dst++)
  1560. fprintf(f, "%.0f\t", latency_matrix[src][dst]);
  1561. fprintf(f, "\n");
  1562. }
  1563. #if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
  1564. if (ncuda != 0 || nopencl != 0)
  1565. fprintf(f, "\nGPU\tNUMA in preference order (logical index), host-to-device, device-to-host\n");
  1566. for (src = 0; src < ncuda + nopencl; src++)
  1567. {
  1568. struct dev_timing *timing;
  1569. struct _starpu_machine_config * config = _starpu_get_machine_config();
  1570. unsigned config_nnumas = _starpu_topology_get_nnumanodes(config);
  1571. unsigned numa;
  1572. #ifdef STARPU_USE_CUDA
  1573. if (src < ncuda)
  1574. {
  1575. fprintf(f, "CUDA_%u\t", src);
  1576. for (numa = 0; numa < config_nnumas; numa++)
  1577. {
  1578. timing = &cudadev_timing_per_numa[src*STARPU_MAXNUMANODES+numa];
  1579. if (timing->timing_htod)
  1580. fprintf(f, "%2d %.0f %.0f\t", timing->numa_id, 1/timing->timing_htod, 1/timing->timing_dtoh);
  1581. else
  1582. fprintf(f, "%2d\t", cuda_affinity_matrix[src][numa]);
  1583. }
  1584. }
  1585. #ifdef STARPU_USE_OPENCL
  1586. else
  1587. #endif
  1588. #endif
  1589. #ifdef STARPU_USE_OPENCL
  1590. {
  1591. fprintf(f, "OpenCL%u\t", src-ncuda);
  1592. for (numa = 0; numa < config_nnumas; numa++)
  1593. {
  1594. timing = &opencldev_timing_per_numa[(src-ncuda)*STARPU_MAXNUMANODES+numa];
  1595. if (timing->timing_htod)
  1596. fprintf(f, "%2d %.0f %.0f\t", timing->numa_id, 1/timing->timing_htod, 1/timing->timing_dtoh);
  1597. else
  1598. fprintf(f, "%2d\t", opencl_affinity_matrix[src][numa]);
  1599. }
  1600. }
  1601. #endif
  1602. fprintf(f, "\n");
  1603. }
  1604. #endif
  1605. }
  1606. static void generate_bus_bandwidth_file(void)
  1607. {
  1608. if (!was_benchmarked)
  1609. benchmark_all_gpu_devices();
  1610. #ifdef STARPU_USE_MPI_MASTER_SLAVE
  1611. /* Slaves don't write files */
  1612. if (!_starpu_mpi_common_is_src_node())
  1613. return;
  1614. #endif
  1615. #ifndef STARPU_SIMGRID
  1616. write_bus_bandwidth_file_content();
  1617. #endif
  1618. }
  1619. static void load_bus_bandwidth_file(void)
  1620. {
  1621. int res;
  1622. char path[256];
  1623. get_bandwidth_path(path, sizeof(path));
  1624. res = access(path, F_OK);
  1625. if (res || !load_bus_bandwidth_file_content())
  1626. {
  1627. /* File does not exist yet or is bogus */
  1628. generate_bus_bandwidth_file();
  1629. }
  1630. }
  1631. #ifndef STARPU_SIMGRID
  1632. /*
  1633. * Config
  1634. */
  1635. static void get_config_path(char *path, size_t maxlen)
  1636. {
  1637. get_bus_path("config", path, maxlen);
  1638. }
  1639. #if defined(STARPU_USE_MPI_MASTER_SLAVE)
  1640. /* check if the master or one slave has to recalibrate */
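/* In effect this is a logical OR across the MPI processes: each one
* contributes its local flag through MPI_Allgather, and the result is 1 as
* soon as at least one process (master or slave) wants to recalibrate. */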
  1641. static int mpi_check_recalibrate(int my_recalibrate)
  1642. {
  1643. int nb_mpi = _starpu_mpi_src_get_device_count() + 1;
  1644. int mpi_recalibrate[nb_mpi];
  1645. int i;
  1646. MPI_Allgather(&my_recalibrate, 1, MPI_INT, mpi_recalibrate, 1, MPI_INT, MPI_COMM_WORLD);
  1647. for (i = 0; i < nb_mpi; i++)
  1648. {
  1649. if (mpi_recalibrate[i])
  1650. {
  1651. return 1;
  1652. }
  1653. }
  1654. return 0;
  1655. }
  1656. #endif
  1657. static void compare_value_and_recalibrate(char * msg, unsigned val_file, unsigned val_detected)
  1658. {
  1659. int recalibrate = 0;
  1660. if (val_file != val_detected)
  1661. recalibrate = 1;
  1662. #ifdef STARPU_USE_MPI_MASTER_SLAVE
// Exchange the flags across MPI processes: recalibrate if any process has a value that does not match its config file
  1664. recalibrate = mpi_check_recalibrate(recalibrate);
  1665. #endif
  1666. if (recalibrate)
  1667. {
  1668. #ifdef STARPU_USE_MPI_MASTER_SLAVE
  1669. /* Only the master prints the message */
  1670. if (_starpu_mpi_common_is_src_node())
  1671. #endif
  1672. _STARPU_DISP("Current configuration does not match the bus performance model (%s: (stored) %d != (current) %d), recalibrating...\n", msg, val_file, val_detected);
  1673. _starpu_bus_force_sampling();
  1674. #ifdef STARPU_USE_MPI_MASTER_SLAVE
  1675. if (_starpu_mpi_common_is_src_node())
  1676. #endif
  1677. _STARPU_DISP("... done\n");
  1678. }
  1679. }
  1680. static void check_bus_config_file(void)
  1681. {
  1682. int res;
  1683. char path[256];
  1684. struct _starpu_machine_config *config = _starpu_get_machine_config();
  1685. int recalibrate = 0;
  1686. get_config_path(path, sizeof(path));
  1687. res = access(path, F_OK);
  1688. if (res || config->conf.bus_calibrate > 0)
  1689. recalibrate = 1;
  1690. #if defined(STARPU_USE_MPI_MASTER_SLAVE)
// Exchange the flags across MPI processes: recalibrate if any process is missing the config file
  1692. recalibrate = mpi_check_recalibrate(recalibrate);
  1693. #endif
  1694. if (recalibrate)
  1695. {
  1696. if (res)
  1697. _STARPU_DISP("No performance model for the bus, calibrating...\n");
  1698. _starpu_bus_force_sampling();
  1699. if (res)
  1700. _STARPU_DISP("... done\n");
  1701. }
  1702. else
  1703. {
  1704. FILE *f;
  1705. int ret;
  1706. unsigned read_cuda = -1, read_opencl = -1, read_mic = -1, read_mpi_ms = -1;
  1707. unsigned read_cpus = -1, read_numa = -1;
  1708. int locked;
  1709. // Loading configuration from file
  1710. f = fopen(path, "r");
  1711. STARPU_ASSERT(f);
  1712. locked = _starpu_frdlock(f) == 0;
  1713. _starpu_drop_comments(f);
  1714. ret = fscanf(f, "%u\t", &read_cpus);
  1715. STARPU_ASSERT(ret == 1);
  1716. _starpu_drop_comments(f);
  1717. ret = fscanf(f, "%u\t", &read_numa);
  1718. STARPU_ASSERT(ret == 1);
  1719. _starpu_drop_comments(f);
  1720. ret = fscanf(f, "%u\t", &read_cuda);
  1721. STARPU_ASSERT(ret == 1);
  1722. _starpu_drop_comments(f);
  1723. ret = fscanf(f, "%u\t", &read_opencl);
  1724. STARPU_ASSERT(ret == 1);
  1725. _starpu_drop_comments(f);
  1726. ret = fscanf(f, "%u\t", &read_mic);
  1727. if (ret == 0)
  1728. read_mic = 0;
  1729. _starpu_drop_comments(f);
  1730. ret = fscanf(f, "%u\t", &read_mpi_ms);
  1731. if (ret == 0)
  1732. read_mpi_ms = 0;
  1733. _starpu_drop_comments(f);
  1734. if (locked)
  1735. _starpu_frdunlock(f);
  1736. fclose(f);
  1737. // Loading current configuration
  1738. ncpus = _starpu_topology_get_nhwcpu(config);
  1739. nnumas = _starpu_topology_get_nnumanodes(config);
  1740. #ifdef STARPU_USE_CUDA
  1741. ncuda = _starpu_get_cuda_device_count();
  1742. #endif
  1743. #ifdef STARPU_USE_OPENCL
  1744. nopencl = _starpu_opencl_get_device_count();
  1745. #endif
  1746. #ifdef STARPU_USE_MIC
  1747. nmic = _starpu_mic_src_get_device_count();
  1748. #endif /* STARPU_USE_MIC */
  1749. #ifdef STARPU_USE_MPI_MASTER_SLAVE
  1750. nmpi_ms = _starpu_mpi_src_get_device_count();
  1751. #endif /* STARPU_USE_MPI_MASTER_SLAVE */
  1752. // Checking if both configurations match
  1753. compare_value_and_recalibrate("CPUS", read_cpus, ncpus);
  1754. compare_value_and_recalibrate("NUMA", read_numa, nnumas);
  1755. compare_value_and_recalibrate("CUDA", read_cuda, ncuda);
  1756. compare_value_and_recalibrate("OpenCL", read_opencl, nopencl);
  1757. compare_value_and_recalibrate("MIC", read_mic, nmic);
  1758. compare_value_and_recalibrate("MPI Master-Slave", read_mpi_ms, nmpi_ms);
  1759. }
  1760. }
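/* The bus config file written below is a small plain-text summary of the
* detected topology. A sketch with hypothetical counts:
*
* # Current configuration
* 16 # Number of CPUs
* 2 # Number of NUMA nodes
* 2 # Number of CUDA devices
* 0 # Number of OpenCL devices
* 0 # Number of MIC devices
* 0 # Number of MPI devices
*
* check_bus_config_file() above re-reads these counts and triggers a
* recalibration whenever they no longer match the detected machine. */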
  1761. static void write_bus_config_file_content(void)
  1762. {
  1763. FILE *f;
  1764. char path[256];
  1765. int locked;
  1766. STARPU_ASSERT(was_benchmarked);
  1767. get_config_path(path, sizeof(path));
  1768. _STARPU_DEBUG("writing config to %s\n", path);
  1769. f = fopen(path, "w+");
  1770. STARPU_ASSERT(f);
  1771. locked = _starpu_fwrlock(f) == 0;
  1772. _starpu_fftruncate(f, 0);
  1773. fprintf(f, "# Current configuration\n");
  1774. fprintf(f, "%u # Number of CPUs\n", ncpus);
  1775. fprintf(f, "%u # Number of NUMA nodes\n", nnumas);
  1776. fprintf(f, "%u # Number of CUDA devices\n", ncuda);
  1777. fprintf(f, "%u # Number of OpenCL devices\n", nopencl);
  1778. fprintf(f, "%u # Number of MIC devices\n", nmic);
  1779. fprintf(f, "%u # Number of MPI devices\n", nmpi_ms);
  1780. if (locked)
  1781. _starpu_fwrunlock(f);
  1782. fclose(f);
  1783. }
  1784. static void generate_bus_config_file(void)
  1785. {
  1786. if (!was_benchmarked)
  1787. benchmark_all_gpu_devices();
  1788. #ifdef STARPU_USE_MPI_MASTER_SLAVE
  1789. /* Slaves don't write files */
  1790. if (!_starpu_mpi_common_is_src_node())
  1791. return;
  1792. #endif
  1793. write_bus_config_file_content();
  1794. }
  1795. #endif /* !SIMGRID */
  1796. void _starpu_simgrid_get_platform_path(int version, char *path, size_t maxlen)
  1797. {
  1798. if (version == 3)
  1799. get_bus_path("platform.xml", path, maxlen);
  1800. else
  1801. get_bus_path("platform.v4.xml", path, maxlen);
  1802. }
  1803. #ifndef STARPU_SIMGRID
  1804. /*
  1805. * Compute the precise PCI tree bandwidth and link shares
  1806. *
  1807. * We only have measurements from one leaf to another. We assume that the
  1808. * available bandwidth is greater at lower levels, and thus measurements from
  1809. * increasingly far GPUs provide the PCI bridges bandwidths at each level.
  1810. *
  1811. * The bandwidth of a PCI bridge is thus computed as the maximum of the speed
  1812. * of the various transfers that we have achieved through it. We thus browse
  1813. * the PCI tree three times:
  1814. *
  1815. * - first through all CUDA-CUDA possible transfers to compute the maximum
  1816. * measured bandwidth on each PCI link and hub used for that.
  1817. * - then through the whole tree to emit links for each PCI link and hub.
  1818. * - then through all CUDA-CUDA possible transfers again to emit routes.
  1819. */
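/* Illustrative example (hypothetical machine): if CUDA0-CUDA1 transfers were
* measured at 10 GB/s and CUDA0-CUDA2 at 6 GB/s, and both paths cross the same
* PCI bridge, that bridge is credited with max(10, 6) = 10 GB/s, the slower
* measurement being assumed to have been limited elsewhere along its path. */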
  1820. #if defined(STARPU_USE_CUDA) && defined(HAVE_DECL_HWLOC_CUDA_GET_DEVICE_OSDEV_BY_INDEX) && HAVE_DECL_HWLOC_CUDA_GET_DEVICE_OSDEV_BY_INDEX && defined(HAVE_CUDA_MEMCPY_PEER)
  1821. /* Records, for each PCI link and hub, the maximum bandwidth seen through it */
  1822. struct pci_userdata
  1823. {
  1824. /* Uplink max measurement */
  1825. double bw_up;
  1826. double bw_down;
  1827. /* Hub max measurement */
  1828. double bw;
  1829. };
  1830. /* Allocate a pci_userdata structure for the given object */
  1831. static void allocate_userdata(hwloc_obj_t obj)
  1832. {
  1833. struct pci_userdata *data;
  1834. if (obj->userdata)
  1835. return;
  1836. _STARPU_MALLOC(obj->userdata, sizeof(*data));
  1837. data = obj->userdata;
  1838. data->bw_up = 0.0;
  1839. data->bw_down = 0.0;
  1840. data->bw = 0.0;
  1841. }
  1842. /* Update the maximum bandwidth seen going to upstream */
  1843. static void update_bandwidth_up(hwloc_obj_t obj, double bandwidth)
  1844. {
  1845. struct pci_userdata *data;
  1846. if (obj->type != HWLOC_OBJ_BRIDGE && obj->type != HWLOC_OBJ_PCI_DEVICE)
  1847. return;
  1848. allocate_userdata(obj);
  1849. data = obj->userdata;
  1850. if (data->bw_up < bandwidth)
  1851. data->bw_up = bandwidth;
  1852. }
  1853. /* Update the maximum bandwidth seen going from upstream */
  1854. static void update_bandwidth_down(hwloc_obj_t obj, double bandwidth)
  1855. {
  1856. struct pci_userdata *data;
  1857. if (obj->type != HWLOC_OBJ_BRIDGE && obj->type != HWLOC_OBJ_PCI_DEVICE)
  1858. return;
  1859. allocate_userdata(obj);
  1860. data = obj->userdata;
  1861. if (data->bw_down < bandwidth)
  1862. data->bw_down = bandwidth;
  1863. }
  1864. /* Update the maximum bandwidth seen going through this Hub */
  1865. static void update_bandwidth_through(hwloc_obj_t obj, double bandwidth)
  1866. {
  1867. struct pci_userdata *data;
  1868. allocate_userdata(obj);
  1869. data = obj->userdata;
  1870. if (data->bw < bandwidth)
  1871. data->bw = bandwidth;
  1872. }
  1873. /* find_* functions perform the first step: computing maximum bandwidths */
/* Our traffic had to go through the host; go back from the target up to the host,
* updating the uplink downstream bandwidth along the way */
  1876. static void find_platform_backward_path(hwloc_obj_t obj, double bandwidth)
  1877. {
  1878. if (!obj)
  1879. /* Oops, we should have seen a host bridge. Well, too bad. */
  1880. return;
  1881. /* Update uplink bandwidth of PCI Hub */
  1882. update_bandwidth_down(obj, bandwidth);
  1883. /* Update internal bandwidth of PCI Hub */
  1884. update_bandwidth_through(obj, bandwidth);
  1885. if (obj->type == HWLOC_OBJ_BRIDGE && obj->attr->bridge.upstream_type == HWLOC_OBJ_BRIDGE_HOST)
  1886. /* Finished */
  1887. return;
  1888. /* Continue up */
  1889. find_platform_backward_path(obj->parent, bandwidth);
  1890. }
  1891. /* Same, but update uplink upstream bandwidth */
  1892. static void find_platform_forward_path(hwloc_obj_t obj, double bandwidth)
  1893. {
  1894. if (!obj)
  1895. /* Oops, we should have seen a host bridge. Well, too bad. */
  1896. return;
  1897. /* Update uplink bandwidth of PCI Hub */
  1898. update_bandwidth_up(obj, bandwidth);
  1899. /* Update internal bandwidth of PCI Hub */
  1900. update_bandwidth_through(obj, bandwidth);
  1901. if (obj->type == HWLOC_OBJ_BRIDGE && obj->attr->bridge.upstream_type == HWLOC_OBJ_BRIDGE_HOST)
  1902. /* Finished */
  1903. return;
  1904. /* Continue up */
  1905. find_platform_forward_path(obj->parent, bandwidth);
  1906. }
  1907. /* Find the path from obj1 through parent down to obj2 (without ever going up),
  1908. * and update the maximum bandwidth along the path */
  1909. static int find_platform_path_down(hwloc_obj_t parent, hwloc_obj_t obj1, hwloc_obj_t obj2, double bandwidth)
  1910. {
  1911. unsigned i;
  1912. /* Base case, path is empty */
  1913. if (parent == obj2)
  1914. return 1;
  1915. /* Try to go down from parent */
  1916. for (i = 0; i < parent->arity; i++)
  1917. if (parent->children[i] != obj1 && find_platform_path_down(parent->children[i], NULL, obj2, bandwidth))
  1918. {
  1919. /* Found it down there, update bandwidth of parent */
  1920. update_bandwidth_down(parent->children[i], bandwidth);
  1921. update_bandwidth_through(parent, bandwidth);
  1922. return 1;
  1923. }
  1924. return 0;
  1925. }
  1926. /* Find the path from obj1 to obj2, and update the maximum bandwidth along the
  1927. * path */
  1928. static int find_platform_path_up(hwloc_obj_t obj1, hwloc_obj_t obj2, double bandwidth)
  1929. {
  1930. int ret;
  1931. hwloc_obj_t parent = obj1->parent;
  1932. if (!parent)
  1933. {
  1934. /* Oops, we should have seen a host bridge. Act as if we had seen it. */
  1935. find_platform_backward_path(obj2, bandwidth);
  1936. return 1;
  1937. }
  1938. if (find_platform_path_down(parent, obj1, obj2, bandwidth))
  1939. /* obj2 was a mere (sub)child of our parent */
  1940. return 1;
  1941. /* obj2 is not a (sub)child of our parent, we have to go up through the parent */
  1942. if (parent->type == HWLOC_OBJ_BRIDGE && parent->attr->bridge.upstream_type == HWLOC_OBJ_BRIDGE_HOST)
  1943. {
/* We have to go up to the Host, so obj2 is not in the same PCI
* tree: we are done with the path from obj1 to the Host, and just
* find the path from obj2 to the Host too.
*/
  1948. find_platform_backward_path(obj2, bandwidth);
  1949. update_bandwidth_up(parent, bandwidth);
  1950. update_bandwidth_through(parent, bandwidth);
  1951. return 1;
  1952. }
  1953. /* Not at host yet, just go up */
  1954. ret = find_platform_path_up(parent, obj2, bandwidth);
  1955. update_bandwidth_up(parent, bandwidth);
  1956. update_bandwidth_through(parent, bandwidth);
  1957. return ret;
  1958. }
  1959. /* find the path between cuda i and cuda j, and update the maximum bandwidth along the path */
  1960. static int find_platform_cuda_path(hwloc_topology_t topology, unsigned i, unsigned j, double bandwidth)
  1961. {
  1962. hwloc_obj_t cudai, cudaj;
  1963. cudai = hwloc_cuda_get_device_osdev_by_index(topology, i);
  1964. cudaj = hwloc_cuda_get_device_osdev_by_index(topology, j);
  1965. if (!cudai || !cudaj)
  1966. return 0;
  1967. return find_platform_path_up(cudai, cudaj, bandwidth);
  1968. }
  1969. /* emit_topology_bandwidths performs the second step: emitting link names */
  1970. /* Emit the link name of the object */
  1971. static void emit_pci_hub(FILE *f, hwloc_obj_t obj)
  1972. {
  1973. STARPU_ASSERT(obj->type == HWLOC_OBJ_BRIDGE);
  1974. fprintf(f, "PCI:%04x:[%02x-%02x]", obj->attr->bridge.downstream.pci.domain, obj->attr->bridge.downstream.pci.secondary_bus, obj->attr->bridge.downstream.pci.subordinate_bus);
  1975. }
  1976. static void emit_pci_dev(FILE *f, struct hwloc_pcidev_attr_s *pcidev)
  1977. {
  1978. fprintf(f, "PCI:%04x:%02x:%02x.%1x", pcidev->domain, pcidev->bus, pcidev->dev, pcidev->func);
  1979. }
  1980. /* Emit the links of the object */
  1981. static void emit_topology_bandwidths(FILE *f, hwloc_obj_t obj, const char *Bps, const char *s)
  1982. {
  1983. unsigned i;
  1984. if (obj->userdata)
  1985. {
  1986. struct pci_userdata *data = obj->userdata;
  1987. if (obj->type == HWLOC_OBJ_BRIDGE)
  1988. {
  1989. /* Uplink */
  1990. fprintf(f, " <link id=\"");
  1991. emit_pci_hub(f, obj);
  1992. fprintf(f, " up\" bandwidth=\"%f%s\" latency=\"0.000000%s\"/>\n", data->bw_up, Bps, s);
  1993. fprintf(f, " <link id=\"");
  1994. emit_pci_hub(f, obj);
  1995. fprintf(f, " down\" bandwidth=\"%f%s\" latency=\"0.000000%s\"/>\n", data->bw_down, Bps, s);
  1996. /* PCI Switches are assumed to have infinite internal bandwidth */
  1997. if (!obj->name || !strstr(obj->name, "Switch"))
  1998. {
  1999. /* We assume that PCI Hubs have double bandwidth in
  2000. * order to support full duplex but not more */
  2001. fprintf(f, " <link id=\"");
  2002. emit_pci_hub(f, obj);
  2003. fprintf(f, " through\" bandwidth=\"%f%s\" latency=\"0.000000%s\"/>\n", data->bw * 2, Bps, s);
  2004. }
  2005. }
  2006. else if (obj->type == HWLOC_OBJ_PCI_DEVICE)
  2007. {
  2008. fprintf(f, " <link id=\"");
  2009. emit_pci_dev(f, &obj->attr->pcidev);
  2010. fprintf(f, " up\" bandwidth=\"%f%s\" latency=\"0.000000%s\"/>\n", data->bw_up, Bps, s);
  2011. fprintf(f, " <link id=\"");
  2012. emit_pci_dev(f, &obj->attr->pcidev);
  2013. fprintf(f, " down\" bandwidth=\"%f%s\" latency=\"0.000000%s\"/>\n", data->bw_down, Bps, s);
  2014. }
  2015. }
  2016. for (i = 0; i < obj->arity; i++)
  2017. emit_topology_bandwidths(f, obj->children[i], Bps, s);
  2018. }
  2019. /* emit_pci_link_* functions perform the third step: emitting the routes */
  2020. static void emit_pci_link(FILE *f, hwloc_obj_t obj, const char *suffix)
  2021. {
  2022. if (obj->type == HWLOC_OBJ_BRIDGE)
  2023. {
  2024. fprintf(f, " <link_ctn id=\"");
  2025. emit_pci_hub(f, obj);
  2026. fprintf(f, " %s\"/>\n", suffix);
  2027. }
  2028. else if (obj->type == HWLOC_OBJ_PCI_DEVICE)
  2029. {
  2030. fprintf(f, " <link_ctn id=\"");
  2031. emit_pci_dev(f, &obj->attr->pcidev);
  2032. fprintf(f, " %s\"/>\n", suffix);
  2033. }
  2034. }
  2035. /* Go to upstream */
  2036. static void emit_pci_link_up(FILE *f, hwloc_obj_t obj)
  2037. {
  2038. emit_pci_link(f, obj, "up");
  2039. }
  2040. /* Go from upstream */
  2041. static void emit_pci_link_down(FILE *f, hwloc_obj_t obj)
  2042. {
  2043. emit_pci_link(f, obj, "down");
  2044. }
  2045. /* Go through PCI hub */
  2046. static void emit_pci_link_through(FILE *f, hwloc_obj_t obj)
  2047. {
/* We don't care about traffic going through PCI switches */
  2049. if (obj->type == HWLOC_OBJ_BRIDGE)
  2050. {
  2051. if (!obj->name || !strstr(obj->name, "Switch"))
  2052. emit_pci_link(f, obj, "through");
  2053. else
  2054. {
  2055. fprintf(f, " <!-- Switch ");
  2056. emit_pci_hub(f, obj);
  2057. fprintf(f, " through -->\n");
  2058. }
  2059. }
  2060. }
/* Our traffic has to go through the host; go back from the target up to the host,
* using the uplink downstream link along the way */
  2063. static void emit_platform_backward_path(FILE *f, hwloc_obj_t obj)
  2064. {
  2065. if (!obj)
  2066. /* Oops, we should have seen a host bridge. Well, too bad. */
  2067. return;
  2068. /* Go through PCI Hub */
  2069. emit_pci_link_through(f, obj);
  2070. /* Go through uplink */
  2071. emit_pci_link_down(f, obj);
  2072. if (obj->type == HWLOC_OBJ_BRIDGE && obj->attr->bridge.upstream_type == HWLOC_OBJ_BRIDGE_HOST)
  2073. {
  2074. /* Finished, go through host */
  2075. fprintf(f, " <link_ctn id=\"Host\"/>\n");
  2076. return;
  2077. }
  2078. /* Continue up */
  2079. emit_platform_backward_path(f, obj->parent);
  2080. }
  2081. /* Same, but use upstream link */
  2082. static void emit_platform_forward_path(FILE *f, hwloc_obj_t obj)
  2083. {
  2084. if (!obj)
  2085. /* Oops, we should have seen a host bridge. Well, too bad. */
  2086. return;
  2087. /* Go through PCI Hub */
  2088. emit_pci_link_through(f, obj);
  2089. /* Go through uplink */
  2090. emit_pci_link_up(f, obj);
  2091. if (obj->type == HWLOC_OBJ_BRIDGE && obj->attr->bridge.upstream_type == HWLOC_OBJ_BRIDGE_HOST)
  2092. {
  2093. /* Finished, go through host */
  2094. fprintf(f, " <link_ctn id=\"Host\"/>\n");
  2095. return;
  2096. }
  2097. /* Continue up */
  2098. emit_platform_forward_path(f, obj->parent);
  2099. }
  2100. /* Find the path from obj1 through parent down to obj2 (without ever going up),
  2101. * and use the links along the path */
  2102. static int emit_platform_path_down(FILE *f, hwloc_obj_t parent, hwloc_obj_t obj1, hwloc_obj_t obj2)
  2103. {
  2104. unsigned i;
  2105. /* Base case, path is empty */
  2106. if (parent == obj2)
  2107. return 1;
  2108. /* Try to go down from parent */
  2109. for (i = 0; i < parent->arity; i++)
  2110. if (parent->children[i] != obj1 && emit_platform_path_down(f, parent->children[i], NULL, obj2))
  2111. {
  2112. /* Found it down there, path goes through this hub */
  2113. emit_pci_link_down(f, parent->children[i]);
  2114. emit_pci_link_through(f, parent);
  2115. return 1;
  2116. }
  2117. return 0;
  2118. }
  2119. /* Find the path from obj1 to obj2, and use the links along the path */
  2120. static int emit_platform_path_up(FILE *f, hwloc_obj_t obj1, hwloc_obj_t obj2)
  2121. {
  2122. int ret;
  2123. hwloc_obj_t parent = obj1->parent;
  2124. if (!parent)
  2125. {
  2126. /* Oops, we should have seen a host bridge. Act as if we had seen it. */
  2127. emit_platform_backward_path(f, obj2);
  2128. return 1;
  2129. }
  2130. if (emit_platform_path_down(f, parent, obj1, obj2))
  2131. /* obj2 was a mere (sub)child of our parent */
  2132. return 1;
  2133. /* obj2 is not a (sub)child of our parent, we have to go up through the parent */
  2134. if (parent->type == HWLOC_OBJ_BRIDGE && parent->attr->bridge.upstream_type == HWLOC_OBJ_BRIDGE_HOST)
  2135. {
/* We have to go up to the Host, so obj2 is not in the same PCI
* tree: we are done with the path from obj1 to the Host, and just
* find the path from obj2 to the Host too.
*/
  2140. emit_platform_backward_path(f, obj2);
  2141. fprintf(f, " <link_ctn id=\"Host\"/>\n");
  2142. emit_pci_link_up(f, parent);
  2143. emit_pci_link_through(f, parent);
  2144. return 1;
  2145. }
  2146. /* Not at host yet, just go up */
  2147. ret = emit_platform_path_up(f, parent, obj2);
  2148. emit_pci_link_up(f, parent);
  2149. emit_pci_link_through(f, parent);
  2150. return ret;
  2151. }
  2152. /* Clean our mess in the topology before destroying it */
  2153. static void clean_topology(hwloc_obj_t obj)
  2154. {
  2155. unsigned i;
  2156. if (obj->userdata)
  2157. free(obj->userdata);
  2158. for (i = 0; i < obj->arity; i++)
  2159. clean_topology(obj->children[i]);
  2160. }
  2161. #endif
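/* The platform file emitted below is SimGrid XML. A hypothetical excerpt for
* one CUDA device routed through the host in the flat (no hwloc) case, with
* made-up figures and the version-4 units:
*
* <link id="RAM-CUDA0" bandwidth="5000000000.000000Bps" latency="0.000010s"/>
* <link id="CUDA0-RAM" bandwidth="5500000000.000000Bps" latency="0.000010s"/>
* <route src="RAM" dst="CUDA0" symmetrical="NO"><link_ctn id="RAM-CUDA0"/><link_ctn id="Host"/></route>
*/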
  2162. static void write_bus_platform_file_content(int version)
  2163. {
  2164. FILE *f;
  2165. char path[256];
  2166. unsigned i;
  2167. const char *speed, *flops, *Bps, *s;
  2168. char dash;
  2169. int locked;
  2170. if (version == 3)
  2171. {
  2172. speed = "power";
  2173. flops = "";
  2174. Bps = "";
  2175. s = "";
  2176. dash = '_';
  2177. }
  2178. else
  2179. {
  2180. speed = "speed";
  2181. flops = "f";
  2182. Bps = "Bps";
  2183. s = "s";
  2184. dash = '-';
  2185. }
  2186. STARPU_ASSERT(was_benchmarked);
  2187. _starpu_simgrid_get_platform_path(version, path, sizeof(path));
  2188. _STARPU_DEBUG("writing platform to %s\n", path);
  2189. f = fopen(path, "w+");
  2190. if (!f)
  2191. {
  2192. perror("fopen write_bus_platform_file_content");
  2193. _STARPU_DISP("path '%s'\n", path);
  2194. fflush(stderr);
  2195. STARPU_ABORT();
  2196. }
  2197. locked = _starpu_fwrlock(f) == 0;
  2198. _starpu_fftruncate(f, 0);
  2199. fprintf(f,
  2200. "<?xml version='1.0'?>\n"
  2201. "<!DOCTYPE platform SYSTEM '%s'>\n"
  2202. " <platform version=\"%d\">\n"
  2203. " <config id=\"General\">\n"
  2204. " <prop id=\"network/TCP%cgamma\" value=\"-1\"></prop>\n"
  2205. " <prop id=\"network/latency%cfactor\" value=\"1\"></prop>\n"
  2206. " <prop id=\"network/bandwidth%cfactor\" value=\"1\"></prop>\n"
  2207. " </config>\n"
  2208. " <AS id=\"AS0\" routing=\"Full\">\n"
  2209. " <host id=\"MAIN\" %s=\"1%s\"/>\n",
  2210. version == 3
  2211. ? "http://simgrid.gforge.inria.fr/simgrid.dtd"
  2212. : "http://simgrid.gforge.inria.fr/simgrid/simgrid.dtd",
  2213. version, dash, dash, dash, speed, flops);
  2214. for (i = 0; i < ncpus; i++)
  2215. /* TODO: host memory for out-of-core simulation */
  2216. fprintf(f, " <host id=\"CPU%u\" %s=\"2000000000%s\"/>\n", i, speed, flops);
  2217. for (i = 0; i < ncuda; i++)
  2218. {
  2219. fprintf(f, " <host id=\"CUDA%u\" %s=\"2000000000%s\">\n", i, speed, flops);
  2220. fprintf(f, " <prop id=\"memsize\" value=\"%llu\"/>\n", (unsigned long long) cuda_size[i]);
  2221. #ifdef HAVE_CUDA_MEMCPY_PEER
  2222. fprintf(f, " <prop id=\"memcpy_peer\" value=\"1\"/>\n");
  2223. #endif
  2224. /* TODO: record cudadev_direct instead of assuming it's NUMA nodes */
  2225. fprintf(f, " </host>\n");
  2226. }
  2227. for (i = 0; i < nopencl; i++)
  2228. {
  2229. fprintf(f, " <host id=\"OpenCL%u\" %s=\"2000000000%s\">\n", i, speed, flops);
  2230. fprintf(f, " <prop id=\"memsize\" value=\"%llu\"/>\n", (unsigned long long) opencl_size[i]);
  2231. fprintf(f, " </host>\n");
  2232. }
  2233. fprintf(f, "\n <host id=\"RAM\" %s=\"1%s\"/>\n", speed, flops);
  2234. /*
  2235. * Compute maximum bandwidth, taken as host bandwidth
  2236. */
  2237. double max_bandwidth = 0;
  2238. #if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
  2239. unsigned numa;
  2240. #endif
  2241. #ifdef STARPU_USE_CUDA
  2242. for (i = 0; i < ncuda; i++)
  2243. {
  2244. for (numa = 0; numa < nnumas; numa++)
  2245. {
  2246. double down_bw = 1.0 / cudadev_timing_per_numa[i*STARPU_MAXNUMANODES+numa].timing_dtoh;
  2247. double up_bw = 1.0 / cudadev_timing_per_numa[i*STARPU_MAXNUMANODES+numa].timing_htod;
  2248. if (max_bandwidth < down_bw)
  2249. max_bandwidth = down_bw;
  2250. if (max_bandwidth < up_bw)
  2251. max_bandwidth = up_bw;
  2252. }
  2253. }
  2254. #endif
  2255. #ifdef STARPU_USE_OPENCL
  2256. for (i = 0; i < nopencl; i++)
  2257. {
  2258. for (numa = 0; numa < nnumas; numa++)
  2259. {
  2260. double down_bw = 1.0 / opencldev_timing_per_numa[i*STARPU_MAXNUMANODES+numa].timing_dtoh;
  2261. double up_bw = 1.0 / opencldev_timing_per_numa[i*STARPU_MAXNUMANODES+numa].timing_htod;
  2262. if (max_bandwidth < down_bw)
  2263. max_bandwidth = down_bw;
  2264. if (max_bandwidth < up_bw)
  2265. max_bandwidth = up_bw;
  2266. }
  2267. }
  2268. #endif
  2269. fprintf(f, "\n <link id=\"Host\" bandwidth=\"%f%s\" latency=\"0.000000%s\"/>\n\n", max_bandwidth*1000000, Bps, s);
  2270. /*
  2271. * OpenCL links
  2272. */
  2273. #ifdef STARPU_USE_OPENCL
  2274. for (i = 0; i < nopencl; i++)
  2275. {
  2276. char i_name[16];
  2277. snprintf(i_name, sizeof(i_name), "OpenCL%u", i);
  2278. fprintf(f, " <link id=\"RAM-%s\" bandwidth=\"%f%s\" latency=\"%f%s\"/>\n",
  2279. i_name,
  2280. 1000000 / search_bus_best_timing(i, "OpenCL", 1), Bps,
  2281. search_bus_best_latency(i, "OpenCL", 1)/1000000., s);
  2282. fprintf(f, " <link id=\"%s-RAM\" bandwidth=\"%f%s\" latency=\"%f%s\"/>\n",
  2283. i_name,
  2284. 1000000 / search_bus_best_timing(i, "OpenCL", 0), Bps,
  2285. search_bus_best_latency(i, "OpenCL", 0)/1000000., s);
  2286. }
  2287. fprintf(f, "\n");
  2288. #endif
  2289. /*
  2290. * CUDA links and routes
  2291. */
  2292. #ifdef STARPU_USE_CUDA
  2293. /* Write RAM/CUDA bandwidths and latencies */
  2294. for (i = 0; i < ncuda; i++)
  2295. {
  2296. char i_name[16];
  2297. snprintf(i_name, sizeof(i_name), "CUDA%u", i);
  2298. fprintf(f, " <link id=\"RAM-%s\" bandwidth=\"%f%s\" latency=\"%f%s\"/>\n",
  2299. i_name,
  2300. 1000000. / search_bus_best_timing(i, "CUDA", 1), Bps,
  2301. search_bus_best_latency(i, "CUDA", 1)/1000000., s);
  2302. fprintf(f, " <link id=\"%s-RAM\" bandwidth=\"%f%s\" latency=\"%f%s\"/>\n",
  2303. i_name,
  2304. 1000000. / search_bus_best_timing(i, "CUDA", 0), Bps,
  2305. search_bus_best_latency(i, "CUDA", 0)/1000000., s);
  2306. }
  2307. fprintf(f, "\n");
  2308. #ifdef HAVE_CUDA_MEMCPY_PEER
  2309. /* Write CUDA/CUDA bandwidths and latencies */
  2310. for (i = 0; i < ncuda; i++)
  2311. {
  2312. unsigned j;
  2313. char i_name[16];
  2314. snprintf(i_name, sizeof(i_name), "CUDA%u", i);
  2315. for (j = 0; j < ncuda; j++)
  2316. {
  2317. char j_name[16];
  2318. if (j == i)
  2319. continue;
  2320. snprintf(j_name, sizeof(j_name), "CUDA%u", j);
  2321. fprintf(f, " <link id=\"%s-%s\" bandwidth=\"%f%s\" latency=\"%f%s\"/>\n",
  2322. i_name, j_name,
  2323. 1000000. / cudadev_timing_dtod[i][j], Bps,
  2324. cudadev_latency_dtod[i][j]/1000000., s);
  2325. }
  2326. }
  2327. #endif
  2328. #if defined(HAVE_DECL_HWLOC_CUDA_GET_DEVICE_OSDEV_BY_INDEX) && HAVE_DECL_HWLOC_CUDA_GET_DEVICE_OSDEV_BY_INDEX && defined(HAVE_CUDA_MEMCPY_PEER)
  2329. /* If we have enough hwloc information, write PCI bandwidths and routes */
  2330. if (!starpu_get_env_number_default("STARPU_PCI_FLAT", 0))
  2331. {
  2332. hwloc_topology_t topology;
  2333. hwloc_topology_init(&topology);
  2334. _starpu_topology_filter(topology);
  2335. hwloc_topology_load(topology);
  2336. /* First find paths and record measured bandwidth along the path */
  2337. for (i = 0; i < ncuda; i++)
  2338. {
  2339. unsigned j;
  2340. for (j = 0; j < ncuda; j++)
  2341. if (i != j)
  2342. if (!find_platform_cuda_path(topology, i, j, 1000000. / cudadev_timing_dtod[i][j]))
  2343. {
  2344. clean_topology(hwloc_get_root_obj(topology));
  2345. hwloc_topology_destroy(topology);
  2346. goto flat_cuda;
  2347. }
  2348. /* Record RAM/CUDA bandwidths */
  2349. find_platform_forward_path(hwloc_cuda_get_device_osdev_by_index(topology, i), 1000000. / search_bus_best_timing(i, "CUDA", 0));
  2350. find_platform_backward_path(hwloc_cuda_get_device_osdev_by_index(topology, i), 1000000. / search_bus_best_timing(i, "CUDA", 1));
  2351. }
  2352. /* Ok, found path in all cases, can emit advanced platform routes */
  2353. fprintf(f, "\n");
  2354. emit_topology_bandwidths(f, hwloc_get_root_obj(topology), Bps, s);
  2355. fprintf(f, "\n");
  2356. for (i = 0; i < ncuda; i++)
  2357. {
  2358. unsigned j;
  2359. for (j = 0; j < ncuda; j++)
  2360. if (i != j)
  2361. {
  2362. fprintf(f, " <route src=\"CUDA%u\" dst=\"CUDA%u\" symmetrical=\"NO\">\n", i, j);
  2363. fprintf(f, " <link_ctn id=\"CUDA%u-CUDA%u\"/>\n", i, j);
  2364. emit_platform_path_up(f,
  2365. hwloc_cuda_get_device_osdev_by_index(topology, i),
  2366. hwloc_cuda_get_device_osdev_by_index(topology, j));
  2367. fprintf(f, " </route>\n");
  2368. }
  2369. fprintf(f, " <route src=\"CUDA%u\" dst=\"RAM\" symmetrical=\"NO\">\n", i);
  2370. fprintf(f, " <link_ctn id=\"CUDA%u-RAM\"/>\n", i);
  2371. emit_platform_forward_path(f, hwloc_cuda_get_device_osdev_by_index(topology, i));
  2372. fprintf(f, " </route>\n");
  2373. fprintf(f, " <route src=\"RAM\" dst=\"CUDA%u\" symmetrical=\"NO\">\n", i);
  2374. fprintf(f, " <link_ctn id=\"RAM-CUDA%u\"/>\n", i);
  2375. emit_platform_backward_path(f, hwloc_cuda_get_device_osdev_by_index(topology, i));
  2376. fprintf(f, " </route>\n");
  2377. }
  2378. clean_topology(hwloc_get_root_obj(topology));
  2379. hwloc_topology_destroy(topology);
  2380. }
  2381. else
  2382. {
  2383. flat_cuda:
  2384. #else
  2385. {
  2386. #endif
  2387. /* If we don't have enough hwloc information, write trivial routes always through host */
  2388. for (i = 0; i < ncuda; i++)
  2389. {
  2390. char i_name[16];
  2391. snprintf(i_name, sizeof(i_name), "CUDA%u", i);
  2392. fprintf(f, " <route src=\"RAM\" dst=\"%s\" symmetrical=\"NO\"><link_ctn id=\"RAM-%s\"/><link_ctn id=\"Host\"/></route>\n", i_name, i_name);
  2393. fprintf(f, " <route src=\"%s\" dst=\"RAM\" symmetrical=\"NO\"><link_ctn id=\"%s-RAM\"/><link_ctn id=\"Host\"/></route>\n", i_name, i_name);
  2394. }
  2395. #ifdef HAVE_CUDA_MEMCPY_PEER
  2396. for (i = 0; i < ncuda; i++)
  2397. {
  2398. unsigned j;
  2399. char i_name[16];
  2400. snprintf(i_name, sizeof(i_name), "CUDA%u", i);
  2401. for (j = 0; j < ncuda; j++)
  2402. {
  2403. char j_name[16];
  2404. if (j == i)
  2405. continue;
  2406. snprintf(j_name, sizeof(j_name), "CUDA%u", j);
  2407. fprintf(f, " <route src=\"%s\" dst=\"%s\" symmetrical=\"NO\"><link_ctn id=\"%s-%s\"/><link_ctn id=\"Host\"/></route>\n", i_name, j_name, i_name, j_name);
  2408. }
  2409. }
  2410. #endif
  2411. } /* defined(STARPU_HAVE_HWLOC) && defined(HAVE_CUDA_MEMCPY_PEER) */
  2412. fprintf(f, "\n");
  2413. #endif /* STARPU_USE_CUDA */
  2414. /*
  2415. * OpenCL routes
  2416. */
  2417. #ifdef STARPU_USE_OPENCL
  2418. for (i = 0; i < nopencl; i++)
  2419. {
  2420. char i_name[16];
  2421. snprintf(i_name, sizeof(i_name), "OpenCL%u", i);
  2422. fprintf(f, " <route src=\"RAM\" dst=\"%s\" symmetrical=\"NO\"><link_ctn id=\"RAM-%s\"/><link_ctn id=\"Host\"/></route>\n", i_name, i_name);
  2423. fprintf(f, " <route src=\"%s\" dst=\"RAM\" symmetrical=\"NO\"><link_ctn id=\"%s-RAM\"/><link_ctn id=\"Host\"/></route>\n", i_name, i_name);
  2424. }
  2425. #endif
  2426. fprintf(f,
  2427. " </AS>\n"
  2428. " </platform>\n"
  2429. );
  2430. if (locked)
  2431. _starpu_fwrunlock(f);
  2432. fclose(f);
  2433. }
  2434. static void generate_bus_platform_file(void)
  2435. {
  2436. if (!was_benchmarked)
  2437. benchmark_all_gpu_devices();
  2438. #ifdef STARPU_USE_MPI_MASTER_SLAVE
  2439. /* Slaves don't write files */
  2440. if (!_starpu_mpi_common_is_src_node())
  2441. return;
  2442. #endif
  2443. write_bus_platform_file_content(3);
  2444. write_bus_platform_file_content(4);
  2445. }
  2446. static void check_bus_platform_file(void)
  2447. {
  2448. int res;
  2449. char path[256];
  2450. _starpu_simgrid_get_platform_path(4, path, sizeof(path));
  2451. res = access(path, F_OK);
  2452. if (!res)
  2453. {
  2454. _starpu_simgrid_get_platform_path(3, path, sizeof(path));
  2455. res = access(path, F_OK);
  2456. }
  2457. if (res)
  2458. {
  2459. /* File does not exist yet */
  2460. generate_bus_platform_file();
  2461. }
  2462. }
  2463. /*
  2464. * Generic
  2465. */
  2466. static void _starpu_bus_force_sampling(void)
  2467. {
  2468. _STARPU_DEBUG("Force bus sampling ...\n");
  2469. _starpu_create_sampling_directory_if_needed();
  2470. generate_bus_affinity_file();
  2471. generate_bus_latency_file();
  2472. generate_bus_bandwidth_file();
  2473. generate_bus_config_file();
  2474. generate_bus_platform_file();
  2475. }
  2476. #endif /* !SIMGRID */
  2477. void _starpu_load_bus_performance_files(void)
  2478. {
  2479. _starpu_create_sampling_directory_if_needed();
  2480. struct _starpu_machine_config * config = _starpu_get_machine_config();
  2481. nnumas = _starpu_topology_get_nnumanodes(config);
  2482. #if defined(STARPU_USE_CUDA) || defined(STARPU_USE_SIMGRID)
  2483. ncuda = _starpu_get_cuda_device_count();
  2484. #endif
  2485. #if defined(STARPU_USE_OPENCL) || defined(STARPU_USE_SIMGRID)
  2486. nopencl = _starpu_opencl_get_device_count();
  2487. #endif
  2488. #if defined(STARPU_USE_MPI_MASTER_SLAVE) || defined(STARPU_USE_SIMGRID)
  2489. nmpi_ms = _starpu_mpi_src_get_device_count();
  2490. #endif
  2491. #if defined(STARPU_USE_MIC) || defined(STARPU_USE_SIMGRID)
  2492. nmic = _starpu_mic_src_get_device_count();
  2493. #endif
  2494. #ifndef STARPU_SIMGRID
  2495. check_bus_config_file();
  2496. #endif
  2497. #ifdef STARPU_USE_MPI_MASTER_SLAVE
/* make sure that the master wrote the perf files */
  2499. _starpu_mpi_common_barrier();
  2500. #endif
  2501. #ifndef STARPU_SIMGRID
  2502. load_bus_affinity_file();
  2503. #endif
  2504. load_bus_latency_file();
  2505. load_bus_bandwidth_file();
  2506. #ifndef STARPU_SIMGRID
  2507. check_bus_platform_file();
  2508. #endif
  2509. }
  2510. /* (in MB/s) */
  2511. double starpu_transfer_bandwidth(unsigned src_node, unsigned dst_node)
  2512. {
  2513. return bandwidth_matrix[src_node][dst_node];
  2514. }
  2515. /* (in µs) */
  2516. double starpu_transfer_latency(unsigned src_node, unsigned dst_node)
  2517. {
  2518. return latency_matrix[src_node][dst_node];
  2519. }
  2520. /* (in µs) */
  2521. double starpu_transfer_predict(unsigned src_node, unsigned dst_node, size_t size)
  2522. {
  2523. double bandwidth = bandwidth_matrix[src_node][dst_node];
  2524. double latency = latency_matrix[src_node][dst_node];
  2525. struct _starpu_machine_topology *topology = &_starpu_get_machine_config()->topology;
  2526. #if 0
  2527. int busid = starpu_bus_get_id(src_node, dst_node);
  2528. int direct = starpu_bus_get_direct(busid);
  2529. #endif
  2530. float ngpus = topology->ncudagpus+topology->nopenclgpus;
  2531. #if 0
  2532. /* Ideally we should take into account that some GPUs are directly
* connected through a PCI switch, which has less contention than the
  2534. * Host bridge, but doing that seems to *decrease* performance... */
  2535. if (direct)
  2536. {
  2537. float neighbours = starpu_bus_get_ngpus(busid);
  2538. /* Count transfers of these GPUs, and count transfers between
  2539. * other GPUs and these GPUs */
  2540. ngpus = neighbours + (ngpus - neighbours) * neighbours / ngpus;
  2541. }
  2542. #endif
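/* Worked example with made-up numbers: latency 10 us, bandwidth 8000 MB/s,
* size 4 MiB (4194304 bytes) and ngpus = 2 give
* 10 + (4194304 / 8000) * 2 * 2 ~= 2107 us; the 2*ngpus factor is a
* pessimistic allowance for the bus being shared with the other GPUs. */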
  2543. return latency + (size/bandwidth)*2*ngpus;
  2544. }
/* Compute and save the disk bandwidth and latency */
/* bandwidth in MB/s - latency in µs */
  2547. void _starpu_save_bandwidth_and_latency_disk(double bandwidth_write, double bandwidth_read, double latency_write, double latency_read, unsigned node)
  2548. {
  2549. unsigned int i, j;
  2550. double slowness_disk_between_main_ram, slowness_main_ram_between_node;
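/* The disk is only benchmarked against the main RAM node, so a transfer
* between the disk and any other node is modelled as disk<->RAM followed by
* RAM<->node: the two slownesses (inverse bandwidths) are added, then inverted
* again. Hypothetical figures: a 500 MB/s disk read combined with an
* 8000 MB/s RAM->GPU link yields 1 / (1/500 + 1/8000) ~= 470 MB/s. */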
/* save bandwidth */
  2552. for(i = 0; i < STARPU_MAXNODES; ++i)
  2553. {
  2554. for(j = 0; j < STARPU_MAXNODES; ++j)
  2555. {
  2556. if (i == j && j == node) /* source == destination == node */
  2557. {
  2558. bandwidth_matrix[i][j] = 0;
  2559. }
  2560. else if (i == node) /* source == disk */
  2561. {
/* convert to slowness */
  2563. if(bandwidth_read != 0)
  2564. slowness_disk_between_main_ram = 1/bandwidth_read;
  2565. else
  2566. slowness_disk_between_main_ram = 0;
  2567. if(bandwidth_matrix[STARPU_MAIN_RAM][j] != 0)
  2568. slowness_main_ram_between_node = 1/bandwidth_matrix[STARPU_MAIN_RAM][j];
  2569. else
  2570. slowness_main_ram_between_node = 0;
  2571. bandwidth_matrix[i][j] = 1/(slowness_disk_between_main_ram+slowness_main_ram_between_node);
  2572. }
  2573. else if (j == node) /* destination == disk */
  2574. {
/* convert to slowness */
  2576. if(bandwidth_write != 0)
  2577. slowness_disk_between_main_ram = 1/bandwidth_write;
  2578. else
  2579. slowness_disk_between_main_ram = 0;
  2580. if(bandwidth_matrix[i][STARPU_MAIN_RAM] != 0)
  2581. slowness_main_ram_between_node = 1/bandwidth_matrix[i][STARPU_MAIN_RAM];
  2582. else
  2583. slowness_main_ram_between_node = 0;
  2584. bandwidth_matrix[i][j] = 1/(slowness_disk_between_main_ram+slowness_main_ram_between_node);
  2585. }
  2586. else if (j > node || i > node) /* not affected by the node */
  2587. {
  2588. bandwidth_matrix[i][j] = NAN;
  2589. }
  2590. }
  2591. }
  2592. /* save latency */
  2593. for(i = 0; i < STARPU_MAXNODES; ++i)
  2594. {
  2595. for(j = 0; j < STARPU_MAXNODES; ++j)
  2596. {
  2597. if (i == j && j == node) /* source == destination == node */
  2598. {
  2599. latency_matrix[i][j] = 0;
  2600. }
  2601. else if (i == node) /* source == disk */
  2602. {
  2603. latency_matrix[i][j] = (latency_write+latency_matrix[STARPU_MAIN_RAM][j]);
  2604. }
  2605. else if (j == node) /* destination == disk */
  2606. {
  2607. latency_matrix[i][j] = (latency_read+latency_matrix[i][STARPU_MAIN_RAM]);
  2608. }
  2609. else if (j > node || i > node) /* not affected by the node */
  2610. {
  2611. latency_matrix[i][j] = NAN;
  2612. }
  2613. }
  2614. }
  2615. }