perfmodel_bus.c 89 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633163416351636163716381639164016411642164316441645164616471648164916501651165216531654165516561657165816591660166116621663166416651666166716681669167016711672167316741675167616771678167916801681168216831684168516861687168816891690169116921693169416951696169716981699170017011702170317041705170617071708170917101711171217131714171517161717171817191720172117221723172417251726172717281729173017311732173317341735173617371738173917401741174217431744174517461747174817491750175117521753175417551756175717581759176017611762176317641765176617671768176917701771177217731774177517761777177817791780178117821783178417851786178717881789179017911792179317941795179617971798179918001801180218031804180518061807180818091810181118121813181418151816181718181819182018211822182318241825182618271828182918301831183218331834183518361837183818391840184118421843184418451846184718481849185018511852185318541855185618571858185918601861186218631864186518661867186818691870187118721873187418751876187718781879188018811882188318841885188618871888188918901891189218931894189518961897189818991900190119021903190419051906190719081909191019111912191319141915191619171918191919201921192219231924192519261927192819291930193119321933193419351936193719381939194019411942194319441945194619471948194919501951195219531954195519561957195819591960196119621963196419651966196719681969197019711972197319741975197619771978197919801981198219831984198519861987198819891990199119921993199419951996199719981999200020012002200320042005200620072008200920102011201220132014201520162017201820192020202120222023202420252026202720282029203020312032203320342035203620372038203920402041204220432044204520462047204820492050205120522053205420552056205720582059206020612062206320642065206620672068206920702071207220732074207520762077207820792080208120822083208420852086208720882089209020912092209320942095209620972098209921002101210221032104210521062107210821092110211121122113211421152116211721182119212021212122212321242125212621272128212921302131213221332134213521362137213821392140214121422143214421452146214721482149215021512152215321542155215621572158215921602161216221632164216521662167216821692170217121722173217421752176217721782179218021812182218321842185218621872188218921902191219221932194219521962197219821992200220122022203220422052206220722082209221022112212221322142215221622172218221922202221222222232224222522262227222822292230223122322233223422352236223722382239224022412242224322442245224622472248224922502251225222532254225522562257225822592260226122622263226422652266226722682269227022712272227322742275227622772278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277727782779278027812782278327842785278627872788278927902791279227932794279527962797279827992800280128022803280428052806280728082809281028112812281328142815281628172818281928202821282228232824282528262827282828292830283128322833283428352836283728382839284028412842284328442845284628472848284928502851285228532854285528562857285828592860286128622863286428652866286728682869287028712872287328742875287628772878287928802881288228832884288528862887288828892890289128922893289428952896289728982899290029012902290329042905290629072908290929102911291229132914291529162917291829192920292129222923292429252926292729282929293029312932293329342935293629372938293929402941294229432944294529462947294829492950295129522953295429552956295729582959296029612962296329642965296629672968296929702971297229732974297529762977297829792980298129822983298429852986298729882989299029912992299329942995299629972998299930003001300230033004300530063007300830093010301130123013301430153016301730183019302030213022302330243025302630273028302930303031303230333034303530363037303830393040304130423043304430453046304730483049305030513052305330543055305630573058305930603061306230633064306530663067306830693070307130723073307430753076307730783079308030813082308330843085308630873088308930903091309230933094309530963097309830993100310131023103310431053106310731083109311031113112311331143115311631173118311931203121312231233124312531263127312831293130313131323133313431353136313731383139314031413142314331443145314631473148314931503151315231533154315531563157315831593160316131623163316431653166316731683169317031713172317331743175317631773178317931803181318231833184318531863187318831893190319131923193319431953196319731983199320032013202320332043205320632073208320932103211321232133214321532163217321832193220322132223223322432253226322732283229323032313232323332343235323632373238323932403241324232433244324532463247324832493250325132523253325432553256325732583259326032613262326332643265326632673268326932703271327232733274
  1. /* StarPU --- Runtime system for heterogeneous multicore architectures.
  2. *
  3. * Copyright (C) 2009-2021 Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
  4. * Copyright (C) 2013 Corentin Salingue
  5. *
  6. * StarPU is free software; you can redistribute it and/or modify
  7. * it under the terms of the GNU Lesser General Public License as published by
  8. * the Free Software Foundation; either version 2.1 of the License, or (at
  9. * your option) any later version.
  10. *
  11. * StarPU is distributed in the hope that it will be useful, but
  12. * WITHOUT ANY WARRANTY; without even the implied warranty of
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  14. *
  15. * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  16. */
  17. #ifdef STARPU_USE_CUDA
  18. #ifndef _GNU_SOURCE
  19. #define _GNU_SOURCE 1
  20. #endif
  21. #include <sched.h>
  22. #endif
  23. #include <stdlib.h>
  24. #include <math.h>
  25. #include <starpu.h>
  26. #include <starpu_cuda.h>
  27. #include <starpu_opencl.h>
  28. #include <common/config.h>
  29. #ifdef HAVE_UNISTD_H
  30. #include <unistd.h>
  31. #endif
  32. #include <core/workers.h>
  33. #include <core/perfmodel/perfmodel.h>
  34. #include <core/simgrid.h>
  35. #include <core/topology.h>
  36. #include <common/utils.h>
  37. #include <drivers/mpi/driver_mpi_common.h>
  38. #include <datawizard/memory_nodes.h>
  39. #ifdef STARPU_USE_OPENCL
  40. #include <starpu_opencl.h>
  41. #endif
  42. #ifdef STARPU_HAVE_WINDOWS
  43. #include <windows.h>
  44. #endif
  45. #ifdef STARPU_HAVE_HWLOC
  46. #include <hwloc.h>
  47. #ifdef STARPU_HAVE_LIBNVIDIA_ML
  48. #include <hwloc/nvml.h>
  49. #endif
  50. #ifndef HWLOC_API_VERSION
  51. #define HWLOC_OBJ_PU HWLOC_OBJ_PROC
  52. #endif
  53. #if HWLOC_API_VERSION < 0x00010b00
  54. #define HWLOC_OBJ_NUMANODE HWLOC_OBJ_NODE
  55. #endif
  56. #endif
  57. #if defined(HAVE_DECL_HWLOC_CUDA_GET_DEVICE_OSDEV_BY_INDEX) && HAVE_DECL_HWLOC_CUDA_GET_DEVICE_OSDEV_BY_INDEX
  58. #include <hwloc/cuda.h>
  59. #endif
  60. #define SIZE (32*1024*1024*sizeof(char))
  61. #define NITER 32
  62. #define PATH_LENGTH 256
  63. #ifndef STARPU_SIMGRID
  64. static void _starpu_bus_force_sampling(void);
  65. #endif
  66. /* timing is in µs per byte (i.e. slowness, inverse of bandwidth) */
  67. struct dev_timing
  68. {
  69. int numa_id;
  70. double timing_htod;
  71. double latency_htod;
  72. double timing_dtoh;
  73. double latency_dtoh;
  74. };
  75. /* TODO: measure latency */
  76. static double bandwidth_matrix[STARPU_MAXNODES][STARPU_MAXNODES]; /* MB/s */
  77. static double latency_matrix[STARPU_MAXNODES][STARPU_MAXNODES]; /* µs */
  78. static unsigned was_benchmarked = 0;
  79. #ifndef STARPU_SIMGRID
  80. static unsigned ncpus = 0;
  81. #endif
  82. static unsigned nnumas = 0;
  83. static unsigned ncuda = 0;
  84. static unsigned nopencl = 0;
  85. #ifndef STARPU_SIMGRID
  86. static unsigned nmpi_ms = 0;
  87. /* Benchmarking the performance of the bus */
  88. static double numa_latency[STARPU_MAXNUMANODES][STARPU_MAXNUMANODES];
  89. static double numa_timing[STARPU_MAXNUMANODES][STARPU_MAXNUMANODES];
  90. static uint64_t cuda_size[STARPU_MAXCUDADEVS];
  91. #endif
  92. #ifdef STARPU_USE_CUDA
  93. /* preference order of cores (logical indexes) */
  94. static unsigned cuda_affinity_matrix[STARPU_MAXCUDADEVS][STARPU_MAXNUMANODES];
  95. #ifndef STARPU_SIMGRID
  96. #ifdef STARPU_HAVE_CUDA_MEMCPY_PEER
  97. static double cudadev_timing_dtod[STARPU_MAXNODES][STARPU_MAXNODES] = {{0.0}};
  98. static double cudadev_latency_dtod[STARPU_MAXNODES][STARPU_MAXNODES] = {{0.0}};
  99. #endif
  100. #endif
  101. static struct dev_timing cudadev_timing_per_numa[STARPU_MAXCUDADEVS*STARPU_MAXNUMANODES];
  102. static char cudadev_direct[STARPU_MAXNODES][STARPU_MAXNODES];
  103. #endif
  104. #ifndef STARPU_SIMGRID
  105. static uint64_t opencl_size[STARPU_MAXOPENCLDEVS];
  106. #endif
  107. #ifdef STARPU_USE_OPENCL
  108. /* preference order of cores (logical indexes) */
  109. static unsigned opencl_affinity_matrix[STARPU_MAXOPENCLDEVS][STARPU_MAXNUMANODES];
  110. static struct dev_timing opencldev_timing_per_numa[STARPU_MAXOPENCLDEVS*STARPU_MAXNUMANODES];
  111. #endif
  112. #ifdef STARPU_USE_FPGA
  113. /* preference order of cores (logical indexes) */
  114. static unsigned fpga_affinity_matrix[STARPU_MAXFPGADEVS][STARPU_MAXCPUS];
  115. #endif
  116. #ifdef STARPU_USE_MPI_MASTER_SLAVE
  117. static double mpi_time_device_to_device[STARPU_MAXMPIDEVS][STARPU_MAXMPIDEVS] = {{0.0}};
  118. static double mpi_latency_device_to_device[STARPU_MAXMPIDEVS][STARPU_MAXMPIDEVS] = {{0.0}};
  119. #endif
  120. #ifdef STARPU_HAVE_HWLOC
  121. static hwloc_topology_t hwtopology;
  122. hwloc_topology_t _starpu_perfmodel_get_hwtopology()
  123. {
  124. return hwtopology;
  125. }
  126. #endif
  127. #if (defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)) && !defined(STARPU_SIMGRID)
  128. #ifdef STARPU_USE_CUDA
  129. static void measure_bandwidth_between_host_and_dev_on_numa_with_cuda(int dev, int numa, int cpu, struct dev_timing *dev_timing_per_cpu)
  130. {
  131. _starpu_bind_thread_on_cpu(cpu, STARPU_NOWORKERID, NULL);
  132. size_t size = SIZE;
  133. /* Initialize CUDA context on the device */
  134. /* We do not need to enable OpenGL interoperability at this point,
  135. * since we cleanly shutdown CUDA before returning. */
  136. cudaSetDevice(dev);
  137. /* hack to avoid third party libs to rebind threads */
  138. _starpu_bind_thread_on_cpu(cpu, STARPU_NOWORKERID, NULL);
  139. /* hack to force the initialization */
  140. cudaFree(0);
  141. /* hack to avoid third party libs to rebind threads */
  142. _starpu_bind_thread_on_cpu(cpu, STARPU_NOWORKERID, NULL);
  143. /* Get the maximum size which can be allocated on the device */
  144. struct cudaDeviceProp prop;
  145. cudaError_t cures;
  146. cures = cudaGetDeviceProperties(&prop, dev);
  147. if (STARPU_UNLIKELY(cures)) STARPU_CUDA_REPORT_ERROR(cures);
  148. cuda_size[dev] = prop.totalGlobalMem;
  149. if (size > prop.totalGlobalMem/4) size = prop.totalGlobalMem/4;
  150. /* Allocate a buffer on the device */
  151. unsigned char *d_buffer;
  152. cures = cudaMalloc((void **)&d_buffer, size);
  153. if (STARPU_UNLIKELY(cures)) STARPU_CUDA_REPORT_ERROR(cures);
  154. /* hack to avoid third party libs to rebind threads */
  155. _starpu_bind_thread_on_cpu(cpu, STARPU_NOWORKERID, NULL);
  156. /* Allocate a buffer on the host */
  157. unsigned char *h_buffer;
  158. #if defined(STARPU_HAVE_HWLOC)
  159. struct _starpu_machine_config *config = _starpu_get_machine_config();
  160. const unsigned nnuma_nodes = _starpu_topology_get_nnumanodes(config);
  161. if (nnuma_nodes > 1)
  162. {
  163. /* NUMA mode activated */
  164. hwloc_obj_t obj = hwloc_get_obj_by_type(hwtopology, HWLOC_OBJ_NUMANODE, numa);
  165. #if HWLOC_API_VERSION >= 0x00020000
  166. h_buffer = hwloc_alloc_membind(hwtopology, size, obj->nodeset, HWLOC_MEMBIND_BIND, HWLOC_MEMBIND_BYNODESET);
  167. #else
  168. h_buffer = hwloc_alloc_membind_nodeset(hwtopology, size, obj->nodeset, HWLOC_MEMBIND_BIND, 0);
  169. #endif
  170. }
  171. else
  172. #endif
  173. {
  174. /* we use STARPU_MAIN_RAM */
  175. _STARPU_MALLOC(h_buffer, size);
  176. cudaHostRegister((void *)h_buffer, size, 0);
  177. }
  178. if (STARPU_UNLIKELY(cures)) STARPU_CUDA_REPORT_ERROR(cures);
  179. /* hack to avoid third party libs to rebind threads */
  180. _starpu_bind_thread_on_cpu(cpu, STARPU_NOWORKERID, NULL);
  181. /* Fill them */
  182. memset(h_buffer, 0, size);
  183. cudaMemset(d_buffer, 0, size);
  184. cudaDeviceSynchronize();
  185. /* hack to avoid third party libs to rebind threads */
  186. _starpu_bind_thread_on_cpu(cpu, STARPU_NOWORKERID, NULL);
  187. const unsigned timing_numa_index = dev*STARPU_MAXNUMANODES + numa;
  188. unsigned iter;
  189. double timing;
  190. double start;
  191. double end;
  192. /* Measure upload bandwidth */
  193. start = starpu_timing_now();
  194. for (iter = 0; iter < NITER; iter++)
  195. {
  196. cudaMemcpy(d_buffer, h_buffer, size, cudaMemcpyHostToDevice);
  197. cudaDeviceSynchronize();
  198. }
  199. end = starpu_timing_now();
  200. timing = end - start;
  201. dev_timing_per_cpu[timing_numa_index].timing_htod = timing/NITER/size;
  202. /* Measure download bandwidth */
  203. start = starpu_timing_now();
  204. for (iter = 0; iter < NITER; iter++)
  205. {
  206. cudaMemcpy(h_buffer, d_buffer, size, cudaMemcpyDeviceToHost);
  207. cudaDeviceSynchronize();
  208. }
  209. end = starpu_timing_now();
  210. timing = end - start;
  211. dev_timing_per_cpu[timing_numa_index].timing_dtoh = timing/NITER/size;
  212. /* Measure upload latency */
  213. start = starpu_timing_now();
  214. for (iter = 0; iter < NITER; iter++)
  215. {
  216. cudaMemcpy(d_buffer, h_buffer, 1, cudaMemcpyHostToDevice);
  217. cudaDeviceSynchronize();
  218. }
  219. end = starpu_timing_now();
  220. timing = end - start;
  221. dev_timing_per_cpu[timing_numa_index].latency_htod = timing/NITER;
  222. /* Measure download latency */
  223. start = starpu_timing_now();
  224. for (iter = 0; iter < NITER; iter++)
  225. {
  226. cudaMemcpy(h_buffer, d_buffer, 1, cudaMemcpyDeviceToHost);
  227. cudaDeviceSynchronize();
  228. }
  229. end = starpu_timing_now();
  230. timing = end - start;
  231. dev_timing_per_cpu[timing_numa_index].latency_dtoh = timing/NITER;
  232. /* Free buffers */
  233. cudaHostUnregister(h_buffer);
  234. #if defined(STARPU_HAVE_HWLOC)
  235. if (nnuma_nodes > 1)
  236. {
  237. /* NUMA mode activated */
  238. hwloc_free(hwtopology, h_buffer, size);
  239. }
  240. else
  241. #endif
  242. {
  243. free(h_buffer);
  244. }
  245. cudaFree(d_buffer);
  246. #if CUDART_VERSION >= 4000
  247. cudaDeviceReset();
  248. #else
  249. cudaThreadExit();
  250. #endif
  251. }
  252. #ifdef STARPU_HAVE_CUDA_MEMCPY_PEER
  253. static void measure_bandwidth_between_dev_and_dev_cuda(int src, int dst)
  254. {
  255. size_t size = SIZE;
  256. int can;
  257. /* Get the maximum size which can be allocated on the device */
  258. struct cudaDeviceProp prop;
  259. cudaError_t cures;
  260. cures = cudaGetDeviceProperties(&prop, src);
  261. if (STARPU_UNLIKELY(cures)) STARPU_CUDA_REPORT_ERROR(cures);
  262. if (size > prop.totalGlobalMem/4) size = prop.totalGlobalMem/4;
  263. cures = cudaGetDeviceProperties(&prop, dst);
  264. if (STARPU_UNLIKELY(cures)) STARPU_CUDA_REPORT_ERROR(cures);
  265. if (size > prop.totalGlobalMem/4) size = prop.totalGlobalMem/4;
  266. /* Initialize CUDA context on the source */
  267. /* We do not need to enable OpenGL interoperability at this point,
  268. * since we cleanly shutdown CUDA before returning. */
  269. cudaSetDevice(src);
  270. if (starpu_get_env_number("STARPU_ENABLE_CUDA_GPU_GPU_DIRECT") != 0)
  271. {
  272. cures = cudaDeviceCanAccessPeer(&can, src, dst);
  273. (void) cudaGetLastError();
  274. if (!cures && can)
  275. {
  276. cures = cudaDeviceEnablePeerAccess(dst, 0);
  277. (void) cudaGetLastError();
  278. if (!cures)
  279. {
  280. _STARPU_DISP("GPU-Direct %d -> %d\n", dst, src);
  281. cudadev_direct[src][dst] = 1;
  282. }
  283. }
  284. }
  285. /* Allocate a buffer on the device */
  286. unsigned char *s_buffer;
  287. cures = cudaMalloc((void **)&s_buffer, size);
  288. if (STARPU_UNLIKELY(cures)) STARPU_CUDA_REPORT_ERROR(cures);
  289. cudaMemset(s_buffer, 0, size);
  290. cudaDeviceSynchronize();
  291. /* Initialize CUDA context on the destination */
  292. /* We do not need to enable OpenGL interoperability at this point,
  293. * since we cleanly shutdown CUDA before returning. */
  294. cudaSetDevice(dst);
  295. if (starpu_get_env_number("STARPU_ENABLE_CUDA_GPU_GPU_DIRECT") != 0)
  296. {
  297. cures = cudaDeviceCanAccessPeer(&can, dst, src);
  298. (void) cudaGetLastError();
  299. if (!cures && can)
  300. {
  301. cures = cudaDeviceEnablePeerAccess(src, 0);
  302. (void) cudaGetLastError();
  303. if (!cures)
  304. {
  305. _STARPU_DISP("GPU-Direct %d -> %d\n", src, dst);
  306. cudadev_direct[dst][src] = 1;
  307. }
  308. }
  309. }
  310. /* Allocate a buffer on the device */
  311. unsigned char *d_buffer;
  312. cures = cudaMalloc((void **)&d_buffer, size);
  313. if (STARPU_UNLIKELY(cures)) STARPU_CUDA_REPORT_ERROR(cures);
  314. cudaMemset(d_buffer, 0, size);
  315. cudaDeviceSynchronize();
  316. unsigned iter;
  317. double timing;
  318. double start;
  319. double end;
  320. /* Measure upload bandwidth */
  321. start = starpu_timing_now();
  322. for (iter = 0; iter < NITER; iter++)
  323. {
  324. cudaMemcpyPeer(d_buffer, dst, s_buffer, src, size);
  325. cudaDeviceSynchronize();
  326. }
  327. end = starpu_timing_now();
  328. timing = end - start;
  329. cudadev_timing_dtod[src][dst] = timing/NITER/size;
  330. /* Measure upload latency */
  331. start = starpu_timing_now();
  332. for (iter = 0; iter < NITER; iter++)
  333. {
  334. cudaMemcpyPeer(d_buffer, dst, s_buffer, src, 1);
  335. cudaDeviceSynchronize();
  336. }
  337. end = starpu_timing_now();
  338. timing = end - start;
  339. cudadev_latency_dtod[src][dst] = timing/NITER;
  340. /* Free buffers */
  341. cudaFree(d_buffer);
  342. cudaSetDevice(src);
  343. cudaFree(s_buffer);
  344. #if CUDART_VERSION >= 4000
  345. cudaDeviceReset();
  346. #else
  347. cudaThreadExit();
  348. #endif
  349. }
  350. #endif
  351. #endif
  352. #ifdef STARPU_USE_OPENCL
  353. static void measure_bandwidth_between_host_and_dev_on_numa_with_opencl(int dev, int numa, int cpu, struct dev_timing *dev_timing_per_cpu)
  354. {
  355. cl_context context;
  356. cl_command_queue queue;
  357. cl_int err=0;
  358. size_t size = SIZE;
  359. int not_initialized;
  360. _starpu_bind_thread_on_cpu(cpu, STARPU_NOWORKERID, NULL);
  361. /* Is the context already initialised ? */
  362. starpu_opencl_get_context(dev, &context);
  363. not_initialized = (context == NULL);
  364. if (not_initialized == 1)
  365. _starpu_opencl_init_context(dev);
  366. /* Get context and queue */
  367. starpu_opencl_get_context(dev, &context);
  368. starpu_opencl_get_queue(dev, &queue);
  369. /* Get the maximum size which can be allocated on the device */
  370. cl_device_id device;
  371. cl_ulong maxMemAllocSize, totalGlobalMem;
  372. starpu_opencl_get_device(dev, &device);
  373. err = clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(maxMemAllocSize), &maxMemAllocSize, NULL);
  374. if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
  375. if (size > (size_t)maxMemAllocSize/4) size = maxMemAllocSize/4;
  376. err = clGetDeviceInfo(device, CL_DEVICE_GLOBAL_MEM_SIZE , sizeof(totalGlobalMem), &totalGlobalMem, NULL);
  377. if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
  378. opencl_size[dev] = totalGlobalMem;
  379. if (_starpu_opencl_get_device_type(dev) == CL_DEVICE_TYPE_CPU)
  380. {
  381. /* Let's not use too much RAM when running OpenCL on a CPU: it
  382. * would make the OS swap like crazy. */
  383. size /= 2;
  384. }
  385. /* hack to avoid third party libs to rebind threads */
  386. _starpu_bind_thread_on_cpu(cpu, STARPU_NOWORKERID, NULL);
  387. /* Allocate a buffer on the device */
  388. cl_mem d_buffer;
  389. d_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, &err);
  390. if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
  391. /* hack to avoid third party libs to rebind threads */
  392. _starpu_bind_thread_on_cpu(cpu, STARPU_NOWORKERID, NULL);
  393. /* Allocate a buffer on the host */
  394. unsigned char *h_buffer;
  395. #if defined(STARPU_HAVE_HWLOC)
  396. struct _starpu_machine_config *config = _starpu_get_machine_config();
  397. const unsigned nnuma_nodes = _starpu_topology_get_nnumanodes(config);
  398. if (nnuma_nodes > 1)
  399. {
  400. /* NUMA mode activated */
  401. hwloc_obj_t obj = hwloc_get_obj_by_type(hwtopology, HWLOC_OBJ_NUMANODE, numa);
  402. #if HWLOC_API_VERSION >= 0x00020000
  403. h_buffer = hwloc_alloc_membind(hwtopology, size, obj->nodeset, HWLOC_MEMBIND_BIND, HWLOC_MEMBIND_BYNODESET);
  404. #else
  405. h_buffer = hwloc_alloc_membind_nodeset(hwtopology, size, obj->nodeset, HWLOC_MEMBIND_BIND, 0);
  406. #endif
  407. }
  408. else
  409. #endif
  410. {
  411. /* we use STARPU_MAIN_RAM */
  412. _STARPU_MALLOC(h_buffer, size);
  413. }
  414. /* hack to avoid third party libs to rebind threads */
  415. _starpu_bind_thread_on_cpu(cpu, STARPU_NOWORKERID, NULL);
  416. /* Fill them */
  417. memset(h_buffer, 0, size);
  418. err = clEnqueueWriteBuffer(queue, d_buffer, CL_TRUE, 0, size, h_buffer, 0, NULL, NULL);
  419. if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
  420. clFinish(queue);
  421. /* hack to avoid third party libs to rebind threads */
  422. _starpu_bind_thread_on_cpu(cpu, STARPU_NOWORKERID, NULL);
  423. const unsigned timing_numa_index = dev*STARPU_MAXNUMANODES + numa;
  424. unsigned iter;
  425. double timing;
  426. double start;
  427. double end;
  428. /* Measure upload bandwidth */
  429. start = starpu_timing_now();
  430. for (iter = 0; iter < NITER; iter++)
  431. {
  432. err = clEnqueueWriteBuffer(queue, d_buffer, CL_TRUE, 0, size, h_buffer, 0, NULL, NULL);
  433. if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
  434. clFinish(queue);
  435. }
  436. end = starpu_timing_now();
  437. timing = end - start;
  438. dev_timing_per_cpu[timing_numa_index].timing_htod = timing/NITER/size;
  439. /* Measure download bandwidth */
  440. start = starpu_timing_now();
  441. for (iter = 0; iter < NITER; iter++)
  442. {
  443. err = clEnqueueReadBuffer(queue, d_buffer, CL_TRUE, 0, size, h_buffer, 0, NULL, NULL);
  444. if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
  445. clFinish(queue);
  446. }
  447. end = starpu_timing_now();
  448. timing = end - start;
  449. dev_timing_per_cpu[timing_numa_index].timing_dtoh = timing/NITER/size;
  450. /* Measure upload latency */
  451. start = starpu_timing_now();
  452. for (iter = 0; iter < NITER; iter++)
  453. {
  454. err = clEnqueueWriteBuffer(queue, d_buffer, CL_TRUE, 0, 1, h_buffer, 0, NULL, NULL);
  455. if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
  456. clFinish(queue);
  457. }
  458. end = starpu_timing_now();
  459. timing = end - start;
  460. dev_timing_per_cpu[timing_numa_index].latency_htod = timing/NITER;
  461. /* Measure download latency */
  462. start = starpu_timing_now();
  463. for (iter = 0; iter < NITER; iter++)
  464. {
  465. err = clEnqueueReadBuffer(queue, d_buffer, CL_TRUE, 0, 1, h_buffer, 0, NULL, NULL);
  466. if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
  467. clFinish(queue);
  468. }
  469. end = starpu_timing_now();
  470. timing = end - start;
  471. dev_timing_per_cpu[timing_numa_index].latency_dtoh = timing/NITER;
  472. /* Free buffers */
  473. err = clReleaseMemObject(d_buffer);
  474. if (STARPU_UNLIKELY(err != CL_SUCCESS))
  475. STARPU_OPENCL_REPORT_ERROR(err);
  476. #if defined(STARPU_HAVE_HWLOC)
  477. if (nnuma_nodes > 1)
  478. {
  479. /* NUMA mode activated */
  480. hwloc_free(hwtopology, h_buffer, size);
  481. }
  482. else
  483. #endif
  484. {
  485. free(h_buffer);
  486. }
  487. /* Uninitiliaze OpenCL context on the device */
  488. if (not_initialized == 1)
  489. _starpu_opencl_deinit_context(dev);
  490. }
  491. #endif
  492. /* NB: we want to sort the bandwidth by DECREASING order */
  493. static int compar_dev_timing(const void *left_dev_timing, const void *right_dev_timing)
  494. {
  495. const struct dev_timing *left = (const struct dev_timing *)left_dev_timing;
  496. const struct dev_timing *right = (const struct dev_timing *)right_dev_timing;
  497. double left_dtoh = left->timing_dtoh;
  498. double left_htod = left->timing_htod;
  499. double right_dtoh = right->timing_dtoh;
  500. double right_htod = right->timing_htod;
  501. double timing_sum2_left = left_dtoh*left_dtoh + left_htod*left_htod;
  502. double timing_sum2_right = right_dtoh*right_dtoh + right_htod*right_htod;
  503. /* it's for a decreasing sorting */
  504. return (timing_sum2_left > timing_sum2_right);
  505. }
  506. #ifdef STARPU_HAVE_HWLOC
  507. static int find_cpu_from_numa_node(hwloc_obj_t obj)
  508. {
  509. STARPU_ASSERT(obj);
  510. hwloc_obj_t current = obj;
  511. while (current->depth != HWLOC_OBJ_PU)
  512. {
  513. current = current->first_child;
  514. /* If we don't find a "PU" obj before the leave, perhaps we are
  515. * just not allowed to use it. */
  516. if (!current)
  517. return -1;
  518. }
  519. STARPU_ASSERT(current->depth == HWLOC_OBJ_PU);
  520. return current->logical_index;
  521. }
  522. #endif
  523. static void measure_bandwidth_between_numa_nodes_and_dev(int dev, struct dev_timing *dev_timing_per_numanode, char *type)
  524. {
  525. /* We measure the bandwith between each GPU and each NUMA node */
  526. struct _starpu_machine_config * config = _starpu_get_machine_config();
  527. const unsigned nnuma_nodes = _starpu_topology_get_nnumanodes(config);
  528. unsigned numa_id;
  529. for (numa_id = 0; numa_id < nnuma_nodes; numa_id++)
  530. {
  531. /* Store results by starpu id */
  532. const unsigned timing_numa_index = dev*STARPU_MAXNUMANODES + numa_id;
  533. /* Store STARPU_memnode for later */
  534. dev_timing_per_numanode[timing_numa_index].numa_id = numa_id;
  535. /* Chose one CPU connected to this NUMA node */
  536. int cpu_id = 0;
  537. #ifdef STARPU_HAVE_HWLOC
  538. hwloc_obj_t obj = hwloc_get_obj_by_type(hwtopology, HWLOC_OBJ_NUMANODE, numa_id);
  539. if (obj)
  540. {
  541. #if HWLOC_API_VERSION >= 0x00020000
  542. /* From hwloc 2.0, NUMAnode objects do not contain CPUs, they are contained in a group which contain the CPUs. */
  543. obj = obj->parent;
  544. #endif
  545. cpu_id = find_cpu_from_numa_node(obj);
  546. }
  547. else
  548. /* No such NUMA node, probably hwloc 1.x with no NUMA
  549. * node, just take one CPU from the whole system */
  550. cpu_id = find_cpu_from_numa_node(hwloc_get_root_obj(hwtopology));
  551. #endif
  552. if (cpu_id < 0)
  553. continue;
  554. #ifdef STARPU_USE_CUDA
  555. if (strncmp(type, "CUDA", 4) == 0)
  556. measure_bandwidth_between_host_and_dev_on_numa_with_cuda(dev, numa_id, cpu_id, dev_timing_per_numanode);
  557. #endif
  558. #ifdef STARPU_USE_OPENCL
  559. if (strncmp(type, "OpenCL", 6) == 0)
  560. measure_bandwidth_between_host_and_dev_on_numa_with_opencl(dev, numa_id, cpu_id, dev_timing_per_numanode);
  561. #endif
  562. }
  563. }
  564. static void measure_bandwidth_between_host_and_dev(int dev, struct dev_timing *dev_timing_per_numa, char *type)
  565. {
  566. measure_bandwidth_between_numa_nodes_and_dev(dev, dev_timing_per_numa, type);
  567. #ifdef STARPU_VERBOSE
  568. struct _starpu_machine_config * config = _starpu_get_machine_config();
  569. const unsigned nnuma_nodes = _starpu_topology_get_nnumanodes(config);
  570. unsigned numa_id;
  571. for (numa_id = 0; numa_id < nnuma_nodes; numa_id++)
  572. {
  573. const unsigned timing_numa_index = dev*STARPU_MAXNUMANODES + numa_id;
  574. double bandwidth_dtoh = dev_timing_per_numa[timing_numa_index].timing_dtoh;
  575. double bandwidth_htod = dev_timing_per_numa[timing_numa_index].timing_htod;
  576. double bandwidth_sum2 = bandwidth_dtoh*bandwidth_dtoh + bandwidth_htod*bandwidth_htod;
  577. _STARPU_DISP("(%10s) BANDWIDTH GPU %d NUMA %u - htod %f - dtoh %f - %f\n", type, dev, numa_id, bandwidth_htod, bandwidth_dtoh, sqrt(bandwidth_sum2));
  578. }
  579. #endif
  580. }
  581. #endif /* defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL) */
  582. #if !defined(STARPU_SIMGRID)
  583. static void measure_bandwidth_latency_between_numa(int numa_src, int numa_dst)
  584. {
  585. #if defined(STARPU_HAVE_HWLOC)
  586. if (nnumas > 1)
  587. {
  588. /* NUMA mode activated */
  589. double start, end, timing;
  590. unsigned iter;
  591. unsigned char *h_buffer;
  592. hwloc_obj_t obj_src = hwloc_get_obj_by_type(hwtopology, HWLOC_OBJ_NUMANODE, numa_src);
  593. #if HWLOC_API_VERSION >= 0x00020000
  594. h_buffer = hwloc_alloc_membind(hwtopology, SIZE, obj_src->nodeset, HWLOC_MEMBIND_BIND, HWLOC_MEMBIND_BYNODESET);
  595. #else
  596. h_buffer = hwloc_alloc_membind_nodeset(hwtopology, SIZE, obj_src->nodeset, HWLOC_MEMBIND_BIND, 0);
  597. #endif
  598. unsigned char *d_buffer;
  599. hwloc_obj_t obj_dst = hwloc_get_obj_by_type(hwtopology, HWLOC_OBJ_NUMANODE, numa_dst);
  600. #if HWLOC_API_VERSION >= 0x00020000
  601. d_buffer = hwloc_alloc_membind(hwtopology, SIZE, obj_dst->nodeset, HWLOC_MEMBIND_BIND, HWLOC_MEMBIND_BYNODESET);
  602. #else
  603. d_buffer = hwloc_alloc_membind_nodeset(hwtopology, SIZE, obj_dst->nodeset, HWLOC_MEMBIND_BIND, 0);
  604. #endif
  605. memset(h_buffer, 0, SIZE);
  606. start = starpu_timing_now();
  607. for (iter = 0; iter < NITER; iter++)
  608. {
  609. memcpy(d_buffer, h_buffer, SIZE);
  610. }
  611. end = starpu_timing_now();
  612. timing = end - start;
  613. numa_timing[numa_src][numa_dst] = timing/NITER/SIZE;
  614. start = starpu_timing_now();
  615. for (iter = 0; iter < NITER; iter++)
  616. {
  617. memcpy(d_buffer, h_buffer, 1);
  618. }
  619. end = starpu_timing_now();
  620. timing = end - start;
  621. numa_latency[numa_src][numa_dst] = timing/NITER;
  622. hwloc_free(hwtopology, h_buffer, SIZE);
  623. hwloc_free(hwtopology, d_buffer, SIZE);
  624. }
  625. else
  626. #endif
  627. {
  628. /* Cannot make a real calibration */
  629. numa_timing[numa_src][numa_dst] = 0.01;
  630. numa_latency[numa_src][numa_dst] = 0;
  631. }
  632. }
  633. #endif
  634. static void benchmark_all_gpu_devices(void)
  635. {
  636. #ifdef STARPU_SIMGRID
  637. _STARPU_DISP("Can not measure bus in simgrid mode, please run starpu_calibrate_bus in non-simgrid mode to make sure the bus performance model was calibrated\n");
  638. STARPU_ABORT();
  639. #else /* !SIMGRID */
  640. unsigned i, j;
  641. _STARPU_DEBUG("Benchmarking the speed of the bus\n");
  642. #ifdef STARPU_HAVE_HWLOC
  643. hwloc_topology_init(&hwtopology);
  644. _starpu_topology_filter(hwtopology);
  645. hwloc_topology_load(hwtopology);
  646. #endif
  647. #ifdef STARPU_HAVE_HWLOC
  648. hwloc_bitmap_t former_cpuset = hwloc_bitmap_alloc();
  649. hwloc_get_cpubind(hwtopology, former_cpuset, HWLOC_CPUBIND_THREAD);
  650. #elif defined(__linux__)
  651. /* Save the current cpu binding */
  652. cpu_set_t former_process_affinity;
  653. int ret;
  654. ret = sched_getaffinity(0, sizeof(former_process_affinity), &former_process_affinity);
  655. if (ret)
  656. {
  657. perror("sched_getaffinity");
  658. STARPU_ABORT();
  659. }
  660. #else
  661. #warning Missing binding support, StarPU will not be able to properly benchmark NUMA topology
  662. #endif
  663. struct _starpu_machine_config *config = _starpu_get_machine_config();
  664. ncpus = _starpu_topology_get_nhwcpu(config);
  665. nnumas = _starpu_topology_get_nnumanodes(config);
  666. for (i = 0; i < nnumas; i++)
  667. for (j = 0; j < nnumas; j++)
  668. if (i != j)
  669. {
  670. _STARPU_DISP("NUMA %d -> %d...\n", i, j);
  671. measure_bandwidth_latency_between_numa(i, j);
  672. }
  673. #ifdef STARPU_USE_CUDA
  674. ncuda = _starpu_get_cuda_device_count();
  675. for (i = 0; i < ncuda; i++)
  676. {
  677. _STARPU_DISP("CUDA %u...\n", i);
  678. /* measure bandwidth between Host and Device i */
  679. measure_bandwidth_between_host_and_dev(i, cudadev_timing_per_numa, "CUDA");
  680. }
  681. #ifdef STARPU_HAVE_CUDA_MEMCPY_PEER
  682. for (i = 0; i < ncuda; i++)
  683. {
  684. for (j = 0; j < ncuda; j++)
  685. if (i != j)
  686. {
  687. _STARPU_DISP("CUDA %u -> %u...\n", i, j);
  688. /* measure bandwidth between Host and Device i */
  689. measure_bandwidth_between_dev_and_dev_cuda(i, j);
  690. }
  691. }
  692. #endif
  693. #endif
  694. #ifdef STARPU_USE_OPENCL
  695. nopencl = _starpu_opencl_get_device_count();
  696. for (i = 0; i < nopencl; i++)
  697. {
  698. _STARPU_DISP("OpenCL %u...\n", i);
  699. /* measure bandwith between Host and Device i */
  700. measure_bandwidth_between_host_and_dev(i, opencldev_timing_per_numa, "OpenCL");
  701. }
  702. #endif
  703. #ifdef STARPU_USE_MPI_MASTER_SLAVE
  704. _starpu_mpi_common_measure_bandwidth_latency(mpi_time_device_to_device, mpi_latency_device_to_device);
  705. #endif /* STARPU_USE_MPI_MASTER_SLAVE */
  706. #ifdef STARPU_HAVE_HWLOC
  707. hwloc_set_cpubind(hwtopology, former_cpuset, HWLOC_CPUBIND_THREAD);
  708. hwloc_bitmap_free(former_cpuset);
  709. #elif defined(__linux__)
  710. /* Restore the former affinity */
  711. ret = sched_setaffinity(0, sizeof(former_process_affinity), &former_process_affinity);
  712. if (ret)
  713. {
  714. perror("sched_setaffinity");
  715. STARPU_ABORT();
  716. }
  717. #endif
  718. #ifdef STARPU_HAVE_HWLOC
  719. hwloc_topology_destroy(hwtopology);
  720. #endif
  721. _STARPU_DEBUG("Benchmarking the speed of the bus is done.\n");
  722. was_benchmarked = 1;
  723. #endif /* !SIMGRID */
  724. }
  725. static void get_bus_path(const char *type, char *path, size_t maxlen)
  726. {
  727. char hostname[65];
  728. _starpu_gethostname(hostname, sizeof(hostname));
  729. snprintf(path, maxlen, "%s%s.%s", _starpu_get_perf_model_dir_bus(), hostname, type);
  730. }
  731. /*
  732. * Affinity
  733. */
  734. static void get_affinity_path(char *path, size_t maxlen)
  735. {
  736. get_bus_path("affinity", path, maxlen);
  737. }
  738. #ifndef STARPU_SIMGRID
  739. #if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
  740. static void load_bus_affinity_file_content(void)
  741. {
  742. FILE *f;
  743. int locked;
  744. char path[PATH_LENGTH];
  745. get_affinity_path(path, sizeof(path));
  746. _STARPU_DEBUG("loading affinities from %s\n", path);
  747. f = fopen(path, "r");
  748. STARPU_ASSERT_MSG(f, "Error when reading from file '%s'", path);
  749. locked = _starpu_frdlock(f) == 0;
  750. unsigned gpu;
  751. #ifdef STARPU_USE_CUDA
  752. ncuda = _starpu_get_cuda_device_count();
  753. for (gpu = 0; gpu < ncuda; gpu++)
  754. {
  755. int ret;
  756. unsigned dummy;
  757. _starpu_drop_comments(f);
  758. ret = fscanf(f, "%u\t", &dummy);
  759. STARPU_ASSERT_MSG(ret == 1, "Error when reading from file '%s'", path);
  760. STARPU_ASSERT(dummy == gpu);
  761. unsigned numa;
  762. for (numa = 0; numa < nnumas; numa++)
  763. {
  764. ret = fscanf(f, "%u\t", &cuda_affinity_matrix[gpu][numa]);
  765. STARPU_ASSERT_MSG(ret == 1, "Error when reading from file '%s'", path);
  766. }
  767. ret = fscanf(f, "\n");
  768. STARPU_ASSERT_MSG(ret == 0, "Error when reading from file '%s'", path);
  769. }
  770. #endif /* !STARPU_USE_CUDA */
  771. #ifdef STARPU_USE_OPENCL
  772. nopencl = _starpu_opencl_get_device_count();
  773. for (gpu = 0; gpu < nopencl; gpu++)
  774. {
  775. int ret;
  776. unsigned dummy;
  777. _starpu_drop_comments(f);
  778. ret = fscanf(f, "%u\t", &dummy);
  779. STARPU_ASSERT_MSG(ret == 1, "Error when reading from file '%s'", path);
  780. STARPU_ASSERT(dummy == gpu);
  781. unsigned numa;
  782. for (numa = 0; numa < nnumas; numa++)
  783. {
  784. ret = fscanf(f, "%u\t", &opencl_affinity_matrix[gpu][numa]);
  785. STARPU_ASSERT_MSG(ret == 1, "Error when reading from file '%s'", path);
  786. }
  787. ret = fscanf(f, "\n");
  788. STARPU_ASSERT_MSG(ret == 0, "Error when reading from file '%s'", path);
  789. }
  790. #endif /* !STARPU_USE_OPENCL */
  791. if (locked)
  792. _starpu_frdunlock(f);
  793. fclose(f);
  794. }
  795. #endif /* !(STARPU_USE_CUDA_ || STARPU_USE_OPENCL */
  796. #ifndef STARPU_SIMGRID
  797. static void write_bus_affinity_file_content(void)
  798. {
  799. STARPU_ASSERT(was_benchmarked);
  800. #if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
  801. FILE *f;
  802. char path[PATH_LENGTH];
  803. int locked;
  804. get_affinity_path(path, sizeof(path));
  805. _STARPU_DEBUG("writing affinities to %s\n", path);
  806. f = fopen(path, "w+");
  807. if (!f)
  808. {
  809. perror("fopen write_buf_affinity_file_content");
  810. _STARPU_DISP("path '%s'\n", path);
  811. fflush(stderr);
  812. STARPU_ABORT();
  813. }
  814. locked = _starpu_frdlock(f) == 0;
  815. unsigned numa;
  816. unsigned gpu;
  817. fprintf(f, "# GPU\t");
  818. for (numa = 0; numa < nnumas; numa++)
  819. fprintf(f, "NUMA%u\t", numa);
  820. fprintf(f, "\n");
  821. #ifdef STARPU_USE_CUDA
  822. {
  823. /* Use an other array to sort bandwidth */
  824. struct dev_timing cudadev_timing_per_numa_sorted[STARPU_MAXCUDADEVS*STARPU_MAXNUMANODES];
  825. memcpy(cudadev_timing_per_numa_sorted, cudadev_timing_per_numa, STARPU_MAXCUDADEVS*STARPU_MAXNUMANODES*sizeof(struct dev_timing));
  826. for (gpu = 0; gpu < ncuda; gpu++)
  827. {
  828. fprintf(f, "%u\t", gpu);
  829. qsort(&(cudadev_timing_per_numa_sorted[gpu*STARPU_MAXNUMANODES]), nnumas, sizeof(struct dev_timing), compar_dev_timing);
  830. for (numa = 0; numa < nnumas; numa++)
  831. {
  832. fprintf(f, "%d\t", cudadev_timing_per_numa_sorted[gpu*STARPU_MAXNUMANODES+numa].numa_id);
  833. }
  834. fprintf(f, "\n");
  835. }
  836. }
  837. #endif
  838. #ifdef STARPU_USE_OPENCL
  839. {
  840. /* Use an other array to sort bandwidth */
  841. struct dev_timing opencldev_timing_per_numa_sorted[STARPU_MAXOPENCLDEVS*STARPU_MAXNUMANODES];
  842. memcpy(opencldev_timing_per_numa_sorted, opencldev_timing_per_numa, STARPU_MAXOPENCLDEVS*STARPU_MAXNUMANODES*sizeof(struct dev_timing));
  843. for (gpu = 0; gpu < nopencl; gpu++)
  844. {
  845. fprintf(f, "%u\t", gpu);
  846. qsort(&(opencldev_timing_per_numa_sorted[gpu*STARPU_MAXNUMANODES]), nnumas, sizeof(struct dev_timing), compar_dev_timing);
  847. for (numa = 0; numa < nnumas; numa++)
  848. {
  849. fprintf(f, "%d\t", opencldev_timing_per_numa_sorted[gpu*STARPU_MAXNUMANODES+numa].numa_id);
  850. }
  851. fprintf(f, "\n");
  852. }
  853. }
  854. #endif
  855. if (locked)
  856. _starpu_frdunlock(f);
  857. fclose(f);
  858. #endif
  859. }
  860. #endif /* STARPU_SIMGRID */
  861. static void generate_bus_affinity_file(void)
  862. {
  863. if (!was_benchmarked)
  864. benchmark_all_gpu_devices();
  865. #ifdef STARPU_USE_MPI_MASTER_SLAVE
  866. /* Slaves don't write files */
  867. if (!_starpu_mpi_common_is_src_node())
  868. return;
  869. #endif
  870. write_bus_affinity_file_content();
  871. }
  872. #if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
  873. static int check_bus_affinity_file(void)
  874. {
  875. int ret = 1;
  876. FILE *f;
  877. int locked;
  878. unsigned dummy;
  879. char path[PATH_LENGTH];
  880. get_affinity_path(path, sizeof(path));
  881. _STARPU_DEBUG("loading affinities from %s\n", path);
  882. f = fopen(path, "r");
  883. STARPU_ASSERT_MSG(f, "Error when reading from file '%s'", path);
  884. locked = _starpu_frdlock(f) == 0;
  885. ret = fscanf(f, "# GPU\t");
  886. STARPU_ASSERT_MSG(ret == 0, "Error when reading from file '%s'", path);
  887. ret = fscanf(f, "NUMA%u\t", &dummy);
  888. if (locked)
  889. _starpu_frdunlock(f);
  890. fclose(f);
  891. return ret == 1;
  892. }
  893. #endif
  894. static void load_bus_affinity_file(void)
  895. {
  896. #if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
  897. int exist, check = 1;
  898. char path[PATH_LENGTH];
  899. get_affinity_path(path, sizeof(path));
  900. /* access return 0 if file exists */
  901. exist = access(path, F_OK);
  902. if (exist == 0)
  903. /* return 0 if it's not good */
  904. check = check_bus_affinity_file();
  905. if (check == 0)
  906. _STARPU_DISP("Affinity File is too old for this version of StarPU ! Rebuilding it...\n");
  907. if (check == 0 || exist != 0)
  908. {
  909. /* File does not exist yet */
  910. generate_bus_affinity_file();
  911. }
  912. load_bus_affinity_file_content();
  913. #endif
  914. }
  915. #ifdef STARPU_USE_CUDA
  916. unsigned *_starpu_get_cuda_affinity_vector(unsigned gpuid)
  917. {
  918. return cuda_affinity_matrix[gpuid];
  919. }
  920. #endif /* STARPU_USE_CUDA */
  921. #ifdef STARPU_USE_OPENCL
  922. unsigned *_starpu_get_opencl_affinity_vector(unsigned gpuid)
  923. {
  924. return opencl_affinity_matrix[gpuid];
  925. }
  926. #endif /* STARPU_USE_OPENCL */
  927. #ifdef STARPU_USE_FPGA
  928. unsigned *_starpu_get_fpga_affinity_vector(unsigned fpgaid)
  929. {
  930. return fpga_affinity_matrix[fpgaid];
  931. }
  932. #endif /* STARPU_USE_FPGA */
  933. void starpu_bus_print_affinity(FILE *f)
  934. {
  935. #if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
  936. unsigned numa;
  937. unsigned gpu;
  938. #endif
  939. fprintf(f, "# GPU\tNUMA in preference order (logical index)\n");
  940. #ifdef STARPU_USE_CUDA
  941. fprintf(f, "# CUDA\n");
  942. for(gpu = 0 ; gpu<ncuda ; gpu++)
  943. {
  944. fprintf(f, "%u\t", gpu);
  945. for (numa = 0; numa < nnumas; numa++)
  946. {
  947. fprintf(f, "%u\t", cuda_affinity_matrix[gpu][numa]);
  948. }
  949. fprintf(f, "\n");
  950. }
  951. #endif
  952. #ifdef STARPU_USE_OPENCL
  953. fprintf(f, "# OpenCL\n");
  954. for(gpu = 0 ; gpu<nopencl ; gpu++)
  955. {
  956. fprintf(f, "%u\t", gpu);
  957. for (numa = 0; numa < nnumas; numa++)
  958. {
  959. fprintf(f, "%u\t", opencl_affinity_matrix[gpu][numa]);
  960. }
  961. fprintf(f, "\n");
  962. }
  963. #endif
  964. }
  965. #endif /* STARPU_SIMGRID */
  966. /*
  967. * Latency
  968. */
  969. static void get_latency_path(char *path, size_t maxlen)
  970. {
  971. get_bus_path("latency", path, maxlen);
  972. }
  973. static int load_bus_latency_file_content(void)
  974. {
  975. int n;
  976. unsigned src, dst;
  977. FILE *f;
  978. double latency;
  979. int locked;
  980. char path[PATH_LENGTH];
  981. get_latency_path(path, sizeof(path));
  982. _STARPU_DEBUG("loading latencies from %s\n", path);
  983. f = fopen(path, "r");
  984. if (!f)
  985. {
  986. perror("fopen load_bus_latency_file_content");
  987. _STARPU_DISP("path '%s'\n", path);
  988. fflush(stderr);
  989. STARPU_ABORT();
  990. }
  991. locked = _starpu_frdlock(f) == 0;
  992. for (src = 0; src < STARPU_MAXNODES; src++)
  993. {
  994. _starpu_drop_comments(f);
  995. for (dst = 0; dst < STARPU_MAXNODES; dst++)
  996. {
  997. n = _starpu_read_double(f, "%le", &latency);
  998. if (n != 1)
  999. {
  1000. _STARPU_DISP("Error while reading latency file <%s>. Expected a number. Did you change the maximum number of GPUs at ./configure time?\n", path);
  1001. fclose(f);
  1002. return 0;
  1003. }
  1004. n = getc(f);
  1005. if (n == '\n')
  1006. break;
  1007. if (n != '\t')
  1008. {
  1009. _STARPU_DISP("bogus character '%c' (%d) in latency file %s\n", n, n, path);
  1010. fclose(f);
  1011. return 0;
  1012. }
  1013. latency_matrix[src][dst] = latency;
  1014. /* Look out for \t\n */
  1015. n = getc(f);
  1016. if (n == '\n')
  1017. break;
  1018. ungetc(n, f);
  1019. n = '\t';
  1020. }
  1021. /* No more values, take NAN */
  1022. for ( ; dst < STARPU_MAXNODES; dst++)
  1023. latency_matrix[src][dst] = NAN;
  1024. while (n == '\t')
  1025. {
  1026. /* Look out for \t\n */
  1027. n = getc(f);
  1028. if (n == '\n')
  1029. break;
  1030. ungetc(n, f);
  1031. n = _starpu_read_double(f, "%le", &latency);
  1032. if (n && !isnan(latency))
  1033. {
  1034. _STARPU_DISP("Too many nodes in latency file %s for this configuration (%d). Did you change the maximum number of GPUs at ./configure time?\n", path, STARPU_MAXNODES);
  1035. fclose(f);
  1036. return 0;
  1037. }
  1038. n = getc(f);
  1039. }
  1040. if (n != '\n')
  1041. {
  1042. _STARPU_DISP("Bogus character '%c' (%d) in latency file %s\n", n, n, path);
  1043. fclose(f);
  1044. return 0;
  1045. }
  1046. /* Look out for EOF */
  1047. n = getc(f);
  1048. if (n == EOF)
  1049. break;
  1050. ungetc(n, f);
  1051. }
  1052. if (locked)
  1053. _starpu_frdunlock(f);
  1054. fclose(f);
  1055. /* No more values, take NAN */
  1056. for ( ; src < STARPU_MAXNODES; src++)
  1057. for (dst = 0; dst < STARPU_MAXNODES; dst++)
  1058. latency_matrix[src][dst] = NAN;
  1059. return 1;
  1060. }
  1061. #if !defined(STARPU_SIMGRID) && (defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL))
  1062. static double search_bus_best_latency(int src, char * type, int htod)
  1063. {
  1064. /* Search the best latency for this node */
  1065. double best = 0.0;
  1066. double actual = 0.0;
  1067. unsigned check = 0;
  1068. unsigned numa;
  1069. for (numa = 0; numa < nnumas; numa++)
  1070. {
  1071. #ifdef STARPU_USE_CUDA
  1072. if (strncmp(type, "CUDA", 4) == 0)
  1073. {
  1074. if (htod)
  1075. actual = cudadev_timing_per_numa[src*STARPU_MAXNUMANODES+numa].latency_htod;
  1076. else
  1077. actual = cudadev_timing_per_numa[src*STARPU_MAXNUMANODES+numa].latency_dtoh;
  1078. }
  1079. #endif
  1080. #ifdef STARPU_USE_OPENCL
  1081. if (strncmp(type, "OpenCL", 6) == 0)
  1082. {
  1083. if (htod)
  1084. actual = opencldev_timing_per_numa[src*STARPU_MAXNUMANODES+numa].latency_htod;
  1085. else
  1086. actual = opencldev_timing_per_numa[src*STARPU_MAXNUMANODES+numa].latency_dtoh;
  1087. }
  1088. #endif
  1089. #if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
  1090. if (!check || actual < best)
  1091. {
  1092. best = actual;
  1093. check = 1;
  1094. }
  1095. #endif
  1096. }
  1097. return best;
  1098. }
  1099. #endif
  1100. #if !defined(STARPU_SIMGRID)
  1101. static void write_bus_latency_file_content(void)
  1102. {
  1103. unsigned src, dst, maxnode;
  1104. /* Boundaries to check if src or dst are inside the interval */
  1105. unsigned b_low, b_up;
  1106. FILE *f;
  1107. int locked;
  1108. STARPU_ASSERT(was_benchmarked);
  1109. char path[PATH_LENGTH];
  1110. get_latency_path(path, sizeof(path));
  1111. _STARPU_DEBUG("writing latencies to %s\n", path);
  1112. f = fopen(path, "a+");
  1113. if (!f)
  1114. {
  1115. perror("fopen write_bus_latency_file_content");
  1116. _STARPU_DISP("path '%s'\n", path);
  1117. fflush(stderr);
  1118. STARPU_ABORT();
  1119. }
  1120. locked = _starpu_fwrlock(f) == 0;
  1121. fseek(f, 0, SEEK_SET);
  1122. _starpu_fftruncate(f, 0);
  1123. fprintf(f, "# ");
  1124. for (dst = 0; dst < STARPU_MAXNODES; dst++)
  1125. fprintf(f, "to %u\t\t", dst);
  1126. fprintf(f, "\n");
  1127. maxnode = nnumas;
  1128. #ifdef STARPU_USE_CUDA
  1129. maxnode += ncuda;
  1130. #endif
  1131. #ifdef STARPU_USE_OPENCL
  1132. maxnode += nopencl;
  1133. #endif
  1134. #ifdef STARPU_USE_MPI_MASTER_SLAVE
  1135. maxnode += nmpi_ms;
  1136. #endif
  1137. for (src = 0; src < STARPU_MAXNODES; src++)
  1138. {
  1139. for (dst = 0; dst < STARPU_MAXNODES; dst++)
  1140. {
  1141. /* µs */
  1142. double latency = 0.0;
  1143. if ((src >= maxnode) || (dst >= maxnode))
  1144. {
  1145. /* convention */
  1146. latency = NAN;
  1147. }
  1148. else if (src == dst)
  1149. {
  1150. latency = 0.0;
  1151. }
  1152. else
  1153. {
  1154. b_low = b_up = 0;
  1155. /* ---- Begin NUMA ---- */
  1156. b_up += nnumas;
  1157. if (src >= b_low && src < b_up && dst >= b_low && dst < b_up)
  1158. latency += numa_latency[src-b_low][dst-b_low];
  1159. /* copy interval to check numa index later */
  1160. #if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
  1161. unsigned numa_low = b_low;
  1162. unsigned numa_up = b_up;
  1163. #endif
  1164. b_low += nnumas;
  1165. /* ---- End NUMA ---- */
  1166. #ifdef STARPU_USE_CUDA
  1167. b_up += ncuda;
  1168. #ifdef STARPU_HAVE_CUDA_MEMCPY_PEER
  1169. if (src >= b_low && src < b_up && dst >= b_low && dst < b_up)
  1170. latency += cudadev_latency_dtod[src-b_low][dst-b_low];
  1171. else
  1172. #endif
  1173. {
  1174. /* Check if it's CUDA <-> NUMA link */
  1175. if (src >=b_low && src < b_up && dst >= numa_low && dst < numa_up)
  1176. latency += cudadev_timing_per_numa[(src-b_low)*STARPU_MAXNUMANODES+dst-numa_low].latency_dtoh;
  1177. if (dst >= b_low && dst < b_up && src >= numa_low && dst < numa_up)
  1178. latency += cudadev_timing_per_numa[(dst-b_low)*STARPU_MAXNUMANODES+src-numa_low].latency_htod;
  1179. /* To other devices, take the best latency */
  1180. if (src >= b_low && src < b_up && !(dst >= numa_low && dst < numa_up))
  1181. latency += search_bus_best_latency(src-b_low, "CUDA", 0);
  1182. if (dst >= b_low && dst < b_up && !(src >= numa_low && dst < numa_up))
  1183. latency += search_bus_best_latency(dst-b_low, "CUDA", 1);
  1184. }
  1185. b_low += ncuda;
  1186. #endif
  1187. #ifdef STARPU_USE_OPENCL
  1188. b_up += nopencl;
  1189. /* Check if it's OpenCL <-> NUMA link */
  1190. if (src >= b_low && src < b_up && dst >= numa_low && dst < numa_up)
  1191. latency += opencldev_timing_per_numa[(src-b_low)*STARPU_MAXNUMANODES+dst-numa_low].latency_dtoh;
  1192. if (dst >= b_low && dst < b_up && src >= numa_low && dst < numa_up)
  1193. latency += opencldev_timing_per_numa[(dst-b_low)*STARPU_MAXNUMANODES+src-numa_low].latency_htod;
  1194. /* To other devices, take the best latency */
  1195. if (src >= b_low && src < b_up && !(dst >= numa_low && dst < numa_up))
  1196. latency += search_bus_best_latency(src-b_low, "OpenCL", 0);
  1197. if (dst >= b_low && dst < b_up && !(src >= numa_low && dst < numa_up))
  1198. latency += search_bus_best_latency(dst-b_low, "OpenCL", 1);
  1199. b_low += nopencl;
  1200. #endif
  1201. #ifdef STARPU_USE_MPI_MASTER_SLAVE
  1202. b_up += nmpi_ms;
  1203. /* Modify MPI src and MPI dst if they contain the master node or not
  1204. * Because, we only take care about slaves */
  1205. int mpi_master = _starpu_mpi_common_get_src_node();
  1206. int mpi_src = src - b_low;
  1207. mpi_src = (mpi_master <= mpi_src) ? mpi_src+1 : mpi_src;
  1208. int mpi_dst = dst - b_low;
  1209. mpi_dst = (mpi_master <= mpi_dst) ? mpi_dst+1 : mpi_dst;
  1210. if (src >= b_low && src < b_up && dst >= b_low && dst < b_up)
  1211. latency += mpi_latency_device_to_device[mpi_src][mpi_dst];
  1212. else
  1213. {
  1214. if (src >= b_low && src < b_up)
  1215. latency += mpi_latency_device_to_device[mpi_src][mpi_master];
  1216. if (dst >= b_low && dst < b_up)
  1217. latency += mpi_latency_device_to_device[mpi_master][mpi_dst];
  1218. }
  1219. b_low += nmpi_ms;
  1220. #endif
  1221. }
  1222. if (dst > 0)
  1223. fputc('\t', f);
  1224. _starpu_write_double(f, "%e", latency);
  1225. }
  1226. fprintf(f, "\n");
  1227. }
  1228. if (locked)
  1229. _starpu_fwrunlock(f);
  1230. fclose(f);
  1231. }
  1232. #endif
  1233. static void generate_bus_latency_file(void)
  1234. {
  1235. if (!was_benchmarked)
  1236. benchmark_all_gpu_devices();
  1237. #ifdef STARPU_USE_MPI_MASTER_SLAVE
  1238. /* Slaves don't write files */
  1239. if (!_starpu_mpi_common_is_src_node())
  1240. return;
  1241. #endif
  1242. #ifndef STARPU_SIMGRID
  1243. write_bus_latency_file_content();
  1244. #endif
  1245. }
  1246. static void load_bus_latency_file(void)
  1247. {
  1248. int res;
  1249. char path[PATH_LENGTH];
  1250. get_latency_path(path, sizeof(path));
  1251. res = access(path, F_OK);
  1252. if (res || !load_bus_latency_file_content())
  1253. {
  1254. /* File does not exist yet or is bogus */
  1255. generate_bus_latency_file();
  1256. res = load_bus_latency_file_content();
  1257. STARPU_ASSERT(res);
  1258. }
  1259. }
  1260. /*
  1261. * Bandwidth
  1262. */
  1263. static void get_bandwidth_path(char *path, size_t maxlen)
  1264. {
  1265. get_bus_path("bandwidth", path, maxlen);
  1266. }
  1267. static int load_bus_bandwidth_file_content(void)
  1268. {
  1269. int n;
  1270. unsigned src, dst;
  1271. FILE *f;
  1272. double bandwidth;
  1273. int locked;
  1274. char path[PATH_LENGTH];
  1275. get_bandwidth_path(path, sizeof(path));
  1276. _STARPU_DEBUG("loading bandwidth from %s\n", path);
  1277. f = fopen(path, "r");
  1278. if (!f)
  1279. {
  1280. perror("fopen load_bus_bandwidth_file_content");
  1281. _STARPU_DISP("path '%s'\n", path);
  1282. fflush(stderr);
  1283. STARPU_ABORT();
  1284. }
  1285. locked = _starpu_frdlock(f) == 0;
  1286. for (src = 0; src < STARPU_MAXNODES; src++)
  1287. {
  1288. _starpu_drop_comments(f);
  1289. for (dst = 0; dst < STARPU_MAXNODES; dst++)
  1290. {
  1291. n = _starpu_read_double(f, "%le", &bandwidth);
  1292. if (n != 1)
  1293. {
  1294. _STARPU_DISP("Error while reading bandwidth file <%s>. Expected a number\n", path);
  1295. fclose(f);
  1296. return 0;
  1297. }
  1298. n = getc(f);
  1299. if (n == '\n')
  1300. break;
  1301. if (n != '\t')
  1302. {
  1303. _STARPU_DISP("bogus character '%c' (%d) in bandwidth file %s\n", n, n, path);
  1304. fclose(f);
  1305. return 0;
  1306. }
  1307. int limit_bandwidth = starpu_get_env_number("STARPU_LIMIT_BANDWIDTH");
  1308. if (limit_bandwidth >= 0)
  1309. {
  1310. #ifndef STARPU_SIMGRID
  1311. _STARPU_DISP("Warning: STARPU_LIMIT_BANDWIDTH set to %d but simgrid not enabled, thus ignored\n", limit_bandwidth);
  1312. #else
  1313. #ifdef HAVE_SG_LINK_BANDWIDTH_SET
  1314. bandwidth = limit_bandwidth;
  1315. #else
  1316. _STARPU_DISP("Warning: STARPU_LIMIT_BANDWIDTH set to %d but this requires simgrid 3.26\n", limit_bandwidth);
  1317. #endif
  1318. #endif
  1319. }
  1320. bandwidth_matrix[src][dst] = bandwidth;
  1321. /* Look out for \t\n */
  1322. n = getc(f);
  1323. if (n == '\n')
  1324. break;
  1325. ungetc(n, f);
  1326. n = '\t';
  1327. }
  1328. /* No more values, take NAN */
  1329. for ( ; dst < STARPU_MAXNODES; dst++)
  1330. bandwidth_matrix[src][dst] = NAN;
  1331. while (n == '\t')
  1332. {
  1333. /* Look out for \t\n */
  1334. n = getc(f);
  1335. if (n == '\n')
  1336. break;
  1337. ungetc(n, f);
  1338. n = _starpu_read_double(f, "%le", &bandwidth);
  1339. if (n && !isnan(bandwidth))
  1340. {
  1341. _STARPU_DISP("Too many nodes in bandwidth file %s for this configuration (%d)\n", path, STARPU_MAXNODES);
  1342. fclose(f);
  1343. return 0;
  1344. }
  1345. n = getc(f);
  1346. }
  1347. if (n != '\n')
  1348. {
  1349. _STARPU_DISP("Bogus character '%c' (%d) in bandwidth file %s\n", n, n, path);
  1350. fclose(f);
  1351. return 0;
  1352. }
  1353. /* Look out for EOF */
  1354. n = getc(f);
  1355. if (n == EOF)
  1356. break;
  1357. ungetc(n, f);
  1358. }
  1359. if (locked)
  1360. _starpu_frdunlock(f);
  1361. fclose(f);
  1362. /* No more values, take NAN */
  1363. for ( ; src < STARPU_MAXNODES; src++)
  1364. for (dst = 0; dst < STARPU_MAXNODES; dst++)
  1365. latency_matrix[src][dst] = NAN;
  1366. return 1;
  1367. }
  1368. #if !defined(STARPU_SIMGRID) && (defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL))
  1369. static double search_bus_best_timing(int src, char * type, int htod)
  1370. {
  1371. /* Search the best latency for this node */
  1372. double best = 0.0;
  1373. double actual = 0.0;
  1374. unsigned check = 0;
  1375. unsigned numa;
  1376. for (numa = 0; numa < nnumas; numa++)
  1377. {
  1378. #ifdef STARPU_USE_CUDA
  1379. if (strncmp(type, "CUDA", 4) == 0)
  1380. {
  1381. if (htod)
  1382. actual = cudadev_timing_per_numa[src*STARPU_MAXNUMANODES+numa].timing_htod;
  1383. else
  1384. actual = cudadev_timing_per_numa[src*STARPU_MAXNUMANODES+numa].timing_dtoh;
  1385. }
  1386. #endif
  1387. #ifdef STARPU_USE_OPENCL
  1388. if (strncmp(type, "OpenCL", 6) == 0)
  1389. {
  1390. if (htod)
  1391. actual = opencldev_timing_per_numa[src*STARPU_MAXNUMANODES+numa].timing_htod;
  1392. else
  1393. actual = opencldev_timing_per_numa[src*STARPU_MAXNUMANODES+numa].timing_dtoh;
  1394. }
  1395. #endif
  1396. #if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
  1397. if (!check || actual < best)
  1398. {
  1399. best = actual;
  1400. check = 1;
  1401. }
  1402. #endif
  1403. }
  1404. return best;
  1405. }
  1406. #endif
  1407. #if !defined(STARPU_SIMGRID)
  1408. static void write_bus_bandwidth_file_content(void)
  1409. {
  1410. unsigned src, dst, maxnode;
  1411. unsigned b_low, b_up;
  1412. FILE *f;
  1413. int locked;
  1414. STARPU_ASSERT(was_benchmarked);
  1415. char path[PATH_LENGTH];
  1416. get_bandwidth_path(path, sizeof(path));
  1417. _STARPU_DEBUG("writing bandwidth to %s\n", path);
  1418. f = fopen(path, "a+");
  1419. STARPU_ASSERT_MSG(f, "Error when opening file (writing) '%s'", path);
  1420. locked = _starpu_fwrlock(f) == 0;
  1421. fseek(f, 0, SEEK_SET);
  1422. _starpu_fftruncate(f, 0);
  1423. fprintf(f, "# ");
  1424. for (dst = 0; dst < STARPU_MAXNODES; dst++)
  1425. fprintf(f, "to %u\t\t", dst);
  1426. fprintf(f, "\n");
  1427. maxnode = nnumas;
  1428. #ifdef STARPU_USE_CUDA
  1429. maxnode += ncuda;
  1430. #endif
  1431. #ifdef STARPU_USE_OPENCL
  1432. maxnode += nopencl;
  1433. #endif
  1434. #ifdef STARPU_USE_MPI_MASTER_SLAVE
  1435. maxnode += nmpi_ms;
  1436. #endif
  1437. for (src = 0; src < STARPU_MAXNODES; src++)
  1438. {
  1439. for (dst = 0; dst < STARPU_MAXNODES; dst++)
  1440. {
  1441. double bandwidth;
  1442. if ((src >= maxnode) || (dst >= maxnode))
  1443. {
  1444. bandwidth = NAN;
  1445. }
  1446. else if (src != dst)
  1447. {
  1448. double slowness = 0.0;
  1449. /* Total bandwidth is the harmonic mean of bandwidths */
  1450. b_low = b_up = 0;
  1451. /* Begin NUMA */
  1452. b_up += nnumas;
  1453. if (src >= b_low && src < b_up && dst >= b_low && dst < b_up)
  1454. slowness += numa_timing[src-b_low][dst-b_low];
  1455. /* copy interval to check numa index later */
  1456. #if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
  1457. unsigned numa_low = b_low;
  1458. unsigned numa_up = b_up;
  1459. #endif
  1460. b_low += nnumas;
  1461. /* End NUMA */
  1462. #ifdef STARPU_USE_CUDA
  1463. b_up += ncuda;
  1464. #ifdef STARPU_HAVE_CUDA_MEMCPY_PEER
  1465. if (src >= b_low && src < b_up && dst >= b_low && dst < b_up)
  1466. /* Direct GPU-GPU transfert */
  1467. slowness += cudadev_timing_dtod[src-b_low][dst-b_low];
  1468. else
  1469. #endif
  1470. {
  1471. /* Check if it's CUDA <-> NUMA link */
  1472. if (src >= b_low && src < b_up && dst >= numa_low && dst < numa_up)
  1473. slowness += cudadev_timing_per_numa[(src-b_low)*STARPU_MAXNUMANODES+dst-numa_low].timing_dtoh;
  1474. if (dst >= b_low && dst < b_up && src >= numa_low && dst < numa_up)
  1475. slowness += cudadev_timing_per_numa[(dst-b_low)*STARPU_MAXNUMANODES+src-numa_low].timing_htod;
  1476. /* To other devices, take the best slowness */
  1477. if (src >= b_low && src < b_up && !(dst >= numa_low && dst < numa_up))
  1478. slowness += search_bus_best_timing(src-b_low, "CUDA", 0);
  1479. if (dst >= b_low && dst < b_up && !(src >= numa_low && dst < numa_up))
  1480. slowness += search_bus_best_timing(dst-b_low, "CUDA", 1);
  1481. }
  1482. b_low += ncuda;
  1483. #endif
  1484. #ifdef STARPU_USE_OPENCL
  1485. b_up += nopencl;
  1486. /* Check if it's OpenCL <-> NUMA link */
  1487. if (src >= b_low && src < b_up && dst >= numa_low && dst < numa_up)
  1488. slowness += opencldev_timing_per_numa[(src-b_low)*STARPU_MAXNUMANODES+dst-numa_low].timing_dtoh;
  1489. if (dst >= b_low && dst < b_up && src >= numa_low && dst < numa_up)
  1490. slowness += opencldev_timing_per_numa[(dst-b_low)*STARPU_MAXNUMANODES+src-numa_low].timing_htod;
  1491. /* To other devices, take the best slowness */
  1492. if (src >= b_low && src < b_up && !(dst >= numa_low && dst < numa_up))
  1493. slowness += search_bus_best_timing(src-b_low, "OpenCL", 0);
  1494. if (dst >= b_low && dst < b_up && !(src >= numa_low && dst < numa_up))
  1495. slowness += search_bus_best_timing(dst-b_low, "OpenCL", 1);
  1496. b_low += nopencl;
  1497. #endif
  1498. #ifdef STARPU_USE_MPI_MASTER_SLAVE
  1499. b_up += nmpi_ms;
  1500. /* Modify MPI src and MPI dst if they contain the master node or not
  1501. * Because, we only take care about slaves */
  1502. int mpi_master = _starpu_mpi_common_get_src_node();
  1503. int mpi_src = src - b_low;
  1504. mpi_src = (mpi_master <= mpi_src) ? mpi_src+1 : mpi_src;
  1505. int mpi_dst = dst - b_low;
  1506. mpi_dst = (mpi_master <= mpi_dst) ? mpi_dst+1 : mpi_dst;
  1507. if (src >= b_low && src < b_up && dst >= b_low && dst < b_up)
  1508. slowness += mpi_time_device_to_device[mpi_src][mpi_dst];
  1509. else
  1510. {
  1511. if (src >= b_low && src < b_up)
  1512. slowness += mpi_time_device_to_device[mpi_src][mpi_master];
  1513. if (dst >= b_low && dst < b_up)
  1514. slowness += mpi_time_device_to_device[mpi_master][mpi_dst];
  1515. }
  1516. b_low += nmpi_ms;
  1517. #endif
  1518. bandwidth = 1.0/slowness;
  1519. }
  1520. else
  1521. {
  1522. /* convention */
  1523. bandwidth = 0.0;
  1524. }
  1525. if (dst)
  1526. fputc('\t', f);
  1527. _starpu_write_double(f, "%e", bandwidth);
  1528. }
  1529. fprintf(f, "\n");
  1530. }
  1531. if (locked)
  1532. _starpu_fwrunlock(f);
  1533. fclose(f);
  1534. }
  1535. #endif /* STARPU_SIMGRID */
  1536. void starpu_bus_print_filenames(FILE *output)
  1537. {
  1538. char bandwidth_path[PATH_LENGTH];
  1539. char affinity_path[PATH_LENGTH];
  1540. char latency_path[PATH_LENGTH];
  1541. get_bandwidth_path(bandwidth_path, sizeof(bandwidth_path));
  1542. get_affinity_path(affinity_path, sizeof(affinity_path));
  1543. get_latency_path(latency_path, sizeof(latency_path));
  1544. fprintf(output, "bandwidth: <%s>\n", bandwidth_path);
  1545. fprintf(output, " affinity: <%s>\n", affinity_path);
  1546. fprintf(output, " latency: <%s>\n", latency_path);
  1547. }
  1548. void starpu_bus_print_bandwidth(FILE *f)
  1549. {
  1550. unsigned src, dst, maxnode = starpu_memory_nodes_get_count();
  1551. fprintf(f, "from/to\t");
  1552. for (dst = 0; dst < maxnode; dst++)
  1553. {
  1554. char name[128];
  1555. starpu_memory_node_get_name(dst, name, sizeof(name));
  1556. fprintf(f, "%s\t", name);
  1557. }
  1558. fprintf(f, "\n");
  1559. for (src = 0; src < maxnode; src++)
  1560. {
  1561. char name[128];
  1562. starpu_memory_node_get_name(src, name, sizeof(name));
  1563. fprintf(f, "%s\t", name);
  1564. for (dst = 0; dst < maxnode; dst++)
  1565. fprintf(f, "%.0f\t", bandwidth_matrix[src][dst]);
  1566. fprintf(f, "\n");
  1567. }
  1568. fprintf(f, "\n");
  1569. for (src = 0; src < maxnode; src++)
  1570. {
  1571. char name[128];
  1572. starpu_memory_node_get_name(src, name, sizeof(name));
  1573. fprintf(f, "%s\t", name);
  1574. for (dst = 0; dst < maxnode; dst++)
  1575. fprintf(f, "%.0f\t", latency_matrix[src][dst]);
  1576. fprintf(f, "\n");
  1577. }
  1578. #if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
  1579. if (ncuda != 0 || nopencl != 0)
  1580. fprintf(f, "\nGPU\tNUMA in preference order (logical index), host-to-device, device-to-host\n");
  1581. for (src = 0; src < ncuda + nopencl; src++)
  1582. {
  1583. struct dev_timing *timing;
  1584. struct _starpu_machine_config * config = _starpu_get_machine_config();
  1585. unsigned config_nnumas = _starpu_topology_get_nnumanodes(config);
  1586. unsigned numa;
  1587. #ifdef STARPU_USE_CUDA
  1588. if (src < ncuda)
  1589. {
  1590. fprintf(f, "CUDA_%u\t", src);
  1591. for (numa = 0; numa < config_nnumas; numa++)
  1592. {
  1593. timing = &cudadev_timing_per_numa[src*STARPU_MAXNUMANODES+numa];
  1594. if (timing->timing_htod)
  1595. fprintf(f, "%2d %.0f %.0f\t", timing->numa_id, 1/timing->timing_htod, 1/timing->timing_dtoh);
  1596. else
  1597. fprintf(f, "%2u\t", cuda_affinity_matrix[src][numa]);
  1598. }
  1599. }
  1600. #ifdef STARPU_USE_OPENCL
  1601. else
  1602. #endif
  1603. #endif
  1604. #ifdef STARPU_USE_OPENCL
  1605. {
  1606. fprintf(f, "OpenCL%u\t", src-ncuda);
  1607. for (numa = 0; numa < config_nnumas; numa++)
  1608. {
  1609. timing = &opencldev_timing_per_numa[(src-ncuda)*STARPU_MAXNUMANODES+numa];
  1610. if (timing->timing_htod)
  1611. fprintf(f, "%2d %.0f %.0f\t", timing->numa_id, 1/timing->timing_htod, 1/timing->timing_dtoh);
  1612. else
  1613. fprintf(f, "%2u\t", opencl_affinity_matrix[src][numa]);
  1614. }
  1615. }
  1616. #endif
  1617. fprintf(f, "\n");
  1618. }
  1619. #endif
  1620. }
  1621. static void generate_bus_bandwidth_file(void)
  1622. {
  1623. if (!was_benchmarked)
  1624. benchmark_all_gpu_devices();
  1625. #ifdef STARPU_USE_MPI_MASTER_SLAVE
  1626. /* Slaves don't write files */
  1627. if (!_starpu_mpi_common_is_src_node())
  1628. return;
  1629. #endif
  1630. #ifndef STARPU_SIMGRID
  1631. write_bus_bandwidth_file_content();
  1632. #endif
  1633. }
  1634. static void load_bus_bandwidth_file(void)
  1635. {
  1636. int res;
  1637. char path[PATH_LENGTH];
  1638. get_bandwidth_path(path, sizeof(path));
  1639. res = access(path, F_OK);
  1640. if (res || !load_bus_bandwidth_file_content())
  1641. {
  1642. /* File does not exist yet or is bogus */
  1643. generate_bus_bandwidth_file();
  1644. res = load_bus_bandwidth_file_content();
  1645. STARPU_ASSERT(res);
  1646. }
  1647. }
  1648. #ifndef STARPU_SIMGRID
  1649. /*
  1650. * Config
  1651. */
  1652. static void get_config_path(char *path, size_t maxlen)
  1653. {
  1654. get_bus_path("config", path, maxlen);
  1655. }
  1656. #if defined(STARPU_USE_MPI_MASTER_SLAVE)
  1657. /* check if the master or one slave has to recalibrate */
  1658. static int mpi_check_recalibrate(int my_recalibrate)
  1659. {
  1660. int nb_mpi = _starpu_mpi_src_get_device_count() + 1;
  1661. int mpi_recalibrate[nb_mpi];
  1662. int i;
  1663. MPI_Allgather(&my_recalibrate, 1, MPI_INT, mpi_recalibrate, 1, MPI_INT, MPI_COMM_WORLD);
  1664. for (i = 0; i < nb_mpi; i++)
  1665. {
  1666. if (mpi_recalibrate[i])
  1667. {
  1668. return 1;
  1669. }
  1670. }
  1671. return 0;
  1672. }
  1673. #endif
  1674. static void compare_value_and_recalibrate(char * msg, unsigned val_file, unsigned val_detected)
  1675. {
  1676. int recalibrate = 0;
  1677. if (val_file != val_detected)
  1678. recalibrate = 1;
  1679. #ifdef STARPU_USE_MPI_MASTER_SLAVE
  1680. //Send to each other to know if we had to recalibrate because someone cannot have the correct value in the config file
  1681. recalibrate = mpi_check_recalibrate(recalibrate);
  1682. #endif
  1683. if (recalibrate)
  1684. {
  1685. #ifdef STARPU_USE_MPI_MASTER_SLAVE
  1686. /* Only the master prints the message */
  1687. if (_starpu_mpi_common_is_src_node())
  1688. #endif
  1689. _STARPU_DISP("Current configuration does not match the bus performance model (%s: (stored) %d != (current) %d), recalibrating...\n", msg, val_file, val_detected);
  1690. _starpu_bus_force_sampling();
  1691. #ifdef STARPU_USE_MPI_MASTER_SLAVE
  1692. if (_starpu_mpi_common_is_src_node())
  1693. #endif
  1694. _STARPU_DISP("... done\n");
  1695. }
  1696. }
  1697. static void check_bus_config_file(void)
  1698. {
  1699. int res;
  1700. char path[PATH_LENGTH];
  1701. struct _starpu_machine_config *config = _starpu_get_machine_config();
  1702. int recalibrate = 0;
  1703. get_config_path(path, sizeof(path));
  1704. res = access(path, F_OK);
  1705. if (res || config->conf.bus_calibrate > 0)
  1706. recalibrate = 1;
  1707. #if defined(STARPU_USE_MPI_MASTER_SLAVE)
  1708. //Send to each other to know if we had to recalibrate because someone cannot have the config file
  1709. recalibrate = mpi_check_recalibrate(recalibrate);
  1710. #endif
  1711. if (recalibrate)
  1712. {
  1713. if (res)
  1714. _STARPU_DISP("No performance model for the bus, calibrating...\n");
  1715. _starpu_bus_force_sampling();
  1716. if (res)
  1717. _STARPU_DISP("... done\n");
  1718. }
  1719. else
  1720. {
  1721. FILE *f;
  1722. int ret;
  1723. unsigned read_cuda = -1, read_opencl = -1, read_mpi_ms = -1;
  1724. unsigned read_cpus = -1, read_numa = -1;
  1725. int locked;
  1726. // Loading configuration from file
  1727. f = fopen(path, "r");
  1728. STARPU_ASSERT_MSG(f, "Error when reading from file '%s'", path);
  1729. locked = _starpu_frdlock(f) == 0;
  1730. _starpu_drop_comments(f);
  1731. ret = fscanf(f, "%u\t", &read_cpus);
  1732. STARPU_ASSERT_MSG(ret == 1, "Error when reading from file '%s'", path);
  1733. _starpu_drop_comments(f);
  1734. ret = fscanf(f, "%u\t", &read_numa);
  1735. STARPU_ASSERT_MSG(ret == 1, "Error when reading from file '%s'", path);
  1736. _starpu_drop_comments(f);
  1737. ret = fscanf(f, "%u\t", &read_cuda);
  1738. STARPU_ASSERT_MSG(ret == 1, "Error when reading from file '%s'", path);
  1739. _starpu_drop_comments(f);
  1740. ret = fscanf(f, "%u\t", &read_opencl);
  1741. STARPU_ASSERT_MSG(ret == 1, "Error when reading from file '%s'", path);
  1742. _starpu_drop_comments(f);
  1743. ret = fscanf(f, "%u\t", &read_mpi_ms);
  1744. if (ret == 0)
  1745. read_mpi_ms = 0;
  1746. _starpu_drop_comments(f);
  1747. if (locked)
  1748. _starpu_frdunlock(f);
  1749. fclose(f);
  1750. // Loading current configuration
  1751. ncpus = _starpu_topology_get_nhwcpu(config);
  1752. nnumas = _starpu_topology_get_nnumanodes(config);
  1753. #ifdef STARPU_USE_CUDA
  1754. ncuda = _starpu_get_cuda_device_count();
  1755. #endif
  1756. #ifdef STARPU_USE_OPENCL
  1757. nopencl = _starpu_opencl_get_device_count();
  1758. #endif
  1759. #ifdef STARPU_USE_MPI_MASTER_SLAVE
  1760. nmpi_ms = _starpu_mpi_src_get_device_count();
  1761. #endif /* STARPU_USE_MPI_MASTER_SLAVE */
  1762. // Checking if both configurations match
  1763. compare_value_and_recalibrate("CPUS", read_cpus, ncpus);
  1764. compare_value_and_recalibrate("NUMA", read_numa, nnumas);
  1765. compare_value_and_recalibrate("CUDA", read_cuda, ncuda);
  1766. compare_value_and_recalibrate("OpenCL", read_opencl, nopencl);
  1767. compare_value_and_recalibrate("MPI Master-Slave", read_mpi_ms, nmpi_ms);
  1768. }
  1769. }
  1770. static void write_bus_config_file_content(void)
  1771. {
  1772. FILE *f;
  1773. char path[PATH_LENGTH];
  1774. int locked;
  1775. STARPU_ASSERT(was_benchmarked);
  1776. get_config_path(path, sizeof(path));
  1777. _STARPU_DEBUG("writing config to %s\n", path);
  1778. f = fopen(path, "a+");
  1779. STARPU_ASSERT_MSG(f, "Error when opening file (writing) '%s'", path);
  1780. locked = _starpu_fwrlock(f) == 0;
  1781. fseek(f, 0, SEEK_SET);
  1782. _starpu_fftruncate(f, 0);
  1783. fprintf(f, "# Current configuration\n");
  1784. fprintf(f, "%u # Number of CPUs\n", ncpus);
  1785. fprintf(f, "%u # Number of NUMA nodes\n", nnumas);
  1786. fprintf(f, "%u # Number of CUDA devices\n", ncuda);
  1787. fprintf(f, "%u # Number of OpenCL devices\n", nopencl);
  1788. fprintf(f, "%u # Number of MPI devices\n", nmpi_ms);
  1789. if (locked)
  1790. _starpu_fwrunlock(f);
  1791. fclose(f);
  1792. }
  1793. static void generate_bus_config_file(void)
  1794. {
  1795. if (!was_benchmarked)
  1796. benchmark_all_gpu_devices();
  1797. #ifdef STARPU_USE_MPI_MASTER_SLAVE
  1798. /* Slaves don't write files */
  1799. if (!_starpu_mpi_common_is_src_node())
  1800. return;
  1801. #endif
  1802. write_bus_config_file_content();
  1803. }
  1804. #endif /* !SIMGRID */
  1805. void _starpu_simgrid_get_platform_path(int version, char *path, size_t maxlen)
  1806. {
  1807. if (version == 3)
  1808. get_bus_path("platform.xml", path, maxlen);
  1809. else
  1810. get_bus_path("platform.v4.xml", path, maxlen);
  1811. }
  1812. #ifndef STARPU_SIMGRID
  1813. /*
  1814. * Compute the precise PCI tree bandwidth and link shares
  1815. *
  1816. * We only have measurements from one leaf to another. We assume that the
  1817. * available bandwidth is greater at lower levels, and thus measurements from
  1818. * increasingly far GPUs provide the PCI bridges bandwidths at each level.
  1819. *
  1820. * The bandwidth of a PCI bridge is thus computed as the maximum of the speed
  1821. * of the various transfers that we have achieved through it. We thus browse
  1822. * the PCI tree three times:
  1823. *
  1824. * - first through all CUDA-CUDA possible transfers to compute the maximum
  1825. * measured bandwidth on each PCI link and hub used for that.
  1826. * - then through the whole tree to emit links for each PCI link and hub.
  1827. * - then through all CUDA-CUDA possible transfers again to emit routes.
  1828. */
  1829. #if defined(STARPU_USE_CUDA) && defined(HAVE_DECL_HWLOC_CUDA_GET_DEVICE_OSDEV_BY_INDEX) && HAVE_DECL_HWLOC_CUDA_GET_DEVICE_OSDEV_BY_INDEX && defined(STARPU_HAVE_CUDA_MEMCPY_PEER)
  1830. /* Records, for each PCI link and hub, the maximum bandwidth seen through it */
  1831. struct pci_userdata
  1832. {
  1833. /* Uplink max measurement */
  1834. double bw_up;
  1835. double bw_down;
  1836. /* Hub max measurement */
  1837. double bw;
  1838. };
  1839. /* Allocate a pci_userdata structure for the given object */
  1840. static void allocate_userdata(hwloc_obj_t obj)
  1841. {
  1842. struct pci_userdata *data;
  1843. if (obj->userdata)
  1844. return;
  1845. _STARPU_MALLOC(obj->userdata, sizeof(*data));
  1846. data = obj->userdata;
  1847. data->bw_up = 0.0;
  1848. data->bw_down = 0.0;
  1849. data->bw = 0.0;
  1850. }
  1851. /* Update the maximum bandwidth seen going to upstream */
  1852. static void update_bandwidth_up(hwloc_obj_t obj, double bandwidth)
  1853. {
  1854. struct pci_userdata *data;
  1855. if (obj->type != HWLOC_OBJ_BRIDGE && obj->type != HWLOC_OBJ_PCI_DEVICE)
  1856. return;
  1857. allocate_userdata(obj);
  1858. data = obj->userdata;
  1859. if (data->bw_up < bandwidth)
  1860. data->bw_up = bandwidth;
  1861. }
  1862. /* Update the maximum bandwidth seen going from upstream */
  1863. static void update_bandwidth_down(hwloc_obj_t obj, double bandwidth)
  1864. {
  1865. struct pci_userdata *data;
  1866. if (obj->type != HWLOC_OBJ_BRIDGE && obj->type != HWLOC_OBJ_PCI_DEVICE)
  1867. return;
  1868. allocate_userdata(obj);
  1869. data = obj->userdata;
  1870. if (data->bw_down < bandwidth)
  1871. data->bw_down = bandwidth;
  1872. }
  1873. /* Update the maximum bandwidth seen going through this Hub */
  1874. static void update_bandwidth_through(hwloc_obj_t obj, double bandwidth)
  1875. {
  1876. struct pci_userdata *data;
  1877. allocate_userdata(obj);
  1878. data = obj->userdata;
  1879. if (data->bw < bandwidth)
  1880. data->bw = bandwidth;
  1881. }
  1882. /* find_* functions perform the first step: computing maximum bandwidths */
  1883. /* Our trafic had to go through the host, go back from target up to the host,
  1884. * updating uplink downstream bandwidth along the way */
  1885. static void find_platform_backward_path(hwloc_obj_t obj, double bandwidth)
  1886. {
  1887. if (!obj)
  1888. /* Oops, we should have seen a host bridge. Well, too bad. */
  1889. return;
  1890. /* Update uplink bandwidth of PCI Hub */
  1891. update_bandwidth_down(obj, bandwidth);
  1892. /* Update internal bandwidth of PCI Hub */
  1893. update_bandwidth_through(obj, bandwidth);
  1894. if (obj->type == HWLOC_OBJ_BRIDGE && obj->attr->bridge.upstream_type == HWLOC_OBJ_BRIDGE_HOST)
  1895. /* Finished */
  1896. return;
  1897. /* Continue up */
  1898. find_platform_backward_path(obj->parent, bandwidth);
  1899. }
  1900. /* Same, but update uplink upstream bandwidth */
  1901. static void find_platform_forward_path(hwloc_obj_t obj, double bandwidth)
  1902. {
  1903. if (!obj)
  1904. /* Oops, we should have seen a host bridge. Well, too bad. */
  1905. return;
  1906. /* Update uplink bandwidth of PCI Hub */
  1907. update_bandwidth_up(obj, bandwidth);
  1908. /* Update internal bandwidth of PCI Hub */
  1909. update_bandwidth_through(obj, bandwidth);
  1910. if (obj->type == HWLOC_OBJ_BRIDGE && obj->attr->bridge.upstream_type == HWLOC_OBJ_BRIDGE_HOST)
  1911. /* Finished */
  1912. return;
  1913. /* Continue up */
  1914. find_platform_forward_path(obj->parent, bandwidth);
  1915. }
  1916. /* Find the path from obj1 through parent down to obj2 (without ever going up),
  1917. * and update the maximum bandwidth along the path */
  1918. static int find_platform_path_down(hwloc_obj_t parent, hwloc_obj_t obj1, hwloc_obj_t obj2, double bandwidth)
  1919. {
  1920. unsigned i;
  1921. /* Base case, path is empty */
  1922. if (parent == obj2)
  1923. return 1;
  1924. /* Try to go down from parent */
  1925. for (i = 0; i < parent->arity; i++)
  1926. if (parent->children[i] != obj1 && find_platform_path_down(parent->children[i], NULL, obj2, bandwidth))
  1927. {
  1928. /* Found it down there, update bandwidth of parent */
  1929. update_bandwidth_down(parent->children[i], bandwidth);
  1930. update_bandwidth_through(parent, bandwidth);
  1931. return 1;
  1932. }
  1933. #if HWLOC_API_VERSION >= 0x00020000
  1934. hwloc_obj_t io;
  1935. for (io = parent->io_first_child; io; io = io->next_sibling)
  1936. if (io != obj1 && find_platform_path_down(io, NULL, obj2, bandwidth))
  1937. {
  1938. /* Found it down there, update bandwidth of parent */
  1939. update_bandwidth_down(io, bandwidth);
  1940. update_bandwidth_through(parent, bandwidth);
  1941. return 1;
  1942. }
  1943. #endif
  1944. return 0;
  1945. }
  1946. /* Find the path from obj1 to obj2, and update the maximum bandwidth along the
  1947. * path */
  1948. static int find_platform_path_up(hwloc_obj_t obj1, hwloc_obj_t obj2, double bandwidth)
  1949. {
  1950. int ret;
  1951. hwloc_obj_t parent = obj1->parent;
  1952. if (!parent)
  1953. {
  1954. /* Oops, we should have seen a host bridge. Act as if we had seen it. */
  1955. find_platform_backward_path(obj2, bandwidth);
  1956. return 1;
  1957. }
  1958. if (find_platform_path_down(parent, obj1, obj2, bandwidth))
  1959. /* obj2 was a mere (sub)child of our parent */
  1960. return 1;
  1961. /* obj2 is not a (sub)child of our parent, we have to go up through the parent */
  1962. if (parent->type == HWLOC_OBJ_BRIDGE && parent->attr->bridge.upstream_type == HWLOC_OBJ_BRIDGE_HOST)
  1963. {
  1964. /* We have to go up to the Host, so obj2 is not in the same PCI
  1965. * tree, so we're for for obj1 to Host, and just find the path
  1966. * from obj2 to Host too.
  1967. */
  1968. find_platform_backward_path(obj2, bandwidth);
  1969. update_bandwidth_up(parent, bandwidth);
  1970. update_bandwidth_through(parent, bandwidth);
  1971. return 1;
  1972. }
  1973. /* Not at host yet, just go up */
  1974. ret = find_platform_path_up(parent, obj2, bandwidth);
  1975. update_bandwidth_up(parent, bandwidth);
  1976. update_bandwidth_through(parent, bandwidth);
  1977. return ret;
  1978. }
  1979. static hwloc_obj_t get_hwloc_cuda_obj(hwloc_topology_t topology, unsigned devid)
  1980. {
  1981. hwloc_obj_t res;
  1982. struct cudaDeviceProp props;
  1983. cudaError_t cures;
  1984. res = hwloc_cuda_get_device_osdev_by_index(topology, devid);
  1985. if (res)
  1986. return res;
  1987. cures = cudaGetDeviceProperties(&props, devid);
  1988. if (cures == cudaSuccess)
  1989. {
  1990. res = hwloc_get_pcidev_by_busid(topology, props.pciDomainID, props.pciBusID, props.pciDeviceID, 0);
  1991. if (res)
  1992. return res;
  1993. #ifdef STARPU_HAVE_LIBNVIDIA_ML
  1994. nvmlDevice_t nvmldev = _starpu_cuda_get_nvmldev(&props);
  1995. if (nvmldev)
  1996. {
  1997. unsigned int index;
  1998. if (nvmlDeviceGetIndex(nvmldev, &index) == NVML_SUCCESS)
  1999. {
  2000. res = hwloc_nvml_get_device_osdev_by_index(topology, index);
  2001. if (res)
  2002. return res;
  2003. }
  2004. res = hwloc_nvml_get_device_osdev(topology, nvmldev);
  2005. if (res)
  2006. return res;
  2007. }
  2008. #endif
  2009. }
  2010. return NULL;
  2011. }
  2012. /* find the path between cuda i and cuda j, and update the maximum bandwidth along the path */
  2013. static int find_platform_cuda_path(hwloc_topology_t topology, unsigned i, unsigned j, double bandwidth)
  2014. {
  2015. hwloc_obj_t cudai, cudaj;
  2016. cudai = get_hwloc_cuda_obj(topology, i);
  2017. cudaj = get_hwloc_cuda_obj(topology, j);
  2018. if (!cudai || !cudaj)
  2019. return 0;
  2020. return find_platform_path_up(cudai, cudaj, bandwidth);
  2021. }
  2022. /* emit_topology_bandwidths performs the second step: emitting link names */
  2023. /* Emit the link name of the object */
  2024. static void emit_pci_hub(FILE *f, hwloc_obj_t obj)
  2025. {
  2026. STARPU_ASSERT(obj->type == HWLOC_OBJ_BRIDGE);
  2027. fprintf(f, "PCI:%04x:[%02x-%02x]", obj->attr->bridge.downstream.pci.domain, obj->attr->bridge.downstream.pci.secondary_bus, obj->attr->bridge.downstream.pci.subordinate_bus);
  2028. }
  2029. static void emit_pci_dev(FILE *f, struct hwloc_pcidev_attr_s *pcidev)
  2030. {
  2031. fprintf(f, "PCI:%04x:%02x:%02x.%1x", pcidev->domain, pcidev->bus, pcidev->dev, pcidev->func);
  2032. }
  2033. /* Emit the links of the object */
  2034. static void emit_topology_bandwidths(FILE *f, hwloc_obj_t obj, const char *Bps, const char *s)
  2035. {
  2036. unsigned i;
  2037. if (obj->userdata)
  2038. {
  2039. struct pci_userdata *data = obj->userdata;
  2040. if (obj->type == HWLOC_OBJ_BRIDGE)
  2041. {
  2042. /* Uplink */
  2043. fprintf(f, " <link id=\"");
  2044. emit_pci_hub(f, obj);
  2045. fprintf(f, " up\" bandwidth=\"%f%s\" latency=\"0.000000%s\"/>\n", data->bw_up, Bps, s);
  2046. fprintf(f, " <link id=\"");
  2047. emit_pci_hub(f, obj);
  2048. fprintf(f, " down\" bandwidth=\"%f%s\" latency=\"0.000000%s\"/>\n", data->bw_down, Bps, s);
  2049. /* PCI Switches are assumed to have infinite internal bandwidth */
  2050. if (!obj->name || !strstr(obj->name, "Switch"))
  2051. {
  2052. /* We assume that PCI Hubs have double bandwidth in
  2053. * order to support full duplex but not more */
  2054. fprintf(f, " <link id=\"");
  2055. emit_pci_hub(f, obj);
  2056. fprintf(f, " through\" bandwidth=\"%f%s\" latency=\"0.000000%s\"/>\n", data->bw * 2, Bps, s);
  2057. }
  2058. }
  2059. else if (obj->type == HWLOC_OBJ_PCI_DEVICE)
  2060. {
  2061. fprintf(f, " <link id=\"");
  2062. emit_pci_dev(f, &obj->attr->pcidev);
  2063. fprintf(f, " up\" bandwidth=\"%f%s\" latency=\"0.000000%s\"/>\n", data->bw_up, Bps, s);
  2064. fprintf(f, " <link id=\"");
  2065. emit_pci_dev(f, &obj->attr->pcidev);
  2066. fprintf(f, " down\" bandwidth=\"%f%s\" latency=\"0.000000%s\"/>\n", data->bw_down, Bps, s);
  2067. }
  2068. }
  2069. for (i = 0; i < obj->arity; i++)
  2070. emit_topology_bandwidths(f, obj->children[i], Bps, s);
  2071. #if HWLOC_API_VERSION >= 0x00020000
  2072. hwloc_obj_t io;
  2073. for (io = obj->io_first_child; io; io = io->next_sibling)
  2074. emit_topology_bandwidths(f, io, Bps, s);
  2075. #endif
  2076. }
  2077. /* emit_pci_link_* functions perform the third step: emitting the routes */
  2078. static void emit_pci_link(FILE *f, hwloc_obj_t obj, const char *suffix)
  2079. {
  2080. if (obj->type == HWLOC_OBJ_BRIDGE)
  2081. {
  2082. fprintf(f, " <link_ctn id=\"");
  2083. emit_pci_hub(f, obj);
  2084. fprintf(f, " %s\"/>\n", suffix);
  2085. }
  2086. else if (obj->type == HWLOC_OBJ_PCI_DEVICE)
  2087. {
  2088. fprintf(f, " <link_ctn id=\"");
  2089. emit_pci_dev(f, &obj->attr->pcidev);
  2090. fprintf(f, " %s\"/>\n", suffix);
  2091. }
  2092. }
  2093. /* Go to upstream */
  2094. static void emit_pci_link_up(FILE *f, hwloc_obj_t obj)
  2095. {
  2096. emit_pci_link(f, obj, "up");
  2097. }
  2098. /* Go from upstream */
  2099. static void emit_pci_link_down(FILE *f, hwloc_obj_t obj)
  2100. {
  2101. emit_pci_link(f, obj, "down");
  2102. }
  2103. /* Go through PCI hub */
  2104. static void emit_pci_link_through(FILE *f, hwloc_obj_t obj)
  2105. {
  2106. /* We don't care about trafic going through PCI switches */
  2107. if (obj->type == HWLOC_OBJ_BRIDGE)
  2108. {
  2109. if (!obj->name || !strstr(obj->name, "Switch"))
  2110. emit_pci_link(f, obj, "through");
  2111. else
  2112. {
  2113. fprintf(f, " <!-- Switch ");
  2114. emit_pci_hub(f, obj);
  2115. fprintf(f, " through -->\n");
  2116. }
  2117. }
  2118. }
  2119. /* Our trafic has to go through the host, go back from target up to the host,
  2120. * using uplink downstream along the way */
  2121. static void emit_platform_backward_path(FILE *f, hwloc_obj_t obj)
  2122. {
  2123. if (!obj)
  2124. /* Oops, we should have seen a host bridge. Well, too bad. */
  2125. return;
  2126. /* Go through PCI Hub */
  2127. emit_pci_link_through(f, obj);
  2128. /* Go through uplink */
  2129. emit_pci_link_down(f, obj);
  2130. if (obj->type == HWLOC_OBJ_BRIDGE && obj->attr->bridge.upstream_type == HWLOC_OBJ_BRIDGE_HOST)
  2131. {
  2132. /* Finished, go through host */
  2133. fprintf(f, " <link_ctn id=\"Host\"/>\n");
  2134. return;
  2135. }
  2136. /* Continue up */
  2137. emit_platform_backward_path(f, obj->parent);
  2138. }
  2139. /* Same, but use upstream link */
  2140. static void emit_platform_forward_path(FILE *f, hwloc_obj_t obj)
  2141. {
  2142. if (!obj)
  2143. /* Oops, we should have seen a host bridge. Well, too bad. */
  2144. return;
  2145. /* Go through PCI Hub */
  2146. emit_pci_link_through(f, obj);
  2147. /* Go through uplink */
  2148. emit_pci_link_up(f, obj);
  2149. if (obj->type == HWLOC_OBJ_BRIDGE && obj->attr->bridge.upstream_type == HWLOC_OBJ_BRIDGE_HOST)
  2150. {
  2151. /* Finished, go through host */
  2152. fprintf(f, " <link_ctn id=\"Host\"/>\n");
  2153. return;
  2154. }
  2155. /* Continue up */
  2156. emit_platform_forward_path(f, obj->parent);
  2157. }
  2158. /* Find the path from obj1 through parent down to obj2 (without ever going up),
  2159. * and use the links along the path */
  2160. static int emit_platform_path_down(FILE *f, hwloc_obj_t parent, hwloc_obj_t obj1, hwloc_obj_t obj2)
  2161. {
  2162. unsigned i;
  2163. /* Base case, path is empty */
  2164. if (parent == obj2)
  2165. return 1;
  2166. /* Try to go down from parent */
  2167. for (i = 0; i < parent->arity; i++)
  2168. if (parent->children[i] != obj1 && emit_platform_path_down(f, parent->children[i], NULL, obj2))
  2169. {
  2170. /* Found it down there, path goes through this hub */
  2171. emit_pci_link_down(f, parent->children[i]);
  2172. emit_pci_link_through(f, parent);
  2173. return 1;
  2174. }
  2175. #if HWLOC_API_VERSION >= 0x00020000
  2176. hwloc_obj_t io;
  2177. for (io = parent->io_first_child; io; io = io->next_sibling)
  2178. if (io != obj1 && emit_platform_path_down(f, io, NULL, obj2))
  2179. {
  2180. /* Found it down there, path goes through this hub */
  2181. emit_pci_link_down(f, io);
  2182. emit_pci_link_through(f, parent);
  2183. return 1;
  2184. }
  2185. #endif
  2186. return 0;
  2187. }
  2188. /* Find the path from obj1 to obj2, and use the links along the path */
  2189. static int emit_platform_path_up(FILE *f, hwloc_obj_t obj1, hwloc_obj_t obj2)
  2190. {
  2191. int ret;
  2192. hwloc_obj_t parent = obj1->parent;
  2193. if (!parent)
  2194. {
  2195. /* Oops, we should have seen a host bridge. Act as if we had seen it. */
  2196. emit_platform_backward_path(f, obj2);
  2197. return 1;
  2198. }
  2199. if (emit_platform_path_down(f, parent, obj1, obj2))
  2200. /* obj2 was a mere (sub)child of our parent */
  2201. return 1;
  2202. /* obj2 is not a (sub)child of our parent, we have to go up through the parent */
  2203. if (parent->type == HWLOC_OBJ_BRIDGE && parent->attr->bridge.upstream_type == HWLOC_OBJ_BRIDGE_HOST)
  2204. {
  2205. /* We have to go up to the Host, so obj2 is not in the same PCI
  2206. * tree, so we're for for obj1 to Host, and just find the path
  2207. * from obj2 to Host too.
  2208. */
  2209. emit_platform_backward_path(f, obj2);
  2210. fprintf(f, " <link_ctn id=\"Host\"/>\n");
  2211. emit_pci_link_up(f, parent);
  2212. emit_pci_link_through(f, parent);
  2213. return 1;
  2214. }
  2215. /* Not at host yet, just go up */
  2216. ret = emit_platform_path_up(f, parent, obj2);
  2217. emit_pci_link_up(f, parent);
  2218. emit_pci_link_through(f, parent);
  2219. return ret;
  2220. }
  2221. /* Clean our mess in the topology before destroying it */
  2222. static void clean_topology(hwloc_obj_t obj)
  2223. {
  2224. unsigned i;
  2225. if (obj->userdata)
  2226. {
  2227. free(obj->userdata);
  2228. obj->userdata = NULL;
  2229. }
  2230. for (i = 0; i < obj->arity; i++)
  2231. clean_topology(obj->children[i]);
  2232. #if HWLOC_API_VERSION >= 0x00020000
  2233. hwloc_obj_t io;
  2234. for (io = obj->io_first_child; io; io = io->next_sibling)
  2235. clean_topology(io);
  2236. #endif
  2237. }
  2238. #endif
  2239. static void write_bus_platform_file_content(int version)
  2240. {
  2241. FILE *f;
  2242. char path[PATH_LENGTH];
  2243. unsigned i;
  2244. const char *speed, *flops, *Bps, *s;
  2245. char dash;
  2246. int locked;
  2247. if (version == 3)
  2248. {
  2249. speed = "power";
  2250. flops = "";
  2251. Bps = "";
  2252. s = "";
  2253. dash = '_';
  2254. }
  2255. else
  2256. {
  2257. speed = "speed";
  2258. flops = "f";
  2259. Bps = "Bps";
  2260. s = "s";
  2261. dash = '-';
  2262. }
  2263. STARPU_ASSERT(was_benchmarked);
  2264. _starpu_simgrid_get_platform_path(version, path, sizeof(path));
  2265. _STARPU_DEBUG("writing platform to %s\n", path);
  2266. f = fopen(path, "a+");
  2267. if (!f)
  2268. {
  2269. perror("fopen write_bus_platform_file_content");
  2270. _STARPU_DISP("path '%s'\n", path);
  2271. fflush(stderr);
  2272. STARPU_ABORT();
  2273. }
  2274. locked = _starpu_fwrlock(f) == 0;
  2275. fseek(f, 0, SEEK_SET);
  2276. _starpu_fftruncate(f, 0);
  2277. fprintf(f,
  2278. "<?xml version='1.0'?>\n"
  2279. "<!DOCTYPE platform SYSTEM '%s'>\n"
  2280. " <platform version=\"%d\">\n"
  2281. " <config id=\"General\">\n"
  2282. " <prop id=\"network/TCP%cgamma\" value=\"-1\"></prop>\n"
  2283. " <prop id=\"network/latency%cfactor\" value=\"1\"></prop>\n"
  2284. " <prop id=\"network/bandwidth%cfactor\" value=\"1\"></prop>\n"
  2285. " <prop id=\"network/crosstraffic\" value=\"0\"></prop>\n"
  2286. " <prop id=\"network/weight%cS\" value=\"0.0\"></prop>\n"
  2287. " </config>\n"
  2288. " <AS id=\"AS0\" routing=\"Full\">\n"
  2289. " <host id=\"MAIN\" %s=\"1%s\"/>\n",
  2290. version == 3
  2291. ? "http://simgrid.gforge.inria.fr/simgrid.dtd"
  2292. : "http://simgrid.gforge.inria.fr/simgrid/simgrid.dtd",
  2293. version, dash, dash, dash, dash, speed, flops);
  2294. for (i = 0; i < ncpus; i++)
  2295. /* TODO: host memory for out-of-core simulation */
  2296. fprintf(f, " <host id=\"CPU%u\" %s=\"2000000000%s\"/>\n", i, speed, flops);
  2297. for (i = 0; i < ncuda; i++)
  2298. {
  2299. fprintf(f, " <host id=\"CUDA%u\" %s=\"2000000000%s\">\n", i, speed, flops);
  2300. fprintf(f, " <prop id=\"memsize\" value=\"%llu\"/>\n", (unsigned long long) cuda_size[i]);
  2301. #ifdef STARPU_HAVE_CUDA_MEMCPY_PEER
  2302. fprintf(f, " <prop id=\"memcpy_peer\" value=\"1\"/>\n");
  2303. #endif
  2304. /* TODO: record cudadev_direct instead of assuming it's NUMA nodes */
  2305. fprintf(f, " </host>\n");
  2306. }
  2307. for (i = 0; i < nopencl; i++)
  2308. {
  2309. fprintf(f, " <host id=\"OpenCL%u\" %s=\"2000000000%s\">\n", i, speed, flops);
  2310. fprintf(f, " <prop id=\"memsize\" value=\"%llu\"/>\n", (unsigned long long) opencl_size[i]);
  2311. fprintf(f, " </host>\n");
  2312. }
  2313. fprintf(f, "\n <host id=\"RAM\" %s=\"1%s\"/>\n", speed, flops);
  2314. /*
  2315. * Compute maximum bandwidth, taken as host bandwidth
  2316. */
  2317. double max_bandwidth = 0;
  2318. #if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
  2319. unsigned numa;
  2320. #endif
  2321. #ifdef STARPU_USE_CUDA
  2322. for (i = 0; i < ncuda; i++)
  2323. {
  2324. for (numa = 0; numa < nnumas; numa++)
  2325. {
  2326. double down_bw = 1.0 / cudadev_timing_per_numa[i*STARPU_MAXNUMANODES+numa].timing_dtoh;
  2327. double up_bw = 1.0 / cudadev_timing_per_numa[i*STARPU_MAXNUMANODES+numa].timing_htod;
  2328. if (max_bandwidth < down_bw)
  2329. max_bandwidth = down_bw;
  2330. if (max_bandwidth < up_bw)
  2331. max_bandwidth = up_bw;
  2332. }
  2333. }
  2334. #endif
  2335. #ifdef STARPU_USE_OPENCL
  2336. for (i = 0; i < nopencl; i++)
  2337. {
  2338. for (numa = 0; numa < nnumas; numa++)
  2339. {
  2340. double down_bw = 1.0 / opencldev_timing_per_numa[i*STARPU_MAXNUMANODES+numa].timing_dtoh;
  2341. double up_bw = 1.0 / opencldev_timing_per_numa[i*STARPU_MAXNUMANODES+numa].timing_htod;
  2342. if (max_bandwidth < down_bw)
  2343. max_bandwidth = down_bw;
  2344. if (max_bandwidth < up_bw)
  2345. max_bandwidth = up_bw;
  2346. }
  2347. }
  2348. #endif
  2349. fprintf(f, "\n <link id=\"Host\" bandwidth=\"%f%s\" latency=\"0.000000%s\"/>\n\n", max_bandwidth*1000000, Bps, s);
  2350. /*
  2351. * OpenCL links
  2352. */
  2353. #ifdef STARPU_USE_OPENCL
  2354. for (i = 0; i < nopencl; i++)
  2355. {
  2356. char i_name[16];
  2357. snprintf(i_name, sizeof(i_name), "OpenCL%u", i);
  2358. fprintf(f, " <link id=\"RAM-%s\" bandwidth=\"%f%s\" latency=\"%f%s\"/>\n",
  2359. i_name,
  2360. 1000000 / search_bus_best_timing(i, "OpenCL", 1), Bps,
  2361. search_bus_best_latency(i, "OpenCL", 1)/1000000., s);
  2362. fprintf(f, " <link id=\"%s-RAM\" bandwidth=\"%f%s\" latency=\"%f%s\"/>\n",
  2363. i_name,
  2364. 1000000 / search_bus_best_timing(i, "OpenCL", 0), Bps,
  2365. search_bus_best_latency(i, "OpenCL", 0)/1000000., s);
  2366. }
  2367. fprintf(f, "\n");
  2368. #endif
  2369. /*
  2370. * CUDA links and routes
  2371. */
  2372. #ifdef STARPU_USE_CUDA
  2373. /* Write RAM/CUDA bandwidths and latencies */
  2374. for (i = 0; i < ncuda; i++)
  2375. {
  2376. char i_name[16];
  2377. snprintf(i_name, sizeof(i_name), "CUDA%u", i);
  2378. fprintf(f, " <link id=\"RAM-%s\" bandwidth=\"%f%s\" latency=\"%f%s\"/>\n",
  2379. i_name,
  2380. 1000000. / search_bus_best_timing(i, "CUDA", 1), Bps,
  2381. search_bus_best_latency(i, "CUDA", 1)/1000000., s);
  2382. fprintf(f, " <link id=\"%s-RAM\" bandwidth=\"%f%s\" latency=\"%f%s\"/>\n",
  2383. i_name,
  2384. 1000000. / search_bus_best_timing(i, "CUDA", 0), Bps,
  2385. search_bus_best_latency(i, "CUDA", 0)/1000000., s);
  2386. }
  2387. fprintf(f, "\n");
  2388. #ifdef STARPU_HAVE_CUDA_MEMCPY_PEER
  2389. /* Write CUDA/CUDA bandwidths and latencies */
  2390. for (i = 0; i < ncuda; i++)
  2391. {
  2392. unsigned j;
  2393. char i_name[16];
  2394. snprintf(i_name, sizeof(i_name), "CUDA%u", i);
  2395. for (j = 0; j < ncuda; j++)
  2396. {
  2397. char j_name[16];
  2398. if (j == i)
  2399. continue;
  2400. snprintf(j_name, sizeof(j_name), "CUDA%u", j);
  2401. fprintf(f, " <link id=\"%s-%s\" bandwidth=\"%f%s\" latency=\"%f%s\"/>\n",
  2402. i_name, j_name,
  2403. 1000000. / cudadev_timing_dtod[i][j], Bps,
  2404. cudadev_latency_dtod[i][j]/1000000., s);
  2405. }
  2406. }
  2407. #endif
  2408. #if defined(HAVE_DECL_HWLOC_CUDA_GET_DEVICE_OSDEV_BY_INDEX) && HAVE_DECL_HWLOC_CUDA_GET_DEVICE_OSDEV_BY_INDEX && defined(STARPU_USE_CUDA) && defined(STARPU_HAVE_CUDA_MEMCPY_PEER)
  2409. #ifdef STARPU_DEVEL
  2410. #warning TODO: use libnvml to get NVLink links, otherwise numbers will be bogusly propagated through PCI topology
  2411. #endif
  2412. /* If we have enough hwloc information, write PCI bandwidths and routes */
  2413. if (!starpu_get_env_number_default("STARPU_PCI_FLAT", 0) && ncuda > 0)
  2414. {
  2415. hwloc_topology_t topology;
  2416. hwloc_topology_init(&topology);
  2417. _starpu_topology_filter(topology);
  2418. hwloc_topology_load(topology);
  2419. char nvlink[ncuda][ncuda];
  2420. char nvlinkhost[ncuda];
  2421. memset(nvlink, 0, sizeof(nvlink));
  2422. memset(nvlinkhost, 0, sizeof(nvlinkhost));
  2423. #ifdef STARPU_HAVE_LIBNVIDIA_ML
  2424. /* First find NVLinks */
  2425. struct cudaDeviceProp props[ncuda];
  2426. for (i = 0; i < ncuda; i++)
  2427. {
  2428. cudaError_t cures = cudaGetDeviceProperties(&props[i], i);
  2429. if (cures != cudaSuccess)
  2430. props[i].name[0] = 0;
  2431. }
  2432. for (i = 0; i < ncuda; i++)
  2433. {
  2434. unsigned j;
  2435. if (!props[i].name[0])
  2436. continue;
  2437. nvmlDevice_t nvmldev;
  2438. nvmldev = _starpu_cuda_get_nvmldev(&props[i]);
  2439. if (!nvmldev)
  2440. continue;
  2441. for (j = 0; j < NVML_NVLINK_MAX_LINKS; j++)
  2442. {
  2443. nvmlEnableState_t active;
  2444. nvmlReturn_t ret;
  2445. nvmlPciInfo_t pci;
  2446. unsigned k;
  2447. ret = nvmlDeviceGetNvLinkState(nvmldev, j, &active);
  2448. if (ret != NVML_SUCCESS)
  2449. continue;
  2450. if (active != NVML_FEATURE_ENABLED)
  2451. continue;
  2452. ret = nvmlDeviceGetNvLinkRemotePciInfo(nvmldev, j, &pci);
  2453. if (ret != NVML_SUCCESS)
  2454. continue;
  2455. hwloc_obj_t obj = hwloc_get_pcidev_by_busid(topology,
  2456. pci.domain, pci.bus, pci.device, 0);
  2457. if (obj && obj->type == HWLOC_OBJ_PCI_DEVICE && (obj->attr->pcidev.class_id >> 8 == 0x06))
  2458. {
  2459. switch (obj->attr->pcidev.vendor_id)
  2460. {
  2461. case 0x1014:
  2462. /* IBM OpenCAPI port, direct CPU-GPU NVLink */
  2463. /* TODO: NUMA affinity */
  2464. nvlinkhost[i] = 1;
  2465. continue;
  2466. case 0x10de:
  2467. /* TODO: NVIDIA NVSwitch */
  2468. continue;
  2469. }
  2470. }
  2471. /* Otherwise, link to another GPU? */
  2472. for (k = i+1; k < ncuda; k++)
  2473. {
  2474. if ((int) pci.domain == props[k].pciDomainID
  2475. && (int) pci.bus == props[k].pciBusID
  2476. && (int) pci.device == props[k].pciDeviceID)
  2477. {
  2478. nvlink[i][k] = 1;
  2479. nvlink[k][i] = 1;
  2480. break;
  2481. }
  2482. }
  2483. }
  2484. }
  2485. #endif
  2486. /* Find paths and record measured bandwidth along the path */
  2487. for (i = 0; i < ncuda; i++)
  2488. {
  2489. unsigned j;
  2490. for (j = 0; j < ncuda; j++)
  2491. if (i != j && !nvlink[i][j] && !nvlinkhost[i] && !nvlinkhost[j])
  2492. if (!find_platform_cuda_path(topology, i, j, 1000000. / cudadev_timing_dtod[i][j]))
  2493. {
  2494. _STARPU_DISP("Warning: could not get CUDA location from hwloc\n");
  2495. clean_topology(hwloc_get_root_obj(topology));
  2496. hwloc_topology_destroy(topology);
  2497. goto flat_cuda;
  2498. }
  2499. /* Record RAM/CUDA bandwidths */
  2500. if (!nvlinkhost[i])
  2501. {
  2502. find_platform_forward_path(get_hwloc_cuda_obj(topology, i), 1000000. / search_bus_best_timing(i, "CUDA", 0));
  2503. find_platform_backward_path(get_hwloc_cuda_obj(topology, i), 1000000. / search_bus_best_timing(i, "CUDA", 1));
  2504. }
  2505. }
  2506. /* Ok, found path in all cases, can emit advanced platform routes */
  2507. fprintf(f, "\n");
  2508. emit_topology_bandwidths(f, hwloc_get_root_obj(topology), Bps, s);
  2509. fprintf(f, "\n");
  2510. for (i = 0; i < ncuda; i++)
  2511. {
  2512. unsigned j;
  2513. for (j = 0; j < ncuda; j++)
  2514. if (i != j)
  2515. {
  2516. fprintf(f, " <route src=\"CUDA%u\" dst=\"CUDA%u\" symmetrical=\"NO\">\n", i, j);
  2517. fprintf(f, " <link_ctn id=\"CUDA%u-CUDA%u\"/>\n", i, j);
  2518. if (!nvlink[i][j])
  2519. {
  2520. if (nvlinkhost[i] && nvlinkhost[j])
  2521. /* TODO: NUMA affinity */
  2522. fprintf(f, " <link_ctn id=\"Host\"/>\n");
  2523. else
  2524. emit_platform_path_up(f,
  2525. get_hwloc_cuda_obj(topology, i),
  2526. get_hwloc_cuda_obj(topology, j));
  2527. }
  2528. fprintf(f, " </route>\n");
  2529. }
  2530. fprintf(f, " <route src=\"CUDA%u\" dst=\"RAM\" symmetrical=\"NO\">\n", i);
  2531. fprintf(f, " <link_ctn id=\"CUDA%u-RAM\"/>\n", i);
  2532. if (nvlinkhost[i])
  2533. /* TODO: NUMA affinity */
  2534. fprintf(f, " <link_ctn id=\"Host\"/>\n");
  2535. else
  2536. emit_platform_forward_path(f, get_hwloc_cuda_obj(topology, i));
  2537. fprintf(f, " </route>\n");
  2538. fprintf(f, " <route src=\"RAM\" dst=\"CUDA%u\" symmetrical=\"NO\">\n", i);
  2539. fprintf(f, " <link_ctn id=\"RAM-CUDA%u\"/>\n", i);
  2540. if (nvlinkhost[i])
  2541. /* TODO: NUMA affinity */
  2542. fprintf(f, " <link_ctn id=\"Host\"/>\n");
  2543. else
  2544. emit_platform_backward_path(f, get_hwloc_cuda_obj(topology, i));
  2545. fprintf(f, " </route>\n");
  2546. }
  2547. clean_topology(hwloc_get_root_obj(topology));
  2548. hwloc_topology_destroy(topology);
  2549. }
  2550. else
  2551. {
  2552. flat_cuda:
  2553. #else
  2554. {
  2555. #endif
  2556. /* If we don't have enough hwloc information, write trivial routes always through host */
  2557. for (i = 0; i < ncuda; i++)
  2558. {
  2559. char i_name[16];
  2560. snprintf(i_name, sizeof(i_name), "CUDA%u", i);
  2561. fprintf(f, " <route src=\"RAM\" dst=\"%s\" symmetrical=\"NO\"><link_ctn id=\"RAM-%s\"/><link_ctn id=\"Host\"/></route>\n", i_name, i_name);
  2562. fprintf(f, " <route src=\"%s\" dst=\"RAM\" symmetrical=\"NO\"><link_ctn id=\"%s-RAM\"/><link_ctn id=\"Host\"/></route>\n", i_name, i_name);
  2563. }
  2564. #ifdef STARPU_HAVE_CUDA_MEMCPY_PEER
  2565. for (i = 0; i < ncuda; i++)
  2566. {
  2567. unsigned j;
  2568. char i_name[16];
  2569. snprintf(i_name, sizeof(i_name), "CUDA%u", i);
  2570. for (j = 0; j < ncuda; j++)
  2571. {
  2572. char j_name[16];
  2573. if (j == i)
  2574. continue;
  2575. snprintf(j_name, sizeof(j_name), "CUDA%u", j);
  2576. fprintf(f, " <route src=\"%s\" dst=\"%s\" symmetrical=\"NO\"><link_ctn id=\"%s-%s\"/><link_ctn id=\"Host\"/></route>\n", i_name, j_name, i_name, j_name);
  2577. }
  2578. }
  2579. #endif
  2580. } /* defined(STARPU_HAVE_HWLOC) && defined(STARPU_HAVE_CUDA_MEMCPY_PEER) */
  2581. fprintf(f, "\n");
  2582. #endif /* STARPU_USE_CUDA */
  2583. /*
  2584. * OpenCL routes
  2585. */
  2586. #ifdef STARPU_USE_OPENCL
  2587. for (i = 0; i < nopencl; i++)
  2588. {
  2589. char i_name[16];
  2590. snprintf(i_name, sizeof(i_name), "OpenCL%u", i);
  2591. fprintf(f, " <route src=\"RAM\" dst=\"%s\" symmetrical=\"NO\"><link_ctn id=\"RAM-%s\"/><link_ctn id=\"Host\"/></route>\n", i_name, i_name);
  2592. fprintf(f, " <route src=\"%s\" dst=\"RAM\" symmetrical=\"NO\"><link_ctn id=\"%s-RAM\"/><link_ctn id=\"Host\"/></route>\n", i_name, i_name);
  2593. }
  2594. #endif
  2595. fprintf(f,
  2596. " </AS>\n"
  2597. " </platform>\n"
  2598. );
  2599. if (locked)
  2600. _starpu_fwrunlock(f);
  2601. fclose(f);
  2602. }
  2603. static void generate_bus_platform_file(void)
  2604. {
  2605. if (!was_benchmarked)
  2606. benchmark_all_gpu_devices();
  2607. #ifdef STARPU_USE_MPI_MASTER_SLAVE
  2608. /* Slaves don't write files */
  2609. if (!_starpu_mpi_common_is_src_node())
  2610. return;
  2611. #endif
  2612. write_bus_platform_file_content(3);
  2613. write_bus_platform_file_content(4);
  2614. }
  2615. static void check_bus_platform_file(void)
  2616. {
  2617. int res;
  2618. char path[PATH_LENGTH];
  2619. _starpu_simgrid_get_platform_path(4, path, sizeof(path));
  2620. res = access(path, F_OK);
  2621. if (!res)
  2622. {
  2623. _starpu_simgrid_get_platform_path(3, path, sizeof(path));
  2624. res = access(path, F_OK);
  2625. }
  2626. if (res)
  2627. {
  2628. /* File does not exist yet */
  2629. generate_bus_platform_file();
  2630. }
  2631. }
  2632. /*
  2633. * Generic
  2634. */
  2635. static void _starpu_bus_force_sampling(void)
  2636. {
  2637. _STARPU_DEBUG("Force bus sampling ...\n");
  2638. _starpu_create_sampling_directory_if_needed();
  2639. generate_bus_affinity_file();
  2640. generate_bus_latency_file();
  2641. generate_bus_bandwidth_file();
  2642. generate_bus_config_file();
  2643. generate_bus_platform_file();
  2644. }
  2645. #endif /* !SIMGRID */
  2646. void _starpu_load_bus_performance_files(void)
  2647. {
  2648. _starpu_create_sampling_directory_if_needed();
  2649. struct _starpu_machine_config * config = _starpu_get_machine_config();
  2650. nnumas = _starpu_topology_get_nnumanodes(config);
  2651. #if defined(STARPU_USE_CUDA) || defined(STARPU_USE_SIMGRID)
  2652. ncuda = _starpu_get_cuda_device_count();
  2653. #endif
  2654. #if defined(STARPU_USE_OPENCL) || defined(STARPU_USE_SIMGRID)
  2655. nopencl = _starpu_opencl_get_device_count();
  2656. #endif
  2657. #if defined(STARPU_USE_MPI_MASTER_SLAVE) || defined(STARPU_USE_SIMGRID)
  2658. nmpi_ms = _starpu_mpi_src_get_device_count();
  2659. #endif
  2660. #ifndef STARPU_SIMGRID
  2661. check_bus_config_file();
  2662. #endif
  2663. #ifdef STARPU_USE_MPI_MASTER_SLAVE
  2664. /* be sure that master wrote the perf files */
  2665. _starpu_mpi_common_barrier();
  2666. #endif
  2667. #ifndef STARPU_SIMGRID
  2668. load_bus_affinity_file();
  2669. #endif
  2670. load_bus_latency_file();
  2671. load_bus_bandwidth_file();
  2672. #ifndef STARPU_SIMGRID
  2673. check_bus_platform_file();
  2674. #endif
  2675. }
  2676. /* (in MB/s) */
  2677. double starpu_transfer_bandwidth(unsigned src_node, unsigned dst_node)
  2678. {
  2679. return bandwidth_matrix[src_node][dst_node];
  2680. }
  2681. /* (in µs) */
  2682. double starpu_transfer_latency(unsigned src_node, unsigned dst_node)
  2683. {
  2684. return latency_matrix[src_node][dst_node];
  2685. }
  2686. /* (in µs) */
  2687. double starpu_transfer_predict(unsigned src_node, unsigned dst_node, size_t size)
  2688. {
  2689. if (src_node == dst_node)
  2690. return 0;
  2691. double bandwidth = bandwidth_matrix[src_node][dst_node];
  2692. double latency = latency_matrix[src_node][dst_node];
  2693. struct _starpu_machine_topology *topology = &_starpu_get_machine_config()->topology;
  2694. #if 0
  2695. int busid = starpu_bus_get_id(src_node, dst_node);
  2696. int direct = starpu_bus_get_direct(busid);
  2697. #endif
  2698. float ngpus = topology->ndevices[STARPU_CUDA_WORKER]+topology->ndevices[STARPU_OPENCL_WORKER];
  2699. #ifdef STARPU_DEVEL
  2700. #warning FIXME: ngpus should not be used e.g. for slow disk transfers...
  2701. #endif
  2702. #if 0
  2703. /* Ideally we should take into account that some GPUs are directly
  2704. * connected through a PCI switch, which has less contention that the
  2705. * Host bridge, but doing that seems to *decrease* performance... */
  2706. if (direct)
  2707. {
  2708. float neighbours = starpu_bus_get_ngpus(busid);
  2709. /* Count transfers of these GPUs, and count transfers between
  2710. * other GPUs and these GPUs */
  2711. ngpus = neighbours + (ngpus - neighbours) * neighbours / ngpus;
  2712. }
  2713. #endif
  2714. return latency + (size/bandwidth)*2*ngpus;
  2715. }
  2716. /* calculate save bandwidth and latency */
  2717. /* bandwidth in MB/s - latency in µs */
  2718. void _starpu_save_bandwidth_and_latency_disk(double bandwidth_write, double bandwidth_read, double latency_write, double latency_read, unsigned node, const char *name)
  2719. {
  2720. unsigned int i, j;
  2721. double slowness_disk_between_main_ram, slowness_main_ram_between_node;
  2722. int print_stats = starpu_get_env_number_default("STARPU_BUS_STATS", 0);
  2723. if (print_stats)
  2724. {
  2725. fprintf(stderr, "\n#---------------------\n");
  2726. fprintf(stderr, "Data transfer speed for %s (node %u):\n", name, node);
  2727. }
  2728. /* save bandwith */
  2729. for(i = 0; i < STARPU_MAXNODES; ++i)
  2730. {
  2731. for(j = 0; j < STARPU_MAXNODES; ++j)
  2732. {
  2733. if (i == j && j == node) /* source == destination == node */
  2734. {
  2735. bandwidth_matrix[i][j] = 0;
  2736. }
  2737. else if (i == node) /* source == disk */
  2738. {
  2739. /* convert in slowness */
  2740. if(bandwidth_read != 0)
  2741. slowness_disk_between_main_ram = 1/bandwidth_read;
  2742. else
  2743. slowness_disk_between_main_ram = 0;
  2744. if(bandwidth_matrix[STARPU_MAIN_RAM][j] != 0)
  2745. slowness_main_ram_between_node = 1/bandwidth_matrix[STARPU_MAIN_RAM][j];
  2746. else
  2747. slowness_main_ram_between_node = 0;
  2748. bandwidth_matrix[i][j] = 1/(slowness_disk_between_main_ram+slowness_main_ram_between_node);
  2749. if (!isnan(bandwidth_matrix[i][j]) && print_stats)
  2750. fprintf(stderr,"%u -> %u: %.0f MB/s\n", i, j, bandwidth_matrix[i][j]);
  2751. }
  2752. else if (j == node) /* destination == disk */
  2753. {
  2754. /* convert in slowness */
  2755. if(bandwidth_write != 0)
  2756. slowness_disk_between_main_ram = 1/bandwidth_write;
  2757. else
  2758. slowness_disk_between_main_ram = 0;
  2759. if(bandwidth_matrix[i][STARPU_MAIN_RAM] != 0)
  2760. slowness_main_ram_between_node = 1/bandwidth_matrix[i][STARPU_MAIN_RAM];
  2761. else
  2762. slowness_main_ram_between_node = 0;
  2763. bandwidth_matrix[i][j] = 1/(slowness_disk_between_main_ram+slowness_main_ram_between_node);
  2764. if (!isnan(bandwidth_matrix[i][j]) && print_stats)
  2765. fprintf(stderr,"%u -> %u: %.0f MB/s\n", i, j, bandwidth_matrix[i][j]);
  2766. }
  2767. else if (j > node || i > node) /* not affected by the node */
  2768. {
  2769. bandwidth_matrix[i][j] = NAN;
  2770. }
  2771. }
  2772. }
  2773. /* save latency */
  2774. for(i = 0; i < STARPU_MAXNODES; ++i)
  2775. {
  2776. for(j = 0; j < STARPU_MAXNODES; ++j)
  2777. {
  2778. if (i == j && j == node) /* source == destination == node */
  2779. {
  2780. latency_matrix[i][j] = 0;
  2781. }
  2782. else if (i == node) /* source == disk */
  2783. {
  2784. latency_matrix[i][j] = (latency_write+latency_matrix[STARPU_MAIN_RAM][j]);
  2785. if (!isnan(latency_matrix[i][j]) && print_stats)
  2786. fprintf(stderr,"%u -> %u: %.0f us\n", i, j, latency_matrix[i][j]);
  2787. }
  2788. else if (j == node) /* destination == disk */
  2789. {
  2790. latency_matrix[i][j] = (latency_read+latency_matrix[i][STARPU_MAIN_RAM]);
  2791. if (!isnan(latency_matrix[i][j]) && print_stats)
  2792. fprintf(stderr,"%u -> %u: %.0f us\n", i, j, latency_matrix[i][j]);
  2793. }
  2794. else if (j > node || i > node) /* not affected by the node */
  2795. {
  2796. latency_matrix[i][j] = NAN;
  2797. }
  2798. }
  2799. }
  2800. if (print_stats)
  2801. fprintf(stderr, "\n#---------------------\n");
  2802. }