\input texinfo @c -*-texinfo-*-
@c %**start of header
@setfilename starpu.info
@settitle StarPU
@c %**end of header
@setchapternewpage odd
@titlepage
@title StarPU
@page
@vskip 0pt plus 1filll
@comment For the @value{version-GCC} Version*
@end titlepage
@summarycontents
@contents
@page
@node Top
@top Preface
@cindex Preface
This manual documents the usage of StarPU.
@comment
@comment When you add a new menu item, please keep the right hand
@comment aligned to the same column. Do not use tabs. This provides
@comment better formatting.
@comment
@menu
* Introduction::                A basic introduction to using StarPU
* Installing StarPU::           How to configure, build and install StarPU
* Using StarPU::                How to run a StarPU application
* Configuring StarPU::          Configuration options and environment variables
* StarPU API::                  The API to use StarPU
* Basic Examples::              Basic examples of the use of StarPU
* Advanced Topics::             Advanced use of StarPU
@end menu
@c ---------------------------------------------------------------------
@c Introduction to StarPU
@c ---------------------------------------------------------------------
@node Introduction
@chapter Introduction to StarPU
@menu
* Motivation::                  Why StarPU?
* StarPU in a Nutshell::        The Fundamentals of StarPU
@end menu
@node Motivation
@section Motivation
@c complex machines with heterogeneous cores/devices
The use of specialized hardware such as accelerators or coprocessors offers an
interesting approach to overcome the physical limits encountered by processor
architects. As a result, many machines are now equipped with one or several
accelerators (e.g. a GPU), in addition to the usual processor(s). While a lot
of effort has been devoted to offloading computation onto such accelerators,
very little attention has been paid to portability concerns on the one hand,
and to the possibility of having heterogeneous accelerators and processors
interact on the other hand.
StarPU is a runtime system that offers support for heterogeneous multicore
architectures. It not only offers a unified view of the computational resources
(i.e. CPUs and accelerators at the same time), but also takes care of
efficiently mapping and executing tasks onto a heterogeneous machine while
transparently handling low-level issues in a portable fashion.
@c this leads to a complicated distributed memory design
@c which is not (easily) manageable by hand
@c added value/benefits of StarPU
@c - portability
@c - scheduling, perf. portability
@node StarPU in a Nutshell
@section StarPU in a Nutshell
@menu
* Codelet and Tasks::
* StarPU Data Management Library::
@end menu
From a programming point of view, StarPU is not a new language but a library
that executes tasks explicitly submitted by the application. The data that a
task manipulates are automatically transferred onto the accelerator so that the
programmer does not have to take care of complex data movements. StarPU also
takes particular care of scheduling those tasks efficiently and allows
scheduling experts to implement custom scheduling policies in a portable
fashion.
@c explain the notion of codelet and task (i.e. g(A, B)
@node Codelet and Tasks
@subsection Codelet and Tasks
One of StarPU's primary data structures is the @b{codelet}. A codelet describes
a computational kernel that can possibly be implemented on multiple
architectures such as a CPU, a CUDA device or a Cell's SPU.
@c TODO insert illustration f : f_spu, f_cpu, ...
Another important data structure is the @b{task}. Executing a StarPU task
consists in applying a codelet on a data set, on one of the architectures on
which the codelet is implemented. A task thus describes not only the codelet
that it uses, but also which data are accessed, and how they are
accessed during the computation (read and/or write).
StarPU tasks are asynchronous: submitting a task to StarPU is a non-blocking
operation. The task structure can also specify a @b{callback} function that is
called once StarPU has properly executed the task. It also contains optional
fields that the application may use to give hints to the scheduler (such as
priority levels).
A task may be identified by a unique 64-bit number which we refer to as a
@b{tag}. Task dependencies can be enforced either by means of callback
functions, or by expressing dependencies between tags.
@c TODO insert illustration f(Ar, Brw, Cr) + ..
@c DSM
@node StarPU Data Management Library
@subsection StarPU Data Management Library
Because StarPU schedules tasks at runtime, data transfers have to be
done automatically and ``just-in-time'' between processing units,
relieving the application programmer from explicit data transfers.
Moreover, to avoid unnecessary transfers, StarPU keeps data
where it was last needed, even if it was modified there, and it
allows multiple copies of the same data to reside at the same time on
several processing units as long as it is not modified.
@c ---------------------------------------------------------------------
@c Installing StarPU
@c ---------------------------------------------------------------------
@node Installing StarPU
@chapter Installing StarPU
@menu
* Configuration of StarPU::
* Building and Installing StarPU::
@end menu
StarPU can be built and installed by the standard means of the GNU
autotools. The following chapter briefly explains how these tools
can be used to install StarPU.
@node Configuration of StarPU
@section Configuration of StarPU
@menu
* Generating Makefiles and configuration scripts::
* Running the configuration::
@end menu
@node Generating Makefiles and configuration scripts
@subsection Generating Makefiles and configuration scripts
This step is not necessary when using the tarball releases of StarPU. If you
are using the source code from the svn repository, you first need to generate
the configure scripts and the Makefiles.
@example
% autoreconf -vfi
@end example
@node Running the configuration
@subsection Running the configuration
@example
% ./configure
@end example
Details about options that are useful to give to @code{./configure} are given
in @ref{Configuring StarPU}.
@node Building and Installing StarPU
@section Building and Installing StarPU
@menu
* Building::
* Sanity Checks::
* Installing::
@end menu
@node Building
@subsection Building
@example
% make
@end example
@node Sanity Checks
@subsection Sanity Checks
In order to make sure that StarPU is working properly on the system, it is also
possible to run a test suite.
@example
% make check
@end example
@node Installing
@subsection Installing
In order to install StarPU at the location that was specified during
configuration:
@example
% make install
@end example
@c ---------------------------------------------------------------------
@c Using StarPU
@c ---------------------------------------------------------------------
@node Using StarPU
@chapter Using StarPU
@menu
* Setting flags for compiling and linking applications::
* Running a basic StarPU application::
@end menu
@node Setting flags for compiling and linking applications
@section Setting flags for compiling and linking applications
Compiling and linking an application against StarPU may require specific
flags or libraries (for instance @code{CUDA} or @code{libspe2}).
To this end, it is possible to use the @code{pkg-config} tool.
If StarPU was not installed in a standard location, the path of StarPU's
library must be specified in the @code{PKG_CONFIG_PATH} environment variable so
that @code{pkg-config} can find it. For example, if StarPU was installed in
@code{$prefix_dir}:
@example
% PKG_CONFIG_PATH=$PKG_CONFIG_PATH:$prefix_dir/lib/pkgconfig
@end example
The flags required to compile or link against StarPU are then
accessible with the following commands:
@example
% pkg-config --cflags libstarpu  # options for the compiler
% pkg-config --libs libstarpu    # options for the linker
@end example
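For instance, an application contained in a single source file (hypothetically
named @code{vector_scal.c} here) could then, as a sketch, be compiled and
linked as follows:
@example
% gcc vector_scal.c -o vector_scal \
    $(pkg-config --cflags libstarpu) \
    $(pkg-config --libs libstarpu)
@end example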
@node Running a basic StarPU application
@section Running a basic StarPU application
Basic examples using StarPU have been built in the directory
@code{$prefix_dir/lib/starpu/examples/}. You can for example run the
example @code{vector_scal}.
@example
% $prefix_dir/lib/starpu/examples/vector_scal
BEFORE : First element was 1.000000
AFTER First element is 3.140000
%
@end example
@c ---------------------------------------------------------------------
@c Configuration options
@c ---------------------------------------------------------------------
@node Configuring StarPU
@chapter Configuring StarPU
@menu
* Compilation configuration::
* Execution configuration through environment variables::
@end menu
@node Compilation configuration
@section Compilation configuration
The following arguments can be given to the @code{configure} script.
@menu
* Common configuration::
* Configuring workers::
* Advanced configuration::
@end menu
@node Common configuration
@subsection Common configuration
@table @asis
@item @code{--enable-debug}
Enable debugging messages.
@item @code{--enable-fast}
Do not enforce assertions; this saves a lot of time that would otherwise be
spent computing them.
@item @code{--enable-verbose}
Augment the verbosity of the debugging messages.
@item @code{--enable-coverage}
Enable flags for the coverage tool.
@end table
@node Configuring workers
@subsection Configuring workers
@table @asis
@item @code{--disable-cpu}
Disable the use of CPUs of the machine. Only GPUs etc. will be used.
@item @code{--enable-maxcudadev=<number>}
Define the maximum number of CUDA devices that StarPU will support, then
available as the @code{STARPU_MAXCUDADEVS} macro.
@item @code{--disable-cuda}
Disable the use of CUDA, even if the SDK is detected.
@item @code{--enable-maxopencldev=<number>}
Define the maximum number of OpenCL devices that StarPU will support, then
available as the @code{STARPU_MAXOPENCLDEVS} macro.
@item @code{--disable-opencl}
Disable the use of OpenCL, even if the SDK is detected.
@item @code{--enable-gordon}
Enable the use of the Gordon runtime for Cell SPUs.
@c TODO: rather default to enabled when detected
@item @code{--with-cuda-dir=<path>}
Specify the location where the CUDA SDK resides. This directory should notably
contain @code{include/cuda.h}.
@item @code{--with-gordon-dir=<path>}
Specify the location of the Gordon SDK.
@end table
@node Advanced configuration
@subsection Advanced configuration
@table @asis
@item @code{--enable-perf-debug}
Enable performance debugging.
@item @code{--enable-model-debug}
Enable performance model debugging.
@item @code{--enable-stats}
Enable statistics.
@item @code{--enable-maxbuffers=<nbuffers>}
Define the maximum number of buffers that tasks will be able to take as
parameters, then available as the @code{STARPU_NMAXBUFS} macro.
@item @code{--enable-allocation-cache}
Enable the use of a data allocation cache in order to avoid the cost of
repeated allocations with CUDA. Still experimental.
@item @code{--enable-opengl-render}
Enable the use of OpenGL for the rendering of some examples.
@c TODO: rather default to enabled when detected
@item @code{--enable-blas-lib=<name>}
Specify the BLAS library to be used by some of the examples. The
library has to be @code{atlas} or @code{goto}.
@item @code{--with-magma=<path>}
Specify where MAGMA is installed.
@item @code{--with-opencl-dir=<path>}
Specify the location of the OpenCL SDK. This directory should notably contain
@code{include/CL/cl.h}.
@item @code{--with-fxt=<path>}
Specify the location of FxT (for generating traces and rendering them
using ViTE). This directory should notably contain
@code{include/fxt/fxt.h}.
@item @code{--with-perf-model-dir=<dir>}
Specify where performance models should be stored (instead of defaulting to the
current user's home).
@item @code{--with-mpicc=<path to mpicc>}
Specify the location of the @code{mpicc} compiler to be used for starpumpi.
@c TODO: also just use AC_PROG
@item @code{--with-mpi}
Enable building libstarpumpi.
@c TODO: rather just use the availability of mpicc instead of a second option
@item @code{--with-goto-dir=<dir>}
Specify the location of GotoBLAS.
@item @code{--with-atlas-dir=<dir>}
Specify the location of ATLAS. This directory should notably contain
@code{include/cblas.h}.
@end table
@c ---------------------------------------------------------------------
@c Environment variables
@c ---------------------------------------------------------------------
@node Execution configuration through environment variables
@section Execution configuration through environment variables
@menu
* Workers::                     Configuring workers
* Scheduling::                  Configuring the Scheduling engine
* Misc::                        Miscellaneous and debug
@end menu
Note: the values given in the @code{starpu_conf} structure passed when
calling @code{starpu_init} will override the values of the environment
variables.
@node Workers
@subsection Configuring workers
@menu
* STARPU_NCPUS::                Number of CPU workers
* STARPU_NCUDA::                Number of CUDA workers
* STARPU_NOPENCL::              Number of OpenCL workers
* STARPU_NGORDON::              Number of SPU workers (Cell)
* STARPU_WORKERS_CPUID::        Bind workers to specific CPUs
* STARPU_WORKERS_CUDAID::       Select specific CUDA devices
* STARPU_WORKERS_OPENCLID::     Select specific OpenCL devices
@end menu
@node STARPU_NCPUS
@subsubsection @code{STARPU_NCPUS} -- Number of CPU workers
@table @asis
@item @emph{Description}:
Specify the maximum number of CPU workers. Note that StarPU will not allocate
more CPUs than there are physical CPUs, and that some CPUs are used to control
the accelerators.
@end table
@node STARPU_NCUDA
@subsubsection @code{STARPU_NCUDA} -- Number of CUDA workers
@table @asis
@item @emph{Description}:
Specify the maximum number of CUDA devices that StarPU can use. If
@code{STARPU_NCUDA} is lower than the number of physical devices, it is
possible to select which CUDA devices should be used by means of the
@code{STARPU_WORKERS_CUDAID} environment variable.
@end table
@node STARPU_NOPENCL
@subsubsection @code{STARPU_NOPENCL} -- Number of OpenCL workers
@table @asis
@item @emph{Description}:
OpenCL equivalent of the @code{STARPU_NCUDA} environment variable.
@end table
@node STARPU_NGORDON
@subsubsection @code{STARPU_NGORDON} -- Number of SPU workers (Cell)
@table @asis
@item @emph{Description}:
Specify the maximum number of SPUs that StarPU can use.
@end table
@node STARPU_WORKERS_CPUID
@subsubsection @code{STARPU_WORKERS_CPUID} -- Bind workers to specific CPUs
@table @asis
@item @emph{Description}:
Passing an array of integers (starting from 0) in @code{STARPU_WORKERS_CPUID}
specifies to which logical CPU each worker should be bound. For instance, if
@code{STARPU_WORKERS_CPUID = "1 3 0 2"}, the first worker will be bound to
logical CPU #1, the second CPU worker will be bound to logical CPU #3, and so
on. Note that the logical ordering of the CPUs is either determined by the OS,
or provided by the @code{hwloc} library in case it is available.
Note that the first workers correspond to the CUDA workers, then come the
OpenCL and the SPU workers, and finally the CPU workers. For example, if
we have @code{STARPU_NCUDA=1}, @code{STARPU_NOPENCL=1}, @code{STARPU_NCPUS=2}
and @code{STARPU_WORKERS_CPUID = "0 2 1 3"}, the CUDA device will be controlled
by logical CPU #0, the OpenCL device will be controlled by logical CPU #2, and
the logical CPUs #1 and #3 will be used by the CPU workers.
If the number of workers is larger than the array given in
@code{STARPU_WORKERS_CPUID}, the workers are bound to the logical CPUs in a
round-robin fashion: if @code{STARPU_WORKERS_CPUID = "0 1"}, the first and the
third (resp. second and fourth) workers will be put on CPU #0 (resp. CPU #1).
@end table
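As a sketch, the binding described in the example above could be requested when
launching a (hypothetically named) @code{app} binary:
@example
% STARPU_NCUDA=1 STARPU_NOPENCL=1 STARPU_NCPUS=2 \
  STARPU_WORKERS_CPUID="0 2 1 3" ./app
@end example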
@node STARPU_WORKERS_CUDAID
@subsubsection @code{STARPU_WORKERS_CUDAID} -- Select specific CUDA devices
@table @asis
@item @emph{Description}:
Similarly to the @code{STARPU_WORKERS_CPUID} environment variable, it is
possible to select which CUDA devices should be used by StarPU. On a machine
equipped with 4 GPUs, setting @code{STARPU_WORKERS_CUDAID = "1 3"} and
@code{STARPU_NCUDA=2} specifies that 2 CUDA workers should be created, and that
they should use CUDA devices #1 and #3 (the logical ordering of the devices is
the one reported by CUDA).
@end table
@node STARPU_WORKERS_OPENCLID
@subsubsection @code{STARPU_WORKERS_OPENCLID} -- Select specific OpenCL devices
@table @asis
@item @emph{Description}:
OpenCL equivalent of the @code{STARPU_WORKERS_CUDAID} environment variable.
@end table
@node Scheduling
@subsection Configuring the Scheduling engine
@menu
* STARPU_SCHED::                Scheduling policy
* STARPU_CALIBRATE::            Calibrate performance models
* STARPU_PREFETCH::             Use data prefetch
* STARPU_SCHED_ALPHA::          Computation factor
* STARPU_SCHED_BETA::           Communication factor
@end menu
@node STARPU_SCHED
@subsubsection @code{STARPU_SCHED} -- Scheduling policy
@table @asis
@item @emph{Description}:
This chooses between the different scheduling policies proposed by StarPU:
random, work stealing, greedy, with performance models, etc.
Use @code{STARPU_SCHED=help} to get the list of available schedulers.
@end table
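For instance, to run a (hypothetically named) @code{app} binary with the
@code{dmda} policy (one of the performance-model-based policies mentioned
below):
@example
% STARPU_SCHED=dmda ./app
@end example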
@node STARPU_CALIBRATE
@subsubsection @code{STARPU_CALIBRATE} -- Calibrate performance models
@table @asis
@item @emph{Description}:
If this variable is set to 1, the performance models are calibrated during
the execution. If it is set to 2, the previous values are dropped to restart
calibration from scratch.
Note: this currently only applies to the dm and dmda scheduling policies.
@end table
@node STARPU_PREFETCH
@subsubsection @code{STARPU_PREFETCH} -- Use data prefetch
@table @asis
@item @emph{Description}:
If this variable is set, data prefetching will be enabled: when a task is
scheduled to be executed, e.g. on a GPU, StarPU will request an asynchronous
transfer in advance, so that data is already present on the GPU when the task
starts. As a result, computation and data transfers are overlapped.
@end table
@node STARPU_SCHED_ALPHA
@subsubsection @code{STARPU_SCHED_ALPHA} -- Computation factor
@table @asis
@item @emph{Description}:
To estimate the cost of a task, StarPU takes into account the estimated
computation time (obtained thanks to performance models). The alpha factor is
the coefficient to be applied to it before adding it to the communication part.
@end table
@node STARPU_SCHED_BETA
@subsubsection @code{STARPU_SCHED_BETA} -- Communication factor
@table @asis
@item @emph{Description}:
To estimate the cost of a task, StarPU takes into account the estimated
data transfer time (obtained thanks to performance models). The beta factor is
the coefficient to be applied to it before adding it to the computation part.
@end table
@node Misc
@subsection Miscellaneous and debug
@menu
* STARPU_LOGFILENAME::          Select debug file name
@end menu
@node STARPU_LOGFILENAME
@subsubsection @code{STARPU_LOGFILENAME} -- Select debug file name
@table @asis
@item @emph{Description}:
This variable specifies the file to which the debugging output should be saved.
@end table
@c ---------------------------------------------------------------------
@c StarPU API
@c ---------------------------------------------------------------------
@node StarPU API
@chapter StarPU API
@menu
* Initialization and Termination::  Initialization and Termination methods
* Workers' Properties::         Methods to enumerate workers' properties
* Data Library::                Methods to manipulate data
* Codelets and Tasks::          Methods to construct tasks
* Tags::                        Task dependencies
* CUDA extensions::             CUDA extensions
* OpenCL extensions::           OpenCL extensions
* Cell extensions::             Cell extensions
* Miscellaneous::               Miscellaneous helpers
@end menu
@node Initialization and Termination
@section Initialization and Termination
@menu
* starpu_init::                 Initialize StarPU
* struct starpu_conf::          StarPU runtime configuration
* starpu_shutdown::             Terminate StarPU
@end menu
@node starpu_init
@subsection @code{starpu_init} -- Initialize StarPU
@table @asis
@item @emph{Description}:
This is the StarPU initialization method, which must be called prior to any
other StarPU call. It is possible to specify StarPU's configuration (e.g.
scheduling policy, number of cores, ...) by passing a non-null argument. The
default configuration is used if the passed argument is @code{NULL}.
@item @emph{Return value}:
Upon successful completion, this function returns 0. Otherwise, @code{-ENODEV}
indicates that no worker was available (so that StarPU was not initialized).
@item @emph{Prototype}:
@code{int starpu_init(struct starpu_conf *conf);}
@end table
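As a minimal sketch, the lifetime of a StarPU application is delimited by the
following two calls:
@cartouche
@example
#include <errno.h>
#include <starpu.h>

int main(void)
@{
    /* Initialize StarPU with the default configuration */
    int ret = starpu_init(NULL);
    if (ret == -ENODEV)
        return 1; /* no worker is available */

    /* ... register data and submit tasks here ... */

    starpu_shutdown();
    return 0;
@}
@end example
@end cartouche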
@node struct starpu_conf
@subsection @code{struct starpu_conf} -- StarPU runtime configuration
@table @asis
@item @emph{Description}:
This structure is passed to the @code{starpu_init} function in order
to configure StarPU.
When the default value is used, StarPU automatically selects the number
of processing units and takes the default scheduling policy. This parameter
overwrites the equivalent environment variables.
@item @emph{Fields}:
@table @asis
@item @code{sched_policy_name} (default = NULL):
This is the name of the scheduling policy. This can also be specified with the
@code{STARPU_SCHED} environment variable.
@item @code{sched_policy} (default = NULL):
This is the definition of the scheduling policy. This field is ignored
if @code{sched_policy_name} is set.
@item @code{ncpus} (default = -1):
This is the maximum number of CPU cores that StarPU can use. This can also be
specified with the @code{STARPU_NCPUS} environment variable.
@item @code{ncuda} (default = -1):
This is the maximum number of CUDA devices that StarPU can use. This can also
be specified with the @code{STARPU_NCUDA} environment variable.
@item @code{nopencl} (default = -1):
This is the maximum number of OpenCL devices that StarPU can use. This can also
be specified with the @code{STARPU_NOPENCL} environment variable.
@item @code{nspus} (default = -1):
This is the maximum number of Cell SPUs that StarPU can use. This can also be
specified with the @code{STARPU_NGORDON} environment variable.
@item @code{use_explicit_workers_bindid} (default = 0)
@item @code{workers_bindid[STARPU_NMAXWORKERS]}
@item @code{use_explicit_workers_cuda_gpuid} (default = 0)
@item @code{workers_cuda_gpuid[STARPU_NMAXWORKERS]}
@item @code{use_explicit_workers_opencl_gpuid} (default = 0)
@item @code{workers_opencl_gpuid[STARPU_NMAXWORKERS]}:
These fields are explained in @ref{STARPU_WORKERS_CPUID}.
@item @code{calibrate} (default = 0):
If this flag is set, StarPU will calibrate the performance models when
executing tasks. This can also be specified with the @code{STARPU_CALIBRATE}
environment variable.
@end table
@end table
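As a sketch, assuming a C99 compiler (for designated initializers) and the
availability of the @code{dmda} policy, a specific configuration could be
requested as follows; fields left out of the initializer default to 0, so the
remaining limits are set to -1 explicitly:
@cartouche
@example
struct starpu_conf conf =
@{
    .sched_policy_name = "dmda", /* overrides STARPU_SCHED */
    .ncpus = 2,                  /* use at most two CPU workers */
    .ncuda = -1,                 /* -1 selects the default */
    .nopencl = -1,
    .nspus = -1,
@};
int ret = starpu_init(&conf);
@end example
@end cartouche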
@node starpu_shutdown
@subsection @code{starpu_shutdown} -- Terminate StarPU
@table @asis
@item @emph{Description}:
This is the StarPU termination method. It must be called at the end of the
application: statistics and other post-mortem debugging information are not
guaranteed to be available until this method has been called.
@item @emph{Prototype}:
@code{void starpu_shutdown(void);}
@end table
@node Workers' Properties
@section Workers' Properties
@menu
* starpu_worker_get_count::     Get the number of processing units
* starpu_cpu_worker_get_count::  Get the number of CPUs controlled by StarPU
* starpu_cuda_worker_get_count::  Get the number of CUDA devices controlled by StarPU
* starpu_opencl_worker_get_count::  Get the number of OpenCL devices controlled by StarPU
* starpu_spu_worker_get_count::  Get the number of Cell SPUs controlled by StarPU
* starpu_worker_get_id::        Get the identifier of the current worker
* starpu_worker_get_type::      Get the type of processing unit associated to a worker
* starpu_worker_get_name::      Get the name of a worker
@end menu
@node starpu_worker_get_count
@subsection @code{starpu_worker_get_count} -- Get the number of processing units
@table @asis
@item @emph{Description}:
This function returns the number of workers (i.e. processing units executing
StarPU tasks). The returned value should be at most @code{STARPU_NMAXWORKERS}.
@item @emph{Prototype}:
@code{unsigned starpu_worker_get_count(void);}
@end table
@node starpu_cpu_worker_get_count
@subsection @code{starpu_cpu_worker_get_count} -- Get the number of CPUs controlled by StarPU
@table @asis
@item @emph{Description}:
This function returns the number of CPUs controlled by StarPU. The returned
value should be at most @code{STARPU_NMAXCPUS}.
@item @emph{Prototype}:
@code{unsigned starpu_cpu_worker_get_count(void);}
@end table
@node starpu_cuda_worker_get_count
@subsection @code{starpu_cuda_worker_get_count} -- Get the number of CUDA devices controlled by StarPU
@table @asis
@item @emph{Description}:
This function returns the number of CUDA devices controlled by StarPU. The
returned value should be at most @code{STARPU_MAXCUDADEVS}.
@item @emph{Prototype}:
@code{unsigned starpu_cuda_worker_get_count(void);}
@end table
@node starpu_opencl_worker_get_count
@subsection @code{starpu_opencl_worker_get_count} -- Get the number of OpenCL devices controlled by StarPU
@table @asis
@item @emph{Description}:
This function returns the number of OpenCL devices controlled by StarPU. The
returned value should be at most @code{STARPU_MAXOPENCLDEVS}.
@item @emph{Prototype}:
@code{unsigned starpu_opencl_worker_get_count(void);}
@end table
@node starpu_spu_worker_get_count
@subsection @code{starpu_spu_worker_get_count} -- Get the number of Cell SPUs controlled by StarPU
@table @asis
@item @emph{Description}:
This function returns the number of Cell SPUs controlled by StarPU.
@item @emph{Prototype}:
@code{unsigned starpu_spu_worker_get_count(void);}
@end table
@node starpu_worker_get_id
@subsection @code{starpu_worker_get_id} -- Get the identifier of the current worker
@table @asis
@item @emph{Description}:
This function returns the identifier of the worker associated to the calling
thread. The returned value is either -1 if the current context is not a StarPU
worker (i.e. when called from the application outside a task or a callback), or
an integer between 0 and @code{starpu_worker_get_count() - 1}.
@item @emph{Prototype}:
@code{int starpu_worker_get_id(void);}
@end table
@node starpu_worker_get_type
@subsection @code{starpu_worker_get_type} -- Get the type of processing unit associated to a worker
@table @asis
@item @emph{Description}:
This function returns the type of worker associated to an identifier (as
returned by the @code{starpu_worker_get_id} function). The returned value
indicates the architecture of the worker: @code{STARPU_CPU_WORKER} for a CPU
core, @code{STARPU_CUDA_WORKER} for a CUDA device,
@code{STARPU_OPENCL_WORKER} for an OpenCL device, and
@code{STARPU_GORDON_WORKER} for a Cell SPU. The value returned for an invalid
identifier is unspecified.
@item @emph{Prototype}:
@code{enum starpu_archtype starpu_worker_get_type(int id);}
@end table
@node starpu_worker_get_name
@subsection @code{starpu_worker_get_name} -- Get the name of a worker
@table @asis
@item @emph{Description}:
StarPU associates a unique human-readable string to each processing unit. This
function copies at most the first @code{maxlen} bytes of the unique string
associated to the worker identified by @code{id} into the
@code{dst} buffer. The caller is responsible for ensuring that @code{dst}
is a valid pointer to a buffer of at least @code{maxlen} bytes. Calling this
function on an invalid identifier results in an unspecified behaviour.
@item @emph{Prototype}:
@code{void starpu_worker_get_name(int id, char *dst, size_t maxlen);}
@end table
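As a sketch, the following fragment prints the name of every worker:
@cartouche
@example
unsigned nworkers = starpu_worker_get_count();
unsigned id;
for (id = 0; id < nworkers; id++)
@{
    char name[64];
    starpu_worker_get_name(id, name, sizeof(name));
    printf("worker %u: %s\n", id, name);
@}
@end example
@end cartouche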
@node Data Library
@section Data Library
This section describes the data management facilities provided by StarPU.
TODO: We show how to use existing data interfaces in [ref], but developers can
design their own data interfaces if required.
@menu
* starpu_data_handle::          StarPU opaque data handle
* void *interface::             StarPU data interface
@end menu
@node starpu_data_handle
@subsection @code{starpu_data_handle} -- StarPU opaque data handle
@table @asis
@item @emph{Description}:
StarPU uses @code{starpu_data_handle} as an opaque handle to manage a piece of
data. Once a piece of data has been registered to StarPU, it is associated to a
@code{starpu_data_handle} which keeps track of the state of the piece of data
over the entire machine, so that data consistency can be maintained and data
replicates can be located, for instance.
@end table
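As a sketch, registering a plain vector could look as follows; the registration
function for the vector interface, assumed here to be
@code{starpu_vector_data_register}, is covered with the data interfaces:
@cartouche
@example
#define NX 1024
float vector[NX];
starpu_data_handle vector_handle;
/* Register the buffer with StarPU and obtain an opaque handle;
 * 0 designates the home node (main memory) holding the data. */
starpu_vector_data_register(&vector_handle, 0,
        (uintptr_t)vector, NX, sizeof(vector[0]));
@end example
@end cartouche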
@node void *interface
@subsection @code{void *interface} -- StarPU data interface
@table @asis
@item @emph{Description}:
Data management is done at a high level in StarPU: rather than accessing a mere
list of contiguous buffers, the tasks may manipulate data that are described by
a high-level construct which we call data interface.
TODO
@end table
@c void starpu_data_unregister(struct starpu_data_state_t *state);
@c starpu_worker_get_memory_node TODO
@c
@c user interaction with the DSM
@c void starpu_data_sync_with_mem(struct starpu_data_state_t *state);
@c void starpu_notify_data_modification(struct starpu_data_state_t *state, uint32_t modifying_node);
@node Codelets and Tasks
@section Codelets and Tasks
@menu
* struct starpu_codelet::       StarPU codelet structure
* struct starpu_task::          StarPU task structure
* starpu_task_init::            Initialize a Task
* starpu_task_create::          Allocate and Initialize a Task
* starpu_task_deinit::          Release all the resources used by a Task
* starpu_task_destroy::         Destroy a dynamically allocated Task
* starpu_task_wait::            Wait for the termination of a Task
* starpu_task_submit::          Submit a Task
* starpu_task_wait_for_all::    Wait for the termination of all Tasks
@end menu
@node struct starpu_codelet
@subsection @code{struct starpu_codelet} -- StarPU codelet structure
@table @asis
@item @emph{Description}:
The codelet structure describes a kernel that is possibly implemented on
various targets.
@item @emph{Fields}:
@table @asis
@item @code{where}:
Indicates which types of processing units are able to execute the codelet.
@code{STARPU_CPU|STARPU_CUDA} for instance indicates that the codelet is
implemented for both CPU cores and CUDA devices, while @code{STARPU_GORDON}
indicates that it is only available on Cell SPUs.
@item @code{cpu_func} (optional):
Is a function pointer to the CPU implementation of the codelet. Its prototype
must be: @code{void cpu_func(void *buffers[], void *cl_arg)}. The first
argument is the array of data managed by the data management library, and
the second argument is a pointer to the argument passed through the
@code{cl_arg} field of the @code{starpu_task} structure.
The @code{cpu_func} field is ignored if @code{STARPU_CPU} does not appear in
the @code{where} field; it must be non-null otherwise.
@item @code{cuda_func} (optional):
Is a function pointer to the CUDA implementation of the codelet. @emph{This
must be a host-function written in the CUDA runtime API}. Its prototype must
be: @code{void cuda_func(void *buffers[], void *cl_arg);}. The @code{cuda_func}
field is ignored if @code{STARPU_CUDA} does not appear in the @code{where}
field; it must be non-null otherwise.
@item @code{opencl_func} (optional):
Is a function pointer to the OpenCL implementation of the codelet. Its
prototype must be:
@code{void opencl_func(starpu_data_interface_t *descr, void *arg);}.
This pointer is ignored if @code{STARPU_OPENCL} does not appear in the
@code{where} field; it must be non-null otherwise.
@item @code{gordon_func} (optional):
This is the index of the Cell SPU implementation within the Gordon library.
TODO
@item @code{nbuffers}:
Specifies the number of arguments taken by the codelet. These arguments are
managed by the DSM and are accessed from the @code{void *buffers[]}
array. The constant argument passed with the @code{cl_arg} field of the
@code{starpu_task} structure is not counted in this number. This value should
not be above @code{STARPU_NMAXBUFS}.
@item @code{model} (optional):
This is a pointer to the performance model associated to this codelet. This
optional field is ignored when null. TODO
@end table
@end table
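As a sketch, a codelet with a single CPU implementation working on one buffer
could be declared as follows (the kernel body is left hypothetical):
@cartouche
@example
static void scal_cpu_func(void *buffers[], void *cl_arg)
@{
    /* interpret buffers[0] and cl_arg here */
@}

static struct starpu_codelet cl =
@{
    .where = STARPU_CPU,       /* only a CPU implementation */
    .cpu_func = scal_cpu_func,
    .nbuffers = 1              /* one buffer managed by StarPU */
@};
@end example
@end cartouche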
@node struct starpu_task
@subsection @code{struct starpu_task} -- StarPU task structure
@table @asis
@item @emph{Description}:
The @code{starpu_task} structure describes a task that can be offloaded on the
various processing units managed by StarPU. It instantiates a codelet. It can
either be allocated dynamically with the @code{starpu_task_create} method, or
declared statically. In the latter case, the programmer has to zero the
@code{starpu_task} structure and to fill the different fields properly. The
indicated default values correspond to the configuration of a task allocated
with @code{starpu_task_create}.
@item @emph{Fields}:
@table @asis
@item @code{cl}:
Is a pointer to the corresponding @code{starpu_codelet} data structure. This
describes where the kernel should be executed, and supplies the appropriate
implementations. When set to @code{NULL}, no code is executed during the task;
such empty tasks can be useful for synchronization purposes.
@item @code{buffers}:
TODO
@item @code{cl_arg} (optional) (default = NULL):
This pointer is passed to the codelet through the second argument
of the codelet implementation (e.g. @code{cpu_func} or @code{cuda_func}).
In the specific case of the Cell processor, see the @code{cl_arg_size}
argument.
@item @code{cl_arg_size} (optional, Cell specific):
In the case of the Cell processor, the @code{cl_arg} pointer is not directly
given to the SPU function. A buffer of size @code{cl_arg_size} is allocated on
the SPU. This buffer is then filled with the @code{cl_arg_size} bytes starting
at address @code{cl_arg}. In this case, the argument given to the SPU codelet
is therefore not the @code{cl_arg} pointer, but the address of the buffer in
local store (LS) instead. This field is ignored for CPU, CUDA and OpenCL
codelets.
@item @code{callback_func} (optional) (default = @code{NULL}):
This is a function pointer of prototype @code{void (*f)(void *)} which
specifies a possible callback. If this pointer is non-null, the callback
function is executed @emph{on the host} after the execution of the task. The
callback is passed the value contained in the @code{callback_arg} field. No
callback is executed if the field is null.
@item @code{callback_arg} (optional) (default = @code{NULL}):
This is the pointer passed to the callback function. This field is ignored if
the @code{callback_func} is null.
@item @code{use_tag} (optional) (default = 0):
If set, this flag indicates that the task should be associated with the tag
contained in the @code{tag_id} field. Tags allow the application to synchronize
with the task and to express task dependencies easily.
@item @code{tag_id}:
This field contains the tag associated to the task if the @code{use_tag} field
was set; it is ignored otherwise.
@item @code{synchronous}:
If this flag is set, the @code{starpu_task_submit} function is blocking and
returns only when the task has been executed (or if no worker is able to
process the task). Otherwise, @code{starpu_task_submit} returns immediately.
@item @code{priority} (optional) (default = @code{STARPU_DEFAULT_PRIO}):
This field indicates a level of priority for the task. This is an integer value
that must be set between @code{STARPU_MIN_PRIO} (for the least important
tasks) and @code{STARPU_MAX_PRIO} (for the most important tasks) included.
Default priority is @code{STARPU_DEFAULT_PRIO}. Scheduling strategies that
take priorities into account can use this parameter to take better scheduling
decisions, but the scheduling policy may also ignore it.
@item @code{execute_on_a_specific_worker} (default = 0):
If this flag is set, StarPU will bypass the scheduler and directly assign this
task to the worker specified by the @code{workerid} field.
@item @code{workerid} (optional):
If the @code{execute_on_a_specific_worker} field is set, this field indicates
the identifier of the worker that should process this task (as
returned by @code{starpu_worker_get_id}). This field is ignored if the
@code{execute_on_a_specific_worker} field is set to 0.
@item @code{detach} (optional) (default = 1):
If this flag is set, it is not possible to synchronize with the task
by means of @code{starpu_task_wait} later on. Internal data structures
are only guaranteed to be freed once @code{starpu_task_wait} is called if the
flag is not set.
@item @code{destroy} (optional) (default = 1):
If this flag is set, the task structure will automatically be freed, either
after the execution of the callback if the task is detached, or during
@code{starpu_task_wait} otherwise. If this flag is not set, dynamically
allocated data structures will not be freed until @code{starpu_task_destroy} is
called explicitly. Setting this flag for a statically allocated task structure
will result in undefined behaviour.
@end table
@end table
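As a sketch, a dynamically allocated task using the codelet above could be set
up as follows; the layout of the @code{buffers} descriptor, with a
@code{handle} and an access @code{mode} per entry, is an assumption here:
@cartouche
@example
struct starpu_task *task = starpu_task_create();
task->cl = &cl;                   /* codelet to execute */
task->buffers[0].handle = vector_handle;
task->buffers[0].mode = STARPU_RW;
task->synchronous = 1;            /* block until completion */
@end example
@end cartouche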
@node starpu_task_init
@subsection @code{starpu_task_init} -- Initialize a Task
@table @asis
@item @emph{Description}:
Initialize a task structure with default values. This function is implicitly
called by @code{starpu_task_create}. By default, tasks initialized with
@code{starpu_task_init} must be deinitialized explicitly with
@code{starpu_task_deinit}. Tasks can also be initialized statically, using the
constant @code{STARPU_TASK_INITIALIZER}.
@item @emph{Prototype}:
@code{void starpu_task_init(struct starpu_task *task);}
@end table
@node starpu_task_create
@subsection @code{starpu_task_create} -- Allocate and Initialize a Task
@table @asis
@item @emph{Description}:
Allocate a task structure and initialize it with default values. Tasks
allocated dynamically with @code{starpu_task_create} are automatically freed
when the task is terminated. If the destroy flag is explicitly unset, the
resources used by the task must be freed by calling
@code{starpu_task_destroy}.
@item @emph{Prototype}:
@code{struct starpu_task *starpu_task_create(void);}
@end table
@node starpu_task_deinit
@subsection @code{starpu_task_deinit} -- Release all the resources used by a Task
@table @asis
@item @emph{Description}:
Release all the structures automatically allocated to execute the task, but not
the task structure itself. This is called automatically by
@code{starpu_task_destroy}. It should be used for statically allocated tasks
for instance.
@item @emph{Prototype}:
@code{void starpu_task_deinit(struct starpu_task *task);}
@end table
@node starpu_task_destroy
@subsection @code{starpu_task_destroy} -- Destroy a dynamically allocated Task
@table @asis
@item @emph{Description}:
Free the resources allocated during @code{starpu_task_create}. This function
can be called automatically after the execution of a task by setting the
@code{destroy} flag of the @code{starpu_task} structure (default behaviour).
Calling this function on a statically allocated task results in undefined
behaviour.
@item @emph{Prototype}:
@code{void starpu_task_destroy(struct starpu_task *task);}
@end table
@node starpu_task_wait
@subsection @code{starpu_task_wait} -- Wait for the termination of a Task
@table @asis
@item @emph{Description}:
This function blocks until the task has been executed. It is not possible to
synchronize with a task more than once. It is not possible to wait for
synchronous or detached tasks.
@item @emph{Return value}:
Upon successful completion, this function returns 0. Otherwise, @code{-EINVAL}
indicates that the specified task was either synchronous or detached.
@item @emph{Prototype}:
@code{int starpu_task_wait(struct starpu_task *task);}
@end table
@node starpu_task_submit
@subsection @code{starpu_task_submit} -- Submit a Task
@table @asis
@item @emph{Description}:
This function submits a task to StarPU. Calling this function does
not mean that the task will be executed immediately as there can be data or
task (tag) dependencies that are not fulfilled yet: StarPU will take care of
scheduling this task with respect to such dependencies.
This function returns immediately if the @code{synchronous} field of the
@code{starpu_task} structure was set to 0, and blocks until the termination of
the task otherwise. It is also possible to synchronize the application with
asynchronous tasks by means of tags, using the @code{starpu_tag_wait}
function for instance.
@item @emph{Return value}:
In case of success, this function returns 0; a return value of @code{-ENODEV}
means that there is no worker able to process this task (e.g. there is no GPU
available and this task is only implemented for CUDA devices).
@item @emph{Prototype}:
@code{int starpu_task_submit(struct starpu_task *task);}
@end table
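As a sketch, a careful submission therefore checks for the absence of a
suitable worker:
@cartouche
@example
int ret = starpu_task_submit(task);
if (ret == -ENODEV)
@{
    /* no worker can execute this task,
     * e.g. no GPU is available for a CUDA-only codelet */
    fprintf(stderr, "no suitable worker found\n");
@}
@end example
@end cartouche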
@node starpu_task_wait_for_all
@subsection @code{starpu_task_wait_for_all} -- Wait for the termination of all Tasks
@table @asis
@item @emph{Description}:
This function blocks until all the tasks that were submitted are terminated.
@item @emph{Prototype}:
@code{void starpu_task_wait_for_all(void);}
@end table
@c Callbacks : what can we put in callbacks ?
@node Tags
@section Tags
@menu
* starpu_tag_t::                Task identifier
* starpu_tag_declare_deps::     Declare the Dependencies of a Tag
* starpu_tag_declare_deps_array::  Declare the Dependencies of a Tag
* starpu_tag_wait::             Block until a Tag is terminated
* starpu_tag_wait_array::       Block until a set of Tags is terminated
* starpu_tag_remove::           Destroy a Tag
* starpu_tag_notify_from_apps::  Feed a Tag explicitly
@end menu
@node starpu_tag_t
@subsection @code{starpu_tag_t} -- Task identifier
@table @asis
@item @emph{Description}:
It is possible to associate a task with a unique ``tag'' and to express
dependencies between tasks by means of those tags. To do so, fill the
@code{tag_id} field of the @code{starpu_task} structure with a tag number (it
can be arbitrary) and set the @code{use_tag} field to 1.
If @code{starpu_tag_declare_deps} is called with this tag number, the task will
not be started until the tasks which hold the declared dependency tags are
completed.
@end table
@node starpu_tag_declare_deps
@subsection @code{starpu_tag_declare_deps} -- Declare the Dependencies of a Tag
@table @asis
@item @emph{Description}:
Specify the dependencies of the task identified by tag @code{id}. The first
argument specifies the tag which is configured, the second argument gives the
number of tag(s) on which @code{id} depends. The following arguments are the
tags which have to be terminated to unlock the task.
This function must be called before the associated task is submitted to StarPU
with @code{starpu_task_submit}.
@item @emph{Remark}
Because of the variable arity of @code{starpu_tag_declare_deps}, note that the
last arguments @emph{must} be of type @code{starpu_tag_t}: constant values
typically need to be explicitly cast. Using the
@code{starpu_tag_declare_deps_array} function avoids this hazard.
@item @emph{Prototype}:
@code{void starpu_tag_declare_deps(starpu_tag_t id, unsigned ndeps, ...);}
@item @emph{Example}:
@cartouche
@example
/* Tag 0x1 depends on tags 0x32 and 0x52 */
starpu_tag_declare_deps((starpu_tag_t)0x1,
        2, (starpu_tag_t)0x32, (starpu_tag_t)0x52);
@end example
@end cartouche
@end table
@node starpu_tag_declare_deps_array
@subsection @code{starpu_tag_declare_deps_array} -- Declare the Dependencies of a Tag
@table @asis
@item @emph{Description}:
This function is similar to @code{starpu_tag_declare_deps}, except that it
does not take a variable number of arguments but an array of tags of size
@code{ndeps}.
@item @emph{Prototype}:
@code{void starpu_tag_declare_deps_array(starpu_tag_t id, unsigned ndeps, starpu_tag_t *array);}
@item @emph{Example}:
@cartouche
@example
/* Tag 0x1 depends on tags 0x32 and 0x52 */
starpu_tag_t tag_array[2] = @{0x32, 0x52@};
starpu_tag_declare_deps_array((starpu_tag_t)0x1, 2, tag_array);
@end example
@end cartouche
@end table
  970. @node starpu_tag_wait
  971. @subsection @code{starpu_tag_wait} -- Block until a Tag is terminated
  972. @table @asis
@item @emph{Description}:
This function blocks until the task associated to tag @code{id} has been
executed. This is a blocking call which must therefore not be called within
tasks or callbacks, but only from the application directly. It is possible to
synchronize with the same tag multiple times, as long as the
@code{starpu_tag_remove} function is not called. Note that it is still
possible to synchronize with a tag associated to a task whose
@code{starpu_task} data structure was freed (e.g. if the @code{destroy} flag
of the @code{starpu_task} was enabled).
@item @emph{Prototype}:
@code{void starpu_tag_wait(starpu_tag_t id);}
@end table
@node starpu_tag_wait_array
@subsection @code{starpu_tag_wait_array} -- Block until a set of Tags is terminated
@table @asis
@item @emph{Description}:
This function is similar to @code{starpu_tag_wait} except that it blocks until
@emph{all} the @code{ntags} tags contained in the @code{id} array are
terminated.
@item @emph{Prototype}:
@code{void starpu_tag_wait_array(unsigned ntags, starpu_tag_t *id);}
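@item @emph{Example}:
A minimal sketch: assuming two previously submitted tasks were given tags
@code{0x32} and @code{0x52}, the application can wait for both at once.
@cartouche
@example
starpu_tag_t tag_array[2] = @{0x32, 0x52@};

/* block until both tagged tasks have terminated */
starpu_tag_wait_array(2, tag_array);
@end example
@end cartouche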
@end table
@node starpu_tag_remove
@subsection @code{starpu_tag_remove} -- Destroy a Tag
@table @asis
@item @emph{Description}:
This function releases the resources associated to tag @code{id}. It can be
called once the corresponding task has been executed and when no other tag
depends on this tag anymore.
@item @emph{Prototype}:
@code{void starpu_tag_remove(starpu_tag_t id);}
@end table
@node starpu_tag_notify_from_apps
@subsection @code{starpu_tag_notify_from_apps} -- Feed a Tag explicitly
@table @asis
@item @emph{Description}:
This function explicitly unlocks tag @code{id}. It may be useful in the
case of applications which execute part of their computation outside StarPU
tasks (e.g. third-party libraries). It is also provided as a
convenient tool for the programmer, for instance to entirely construct the task
DAG before actually giving StarPU the opportunity to execute the tasks.
@item @emph{Prototype}:
@code{void starpu_tag_notify_from_apps(starpu_tag_t id);}
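@item @emph{Example}:
A sketch of how a task can be made to depend on a computation performed
outside StarPU (the tag value @code{0x100} is arbitrary and is not attached to
any StarPU task):
@cartouche
@example
/* the task associated with tag 0x1 also waits for tag 0x100 */
starpu_tag_declare_deps((starpu_tag_t)0x1, 1, (starpu_tag_t)0x100);

/* ... perform some computation outside StarPU ... */

/* unlock tag 0x100 explicitly so that the task with tag 0x1 may start */
starpu_tag_notify_from_apps((starpu_tag_t)0x100);
@end example
@end cartouche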
@end table
@node CUDA extensions
@section CUDA extensions
@c void starpu_data_malloc_pinned_if_possible(float **A, size_t dim);
@c starpu_helper_cublas_init TODO
@c starpu_helper_cublas_shutdown TODO
@menu
* starpu_cuda_get_local_stream:: Get current worker's CUDA stream
* starpu_helper_cublas_init:: Initialize CUBLAS on every CUDA device
* starpu_helper_cublas_shutdown:: Deinitialize CUBLAS on every CUDA device
@end menu
@node starpu_cuda_get_local_stream
@subsection @code{starpu_cuda_get_local_stream} -- Get current worker's CUDA stream
@table @asis
@item @emph{Description}:
StarPU provides a stream for every CUDA device controlled by StarPU. This
function is only provided for convenience so that programmers can easily use
asynchronous operations within codelets without having to create a stream by
hand. Note that the application is not forced to use the stream provided by
@code{starpu_cuda_get_local_stream} and may also create its own streams.
@item @emph{Prototype}:
@code{cudaStream_t *starpu_cuda_get_local_stream(void);}
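@item @emph{Example}:
A sketch of a variant of the CUDA codelet from the Basic Examples chapter
(the @code{vector_mult_cuda} kernel and the vector interface accessors are the
ones defined there): the kernel is launched on the per-worker stream instead
of the default one.
@cartouche
@example
extern "C" void scal_cuda_func(void *buffers[], void *_args)
@{
    float *factor = (float *)_args;
    struct starpu_vector_interface_s *vector =
        (struct starpu_vector_interface_s *) buffers[0];
    unsigned n = STARPU_GET_VECTOR_NX(vector);
    float *val = (float *)STARPU_GET_VECTOR_PTR(vector);

    /* launch asynchronously on the stream provided by StarPU ... */
    vector_mult_cuda<<<1, 1, 0, *starpu_cuda_get_local_stream()>>>
        (val, n, *factor);

    /* ... and wait for its completion before returning */
    cudaStreamSynchronize(*starpu_cuda_get_local_stream());
@}
@end example
@end cartouche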
@end table
@node starpu_helper_cublas_init
@subsection @code{starpu_helper_cublas_init} -- Initialize CUBLAS on every CUDA device
@table @asis
@item @emph{Description}:
The CUBLAS library must be initialized prior to any CUBLAS call. Calling
@code{starpu_helper_cublas_init} will initialize CUBLAS on every CUDA device
controlled by StarPU. This call blocks until CUBLAS has been properly
initialized on every device.
@item @emph{Prototype}:
@code{void starpu_helper_cublas_init(void);}
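@item @emph{Example}:
A typical usage pattern, sketched: initialize CUBLAS right after StarPU, and
shut it down right before @code{starpu_shutdown}.
@cartouche
@example
starpu_init(NULL);
starpu_helper_cublas_init();

/* ... submit tasks whose CUDA implementations call CUBLAS ... */

starpu_helper_cublas_shutdown();
starpu_shutdown();
@end example
@end cartouche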
@end table
@node starpu_helper_cublas_shutdown
@subsection @code{starpu_helper_cublas_shutdown} -- Deinitialize CUBLAS on every CUDA device
@table @asis
@item @emph{Description}:
This function synchronously deinitializes the CUBLAS library on every CUDA device.
@item @emph{Prototype}:
@code{void starpu_helper_cublas_shutdown(void);}
@end table
@node OpenCL extensions
@section OpenCL extensions
@menu
* Enabling OpenCL:: Enabling OpenCL
* Compiling OpenCL codelets:: Compiling OpenCL codelets
@end menu
@node Enabling OpenCL
@subsection Enabling OpenCL
On GPU devices which can run both CUDA and OpenCL, CUDA will be
enabled by default. To enable OpenCL, you need either to disable CUDA
when configuring StarPU:
@example
% ./configure --disable-cuda
@end example
or to disable it when running the application:
@example
% STARPU_NCUDA=0 ./application
@end example
OpenCL will automatically be started on any device not yet used by
CUDA. On a machine with 4 GPUs, it is therefore possible to enable
CUDA on 2 devices and OpenCL on the other 2 devices as follows:
@example
% STARPU_NCUDA=2 ./application
@end example
@node Compiling OpenCL codelets
@subsection Compiling OpenCL codelets
TODO
@node Cell extensions
@section Cell extensions
Nothing yet.
@node Miscellaneous helpers
@section Miscellaneous helpers
@menu
* starpu_execute_on_each_worker:: Execute a function on a subset of workers
@end menu
@node starpu_execute_on_each_worker
@subsection @code{starpu_execute_on_each_worker} -- Execute a function on a subset of workers
@table @asis
@item @emph{Description}:
When calling this method, the offloaded function specified by the first
argument is executed by every StarPU worker that may execute the function.
The second argument is passed to the offloaded function.
The last argument specifies on which types of processing units the function
should be executed. Similarly to the @code{where} field of the
@code{starpu_codelet} structure, it is possible to specify that the function
should be executed on every CUDA device and every CPU by passing
@code{STARPU_CPU|STARPU_CUDA}.
This function blocks until the function has been executed on every appropriate
processing unit, so it must not be called from a callback function, for
instance.
@item @emph{Prototype}:
@code{void starpu_execute_on_each_worker(void (*func)(void *), void *arg, uint32_t where);}
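@item @emph{Example}:
A sketch of how this helper might be used, for instance to print a message
from every CPU and CUDA worker (the @code{hello_worker} function is
illustrative):
@cartouche
@example
void hello_worker(void *arg)
@{
    fprintf(stderr, "worker says: %s\n", (char *)arg);
@}

/* blocks until every CPU and CUDA worker has run hello_worker */
starpu_execute_on_each_worker(hello_worker, "hello",
                              STARPU_CPU|STARPU_CUDA);
@end example
@end cartouche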
@end table
@c ---------------------------------------------------------------------
@c Basic Examples
@c ---------------------------------------------------------------------
@node Basic Examples
@chapter Basic Examples
@menu
* Compiling and linking options::
* Hello World:: Submitting Tasks
* Manipulating Data: Scaling a Vector::
* Vector Scaling on an Hybrid CPU/GPU Machine:: Handling Heterogeneous Architectures
@end menu
@node Compiling and linking options
@section Compiling and linking options
The Makefile could for instance contain the following lines to define which
options must be given to the compiler and to the linker:
@cartouche
@example
CFLAGS += $$(pkg-config --cflags libstarpu)
LIBS += $$(pkg-config --libs libstarpu)
@end example
@end cartouche
@node Hello World
@section Hello World
@menu
* Required Headers::
* Defining a Codelet::
* Submitting a Task::
@end menu
In this section, we show how to implement a simple program that submits a task to StarPU.
@node Required Headers
@subsection Required Headers
The @code{starpu.h} header should be included in any code using StarPU.
@cartouche
@example
#include <starpu.h>
@end example
@end cartouche
@node Defining a Codelet
@subsection Defining a Codelet
@cartouche
@example
void cpu_func(void *buffers[], void *cl_arg)
@{
    float *array = cl_arg;

    printf("Hello world (array = @{%f, %f@} )\n", array[0], array[1]);
@}

starpu_codelet cl =
@{
    .where = STARPU_CPU,
    .cpu_func = cpu_func,
    .nbuffers = 0
@};
@end example
@end cartouche
A codelet is a structure that represents a computational kernel. Such a codelet
may contain an implementation of the same kernel on different architectures
(e.g. CUDA, Cell's SPU, x86, ...).
The @code{nbuffers} field specifies the number of data buffers that are
manipulated by the codelet: here the codelet does not access or modify any data
that is controlled by our data management library. Note that the argument
passed to the codelet (the @code{cl_arg} field of the @code{starpu_task}
structure) does not count as a buffer since it is not managed by our data
management library.
@c TODO need a crossref to the proper description of "where" see bla for more ...
We create a codelet which may only be executed on the CPUs. The @code{where}
field is a bitmask that defines where the codelet may be executed. Here, the
@code{STARPU_CPU} value means that only CPUs can execute this codelet
(@pxref{Codelets and Tasks} for more details on this field).
When a CPU core executes a codelet, it calls the @code{cpu_func} function,
which @emph{must} have the following prototype:
@cartouche
@example
void (*cpu_func)(void *buffers[], void *cl_arg);
@end example
@end cartouche
In this example, we can ignore the first argument of this function, which gives
a description of the input and output buffers (e.g. the size and the location
of the matrices). The second argument is a pointer to a buffer passed as an
argument to the codelet by means of the @code{cl_arg} field of the
@code{starpu_task} structure.
@c TODO rewrite so that it is a little clearer ?
Be aware that this may be a pointer to a @emph{copy} of the actual buffer, and
not the pointer given by the programmer: if the codelet modifies this buffer,
there is no guarantee that the initial buffer will be modified as well. This
implies, for instance, that the buffer cannot be used as a synchronization
medium.
@node Submitting a Task
@subsection Submitting a Task
@cartouche
@example
void callback_func(void *callback_arg)
@{
    printf("Callback function (arg %p)\n", callback_arg);
@}

int main(int argc, char **argv)
@{
    /* initialize StarPU */
    starpu_init(NULL);

    struct starpu_task *task = starpu_task_create();

    task->cl = &cl;

    float array[2] = @{1.0f, -1.0f@};
    task->cl_arg = &array;
    task->cl_arg_size = 2*sizeof(float);

    task->callback_func = callback_func;
    task->callback_arg = (void *)0x42;

    /* starpu_task_submit will be a blocking call */
    task->synchronous = 1;

    /* submit the task to StarPU */
    starpu_task_submit(task);

    /* terminate StarPU */
    starpu_shutdown();

    return 0;
@}
@end example
@end cartouche
Before submitting any tasks to StarPU, @code{starpu_init} must be called. The
@code{NULL} argument specifies that we use the default configuration. Tasks
cannot be submitted after the termination of StarPU by a call to
@code{starpu_shutdown}.
In the example above, a task structure is allocated by a call to
@code{starpu_task_create}. This function only allocates and fills the
corresponding structure with the default settings (@pxref{starpu_task_create}),
but it does not submit the task to StarPU.
@c not really clear ;)
The @code{cl} field is a pointer to the codelet which the task will
execute: in other words, the codelet structure describes which computational
kernel should be offloaded on the different architectures, and the task
structure is a wrapper containing a codelet and the piece of data on which the
codelet should operate.
The optional @code{cl_arg} field is a pointer to a buffer (of size
@code{cl_arg_size}) with some parameters for the kernel
described by the codelet. For instance, if a codelet implements a computational
kernel that multiplies its input vector by a constant, the constant could be
specified by means of this buffer.
Once a task has been executed, an optional callback function can be called.
While the computational kernel could be offloaded on various architectures, the
callback function is always executed on a CPU. The @code{callback_arg}
pointer is passed as an argument to the callback. The prototype of a callback
function must be:
@cartouche
@example
void (*callback_function)(void *);
@end example
@end cartouche
If the @code{synchronous} field is non-zero, task submission will be
synchronous: the @code{starpu_task_submit} function will not return until the
task has been executed. Note that the @code{starpu_shutdown} method does not
guarantee that asynchronous tasks have been executed before it returns.
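To submit the task asynchronously instead, one can leave @code{synchronous} at
0 and explicitly wait for completion before shutting StarPU down. A minimal
sketch, using the @code{starpu_task_wait_for_all} helper that also appears in
the hybrid example later in this chapter:
@cartouche
@example
task->synchronous = 0;
starpu_task_submit(task);

/* ... possibly submit more tasks ... */

/* wait until all submitted tasks have been executed */
starpu_task_wait_for_all();

starpu_shutdown();
@end example
@end cartouche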
@node Manipulating Data: Scaling a Vector
@section Manipulating Data: Scaling a Vector
The previous example has shown how to submit tasks. In this section, we show
how StarPU tasks can manipulate data.
Programmers can describe the data layout of their application so that StarPU is
responsible for enforcing data coherency and availability across the machine.
Instead of handling complex (and non-portable) mechanisms to perform data
movements, programmers only declare which piece of data is accessed and/or
modified by a task, and StarPU makes sure that when a computational kernel
starts somewhere (e.g. on a GPU), its data are available locally.
Before submitting those tasks, the programmer first needs to declare the
different pieces of data to StarPU using the @code{starpu_*_data_register}
functions. To ease the development of applications for StarPU, it is possible
to describe multiple types of data layout. A type of data layout is called an
@b{interface}. Several interfaces are predefined in StarPU; here we will
consider the @b{vector interface}.
The following lines show how to declare an array of @code{n} elements of type
@code{float} using the vector interface:
@cartouche
@example
float tab[n];
starpu_data_handle tab_handle;

starpu_vector_data_register(&tab_handle, 0, (uintptr_t)tab, n, sizeof(float));
@end example
@end cartouche
The first argument, called the @b{data handle}, is an opaque pointer which
designates the array in StarPU. This is also the structure which is used to
describe which data is used by a task. The second argument is the node number
where the data currently resides. Here it is 0 since the @code{tab} array is in
the main memory. Then comes the pointer @code{tab} where the data can be found,
the number of elements in the vector, and the size of each element.
It is possible to construct a StarPU
task that multiplies this vector by a constant factor:
@cartouche
@example
float factor = 3.0;
struct starpu_task *task = starpu_task_create();

task->cl = &cl;
task->buffers[0].handle = tab_handle;
task->buffers[0].mode = STARPU_RW;
task->cl_arg = &factor;
task->cl_arg_size = sizeof(float);
task->synchronous = 1;

starpu_task_submit(task);
@end example
@end cartouche
Since the factor is constant, it does not need a preliminary declaration, and
can just be passed through the @code{cl_arg} pointer like in the previous
example. The vector parameter is described by its handle.
There are two fields in each element of the @code{buffers} array.
@code{handle} is the handle of the data, and @code{mode} specifies how the
kernel will access the data (@code{STARPU_R} for read-only, @code{STARPU_W} for
write-only and @code{STARPU_RW} for read and write access).
The definition of the codelet can be written as follows:
@cartouche
@example
void scal_func(void *buffers[], void *cl_arg)
@{
    unsigned i;
    float *factor = cl_arg;

    struct starpu_vector_interface_s *vector = buffers[0];

    /* length of the vector */
    unsigned n = STARPU_GET_VECTOR_NX(vector);

    /* local copy of the vector pointer */
    float *val = (float *)STARPU_GET_VECTOR_PTR(vector);

    for (i = 0; i < n; i++)
        val[i] *= *factor;
@}

starpu_codelet cl = @{
    .where = STARPU_CPU,
    .cpu_func = scal_func,
    .nbuffers = 1
@};
@end example
@end cartouche
The second argument of the @code{scal_func} function contains a pointer to the
parameters of the codelet (given in @code{task->cl_arg}), so that we read the
constant factor from this pointer. The first argument is an array that gives
a description of every buffer passed in the @code{task->buffers} array. The
size of this array is given by the @code{nbuffers} field of the codelet
structure. For the sake of generality, this array contains pointers to the
different interfaces describing each buffer. In the case of the @b{vector
interface}, the location of the vector (resp. its length) is accessible in the
@code{ptr} (resp. @code{nx}) field of this interface. Since the vector is
accessed in a read-write fashion, any modification will automatically affect
future accesses to this vector made by other tasks.
@node Vector Scaling on an Hybrid CPU/GPU Machine
@section Vector Scaling on an Hybrid CPU/GPU Machine
Contrary to the previous examples, the task submitted in this example may not
only be executed by the CPUs, but also by a CUDA device.
@menu
* Source code:: Source of the StarPU application
* Compilation and execution:: Executing the StarPU application
@end menu
@node Source code
@subsection Source code
The CUDA implementation can be written as follows. It needs to be
compiled with a CUDA compiler such as nvcc, the NVIDIA CUDA compiler
driver.
@cartouche
@example
#include <starpu.h>

static __global__ void vector_mult_cuda(float *val, unsigned n,
                                        float factor)
@{
    unsigned i;
    for(i = 0 ; i < n ; i++)
        val[i] *= factor;
@}

extern "C" void scal_cuda_func(void *buffers[], void *_args)
@{
    float *factor = (float *)_args;
    struct starpu_vector_interface_s *vector =
        (struct starpu_vector_interface_s *) buffers[0];

    /* length of the vector */
    unsigned n = STARPU_GET_VECTOR_NX(vector);
    /* local copy of the vector pointer */
    float *val = (float *)STARPU_GET_VECTOR_PTR(vector);

    /* TODO: use more blocks and threads in blocks */
    vector_mult_cuda<<<1,1>>>(val, n, *factor);

    cudaThreadSynchronize();
@}
@end example
@end cartouche
The CPU implementation is the same as in the previous section.
Here is the source of the main application. You can notice the value of the
field @code{where} for the codelet. We specify
@code{STARPU_CPU|STARPU_CUDA} to indicate to StarPU that the codelet
can be executed either on a CPU or on a CUDA device.
@cartouche
@example
#include <starpu.h>

#define NX 5

extern void scal_cuda_func(void *buffers[], void *_args);
extern void scal_func(void *buffers[], void *_args);

/* @b{Definition of the codelet} */
static starpu_codelet cl = @{
    .where = STARPU_CPU|STARPU_CUDA, /* @b{It can be executed on a CPU} */
                                     /* @b{or on a CUDA device} */
    .cuda_func = scal_cuda_func,
    .cpu_func = scal_func,
    .nbuffers = 1
@};

int main(int argc, char **argv)
@{
    float *vector;
    int i, ret;
    float factor=3.0;
    struct starpu_task *task;
    starpu_data_handle tab_handle;

    starpu_init(NULL);               /* @b{Initialising StarPU} */

    vector = (float*)malloc(NX*sizeof(float));
    assert(vector);
    for(i=0 ; i<NX ; i++) vector[i] = i;
@end example
@end cartouche
@cartouche
@example
    /* @b{Registering data within StarPU} */
    starpu_vector_data_register(&tab_handle, 0, (uintptr_t)vector,
                                NX, sizeof(float));

    /* @b{Definition of the task} */
    task = starpu_task_create();
    task->cl = &cl;
    task->callback_func = NULL;
    task->buffers[0].handle = tab_handle;
    task->buffers[0].mode = STARPU_RW;
    task->cl_arg = &factor;
    task->cl_arg_size = sizeof(factor);
@end example
@end cartouche
@cartouche
@example
    /* @b{Submitting the task} */
    ret = starpu_task_submit(task);
    if (ret == -ENODEV) @{
        fprintf(stderr, "No worker may execute this task\n");
        return 1;
    @}

    /* @b{Waiting for its termination} */
    starpu_task_wait_for_all();

    /* @b{Update the vector in RAM} */
    starpu_data_sync_with_mem(tab_handle, STARPU_R);
@end example
@end cartouche
@cartouche
@example
    /* @b{Access the data} */
    for(i=0 ; i<NX; i++) @{
        fprintf(stderr, "%f ", vector[i]);
    @}
    fprintf(stderr, "\n");

    /* @b{Release the data and shutdown StarPU} */
    starpu_data_release_from_mem(tab_handle);
    starpu_shutdown();

    return 0;
@}
@end example
@end cartouche
@node Compilation and execution
@subsection Compilation and execution
Let's suppose StarPU has been installed in the directory
@code{$STARPU_DIR}. As explained in @ref{Setting flags for compiling and linking applications},
the variable @code{PKG_CONFIG_PATH} needs to be set. It is also
necessary to set the variable @code{LD_LIBRARY_PATH} to locate dynamic
libraries at runtime.
@example
% export PKG_CONFIG_PATH=$STARPU_DIR/lib/pkgconfig:$PKG_CONFIG_PATH
% export LD_LIBRARY_PATH=$STARPU_DIR/lib:$LD_LIBRARY_PATH
@end example
It is then possible to compile the application using the following
makefile:
@cartouche
@example
CFLAGS  += $(shell pkg-config --cflags libstarpu)
LDFLAGS += $(shell pkg-config --libs libstarpu)
CC       = gcc

vector: vector.o vector_cpu.o vector_cuda.o

%.o: %.cu
	nvcc $(CFLAGS) -c $< -o $@@

clean:
	rm -f vector *.o
@end example
@end cartouche
@example
% make
@end example
and to execute it, with the default configuration:
@example
% ./vector
0.000000 3.000000 6.000000 9.000000 12.000000
@end example
or for example, by disabling CPU devices:
@example
% STARPU_NCPUS=0 ./vector
0.000000 3.000000 6.000000 9.000000 12.000000
@end example
or by disabling CUDA devices:
@example
% STARPU_NCUDA=0 ./vector
0.000000 3.000000 6.000000 9.000000 12.000000
@end example
@c TODO: Add performance model example (and update basic_examples)
@c ---------------------------------------------------------------------
@c Advanced Topics
@c ---------------------------------------------------------------------
@node Advanced Topics
@chapter Advanced Topics
@bye