
\input texinfo @c -*-texinfo-*-
@c %**start of header
@setfilename starpu.info
@settitle StarPU
@c %**end of header
@setchapternewpage odd
@titlepage
@title StarPU
@page
@vskip 0pt plus 1filll
@comment For the @value{version-GCC} Version*
@end titlepage
@summarycontents
@contents
@page
@node Top
@top Preface
@cindex Preface
This manual documents the usage of StarPU.
@comment
@comment  When you add a new menu item, please keep the right hand
@comment  aligned to the same column. Do not use tabs.  This provides
@comment  better formatting.
@comment
@menu
* Introduction::                A basic introduction to using StarPU
* Installing StarPU::           How to configure, build and install StarPU
* Configuration options::       Configuration options
* Environment variables::       Environment variables used by StarPU
* StarPU API::                  The API to use StarPU
* Basic Examples::              Basic examples of the use of StarPU
* Advanced Topics::             Advanced use of StarPU
@end menu
@c ---------------------------------------------------------------------
@c Introduction to StarPU
@c ---------------------------------------------------------------------
@node Introduction
@chapter Introduction to StarPU
@menu
* Motivation::                  Why StarPU?
* StarPU in a Nutshell::        The Fundamentals of StarPU
@end menu
@node Motivation
@section Motivation
@c complex machines with heterogeneous cores/devices
The use of specialized hardware such as accelerators or coprocessors offers an
interesting approach to overcome the physical limits encountered by processor
architects. As a result, many machines are now equipped with one or several
accelerators (e.g. a GPU), in addition to the usual processor(s). While a lot of
effort has been devoted to offloading computation onto such accelerators, very
little attention has been paid to portability concerns on the one hand, and to
the possibility of having heterogeneous accelerators and processors interact on
the other hand.

StarPU is a runtime system that offers support for heterogeneous multicore
architectures. It not only offers a unified view of the computational resources
(i.e. CPUs and accelerators at the same time), but also takes care of
efficiently mapping and executing tasks onto a heterogeneous machine while
transparently handling low-level issues in a portable fashion.
@c this leads to a complicated distributed memory design
@c which is not (easily) manageable by hand
@c added value/benefits of StarPU
@c  - portability
@c  - scheduling, perf. portability
@node StarPU in a Nutshell
@section StarPU in a Nutshell
From a programming point of view, StarPU is not a new language but a library
that executes tasks explicitly submitted by the application. The data that a
task manipulates are automatically transferred onto the accelerator so that the
programmer does not have to take care of complex data movements. StarPU also
takes particular care of scheduling those tasks efficiently and allows
scheduling experts to implement custom scheduling policies in a portable
fashion.

@c explain the notion of codelet and task (i.e. g(A, B)
@subsection Codelet and Tasks
One of StarPU's primary data structures is the @b{codelet}. A codelet describes
a computational kernel that can possibly be implemented on multiple
architectures such as a CPU, a CUDA device or a Cell's SPU.
@c TODO insert illustration f : f_spu, f_cpu, ...

Another important data structure is the @b{task}. Executing a StarPU task
consists in applying a codelet on a data set, on one of the architectures on
which the codelet is implemented. In addition to the codelet that a task
implements, it also describes which data are accessed, and how they are
accessed during the computation (read and/or write).

StarPU tasks are asynchronous: submitting a task to StarPU is a non-blocking
operation. The task structure can also specify a @b{callback} function that is
called once StarPU has properly executed the task. It also contains optional
fields that the application may use to give hints to the scheduler (such as
priority levels).

A task may be identified by a unique 64-bit number which we refer to as a
@b{tag}. Task dependencies can be enforced either by the means of callback
functions, or by expressing dependencies between tags.
@c TODO insert illustration f(Ar, Brw, Cr) + ..
@c DSM
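
To make these notions concrete, here is a minimal sketch of a codelet with a
single CPU implementation and of the asynchronous submission of a task
executing it; the functions used here are detailed in @ref{StarPU API}, and
the @code{say_hello} function and its string argument are purely illustrative:

@cartouche
@example
#include <stdio.h>
#include <starpu.h>

void say_hello(void *buffers[], void *cl_arg)
@{
        fprintf(stderr, "Hello from %s\n", (char *)cl_arg);
@}

struct starpu_codelet cl = @{
        .where = STARPU_CPU,   /* only a CPU implementation */
        .cpu_func = say_hello,
        .nbuffers = 0          /* no data managed by StarPU */
@};

/* after starpu_init(NULL): */
struct starpu_task *task = starpu_task_create();
task->cl = &cl;
task->cl_arg = "a task";
starpu_task_submit(task);      /* non-blocking by default */
@end example
@end cartouche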
@subsection StarPU Data Management Library
Because StarPU schedules tasks at runtime, data transfers have to be
done automatically and ``just-in-time'' between processing units,
relieving the application programmer from explicit data transfers.
Moreover, to avoid unnecessary transfers, StarPU keeps data
where it was last needed, even if it was modified there, and it
allows multiple copies of the same data to reside at the same time on
several processing units as long as it is not modified.
@c ---------------------------------------------------------------------
@c Installing StarPU
@c ---------------------------------------------------------------------
@node Installing StarPU
@chapter Installing StarPU
@menu
* Configuration of StarPU::
* Building and Installing StarPU::
@end menu
StarPU can be built and installed by the standard means of the GNU
autotools. The following chapter briefly describes how these tools can
be used to install StarPU.
@node Configuration of StarPU
@section Configuration of StarPU
@menu
* Generating Makefiles and configuration scripts::
* Configuring StarPU::
@end menu
@node Generating Makefiles and configuration scripts
@subsection Generating Makefiles and configuration scripts
This step is not necessary when using the tarball releases of StarPU. If you
are using the source code from the svn repository, you first need to generate
the configure scripts and the Makefiles.
@example
$ autoreconf -vfi
@end example
@node Configuring StarPU
@subsection Configuring StarPU
@example
$ ./configure
@end example
Details about options that are useful to give to @code{./configure} are given
in @ref{Configuration options}.
@node Building and Installing StarPU
@section Building and Installing StarPU
@menu
* Building::
* Sanity Checks::
* Installing::
* pkg-config configuration::
@end menu
@node Building
@subsection Building
@example
$ make
@end example
@node Sanity Checks
@subsection Sanity Checks
In order to make sure that StarPU is working properly on the system, it is also
possible to run a test suite.
@example
$ make check
@end example
@node Installing
@subsection Installing
In order to install StarPU at the location that was specified during
configuration:
@example
$ make install
@end example
@node pkg-config configuration
@subsection pkg-config configuration
Compiling and linking an application against StarPU may require specific
flags or libraries (for instance @code{CUDA} or @code{libspe2}). To this end,
it is possible to use the @code{pkg-config} tool.

If StarPU was not installed at some standard location, the path of StarPU's
library must be specified in the @code{PKG_CONFIG_PATH} environment variable so
that @code{pkg-config} can find it. For example if StarPU was installed in
@code{$prefix_dir}:
@example
$ export PKG_CONFIG_PATH=$PKG_CONFIG_PATH:$prefix_dir/lib/pkgconfig
@end example
The flags required to compile or link against StarPU are then
accessible with the following commands:
@example
$ pkg-config --cflags libstarpu  # options for the compiler
$ pkg-config --libs libstarpu    # options for the linker
@end example
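For instance, a hypothetical @code{vector_scal.c} program can be compiled and
linked against StarPU in one command:
@example
$ gcc vector_scal.c `pkg-config --cflags libstarpu` \
      `pkg-config --libs libstarpu` -o vector_scal
@end example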
@c ---------------------------------------------------------------------
@c Configuration options
@c ---------------------------------------------------------------------
@node Configuration options
@chapter Configuration options
@table @asis
@item @code{--disable-cpu}
Disable the use of CPUs of the machine. Only GPUs etc. will be used.
@item @code{--enable-maxcudadev=<number>}
Defines the maximum number of CUDA devices that StarPU will support, then
available as the @code{STARPU_MAXCUDADEVS} macro.
@item @code{--disable-cuda}
Disable the use of CUDA, even if the SDK is detected.
@item @code{--enable-maxopencldev=<number>}
Defines the maximum number of OpenCL devices that StarPU will support, then
available as the @code{STARPU_MAXOPENCLDEVS} macro.
@item @code{--disable-opencl}
Disable the use of OpenCL, even if the SDK is detected.
@item @code{--enable-gordon}
Enable the use of the Gordon runtime for Cell SPUs.
@c TODO: rather default to enabled when detected
@item @code{--enable-debug}
Enable debugging messages.
@item @code{--enable-fast}
Do not enforce assertions; this saves a lot of time otherwise spent computing
them.
@item @code{--enable-verbose}
Augment the verbosity of the debugging messages.
@item @code{--enable-coverage}
Enable flags for the coverage tool.
@item @code{--enable-perf-debug}
Enable performance debugging.
@item @code{--enable-model-debug}
Enable performance model debugging.
@item @code{--enable-stats}
Enable statistics.
@item @code{--enable-maxbuffers=<nbuffers>}
Define the maximum number of buffers that tasks will be able to take as
parameters, then available as the @code{STARPU_NMAXBUFS} macro.
@item @code{--disable-priority}
Disable taking priorities into account in scheduling decisions. Mostly for
comparison purposes.
@item @code{--enable-allocation-cache}
Enable the use of a data allocation cache to avoid the cost of repeated memory
allocations with CUDA. Still experimental.
@item @code{--enable-opengl-render}
Enable the use of OpenGL for the rendering of some examples.
@c TODO: rather default to enabled when detected
@item @code{--enable-blas-lib=<name>}
Specify the BLAS library to be used by some of the examples. The
library has to be @code{atlas} or @code{goto}.
@item @code{--with-cuda-dir=<path>}
Specify the location where the CUDA SDK resides. This directory should notably
contain @code{include/cuda.h}.
@item @code{--with-magma=<path>}
Specify where MAGMA is installed.
@item @code{--with-opencl-dir=<path>}
Specify the location of the OpenCL SDK. This directory should notably contain
@code{include/CL/cl.h}.
@item @code{--with-gordon-dir=<path>}
Specify the location of the Gordon SDK.
@item @code{--with-fxt=<path>}
Specify the location of FxT (for generating traces and rendering them
using ViTE). This directory should notably contain
@code{include/fxt/fxt.h}.
@item @code{--with-perf-model-dir=<dir>}
Specify where performance models should be stored (instead of defaulting to the
current user's home).
@item @code{--with-mpicc=<path to mpicc>}
Specify the location of the @code{mpicc} compiler to be used for starpumpi.
@c TODO: also just use AC_PROG
@item @code{--with-mpi}
Enable building libstarpumpi.
@c TODO: rather just use the availability of mpicc instead of a second option
@item @code{--with-goto-dir=<dir>}
Specify the location of GotoBLAS.
@item @code{--with-atlas-dir=<dir>}
Specify the location of ATLAS. This directory should notably contain
@code{include/cblas.h}.
@end table
@c ---------------------------------------------------------------------
@c Environment variables
@c ---------------------------------------------------------------------
@node Environment variables
@chapter Environment variables
@menu
* Workers::                     Configuring workers
* Scheduling::                  Configuring the Scheduling engine
* Misc::                        Miscellaneous and debug
@end menu
Note: the values given in the @code{starpu_conf} structure passed when
calling @code{starpu_init} will override the values of the environment
variables.
@node Workers
@section Configuring workers
@menu
* STARPU_NCPUS::                Number of CPU workers
* STARPU_NCUDA::                Number of CUDA workers
* STARPU_NOPENCL::              Number of OpenCL workers
* STARPU_NGORDON::              Number of SPU workers (Cell)
* STARPU_WORKERS_CPUID::        Bind workers to specific CPUs
* STARPU_WORKERS_CUDAID::       Select specific CUDA devices
* STARPU_WORKERS_OPENCLID::     Select specific OpenCL devices
@end menu
@node STARPU_NCPUS
@subsection @code{STARPU_NCPUS} -- Number of CPU workers
@table @asis
@item @emph{Description}:
Specify the maximum number of CPU workers. Note that StarPU will not allocate
more CPUs than there are physical CPUs, and that some CPUs are used to control
the accelerators.
@end table
@node STARPU_NCUDA
@subsection @code{STARPU_NCUDA} -- Number of CUDA workers
@table @asis
@item @emph{Description}:
Specify the maximum number of CUDA devices that StarPU can use. If
@code{STARPU_NCUDA} is lower than the number of physical devices, it is
possible to select which CUDA devices should be used by the means of the
@code{STARPU_WORKERS_CUDAID} environment variable.
@end table
@node STARPU_NOPENCL
@subsection @code{STARPU_NOPENCL} -- Number of OpenCL workers
@table @asis
@item @emph{Description}:
OpenCL equivalent of the @code{STARPU_NCUDA} environment variable.
@end table
@node STARPU_NGORDON
@subsection @code{STARPU_NGORDON} -- Number of SPU workers (Cell)
@table @asis
@item @emph{Description}:
Specify the maximum number of SPUs that StarPU can use.
@end table
@node STARPU_WORKERS_CPUID
@subsection @code{STARPU_WORKERS_CPUID} -- Bind workers to specific CPUs
@table @asis
@item @emph{Description}:
Passing an array of integers (starting from 0) in @code{STARPU_WORKERS_CPUID}
specifies on which logical CPU the different workers should be
bound. For instance, if @code{STARPU_WORKERS_CPUID = "1 3 0 2"}, the first
worker will be bound to logical CPU #1, the second worker will be bound to
logical CPU #3 and so on. Note that the logical ordering of the CPUs is either
determined by the OS, or provided by the @code{hwloc} library when it is
available.

Note that the first workers correspond to the CUDA workers, then come the
OpenCL and the SPU workers, and finally the CPU workers. For example, if
we have @code{STARPU_NCUDA=1}, @code{STARPU_NOPENCL=1}, @code{STARPU_NCPUS=2}
and @code{STARPU_WORKERS_CPUID = "0 2 1 3"}, the CUDA device will be controlled
by logical CPU #0, the OpenCL device will be controlled by logical CPU #2, and
the logical CPUs #1 and #3 will be used by the CPU workers.

If the number of workers is larger than the array given in
@code{STARPU_WORKERS_CPUID}, the workers are bound to the logical CPUs in a
round-robin fashion: if @code{STARPU_WORKERS_CPUID = "0 1"}, the first and the
third (resp. second and fourth) workers will be put on CPU #0 (resp. CPU #1).
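
For instance, the following invocation (with a hypothetical
@code{./application}) creates one CUDA worker bound to logical CPU #0 and two
CPU workers bound to logical CPUs #2 and #1:
@example
$ STARPU_NCUDA=1 STARPU_NCPUS=2 STARPU_WORKERS_CPUID="0 2 1" ./application
@end example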
@end table
@node STARPU_WORKERS_CUDAID
@subsection @code{STARPU_WORKERS_CUDAID} -- Select specific CUDA devices
@table @asis
@item @emph{Description}:
Similarly to the @code{STARPU_WORKERS_CPUID} environment variable, it is
possible to select which CUDA devices should be used by StarPU. On a machine
equipped with 4 GPUs, setting @code{STARPU_WORKERS_CUDAID = "1 3"} and
@code{STARPU_NCUDA=2} specifies that 2 CUDA workers should be created, and that
they should use CUDA devices #1 and #3 (the logical ordering of the devices is
the one reported by CUDA).
@end table
@node STARPU_WORKERS_OPENCLID
@subsection @code{STARPU_WORKERS_OPENCLID} -- Select specific OpenCL devices
@table @asis
@item @emph{Description}:
OpenCL equivalent of the @code{STARPU_WORKERS_CUDAID} environment variable.
@end table
@node Scheduling
@section Configuring the Scheduling engine
@menu
* STARPU_SCHED::                Scheduling policy
* STARPU_CALIBRATE::            Calibrate performance models
* STARPU_PREFETCH::             Use data prefetch
* STARPU_SCHED_ALPHA::          Computation factor
* STARPU_SCHED_BETA::           Communication factor
@end menu
@node STARPU_SCHED
@subsection @code{STARPU_SCHED} -- Scheduling policy
@table @asis
@item @emph{Description}:
This chooses between the different scheduling policies proposed by StarPU:
work stealing, random, greedy, with performance models, etc.

Use @code{STARPU_SCHED=help} to get the list of available schedulers.
@end table
@node STARPU_CALIBRATE
@subsection @code{STARPU_CALIBRATE} -- Calibrate performance models
@table @asis
@item @emph{Description}:
If this variable is set to 1, the performance models are calibrated during
the execution. If it is set to 2, the previous values are dropped to restart
calibration from scratch.

Note: this currently only applies to the dm and dmda scheduling policies.
@end table
@node STARPU_PREFETCH
@subsection @code{STARPU_PREFETCH} -- Use data prefetch
@table @asis
@item @emph{Description}:
If this variable is set, data prefetching will be enabled: when a task is
scheduled to be executed e.g. on a GPU, StarPU will request an asynchronous
transfer in advance, so that data is already present on the GPU when the task
starts. As a result, computation and data transfers are overlapped.
@end table
@node STARPU_SCHED_ALPHA
@subsection @code{STARPU_SCHED_ALPHA} -- Computation factor
@table @asis
@item @emph{Description}:
To estimate the cost of a task, StarPU takes into account the estimated
computation time (obtained thanks to performance models). The alpha factor is
the coefficient to be applied to it before adding it to the communication part.
@end table
@node STARPU_SCHED_BETA
@subsection @code{STARPU_SCHED_BETA} -- Communication factor
@table @asis
@item @emph{Description}:
To estimate the cost of a task, StarPU takes into account the estimated
data transfer time (obtained thanks to performance models). The beta factor is
the coefficient to be applied to it before adding it to the computation part.
@end table
@node Misc
@section Miscellaneous and debug
@menu
* STARPU_LOGFILENAME::          Select debug file name
@end menu
@node STARPU_LOGFILENAME
@subsection @code{STARPU_LOGFILENAME} -- Select debug file name
@table @asis
@item @emph{Description}:
This variable specifies the file in which the debugging output should be saved.
@end table
@c ---------------------------------------------------------------------
@c StarPU API
@c ---------------------------------------------------------------------
@node StarPU API
@chapter StarPU API
@menu
* Initialization and Termination::  Initialization and Termination methods
* Workers' Properties::         Methods to enumerate workers' properties
* Data Library::                Methods to manipulate data
* Codelets and Tasks::          Methods to construct tasks
* Tags::                        Task dependencies
* CUDA extensions::             CUDA extensions
* OpenCL extensions::           OpenCL extensions
* Cell extensions::             Cell extensions
* Miscellaneous::               Miscellaneous helpers
@end menu
@node Initialization and Termination
@section Initialization and Termination
@menu
* starpu_init::                 Initialize StarPU
* struct starpu_conf::          StarPU runtime configuration
* starpu_shutdown::             Terminate StarPU
@end menu
@node starpu_init
@subsection @code{starpu_init} -- Initialize StarPU
@table @asis
@item @emph{Description}:
This is StarPU's initialization method, which must be called prior to any other
StarPU call. It is possible to specify StarPU's configuration (e.g. scheduling
policy, number of cores, ...) by passing a non-null argument. The default
configuration is used if the passed argument is @code{NULL}.
@item @emph{Return value}:
Upon successful completion, this function returns 0. Otherwise, @code{-ENODEV}
indicates that no worker was available (so that StarPU was not initialized).
@item @emph{Prototype}:
@code{int starpu_init(struct starpu_conf *conf);}
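@item @emph{Example}:
A minimal initialization sequence might look as follows. This is only a
sketch; the @code{-ENODEV} case is handled because no worker may be available:
@cartouche
@example
int ret = starpu_init(NULL);  /* use the default configuration */
if (ret == -ENODEV)
@{
        /* no CPU or accelerator could be used */
        fprintf(stderr, "StarPU: no worker available\n");
        return 1;
@}
@end example
@end cartouche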
@end table
@node struct starpu_conf
@subsection @code{struct starpu_conf} -- StarPU runtime configuration
@table @asis
@item @emph{Description}:
This structure is passed to the @code{starpu_init} function in order
to configure StarPU.
When the default value is used, StarPU automatically selects the number
of processing units and takes the default scheduling policy. This parameter
overwrites the equivalent environment variables.
@item @emph{Fields}:
@table @asis
@item @code{sched_policy} (default = NULL):
This is the name of the scheduling policy. This can also be specified with the
@code{STARPU_SCHED} environment variable.
@item @code{ncpus} (default = -1):
This is the maximum number of CPU cores that StarPU can use. This can also be
specified with the @code{STARPU_NCPUS} environment variable.
@item @code{ncuda} (default = -1):
This is the maximum number of CUDA devices that StarPU can use. This can also
be specified with the @code{STARPU_NCUDA} environment variable.
@item @code{nopencl} (default = -1):
This is the maximum number of OpenCL devices that StarPU can use. This can also
be specified with the @code{STARPU_NOPENCL} environment variable.
@item @code{nspus} (default = -1):
This is the maximum number of Cell SPUs that StarPU can use. This can also be
specified with the @code{STARPU_NGORDON} environment variable.
@item @code{calibrate} (default = 0):
If this flag is set, StarPU will calibrate the performance models when
executing tasks. This can also be specified with the @code{STARPU_CALIBRATE}
environment variable.
@end table
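@item @emph{Example}:
As an illustrative sketch, an application could select the greedy policy and
limit StarPU to two CPU workers, keeping the default behaviour for the other
fields (the field values shown are assumptions for the example):
@cartouche
@example
struct starpu_conf conf = @{
        .sched_policy = "greedy",
        .ncpus = 2,      /* at most 2 CPU workers */
        .ncuda = -1,     /* let StarPU decide */
        .nopencl = -1,
        .nspus = -1,
        .calibrate = 0
@};
int ret = starpu_init(&conf);
@end example
@end cartouche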
@end table
@node starpu_shutdown
@subsection @code{starpu_shutdown} -- Terminate StarPU
@table @asis
@item @emph{Description}:
This is StarPU's termination method. It must be called at the end of the
application: statistics and other post-mortem debugging information are not
guaranteed to be available until this method has been called.
@item @emph{Prototype}:
@code{void starpu_shutdown(void);}
@end table
@node Workers' Properties
@section Workers' Properties
@menu
* starpu_worker_get_count::     Get the number of processing units
* starpu_cpu_worker_get_count::  Get the number of CPUs controlled by StarPU
* starpu_cuda_worker_get_count::  Get the number of CUDA devices controlled by StarPU
* starpu_opencl_worker_get_count::  Get the number of OpenCL devices controlled by StarPU
* starpu_spu_worker_get_count::  Get the number of Cell SPUs controlled by StarPU
* starpu_worker_get_id::        Get the identifier of the current worker
* starpu_worker_get_type::      Get the type of processing unit associated to a worker
* starpu_worker_get_name::      Get the name of a worker
@end menu
@node starpu_worker_get_count
@subsection @code{starpu_worker_get_count} -- Get the number of processing units
@table @asis
@item @emph{Description}:
This function returns the number of workers (i.e. processing units executing
StarPU tasks). The returned value should be at most @code{STARPU_NMAXWORKERS}.
@item @emph{Prototype}:
@code{unsigned starpu_worker_get_count(void);}
@end table
@node starpu_cpu_worker_get_count
@subsection @code{starpu_cpu_worker_get_count} -- Get the number of CPUs controlled by StarPU
@table @asis
@item @emph{Description}:
This function returns the number of CPUs controlled by StarPU. The returned
value should be at most @code{STARPU_NMAXCPUS}.
@item @emph{Prototype}:
@code{unsigned starpu_cpu_worker_get_count(void);}
@end table
@node starpu_cuda_worker_get_count
@subsection @code{starpu_cuda_worker_get_count} -- Get the number of CUDA devices controlled by StarPU
@table @asis
@item @emph{Description}:
This function returns the number of CUDA devices controlled by StarPU. The
returned value should be at most @code{STARPU_MAXCUDADEVS}.
@item @emph{Prototype}:
@code{unsigned starpu_cuda_worker_get_count(void);}
@end table
@node starpu_opencl_worker_get_count
@subsection @code{starpu_opencl_worker_get_count} -- Get the number of OpenCL devices controlled by StarPU
@table @asis
@item @emph{Description}:
This function returns the number of OpenCL devices controlled by StarPU. The
returned value should be at most @code{STARPU_MAXOPENCLDEVS}.
@item @emph{Prototype}:
@code{unsigned starpu_opencl_worker_get_count(void);}
@end table
@node starpu_spu_worker_get_count
@subsection @code{starpu_spu_worker_get_count} -- Get the number of Cell SPUs controlled by StarPU
@table @asis
@item @emph{Description}:
This function returns the number of Cell SPUs controlled by StarPU.
@item @emph{Prototype}:
@code{unsigned starpu_spu_worker_get_count(void);}
@end table
@node starpu_worker_get_id
@subsection @code{starpu_worker_get_id} -- Get the identifier of the current worker
@table @asis
@item @emph{Description}:
This function returns the identifier of the worker associated to the calling
thread. The returned value is either -1 if the current context is not a StarPU
worker (i.e. when called from the application outside a task or a callback), or
an integer between 0 and @code{starpu_worker_get_count() - 1}.
@item @emph{Prototype}:
@code{int starpu_worker_get_id(void);}
@end table
@node starpu_worker_get_type
@subsection @code{starpu_worker_get_type} -- Get the type of processing unit associated to a worker
@table @asis
@item @emph{Description}:
This function returns the type of worker associated to an identifier (as
returned by the @code{starpu_worker_get_id} function). The returned value
indicates the architecture of the worker: @code{STARPU_CPU_WORKER} for a CPU
core, @code{STARPU_CUDA_WORKER} for a CUDA device,
@code{STARPU_OPENCL_WORKER} for an OpenCL device, and
@code{STARPU_GORDON_WORKER} for a Cell SPU. The value returned for an invalid
identifier is unspecified.
@item @emph{Prototype}:
@code{enum starpu_archtype starpu_worker_get_type(int id);}
@end table
@node starpu_worker_get_name
@subsection @code{starpu_worker_get_name} -- Get the name of a worker
@table @asis
@item @emph{Description}:
StarPU associates a unique human-readable string with each processing unit.
This function copies at most the first @code{maxlen} bytes of the unique string
associated with the worker identified by @code{id} into the @code{dst} buffer.
The caller is responsible for ensuring that @code{dst} is a valid pointer to a
buffer of at least @code{maxlen} bytes. Calling this function on an invalid
identifier results in unspecified behaviour.
@item @emph{Prototype}:
@code{void starpu_worker_get_name(int id, char *dst, size_t maxlen);}
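@item @emph{Example}:
Combining the functions above, a small sketch can enumerate the workers and
print their names:
@cartouche
@example
unsigned id;
for (id = 0; id < starpu_worker_get_count(); id++)
@{
        char name[64];
        starpu_worker_get_name(id, name, sizeof(name));
        fprintf(stderr, "worker %u: %s\n", id, name);
@}
@end example
@end cartouche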
@end table
@node Data Library
@section Data Library
This section describes the data management facilities provided by StarPU.
TODO: We show how to use existing data interfaces in [ref], but developers can
design their own data interfaces if required.
@menu
* starpu_data_handle::          StarPU opaque data handle
* void *interface::             StarPU data interface
@end menu
@node starpu_data_handle
@subsection @code{starpu_data_handle} -- StarPU opaque data handle
@table @asis
@item @emph{Description}:
StarPU uses @code{starpu_data_handle} as an opaque handle to manage a piece of
data. Once a piece of data has been registered to StarPU, it is associated to a
@code{starpu_data_handle} which keeps track of the state of the piece of data
over the entire machine, so that data consistency can be maintained and data
replicas can be located, for instance.
@end table
@node void *interface
@subsection @code{void *interface} -- StarPU data interface
@table @asis
@item @emph{Description}:
Data management is done at a high level in StarPU: rather than accessing a mere
list of contiguous buffers, the tasks may manipulate data that are described by
a high-level construct which we call a data interface.
TODO
@end table
@c void starpu_data_unregister(struct starpu_data_state_t *state);
@c starpu_worker_get_memory_node TODO
@c
@c user interaction with the DSM
@c void starpu_data_sync_with_mem(struct starpu_data_state_t *state);
@c void starpu_notify_data_modification(struct starpu_data_state_t *state, uint32_t modifying_node);
@node Codelets and Tasks
@section Codelets and Tasks
@menu
* struct starpu_codelet::       StarPU codelet structure
* struct starpu_task::          StarPU task structure
* starpu_task_init::            Initialize a Task
* starpu_task_create::          Allocate and Initialize a Task
* starpu_task_deinit::          Release all the resources used by a Task
* starpu_task_destroy::         Destroy a dynamically allocated Task
* starpu_task_submit::          Submit a Task
* starpu_task_wait::            Wait for the termination of a Task
* starpu_task_wait_for_all::    Wait for the termination of all Tasks
@end menu
@node struct starpu_codelet
@subsection @code{struct starpu_codelet} -- StarPU codelet structure
@table @asis
@item @emph{Description}:
The codelet structure describes a kernel that is possibly implemented on
various targets.
@item @emph{Fields}:
@table @asis
@item @code{where}:
Indicates which types of processing units are able to execute the codelet.
@code{STARPU_CPU|STARPU_CUDA} for instance indicates that the codelet is
implemented for both CPU cores and CUDA devices while @code{STARPU_GORDON}
indicates that it is only available on Cell SPUs.
@item @code{cpu_func} (optional):
Is a function pointer to the CPU implementation of the codelet. Its prototype
must be: @code{void cpu_func(void *buffers[], void *cl_arg)}. The first
argument is the array of data managed by the data management library, and
the second argument is a pointer to the argument passed through the
@code{cl_arg} field of the @code{starpu_task} structure.
The @code{cpu_func} field is ignored if @code{STARPU_CPU} does not appear in
the @code{where} field; it must be non-null otherwise.
@item @code{cuda_func} (optional):
Is a function pointer to the CUDA implementation of the codelet. @emph{This
must be a host function written in the CUDA runtime API}. Its prototype must
be: @code{void cuda_func(void *buffers[], void *cl_arg);}. The @code{cuda_func}
field is ignored if @code{STARPU_CUDA} does not appear in the @code{where}
field; it must be non-null otherwise.
@item @code{opencl_func} (optional):
Is a function pointer to the OpenCL implementation of the codelet. Its
prototype must be:
@code{void opencl_func(starpu_data_interface_t *descr, void *arg);}.
This pointer is ignored if @code{STARPU_OPENCL} does not appear in the
@code{where} field; it must be non-null otherwise.
@item @code{gordon_func} (optional):
This is the index of the Cell SPU implementation within the Gordon library.
TODO
@item @code{nbuffers}:
Specifies the number of arguments taken by the codelet. These arguments are
managed by the DSM and are accessed from the @code{void *buffers[]}
array. The constant argument passed with the @code{cl_arg} field of the
@code{starpu_task} structure is not counted in this number. This value should
not be above @code{STARPU_NMAXBUFS}.
@item @code{model} (optional):
This is a pointer to the performance model associated to this codelet. This
optional field is ignored when null. TODO
@end table
@end table
@node struct starpu_task
@subsection @code{struct starpu_task} -- StarPU task structure
@table @asis
@item @emph{Description}:
The @code{starpu_task} structure describes a task that can be offloaded on the
various processing units managed by StarPU. It instantiates a codelet. It can
either be allocated dynamically with the @code{starpu_task_create} method, or
declared statically. In the latter case, the programmer has to zero the
@code{starpu_task} structure and to fill the different fields properly. The
indicated default values correspond to the configuration of a task allocated
with @code{starpu_task_create}.
@item @emph{Fields}:
@table @asis
@item @code{cl}:
Is a pointer to the corresponding @code{starpu_codelet} data structure. This
describes where the kernel should be executed, and supplies the appropriate
implementations. When set to @code{NULL}, no code is executed during the task;
such empty tasks can be useful for synchronization purposes.
@item @code{buffers}:
TODO
@item @code{cl_arg} (optional) (default = NULL):
This pointer is passed to the codelet through the second argument
of the codelet implementation (e.g. @code{cpu_func} or @code{cuda_func}).
In the specific case of the Cell processor, see the @code{cl_arg_size}
argument.
@item @code{cl_arg_size} (optional, Cell-specific):
In the case of the Cell processor, the @code{cl_arg} pointer is not directly
given to the SPU function. A buffer of size @code{cl_arg_size} is allocated on
the SPU. This buffer is then filled with the @code{cl_arg_size} bytes starting
at address @code{cl_arg}. In this case, the argument given to the SPU codelet
is therefore not the @code{cl_arg} pointer, but the address of the buffer in
local store (LS) instead. This field is ignored for CPU, CUDA and OpenCL
codelets.
@item @code{callback_func} (optional) (default = @code{NULL}):
This is a function pointer of prototype @code{void (*f)(void *)} which
specifies a possible callback. If this pointer is non-null, the callback
function is executed @emph{on the host} after the execution of the task. The
callback is passed the value contained in the @code{callback_arg} field. No
callback is executed if the field is null.
@item @code{callback_arg} (optional) (default = @code{NULL}):
This is the pointer passed to the callback function. This field is ignored if
the @code{callback_func} field is null.
@item @code{use_tag} (optional) (default = 0):
If set, this flag indicates that the task should be associated with the tag
contained in the @code{tag_id} field. Tags allow the application to synchronize
with the task and to express task dependencies easily.
@item @code{tag_id}:
This field contains the tag associated with the task if the @code{use_tag}
field was set; it is ignored otherwise.
@item @code{synchronous}:
If this flag is set, the @code{starpu_task_submit} function is blocking and
returns only when the task has been executed (or if no worker is able to
process the task). Otherwise, @code{starpu_task_submit} returns immediately.
@item @code{priority} (optional) (default = @code{STARPU_DEFAULT_PRIO}):
This field indicates a level of priority for the task. This is an integer value
that must be set between @code{STARPU_MIN_PRIO} (for the least important
tasks) and @code{STARPU_MAX_PRIO} (for the most important tasks) included.
Default priority is @code{STARPU_DEFAULT_PRIO}. Scheduling strategies that
take priorities into account can use this parameter to take better scheduling
decisions, but the scheduling policy may also ignore it.
@item @code{execute_on_a_specific_worker} (default = 0):
If this flag is set, StarPU will bypass the scheduler and directly assign this
task to the worker specified by the @code{workerid} field.
@item @code{workerid} (optional):
If the @code{execute_on_a_specific_worker} field is set, this field indicates
the identifier of the worker that should process this task (as
returned by @code{starpu_worker_get_id}). This field is ignored if the
@code{execute_on_a_specific_worker} field is set to 0.
@item @code{detach} (optional) (default = 1):
If this flag is set, it is not possible to synchronize with the task
by the means of @code{starpu_task_wait} later on. Internal data structures
are only guaranteed to be freed once @code{starpu_task_wait} is called if the
flag is not set.
@item @code{destroy} (optional) (default = 1):
If this flag is set, the task structure will automatically be freed, either
after the execution of the callback if the task is detached, or during
@code{starpu_task_wait} otherwise. If this flag is not set, dynamically
allocated data structures will not be freed until @code{starpu_task_destroy} is
called explicitly. Setting this flag for a statically allocated task structure
will result in undefined behaviour.
@end table
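@item @emph{Example}:
As a sketch, a statically allocated task can be prepared as follows (the
codelet @code{cl} is assumed to be defined elsewhere; @code{starpu_task_init}
is described below):
@cartouche
@example
struct starpu_task task;
starpu_task_init(&task);  /* fill the fields with default values */
task.cl = &cl;
task.synchronous = 1;     /* starpu_task_submit will block */
task.destroy = 0;         /* never set destroy on a static task */
int ret = starpu_task_submit(&task);
starpu_task_deinit(&task);
@end example
@end cartouche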
@end table
@node starpu_task_init
@subsection @code{starpu_task_init} -- Initialize a Task
@table @asis
@item @emph{Description}:
Initialize a task structure with default values. This function is implicitly
called by @code{starpu_task_create}. By default, tasks initialized with
@code{starpu_task_init} must be deinitialized explicitly with
@code{starpu_task_deinit}. Tasks can also be initialized statically, using the
constant @code{STARPU_TASK_INITIALIZER}.
@item @emph{Prototype}:
@code{void starpu_task_init(struct starpu_task *task);}
@end table
@node starpu_task_create
@subsection @code{starpu_task_create} -- Allocate and Initialize a Task
@table @asis
@item @emph{Description}:
Allocate a task structure and initialize it with default values. Tasks
allocated dynamically with @code{starpu_task_create} are automatically freed
when the task is terminated. If the destroy flag is explicitly unset, the
resources used by the task must be freed by calling
@code{starpu_task_destroy}.
@item @emph{Prototype}:
@code{struct starpu_task *starpu_task_create(void);}
@end table
@node starpu_task_deinit
@subsection @code{starpu_task_deinit} -- Release all the resources used by a Task
@table @asis
@item @emph{Description}:
Release all the structures automatically allocated to execute the task, without
freeing the task structure itself. This function is called implicitly by
@code{starpu_task_destroy}; it should be used for statically allocated tasks
for instance.
@item @emph{Prototype}:
@code{void starpu_task_deinit(struct starpu_task *task);}
@end table
@node starpu_task_destroy
@subsection @code{starpu_task_destroy} -- Destroy a dynamically allocated Task
@table @asis
@item @emph{Description}:
Free the resources allocated during @code{starpu_task_create}. This function
can be called automatically after the execution of a task by setting the
@code{destroy} flag of the @code{starpu_task} structure (default behaviour).
Calling this function on a statically allocated task results in undefined
behaviour.
@item @emph{Prototype}:
@code{void starpu_task_destroy(struct starpu_task *task);}
@end table
@node starpu_task_wait
@subsection @code{starpu_task_wait} -- Wait for the termination of a Task
@table @asis
@item @emph{Description}:
This function blocks until the task has been executed. It is not possible to
synchronize with a task more than once. It is not possible to wait for
synchronous or detached tasks.
@item @emph{Return value}:
Upon successful completion, this function returns 0. Otherwise, @code{-EINVAL}
indicates that the specified task was either synchronous or detached.
@item @emph{Prototype}:
@code{int starpu_task_wait(struct starpu_task *task);}
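@item @emph{Example}:
As a sketch, waiting for a task requires clearing its @code{detach} flag before
submission (the codelet @code{cl} is assumed to be defined elsewhere):
@cartouche
@example
struct starpu_task *task = starpu_task_create();
task->cl = &cl;
task->detach = 0;   /* otherwise starpu_task_wait is not allowed */
starpu_task_submit(task);
int ret = starpu_task_wait(task);  /* blocks until termination */
@end example
@end cartouche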
@end table
@node starpu_task_submit
@subsection @code{starpu_task_submit} -- Submit a Task
@table @asis
@item @emph{Description}:
This function submits a task to StarPU. Calling this function does
not mean that the task will be executed immediately as there can be data or
task (tag) dependencies that are not fulfilled yet: StarPU will take care of
scheduling this task with respect to such dependencies.
This function returns immediately if the @code{synchronous} field of the
@code{starpu_task} structure was set to 0, and blocks until the termination of
the task otherwise. It is also possible to synchronize the application with
asynchronous tasks by the means of tags, using the @code{starpu_tag_wait}
function for instance.
@item @emph{Return value}:
In case of success, this function returns 0; a return value of @code{-ENODEV}
means that there is no worker able to process this task (e.g. there is no GPU
available and this task is only implemented for CUDA devices).
@item @emph{Prototype}:
@code{int starpu_task_submit(struct starpu_task *task);}
@end table
@node starpu_task_wait_for_all
@subsection @code{starpu_task_wait_for_all} -- Wait for the termination of all Tasks
@table @asis
@item @emph{Description}:
This function blocks until all the tasks that were submitted are terminated.
@item @emph{Prototype}:
@code{void starpu_task_wait_for_all(void);}
@end table
@c Callbacks : what can we put in callbacks ?
@node Tags
@section Tags
@menu
* starpu_tag_t::                Task identifier
* starpu_tag_declare_deps::     Declare the Dependencies of a Tag
* starpu_tag_declare_deps_array::  Declare the Dependencies of a Tag
* starpu_tag_wait::             Block until a Tag is terminated
* starpu_tag_wait_array::       Block until a set of Tags is terminated
* starpu_tag_remove::           Destroy a Tag
* starpu_tag_notify_from_apps::  Feed a tag explicitly
@end menu
@node starpu_tag_t
@subsection @code{starpu_tag_t} -- Task identifier
@table @asis
@item @emph{Description}:
It is possible to associate a task with a unique ``tag'' and to express
dependencies between tasks by the means of those tags. To do so, fill the
@code{tag_id} field of the @code{starpu_task} structure with a tag number
(which can be arbitrary) and set the @code{use_tag} field to 1.
If @code{starpu_tag_declare_deps} is called with this tag number, the task will
not be started until the tasks which hold the declared dependency tags are
completed.
@end table
@node starpu_tag_declare_deps
@subsection @code{starpu_tag_declare_deps} -- Declare the Dependencies of a Tag
@table @asis
@item @emph{Description}:
Specify the dependencies of the task identified by tag @code{id}. The first
argument specifies the tag which is configured, the second argument gives the
number of tag(s) on which @code{id} depends. The following arguments are the
tags which have to be terminated to unlock the task.
This function must be called before the associated task is submitted to StarPU
with @code{starpu_task_submit}.
@item @emph{Remark}
Because of the variable arity of @code{starpu_tag_declare_deps}, note that the
last arguments @emph{must} be of type @code{starpu_tag_t}: constant values
typically need to be explicitly cast. Using the
@code{starpu_tag_declare_deps_array} function avoids this hazard.
@item @emph{Prototype}:
@code{void starpu_tag_declare_deps(starpu_tag_t id, unsigned ndeps, ...);}
@item @emph{Example}:
@cartouche
@example
/* Tag 0x1 depends on tags 0x32 and 0x52 */
starpu_tag_declare_deps((starpu_tag_t)0x1,
        2, (starpu_tag_t)0x32, (starpu_tag_t)0x52);
@end example
@end cartouche
@end table
@node starpu_tag_declare_deps_array
@subsection @code{starpu_tag_declare_deps_array} -- Declare the Dependencies of a Tag
@table @asis
@item @emph{Description}:
This function is similar to @code{starpu_tag_declare_deps}, except that it
does not take a variable number of arguments but an array of tags of size
@code{ndeps}.
@item @emph{Prototype}:
@code{void starpu_tag_declare_deps_array(starpu_tag_t id, unsigned ndeps, starpu_tag_t *array);}
@item @emph{Example}:
@cartouche
@example
/* Tag 0x1 depends on tags 0x32 and 0x52 */
starpu_tag_t tag_array[2] = @{0x32, 0x52@};
starpu_tag_declare_deps_array((starpu_tag_t)0x1, 2, tag_array);
@end example
@end cartouche
@end table
@node starpu_tag_wait
@subsection @code{starpu_tag_wait} -- Block until a Tag is terminated
@table @asis
@item @emph{Description}:
This function blocks until the task associated to tag @code{id} has been
executed. This is a blocking call which must therefore not be called within
tasks or callbacks, but only from the application directly. It is possible to
synchronize with the same tag multiple times, as long as the
@code{starpu_tag_remove} function is not called. Note that it is still
possible to synchronize with a tag associated to a task whose
@code{starpu_task} data structure was freed (e.g. if the @code{destroy} flag of
the @code{starpu_task} was enabled).
@item @emph{Prototype}:
@code{void starpu_tag_wait(starpu_tag_t id);}
@end table
@node starpu_tag_wait_array
@subsection @code{starpu_tag_wait_array} -- Block until a set of Tags is terminated
@table @asis
@item @emph{Description}:
This function is similar to @code{starpu_tag_wait} except that it blocks until
@emph{all} the @code{ntags} tags contained in the @code{id} array are
terminated.
@item @emph{Prototype}:
@code{void starpu_tag_wait_array(unsigned ntags, starpu_tag_t *id);}
@end table
@node starpu_tag_remove
@subsection @code{starpu_tag_remove} -- Destroy a Tag
@table @asis
@item @emph{Description}:
This function releases the resources associated to tag @code{id}. It can be
called once the corresponding task has been executed and when there is
no other tag that depends on this tag anymore.
@item @emph{Prototype}:
@code{void starpu_tag_remove(starpu_tag_t id);}
@end table
@node starpu_tag_notify_from_apps
@subsection @code{starpu_tag_notify_from_apps} -- Feed a Tag explicitly
@table @asis
@item @emph{Description}:
This function explicitly unlocks tag @code{id}. It may be useful in the
case of applications which execute part of their computation outside StarPU
tasks (e.g. third-party libraries). It is also provided as a
convenient tool for the programmer, for instance to entirely construct the task
DAG before actually giving StarPU the opportunity to execute the tasks.
@item @emph{Prototype}:
@code{void starpu_tag_notify_from_apps(starpu_tag_t id);}
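@item @emph{Example}:
As a sketch, a submitted task can be made to wait for a computation performed
outside StarPU (the tag numbers are arbitrary):
@cartouche
@example
/* the task associated with tag 0x1 also depends on tag 0x42,
 * which no StarPU task will terminate */
starpu_tag_declare_deps((starpu_tag_t)0x1, 1, (starpu_tag_t)0x42);
/* ... submit the task associated with tag 0x1 ... */
/* once the external computation is done, unlock tag 0x42 */
starpu_tag_notify_from_apps((starpu_tag_t)0x42);
@end example
@end cartouche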
@end table
@node CUDA extensions
@section CUDA extensions
@c void starpu_data_malloc_pinned_if_possible(float **A, size_t dim);
@menu
* starpu_cuda_get_local_stream:: Get current worker's CUDA stream
* starpu_helper_cublas_init:: Initialize CUBLAS on every CUDA device
* starpu_helper_cublas_shutdown:: Deinitialize CUBLAS on every CUDA device
@end menu
@node starpu_cuda_get_local_stream
@subsection @code{starpu_cuda_get_local_stream} -- Get current worker's CUDA stream
@table @asis
@item @emph{Description}:
StarPU provides a stream for every CUDA device controlled by StarPU. This
function is only provided for convenience so that programmers can easily use
asynchronous operations within codelets without having to create a stream by
hand. Note that the application is not forced to use the stream provided by
@code{starpu_cuda_get_local_stream} and may also create its own streams.
@item @emph{Prototype}:
@code{cudaStream_t *starpu_cuda_get_local_stream(void);}
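@item @emph{Example}:
A sketch of a CUDA codelet launching a kernel on the local stream
(@code{my_kernel} is a placeholder for an actual CUDA kernel; since the
function returns a @code{cudaStream_t *}, the stream is dereferenced):
@cartouche
@example
extern "C" void cuda_codelet(void *descr[], void *_args)
@{
    float *v = (float *)STARPU_GET_VECTOR_PTR(descr[0]);
    unsigned n = STARPU_GET_VECTOR_NX(descr[0]);

    /* launch asynchronously on the worker's stream, then wait */
    my_kernel<<<1, 1, 0, *starpu_cuda_get_local_stream()>>>(v, n);
    cudaStreamSynchronize(*starpu_cuda_get_local_stream());
@}
@end example
@end cartouche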
@end table
@node starpu_helper_cublas_init
@subsection @code{starpu_helper_cublas_init} -- Initialize CUBLAS on every CUDA device
@table @asis
@item @emph{Description}:
The CUBLAS library must be initialized prior to any CUBLAS call. Calling
@code{starpu_helper_cublas_init} will initialize CUBLAS on every CUDA device
controlled by StarPU. This call blocks until CUBLAS has been properly
initialized on every device.
@item @emph{Prototype}:
@code{void starpu_helper_cublas_init(void);}
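@item @emph{Example}:
A typical usage sketch: initialize CUBLAS right after StarPU, and shut it
down right before StarPU.
@cartouche
@example
starpu_init(NULL);
starpu_helper_cublas_init();

/* ... submit tasks whose CUDA implementations call CUBLAS ... */

starpu_helper_cublas_shutdown();
starpu_shutdown();
@end example
@end cartouche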
@end table
@node starpu_helper_cublas_shutdown
@subsection @code{starpu_helper_cublas_shutdown} -- Deinitialize CUBLAS on every CUDA device
@table @asis
@item @emph{Description}:
This function synchronously deinitializes the CUBLAS library on every CUDA device.
@item @emph{Prototype}:
@code{void starpu_helper_cublas_shutdown(void);}
@end table
@node OpenCL extensions
@section OpenCL extensions
@menu
* Enabling OpenCL:: Enabling OpenCL
* Compiling OpenCL codelets:: Compiling OpenCL codelets
@end menu
@node Enabling OpenCL
@subsection Enabling OpenCL
On GPU devices which can run both CUDA and OpenCL, CUDA will be
enabled by default. To enable OpenCL, you need either to disable CUDA
when configuring StarPU:
@example
$ ./configure --disable-cuda
@end example
or to do so when running applications:
@example
$ STARPU_NCUDA=0 ./application
@end example

OpenCL will automatically be started on any device not yet used by
CUDA. On a machine with four GPUs, it is therefore possible to
enable CUDA on two devices and OpenCL on the other two as follows:
@example
$ STARPU_NCUDA=2 ./application
@end example
@node Compiling OpenCL codelets
@subsection Compiling OpenCL codelets
TODO
@node Cell extensions
@section Cell extensions
Nothing yet.
@node Miscellaneous
@section Miscellaneous helpers
@menu
* starpu_execute_on_each_worker:: Execute a function on a subset of workers
@end menu
@node starpu_execute_on_each_worker
@subsection @code{starpu_execute_on_each_worker} -- Execute a function on a subset of workers
@table @asis
@item @emph{Description}:
The function specified by the first argument is executed by every StarPU
worker that is able to execute it. The second argument is passed to that
function. The last argument specifies on which types of processing units the
function should be executed. Similarly to the @code{where} field of the
@code{starpu_codelet} structure, it is possible to specify that the function
should be executed on every CUDA device and every CPU by passing
@code{STARPU_CPU|STARPU_CUDA}.
This call blocks until the function has been executed on every appropriate
processing unit, so it must not be called from a callback function, for
instance.
@item @emph{Prototype}:
@code{void starpu_execute_on_each_worker(void (*func)(void *), void *arg, uint32_t where);}
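@item @emph{Example}:
A minimal sketch (@code{init_func} is a placeholder): run a per-worker
initialization function on every CPU and CUDA worker.
@cartouche
@example
void init_func(void *arg)
@{
    /* executed once by each selected worker */
    printf("worker initialized (arg %p)\n", arg);
@}

starpu_execute_on_each_worker(init_func, NULL, STARPU_CPU|STARPU_CUDA);
@end example
@end cartouche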
@end table
@c ---------------------------------------------------------------------
@c Basic Examples
@c ---------------------------------------------------------------------
@node Basic Examples
@chapter Basic Examples
@menu
* Compiling and linking:: Compiling and Linking Options
* Hello World:: Submitting Tasks
* Scaling a Vector:: Manipulating Data
* Scaling a Vector (hybrid):: Handling Heterogeneous Architectures
@end menu
@node Compiling and linking
@section Compiling and linking options
The Makefile could for instance contain the following lines to define which
options must be given to the compiler and to the linker:
@cartouche
@example
CFLAGS += $$(pkg-config --cflags libstarpu)
LIBS   += $$(pkg-config --libs libstarpu)
@end example
@end cartouche
@node Hello World
@section Hello World
In this section, we show how to implement a simple program that submits a task to StarPU.
@subsection Required Headers
The @code{starpu.h} header should be included in any code using StarPU.
@cartouche
@example
#include <starpu.h>
@end example
@end cartouche
@subsection Defining a Codelet
@cartouche
@example
void cpu_func(void *buffers[], void *cl_arg)
@{
    float *array = cl_arg;

    printf("Hello world (array = @{%f, %f@} )\n", array[0], array[1]);
@}

starpu_codelet cl =
@{
    .where = STARPU_CPU,
    .cpu_func = cpu_func,
    .nbuffers = 0
@};
@end example
@end cartouche

A codelet is a structure that represents a computational kernel. Such a codelet
may contain an implementation of the same kernel on different architectures
(e.g. CUDA, Cell's SPU, x86, ...).

The @code{nbuffers} field specifies the number of data buffers that are
manipulated by the codelet: here the codelet does not access or modify any data
that is controlled by our data management library. Note that the argument
passed to the codelet (the @code{cl_arg} field of the @code{starpu_task}
structure) does not count as a buffer since it is not managed by our data
management library.

@c TODO need a crossref to the proper description of "where" see bla for more ...
We create a codelet which may only be executed on the CPUs. The @code{where}
field is a bitmask that defines where the codelet may be executed. Here, the
@code{STARPU_CPU} value means that only CPUs can execute this codelet
(@pxref{Codelets and Tasks} for more details on this field).

When a CPU core executes a codelet, it calls the @code{cpu_func} function,
which @emph{must} have the following prototype:

@code{void (*cpu_func)(void *buffers[], void *cl_arg)}

In this example, we can ignore the first argument of this function which gives a
description of the input and output buffers (e.g. the size and the location of
the matrices). The second argument is a pointer to a buffer passed as an
argument to the codelet by means of the @code{cl_arg} field of the
@code{starpu_task} structure.

Be aware that this may be a pointer to a
@emph{copy} of the actual buffer, and not the pointer given by the programmer:
if the codelet modifies this buffer, there is no guarantee that the initial
buffer will be modified as well; this implies, for instance, that the buffer
cannot be used as a synchronization medium.
@subsection Submitting a Task
@cartouche
@example
void callback_func(void *callback_arg)
@{
    printf("Callback function (arg %p)\n", callback_arg);
@}

int main(int argc, char **argv)
@{
    /* initialize StarPU */
    starpu_init(NULL);

    struct starpu_task *task = starpu_task_create();

    task->cl = &cl;

    float array[2] = @{1.0f, -1.0f@};
    task->cl_arg = &array;
    task->cl_arg_size = 2*sizeof(float);

    task->callback_func = callback_func;
    task->callback_arg = (void *)0x42;

    /* starpu_task_submit will be a blocking call */
    task->synchronous = 1;

    /* submit the task to StarPU */
    starpu_task_submit(task);

    /* terminate StarPU */
    starpu_shutdown();

    return 0;
@}
@end example
@end cartouche
Before submitting any tasks to StarPU, @code{starpu_init} must be called. The
@code{NULL} argument specifies that we use the default configuration. Tasks
cannot be submitted after the termination of StarPU by a call to
@code{starpu_shutdown}.

In the example above, a task structure is allocated by a call to
@code{starpu_task_create}. This function only allocates and fills the
corresponding structure with the default settings (@pxref{starpu_task_create}),
but it does not submit the task to StarPU.

The @code{cl} field is a pointer to the codelet which the task will
execute: in other words, the codelet structure describes which computational
kernel should be offloaded on the different architectures, and the task
structure is a wrapper containing a codelet and the piece of data on which the
codelet should operate.

The optional @code{cl_arg} field is a pointer to a buffer (of size
@code{cl_arg_size}) with some parameters for the kernel
described by the codelet. For instance, if a codelet implements a computational
kernel that multiplies its input vector by a constant, the constant could be
specified by means of this buffer.

Once a task has been executed, an optional callback function can be called.
While the computational kernel could be offloaded on various architectures, the
callback function is always executed on a CPU. The @code{callback_arg}
pointer is passed as an argument to the callback. The prototype of a callback
function must be:
@cartouche
@example
void (*callback_function)(void *);
@end example
@end cartouche

If the @code{synchronous} field is non-zero, task submission will be
synchronous: the @code{starpu_task_submit} function will not return until the
task has been executed. Note that the @code{starpu_shutdown} method does not
guarantee that asynchronous tasks have been executed before it returns.
@node Scaling a Vector
@section Manipulating Data: Scaling a Vector
The previous example has shown how to submit tasks. In this section, we show
how StarPU tasks can manipulate data.

Programmers can describe the data layout of their application so that StarPU is
responsible for enforcing data coherency and availability across the machine.
Instead of handling complex (and non-portable) mechanisms to perform data
movements, programmers only declare which piece of data is accessed and/or
modified by a task, and StarPU makes sure that when a computational kernel
starts somewhere (e.g. on a GPU), its data are available locally.

Before submitting those tasks, the programmer first needs to declare the
different pieces of data to StarPU using the @code{starpu_*_data_register}
functions. To ease the development of applications for StarPU, it is possible
to describe multiple types of data layout. A type of data layout is called an
@b{interface}. Several interfaces are available in StarPU by default;
here we will consider the @b{vector interface}.

The following lines show how to declare an array of @code{n} elements of type
@code{float} using the vector interface:
@cartouche
@example
float tab[n];
starpu_data_handle tab_handle;

starpu_vector_data_register(&tab_handle, 0, (uintptr_t)tab, n, sizeof(float));
@end example
@end cartouche
The first argument, called the @b{data handle}, is an opaque pointer which
designates the array in StarPU. This is also the structure which is used to
describe which data is used by a task. The second argument is the node number
where the data currently resides. Here it is 0 since the @code{tab} array is in
the main memory. Then comes the pointer @code{tab} where the data can be found,
the number of elements in the vector and the size of each element.

It is possible to construct a StarPU
task that multiplies this vector by a constant factor:
@cartouche
@example
float factor;
struct starpu_task *task = starpu_task_create();

task->cl = &cl;
task->buffers[0].handle = tab_handle;
task->buffers[0].mode = STARPU_RW;
task->cl_arg = &factor;
task->cl_arg_size = sizeof(float);
@end example
@end cartouche

Since the factor is constant, it does not need a preliminary declaration, and
can just be passed through the @code{cl_arg} pointer as in the previous
example. The vector parameter is described by its handle.
There are two fields in each element of the @code{buffers} array.
@code{handle} is the handle of the data, and @code{mode} specifies how the
kernel will access the data (@code{STARPU_R} for read-only, @code{STARPU_W} for
write-only and @code{STARPU_RW} for read and write access).
The definition of the codelet can be written as follows:
@cartouche
@example
void scal_func(void *buffers[], void *cl_arg)
@{
    unsigned i;
    float *factor = cl_arg;
    struct starpu_vector_interface_s *vector = buffers[0];

    /* length of the vector */
    unsigned n = vector->nx;

    /* local copy of the vector pointer */
    float *val = (float *)vector->ptr;

    for (i = 0; i < n; i++)
        val[i] *= *factor;
@}

starpu_codelet cl = @{
    .where = STARPU_CPU,
    .cpu_func = scal_func,
    .nbuffers = 1
@};
@end example
@end cartouche

The second argument of the @code{scal_func} function contains a pointer to the
parameters of the codelet (given in @code{task->cl_arg}), so that we read the
constant factor from this pointer. The first argument is an array that gives
a description of every buffer passed in the @code{task->buffers} array. The
size of this array is given by the @code{nbuffers} field of the codelet
structure. For the sake of generality, this array contains pointers to the
different interfaces describing each buffer. In the case of the @b{vector
interface}, the location of the vector (resp. its length) is accessible in the
@code{ptr} (resp. @code{nx}) field of this interface. Since the vector is
accessed in a read-write fashion, any modification will automatically affect
future accesses to this vector made by other tasks.
@node Scaling a Vector (hybrid)
@section Vector Scaling on a Hybrid CPU/GPU Machine
@menu
* Source code:: Source of the StarPU application
* Compilation and execution:: Executing the StarPU application
@end menu
@node Source code
@subsection Source code
Contrary to the previous examples, the task submitted in this example may not
only be executed by the CPUs, but also by a CUDA device.

The CUDA implementation can be written as follows. It needs to be
compiled with a CUDA compiler such as nvcc, the NVIDIA CUDA compiler
driver.
@cartouche
@example
#include <starpu.h>

static __global__ void vector_mult_cuda(float *vector, int nx,
                                        float *multiplier)
@{
    int i;
    for(i = 0 ; i < nx ; i++)
        vector[i] *= *multiplier;
@}

extern "C" void cuda_codelet(void *descr[], void *_args)
@{
    float *vector = (float *)STARPU_GET_VECTOR_PTR(descr[0]);
    int nx = STARPU_GET_VECTOR_NX(descr[0]);
    float *multiplier = (float *)STARPU_GET_VARIABLE_PTR(descr[1]);

    vector_mult_cuda<<<1,1>>>(vector, nx, multiplier);
@}
@end example
@end cartouche
The CPU implementation can be written as follows.
@cartouche
@example
#include <starpu.h>

void cpu_codelet(void *descr[], void *_args)
@{
    float *vector = (float *)STARPU_GET_VECTOR_PTR(descr[0]);
    int nx = (int)STARPU_GET_VECTOR_NX(descr[0]);
    float *multiplier = (float *)STARPU_GET_VARIABLE_PTR(descr[1]);
    int i;

    for(i = 0 ; i < nx ; i++)
        vector[i] *= *multiplier;
@}
@end example
@end cartouche
Here is the source of the application. Notice the value of the
field @code{where} for the codelet: we specify
@code{STARPU_CPU|STARPU_CUDA} to indicate to StarPU that the codelet
can be executed either on a CPU or on a CUDA device.
@example
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <starpu.h>

#define NX 10

extern void cuda_codelet(void *descr[], void *_args);
extern void cpu_codelet(void *descr[], void *_args);

int main(int argc, char **argv)
@{
    float *vector;
    int i, ret;
    float multiplier = 3.0;
    starpu_codelet cl;
    struct starpu_task *task;
    starpu_data_handle vector_handle;
    starpu_data_handle multiplier_handle;

    starpu_init(NULL);                /* @b{Initializing StarPU} */

    vector = (float*)malloc(NX*sizeof(float));
    assert(vector);
    for(i = 0 ; i < NX ; i++) vector[i] = i;

    /* @b{Registering data within StarPU} */
    starpu_vector_data_register(&vector_handle, 0, (uintptr_t)vector,
                                NX, sizeof(float));
    starpu_register_variable_data(&multiplier_handle, 0, (uintptr_t)&multiplier,
                                  sizeof(float));

    /* @b{Definition of the codelet} */
    cl.where = STARPU_CPU|STARPU_CUDA; /* @b{It can be executed on a CPU or on a CUDA device} */
    cl.cuda_func = cuda_codelet;
    cl.cpu_func = cpu_codelet;
    cl.nbuffers = 2;
    cl.model = NULL;

    /* @b{Definition of the task} */
    task = starpu_task_create();
    task->cl = &cl;
    task->callback_func = NULL;
    task->buffers[0].handle = vector_handle;
    task->buffers[0].mode = STARPU_RW;
    task->buffers[1].handle = multiplier_handle;
    task->buffers[1].mode = STARPU_RW;

    /* @b{Submitting the task} */
    ret = starpu_task_submit(task);
    if (ret == -ENODEV) @{
        fprintf(stderr, "No worker may execute this task\n");
        return 1;
    @}

    /* @b{Waiting for its termination} */
    starpu_task_wait_for_all();

    /* @b{Update the vector in RAM} */
    starpu_data_sync_with_mem(vector_handle, STARPU_R);

    /* @b{Access the data} */
    for(i = 0 ; i < NX ; i++) @{
        fprintf(stderr, "%f ", vector[i]);
    @}
    fprintf(stderr, "\n");

    /* @b{Release the data and shutdown StarPU} */
    starpu_data_release_from_mem(vector_handle);
    starpu_shutdown();

    return 0;
@}
@end example
@node Compilation and execution
@subsection Compilation and execution
Let's suppose StarPU has been installed in the directory
@code{$STARPU_DIR}. As explained in @ref{pkg-config configuration},
the variable @code{PKG_CONFIG_PATH} needs to be set. It is also
necessary to set the variable @code{LD_LIBRARY_PATH} to locate dynamic
libraries at runtime.
@example
$ export PKG_CONFIG_PATH=$STARPU_DIR/lib/pkgconfig:$PKG_CONFIG_PATH
$ export LD_LIBRARY_PATH=$STARPU_DIR/lib:$LD_LIBRARY_PATH
@end example
It is then possible to compile the application using the following
makefile:
@cartouche
@example
CFLAGS  += $(shell pkg-config --cflags libstarpu)
LDFLAGS += $(shell pkg-config --libs libstarpu)
CC       = gcc

vector: vector.o vector_cpu.o vector_cuda.o

%.o: %.cu
	nvcc $(CFLAGS) $< -c -o $@@

clean:
	rm -f vector *.o
@end example
@end cartouche
@example
$ make
@end example

and to execute it, with the default configuration:
@example
$ ./vector
0.000000 3.000000 6.000000 9.000000 12.000000 15.000000 18.000000 21.000000 24.000000 27.000000
@end example

or, for example, by disabling CPU devices:
@example
$ STARPU_NCPUS=0 ./vector
0.000000 3.000000 6.000000 9.000000 12.000000 15.000000 18.000000 21.000000 24.000000 27.000000
@end example

or by disabling CUDA devices:
@example
$ STARPU_NCUDA=0 ./vector
0.000000 3.000000 6.000000 9.000000 12.000000 15.000000 18.000000 21.000000 24.000000 27.000000
@end example

@c ---------------------------------------------------------------------
@c Advanced Topics
@c ---------------------------------------------------------------------
@node Advanced Topics
@chapter Advanced Topics

@bye