
\input texinfo @c -*-texinfo-*-
@c %**start of header
@setfilename starpu.info
@settitle StarPU
@c %**end of header
@setchapternewpage odd
@titlepage
@title StarPU
@page
@vskip 0pt plus 1filll
@comment For the @value{version-GCC} Version*
@end titlepage
@summarycontents
@contents
@page
@node Top
@top Preface
@cindex Preface
This manual documents the usage of StarPU.
@comment
@comment When you add a new menu item, please keep the right hand
@comment aligned to the same column. Do not use tabs. This provides
@comment better formatting.
@comment
@menu
* Introduction::            A basic introduction to using StarPU.
* Installing StarPU::       How to configure, build and install StarPU.
* Configuration options::   Configuration options.
* Environment variables::   Environment variables used by StarPU.
* StarPU API::              The API to use StarPU.
* Basic Examples::          Basic examples of the use of StarPU.
* Advanced Topics::         Advanced use of StarPU.
@end menu
@c ---------------------------------------------------------------------
@c Introduction to StarPU
@c ---------------------------------------------------------------------
@node Introduction
@chapter Introduction to StarPU
@menu
* Motivation::              Why StarPU?
* StarPU in a Nutshell::    The Fundamentals of StarPU
@end menu
@node Motivation
@section Motivation
@c complex machines with heterogeneous cores/devices
The use of specialized hardware such as accelerators or coprocessors offers an
interesting approach to overcome the physical limits encountered by processor
architects. As a result, many machines are now equipped with one or several
accelerators (e.g. a GPU), in addition to the usual processor(s). While a lot of
effort has been devoted to offloading computation onto such accelerators, very
little attention has been paid to portability concerns on the one hand, and to the
possibility of having heterogeneous accelerators and processors interact on the other hand.
StarPU is a runtime system that offers support for heterogeneous multicore
architectures. It not only offers a unified view of the computational resources
(i.e. CPUs and accelerators at the same time), but it also takes care of
efficiently mapping and executing tasks onto a heterogeneous machine while
transparently handling low-level issues in a portable fashion.
@c this leads to a complicated distributed memory design
@c which is not (easily) manageable by hand
@c added value/benefits of StarPU
@c - portability
@c - scheduling, perf. portability
@node StarPU in a Nutshell
@section StarPU in a Nutshell
From a programming point of view, StarPU is not a new language but a library
that executes tasks explicitly submitted by the application. The data that a
task manipulates are automatically transferred onto the accelerator so that the
programmer does not have to take care of complex data movements. StarPU also
takes particular care of scheduling those tasks efficiently and allows
scheduling experts to implement custom scheduling policies in a portable
fashion.
@c explain the notion of codelet and task (i.e. g(A, B)
@subsection Codelet and Tasks
One of StarPU's primary data structures is the @b{codelet}. A codelet describes a
computational kernel that can possibly be implemented on multiple architectures
such as a CPU, a CUDA device or a Cell's SPU.
@c TODO insert illustration f : f_spu, f_cpu, ...
Another important data structure is the @b{task}. Executing a StarPU task
consists in applying a codelet on a data set, on one of the architectures on
which the codelet is implemented. In addition to the codelet that it
instantiates, a task also describes which data are accessed, and how they are
accessed during the computation (read and/or write).
StarPU tasks are asynchronous: submitting a task to StarPU is a non-blocking
operation. The task structure can also specify a @b{callback} function that is
called once StarPU has properly executed the task. It also contains optional
fields that the application may use to give hints to the scheduler (such as
priority levels).
A task may be identified by a unique 64-bit number which we refer to as a @b{tag}.
Task dependencies can be enforced either by the means of callback functions, or
by expressing dependencies between tags.
@c TODO insert illustration f(Ar, Brw, Cr) + ..
@c DSM
@subsection StarPU Data Management Library
Because StarPU schedules tasks at runtime, data transfers have to be
done automatically and ``just-in-time'' between processing units,
relieving the application programmer from explicit data transfers.
Moreover, to avoid unnecessary transfers, StarPU keeps data
where it was last needed, even if it was modified there, and it
allows multiple copies of the same data to reside at the same time on
several processing units as long as it is not modified.
@c ---------------------------------------------------------------------
@c Installing StarPU
@c ---------------------------------------------------------------------
@node Installing StarPU
@chapter Installing StarPU
StarPU can be built and installed by the standard means of the GNU
autotools. The following chapter briefly explains how these tools
can be used to install StarPU.
@section Configuring StarPU
@subsection Generating Makefiles and configuration scripts
This step is not necessary when using the tarball releases of StarPU. If you
are using the source code from the svn repository, you first need to generate
the configure scripts and the Makefiles.
@example
$ autoreconf -vfi
@end example
@subsection Configuring StarPU
@example
$ ./configure
@end example
Details about options that are useful to give to @code{./configure} are given in
@ref{Configuration options}.
@section Building and Installing StarPU
@subsection Building
@example
$ make
@end example
@subsection Sanity Checks
In order to make sure that StarPU is working properly on the system, it is also
possible to run a test suite.
@example
$ make check
@end example
@subsection Installing
In order to install StarPU at the location that was specified during
configuration:
@example
$ make install
@end example
@subsection pkg-config configuration
Compiling and linking an application against StarPU may require specific
flags or libraries (for instance @code{CUDA} or @code{libspe2}).
Therefore, it is possible to use the @code{pkg-config} tool.
If StarPU was not installed at some standard location, the path of StarPU's
library must be specified in the @code{PKG_CONFIG_PATH} environment variable so
that @code{pkg-config} can find it. For example, if StarPU was installed in
@code{$prefix_dir}:
@example
$ export PKG_CONFIG_PATH=$PKG_CONFIG_PATH:$prefix_dir/lib/pkgconfig
@end example
The flags required to compile or link against StarPU are then
accessible with the following commands:
@example
$ pkg-config --cflags libstarpu  # options for the compiler
$ pkg-config --libs libstarpu    # options for the linker
@end example
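For instance, a simple program could then be compiled and linked in a single
step as follows (@code{hello.c} being a hypothetical source file):
@example
$ gcc hello.c -o hello $(pkg-config --cflags libstarpu) \
              $(pkg-config --libs libstarpu)
@end example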
@c ---------------------------------------------------------------------
@c Configuration options
@c ---------------------------------------------------------------------
@node Configuration options
@chapter Configuration options
@table @asis
@item @code{--disable-cpu}
Disable the use of the CPUs of the machine. Only GPUs etc. will be used.
@item @code{--enable-maxcudadev=<number>}
Defines the maximum number of CUDA devices that StarPU will support, then
available as the @code{STARPU_MAXCUDADEVS} macro.
@item @code{--disable-cuda}
Disable the use of CUDA, even if the SDK is detected.
@item @code{--enable-maxopencldev=<number>}
Defines the maximum number of OpenCL devices that StarPU will support, then
available as the @code{STARPU_MAXOPENCLDEVS} macro.
@item @code{--disable-opencl}
Disable the use of OpenCL, even if the SDK is detected.
@item @code{--enable-gordon}
Enable the use of the Gordon runtime for Cell SPUs.
@c TODO: rather default to enabled when detected
@item @code{--enable-debug}
Enable debugging messages.
@item @code{--enable-fast}
Do not enforce assertions; this saves a lot of time otherwise spent computing
them.
@item @code{--enable-verbose}
Increase the verbosity of the debugging messages.
@item @code{--enable-coverage}
Enable flags for the coverage tool.
@item @code{--enable-perf-debug}
Enable performance debugging.
@item @code{--enable-model-debug}
Enable performance model debugging.
@item @code{--enable-stats}
Enable statistics.
@item @code{--enable-maxbuffers=<nbuffers>}
Defines the maximum number of buffers that tasks will be able to take as
parameters, then available as the @code{STARPU_NMAXBUFS} macro.
@item @code{--disable-priority}
Disable taking priorities into account in scheduling decisions. Mostly for
comparison purposes.
@item @code{--enable-allocation-cache}
Enable the use of a data allocation cache to avoid the cost of allocations
with CUDA. Still experimental.
@item @code{--enable-opengl-render}
Enable the use of OpenGL for the rendering of some examples.
@c TODO: rather default to enabled when detected
@item @code{--enable-blas-lib=<name>}
Choose the BLAS library to be used by the examples. At the moment, either
atlas or goto can be used.
@item @code{--with-cuda-dir=<path>}
Tell where the CUDA SDK resides. This directory should notably contain
@code{include/cuda.h}.
@item @code{--with-magma=<path>}
Tell where MAGMA is installed.
@item @code{--with-opencl-dir=<path>}
Tell where the OpenCL SDK is installed. This directory should notably contain
@code{include/CL/cl.h}.
@item @code{--with-gordon-dir=<path>}
Tell where the Gordon SDK is installed.
@item @code{--with-fxt=<path>}
Tell where FxT (for generating traces and rendering them using ViTE) is
installed. This directory should notably contain @code{include/fxt/fxt.h}.
@item @code{--with-perf-model-dir=<dir>}
Specify where performance models should be stored (instead of defaulting to the
current user's home).
@item @code{--with-mpicc=<path to mpicc>}
Tell the path to the @code{mpicc} compiler to be used for starpumpi.
@c TODO: also just use AC_PROG
@item @code{--with-mpi}
Enable building libstarpumpi.
@c TODO: rather just use the availability of mpicc instead of a second option
@item @code{--with-goto-dir=<dir>}
Specify where GotoBLAS is installed.
@item @code{--with-atlas-dir=<dir>}
Specify where ATLAS is installed. This directory should notably contain
@code{include/cblas.h}.
@end table
@c ---------------------------------------------------------------------
@c Environment variables
@c ---------------------------------------------------------------------
@node Environment variables
@chapter Environment variables
@menu
* Workers::     Configuring workers
* Scheduling::  Configuring the Scheduling engine
* Misc::        Miscellaneous and debug
@end menu
Note: the values given in the @code{starpu_conf} structure passed to
@code{starpu_init} will override the values of the environment variables.
@node Workers
@section Configuring workers
@menu
* STARPU_NCPUS::             Number of CPU workers
* STARPU_NCUDA::             Number of CUDA workers
* STARPU_NOPENCL::           Number of OpenCL workers
* STARPU_NGORDON::           Number of SPU workers (Cell)
* STARPU_WORKERS_CPUID::     Bind workers to specific CPUs
* STARPU_WORKERS_CUDAID::    Select specific CUDA devices
* STARPU_WORKERS_OPENCLID::  Select specific OpenCL devices
@end menu
@node STARPU_NCPUS
@subsection @code{STARPU_NCPUS} -- Number of CPU workers
@table @asis
@item @emph{Description}:
Specify the maximum number of CPU workers. Note that StarPU will not allocate
more CPUs than there are physical CPUs, and that some CPUs are used to control
the accelerators.
@end table
@node STARPU_NCUDA
@subsection @code{STARPU_NCUDA} -- Number of CUDA workers
@table @asis
@item @emph{Description}:
Specify the maximum number of CUDA devices that StarPU can use. In case
@code{STARPU_NCUDA} is lower than the number of physical devices, it is
possible to select which CUDA devices should be used by the means of the
@code{STARPU_WORKERS_CUDAID} environment variable.
@end table
@node STARPU_NOPENCL
@subsection @code{STARPU_NOPENCL} -- Number of OpenCL workers
@table @asis
@item @emph{Description}:
OpenCL equivalent of the @code{STARPU_NCUDA} environment variable.
@end table
@node STARPU_NGORDON
@subsection @code{STARPU_NGORDON} -- Number of SPU workers (Cell)
@table @asis
@item @emph{Description}:
Specify the maximum number of SPUs that StarPU can use.
@end table
@node STARPU_WORKERS_CPUID
@subsection @code{STARPU_WORKERS_CPUID} -- Bind workers to specific CPUs
@table @asis
@item @emph{Description}:
Passing an array of integers (starting from 0) in @code{STARPU_WORKERS_CPUID}
specifies on which logical CPUs the different workers should be
bound. For instance, if @code{STARPU_WORKERS_CPUID = "1 3 0 2"}, the first
worker will be bound to logical CPU #1, the second CPU worker will be bound to
logical CPU #3 and so on. Note that the logical ordering of the CPUs is either
determined by the OS, or provided by the @code{hwloc} library in case it is
available.
Note that the first workers correspond to the CUDA workers, then come the
OpenCL and the SPU, and finally the CPU workers. For example, if
we have @code{STARPU_NCUDA=1}, @code{STARPU_NOPENCL=1}, @code{STARPU_NCPUS=2}
and @code{STARPU_WORKERS_CPUID = "0 2 1 3"}, the CUDA device will be controlled
by logical CPU #0, the OpenCL device will be controlled by logical CPU #2, and
the logical CPUs #1 and #3 will be used by the CPU workers.
If the number of workers is larger than the array given in
@code{STARPU_WORKERS_CPUID}, the workers are bound to the logical CPUs in a
round-robin fashion: if @code{STARPU_WORKERS_CPUID = "0 1"}, the first and the
third (resp. second and fourth) workers will be put on CPU #0 (resp. CPU #1).
@end table
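For instance, the binding described in the example above could be requested by
launching a (hypothetical) application as follows:
@example
$ STARPU_NCUDA=1 STARPU_NOPENCL=1 STARPU_NCPUS=2 \
  STARPU_WORKERS_CPUID="0 2 1 3" ./my_app
@end example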
@node STARPU_WORKERS_CUDAID
@subsection @code{STARPU_WORKERS_CUDAID} -- Select specific CUDA devices
@table @asis
@item @emph{Description}:
Similarly to the @code{STARPU_WORKERS_CPUID} environment variable, it is
possible to select which CUDA devices should be used by StarPU. On a machine
equipped with 4 GPUs, setting @code{STARPU_WORKERS_CUDAID = "1 3"} and
@code{STARPU_NCUDA=2} specifies that 2 CUDA workers should be created, and that
they should use CUDA devices #1 and #3 (the logical ordering of the devices is
the one reported by CUDA).
@end table
@node STARPU_WORKERS_OPENCLID
@subsection @code{STARPU_WORKERS_OPENCLID} -- Select specific OpenCL devices
@table @asis
@item @emph{Description}:
OpenCL equivalent of the @code{STARPU_WORKERS_CUDAID} environment variable.
@end table
@node Scheduling
@section Configuring the Scheduling engine
@menu
* STARPU_SCHED::        Scheduling policy
* STARPU_CALIBRATE::    Calibrate performance models
* STARPU_PREFETCH::     Use data prefetch
* STARPU_SCHED_ALPHA::  Computation factor
* STARPU_SCHED_BETA::   Communication factor
@end menu
@node STARPU_SCHED
@subsection @code{STARPU_SCHED} -- Scheduling policy
@table @asis
@item @emph{Description}:
This chooses between the different scheduling policies proposed by StarPU:
work stealing, random, greedy, policies based on performance models, etc.
Use @code{STARPU_SCHED=help} to get the list of available schedulers.
@end table
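For instance, a (hypothetical) application could be run with the dmda policy
(mentioned under @code{STARPU_CALIBRATE} below) as follows:
@example
$ STARPU_SCHED=dmda ./my_app
@end example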
@node STARPU_CALIBRATE
@subsection @code{STARPU_CALIBRATE} -- Calibrate performance models
@table @asis
@item @emph{Description}:
If this variable is set to 1, the performance models are calibrated during
the execution. If it is set to 2, the previous values are dropped to restart
calibration from scratch.
Note: this currently only applies to the dm and dmda scheduling policies.
@end table
@node STARPU_PREFETCH
@subsection @code{STARPU_PREFETCH} -- Use data prefetch
@table @asis
@item @emph{Description}:
If this variable is set, data prefetching is enabled: when a task is
scheduled to be executed e.g. on a GPU, StarPU will request an asynchronous
transfer in advance, so that data is already present on the GPU when the task
starts. As a result, computation and data transfers are overlapped.
@end table
@node STARPU_SCHED_ALPHA
@subsection @code{STARPU_SCHED_ALPHA} -- Computation factor
@table @asis
@item @emph{Description}:
To estimate the cost of a task, StarPU takes into account the estimated
computation time (obtained thanks to performance models). The alpha factor is
the coefficient to be applied to it before adding it to the communication part.
@end table
@node STARPU_SCHED_BETA
@subsection @code{STARPU_SCHED_BETA} -- Communication factor
@table @asis
@item @emph{Description}:
To estimate the cost of a task, StarPU takes into account the estimated
data transfer time (obtained thanks to performance models). The beta factor is
the coefficient to be applied to it before adding it to the computation part.
@end table
@node Misc
@section Miscellaneous and debug
@menu
* STARPU_LOGFILENAME::  Select debug file name
@end menu
@node STARPU_LOGFILENAME
@subsection @code{STARPU_LOGFILENAME} -- Select debug file name
@table @asis
@item @emph{Description}:
This variable specifies the file to which the debugging output should go.
@end table
@c ---------------------------------------------------------------------
@c StarPU API
@c ---------------------------------------------------------------------
@node StarPU API
@chapter StarPU API
@menu
* Initialization and Termination::  Initialization and Termination methods
* Workers' Properties::             Methods to enumerate workers' properties
* Data Library::                    Methods to manipulate data
* Codelets and Tasks::              Methods to construct tasks
* Tags::                            Task dependencies
* CUDA extensions::                 CUDA extensions
* Cell extensions::                 Cell extensions
* Miscellaneous::                   Miscellaneous helpers
@end menu
@node Initialization and Termination
@section Initialization and Termination
@menu
* starpu_init::         Initialize StarPU
* struct starpu_conf::  StarPU runtime configuration
* starpu_shutdown::     Terminate StarPU
@end menu
@node starpu_init
@subsection @code{starpu_init} -- Initialize StarPU
@table @asis
@item @emph{Description}:
This is StarPU's initialization method, which must be called prior to any other
StarPU call. It is possible to specify StarPU's configuration (e.g. scheduling
policy, number of cores, ...) by passing a non-null argument. The default
configuration is used if the passed argument is @code{NULL}.
@item @emph{Return value}:
Upon successful completion, this function returns 0. Otherwise, @code{-ENODEV}
indicates that no worker was available (so that StarPU could not be initialized).
@item @emph{Prototype}:
@code{int starpu_init(struct starpu_conf *conf);}
@end table
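As a minimal sketch, an application could start and stop StarPU with the
default configuration as follows:
@example
#include <errno.h>
#include <starpu.h>

int main(int argc, char **argv)
@{
    int ret = starpu_init(NULL);
    if (ret == -ENODEV)
        return 1; /* no worker is available */

    /* ... submit tasks here ... */

    starpu_shutdown();
    return 0;
@}
@end example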
@node struct starpu_conf
@subsection @code{struct starpu_conf} -- StarPU runtime configuration
@table @asis
@item @emph{Description}:
This structure is passed to the @code{starpu_init} function in order to
configure StarPU. When the default value is used, StarPU automatically selects
the number of processing units and takes the default scheduling policy. These
parameters overwrite the equivalent environment variables.
@item @emph{Fields}:
@table @asis
@item @code{sched_policy} (default = NULL):
This is the name of the scheduling policy. This can also be specified with the
@code{STARPU_SCHED} environment variable.
@item @code{ncpus} (default = -1):
This is the maximum number of CPU cores that StarPU can use. This can also be
specified with the @code{STARPU_NCPUS} environment variable.
@item @code{ncuda} (default = -1):
This is the maximum number of CUDA devices that StarPU can use. This can also be
specified with the @code{STARPU_NCUDA} environment variable.
@item @code{nopencl} (default = -1):
This is the maximum number of OpenCL devices that StarPU can use. This can also be
specified with the @code{STARPU_NOPENCL} environment variable.
@item @code{nspus} (default = -1):
This is the maximum number of Cell SPUs that StarPU can use. This can also be
specified with the @code{STARPU_NGORDON} environment variable.
@item @code{calibrate} (default = 0):
If this flag is set, StarPU will calibrate the performance models when
executing tasks. This can also be specified with the @code{STARPU_CALIBRATE}
environment variable.
@end table
@end table
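As a sketch, the following explicitly sets every field documented above before
initializing StarPU (the numeric values are arbitrary):
@example
struct starpu_conf conf =
@{
    .sched_policy = NULL, /* default policy; a name selects another one */
    .ncpus = 2,           /* use at most two CPU cores */
    .ncuda = 1,           /* use at most one CUDA device */
    .nopencl = -1,        /* -1 keeps the automatic selection */
    .nspus = -1,
    .calibrate = 1        /* calibrate performance models at runtime */
@};
int ret = starpu_init(&conf);
@end example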
@node starpu_shutdown
@subsection @code{starpu_shutdown} -- Terminate StarPU
@table @asis
@item @emph{Description}:
This is StarPU's termination method. It must be called at the end of the
application: statistics and other post-mortem debugging information are not
guaranteed to be available until this method has been called.
@item @emph{Prototype}:
@code{void starpu_shutdown(void);}
@end table
@node Workers' Properties
@section Workers' Properties
@menu
* starpu_worker_get_count::         Get the number of processing units
* starpu_cpu_worker_get_count::     Get the number of CPUs controlled by StarPU
* starpu_cuda_worker_get_count::    Get the number of CUDA devices controlled by StarPU
* starpu_opencl_worker_get_count::  Get the number of OpenCL devices controlled by StarPU
* starpu_spu_worker_get_count::     Get the number of Cell SPUs controlled by StarPU
* starpu_worker_get_id::            Get the identifier of the current worker
* starpu_worker_get_type::          Get the type of processing unit associated to a worker
* starpu_worker_get_name::          Get the name of a worker
@end menu
@node starpu_worker_get_count
@subsection @code{starpu_worker_get_count} -- Get the number of processing units
@table @asis
@item @emph{Description}:
This function returns the number of workers (i.e. processing units executing
StarPU tasks). The returned value should be at most @code{STARPU_NMAXWORKERS}.
@item @emph{Prototype}:
@code{unsigned starpu_worker_get_count(void);}
@end table
@node starpu_cpu_worker_get_count
@subsection @code{starpu_cpu_worker_get_count} -- Get the number of CPUs controlled by StarPU
@table @asis
@item @emph{Description}:
This function returns the number of CPUs controlled by StarPU. The returned
value should be at most @code{STARPU_NMAXCPUS}.
@item @emph{Prototype}:
@code{unsigned starpu_cpu_worker_get_count(void);}
@end table
@node starpu_cuda_worker_get_count
@subsection @code{starpu_cuda_worker_get_count} -- Get the number of CUDA devices controlled by StarPU
@table @asis
@item @emph{Description}:
This function returns the number of CUDA devices controlled by StarPU. The returned
value should be at most @code{STARPU_MAXCUDADEVS}.
@item @emph{Prototype}:
@code{unsigned starpu_cuda_worker_get_count(void);}
@end table
@node starpu_opencl_worker_get_count
@subsection @code{starpu_opencl_worker_get_count} -- Get the number of OpenCL devices controlled by StarPU
@table @asis
@item @emph{Description}:
This function returns the number of OpenCL devices controlled by StarPU. The returned
value should be at most @code{STARPU_MAXOPENCLDEVS}.
@item @emph{Prototype}:
@code{unsigned starpu_opencl_worker_get_count(void);}
@end table
@node starpu_spu_worker_get_count
@subsection @code{starpu_spu_worker_get_count} -- Get the number of Cell SPUs controlled by StarPU
@table @asis
@item @emph{Description}:
This function returns the number of Cell SPUs controlled by StarPU.
@item @emph{Prototype}:
@code{unsigned starpu_spu_worker_get_count(void);}
@end table
@node starpu_worker_get_id
@subsection @code{starpu_worker_get_id} -- Get the identifier of the current worker
@table @asis
@item @emph{Description}:
This function returns the identifier of the worker associated to the calling
thread. The returned value is either -1 if the current context is not a StarPU
worker (i.e. when called from the application outside a task or a callback), or
an integer between 0 and @code{starpu_worker_get_count() - 1}.
@item @emph{Prototype}:
@code{int starpu_worker_get_id(void);}
@end table
@node starpu_worker_get_type
@subsection @code{starpu_worker_get_type} -- Get the type of processing unit associated to a worker
@table @asis
@item @emph{Description}:
This function returns the type of worker associated to an identifier (as
returned by the @code{starpu_worker_get_id} function). The returned value
indicates the architecture of the worker: @code{STARPU_CPU_WORKER} for a CPU
core, @code{STARPU_CUDA_WORKER} for a CUDA device,
@code{STARPU_OPENCL_WORKER} for an OpenCL device, and
@code{STARPU_GORDON_WORKER} for a Cell SPU. The value returned for an invalid
identifier is unspecified.
@item @emph{Prototype}:
@code{enum starpu_archtype starpu_worker_get_type(int id);}
@end table
@node starpu_worker_get_name
@subsection @code{starpu_worker_get_name} -- Get the name of a worker
@table @asis
@item @emph{Description}:
StarPU associates a unique human-readable string with each processing unit. This
function copies at most the first @code{maxlen} bytes of the unique string
associated with the worker identified by @code{id} into the
@code{dst} buffer. The caller is responsible for ensuring that @code{dst}
is a valid pointer to a buffer of at least @code{maxlen} bytes. Calling this
function on an invalid identifier results in unspecified behaviour.
@item @emph{Prototype}:
@code{void starpu_worker_get_name(int id, char *dst, size_t maxlen);}
@end table
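Putting these functions together, the following sketch (assuming StarPU has
already been initialized) enumerates the workers and prints their names and
types:
@example
unsigned i, nworkers = starpu_worker_get_count();
for (i = 0; i < nworkers; i++)
@{
    char name[64];
    enum starpu_archtype type;
    const char *kind;

    starpu_worker_get_name((int)i, name, sizeof(name));
    type = starpu_worker_get_type((int)i);
    kind = (type == STARPU_CPU_WORKER) ? "CPU" :
           (type == STARPU_CUDA_WORKER) ? "CUDA" : "other";

    printf("worker %u: %s (%s)\n", i, name, kind);
@}
@end example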
@node Data Library
@section Data Library
This section describes the data management facilities provided by StarPU.
TODO: We show how to use existing data interfaces in [ref], but developers can
design their own data interfaces if required.
@menu
* starpu_data_handle::  StarPU opaque data handle
* void *interface::     StarPU data interface
@end menu
@node starpu_data_handle
@subsection @code{starpu_data_handle} -- StarPU opaque data handle
@table @asis
@item @emph{Description}:
StarPU uses @code{starpu_data_handle} as an opaque handle to manage a piece of
data. Once a piece of data has been registered to StarPU, it is associated to a
@code{starpu_data_handle} which keeps track of the state of the piece of data
over the entire machine, so that data consistency can be maintained and data
replicas can be located, for instance.
@end table
@node void *interface
@subsection @code{void *interface} -- StarPU data interface
@table @asis
@item @emph{Description}:
Data management is done at a high level in StarPU: rather than accessing a mere
list of contiguous buffers, the tasks may manipulate data that are described by
a high-level construct which we call a data interface.
TODO
@end table
@c void starpu_data_unregister(struct starpu_data_state_t *state);
@c starpu_worker_get_memory_node TODO
@c
@c user interaction with the DSM
@c void starpu_data_sync_with_mem(struct starpu_data_state_t *state);
@c void starpu_notify_data_modification(struct starpu_data_state_t *state, uint32_t modifying_node);
@node Codelets and Tasks
@section Codelets and Tasks
@menu
* struct starpu_codelet::     StarPU codelet structure
* struct starpu_task::        StarPU task structure
* starpu_task_init::          Initialize a Task
* starpu_task_create::        Allocate and Initialize a Task
* starpu_task_deinit::        Release all the resources used by a Task
* starpu_task_destroy::       Destroy a dynamically allocated Task
* starpu_task_submit::        Submit a Task
* starpu_task_wait::          Wait for the termination of a Task
* starpu_task_wait_for_all::  Wait for the termination of all Tasks
@end menu
@node struct starpu_codelet
@subsection @code{struct starpu_codelet} -- StarPU codelet structure
@table @asis
@item @emph{Description}:
The codelet structure describes a kernel that is possibly implemented on
various targets.
@item @emph{Fields}:
@table @asis
@item @code{where}:
Indicates which types of processing units are able to execute the codelet.
@code{STARPU_CPU|STARPU_CUDA} for instance indicates that the codelet is
implemented for both CPU cores and CUDA devices, while @code{STARPU_GORDON}
indicates that it is only available on Cell SPUs.
@item @code{cpu_func} (optional):
Is a function pointer to the CPU implementation of the codelet. Its prototype
must be: @code{void cpu_func(void *buffers[], void *cl_arg)}. The first
argument is the array of data managed by the data management library, and
the second argument is a pointer to the argument passed from the @code{cl_arg}
field of the @code{starpu_task} structure.
The @code{cpu_func} field is ignored if @code{STARPU_CPU} does not appear in
the @code{where} field; it must be non-null otherwise.
@item @code{cuda_func} (optional):
Is a function pointer to the CUDA implementation of the codelet. @emph{This
must be a host-function written in the CUDA runtime API}. Its prototype must
be: @code{void cuda_func(void *buffers[], void *cl_arg);}. The @code{cuda_func}
field is ignored if @code{STARPU_CUDA} does not appear in the @code{where}
field; it must be non-null otherwise.
@item @code{opencl_func} (optional):
Is a function pointer to the OpenCL implementation of the codelet. Its
prototype must be:
@code{void opencl_func(starpu_data_interface_t *descr, void *arg);}.
This pointer is ignored if @code{STARPU_OPENCL} does not appear in the
@code{where} field; it must be non-null otherwise.
@item @code{gordon_func} (optional):
This is the index of the Cell SPU implementation within the Gordon library.
TODO
@item @code{nbuffers}:
Specifies the number of arguments taken by the codelet. These arguments are
managed by the DSM and are accessed from the @code{void *buffers[]}
array. The constant argument passed with the @code{cl_arg} field of the
@code{starpu_task} structure is not counted in this number. This value should
not be above @code{STARPU_NMAXBUFS}.
@item @code{model} (optional):
This is a pointer to the performance model associated to this codelet. This
optional field is ignored when null. TODO
@end table
@end table
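For instance, a codelet implemented on both CPU cores and CUDA devices could be
declared as sketched below (@code{scal_cpu_func} and @code{scal_cuda_func} are
hypothetical kernel functions with the prototypes given above):
@example
starpu_codelet scal_cl =
@{
    .where = STARPU_CPU|STARPU_CUDA,
    .cpu_func = scal_cpu_func,   /* CPU implementation */
    .cuda_func = scal_cuda_func, /* CUDA implementation */
    .nbuffers = 1                /* one buffer managed by the DSM */
@};
@end example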
@node struct starpu_task
@subsection @code{struct starpu_task} -- StarPU task structure
@table @asis
@item @emph{Description}:
The @code{starpu_task} structure describes a task that can be offloaded on the various
processing units managed by StarPU. It instantiates a codelet. It can either be
allocated dynamically with the @code{starpu_task_create} method, or declared
statically. In the latter case, the programmer has to zero the
@code{starpu_task} structure and to fill the different fields properly. The
indicated default values correspond to the configuration of a task allocated
with @code{starpu_task_create}.
@item @emph{Fields}:
@table @asis
@item @code{cl}:
Is a pointer to the corresponding @code{starpu_codelet} data structure. This
describes where the kernel should be executed, and supplies the appropriate
implementations. When set to @code{NULL}, no code is executed during the task;
such empty tasks can be useful for synchronization purposes.
@item @code{buffers}:
TODO
@item @code{cl_arg} (optional) (default = NULL):
This pointer is passed to the codelet through the second argument
of the codelet implementation (e.g. @code{cpu_func} or @code{cuda_func}).
In the specific case of the Cell processor, see the @code{cl_arg_size}
argument.
@item @code{cl_arg_size} (optional, Cell specific):
In the case of the Cell processor, the @code{cl_arg} pointer is not directly
given to the SPU function. A buffer of size @code{cl_arg_size} is allocated on
the SPU. This buffer is then filled with the @code{cl_arg_size} bytes starting
at address @code{cl_arg}. In that case, the argument given to the SPU codelet
is therefore not the @code{cl_arg} pointer, but the address of the buffer in
local store (LS) instead. This field is ignored for CPU, CUDA and OpenCL
codelets.
@item @code{callback_func} (optional) (default = @code{NULL}):
This is a function pointer of prototype @code{void (*f)(void *)} which
specifies a possible callback. If that pointer is non-null, the callback
function is executed @emph{on the host} after the execution of the task. The
callback is passed the value contained in the @code{callback_arg} field. No
callback is executed if that field is null.
@item @code{callback_arg} (optional) (default = @code{NULL}):
This is the pointer passed to the callback function. This field is ignored if
the @code{callback_func} is null.
@item @code{use_tag} (optional) (default = 0):
If set, this flag indicates that the task should be associated with the tag
contained in the @code{tag_id} field. Tags allow the application to synchronize
with the task and to express task dependencies easily.
@item @code{tag_id}:
This field contains the tag associated with the task if the @code{use_tag} field
is set; it is ignored otherwise.
@item @code{synchronous}:
If this flag is set, the @code{starpu_task_submit} function is blocking and
returns only when the task has been executed (or if no worker is able to
process the task). Otherwise, @code{starpu_task_submit} returns immediately.
@item @code{priority} (optional) (default = @code{STARPU_DEFAULT_PRIO}):
This field indicates a level of priority for the task. This is an integer value
that must be selected between @code{STARPU_MIN_PRIO} (for the least important
tasks) and @code{STARPU_MAX_PRIO} (for the most important tasks), included.
Default priority is @code{STARPU_DEFAULT_PRIO}. Scheduling strategies that
take priorities into account can use this parameter to take better scheduling
decisions, but the scheduling policy may also ignore it.
@item @code{execute_on_a_specific_worker} (default = 0):
If this flag is set, StarPU will bypass the scheduler and directly assign this
task to the worker specified by the @code{workerid} field.
@item @code{workerid} (optional):
If the @code{execute_on_a_specific_worker} field is set, this field indicates
the identifier of the worker that should process this task (as
returned by @code{starpu_worker_get_id}). This field is ignored if the
@code{execute_on_a_specific_worker} field is set to 0.
@item @code{detach} (optional) (default = 1):
If this flag is set, it is not possible to synchronize with the task
by the means of @code{starpu_task_wait} later on. Internal data structures
are only guaranteed to be freed once @code{starpu_task_wait} is called if that
flag is not set.
@item @code{destroy} (optional) (default = 1):
If that flag is set, the task structure will automatically be freed, either
after the execution of the callback if the task is detached, or during
@code{starpu_task_wait} otherwise. If this flag is not set, dynamically
allocated data structures will not be freed until @code{starpu_task_destroy} is
called explicitly. Setting this flag for a statically allocated task structure
will result in undefined behaviour.
@end table
@end table
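As a sketch of how these fields fit together, the following configures a
dynamically allocated task (the codelet @code{cl} and the callback
@code{my_callback} are assumed to be defined elsewhere):
@example
struct starpu_task *task = starpu_task_create();
task->cl = &cl;                    /* codelet to instantiate */
task->cl_arg = NULL;
task->priority = STARPU_MAX_PRIO;  /* hint for priority-aware policies */
task->callback_func = my_callback; /* executed on the host afterwards */
task->callback_arg = NULL;
task->use_tag = 1;                 /* the task wears tag 0x42 */
task->tag_id = (starpu_tag_t)0x42;
@end example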
@node starpu_task_init
@subsection @code{starpu_task_init} -- Initialize a Task
@table @asis
@item @emph{Description}:
Initialize a task structure with default values. This function is implicitly
called by @code{starpu_task_create}. By default, tasks initialized with
@code{starpu_task_init} must be deinitialized explicitly with
@code{starpu_task_deinit}. Tasks can also be initialized statically, using the
constant @code{STARPU_TASK_INITIALIZER}.
@item @emph{Prototype}:
@code{void starpu_task_init(struct starpu_task *task);}
@end table
@node starpu_task_create
@subsection @code{starpu_task_create} -- Allocate and Initialize a Task
@table @asis
@item @emph{Description}:
Allocate a task structure and initialize it with default values. Tasks
allocated dynamically with @code{starpu_task_create} are automatically freed when the
task is terminated. If the destroy flag is explicitly unset, the resources used
by the task have to be freed by calling
@code{starpu_task_destroy}.
@item @emph{Prototype}:
@code{struct starpu_task *starpu_task_create(void);}
@end table
@node starpu_task_deinit
@subsection @code{starpu_task_deinit} -- Release all the resources used by a Task
@table @asis
@item @emph{Description}:
Release all the structures automatically allocated to execute the task, but not
the task structure itself. This should be used for statically allocated tasks,
for instance.
Note that this function is automatically called by @code{starpu_task_destroy}.
@item @emph{Prototype}:
@code{void starpu_task_deinit(struct starpu_task *task);}
@end table
@node starpu_task_destroy
@subsection @code{starpu_task_destroy} -- Destroy a dynamically allocated Task
@table @asis
@item @emph{Description}:
Free the resources allocated during @code{starpu_task_create}. This function can be
called automatically after the execution of a task by setting the
@code{destroy} flag of the @code{starpu_task} structure (default behaviour).
Calling this function on a statically allocated task results in undefined
behaviour.
@item @emph{Prototype}:
@code{void starpu_task_destroy(struct starpu_task *task);}
@end table
@node starpu_task_wait
@subsection @code{starpu_task_wait} -- Wait for the termination of a Task
@table @asis
@item @emph{Description}:
This function blocks until the task has been executed. It is not possible to
synchronize with a task more than once. It is not possible to wait for
synchronous or detached tasks.
@item @emph{Return value}:
Upon successful completion, this function returns 0. Otherwise, @code{-EINVAL}
indicates that the waited task was either synchronous or detached.
@item @emph{Prototype}:
@code{int starpu_task_wait(struct starpu_task *task);}
@end table
@node starpu_task_submit
@subsection @code{starpu_task_submit} -- Submit a Task
@table @asis
@item @emph{Description}:
This function submits task @code{task} to StarPU. Calling this function does
not mean that the task will be executed immediately, as there can be data or task
(tag) dependencies that are not fulfilled yet: StarPU will take care of
scheduling this task with respect to such dependencies.
This function returns immediately if the @code{synchronous} field of the
@code{starpu_task} structure was set to 0, and blocks until the termination of
the task otherwise. It is also possible to synchronize the application with
asynchronous tasks by the means of tags, using the @code{starpu_tag_wait}
function for instance.
In case of success, this function returns 0; a return value of @code{-ENODEV}
means that there is no worker able to process that task (e.g. there is no GPU
available and this task is only implemented on top of CUDA).
@item @emph{Prototype}:
@code{int starpu_task_submit(struct starpu_task *task);}
@end table
@node starpu_task_wait_for_all
@subsection @code{starpu_task_wait_for_all} -- Wait for the termination of all Tasks
@table @asis
@item @emph{Description}:
This function blocks until all the tasks that were submitted are terminated.
@item @emph{Prototype}:
@code{void starpu_task_wait_for_all(void);}
@end table
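Putting the previous methods together, a typical asynchronous submission loop
might be sketched as follows (the codelet @code{cl} is assumed to be defined
elsewhere):
@example
int i, ret;
for (i = 0; i < 42; i++)
@{
    /* detach and destroy default to 1: the task is freed automatically */
    struct starpu_task *task = starpu_task_create();
    task->cl = &cl;

    ret = starpu_task_submit(task);
    if (ret == -ENODEV)
        fprintf(stderr, "no worker can execute this task\n");
@}

/* block until every submitted task has terminated */
starpu_task_wait_for_all();
@end example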
@c Callbacks : what can we put in callbacks ?
@node Tags
@section Tags
@menu
* starpu_tag_t::                   Task identifier
* starpu_tag_declare_deps::        Declare the Dependencies of a Tag
* starpu_tag_declare_deps_array::  Declare the Dependencies of a Tag
* starpu_tag_wait::                Block until a Tag is terminated
* starpu_tag_wait_array::          Block until a set of Tags is terminated
* starpu_tag_remove::              Destroy a Tag
* starpu_tag_notify_from_apps::    Feed a tag explicitly
@end menu
@node starpu_tag_t
@subsection @code{starpu_tag_t} -- Task identifier
@table @asis
@item @emph{Description}:
It is possible to associate a task with a unique ``tag'' and to express
dependencies between tasks by the means of those tags. To do so, fill the
@code{tag_id} field of the @code{starpu_task} structure with a tag number (can
be arbitrary) and set the @code{use_tag} field to 1.
If @code{starpu_tag_declare_deps} is called with that tag number, the task will
not be started until the tasks which wear the declared dependency tags are
complete.
@end table
@node starpu_tag_declare_deps
@subsection @code{starpu_tag_declare_deps} -- Declare the Dependencies of a Tag
@table @asis
@item @emph{Description}:
Specify the dependencies of the task identified by tag @code{id}. The first
argument specifies the tag which is configured, the second argument gives the
number of tag(s) on which @code{id} depends. The following arguments are the
tags which have to be terminated to unlock the task.
This function must be called before the associated task is submitted to StarPU
with @code{starpu_task_submit}.
@item @emph{Remark}
Because of the variable arity of @code{starpu_tag_declare_deps}, note that the
last arguments @emph{must} be of type @code{starpu_tag_t}: constant values
typically need to be explicitly cast. Using the
@code{starpu_tag_declare_deps_array} function avoids this hazard.
@item @emph{Prototype}:
@code{void starpu_tag_declare_deps(starpu_tag_t id, unsigned ndeps, ...);}
@item @emph{Example}:
@example
@c @cartouche
/* Tag 0x1 depends on tags 0x32 and 0x52 */
starpu_tag_declare_deps((starpu_tag_t)0x1,
        2, (starpu_tag_t)0x32, (starpu_tag_t)0x52);
@c @end cartouche
@end example
@end table
@node starpu_tag_declare_deps_array
@subsection @code{starpu_tag_declare_deps_array} -- Declare the Dependencies of a Tag
@table @asis
@item @emph{Description}:
This function is similar to @code{starpu_tag_declare_deps}, except that it
does not take a variable number of arguments but an array of tags of size
@code{ndeps}.
@item @emph{Prototype}:
@code{void starpu_tag_declare_deps_array(starpu_tag_t id, unsigned ndeps, starpu_tag_t *array);}
@item @emph{Example}:
@example
/* Tag 0x1 depends on tags 0x32 and 0x52 */
starpu_tag_t tag_array[2] = @{0x32, 0x52@};
starpu_tag_declare_deps_array((starpu_tag_t)0x1, 2, tag_array);
@end example
@end table
@node starpu_tag_wait
@subsection @code{starpu_tag_wait} -- Block until a Tag is terminated
@table @asis
@item @emph{Description}:
This function blocks until the task associated to tag @code{id} has been
executed. This is a blocking call which must therefore not be called within
tasks or callbacks, but only from the application directly. It is possible to
synchronize with the same tag multiple times, as long as the
@code{starpu_tag_remove} function is not called. Note that it is still
possible to synchronize with a tag associated to a task whose @code{starpu_task}
data structure was freed (e.g. if the @code{destroy} flag of the
@code{starpu_task} was enabled).
@item @emph{Prototype}:
@code{void starpu_tag_wait(starpu_tag_t id);}
@end table
@node starpu_tag_wait_array
@subsection @code{starpu_tag_wait_array} -- Block until a set of Tags is terminated
@table @asis
@item @emph{Description}:
This function is similar to @code{starpu_tag_wait} except that it blocks until
@emph{all} the @code{ntags} tags contained in the @code{id} array are
terminated.
@item @emph{Prototype}:
@code{void starpu_tag_wait_array(unsigned ntags, starpu_tag_t *id);}
@end table
@node starpu_tag_remove
@subsection @code{starpu_tag_remove} -- Destroy a Tag
@table @asis
@item @emph{Description}:
This function releases the resources associated with tag @code{id}. It can be
called once the corresponding task has been executed and when there is no tag
that depends on that one anymore.
@item @emph{Prototype}:
@code{void starpu_tag_remove(starpu_tag_t id);}
@end table
@node starpu_tag_notify_from_apps
@subsection @code{starpu_tag_notify_from_apps} -- Feed a Tag explicitly
@table @asis
@item @emph{Description}:
This function explicitly unlocks tag @code{id}. It may be useful in the
case of applications which execute part of their computation outside StarPU
tasks (e.g. third-party libraries). It is also provided as a
convenient tool for the programmer, for instance to entirely construct the task
DAG before actually giving StarPU the opportunity to execute the tasks.
@item @emph{Prototype}:
@code{void starpu_tag_notify_from_apps(starpu_tag_t id);}
@end table
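As a sketch, the functions above can be combined to build a small task DAG in
which a task waits both for another task and for a notification from the
application (@code{task_a} and @code{task_b} are assumed to have been created
with @code{starpu_task_create} and given a codelet):
@example
/* task_b (tag 0x3) depends on tag 0x1 and on task_a (tag 0x2) */
starpu_tag_t deps[2] = @{(starpu_tag_t)0x1, (starpu_tag_t)0x2@};
starpu_tag_declare_deps_array((starpu_tag_t)0x3, 2, deps);

task_a->use_tag = 1;
task_a->tag_id = (starpu_tag_t)0x2;
task_b->use_tag = 1;
task_b->tag_id = (starpu_tag_t)0x3;

starpu_task_submit(task_a);
starpu_task_submit(task_b);

/* tag 0x1 is not worn by any task: unlock it explicitly */
starpu_tag_notify_from_apps((starpu_tag_t)0x1);

starpu_tag_wait((starpu_tag_t)0x3); /* wait for task_b */
@end example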
@node CUDA extensions
@section CUDA extensions
@c void starpu_data_malloc_pinned_if_possible(float **A, size_t dim);
@c starpu_helper_cublas_init TODO
@c starpu_helper_cublas_shutdown TODO
@menu
* starpu_cuda_get_local_stream::   Get current worker's CUDA stream
* starpu_helper_cublas_init::      Initialize CUBLAS on every CUDA device
* starpu_helper_cublas_shutdown::  Deinitialize CUBLAS on every CUDA device
@end menu
@node starpu_cuda_get_local_stream
@subsection @code{starpu_cuda_get_local_stream} -- Get current worker's CUDA stream
@table @asis
@item @emph{Description}:
StarPU provides a stream for every CUDA device controlled by StarPU. This
function is only provided for convenience so that programmers can easily use
asynchronous operations within codelets without having to create a stream by
hand. Note that the application is not forced to use the stream provided by
@code{starpu_cuda_get_local_stream} and may also create its own streams.
@item @emph{Prototype}:
@code{cudaStream_t *starpu_cuda_get_local_stream(void);}
@end table
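Within a @code{cuda_func}, the per-worker stream can for instance be used to
launch a kernel asynchronously; the sketch below assumes a hypothetical kernel
@code{my_kernel} and an arbitrary launch configuration:
@example
void my_cuda_func(void *buffers[], void *cl_arg)
@{
    cudaStream_t stream = *starpu_cuda_get_local_stream();

    /* launch the (hypothetical) kernel on the per-worker stream */
    my_kernel<<<1, 128, 0, stream>>>();

    /* wait for its completion before returning from the codelet */
    cudaStreamSynchronize(stream);
@}
@end example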
  956. @node starpu_helper_cublas_init
  957. @subsection @code{starpu_helper_cublas_init} -- Initialize CUBLAS on every CUDA device
  958. @table @asis
  959. @item @emph{Description}:
  960. The CUBLAS library must be initialized prior to any CUBLAS call. Calling
  961. @code{starpu_helper_cublas_init} will initialize CUBLAS on every CUDA device
  962. controlled by StarPU. This call blocks until CUBLAS has been properly
  963. initialized on every device.
  964. @item @emph{Prototype}:
  965. @code{void starpu_helper_cublas_init(void);}
  966. @end table
  967. @node starpu_helper_cublas_shutdown
  968. @subsection @code{starpu_helper_cublas_shutdown} -- Deinitialize CUBLAS on every CUDA device
  969. @table @asis
  970. @item @emph{Description}:
  971. This function synchronously deinitializes the CUBLAS library on every CUDA device.
  972. @item @emph{Prototype}:
  973. @code{void starpu_helper_cublas_shutdown(void);}
  974. @end table
@node Cell extensions
@section Cell extensions
Nothing yet.
@node Miscellaneous
@section Miscellaneous helpers
@menu
* starpu_execute_on_each_worker:: Execute a function on a subset of workers
@end menu
@node starpu_execute_on_each_worker
@subsection @code{starpu_execute_on_each_worker} -- Execute a function on a subset of workers
@table @asis
@item @emph{Description}:
When this function is called, the function specified by the first argument is
executed by every StarPU worker that is able to execute it.
The second argument is passed to that function.
The last argument specifies on which types of processing units the function
should be executed. Similarly to the @code{.where} field of the
@code{starpu_codelet} structure, it is possible to specify that the function
should be executed on every CUDA device and every CPU by passing
@code{STARPU_CPU|STARPU_CUDA}.
This function blocks until the function has been executed on every appropriate
processing unit, so it may not be called from a callback function, for
instance.
@item @emph{Prototype}:
@code{void starpu_execute_on_each_worker(void (*func)(void *), void *arg, uint32_t where);}
@end table
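For instance, the following sketch (with a hypothetical @code{init_func}) runs
an initialization function once on every CPU and every CUDA device controlled
by StarPU:
@example
void init_func(void *arg)
@{
    /* executed once by each selected worker */
    printf("worker initialized (arg %p)\n", arg);
@}

starpu_execute_on_each_worker(init_func, NULL, STARPU_CPU|STARPU_CUDA);
@end example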
@c ---------------------------------------------------------------------
@c Basic Examples
@c ---------------------------------------------------------------------
@node Basic Examples
@chapter Basic Examples
@menu
* Compiling and linking:: Compiling and Linking Options
* Hello World:: Submitting Tasks
* Scaling a Vector:: Manipulating Data
* Scaling a Vector (hybrid):: Handling Heterogeneous Architectures
@end menu
@node Compiling and linking
@section Compiling and linking options
A Makefile could for instance contain the following lines to define which
options must be given to the compiler and to the linker:
@example
@c @cartouche
CFLAGS+=$$(pkg-config --cflags libstarpu)
LIBS+=$$(pkg-config --libs libstarpu)
@c @end cartouche
@end example
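Outside a Makefile, the same flags can be obtained on the command line, for
instance to build a hypothetical @code{hello.c}:
@example
$ gcc hello.c -o hello $(pkg-config --cflags libstarpu) \
              $(pkg-config --libs libstarpu)
@end example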
@node Hello World
@section Hello World
In this section, we show how to implement a simple program that submits a task to StarPU.
@subsection Required Headers
The @code{starpu.h} header should be included in any code using StarPU.
@example
@c @cartouche
#include <starpu.h>
@c @end cartouche
@end example
@subsection Defining a Codelet
@example
@c @cartouche
void cpu_func(void *buffers[], void *cl_arg)
@{
    float *array = cl_arg;

    printf("Hello world (array = @{%f, %f@} )\n", array[0], array[1]);
@}

starpu_codelet cl =
@{
    .where = STARPU_CPU,
    .cpu_func = cpu_func,
    .nbuffers = 0
@};
@c @end cartouche
@end example
A codelet is a structure that represents a computational kernel. Such a codelet
may contain implementations of the same kernel for different architectures
(e.g. CUDA, Cell's SPU, x86, ...).
The ''@code{.nbuffers}'' field specifies the number of data buffers that are
manipulated by the codelet: here the codelet does not access or modify any data
that is controlled by our data management library. Note that the argument
passed to the codelet (the ''@code{.cl_arg}'' field of the @code{starpu_task}
structure) does not count as a buffer since it is not managed by our data
management library.
@c TODO need a crossref to the proper description of "where" see bla for more ...
We create a codelet which may only be executed on the CPUs. The ''@code{.where}''
field is a bitmask that defines where the codelet may be executed. Here, the
@code{STARPU_CPU} value means that only CPUs can execute this codelet
(@pxref{Codelets and Tasks} for more details on that field).
When a CPU core executes a codelet, it calls the @code{.cpu_func} function,
which @emph{must} have the following prototype:
@code{void (*cpu_func)(void *buffers[], void *cl_arg)}
In this example, we can ignore the first argument of this function, which gives a
description of the input and output buffers (e.g. the size and the location of
the matrices). The second argument is a pointer to a buffer passed as an
argument to the codelet by means of the ''@code{.cl_arg}'' field of the
@code{starpu_task} structure.
Be aware that this may be a pointer to a @emph{copy} of the actual buffer, and
not the pointer given by the programmer. If the codelet modifies this buffer,
there is no guarantee that the initial buffer will be modified as well; this
implies, for instance, that the buffer cannot be used as a synchronization
medium.
@subsection Submitting a Task
@example
@c @cartouche
void callback_func(void *callback_arg)
@{
    printf("Callback function (arg %p)\n", callback_arg);
@}

int main(int argc, char **argv)
@{
    /* initialize StarPU */
    starpu_init(NULL);

    struct starpu_task *task = starpu_task_create();

    task->cl = &cl;

    float array[2] = @{1.0f, -1.0f@};
    task->cl_arg = &array;
    task->cl_arg_size = 2*sizeof(float);

    task->callback_func = callback_func;
    task->callback_arg = (void *)0x42;

    /* starpu_task_submit will be a blocking call */
    task->synchronous = 1;

    /* submit the task to StarPU */
    starpu_task_submit(task);

    /* terminate StarPU */
    starpu_shutdown();

    return 0;
@}
@c @end cartouche
@end example
Before submitting any tasks to StarPU, @code{starpu_init} must be called. The
@code{NULL} argument specifies that we use the default configuration. No task
can be submitted after StarPU has been terminated by a call to
@code{starpu_shutdown}.
In the example above, a task structure is allocated by a call to
@code{starpu_task_create}. This function only allocates and fills the
corresponding structure with the default settings (@pxref{starpu_task_create}),
but it does not submit the task to StarPU.
The ''@code{.cl}'' field is a pointer to the codelet which the task will
execute: in other words, the codelet structure describes which computational
kernel should be offloaded on the different architectures, while the task
structure is a wrapper that associates a codelet with the data on which the
codelet should operate.
The optional ''@code{.cl_arg}'' field is a pointer to a buffer (of size
@code{.cl_arg_size}) with some parameters for the kernel
described by the codelet. For instance, if a codelet implements a computational
kernel that multiplies its input vector by a constant, the constant could be
specified by means of this buffer.
Once a task has been executed, an optional callback function can be called.
While the computational kernel could be offloaded on various architectures, the
callback function is always executed on a CPU. The ''@code{.callback_arg}''
pointer is passed as an argument to the callback. The prototype of a callback
function must be:
@example
void (*callback_function)(void *);
@end example
If the @code{.synchronous} field is non-zero, task submission will be
synchronous: the @code{starpu_task_submit} function will not return until the
task has been executed. Note that the @code{starpu_shutdown} method does not
guarantee that asynchronous tasks have been executed before it returns.
@node Scaling a Vector
@section Manipulating Data: Scaling a Vector
The previous example has shown how to submit tasks. In this section we show how
StarPU tasks can manipulate data.
Programmers can describe the data layout of their application so that StarPU is
responsible for enforcing data coherency and availability across the machine.
Instead of handling complex (and non-portable) mechanisms to perform data
movements, programmers only declare which piece of data is accessed and/or
modified by a task, and StarPU makes sure that when a computational kernel
starts somewhere (e.g. on a GPU), its data are available locally.
Before submitting such tasks, the programmer first needs to declare the
different pieces of data to StarPU using the @code{starpu_*_data_register}
functions. To ease the development of applications for StarPU, it is possible
to describe multiple types of data layout. A type of data layout is called an
@b{interface}. Several interfaces are predefined in StarPU; here we will
consider the @b{vector interface}.
The following lines show how to declare an array of @code{n} elements of type
@code{float} using the vector interface:
@example
float tab[n];

starpu_data_handle tab_handle;
starpu_vector_data_register(&tab_handle, 0, tab, n, sizeof(float));
@end example
The first argument, called the @b{data handle}, is an opaque pointer which
designates the array in StarPU. This is also the structure which is used to
describe which data is used by a task. The second argument is the node number
where the data currently resides. Here it is 0 since the @code{tab} array is in
the main memory. Then comes the pointer @code{tab} where the data can be found,
the number of elements in the vector and the size of each element.
It is possible to construct a StarPU
task that multiplies this vector by a constant factor:
@example
float factor = 3.14f;
struct starpu_task *task = starpu_task_create();

task->cl = &cl;
task->buffers[0].handle = tab_handle;
task->buffers[0].mode = STARPU_RW;
task->cl_arg = &factor;
task->cl_arg_size = sizeof(float);
@end example
Since the factor is a constant, it does not need a preliminary registration, and
can just be passed through the @code{cl_arg} pointer like in the previous
example. The vector parameter is described by its handle.
There are two fields in each element of the @code{buffers} array.
@code{.handle} is the handle of the data, and @code{.mode} specifies how the
kernel will access the data (@code{STARPU_R} for read-only, @code{STARPU_W} for
write-only and @code{STARPU_RW} for read and write access).
The definition of the codelet can be written as follows:
@example
void scal_func(void *buffers[], void *cl_arg)
@{
    unsigned i;
    float *factor = cl_arg;
    struct starpu_vector_interface_s *vector = buffers[0];

    /* length of the vector */
    unsigned n = vector->nx;

    /* local copy of the vector pointer */
    float *val = (float *)vector->ptr;

    for (i = 0; i < n; i++)
        val[i] *= *factor;
@}

starpu_codelet cl = @{
    .where = STARPU_CPU,
    .cpu_func = scal_func,
    .nbuffers = 1
@};
@end example
The second argument of the @code{scal_func} function contains a pointer to the
parameters of the codelet (given in @code{task->cl_arg}), so that we read the
constant factor from this pointer. The first argument is an array that gives
a description of every buffer passed in the @code{task->buffers} array. The
size of this array is given by the @code{.nbuffers} field of the codelet
structure. For the sake of generality, this array contains pointers to the
different interfaces describing each buffer. In the case of the @b{vector
interface}, the location of the vector (resp. its length) is accessible through
the @code{ptr} (resp. @code{nx}) field of that interface. Since the vector is
accessed in a read-write fashion, any modification will automatically affect
future accesses to that vector made by other tasks.
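Putting everything together, a minimal @code{main} function for this example
could look as follows (a sketch that only reuses the calls introduced above,
with an arbitrary vector size and factor):
@example
int main(int argc, char **argv)
@{
    float tab[] = @{1.0f, 2.0f, 3.0f@};
    float factor = 3.14f;

    starpu_init(NULL);

    starpu_data_handle tab_handle;
    starpu_vector_data_register(&tab_handle, 0, tab, 3, sizeof(float));

    struct starpu_task *task = starpu_task_create();
    task->cl = &cl;
    task->buffers[0].handle = tab_handle;
    task->buffers[0].mode = STARPU_RW;
    task->cl_arg = &factor;
    task->cl_arg_size = sizeof(float);
    task->synchronous = 1;

    starpu_task_submit(task);

    starpu_shutdown();
    return 0;
@}
@end example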
@node Scaling a Vector (hybrid)
@section Vector Scaling on a Hybrid CPU/GPU Machine
Contrary to the previous examples, the task submitted in this example may be
executed not only by CPUs, but also by a CUDA device.
TODO
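In the meantime, here is a sketch of what such a hybrid codelet could look
like, assuming a hypothetical CUDA kernel @code{vector_mult_cuda} (compiled
with @code{nvcc}) that performs the multiplication on the device:
@example
void scal_cuda_func(void *buffers[], void *cl_arg)
@{
    float *factor = cl_arg;
    struct starpu_vector_interface_s *vector = buffers[0];
    unsigned n = vector->nx;
    float *val = (float *)vector->ptr;
    cudaStream_t stream = *starpu_cuda_get_local_stream();

    vector_mult_cuda<<<1, 64, 0, stream>>>(n, val, *factor);
    cudaStreamSynchronize(stream);
@}

starpu_codelet cl = @{
    /* the codelet may run on both CPUs and CUDA devices */
    .where = STARPU_CPU|STARPU_CUDA,
    .cpu_func = scal_func,
    .cuda_func = scal_cuda_func,
    .nbuffers = 1
@};
@end example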
@c ---------------------------------------------------------------------
@c Advanced Topics
@c ---------------------------------------------------------------------
@node Advanced Topics
@chapter Advanced Topics
@bye