\input texinfo @c -*-texinfo-*-
@c %**start of header
@setfilename starpu.info
@settitle StarPU
@c %**end of header
@setchapternewpage odd
@titlepage
@title StarPU
@page
@vskip 0pt plus 1filll
@comment For the @value{version-GCC} Version*
@end titlepage
@summarycontents
@contents
@page
@node Top
@top Preface
@cindex Preface
This manual documents the usage of StarPU.
@comment
@comment When you add a new menu item, please keep the right hand
@comment aligned to the same column. Do not use tabs. This provides
@comment better formatting.
@comment
@menu
* Introduction:: A basic introduction to using StarPU
* Installing StarPU:: How to configure, build and install StarPU
* Using StarPU:: How to run a StarPU application
* Configuring StarPU:: How to configure StarPU
* StarPU API:: The API to use StarPU
* Basic Examples:: Basic examples of the use of StarPU
* Full source code for the 'Scaling a Vector' example::
@end menu
@c ---------------------------------------------------------------------
@c Introduction to StarPU
@c ---------------------------------------------------------------------
@node Introduction
@chapter Introduction to StarPU
@menu
* Motivation:: Why StarPU?
* StarPU in a Nutshell:: The Fundamentals of StarPU
@end menu
@node Motivation
@section Motivation
@c complex machines with heterogeneous cores/devices
The use of specialized hardware such as accelerators or coprocessors offers an
interesting approach to overcome the physical limits encountered by processor
architects. As a result, many machines are now equipped with one or several
accelerators (e.g. a GPU), in addition to the usual processor(s). While a lot of
effort has been devoted to offloading computation onto such accelerators, very
little attention has been paid to portability concerns on the one hand, and to the
possibility of having heterogeneous accelerators and processors interact on the other hand.
StarPU is a runtime system that offers support for heterogeneous multicore
architectures. It not only offers a unified view of the computational resources
(i.e. CPUs and accelerators at the same time), but it also takes care of
efficiently mapping and executing tasks onto a heterogeneous machine while
transparently handling low-level issues in a portable fashion.
@c this leads to a complicated distributed memory design
@c which is not (easily) manageable by hand
@c added value/benefits of StarPU
@c - portability
@c - scheduling, perf. portability
@node StarPU in a Nutshell
@section StarPU in a Nutshell
@menu
* Codelet and Tasks::
* StarPU Data Management Library::
@end menu
From a programming point of view, StarPU is not a new language but a library
that executes tasks explicitly submitted by the application. The data that a
task manipulates are automatically transferred onto the accelerator so that the
programmer does not have to take care of complex data movements. StarPU also
takes particular care of scheduling those tasks efficiently and allows
scheduling experts to implement custom scheduling policies in a portable
fashion.
@c explain the notion of codelet and task (i.e. g(A, B)
@node Codelet and Tasks
@subsection Codelet and Tasks
One of StarPU's primary data structures is the @b{codelet}. A codelet describes a
computational kernel that can possibly be implemented on multiple architectures
such as a CPU, a CUDA device or a Cell SPU.
@c TODO insert illustration f : f_spu, f_cpu, ...
Another important data structure is the @b{task}. Executing a StarPU task
consists of applying a codelet to a data set, on one of the architectures on
which the codelet is implemented. Besides its codelet, a task also describes
which data are accessed, and how they are
accessed during the computation (read and/or write).
StarPU tasks are asynchronous: submitting a task to StarPU is a non-blocking
operation. The task structure can also specify a @b{callback} function that is
called once StarPU has properly executed the task. It also contains optional
fields that the application may use to give hints to the scheduler (such as
priority levels).
A task may be identified by a unique 64-bit number which we refer to as a @b{tag}.
Task dependencies can be enforced either by the means of callback functions, or
by expressing dependencies between tags.
@c TODO insert illustration f(Ar, Brw, Cr) + ..
@c DSM
@node StarPU Data Management Library
@subsection StarPU Data Management Library
Because StarPU schedules tasks at runtime, data transfers have to be
done automatically and ``just-in-time'' between processing units,
relieving the application programmer from explicit data transfers.
Moreover, to avoid unnecessary transfers, StarPU keeps data
where it was last needed, even if it was modified there, and it
allows multiple copies of the same data to reside at the same time on
several processing units as long as it is not modified.
@c ---------------------------------------------------------------------
@c Installing StarPU
@c ---------------------------------------------------------------------
@node Installing StarPU
@chapter Installing StarPU
@menu
* Configuration of StarPU::
* Building and Installing StarPU::
@end menu
StarPU can be built and installed by the standard means of the GNU
autotools. This chapter briefly shows how these tools
can be used to install StarPU.
@node Configuration of StarPU
@section Configuration of StarPU
@menu
* Generating Makefiles and configuration scripts::
* Running the configuration::
@end menu
@node Generating Makefiles and configuration scripts
@subsection Generating Makefiles and configuration scripts
This step is not necessary when using the tarball releases of StarPU. If you
are using the source code from the svn repository, you first need to generate
the configure scripts and the Makefiles.
@example
% autoreconf -vfi
@end example
@node Running the configuration
@subsection Running the configuration
@example
% ./configure
@end example
Details about options that are useful to give to @code{./configure} are given in
@ref{Compilation configuration}.
@node Building and Installing StarPU
@section Building and Installing StarPU
@menu
* Building::
* Sanity Checks::
* Installing::
@end menu
@node Building
@subsection Building
@example
% make
@end example
@node Sanity Checks
@subsection Sanity Checks
In order to make sure that StarPU is working properly on the system, it is also
possible to run a test suite.
@example
% make check
@end example
@node Installing
@subsection Installing
In order to install StarPU at the location that was specified during
configuration:
@example
% make install
@end example
@c ---------------------------------------------------------------------
@c Using StarPU
@c ---------------------------------------------------------------------
@node Using StarPU
@chapter Using StarPU
@menu
* Setting flags for compiling and linking applications::
* Running a basic StarPU application::
@end menu
@node Setting flags for compiling and linking applications
@section Setting flags for compiling and linking applications
Compiling and linking an application against StarPU may require the use of
specific flags or libraries (for instance @code{CUDA} or @code{libspe2}).
To this end, it is possible to use the @code{pkg-config} tool.
If StarPU was not installed at some standard location, the path of StarPU's
library must be specified in the @code{PKG_CONFIG_PATH} environment variable so
that @code{pkg-config} can find it. For example, if StarPU was installed in
@code{$prefix_dir}:
@example
% export PKG_CONFIG_PATH=$PKG_CONFIG_PATH:$prefix_dir/lib/pkgconfig
@end example
The flags required to compile or link against StarPU are then
accessible with the following commands:
@example
% pkg-config --cflags libstarpu # options for the compiler
% pkg-config --libs libstarpu # options for the linker
@end example
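For instance, a hypothetical @code{my_app.c} program (the file name is only
illustrative) could be compiled and linked in one step as follows:
@example
% gcc my_app.c -o my_app $(pkg-config --cflags libstarpu) \
      $(pkg-config --libs libstarpu)
@end example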
@node Running a basic StarPU application
@section Running a basic StarPU application
Basic examples using StarPU have been built in the directory
@code{$prefix_dir/lib/starpu/examples/}. You can for example run the
example @code{vector_scal}.
@example
% $prefix_dir/lib/starpu/examples/vector_scal
BEFORE : First element was 1.000000
AFTER First element is 3.140000
%
@end example
@c ---------------------------------------------------------------------
@c Configuration options
@c ---------------------------------------------------------------------
@node Configuring StarPU
@chapter Configuring StarPU
@menu
* Compilation configuration::
* Execution configuration through environment variables::
@end menu
@node Compilation configuration
@section Compilation configuration
The following arguments can be given to the @code{configure} script.
@menu
* Common configuration::
* Configuring workers::
* Advanced configuration::
@end menu
@node Common configuration
@subsection Common configuration
@menu
* --enable-debug::
* --enable-fast::
* --enable-verbose::
* --enable-coverage::
@end menu
@node --enable-debug
@subsubsection @code{--enable-debug}
@table @asis
@item @emph{Description}:
Enable debugging messages.
@end table
@node --enable-fast
@subsubsection @code{--enable-fast}
@table @asis
@item @emph{Description}:
Do not enforce assertions; this saves a lot of time that would otherwise be spent computing them.
@end table
@node --enable-verbose
@subsubsection @code{--enable-verbose}
@table @asis
@item @emph{Description}:
Increase the verbosity of the debugging messages.
@end table
@node --enable-coverage
@subsubsection @code{--enable-coverage}
@table @asis
@item @emph{Description}:
Enable flags for the coverage tool.
@end table
@node Configuring workers
@subsection Configuring workers
@menu
* --disable-cpu::
* --enable-maxcudadev::
* --disable-cuda::
* --with-cuda-dir::
* --enable-maxopencldev::
* --disable-opencl::
* --with-opencl-dir::
* --enable-gordon::
* --with-gordon-dir::
@end menu
@node --disable-cpu
@subsubsection @code{--disable-cpu}
@table @asis
@item @emph{Description}:
Disable the use of the machine's CPUs; only GPUs and other accelerators will be used.
@end table
@node --enable-maxcudadev
@subsubsection @code{--enable-maxcudadev=<number>}
@table @asis
@item @emph{Description}:
Defines the maximum number of CUDA devices that StarPU will support, then
available as the @code{STARPU_MAXCUDADEVS} macro.
@end table
@node --disable-cuda
@subsubsection @code{--disable-cuda}
@table @asis
@item @emph{Description}:
Disable the use of CUDA, even if a valid CUDA installation was detected.
@end table
@node --with-cuda-dir
@subsubsection @code{--with-cuda-dir=<path>}
@table @asis
@item @emph{Description}:
Specify the directory where CUDA is installed. This directory should notably contain
@code{include/cuda.h}.
@end table
@node --enable-maxopencldev
@subsubsection @code{--enable-maxopencldev=<number>}
@table @asis
@item @emph{Description}:
Defines the maximum number of OpenCL devices that StarPU will support, then
available as the @code{STARPU_MAXOPENCLDEVS} macro.
@end table
@node --disable-opencl
@subsubsection @code{--disable-opencl}
@table @asis
@item @emph{Description}:
Disable the use of OpenCL, even if the SDK is detected.
@end table
@node --with-opencl-dir
@subsubsection @code{--with-opencl-dir=<path>}
@table @asis
@item @emph{Description}:
Specify the location of the OpenCL SDK. This directory should notably contain
@code{include/CL/cl.h}.
@end table
@node --enable-gordon
@subsubsection @code{--enable-gordon}
@table @asis
@item @emph{Description}:
Enable the use of the Gordon runtime for Cell SPUs.
@c TODO: rather default to enabled when detected
@end table
@node --with-gordon-dir
@subsubsection @code{--with-gordon-dir=<path>}
@table @asis
@item @emph{Description}:
Specify the location of the Gordon SDK.
@end table
@node Advanced configuration
@subsection Advanced configuration
@menu
* --enable-perf-debug::
* --enable-model-debug::
* --enable-stats::
* --enable-maxbuffers::
* --enable-allocation-cache::
* --enable-opengl-render::
* --enable-blas-lib::
* --with-magma::
* --with-fxt::
* --with-perf-model-dir::
* --with-mpicc::
* --with-mpi::
* --with-goto-dir::
* --with-atlas-dir::
@end menu
@node --enable-perf-debug
@subsubsection @code{--enable-perf-debug}
@table @asis
@item @emph{Description}:
Enable performance debugging.
@end table
@node --enable-model-debug
@subsubsection @code{--enable-model-debug}
@table @asis
@item @emph{Description}:
Enable performance model debugging.
@end table
@node --enable-stats
@subsubsection @code{--enable-stats}
@table @asis
@item @emph{Description}:
Enable statistics.
@end table
@node --enable-maxbuffers
@subsubsection @code{--enable-maxbuffers=<nbuffers>}
@table @asis
@item @emph{Description}:
Define the maximum number of buffers that tasks will be able to take
as parameters, then available as the @code{STARPU_NMAXBUFS} macro.
@end table
@node --enable-allocation-cache
@subsubsection @code{--enable-allocation-cache}
@table @asis
@item @emph{Description}:
Enable the use of a data allocation cache to avoid the cost of repeated memory
allocations with CUDA. Still experimental.
@end table
@node --enable-opengl-render
@subsubsection @code{--enable-opengl-render}
@table @asis
@item @emph{Description}:
Enable the use of OpenGL for the rendering of some examples.
@c TODO: rather default to enabled when detected
@end table
@node --enable-blas-lib
@subsubsection @code{--enable-blas-lib=<name>}
@table @asis
@item @emph{Description}:
Specify the BLAS library to be used by some of the examples. The
library has to be 'atlas' or 'goto'.
@end table
@node --with-magma
@subsubsection @code{--with-magma=<path>}
@table @asis
@item @emph{Description}:
Specify where MAGMA is installed.
@end table
@node --with-fxt
@subsubsection @code{--with-fxt=<path>}
@table @asis
@item @emph{Description}:
Specify the location of FxT (for generating traces and rendering them
using ViTE). This directory should notably contain
@code{include/fxt/fxt.h}.
@end table
@node --with-perf-model-dir
@subsubsection @code{--with-perf-model-dir=<dir>}
@table @asis
@item @emph{Description}:
Specify where performance models should be stored (instead of defaulting to the
current user's home).
@end table
@node --with-mpicc
@subsubsection @code{--with-mpicc=<path to mpicc>}
@table @asis
@item @emph{Description}:
Specify the location of the @code{mpicc} compiler to be used for starpumpi.
@c TODO: also just use AC_PROG
@end table
@node --with-mpi
@subsubsection @code{--with-mpi}
@table @asis
@item @emph{Description}:
Enable building libstarpumpi.
@c TODO: rather just use the availability of mpicc instead of a second option
@end table
@node --with-goto-dir
@subsubsection @code{--with-goto-dir=<dir>}
@table @asis
@item @emph{Description}:
Specify the location of GotoBLAS.
@end table
@node --with-atlas-dir
@subsubsection @code{--with-atlas-dir=<dir>}
@table @asis
@item @emph{Description}:
Specify the location of ATLAS. This directory should notably contain
@code{include/cblas.h}.
@end table
@c ---------------------------------------------------------------------
@c Environment variables
@c ---------------------------------------------------------------------
@node Execution configuration through environment variables
@section Execution configuration through environment variables
@menu
* Workers:: Configuring workers
* Scheduling:: Configuring the Scheduling engine
* Misc:: Miscellaneous and debug
@end menu
Note: the values given in the @code{starpu_conf} structure passed when
calling @code{starpu_init} will override the values of the environment
variables.
@node Workers
@subsection Configuring workers
@menu
* STARPU_NCPUS:: Number of CPU workers
* STARPU_NCUDA:: Number of CUDA workers
* STARPU_NOPENCL:: Number of OpenCL workers
* STARPU_NGORDON:: Number of SPU workers (Cell)
* STARPU_WORKERS_CPUID:: Bind workers to specific CPUs
* STARPU_WORKERS_CUDAID:: Select specific CUDA devices
* STARPU_WORKERS_OPENCLID:: Select specific OpenCL devices
@end menu
@node STARPU_NCPUS
@subsubsection @code{STARPU_NCPUS} -- Number of CPU workers
@table @asis
@item @emph{Description}:
Specify the maximum number of CPU workers. Note that StarPU will not allocate
more CPUs than there are physical CPUs, and that some CPUs are used to control
the accelerators.
@end table
@node STARPU_NCUDA
@subsubsection @code{STARPU_NCUDA} -- Number of CUDA workers
@table @asis
@item @emph{Description}:
Specify the maximum number of CUDA devices that StarPU can use. If
@code{STARPU_NCUDA} is lower than the number of physical devices, it is
possible to select which CUDA devices should be used by the means of the
@code{STARPU_WORKERS_CUDAID} environment variable.
@end table
@node STARPU_NOPENCL
@subsubsection @code{STARPU_NOPENCL} -- Number of OpenCL workers
@table @asis
@item @emph{Description}:
OpenCL equivalent of the @code{STARPU_NCUDA} environment variable.
@end table
@node STARPU_NGORDON
@subsubsection @code{STARPU_NGORDON} -- Number of SPU workers (Cell)
@table @asis
@item @emph{Description}:
Specify the maximum number of SPUs that StarPU can use.
@end table
@node STARPU_WORKERS_CPUID
@subsubsection @code{STARPU_WORKERS_CPUID} -- Bind workers to specific CPUs
@table @asis
@item @emph{Description}:
Passing an array of integers (starting from 0) in @code{STARPU_WORKERS_CPUID}
specifies on which logical CPU the different workers should be
bound. For instance, if @code{STARPU_WORKERS_CPUID = "1 3 0 2"}, the first
worker will be bound to logical CPU #1, the second CPU worker will be bound to
logical CPU #3 and so on. Note that the logical ordering of the CPUs is either
determined by the OS, or provided by the @code{hwloc} library in case it is
available.
Note that the first workers correspond to the CUDA workers, then come the
OpenCL and the SPU, and finally the CPU workers. For example if
we have @code{STARPU_NCUDA=1}, @code{STARPU_NOPENCL=1}, @code{STARPU_NCPUS=2}
and @code{STARPU_WORKERS_CPUID = "0 2 1 3"}, the CUDA device will be controlled
by logical CPU #0, the OpenCL device will be controlled by logical CPU #2, and
the logical CPUs #1 and #3 will be used by the CPU workers.
If the number of workers is larger than the array given in
@code{STARPU_WORKERS_CPUID}, the workers are bound to the logical CPUs in a
round-robin fashion: if @code{STARPU_WORKERS_CPUID = "0 1"}, the first and the
third (resp. second and fourth) workers will be put on CPU #0 (resp. CPU #1).
@end table
@node STARPU_WORKERS_CUDAID
@subsubsection @code{STARPU_WORKERS_CUDAID} -- Select specific CUDA devices
@table @asis
@item @emph{Description}:
Similarly to the @code{STARPU_WORKERS_CPUID} environment variable, it is
possible to select which CUDA devices should be used by StarPU. On a machine
equipped with 4 GPUs, setting @code{STARPU_WORKERS_CUDAID = "1 3"} and
@code{STARPU_NCUDA=2} specifies that 2 CUDA workers should be created, and that
they should use CUDA devices #1 and #3 (the logical ordering of the devices is
the one reported by CUDA).
@end table
@node STARPU_WORKERS_OPENCLID
@subsubsection @code{STARPU_WORKERS_OPENCLID} -- Select specific OpenCL devices
@table @asis
@item @emph{Description}:
OpenCL equivalent of the @code{STARPU_WORKERS_CUDAID} environment variable.
@end table
@node Scheduling
@subsection Configuring the Scheduling engine
@menu
* STARPU_SCHED:: Scheduling policy
* STARPU_CALIBRATE:: Calibrate performance models
* STARPU_PREFETCH:: Use data prefetch
* STARPU_SCHED_ALPHA:: Computation factor
* STARPU_SCHED_BETA:: Communication factor
@end menu
@node STARPU_SCHED
@subsubsection @code{STARPU_SCHED} -- Scheduling policy
@table @asis
@item @emph{Description}:
This chooses between the different scheduling policies proposed by StarPU: work
stealing, random, greedy, with performance models, etc.
Use @code{STARPU_SCHED=help} to get the list of available schedulers.
@end table
@node STARPU_CALIBRATE
@subsubsection @code{STARPU_CALIBRATE} -- Calibrate performance models
@table @asis
@item @emph{Description}:
If this variable is set to 1, the performance models are calibrated during
the execution. If it is set to 2, the previous values are dropped to restart
calibration from scratch.
Note: this currently only applies to the dm and dmda scheduling policies.
@end table
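For instance, one might run the @code{vector_scal} example with the
@code{dmda} policy and calibration enabled as follows:
@example
% STARPU_SCHED=dmda STARPU_CALIBRATE=1 ./vector_scal
@end example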
@node STARPU_PREFETCH
@subsubsection @code{STARPU_PREFETCH} -- Use data prefetch
@table @asis
@item @emph{Description}:
If this variable is set, data prefetching will be enabled: when a task is
scheduled to be executed, e.g. on a GPU, StarPU will request an asynchronous
transfer in advance, so that data is already present on the GPU when the task
starts. As a result, computation and data transfers are overlapped.
@end table
@node STARPU_SCHED_ALPHA
@subsubsection @code{STARPU_SCHED_ALPHA} -- Computation factor
@table @asis
@item @emph{Description}:
To estimate the cost of a task StarPU takes into account the estimated
computation time (obtained thanks to performance models). The alpha factor is
the coefficient to be applied to it before adding it to the communication part.
@end table
@node STARPU_SCHED_BETA
@subsubsection @code{STARPU_SCHED_BETA} -- Communication factor
@table @asis
@item @emph{Description}:
To estimate the cost of a task StarPU takes into account the estimated
data transfer time (obtained thanks to performance models). The beta factor is
the coefficient to be applied to it before adding it to the computation part.
@end table
@node Misc
@subsection Miscellaneous and debug
@menu
* STARPU_LOGFILENAME:: Select debug file name
@end menu
@node STARPU_LOGFILENAME
@subsubsection @code{STARPU_LOGFILENAME} -- Select debug file name
@table @asis
@item @emph{Description}:
This variable specifies the file in which the debugging output should be saved.
@end table
@c ---------------------------------------------------------------------
@c StarPU API
@c ---------------------------------------------------------------------
@node StarPU API
@chapter StarPU API
@menu
* Initialization and Termination:: Initialization and Termination methods
* Workers' Properties:: Methods to enumerate workers' properties
* Data Library:: Methods to manipulate data
* Codelets and Tasks:: Methods to construct tasks
* Tags:: Task dependencies
* Profiling API:: Profiling API
* CUDA extensions:: CUDA extensions
* OpenCL extensions:: OpenCL extensions
* Cell extensions:: Cell extensions
* Miscellaneous helpers::
@end menu
@node Initialization and Termination
@section Initialization and Termination
@menu
* starpu_init:: Initialize StarPU
* struct starpu_conf:: StarPU runtime configuration
* starpu_shutdown:: Terminate StarPU
@end menu
@node starpu_init
@subsection @code{starpu_init} -- Initialize StarPU
@table @asis
@item @emph{Description}:
This is the StarPU initialization method, which must be called prior to any other
StarPU call. It is possible to specify StarPU's configuration (e.g. scheduling
policy, number of cores, ...) by passing a non-null argument. The default
configuration is used if the passed argument is @code{NULL}.
@item @emph{Return value}:
Upon successful completion, this function returns 0. Otherwise, @code{-ENODEV}
indicates that no worker was available (so that StarPU was not initialized).
@item @emph{Prototype}:
@code{int starpu_init(struct starpu_conf *conf);}
@end table
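As an illustration, here is a minimal sketch of an application skeleton using
the default configuration (the error handling is deliberately simple):
@example
#include <errno.h>
#include <starpu.h>

int main(void)
@{
        /* Initialize StarPU with the default configuration */
        int ret = starpu_init(NULL);
        if (ret == -ENODEV)
                return 1; /* no worker is available */

        /* ... declare data and submit tasks here ... */

        /* Terminate StarPU before exiting */
        starpu_shutdown();
        return 0;
@}
@end example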
@node struct starpu_conf
@subsection @code{struct starpu_conf} -- StarPU runtime configuration
@table @asis
@item @emph{Description}:
This structure is passed to the @code{starpu_init} function in order
to configure StarPU.
When a field is set to its default value, StarPU automatically selects the number
of processing units and uses the default scheduling policy. The values set in
this structure override the equivalent environment variables.
@item @emph{Fields}:
@table @asis
@item @code{sched_policy_name} (default = NULL):
This is the name of the scheduling policy. This can also be specified with the
@code{STARPU_SCHED} environment variable.
@item @code{sched_policy} (default = NULL):
This is the definition of the scheduling policy. This field is ignored
if @code{sched_policy_name} is set.
@item @code{ncpus} (default = -1):
This is the maximum number of CPU cores that StarPU can use. This can also be
specified with the @code{STARPU_NCPUS} environment variable.
@item @code{ncuda} (default = -1):
This is the maximum number of CUDA devices that StarPU can use. This can also be
specified with the @code{STARPU_NCUDA} environment variable.
@item @code{nopencl} (default = -1):
This is the maximum number of OpenCL devices that StarPU can use. This can also be
specified with the @code{STARPU_NOPENCL} environment variable.
@item @code{nspus} (default = -1):
This is the maximum number of Cell SPUs that StarPU can use. This can also be
specified with the @code{STARPU_NGORDON} environment variable.
@item @code{use_explicit_workers_bindid} (default = 0):
If this flag is set, the @code{workers_bindid} array indicates where the
different workers are bound, otherwise StarPU automatically selects where to
bind the different workers unless the @code{STARPU_WORKERS_CPUID} environment
variable is set. The @code{STARPU_WORKERS_CPUID} environment variable is
ignored if the @code{use_explicit_workers_bindid} flag is set.
@item @code{workers_bindid[STARPU_NMAXWORKERS]}:
If the @code{use_explicit_workers_bindid} flag is set, this array indicates
where to bind the different workers. The i-th entry of the
@code{workers_bindid} indicates the logical identifier of the processor which
should execute the i-th worker. Note that the logical ordering of the CPUs is
either determined by the OS, or provided by the @code{hwloc} library in case it
is available.
@item @code{use_explicit_workers_cuda_gpuid} (default = 0):
@item @code{workers_cuda_gpuid[STARPU_NMAXWORKERS]}:
@item @code{use_explicit_workers_opencl_gpuid} (default = 0):
@item @code{workers_opencl_gpuid[STARPU_NMAXWORKERS]}:
These fields are explained in @ref{STARPU_WORKERS_CPUID}.
@item @code{calibrate} (default = 0):
If this flag is set, StarPU will calibrate the performance models when
executing tasks. This can also be specified with the @code{STARPU_CALIBRATE}
environment variable.
@end table
@end table
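The sketch below shows how the structure might be filled before calling
@code{starpu_init}; the chosen policy name and worker counts are purely
illustrative:
@example
#include <string.h>
#include <starpu.h>

int init_with_conf(void)
@{
        struct starpu_conf conf;
        memset(&conf, 0, sizeof(conf));    /* zero the structure first */
        conf.sched_policy_name = "greedy"; /* illustrative policy name */
        conf.ncpus = 2;     /* use at most two CPU cores */
        conf.ncuda = -1;    /* let StarPU select the number of CUDA devices */
        conf.nopencl = -1;  /* same for OpenCL devices */
        conf.nspus = -1;    /* same for Cell SPUs */
        conf.calibrate = 1; /* calibrate performance models during execution */
        return starpu_init(&conf);
@}
@end example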
@node starpu_shutdown
@subsection @code{starpu_shutdown} -- Terminate StarPU
@table @asis
@item @emph{Description}:
This is the StarPU termination method. It must be called at the end of the
application: statistics and other post-mortem debugging information are not
guaranteed to be available until this method has been called.
@item @emph{Prototype}:
@code{void starpu_shutdown(void);}
@end table
@node Workers' Properties
@section Workers' Properties
@menu
* starpu_worker_get_count:: Get the number of processing units
* starpu_cpu_worker_get_count:: Get the number of CPUs controlled by StarPU
* starpu_cuda_worker_get_count:: Get the number of CUDA devices controlled by StarPU
* starpu_opencl_worker_get_count:: Get the number of OpenCL devices controlled by StarPU
* starpu_spu_worker_get_count:: Get the number of Cell SPUs controlled by StarPU
* starpu_worker_get_id:: Get the identifier of the current worker
* starpu_worker_get_type:: Get the type of processing unit associated to a worker
* starpu_worker_get_name:: Get the name of a worker
@end menu
@node starpu_worker_get_count
@subsection @code{starpu_worker_get_count} -- Get the number of processing units
@table @asis
@item @emph{Description}:
This function returns the number of workers (i.e. processing units executing
StarPU tasks). The returned value should be at most @code{STARPU_NMAXWORKERS}.
@item @emph{Prototype}:
@code{unsigned starpu_worker_get_count(void);}
@end table
@node starpu_cpu_worker_get_count
@subsection @code{starpu_cpu_worker_get_count} -- Get the number of CPUs controlled by StarPU
@table @asis
@item @emph{Description}:
This function returns the number of CPUs controlled by StarPU. The returned
value should be at most @code{STARPU_NMAXCPUS}.
@item @emph{Prototype}:
@code{unsigned starpu_cpu_worker_get_count(void);}
@end table
@node starpu_cuda_worker_get_count
@subsection @code{starpu_cuda_worker_get_count} -- Get the number of CUDA devices controlled by StarPU
@table @asis
@item @emph{Description}:
This function returns the number of CUDA devices controlled by StarPU. The returned
value should be at most @code{STARPU_MAXCUDADEVS}.
@item @emph{Prototype}:
@code{unsigned starpu_cuda_worker_get_count(void);}
@end table
@node starpu_opencl_worker_get_count
@subsection @code{starpu_opencl_worker_get_count} -- Get the number of OpenCL devices controlled by StarPU
@table @asis
@item @emph{Description}:
This function returns the number of OpenCL devices controlled by StarPU. The returned
value should be at most @code{STARPU_MAXOPENCLDEVS}.
@item @emph{Prototype}:
@code{unsigned starpu_opencl_worker_get_count(void);}
@end table
@node starpu_spu_worker_get_count
@subsection @code{starpu_spu_worker_get_count} -- Get the number of Cell SPUs controlled by StarPU
@table @asis
@item @emph{Description}:
This function returns the number of Cell SPUs controlled by StarPU.
@item @emph{Prototype}:
@code{unsigned starpu_spu_worker_get_count(void);}
@end table
@node starpu_worker_get_id
@subsection @code{starpu_worker_get_id} -- Get the identifier of the current worker
@table @asis
@item @emph{Description}:
This function returns the identifier of the worker associated to the calling
thread. The returned value is either -1 if the current context is not a StarPU
worker (i.e. when called from the application outside a task or a callback), or
an integer between 0 and @code{starpu_worker_get_count() - 1}.
@item @emph{Prototype}:
@code{int starpu_worker_get_id(void);}
@end table
@node starpu_worker_get_type
@subsection @code{starpu_worker_get_type} -- Get the type of processing unit associated to a worker
@table @asis
@item @emph{Description}:
This function returns the type of worker associated to an identifier (as
returned by the @code{starpu_worker_get_id} function). The returned value
indicates the architecture of the worker: @code{STARPU_CPU_WORKER} for a CPU
core, @code{STARPU_CUDA_WORKER} for a CUDA device,
@code{STARPU_OPENCL_WORKER} for an OpenCL device, and
@code{STARPU_GORDON_WORKER} for a Cell SPU. The value returned for an invalid
identifier is unspecified.
@item @emph{Prototype}:
@code{enum starpu_archtype starpu_worker_get_type(int id);}
@end table
@node starpu_worker_get_name
@subsection @code{starpu_worker_get_name} -- Get the name of a worker
@table @asis
@item @emph{Description}:
StarPU associates a unique human-readable string to each processing unit. This
function copies at most the first @code{maxlen} bytes of the unique string
associated to the worker identified by @code{id} into the
@code{dst} buffer. The caller is responsible for ensuring that @code{dst}
is a valid pointer to a buffer of at least @code{maxlen} bytes. Calling this
function on an invalid identifier results in unspecified behaviour.
@item @emph{Prototype}:
@code{void starpu_worker_get_name(int id, char *dst, size_t maxlen);}
@end table
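Combining the functions above, a sketch that enumerates all the workers and
prints their names could look as follows:
@example
#include <stdio.h>
#include <starpu.h>

static void list_workers(void)
@{
        char name[64];
        unsigned id, nworkers = starpu_worker_get_count();
        for (id = 0; id < nworkers; id++)
        @{
                /* copy at most sizeof(name) bytes of the worker name */
                starpu_worker_get_name((int)id, name, sizeof(name));
                printf("worker %u: %s\n", id, name);
        @}
@}
@end example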
@node Data Library
@section Data Library
This section describes the data management facilities provided by StarPU.
TODO: We show how to use existing data interfaces in [ref], but developers can
design their own data interfaces if required.
@menu
* starpu_data_handle:: StarPU opaque data handle
* void *interface:: StarPU data interface
* starpu_XXX_data_register::
* starpu_data_unregister::
@end menu
@node starpu_data_handle
@subsection @code{starpu_data_handle} -- StarPU opaque data handle
@table @asis
@item @emph{Description}:
StarPU uses @code{starpu_data_handle} as an opaque handle to manage a piece of
data. Once a piece of data has been registered to StarPU, it is associated to a
@code{starpu_data_handle} which keeps track of the state of the piece of data
over the entire machine, so that we can maintain data consistency and locate
data replicates for instance.
@end table
@node void *interface
@subsection @code{void *interface} -- StarPU data interface
@table @asis
@item @emph{Description}:
Data management is done at a high level in StarPU: rather than accessing a mere
list of contiguous buffers, the tasks may manipulate data that are described by
a high-level construct which we call a data interface.
TODO
@end table
@node starpu_XXX_data_register
@subsection @code{starpu_XXX_data_register} -- Register data to StarPU
@table @asis
@end table
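As an illustration, here is a sketch that registers a vector of floats with
the vector variant of these registration functions (the variant used by the
'Scaling a Vector' example); memory node 0 designates main memory:
@example
#include <stdint.h>
#include <starpu.h>

#define NX 1024

static float vector[NX];
static starpu_data_handle vector_handle;

static void register_vector(void)
@{
        /* node 0 is main memory; NX elements of sizeof(float) bytes each */
        starpu_vector_data_register(&vector_handle, 0,
                (uintptr_t)vector, NX, sizeof(vector[0]));
@}
@end example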
@node starpu_data_unregister
@subsection @code{starpu_data_unregister} -- Unregister data from StarPU
@table @asis
@item @emph{Description}:
This function unregisters the piece of data associated to @code{handle} from StarPU.
@item @emph{Prototype}:
@code{void starpu_data_unregister(starpu_data_handle handle);}
@end table
@c starpu_worker_get_memory_node TODO
@c
@c user interaction with the DSM
@c void starpu_data_sync_with_mem(struct starpu_data_state_t *state);
@c void starpu_notify_data_modification(struct starpu_data_state_t *state, uint32_t modifying_node);
@node Codelets and Tasks
@section Codelets and Tasks
@menu
* struct starpu_codelet:: StarPU codelet structure
* struct starpu_task:: StarPU task structure
* starpu_task_init:: Initialize a Task
* starpu_task_create:: Allocate and Initialize a Task
* starpu_task_deinit:: Release all the resources used by a Task
* starpu_task_destroy:: Destroy a dynamically allocated Task
* starpu_task_wait:: Wait for the termination of a Task
* starpu_task_submit:: Submit a Task
* starpu_task_wait_for_all:: Wait for the termination of all Tasks
@end menu
@node struct starpu_codelet
@subsection @code{struct starpu_codelet} -- StarPU codelet structure
@table @asis
@item @emph{Description}:
The codelet structure describes a kernel that is possibly implemented on
various targets.
@item @emph{Fields}:
@table @asis
@item @code{where}:
Indicates which types of processing units are able to execute the codelet.
@code{STARPU_CPU|STARPU_CUDA} for instance indicates that the codelet is
implemented for both CPU cores and CUDA devices while @code{STARPU_GORDON}
indicates that it is only available on Cell SPUs.
@item @code{cpu_func} (optional):
Is a function pointer to the CPU implementation of the codelet. Its prototype
must be: @code{void cpu_func(void *buffers[], void *cl_arg)}. The first
argument is the array of data managed by the data management library, and
the second argument is a pointer to the argument passed through the @code{cl_arg}
field of the @code{starpu_task} structure.
The @code{cpu_func} field is ignored if @code{STARPU_CPU} does not appear in
the @code{where} field; it must be non-null otherwise.
@item @code{cuda_func} (optional):
Is a function pointer to the CUDA implementation of the codelet. @emph{This
must be a host-function written in the CUDA runtime API}. Its prototype must
be: @code{void cuda_func(void *buffers[], void *cl_arg);}. The @code{cuda_func}
field is ignored if @code{STARPU_CUDA} does not appear in the @code{where}
field; it must be non-null otherwise.
@item @code{opencl_func} (optional):
Is a function pointer to the OpenCL implementation of the codelet. Its
prototype must be:
@code{void opencl_func(starpu_data_interface_t *descr, void *arg);}.
This pointer is ignored if @code{STARPU_OPENCL} does not appear in the
@code{where} field; it must be non-null otherwise.
@item @code{gordon_func} (optional):
This is the index of the Cell SPU implementation within the Gordon library.
See Gordon documentation for more details on how to register a kernel and
retrieve its index.
@item @code{nbuffers}:
Specifies the number of arguments taken by the codelet. These arguments are
managed by the DSM and are accessed from the @code{void *buffers[]}
array. The constant argument passed with the @code{cl_arg} field of the
@code{starpu_task} structure is not counted in this number. This value should
not be above @code{STARPU_NMAXBUFS}.
@item @code{model} (optional):
This is a pointer to the performance model associated to this codelet. This
optional field is ignored when set to @code{NULL}. TODO
@end table
@end table
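For instance, here is a sketch of a codelet with a single CPU implementation;
the kernel takes no StarPU-managed buffer and receives its argument through
the @code{cl_arg} field (the names are illustrative):
@example
#include <stdio.h>
#include <starpu.h>

/* prototype imposed by StarPU for CPU implementations */
void hello_cpu_func(void *buffers[], void *cl_arg)
@{
        (void)buffers; /* nbuffers = 0: no data managed by the DSM */
        printf("%s\n", (char *)cl_arg);
@}

struct starpu_codelet hello_cl =
@{
        .where = STARPU_CPU,        /* only CPU cores can run it */
        .cpu_func = hello_cpu_func,
        .nbuffers = 0               /* no buffer taken as parameter */
@};
@end example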
@node struct starpu_task
@subsection @code{struct starpu_task} -- StarPU task structure
@table @asis
@item @emph{Description}:
The @code{starpu_task} structure describes a task that can be offloaded on the various
processing units managed by StarPU. It instantiates a codelet. It can either be
allocated dynamically with the @code{starpu_task_create} method, or declared
statically. In the latter case, the programmer has to zero the
@code{starpu_task} structure and to fill the different fields properly. The
indicated default values correspond to the configuration of a task allocated
with @code{starpu_task_create}.
@item @emph{Fields}:
@table @asis
@item @code{cl}:
Is a pointer to the corresponding @code{starpu_codelet} data structure. This
describes where the kernel should be executed, and supplies the appropriate
implementations. When set to @code{NULL}, no code is executed during the task;
such empty tasks can be useful for synchronization purposes.
@item @code{buffers}:
TODO
@item @code{cl_arg} (optional) (default = NULL):
This pointer is passed to the codelet through the second argument
of the codelet implementation (e.g. @code{cpu_func} or @code{cuda_func}).
In the specific case of the Cell processor, see the @code{cl_arg_size}
argument.
@item @code{cl_arg_size} (optional, Cell specific):
In the case of the Cell processor, the @code{cl_arg} pointer is not directly
given to the SPU function. A buffer of size @code{cl_arg_size} is allocated on
the SPU. This buffer is then filled with the @code{cl_arg_size} bytes starting
at address @code{cl_arg}. In this case, the argument given to the SPU codelet
is therefore not the @code{cl_arg} pointer, but the address of the buffer in
local store (LS) instead. This field is ignored for CPU, CUDA and OpenCL
codelets.
@item @code{callback_func} (optional) (default = @code{NULL}):
This is a function pointer of prototype @code{void (*f)(void *)} which
specifies a possible callback. If this pointer is non-null, the callback
function is executed @emph{on the host} after the execution of the task. The
callback is passed the value contained in the @code{callback_arg} field. No
callback is executed if the field is set to @code{NULL}.
@item @code{callback_arg} (optional) (default = @code{NULL}):
This is the pointer passed to the callback function. This field is ignored if
the @code{callback_func} is set to @code{NULL}.
@item @code{use_tag} (optional) (default = 0):
If set, this flag indicates that the task should be associated with the tag
contained in the @code{tag_id} field. Tags allow the application to synchronize
with the task and to express task dependencies easily.
@item @code{tag_id}:
This field contains the tag associated to the task if the @code{use_tag} flag
is set; it is ignored otherwise.
@item @code{synchronous}:
If this flag is set, the @code{starpu_task_submit} function is blocking and
returns only when the task has been executed (or if no worker is able to
process the task). Otherwise, @code{starpu_task_submit} returns immediately.
@item @code{priority} (optional) (default = @code{STARPU_DEFAULT_PRIO}):
This field indicates a level of priority for the task. This is an integer value
that must be set between @code{STARPU_MIN_PRIO} (for the least important
tasks) and @code{STARPU_MAX_PRIO} (for the most important tasks) included.
Default priority is @code{STARPU_DEFAULT_PRIO}. Scheduling strategies that
take priorities into account can use this parameter to take better scheduling
decisions, but the scheduling policy may also ignore it.
@item @code{execute_on_a_specific_worker} (default = 0):
If this flag is set, StarPU will bypass the scheduler and directly assign this
task to the worker specified by the @code{workerid} field.
@item @code{workerid} (optional):
If the @code{execute_on_a_specific_worker} flag is set, this field indicates
the identifier of the worker that should process this task (as
returned by @code{starpu_worker_get_id}). This field is ignored if the
@code{execute_on_a_specific_worker} field is set to 0.
@item @code{detach} (optional) (default = 1):
If this flag is set, it is not possible to synchronize with the task
by the means of @code{starpu_task_wait} later on. Internal data structures
are only guaranteed to be freed once @code{starpu_task_wait} is called if the
flag is not set.
@item @code{destroy} (optional) (default = 1):
If this flag is set, the task structure will automatically be freed, either
after the execution of the callback if the task is detached, or during
@code{starpu_task_wait} otherwise. If this flag is not set, dynamically
allocated data structures will not be freed until @code{starpu_task_destroy} is
called explicitly. Setting this flag for a statically allocated task structure
will result in undefined behaviour.
@end table
@end table
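Building on the hypothetical @code{hello_cl} codelet sketched earlier, a
statically allocated task could be set up as follows (@code{starpu_task_init}
is described below); the flags are chosen so that waiting on and freeing the
structure remain under the application's control:
@example
#include <starpu.h>

/* assumes the hello_cl codelet sketched earlier */
extern struct starpu_codelet hello_cl;

struct starpu_task task;

void prepare_static_task(void)
@{
        starpu_task_init(&task);  /* fill in the default values */
        task.cl = &hello_cl;      /* codelet to execute */
        task.cl_arg = "hello";    /* passed as the codelet's second argument */
        task.detach = 0;          /* allow starpu_task_wait(&task) later on */
        task.destroy = 0;         /* never auto-free a static structure */
@}
@end example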
@node starpu_task_init
@subsection @code{starpu_task_init} -- Initialize a Task
@table @asis
@item @emph{Description}:
Initialize a task structure with default values. This function is implicitly
called by @code{starpu_task_create}. By default, tasks initialized with
@code{starpu_task_init} must be deinitialized explicitly with
@code{starpu_task_deinit}. Tasks can also be initialized statically, using the
constant @code{STARPU_TASK_INITIALIZER}.
@item @emph{Prototype}:
@code{void starpu_task_init(struct starpu_task *task);}
@end table
@node starpu_task_create
@subsection @code{starpu_task_create} -- Allocate and Initialize a Task
@table @asis
@item @emph{Description}:
Allocate a task structure and initialize it with default values. Tasks
allocated dynamically with @code{starpu_task_create} are automatically freed when the
task is terminated. If the destroy flag is explicitly unset, the resources used
by the task must be freed by calling
@code{starpu_task_destroy}.
  996. @item @emph{Prototype}:
  997. @code{struct starpu_task *starpu_task_create(void);}
  998. @end table
@node starpu_task_deinit
@subsection @code{starpu_task_deinit} -- Release all the resources used by a Task
@table @asis
@item @emph{Description}:
Release all the structures automatically allocated to execute the task, but
not the task structure itself. This function is called automatically by
@code{starpu_task_destroy}. It should be used for statically allocated tasks,
for instance.
@item @emph{Prototype}:
@code{void starpu_task_deinit(struct starpu_task *task);}
@end table
@node starpu_task_destroy
@subsection @code{starpu_task_destroy} -- Destroy a dynamically allocated Task
@table @asis
@item @emph{Description}:
Free the resources allocated during @code{starpu_task_create}. This function is
called automatically after the execution of a task when the
@code{destroy} flag of the @code{starpu_task} structure is set (default behaviour).
Calling this function on a statically allocated task results in undefined
behaviour.
@item @emph{Prototype}:
@code{void starpu_task_destroy(struct starpu_task *task);}
@end table
@node starpu_task_wait
@subsection @code{starpu_task_wait} -- Wait for the termination of a Task
@table @asis
@item @emph{Description}:
This function blocks until the task has been executed. It is not possible to
synchronize with a task more than once. It is not possible to wait for
synchronous or detached tasks.
@item @emph{Return value}:
Upon successful completion, this function returns 0. Otherwise, @code{-EINVAL}
indicates that the specified task was either synchronous or detached.
@item @emph{Prototype}:
@code{int starpu_task_wait(struct starpu_task *task);}
@end table
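
A sketch of the intended usage: the task must be submitted with the
@code{detach} and @code{synchronous} flags unset so that it remains possible
to wait for it afterwards. The codelet @code{cl} is an assumption of this
sketch.

@cartouche
@smallexample
struct starpu_task *task = starpu_task_create();
task->cl = &cl;     /* codelet assumed to be defined elsewhere */
task->detach = 0;   /* otherwise starpu_task_wait returns -EINVAL */
task->synchronous = 0;

starpu_task_submit(task);
/* ... */
int ret = starpu_task_wait(task);   /* returns 0 on success */
@end smallexample
@end cartouche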
@node starpu_task_submit
@subsection @code{starpu_task_submit} -- Submit a Task
@table @asis
@item @emph{Description}:
This function submits a task to StarPU. Calling this function does
not mean that the task will be executed immediately as there can be data or task
(tag) dependencies that are not fulfilled yet: StarPU will take care of
scheduling this task with respect to such dependencies.
This function returns immediately if the @code{synchronous} field of the
@code{starpu_task} structure is set to 0, and blocks until the termination of
the task otherwise. It is also possible to synchronize the application with
asynchronous tasks by the means of tags, using the @code{starpu_tag_wait}
function for instance.
@item @emph{Return value}:
In case of success, this function returns 0. A return value of @code{-ENODEV}
means that there is no worker able to process this task (e.g. there is no GPU
available and this task is only implemented for CUDA devices).
@item @emph{Prototype}:
@code{int starpu_task_submit(struct starpu_task *task);}
@end table
@node starpu_task_wait_for_all
@subsection @code{starpu_task_wait_for_all} -- Wait for the termination of all Tasks
@table @asis
@item @emph{Description}:
This function blocks until all the tasks that were submitted are terminated.
@item @emph{Prototype}:
@code{void starpu_task_wait_for_all(void);}
@end table
@c Callbacks : what can we put in callbacks ?
@node Tags
@section Tags
@menu
* starpu_tag_t::                        Task identifier
* starpu_tag_declare_deps::             Declare the Dependencies of a Tag
* starpu_tag_declare_deps_array::       Declare the Dependencies of a Tag
* starpu_tag_wait::                     Block until a Tag is terminated
* starpu_tag_wait_array::               Block until a set of Tags is terminated
* starpu_tag_remove::                   Destroy a Tag
* starpu_tag_notify_from_apps::         Feed a tag explicitly
@end menu
@node starpu_tag_t
@subsection @code{starpu_tag_t} -- Task identifier
@table @asis
@item @emph{Description}:
It is possible to associate a task with a unique ``tag'' and to express
dependencies between tasks by the means of those tags. To do so, fill the
@code{tag_id} field of the @code{starpu_task} structure with a tag number
(which can be arbitrary) and set the @code{use_tag} field to 1.
If @code{starpu_tag_declare_deps} is called with this tag number, the task will
not be started until the tasks which hold the declared dependency tags are
completed.
@end table
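
For instance, the following sketch associates the (arbitrary) tag number
@code{0x42} with a task:

@cartouche
@smallexample
struct starpu_task *task = starpu_task_create();
task->use_tag = 1;
task->tag_id = (starpu_tag_t)0x42;
@end smallexample
@end cartouche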
@node starpu_tag_declare_deps
@subsection @code{starpu_tag_declare_deps} -- Declare the Dependencies of a Tag
@table @asis
@item @emph{Description}:
Specify the dependencies of the task identified by tag @code{id}. The first
argument specifies the tag which is configured, the second argument gives the
number of tag(s) on which @code{id} depends. The following arguments are the
tags which have to be terminated to unlock the task.
This function must be called before the associated task is submitted to StarPU
with @code{starpu_task_submit}.
@item @emph{Remark}
Because of the variable arity of @code{starpu_tag_declare_deps}, note that the
last arguments @emph{must} be of type @code{starpu_tag_t}: constant values
typically need to be explicitly cast. Using the
@code{starpu_tag_declare_deps_array} function avoids this hazard.
@item @emph{Prototype}:
@code{void starpu_tag_declare_deps(starpu_tag_t id, unsigned ndeps, ...);}
@item @emph{Example}:
@cartouche
@example
/* Tag 0x1 depends on tags 0x32 and 0x52 */
starpu_tag_declare_deps((starpu_tag_t)0x1,
                        2, (starpu_tag_t)0x32, (starpu_tag_t)0x52);
@end example
@end cartouche
@end table
@node starpu_tag_declare_deps_array
@subsection @code{starpu_tag_declare_deps_array} -- Declare the Dependencies of a Tag
@table @asis
@item @emph{Description}:
This function is similar to @code{starpu_tag_declare_deps}, except that it
does not take a variable number of arguments but an array of tags of size
@code{ndeps}.
@item @emph{Prototype}:
@code{void starpu_tag_declare_deps_array(starpu_tag_t id, unsigned ndeps, starpu_tag_t *array);}
@item @emph{Example}:
@cartouche
@example
/* Tag 0x1 depends on tags 0x32 and 0x52 */
starpu_tag_t tag_array[2] = @{0x32, 0x52@};
starpu_tag_declare_deps_array((starpu_tag_t)0x1, 2, tag_array);
@end example
@end cartouche
@end table
@node starpu_tag_wait
@subsection @code{starpu_tag_wait} -- Block until a Tag is terminated
@table @asis
@item @emph{Description}:
This function blocks until the task associated to tag @code{id} has been
executed. This is a blocking call which must therefore not be called within
tasks or callbacks, but only from the application directly. It is possible to
synchronize with the same tag multiple times, as long as the
@code{starpu_tag_remove} function is not called. Note that it is still
possible to synchronize with a tag associated to a task whose @code{starpu_task}
data structure was freed (e.g. if the @code{destroy} flag of the
@code{starpu_task} was enabled).
@item @emph{Prototype}:
@code{void starpu_tag_wait(starpu_tag_t id);}
@end table
@node starpu_tag_wait_array
@subsection @code{starpu_tag_wait_array} -- Block until a set of Tags is terminated
@table @asis
@item @emph{Description}:
This function is similar to @code{starpu_tag_wait} except that it blocks until
@emph{all} the @code{ntags} tags contained in the @code{id} array are
terminated.
@item @emph{Prototype}:
@code{void starpu_tag_wait_array(unsigned ntags, starpu_tag_t *id);}
@end table
@node starpu_tag_remove
@subsection @code{starpu_tag_remove} -- Destroy a Tag
@table @asis
@item @emph{Description}:
This function releases the resources associated to tag @code{id}. It can be
called once the corresponding task has been executed and when no other tag
depends on this tag anymore.
@item @emph{Prototype}:
@code{void starpu_tag_remove(starpu_tag_t id);}
@end table
@node starpu_tag_notify_from_apps
@subsection @code{starpu_tag_notify_from_apps} -- Feed a Tag explicitly
@table @asis
@item @emph{Description}:
This function explicitly unlocks tag @code{id}. It may be useful in the
case of applications which execute part of their computation outside StarPU
tasks (e.g. third-party libraries). It is also provided as a
convenient tool for the programmer, for instance to entirely construct the task
DAG before actually giving StarPU the opportunity to execute the tasks.
@item @emph{Prototype}:
@code{void starpu_tag_notify_from_apps(starpu_tag_t id);}
@end table
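
As a sketch, a task tagged @code{0x2} can be made to wait for a computation
performed outside StarPU by depending on a tag that the application unlocks by
hand; @code{do_something_outside_starpu} is a hypothetical application
function:

@cartouche
@smallexample
/* Task with tag 0x2 will not start until tag 0x1 is notified */
starpu_tag_declare_deps((starpu_tag_t)0x2, 1, (starpu_tag_t)0x1);
/* ... submit the task associated with tag 0x2 ... */

/* Computation performed outside StarPU */
do_something_outside_starpu();   /* hypothetical application function */
starpu_tag_notify_from_apps((starpu_tag_t)0x1);
@end smallexample
@end cartouche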
@node Profiling API
@section Profiling API
@menu
* starpu_profiling_status_set::         starpu_profiling_status_set
* starpu_profiling_status_get::         starpu_profiling_status_get
* struct starpu_task_profiling_info::   task profiling information
* struct starpu_worker_profiling_info:: worker profiling information
* starpu_worker_get_profiling_info::    starpu_worker_get_profiling_info
* struct starpu_bus_profiling_info::    bus profiling information
@end menu
@node starpu_profiling_status_set
@subsection @code{starpu_profiling_status_set} -- Set current profiling status
@table @asis
@item @emph{Description}:
This function sets the profiling status. Profiling is activated by passing
@code{STARPU_PROFILING_ENABLE} in @code{status}. Passing
@code{STARPU_PROFILING_DISABLE} disables profiling. Calling this function
resets all profiling measurements. When profiling is enabled, the
@code{profiling_info} field of the @code{struct starpu_task} structure points
to a valid @code{struct starpu_task_profiling_info} structure containing
information about the execution of the task.
@item @emph{Return value}:
Negative return values indicate an error, otherwise the previous status is
returned.
@item @emph{Prototype}:
@code{int starpu_profiling_status_set(int status);}
@end table
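
A minimal sketch of enabling profiling at the beginning of an application:

@cartouche
@smallexample
starpu_init(NULL);

int ret = starpu_profiling_status_set(STARPU_PROFILING_ENABLE);
if (ret < 0)
    fprintf(stderr, "could not enable profiling\n");
@end smallexample
@end cartouche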
@node starpu_profiling_status_get
@subsection @code{starpu_profiling_status_get} -- Get current profiling status
@table @asis
@item @emph{Description}:
Return the current profiling status or a negative value in case there was an error.
@item @emph{Prototype}:
@code{int starpu_profiling_status_get(void);}
@end table
@node struct starpu_task_profiling_info
@subsection @code{struct starpu_task_profiling_info} -- Task profiling information
@table @asis
@item @emph{Description}:
This structure contains information about the execution of a task. It is
accessible from the @code{profiling_info} field of the @code{starpu_task}
structure if profiling was enabled.
@item @emph{Fields}:
@table @asis
@item @code{submit_time}:
Date of task submission (relative to the initialization of StarPU).
@item @code{start_time}:
Date of task execution beginning (relative to the initialization of StarPU).
@item @code{end_time}:
Date of task execution termination (relative to the initialization of StarPU).
@item @code{workerid}:
Identifier of the worker which has executed the task.
@end table
@end table
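
As an illustration, the execution time of a terminated task could be computed
from this structure, for instance in the task's callback. This sketch assumes
the time fields are of type @code{struct timespec}:

@cartouche
@smallexample
struct starpu_task_profiling_info *info = task->profiling_info;

/* assuming start_time and end_time are struct timespec */
double length_us =
    (info->end_time.tv_sec  - info->start_time.tv_sec)  * 1e6
  + (info->end_time.tv_nsec - info->start_time.tv_nsec) / 1e3;
@end smallexample
@end cartouche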
@node struct starpu_worker_profiling_info
@subsection @code{struct starpu_worker_profiling_info} -- Worker profiling information
@table @asis
@item @emph{Description}:
This structure contains the profiling information associated to a worker.
@item @emph{Fields}:
@table @asis
@item @code{start_time}:
Starting date for the reported profiling measurements.
@item @code{total_time}:
Duration of the profiling measurement interval.
@item @code{executing_time}:
Time spent by the worker to execute tasks during the profiling measurement interval.
@item @code{sleeping_time}:
Time spent idling by the worker during the profiling measurement interval.
@item @code{executed_tasks}:
Number of tasks executed by the worker during the profiling measurement interval.
@end table
@end table
@node starpu_worker_get_profiling_info
@subsection @code{starpu_worker_get_profiling_info} -- Get worker profiling info
@table @asis
@item @emph{Description}:
Get the profiling info associated to the worker identified by @code{workerid},
and reset the profiling measurements. If the @code{worker_info} argument is
NULL, only reset the counters associated to worker @code{workerid}.
@item @emph{Return value}:
Upon successful completion, this function returns 0. Otherwise, a negative
value is returned.
@item @emph{Prototype}:
@code{int starpu_worker_get_profiling_info(int workerid, struct starpu_worker_profiling_info *worker_info);}
@end table
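
A sketch of reading (and thereby resetting) the counters of worker 0,
assuming the time fields are of type @code{struct timespec}:

@cartouche
@smallexample
struct starpu_worker_profiling_info info;

if (starpu_worker_get_profiling_info(0, &info) == 0) @{
    /* assuming total_time and executing_time are struct timespec */
    double total = info.total_time.tv_sec * 1e6
                 + info.total_time.tv_nsec / 1e3;
    double busy  = info.executing_time.tv_sec * 1e6
                 + info.executing_time.tv_nsec / 1e3;
    fprintf(stderr, "worker 0: %.0f%% busy\n", 100.0 * busy / total);
@}
@end smallexample
@end cartouche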
@node struct starpu_bus_profiling_info
@subsection @code{struct starpu_bus_profiling_info} -- Bus profiling information
@table @asis
@item @emph{Description}:
TODO
@item @emph{Fields}:
@table @asis
@item @code{start_time}:
TODO
@item @code{total_time}:
TODO
@item @code{transferred_bytes}:
TODO
@item @code{transfer_count}:
TODO
@end table
@end table
@node CUDA extensions
@section CUDA extensions
@c void starpu_data_malloc_pinned_if_possible(float **A, size_t dim);
@c starpu_helper_cublas_init TODO
@c starpu_helper_cublas_shutdown TODO
@menu
* starpu_cuda_get_local_stream::        Get current worker's CUDA stream
* starpu_helper_cublas_init::           Initialize CUBLAS on every CUDA device
* starpu_helper_cublas_shutdown::       Deinitialize CUBLAS on every CUDA device
@end menu
@node starpu_cuda_get_local_stream
@subsection @code{starpu_cuda_get_local_stream} -- Get current worker's CUDA stream
@table @asis
@item @emph{Description}:
StarPU provides a stream for every CUDA device controlled by StarPU. This
function is only provided for convenience so that programmers can easily use
asynchronous operations within codelets without having to create a stream by
hand. Note that the application is not forced to use the stream provided by
@code{starpu_cuda_get_local_stream} and may also create its own streams.
@item @emph{Prototype}:
@code{cudaStream_t *starpu_cuda_get_local_stream(void);}
@end table
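
For instance, a CUDA codelet could launch its kernel on the stream provided by
StarPU instead of the default stream. This is only a sketch;
@code{my_kernel} is a hypothetical kernel defined elsewhere:

@cartouche
@smallexample
extern "C" void my_cuda_func(void *buffers[], void *_args)
@{
    cudaStream_t stream = *starpu_cuda_get_local_stream();

    /* my_kernel is a hypothetical kernel defined elsewhere */
    my_kernel<<<1, 1, 0, stream>>>();
    cudaStreamSynchronize(stream);
@}
@end smallexample
@end cartouche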
@node starpu_helper_cublas_init
@subsection @code{starpu_helper_cublas_init} -- Initialize CUBLAS on every CUDA device
@table @asis
@item @emph{Description}:
The CUBLAS library must be initialized prior to any CUBLAS call. Calling
@code{starpu_helper_cublas_init} will initialize CUBLAS on every CUDA device
controlled by StarPU. This call blocks until CUBLAS has been properly
initialized on every device.
@item @emph{Prototype}:
@code{void starpu_helper_cublas_init(void);}
@end table
@node starpu_helper_cublas_shutdown
@subsection @code{starpu_helper_cublas_shutdown} -- Deinitialize CUBLAS on every CUDA device
@table @asis
@item @emph{Description}:
This function synchronously deinitializes the CUBLAS library on every CUDA device.
@item @emph{Prototype}:
@code{void starpu_helper_cublas_shutdown(void);}
@end table
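
A typical usage pattern, sketched here, brackets the application code with
these two calls:

@cartouche
@smallexample
starpu_init(NULL);
starpu_helper_cublas_init();   /* CUBLAS ready on every CUDA device */

/* ... submit tasks whose CUDA implementations call CUBLAS ... */
starpu_task_wait_for_all();

starpu_helper_cublas_shutdown();
starpu_shutdown();
@end smallexample
@end cartouche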
@node OpenCL extensions
@section OpenCL extensions
@menu
* Enabling OpenCL::                     Enabling OpenCL
* Compiling OpenCL codelets::           Compiling OpenCL codelets
@end menu
@node Enabling OpenCL
@subsection Enabling OpenCL
On GPU devices which can run both CUDA and OpenCL, CUDA will be
enabled by default. To enable OpenCL, you need either to disable CUDA
when configuring StarPU:
@example
% ./configure --disable-cuda
@end example
or when running applications:
@example
% STARPU_NCUDA=0 ./application
@end example
OpenCL will automatically be started on any device not yet used by
CUDA. On a machine with 4 GPUs, it is therefore possible to enable
CUDA on 2 devices and OpenCL on the 2 other devices as follows:
@example
% STARPU_NCUDA=2 ./application
@end example
@node Compiling OpenCL codelets
@subsection Compiling OpenCL codelets
TODO
@node Cell extensions
@section Cell extensions
Nothing yet.
@node Miscellaneous helpers
@section Miscellaneous helpers
@menu
* starpu_execute_on_each_worker::       Execute a function on a subset of workers
@end menu
@node starpu_execute_on_each_worker
@subsection @code{starpu_execute_on_each_worker} -- Execute a function on a subset of workers
@table @asis
@item @emph{Description}:
When this function is called, the offloaded function specified by the first
argument is executed by every StarPU worker that may execute it.
The second argument is passed to the offloaded function.
The last argument specifies on which types of processing units the function
should be executed. Similarly to the @code{where} field of the
@code{starpu_codelet} structure, it is possible to specify that the function
should be executed on every CUDA device and every CPU by passing
@code{STARPU_CPU|STARPU_CUDA}.
This function blocks until the function has been executed on every appropriate
processing unit, so it must not be called from a callback function, for
instance.
@item @emph{Prototype}:
@code{void starpu_execute_on_each_worker(void (*func)(void *), void *arg, uint32_t where);}
@end table
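
For instance, a per-worker initialization function could be run on every CPU
and CUDA worker as sketched below; @code{init_worker} is a hypothetical
application function:

@cartouche
@smallexample
void init_worker(void *arg)
@{
    /* executed once by each selected worker */
    fprintf(stderr, "worker %d ready (arg %p)\n",
            starpu_worker_get_id(), arg);
@}

starpu_execute_on_each_worker(init_worker, NULL, STARPU_CPU|STARPU_CUDA);
@end smallexample
@end cartouche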
@c ---------------------------------------------------------------------
@c Basic Examples
@c ---------------------------------------------------------------------
@node Basic Examples
@chapter Basic Examples
@menu
* Compiling and linking options::
* Hello World::                         Submitting Tasks
* Scaling a Vector::                    Manipulating Data
* Vector Scaling on an Hybrid CPU/GPU Machine::  Handling Heterogeneous Architectures
@end menu
@node Compiling and linking options
@section Compiling and linking options
Let's suppose StarPU has been installed in the directory
@code{$STARPU_DIR}. As explained in @ref{Setting flags for compiling and linking applications},
the variable @code{PKG_CONFIG_PATH} needs to be set. It is also
necessary to set the variable @code{LD_LIBRARY_PATH} to locate dynamic
libraries at runtime.
@example
% PKG_CONFIG_PATH=$STARPU_DIR/lib/pkgconfig:$PKG_CONFIG_PATH
% LD_LIBRARY_PATH=$STARPU_DIR/lib:$LD_LIBRARY_PATH
@end example
The Makefile could for instance contain the following lines to define which
options must be given to the compiler and to the linker:
@cartouche
@example
CFLAGS  += $$(pkg-config --cflags libstarpu)
LDFLAGS += $$(pkg-config --libs libstarpu)
@end example
@end cartouche
@node Hello World
@section Hello World
@menu
* Required Headers::
* Defining a Codelet::
* Submitting a Task::
* Execution of Hello World::
@end menu
In this section, we show how to implement a simple program that submits a task to StarPU.
@node Required Headers
@subsection Required Headers
The @code{starpu.h} header should be included in any code using StarPU.
@cartouche
@smallexample
#include <starpu.h>
@end smallexample
@end cartouche
@node Defining a Codelet
@subsection Defining a Codelet
@cartouche
@smallexample
void cpu_func(void *buffers[], void *cl_arg)
@{
    float *array = cl_arg;

    printf("Hello world (array = @{%f, %f@} )\n", array[0], array[1]);
@}

starpu_codelet cl =
@{
    .where = STARPU_CPU,
    .cpu_func = cpu_func,
    .nbuffers = 0
@};
@end smallexample
@end cartouche
A codelet is a structure that represents a computational kernel. Such a codelet
may contain an implementation of the same kernel on different architectures
(e.g. CUDA, Cell's SPU, x86, ...).
The @code{nbuffers} field specifies the number of data buffers that are
manipulated by the codelet: here the codelet does not access or modify any data
that is controlled by our data management library. Note that the argument
passed to the codelet (the @code{cl_arg} field of the @code{starpu_task}
structure) does not count as a buffer since it is not managed by our data
management library.
@c TODO need a crossref to the proper description of "where" see bla for more ...
We create a codelet which may only be executed on the CPUs. The @code{where}
field is a bitmask that defines where the codelet may be executed. Here, the
@code{STARPU_CPU} value means that only CPUs can execute this codelet
(@pxref{Codelets and Tasks} for more details on this field).
When a CPU core executes a codelet, it calls the @code{cpu_func} function,
which @emph{must} have the following prototype:
@code{void (*cpu_func)(void *buffers[], void *cl_arg);}
In this example, we can ignore the first argument of this function which gives a
description of the input and output buffers (e.g. the size and the location of
the matrices). The second argument is a pointer to a buffer passed as an
argument to the codelet by the means of the @code{cl_arg} field of the
@code{starpu_task} structure.
@c TODO rewrite so that it is a little clearer ?
Be aware that this may be a pointer to a
@emph{copy} of the actual buffer, and not the pointer given by the programmer:
if the codelet modifies this buffer, there is no guarantee that the initial
buffer will be modified as well: this for instance implies that the buffer
cannot be used as a synchronization medium.
@node Submitting a Task
@subsection Submitting a Task
@cartouche
@smallexample
void callback_func(void *callback_arg)
@{
    printf("Callback function (arg %lx)\n", (unsigned long)callback_arg);
@}

int main(int argc, char **argv)
@{
    /* @b{initialize StarPU} */
    starpu_init(NULL);

    struct starpu_task *task = starpu_task_create();

    task->cl = &cl; /* @b{Pointer to the codelet defined above} */

    float array[2] = @{1.0f, -1.0f@};
    task->cl_arg = &array;
    task->cl_arg_size = sizeof(array);

    task->callback_func = callback_func;
    task->callback_arg = (void *)0x42;

    /* @b{starpu_task_submit will be a blocking call} */
    task->synchronous = 1;

    /* @b{submit the task to StarPU} */
    starpu_task_submit(task);

    /* @b{terminate StarPU} */
    starpu_shutdown();

    return 0;
@}
@end smallexample
@end cartouche
Before submitting any tasks to StarPU, @code{starpu_init} must be called. The
@code{NULL} argument specifies that we use the default configuration. Tasks cannot
be submitted after the termination of StarPU by a call to
@code{starpu_shutdown}.
In the example above, a task structure is allocated by a call to
@code{starpu_task_create}. This function only allocates and fills the
corresponding structure with the default settings (@pxref{starpu_task_create}),
but it does not submit the task to StarPU.
@c not really clear ;)
The @code{cl} field is a pointer to the codelet which the task will
execute: in other words, the codelet structure describes which computational
kernel should be offloaded on the different architectures, and the task
structure is a wrapper containing a codelet and the piece of data on which the
codelet should operate.
The optional @code{cl_arg} field is a pointer to a buffer (of size
@code{cl_arg_size}) with some parameters for the kernel
described by the codelet. For instance, if a codelet implements a computational
kernel that multiplies its input vector by a constant, the constant could be
specified by the means of this buffer.
Once a task has been executed, an optional callback function can be called.
While the computational kernel could be offloaded on various architectures, the
callback function is always executed on a CPU. The @code{callback_arg}
pointer is passed as an argument of the callback. The prototype of a callback
function must be:
@code{void (*callback_function)(void *);}
If the @code{synchronous} field is non-zero, task submission will be
synchronous: the @code{starpu_task_submit} function will not return until the
task has been executed. Note that the @code{starpu_shutdown} method does not
guarantee that asynchronous tasks have been executed before it returns.
@node Execution of Hello World
@subsection Execution of Hello World
@smallexample
% make helloWorld
cc $(pkg-config --cflags libstarpu) $(pkg-config --libs libstarpu) helloWorld.c -o helloWorld
% ./helloWorld
Hello world (array = @{1.000000, -1.000000@} )
Callback function (arg 42)
@end smallexample
@node Scaling a Vector
@section Manipulating Data: Scaling a Vector
The previous example has shown how to submit tasks. In this section,
we show how StarPU tasks can manipulate data. The full source code for
this example is given in @ref{Full source code for the 'Scaling a Vector' example}.
@menu
* Source code of Vector Scaling::
* Execution of Vector Scaling::
@end menu
@node Source code of Vector Scaling
@subsection Source code of Vector Scaling
Programmers can describe the data layout of their application so that StarPU is
responsible for enforcing data coherency and availability across the machine.
Instead of handling complex (and non-portable) mechanisms to perform data
movements, programmers only declare which piece of data is accessed and/or
modified by a task, and StarPU makes sure that when a computational kernel
starts somewhere (e.g. on a GPU), its data are available locally.
Before submitting those tasks, the programmer first needs to declare the
different pieces of data to StarPU using the @code{starpu_*_data_register}
functions. To ease the development of applications for StarPU, it is possible
to describe multiple types of data layout. A type of data layout is called an
@b{interface}. Several interfaces are available by default in StarPU;
here we will consider the @b{vector interface}.
The following lines show how to declare an array of @code{NX} elements of type
@code{float} using the vector interface:
@cartouche
@smallexample
float vector[NX];

starpu_data_handle vector_handle;
starpu_vector_data_register(&vector_handle, 0, (uintptr_t)vector, NX,
                            sizeof(float));
@end smallexample
@end cartouche
The first argument, called the @b{data handle}, is an opaque pointer which
designates the array in StarPU. This is also the structure which is used to
describe which data is used by a task. The second argument is the node number
where the data currently resides. Here it is 0 since the @code{vector} array is in
the main memory. Then comes the pointer @code{vector} where the data can be found,
the number of elements in the vector and the size of each element.
It is possible to construct a StarPU task that will manipulate the
vector and a constant factor.
@cartouche
@smallexample
float factor = 3.14;
struct starpu_task *task = starpu_task_create();

task->cl = &cl;                          /* @b{Pointer to the codelet defined below} */
task->buffers[0].handle = vector_handle; /* @b{First parameter of the codelet} */
task->buffers[0].mode = STARPU_RW;
task->cl_arg = &factor;
task->cl_arg_size = sizeof(factor);
task->synchronous = 1;

starpu_task_submit(task);
@end smallexample
@end cartouche
Since the factor is constant, it does not need a preliminary declaration, and
can just be passed through the @code{cl_arg} pointer like in the previous
example. The vector parameter is described by its handle.
There are two fields in each element of the @code{buffers} array.
@code{handle} is the handle of the data, and @code{mode} specifies how the
kernel will access the data (@code{STARPU_R} for read-only, @code{STARPU_W} for
write-only and @code{STARPU_RW} for read and write access).
The definition of the codelet can be written as follows:
@cartouche
@smallexample
void scal_cpu_func(void *buffers[], void *cl_arg)
@{
    unsigned i;
    float *factor = cl_arg;
    struct starpu_vector_interface_s *vector = buffers[0];

    /* length of the vector */
    unsigned n = STARPU_GET_VECTOR_NX(vector);
    /* local copy of the vector pointer */
    float *val = (float *)STARPU_GET_VECTOR_PTR(vector);

    for (i = 0; i < n; i++)
        val[i] *= *factor;
@}

starpu_codelet cl = @{
    .where = STARPU_CPU,
    .cpu_func = scal_cpu_func,
    .nbuffers = 1
@};
@end smallexample
@end cartouche
The second argument of the @code{scal_cpu_func} function contains a pointer to the
parameters of the codelet (given in @code{task->cl_arg}), so that we read the
constant factor from this pointer. The first argument is an array that gives
a description of all the buffers passed in the @code{task->buffers} array. The
size of this array is given by the @code{nbuffers} field of the codelet
structure. For the sake of generality, this array contains pointers to the
different interfaces describing each buffer. In the case of the @b{vector
interface}, the location of the vector (resp. its length) is accessible in the
@code{ptr} (resp. @code{nx}) field of this interface. Since the vector is accessed in a
read-write fashion, any modification will automatically affect future accesses
to this vector made by other tasks.
@node Execution of Vector Scaling
@subsection Execution of Vector Scaling
@smallexample
% make vector
cc $(pkg-config --cflags libstarpu) $(pkg-config --libs libstarpu) vector.c -o vector
% ./vector
0.000000 3.000000 6.000000 9.000000 12.000000
@end smallexample
@node Vector Scaling on an Hybrid CPU/GPU Machine
@section Vector Scaling on an Hybrid CPU/GPU Machine
Contrary to the previous examples, the task submitted in this example may not
only be executed by the CPUs, but also by a CUDA device.
@menu
* Definition of the CUDA Codelet::
* Definition of the OpenCL Codelet::
* Definition of the Main Code::
* Compilation and execution of Hybrid Vector Scaling::
@end menu
@node Definition of the CUDA Codelet
@subsection Definition of the CUDA Codelet
The CUDA implementation can be written as follows. It needs to be
compiled with a CUDA compiler such as nvcc, the NVIDIA CUDA compiler
driver.
@cartouche
@smallexample
#include <starpu.h>

static __global__ void vector_mult_cuda(float *val, unsigned n,
                                        float factor)
@{
    unsigned i;
    for(i = 0 ; i < n ; i++)
        val[i] *= factor;
@}

extern "C" void scal_cuda_func(void *buffers[], void *_args)
@{
    float *factor = (float *)_args;
    struct starpu_vector_interface_s *vector =
        (struct starpu_vector_interface_s *) buffers[0];

    /* length of the vector */
    unsigned n = STARPU_GET_VECTOR_NX(vector);
    /* local copy of the vector pointer */
    float *val = (float *)STARPU_GET_VECTOR_PTR(vector);

@i{    vector_mult_cuda<<<1,1>>>(val, n, *factor);}

@i{    cudaThreadSynchronize();}
@}
@end smallexample
@end cartouche
@node Definition of the OpenCL Codelet
@subsection Definition of the OpenCL Codelet
The OpenCL implementation can be written as follows. StarPU provides
tools to compile an OpenCL codelet stored in a file.
@cartouche
@smallexample
__kernel void vector_mult_opencl(__global float* val, int nx, float factor)
@{
    const int i = get_global_id(0);
    if (i < nx) @{
        val[i] *= factor;
    @}
@}
@end smallexample
@end cartouche
@cartouche
@smallexample
#include <starpu.h>
@i{#include <starpu_opencl.h>}

@i{extern struct starpu_opencl_codelet codelet;}

void scal_opencl_func(void *buffers[], void *_args)
@{
    float *factor = (float *)_args;
    struct starpu_vector_interface_s *vector =
        (struct starpu_vector_interface_s *) buffers[0];
@i{    int id, devid, err;}
@i{    cl_kernel kernel;}
@i{    cl_command_queue queue;}

    /* length of the vector */
    unsigned n = STARPU_GET_VECTOR_NX(vector);
    /* local copy of the vector pointer */
    float *val = (float *)STARPU_GET_VECTOR_PTR(vector);

@i{    id = starpu_worker_get_id();}
@i{    devid = starpu_worker_get_devid(id);}

@i{    err = starpu_opencl_load_kernel(&kernel, &queue, &codelet,}
@i{                    "vector_mult_opencl", devid);  /* @b{Name of the codelet defined above} */}
@i{    if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);}

@i{    err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &val);}
@i{    err |= clSetKernelArg(kernel, 1, sizeof(int), &n);}
@i{    err |= clSetKernelArg(kernel, 2, sizeof(float), (void*)factor);}
@i{    if (err) STARPU_OPENCL_REPORT_ERROR(err);}

@i{    @{}
@i{        size_t global=1;}
@i{        size_t local=1;}
@i{        err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);}
@i{        if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);}
@i{    @}}

@i{    clFinish(queue);}

@i{    starpu_opencl_release_kernel(kernel);}
@}
@end smallexample
@end cartouche
@node Definition of the Main Code
@subsection Definition of the Main Code
The CPU implementation is the same as in the previous section.
Here is the source of the main application. You can notice the value of the
field @code{where} for the codelet. We specify
@code{STARPU_CPU|STARPU_CUDA|STARPU_OPENCL} to indicate to StarPU that the codelet
can be executed on a CPU, on a CUDA device, or on an OpenCL device.
@cartouche
@smallexample
#include <starpu.h>

#define NX 2048

extern void scal_cuda_func(void *buffers[], void *_args);
extern void scal_cpu_func(void *buffers[], void *_args);
extern void scal_opencl_func(void *buffers[], void *_args);

/* @b{Definition of the codelet} */
static starpu_codelet cl = @{
    .where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL, /* @b{It can be executed on a CPU,} */
                                                   /* @b{a CUDA device or an OpenCL device} */
    .cuda_func = scal_cuda_func,
    .cpu_func = scal_cpu_func,
    .opencl_func = scal_opencl_func,
    .nbuffers = 1
@};

int main(int argc, char **argv)
@{
    float *vector;
    int i, ret;
    float factor=3.0;
    struct starpu_task *task;
    starpu_data_handle vector_handle;

    starpu_init(NULL);    /* @b{Initialising StarPU} */

    vector = (float*)malloc(NX*sizeof(float));
    assert(vector);
    for(i=0 ; i<NX ; i++) vector[i] = i;
@end smallexample
@end cartouche
@cartouche
@smallexample
    /* @b{Registering data within StarPU} */
    starpu_vector_data_register(&vector_handle, 0, (uintptr_t)vector,
                                NX, sizeof(float));

    /* @b{Definition of the task} */
    task = starpu_task_create();
    task->cl = &cl;
    task->buffers[0].handle = vector_handle;
    task->buffers[0].mode = STARPU_RW;
    task->cl_arg = &factor;
    task->cl_arg_size = sizeof(factor);
@end smallexample
@end cartouche
@cartouche
@smallexample
    /* @b{Submitting the task} */
    ret = starpu_task_submit(task);
    if (ret == -ENODEV) @{
        fprintf(stderr, "No worker may execute this task\n");
        return 1;
    @}

    /* @b{Waiting for its termination} */
    starpu_task_wait_for_all();

    /* @b{Update the vector in RAM} */
    starpu_data_sync_with_mem(vector_handle, STARPU_R);
@end smallexample
@end cartouche
@cartouche
@smallexample
    /* @b{Access the data} */
    for(i=0 ; i<NX; i++) @{
        fprintf(stderr, "%f ", vector[i]);
    @}
    fprintf(stderr, "\n");

    /* @b{Release the data and shutdown StarPU} */
    starpu_data_release_from_mem(vector_handle);
    starpu_shutdown();

    return 0;
@}
@end smallexample
@end cartouche
@node Compilation and execution of Hybrid Vector Scaling
@subsection Compilation and execution of Hybrid Vector Scaling
The Makefile given at the beginning of the section must be extended to
give the rules to compile the CUDA source code.
@cartouche
@smallexample
CFLAGS  += $(shell pkg-config --cflags libstarpu)
LDFLAGS += $(shell pkg-config --libs libstarpu)
CC       = gcc

vector: vector.o vector_cpu.o vector_cuda.o

%.o: %.cu
	nvcc $(CFLAGS) -c $< -o $@

clean:
	rm -f vector *.o
@end smallexample
@end cartouche
@smallexample
% make
@end smallexample
and to execute it, with the default configuration:
@smallexample
% ./vector
0.000000 3.000000 6.000000 9.000000 12.000000
@end smallexample
or for example, by disabling CPU devices:
@smallexample
% STARPU_NCPUS=0 ./vector
0.000000 3.000000 6.000000 9.000000 12.000000
@end smallexample
or by disabling CUDA devices:
@smallexample
% STARPU_NCUDA=0 ./vector
0.000000 3.000000 6.000000 9.000000 12.000000
@end smallexample
@c TODO: Add performance model example (and update basic_examples)
@c ---------------------------------------------------------------------
@c Advanced Topics
@c ---------------------------------------------------------------------
@c @node Advanced Topics
@c @chapter Advanced Topics
@c ---------------------------------------------------------------------
@c Appendices
@c ---------------------------------------------------------------------
@c ---------------------------------------------------------------------
@c Full source code for the 'Scaling a Vector' example
@c ---------------------------------------------------------------------
@node Full source code for the 'Scaling a Vector' example
@appendix Full source code for the 'Scaling a Vector' example
@menu
* Main application::
* CPU Codelet::
* CUDA Codelet::
* OpenCL Codelet::
@end menu
@node Main application
@section Main application
@smallexample
@include vector_scal_c.texi
@end smallexample
@node CPU Codelet
@section CPU Codelet
@smallexample
@include vector_scal_cpu.texi
@end smallexample
@node CUDA Codelet
@section CUDA Codelet
@smallexample
@include vector_scal_cuda.texi
@end smallexample
@node OpenCL Codelet
@section OpenCL Codelet
@menu
* Invoking the kernel::
* Source of the kernel::
@end menu
@node Invoking the kernel
@subsection Invoking the kernel
@smallexample
@include vector_scal_opencl.texi
@end smallexample
@node Source of the kernel
@subsection Source of the kernel
@smallexample
@include vector_scal_opencl_codelet.texi
@end smallexample
@bye