
\input texinfo @c -*-texinfo-*-
@c %**start of header
@setfilename starpu.info
@settitle StarPU
@c %**end of header
@setchapternewpage odd
@titlepage
@title StarPU
@page
@vskip 0pt plus 1filll
@comment For the @value{version-GCC} Version*
@end titlepage
@summarycontents
@contents
@page
@node Top
@top Preface
@cindex Preface
This manual documents the usage of StarPU.
@comment
@comment  When you add a new menu item, please keep the right hand
@comment  aligned to the same column. Do not use tabs.  This provides
@comment  better formatting.
@comment
@menu
* Introduction::                A basic introduction to using StarPU
* Installing StarPU::           How to configure, build and install StarPU
* Configuration options::       Configuration options
* Environment variables::      Environment variables used by StarPU
* StarPU API::                  The API to use StarPU
* Basic Examples::              Basic examples of the use of StarPU
* Advanced Topics::             Advanced use of StarPU
@end menu
@c ---------------------------------------------------------------------
@c Introduction to StarPU
@c ---------------------------------------------------------------------
@node Introduction
@chapter Introduction to StarPU
@menu
* Motivation::                  Why StarPU?
* StarPU in a Nutshell::        The Fundamentals of StarPU
@end menu
@node Motivation
@section Motivation
@c complex machines with heterogeneous cores/devices
The use of specialized hardware such as accelerators or coprocessors offers an
interesting approach to overcome the physical limits encountered by processor
architects. As a result, many machines are now equipped with one or several
accelerators (e.g. a GPU), in addition to the usual processor(s). While a lot of
effort has been devoted to offloading computation onto such accelerators, very
little attention has been paid to portability concerns on the one hand, and to the
possibility of having heterogeneous accelerators and processors interact on the other hand.
StarPU is a runtime system that offers support for heterogeneous multicore
architectures. It not only offers a unified view of the computational resources
(i.e. CPUs and accelerators at the same time), but it also takes care of
efficiently mapping and executing tasks onto a heterogeneous machine while
transparently handling low-level issues in a portable fashion.
@c this leads to a complicated distributed memory design
@c which is not (easily) manageable by hand
@c added value/benefits of StarPU
@c   - portability
@c   - scheduling, perf. portability
@node StarPU in a Nutshell
@section StarPU in a Nutshell
From a programming point of view, StarPU is not a new language but a library
that executes tasks explicitly submitted by the application. The data that a
task manipulates are automatically transferred onto the accelerator so that the
programmer does not have to take care of complex data movements. StarPU also
takes particular care of scheduling those tasks efficiently and allows
scheduling experts to implement custom scheduling policies in a portable
fashion.
@c explain the notion of codelet and task (i.e. g(A, B)
@subsection Codelet and Tasks
One of StarPU's primary data structures is the @b{codelet}. A codelet describes a
computational kernel that can possibly be implemented on multiple architectures
such as a CPU, a CUDA device or a Cell's SPU.
@c TODO insert illustration f : f_spu, f_cpu, ...
Another important data structure is the @b{task}. Executing a StarPU task
consists in applying a codelet on a data set, on one of the architectures on
which the codelet is implemented. In addition to the codelet that a task
uses, it also describes which data are accessed, and how they are
accessed during the computation (read and/or write).
StarPU tasks are asynchronous: submitting a task to StarPU is a non-blocking
operation. The task structure can also specify a @b{callback} function that is
called once StarPU has properly executed the task. It also contains optional
fields that the application may use to give hints to the scheduler (such as
priority levels).
A task may be identified by a unique 64-bit number which we refer to as a @b{tag}.
Task dependencies can be enforced either by the means of callback functions, or
by expressing dependencies between tags.
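To give an idea of how these concepts fit together, here is a minimal sketch
(@code{scal_func} is a hypothetical user-provided kernel, no data is managed,
and error checking is omitted):
@cartouche
@example
/* Sketch only: scal_func is a hypothetical user-provided kernel. */
extern void scal_func(void *buffers[], void *cl_arg);

struct starpu_codelet cl =
@{
    .where = STARPU_CPU,       /* only a CPU implementation is given */
    .cpu_func = scal_func,     /* pointer to the CPU kernel */
    .nbuffers = 0              /* no managed data in this sketch */
@};

struct starpu_task *task = starpu_task_create();
task->cl = &cl;                /* which codelet this task runs */
starpu_task_submit(task);      /* non-blocking submission */
starpu_task_wait_for_all();    /* synchronize with all submitted tasks */
@end example
@end cartouche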
@c TODO insert illustration f(Ar, Brw, Cr) + ..
@c DSM
@subsection StarPU Data Management Library
Because StarPU schedules tasks at runtime, data transfers have to be
done automatically and ``just-in-time'' between processing units,
relieving the application programmer from explicit data transfers.
Moreover, to avoid unnecessary transfers, StarPU keeps data
where it was last needed, even if it was modified there, and it
allows multiple copies of the same data to reside at the same time on
several processing units as long as it is not modified.
@c ---------------------------------------------------------------------
@c Installing StarPU
@c ---------------------------------------------------------------------
@node Installing StarPU
@chapter Installing StarPU
@menu
* Configuration of StarPU::
* Building and Installing StarPU::
@end menu
StarPU can be built and installed by the standard means of the GNU
autotools. The following chapter briefly summarizes how these tools
can be used to install StarPU.
@node Configuration of StarPU
@section Configuration of StarPU
@menu
* Generating Makefiles and configuration scripts::
* Configuring StarPU::
@end menu
@node Generating Makefiles and configuration scripts
@subsection Generating Makefiles and configuration scripts
This step is not necessary when using the tarball releases of StarPU. If you
are using the source code from the svn repository, you first need to generate
the configure scripts and the Makefiles.
@example
% autoreconf -vfi
@end example
@node Configuring StarPU
@subsection Configuring StarPU
@example
% ./configure
@end example
Details about options that are useful to give to @code{./configure} are given in
@ref{Configuration options}.
@node Building and Installing StarPU
@section Building and Installing StarPU
@menu
* Building::
* Sanity Checks::
* Installing::
* pkg-config configuration::
@end menu
@node Building
@subsection Building
@example
% make
@end example
@node Sanity Checks
@subsection Sanity Checks
In order to make sure that StarPU is working properly on the system, it is also
possible to run a test suite.
@example
% make check
@end example
@node Installing
@subsection Installing
In order to install StarPU at the location that was specified during
configuration:
@example
% make install
@end example
@c ---------------------------------------------------------------------
@c Using StarPU
@c ---------------------------------------------------------------------
@node Using StarPU
@chapter Using StarPU
@node Setting flags for compiling and linking applications
@section Setting flags for compiling and linking applications
Compiling and linking an application against StarPU may require specific
flags or libraries (for instance @code{CUDA} or @code{libspe2}).
To this end, it is possible to use the @code{pkg-config} tool.
If StarPU was not installed at some standard location, the path of StarPU's
library must be specified in the @code{PKG_CONFIG_PATH} environment variable so
that @code{pkg-config} can find it. For example if StarPU was installed in
@code{$prefix_dir}:
@example
% PKG_CONFIG_PATH=$PKG_CONFIG_PATH:$prefix_dir/lib/pkgconfig
@end example
The flags required to compile or link against StarPU are then
accessible with the following commands:
@example
% pkg-config --cflags libstarpu  # options for the compiler
% pkg-config --libs libstarpu    # options for the linker
@end example
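For instance, an application contained in a single source file (here
hypothetically called @code{my_app.c}) might be built as follows:
@example
% gcc my_app.c -o my_app `pkg-config --cflags libstarpu` \
               `pkg-config --libs libstarpu`
@end example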
@node Running a basic StarPU application
@section Running a basic StarPU application
Basic examples using StarPU have been built in the directory
@code{$prefix_dir/lib/starpu/examples/}. You can for example run the
@code{vector_scal} example.
@example
% $prefix_dir/lib/starpu/examples/vector_scal
BEFORE : First element was 1.000000
AFTER First element is 3.140000
%
@end example
@c ---------------------------------------------------------------------
@c Configuration options
@c ---------------------------------------------------------------------
@node Configuration options
@chapter Configuration options
@table @asis
@item @code{--disable-cpu}
Disable the use of the CPUs of the machine. Only GPUs etc. will be used.
@item @code{--enable-maxcudadev=<number>}
Defines the maximum number of CUDA devices that StarPU will support, then
available as the @code{STARPU_MAXCUDADEVS} macro.
@item @code{--disable-cuda}
Disable the use of CUDA, even if the SDK is detected.
@item @code{--enable-maxopencldev=<number>}
Defines the maximum number of OpenCL devices that StarPU will support, then
available as the @code{STARPU_MAXOPENCLDEVS} macro.
@item @code{--disable-opencl}
Disable the use of OpenCL, even if the SDK is detected.
@item @code{--enable-gordon}
Enable the use of the Gordon runtime for Cell SPUs.
@c TODO: rather default to enabled when detected
@item @code{--enable-debug}
Enable debugging messages.
@item @code{--enable-fast}
Do not enforce assertions; this saves the time otherwise spent computing them.
@item @code{--enable-verbose}
Augment the verbosity of the debugging messages.
@item @code{--enable-coverage}
Enable flags for the coverage tool.
@item @code{--enable-perf-debug}
Enable performance debugging.
@item @code{--enable-model-debug}
Enable performance model debugging.
@item @code{--enable-stats}
Enable statistics.
@item @code{--enable-maxbuffers=<nbuffers>}
Define the maximum number of buffers that tasks will be able to take as
parameters, then available as the @code{STARPU_NMAXBUFS} macro.
@item @code{--enable-allocation-cache}
Enable the use of a data allocation cache to avoid the cost of repeated
allocations with CUDA. Still experimental.
@item @code{--enable-opengl-render}
Enable the use of OpenGL for the rendering of some examples.
@c TODO: rather default to enabled when detected
@item @code{--enable-blas-lib=<name>}
Specify the BLAS library to be used by some of the examples. The
library has to be @code{atlas} or @code{goto}.
@item @code{--with-cuda-dir=<path>}
Specify the location where the CUDA SDK resides. This directory should notably contain
@code{include/cuda.h}.
@item @code{--with-magma=<path>}
Specify where MAGMA is installed.
@item @code{--with-opencl-dir=<path>}
Specify the location of the OpenCL SDK. This directory should notably contain
@code{include/CL/cl.h}.
@item @code{--with-gordon-dir=<path>}
Specify the location of the Gordon SDK.
@item @code{--with-fxt=<path>}
Specify the location of FxT (for generating traces and rendering them
using ViTE). This directory should notably contain
@code{include/fxt/fxt.h}.
@item @code{--with-perf-model-dir=<dir>}
Specify where performance models should be stored (instead of defaulting to the
current user's home).
@item @code{--with-mpicc=<path to mpicc>}
Specify the location of the @code{mpicc} compiler to be used for starpumpi.
@c TODO: also just use AC_PROG
@item @code{--with-mpi}
Enable building libstarpumpi.
@c TODO: rather just use the availability of mpicc instead of a second option
@item @code{--with-goto-dir=<dir>}
Specify the location of GotoBLAS.
@item @code{--with-atlas-dir=<dir>}
Specify the location of ATLAS. This directory should notably contain
@code{include/cblas.h}.
@end table
@c ---------------------------------------------------------------------
@c Environment variables
@c ---------------------------------------------------------------------
@node Environment variables
@chapter Environment variables
@menu
* Workers::                     Configuring workers
* Scheduling::                  Configuring the Scheduling engine
* Misc::                        Miscellaneous and debug
@end menu
Note: the values given in the @code{starpu_conf} structure passed when
calling @code{starpu_init} will override the values of the environment
variables.
@node Workers
@section Configuring workers
@menu
* STARPU_NCPUS::                Number of CPU workers
* STARPU_NCUDA::                Number of CUDA workers
* STARPU_NOPENCL::              Number of OpenCL workers
* STARPU_NGORDON::              Number of SPU workers (Cell)
* STARPU_WORKERS_CPUID::        Bind workers to specific CPUs
* STARPU_WORKERS_CUDAID::       Select specific CUDA devices
* STARPU_WORKERS_OPENCLID::     Select specific OpenCL devices
@end menu
@node STARPU_NCPUS
@subsection @code{STARPU_NCPUS} -- Number of CPU workers
@table @asis
@item @emph{Description}:
Specify the maximum number of CPU workers. Note that StarPU will not allocate
more CPUs than there are physical CPUs, and that some CPUs are used to control
the accelerators.
@end table
@node STARPU_NCUDA
@subsection @code{STARPU_NCUDA} -- Number of CUDA workers
@table @asis
@item @emph{Description}:
Specify the maximum number of CUDA devices that StarPU can use. If
@code{STARPU_NCUDA} is lower than the number of physical devices, it is
possible to select which CUDA devices should be used by the means of the
@code{STARPU_WORKERS_CUDAID} environment variable.
@end table
@node STARPU_NOPENCL
@subsection @code{STARPU_NOPENCL} -- Number of OpenCL workers
@table @asis
@item @emph{Description}:
OpenCL equivalent of the @code{STARPU_NCUDA} environment variable.
@end table
@node STARPU_NGORDON
@subsection @code{STARPU_NGORDON} -- Number of SPU workers (Cell)
@table @asis
@item @emph{Description}:
Specify the maximum number of SPUs that StarPU can use.
@end table
@node STARPU_WORKERS_CPUID
@subsection @code{STARPU_WORKERS_CPUID} -- Bind workers to specific CPUs
@table @asis
@item @emph{Description}:
Passing an array of integers (starting from 0) in @code{STARPU_WORKERS_CPUID}
specifies on which logical CPU the different workers should be
bound. For instance, if @code{STARPU_WORKERS_CPUID = "1 3 0 2"}, the first
worker will be bound to logical CPU #1, the second CPU worker will be bound to
logical CPU #3 and so on. Note that the logical ordering of the CPUs is either
determined by the OS, or provided by the @code{hwloc} library if it is
available.
Note that the first workers correspond to the CUDA workers, then come the
OpenCL and the SPU workers, and finally the CPU workers. For example, if
we have @code{STARPU_NCUDA=1}, @code{STARPU_NOPENCL=1}, @code{STARPU_NCPUS=2}
and @code{STARPU_WORKERS_CPUID = "0 2 1 3"}, the CUDA device will be controlled
by logical CPU #0, the OpenCL device will be controlled by logical CPU #2, and
the logical CPUs #1 and #3 will be used by the CPU workers.
If the number of workers is larger than the array given in
@code{STARPU_WORKERS_CPUID}, the workers are bound to the logical CPUs in a
round-robin fashion: if @code{STARPU_WORKERS_CPUID = "0 1"}, the first and the
third (resp. second and fourth) workers will be put on CPU #0 (resp. CPU #1).
@end table
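As an illustration (the worker counts are hypothetical), the following shell
session binds one CUDA worker to logical CPU #0 and two CPU workers to logical
CPUs #1 and #2:
@example
% export STARPU_NCUDA=1
% export STARPU_NCPUS=2
% export STARPU_WORKERS_CPUID="0 1 2"
% ./application
@end example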
@node STARPU_WORKERS_CUDAID
@subsection @code{STARPU_WORKERS_CUDAID} -- Select specific CUDA devices
@table @asis
@item @emph{Description}:
Similarly to the @code{STARPU_WORKERS_CPUID} environment variable, it is
possible to select which CUDA devices should be used by StarPU. On a machine
equipped with 4 GPUs, setting @code{STARPU_WORKERS_CUDAID = "1 3"} and
@code{STARPU_NCUDA=2} specifies that 2 CUDA workers should be created, and that
they should use CUDA devices #1 and #3 (the logical ordering of the devices is
the one reported by CUDA).
@end table
@node STARPU_WORKERS_OPENCLID
@subsection @code{STARPU_WORKERS_OPENCLID} -- Select specific OpenCL devices
@table @asis
@item @emph{Description}:
OpenCL equivalent of the @code{STARPU_WORKERS_CUDAID} environment variable.
@end table
@node Scheduling
@section Configuring the Scheduling engine
@menu
* STARPU_SCHED::                Scheduling policy
* STARPU_CALIBRATE::            Calibrate performance models
* STARPU_PREFETCH::             Use data prefetch
* STARPU_SCHED_ALPHA::          Computation factor
* STARPU_SCHED_BETA::           Communication factor
@end menu
@node STARPU_SCHED
@subsection @code{STARPU_SCHED} -- Scheduling policy
@table @asis
@item @emph{Description}:
This chooses between the different scheduling policies proposed by StarPU:
random, work stealing, greedy, with performance models, etc.
Use @code{STARPU_SCHED=help} to get the list of available schedulers.
@end table
@node STARPU_CALIBRATE
@subsection @code{STARPU_CALIBRATE} -- Calibrate performance models
@table @asis
@item @emph{Description}:
If this variable is set to 1, the performance models are calibrated during
the execution. If it is set to 2, the previous values are dropped to restart
calibration from scratch.
Note: this currently only applies to the dm and dmda scheduling policies.
@end table
@node STARPU_PREFETCH
@subsection @code{STARPU_PREFETCH} -- Use data prefetch
@table @asis
@item @emph{Description}:
If this variable is set, data prefetching will be enabled: when a task is
scheduled to be executed e.g. on a GPU, StarPU will request an asynchronous
transfer in advance, so that the data is already present on the GPU when the
task starts. As a result, computation and data transfers are overlapped.
@end table
@node STARPU_SCHED_ALPHA
@subsection @code{STARPU_SCHED_ALPHA} -- Computation factor
@table @asis
@item @emph{Description}:
To estimate the cost of a task, StarPU takes into account the estimated
computation time (obtained thanks to performance models). The alpha factor is
the coefficient to be applied to it before adding it to the communication part.
@end table
@node STARPU_SCHED_BETA
@subsection @code{STARPU_SCHED_BETA} -- Communication factor
@table @asis
@item @emph{Description}:
To estimate the cost of a task, StarPU takes into account the estimated
data transfer time (obtained thanks to performance models). The beta factor is
the coefficient to be applied to it before adding it to the computation part.
@end table
@node Misc
@section Miscellaneous and debug
@menu
* STARPU_LOGFILENAME::          Select debug file name
@end menu
@node STARPU_LOGFILENAME
@subsection @code{STARPU_LOGFILENAME} -- Select debug file name
@table @asis
@item @emph{Description}:
This variable specifies the file in which the debugging output should be saved.
@end table
@c ---------------------------------------------------------------------
@c StarPU API
@c ---------------------------------------------------------------------
@node StarPU API
@chapter StarPU API
@menu
* Initialization and Termination::  Initialization and Termination methods
* Workers' Properties::         Methods to enumerate workers' properties
* Data Library::                Methods to manipulate data
* Codelets and Tasks::          Methods to construct tasks
* Tags::                        Task dependencies
* CUDA extensions::             CUDA extensions
* OpenCL extensions::           OpenCL extensions
* Cell extensions::             Cell extensions
* Miscellaneous::               Miscellaneous helpers
@end menu
@node Initialization and Termination
@section Initialization and Termination
@menu
* starpu_init::                 Initialize StarPU
* struct starpu_conf::          StarPU runtime configuration
* starpu_shutdown::             Terminate StarPU
@end menu
@node starpu_init
@subsection @code{starpu_init} -- Initialize StarPU
@table @asis
@item @emph{Description}:
This is the StarPU initialization method, which must be called prior to any other
StarPU call. It is possible to specify StarPU's configuration (e.g. scheduling
policy, number of cores, ...) by passing a non-null argument. The default
configuration is used if the passed argument is @code{NULL}.
@item @emph{Return value}:
Upon successful completion, this function returns 0. Otherwise, @code{-ENODEV}
indicates that no worker was available (so that StarPU was not initialized).
@item @emph{Prototype}:
@code{int starpu_init(struct starpu_conf *conf);}
@end table
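A typical call sequence might look as follows (a sketch only; the error
handling policy is up to the application):
@cartouche
@example
#include <errno.h>   /* for ENODEV */
#include <starpu.h>

int main(int argc, char **argv)
@{
    int ret = starpu_init(NULL);  /* NULL: use the default configuration */
    if (ret == -ENODEV)
        return 1;                 /* no worker is available */
    /* ... submit tasks ... */
    starpu_shutdown();
    return 0;
@}
@end example
@end cartouche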
@node struct starpu_conf
@subsection @code{struct starpu_conf} -- StarPU runtime configuration
@table @asis
@item @emph{Description}:
This structure is passed to the @code{starpu_init} function in order
to configure StarPU.
When the default values are used, StarPU automatically selects the number
of processing units and uses the default scheduling policy. These fields
override the equivalent environment variables.
@item @emph{Fields}:
@table @asis
@item @code{sched_policy} (default = NULL):
This is the name of the scheduling policy. This can also be specified with the
@code{STARPU_SCHED} environment variable.
@item @code{ncpus} (default = -1):
This is the maximum number of CPU cores that StarPU can use. This can also be
specified with the @code{STARPU_NCPUS} environment variable.
@item @code{ncuda} (default = -1):
This is the maximum number of CUDA devices that StarPU can use. This can also be
specified with the @code{STARPU_NCUDA} environment variable.
@item @code{nopencl} (default = -1):
This is the maximum number of OpenCL devices that StarPU can use. This can also be
specified with the @code{STARPU_NOPENCL} environment variable.
@item @code{nspus} (default = -1):
This is the maximum number of Cell SPUs that StarPU can use. This can also be
specified with the @code{STARPU_NGORDON} environment variable.
@item @code{calibrate} (default = 0):
If this flag is set, StarPU will calibrate the performance models when
executing tasks. This can also be specified with the @code{STARPU_CALIBRATE}
environment variable.
@end table
@end table
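For instance, to restrict StarPU to a single CUDA device while keeping the
other settings at their defaults, one might write the following (a sketch
only; note that every documented field is set explicitly, since merely zeroing
the structure would not yield the -1 defaults listed above):
@cartouche
@example
struct starpu_conf conf =
@{
    .sched_policy = NULL,  /* default scheduling policy */
    .ncpus = -1,           /* use all available CPU cores */
    .ncuda = 1,            /* use at most one CUDA device */
    .nopencl = -1,
    .nspus = -1,
    .calibrate = 0
@};
starpu_init(&conf);
@end example
@end cartouche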
@node starpu_shutdown
@subsection @code{starpu_shutdown} -- Terminate StarPU
@table @asis
@item @emph{Description}:
This is the StarPU termination method. It must be called at the end of the
application: statistics and other post-mortem debugging information are not
guaranteed to be available until this method has been called.
@item @emph{Prototype}:
@code{void starpu_shutdown(void);}
@end table
@node Workers' Properties
@section Workers' Properties
@menu
* starpu_worker_get_count::     Get the number of processing units
* starpu_cpu_worker_get_count::  Get the number of CPUs controlled by StarPU
* starpu_cuda_worker_get_count::  Get the number of CUDA devices controlled by StarPU
* starpu_opencl_worker_get_count::  Get the number of OpenCL devices controlled by StarPU
* starpu_spu_worker_get_count::  Get the number of Cell SPUs controlled by StarPU
* starpu_worker_get_id::        Get the identifier of the current worker
* starpu_worker_get_type::      Get the type of processing unit associated to a worker
* starpu_worker_get_name::      Get the name of a worker
@end menu
@node starpu_worker_get_count
@subsection @code{starpu_worker_get_count} -- Get the number of processing units
@table @asis
@item @emph{Description}:
This function returns the number of workers (i.e. processing units executing
StarPU tasks). The returned value should be at most @code{STARPU_NMAXWORKERS}.
@item @emph{Prototype}:
@code{unsigned starpu_worker_get_count(void);}
@end table
@node starpu_cpu_worker_get_count
@subsection @code{starpu_cpu_worker_get_count} -- Get the number of CPUs controlled by StarPU
@table @asis
@item @emph{Description}:
This function returns the number of CPUs controlled by StarPU. The returned
value should be at most @code{STARPU_NMAXCPUS}.
@item @emph{Prototype}:
@code{unsigned starpu_cpu_worker_get_count(void);}
@end table
@node starpu_cuda_worker_get_count
@subsection @code{starpu_cuda_worker_get_count} -- Get the number of CUDA devices controlled by StarPU
@table @asis
@item @emph{Description}:
This function returns the number of CUDA devices controlled by StarPU. The returned
value should be at most @code{STARPU_MAXCUDADEVS}.
@item @emph{Prototype}:
@code{unsigned starpu_cuda_worker_get_count(void);}
@end table
@node starpu_opencl_worker_get_count
@subsection @code{starpu_opencl_worker_get_count} -- Get the number of OpenCL devices controlled by StarPU
@table @asis
@item @emph{Description}:
This function returns the number of OpenCL devices controlled by StarPU. The returned
value should be at most @code{STARPU_MAXOPENCLDEVS}.
@item @emph{Prototype}:
@code{unsigned starpu_opencl_worker_get_count(void);}
@end table
@node starpu_spu_worker_get_count
@subsection @code{starpu_spu_worker_get_count} -- Get the number of Cell SPUs controlled by StarPU
@table @asis
@item @emph{Description}:
This function returns the number of Cell SPUs controlled by StarPU.
@item @emph{Prototype}:
@code{unsigned starpu_spu_worker_get_count(void);}
@end table
@node starpu_worker_get_id
@subsection @code{starpu_worker_get_id} -- Get the identifier of the current worker
@table @asis
@item @emph{Description}:
This function returns the identifier of the worker associated to the calling
thread. The returned value is either -1 if the current context is not a StarPU
worker (i.e. when called from the application outside a task or a callback), or
an integer between 0 and @code{starpu_worker_get_count() - 1}.
@item @emph{Prototype}:
@code{int starpu_worker_get_id(void);}
@end table
@node starpu_worker_get_type
@subsection @code{starpu_worker_get_type} -- Get the type of processing unit associated to a worker
@table @asis
@item @emph{Description}:
This function returns the type of worker associated to an identifier (as
returned by the @code{starpu_worker_get_id} function). The returned value
indicates the architecture of the worker: @code{STARPU_CPU_WORKER} for a CPU
core, @code{STARPU_CUDA_WORKER} for a CUDA device,
@code{STARPU_OPENCL_WORKER} for an OpenCL device, and
@code{STARPU_GORDON_WORKER} for a Cell SPU. The value returned for an invalid
identifier is unspecified.
@item @emph{Prototype}:
@code{enum starpu_archtype starpu_worker_get_type(int id);}
@end table
@node starpu_worker_get_name
@subsection @code{starpu_worker_get_name} -- Get the name of a worker
@table @asis
@item @emph{Description}:
StarPU associates a unique human-readable string to each processing unit. This
function copies at most the first @code{maxlen} bytes of the unique string
associated to the worker identified by @code{id} into the
@code{dst} buffer. The caller is responsible for ensuring that @code{dst}
is a valid pointer to a buffer of at least @code{maxlen} bytes. Calling this
function with an invalid identifier results in unspecified behaviour.
@item @emph{Prototype}:
@code{void starpu_worker_get_name(int id, char *dst, size_t maxlen);}
@end table
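The functions above can be combined to enumerate the processing units
discovered by StarPU; a possible sketch (to be run after @code{starpu_init}):
@cartouche
@example
#include <stdio.h>
#include <starpu.h>

void list_workers(void)
@{
    unsigned i, nworkers = starpu_worker_get_count();
    for (i = 0; i < nworkers; i++)
    @{
        char name[64];
        starpu_worker_get_name((int)i, name, sizeof(name));
        printf("worker %u: %s\n", i, name);
    @}
@}
@end example
@end cartouche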
@node Data Library
@section Data Library
This section describes the data management facilities provided by StarPU.
TODO: We show how to use existing data interfaces in [ref], but developers can
design their own data interfaces if required.
@menu
* starpu_data_handle::          StarPU opaque data handle
* void *interface::             StarPU data interface
@end menu
@node starpu_data_handle
@subsection @code{starpu_data_handle} -- StarPU opaque data handle
@table @asis
@item @emph{Description}:
StarPU uses @code{starpu_data_handle} as an opaque handle to manage a piece of
data. Once a piece of data has been registered to StarPU, it is associated to a
@code{starpu_data_handle} which keeps track of the state of the piece of data
over the entire machine, so that we can maintain data consistency and locate
data replicates for instance.
@end table
@node void *interface
@subsection @code{void *interface} -- StarPU data interface
@table @asis
@item @emph{Description}:
Data management is done at a high level in StarPU: rather than accessing a mere
list of contiguous buffers, the tasks may manipulate data that are described by
a high-level construct which we call data interface.
TODO
@end table
@c void starpu_data_unregister(struct starpu_data_state_t *state);
@c starpu_worker_get_memory_node TODO
@c
@c user interaction with the DSM
@c void starpu_data_sync_with_mem(struct starpu_data_state_t *state);
@c void starpu_notify_data_modification(struct starpu_data_state_t *state, uint32_t modifying_node);
@node Codelets and Tasks
@section Codelets and Tasks
@menu
* struct starpu_codelet::       StarPU codelet structure
* struct starpu_task::          StarPU task structure
* starpu_task_init::            Initialize a Task
* starpu_task_create::          Allocate and Initialize a Task
* starpu_task_deinit::          Release all the resources used by a Task
* starpu_task_destroy::         Destroy a dynamically allocated Task
* starpu_task_submit::          Submit a Task
* starpu_task_wait::            Wait for the termination of a Task
* starpu_task_wait_for_all::    Wait for the termination of all Tasks
@end menu
@node struct starpu_codelet
@subsection @code{struct starpu_codelet} -- StarPU codelet structure
@table @asis
@item @emph{Description}:
The codelet structure describes a kernel that is possibly implemented on
various targets.
@item @emph{Fields}:
@table @asis
@item @code{where}:
Indicates which types of processing units are able to execute the codelet.
@code{STARPU_CPU|STARPU_CUDA} for instance indicates that the codelet is
implemented for both CPU cores and CUDA devices, while @code{STARPU_GORDON}
indicates that it is only available on Cell SPUs.
@item @code{cpu_func} (optional):
Is a function pointer to the CPU implementation of the codelet. Its prototype
must be: @code{void cpu_func(void *buffers[], void *cl_arg)}. The first
argument is the array of data managed by the data management library, and
the second argument is a pointer to the argument passed from the @code{cl_arg}
field of the @code{starpu_task} structure.
The @code{cpu_func} field is ignored if @code{STARPU_CPU} does not appear in
the @code{where} field; it must be non-null otherwise.
@item @code{cuda_func} (optional):
Is a function pointer to the CUDA implementation of the codelet. @emph{This
must be a host-function written in the CUDA runtime API}. Its prototype must
be: @code{void cuda_func(void *buffers[], void *cl_arg);}. The @code{cuda_func}
field is ignored if @code{STARPU_CUDA} does not appear in the @code{where}
field; it must be non-null otherwise.
@item @code{opencl_func} (optional):
Is a function pointer to the OpenCL implementation of the codelet. Its
prototype must be:
@code{void opencl_func(starpu_data_interface_t *descr, void *arg);}.
This pointer is ignored if @code{STARPU_OPENCL} does not appear in the
@code{where} field; it must be non-null otherwise.
@item @code{gordon_func} (optional):
This is the index of the Cell SPU implementation within the Gordon library.
TODO
@item @code{nbuffers}:
Specifies the number of arguments taken by the codelet. These arguments are
managed by the DSM and are accessed from the @code{void *buffers[]}
array. The constant argument passed with the @code{cl_arg} field of the
@code{starpu_task} structure is not counted in this number. This value should
not be above @code{STARPU_NMAXBUFS}.
@item @code{model} (optional):
This is a pointer to the performance model associated to this codelet. This
optional field is ignored when null. TODO
@end table
@end table
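Putting these fields together, a codelet implemented on both CPU cores and
CUDA devices might be declared as follows (a sketch; @code{cpu_impl} and
@code{cuda_impl} are hypothetical user kernels):
@cartouche
@example
extern void cpu_impl(void *buffers[], void *cl_arg);
extern void cuda_impl(void *buffers[], void *cl_arg); /* host function */

struct starpu_codelet cl =
@{
    .where = STARPU_CPU|STARPU_CUDA, /* two implementations available */
    .cpu_func = cpu_impl,
    .cuda_func = cuda_impl,
    .nbuffers = 2,                   /* two pieces of managed data */
    .model = NULL                    /* no performance model */
@};
@end example
@end cartouche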
@node struct starpu_task
@subsection @code{struct starpu_task} -- StarPU task structure
@table @asis
@item @emph{Description}:
The @code{starpu_task} structure describes a task that can be offloaded on the various
processing units managed by StarPU. It instantiates a codelet. It can either be
allocated dynamically with the @code{starpu_task_create} method, or declared
statically. In the latter case, the programmer has to zero the
@code{starpu_task} structure and to fill the different fields properly. The
indicated default values correspond to the configuration of a task allocated
with @code{starpu_task_create}.
@item @emph{Fields}:
@table @asis
@item @code{cl}:
Is a pointer to the corresponding @code{starpu_codelet} data structure. This
describes where the kernel should be executed, and supplies the appropriate
implementations. When set to @code{NULL}, no code is executed during the task;
such empty tasks can be useful for synchronization purposes.
@item @code{buffers}:
TODO
@item @code{cl_arg} (optional) (default = NULL):
This pointer is passed to the codelet through the second argument
of the codelet implementation (e.g. @code{cpu_func} or @code{cuda_func}).
In the specific case of the Cell processor, see the @code{cl_arg_size}
argument.
@item @code{cl_arg_size} (optional, Cell-specific):
In the case of the Cell processor, the @code{cl_arg} pointer is not directly
given to the SPU function. A buffer of size @code{cl_arg_size} is allocated on
the SPU. This buffer is then filled with the @code{cl_arg_size} bytes starting
at address @code{cl_arg}. In this case, the argument given to the SPU codelet
is therefore not the @code{cl_arg} pointer, but the address of the buffer in
local store (LS) instead. This field is ignored for CPU, CUDA and OpenCL
codelets.
@item @code{callback_func} (optional) (default = @code{NULL}):
This is a function pointer of prototype @code{void (*f)(void *)} which
specifies a possible callback. If this pointer is non-null, the callback
function is executed @emph{on the host} after the execution of the task. The
callback is passed the value contained in the @code{callback_arg} field. No
callback is executed if the field is null.
@item @code{callback_arg} (optional) (default = @code{NULL}):
This is the pointer passed to the callback function. This field is ignored if
the @code{callback_func} field is null.
@item @code{use_tag} (optional) (default = 0):
If set, this flag indicates that the task should be associated with the tag
contained in the @code{tag_id} field. Tags allow the application to synchronize
with the task and to express task dependencies easily.
@item @code{tag_id}:
This field contains the tag associated to the task if the @code{use_tag} field
was set; it is ignored otherwise.
@item @code{synchronous}:
If this flag is set, the @code{starpu_task_submit} function is blocking and
returns only when the task has been executed (or if no worker is able to
process the task). Otherwise, @code{starpu_task_submit} returns immediately.
@item @code{priority} (optional) (default = @code{STARPU_DEFAULT_PRIO}):
This field indicates a level of priority for the task. This is an integer value
between @code{STARPU_MIN_PRIO} (for the least important
tasks) and @code{STARPU_MAX_PRIO} (for the most important tasks), inclusive.
The default priority is @code{STARPU_DEFAULT_PRIO}. Scheduling strategies that
take priorities into account can use this parameter to take better scheduling
decisions, but the scheduling policy may also ignore it.
@item @code{execute_on_a_specific_worker} (default = 0):
If this flag is set, StarPU will bypass the scheduler and directly assign this
task to the worker specified by the @code{workerid} field.
@item @code{workerid} (optional):
If the @code{execute_on_a_specific_worker} field is set, this field indicates
the identifier of the worker that should process this task (as
returned by @code{starpu_worker_get_id}). This field is ignored if
the @code{execute_on_a_specific_worker} field is set to 0.
@item @code{detach} (optional) (default = 1):
If this flag is set, it is not possible to synchronize with the task
by the means of @code{starpu_task_wait} later on. Internal data structures
are only guaranteed to be freed once @code{starpu_task_wait} is called if the
flag is not set.
@item @code{destroy} (optional) (default = 1):
If this flag is set, the task structure will automatically be freed, either
after the execution of the callback if the task is detached, or during
@code{starpu_task_wait} otherwise. If this flag is not set, dynamically
allocated data structures will not be freed until @code{starpu_task_destroy} is
called explicitly. Setting this flag for a statically allocated task structure
will result in undefined behaviour.
@end table
@end table
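As an illustration, a dynamically allocated task using a codelet such as the
one above might be configured as follows (a sketch; @code{cl}, @code{factor}
and @code{my_callback} are hypothetical):
@cartouche
@example
float factor = 3.14f;

struct starpu_task *task = starpu_task_create();
task->cl = &cl;                      /* hypothetical codelet */
task->cl_arg = &factor;              /* constant argument for the kernel */
task->callback_func = my_callback;   /* hypothetical host-side callback */
task->callback_arg = NULL;
task->priority = STARPU_MAX_PRIO;    /* hint for priority-aware policies */
task->synchronous = 0;               /* submission will not block */
starpu_task_submit(task);
@end example
@end cartouche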
@node starpu_task_init
@subsection @code{starpu_task_init} -- Initialize a Task
@table @asis
@item @emph{Description}:
Initialize a task structure with default values. This function is implicitly
called by @code{starpu_task_create}. By default, tasks initialized with
@code{starpu_task_init} must be deinitialized explicitly with
@code{starpu_task_deinit}. Tasks can also be initialized statically, using the
constant @code{STARPU_TASK_INITIALIZER}.
@item @emph{Prototype}:
@code{void starpu_task_init(struct starpu_task *task);}
@end table
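A statically allocated task might be used as follows (a sketch; @code{cl} is a
hypothetical codelet, and @code{detach} is cleared so that
@code{starpu_task_wait} can be used):
@cartouche
@example
struct starpu_task task;
starpu_task_init(&task);    /* fill the structure with default values */
task.cl = &cl;              /* hypothetical codelet */
task.detach = 0;            /* allow synchronizing with starpu_task_wait */
task.destroy = 0;           /* never set destroy on a static task */
starpu_task_submit(&task);
starpu_task_wait(&task);    /* block until the task completes */
starpu_task_deinit(&task);  /* release internal resources */
@end example
@end cartouche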
@node starpu_task_create
@subsection @code{starpu_task_create} -- Allocate and Initialize a Task
@table @asis
@item @emph{Description}:
Allocate a task structure and initialize it with default values. Tasks
allocated dynamically with @code{starpu_task_create} are automatically freed when the
task is terminated. If the destroy flag is explicitly unset, the resources used
by the task have to be freed by calling
@code{starpu_task_destroy}.
@item @emph{Prototype}:
@code{struct starpu_task *starpu_task_create(void);}
@end table
@node starpu_task_deinit
@subsection @code{starpu_task_deinit} -- Release all the resources used by a Task
@table @asis
@item @emph{Description}:
Release all the structures automatically allocated to execute the task. This is
called automatically by @code{starpu_task_destroy}, but the task structure itself is not
freed. This should be used for statically allocated tasks for instance.
@item @emph{Prototype}:
@code{void starpu_task_deinit(struct starpu_task *task);}
@end table
@node starpu_task_destroy
@subsection @code{starpu_task_destroy} -- Destroy a dynamically allocated Task
@table @asis
@item @emph{Description}:
Free the resources allocated during @code{starpu_task_create}. This function can be
called automatically after the execution of a task by setting the
@code{destroy} flag of the @code{starpu_task} structure (default behaviour).
Calling this function on a statically allocated task results in undefined
behaviour.
@item @emph{Prototype}:
@code{void starpu_task_destroy(struct starpu_task *task);}
@end table
@node starpu_task_wait
@subsection @code{starpu_task_wait} -- Wait for the termination of a Task
@table @asis
@item @emph{Description}:
This function blocks until the task has been executed. It is not possible to
synchronize with a task more than once. It is not possible to wait for
synchronous or detached tasks.
@item @emph{Return value}:
Upon successful completion, this function returns 0. Otherwise, @code{-EINVAL}
indicates that the specified task was either synchronous or detached.
@item @emph{Prototype}:
@code{int starpu_task_wait(struct starpu_task *task);}
@end table
@node starpu_task_submit
@subsection @code{starpu_task_submit} -- Submit a Task
@table @asis
@item @emph{Description}:
This function submits a task to StarPU. Calling this function does
not mean that the task will be executed immediately as there can be data or task
(tag) dependencies that are not fulfilled yet: StarPU will take care of
scheduling this task with respect to such dependencies.
This function returns immediately if the @code{synchronous} field of the
@code{starpu_task} structure was set to 0, and blocks until the termination of
the task otherwise. It is also possible to synchronize the application with
asynchronous tasks by the means of tags, using the @code{starpu_tag_wait}
function for instance.
@item @emph{Return value}:
In case of success, this function returns 0; a return value of @code{-ENODEV}
means that there is no worker able to process this task (e.g. there is no GPU
available and this task is only implemented for CUDA devices).
@item @emph{Prototype}:
@code{int starpu_task_submit(struct starpu_task *task);}
@end table
@node starpu_task_wait_for_all
@subsection @code{starpu_task_wait_for_all} -- Wait for the termination of all Tasks
@table @asis
@item @emph{Description}:
This function blocks until all the tasks that were submitted are terminated.
@item @emph{Prototype}:
@code{void starpu_task_wait_for_all(void);}
@end table
@c Callbacks : what can we put in callbacks ?
@node Tags
@section Tags
@menu
* starpu_tag_t::                Task identifier
* starpu_tag_declare_deps::     Declare the Dependencies of a Tag
* starpu_tag_declare_deps_array::  Declare the Dependencies of a Tag
* starpu_tag_wait::             Block until a Tag is terminated
* starpu_tag_wait_array::       Block until a set of Tags is terminated
* starpu_tag_remove::           Destroy a Tag
* starpu_tag_notify_from_apps::  Feed a tag explicitly
@end menu
@node starpu_tag_t
@subsection @code{starpu_tag_t} -- Task identifier
@table @asis
@item @emph{Description}:
It is possible to associate a task with a unique ``tag'' and to express
dependencies between tasks by the means of those tags. To do so, fill the
@code{tag_id} field of the @code{starpu_task} structure with a tag number (which can
be arbitrary) and set the @code{use_tag} field to 1.
If @code{starpu_tag_declare_deps} is called with this tag number, the task will
not be started until the tasks which hold the declared dependency tags are
completed.
@end table
@node starpu_tag_declare_deps
@subsection @code{starpu_tag_declare_deps} -- Declare the Dependencies of a Tag
@table @asis
@item @emph{Description}:
Specify the dependencies of the task identified by tag @code{id}. The first
argument specifies the tag which is configured, the second argument gives the
number of tag(s) on which @code{id} depends. The following arguments are the
tags which have to be terminated to unlock the task.
This function must be called before the associated task is submitted to StarPU
with @code{starpu_task_submit}.
@item @emph{Remark}
Because of the variable arity of @code{starpu_tag_declare_deps}, note that the
last arguments @emph{must} be of type @code{starpu_tag_t}: constant values
typically need to be explicitly cast. Using the
@code{starpu_tag_declare_deps_array} function avoids this hazard.
@item @emph{Prototype}:
@code{void starpu_tag_declare_deps(starpu_tag_t id, unsigned ndeps, ...);}
@item @emph{Example}:
@cartouche
@example
/* Tag 0x1 depends on tags 0x32 and 0x52 */
starpu_tag_declare_deps((starpu_tag_t)0x1,
        2, (starpu_tag_t)0x32, (starpu_tag_t)0x52);
@end example
@end cartouche
@end table
@node starpu_tag_declare_deps_array
@subsection @code{starpu_tag_declare_deps_array} -- Declare the Dependencies of a Tag
@table @asis
@item @emph{Description}:
This function is similar to @code{starpu_tag_declare_deps}, except that it
does not take a variable number of arguments but an array of tags of size
@code{ndeps}.
@item @emph{Prototype}:
@code{void starpu_tag_declare_deps_array(starpu_tag_t id, unsigned ndeps, starpu_tag_t *array);}
@item @emph{Example}:
@cartouche
@example
/* Tag 0x1 depends on tags 0x32 and 0x52 */
starpu_tag_t tag_array[2] = @{0x32, 0x52@};
starpu_tag_declare_deps_array((starpu_tag_t)0x1, 2, tag_array);
@end example
@end cartouche
@end table
@node starpu_tag_wait
@subsection @code{starpu_tag_wait} -- Block until a Tag is terminated
@table @asis
@item @emph{Description}:
This function blocks until the task associated to tag @code{id} has been
executed. This is a blocking call which must therefore not be called within
tasks or callbacks, but only from the application directly. It is possible to
synchronize with the same tag multiple times, as long as the
@code{starpu_tag_remove} function is not called. Note that it is still
possible to synchronize with a tag associated to a task whose @code{starpu_task}
data structure was freed (e.g. if the @code{destroy} flag of the
@code{starpu_task} was enabled).
@item @emph{Prototype}:
@code{void starpu_tag_wait(starpu_tag_t id);}
@end table
@node starpu_tag_wait_array
@subsection @code{starpu_tag_wait_array} -- Block until a set of Tags is terminated
@table @asis
@item @emph{Description}:
This function is similar to @code{starpu_tag_wait} except that it blocks until
@emph{all} the @code{ntags} tags contained in the @code{id} array are
terminated.
@item @emph{Prototype}:
@code{void starpu_tag_wait_array(unsigned ntags, starpu_tag_t *id);}
@end table
@node starpu_tag_remove
@subsection @code{starpu_tag_remove} -- Destroy a Tag
@table @asis
@item @emph{Description}:
This function releases the resources associated to tag @code{id}. It can be
called once the corresponding task has been executed and when there is
no other tag that depends on this tag anymore.
@item @emph{Prototype}:
@code{void starpu_tag_remove(starpu_tag_t id);}
@end table
@node starpu_tag_notify_from_apps
@subsection @code{starpu_tag_notify_from_apps} -- Feed a Tag explicitly
@table @asis
@item @emph{Description}:
This function explicitly unlocks tag @code{id}. It may be useful for
applications which execute part of their computation outside StarPU tasks
(e.g. third-party libraries). It is also provided as a convenience for the
programmer, for instance to construct the entire task DAG before actually
giving StarPU the opportunity to execute the tasks.
@item @emph{Prototype}:
@code{void starpu_tag_notify_from_apps(starpu_tag_t id);}
@end table
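For instance (a sketch with arbitrary tag values), a task can be made to
depend on a tag which the application unlocks explicitly once some external
computation has completed:
@cartouche
@example
/* The task associated with tag 0x2 may start only once
 * tag 0x1 is unlocked */
starpu_tag_declare_deps((starpu_tag_t)0x2, 1, (starpu_tag_t)0x1);

/* ... computation performed outside StarPU ... */

/* Explicitly unlock tag 0x1 */
starpu_tag_notify_from_apps((starpu_tag_t)0x1);
@end example
@end cartouche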
@node CUDA extensions
@section CUDA extensions
@c void starpu_data_malloc_pinned_if_possible(float **A, size_t dim);
@c starpu_helper_cublas_init TODO
@c starpu_helper_cublas_shutdown TODO
@menu
* starpu_cuda_get_local_stream:: Get current worker's CUDA stream
* starpu_helper_cublas_init:: Initialize CUBLAS on every CUDA device
* starpu_helper_cublas_shutdown:: Deinitialize CUBLAS on every CUDA device
@end menu
@node starpu_cuda_get_local_stream
@subsection @code{starpu_cuda_get_local_stream} -- Get current worker's CUDA stream
@table @asis
@item @emph{Description}:
StarPU provides a stream for every CUDA device controlled by StarPU. This
function is only provided for convenience so that programmers can easily use
asynchronous operations within codelets without having to create a stream by
hand. Note that the application is not forced to use the stream provided by
@code{starpu_cuda_get_local_stream} and may also create its own streams.
@item @emph{Prototype}:
@code{cudaStream_t *starpu_cuda_get_local_stream(void);}
@end table
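For instance, a CUDA codelet may perform an asynchronous copy on the worker's
stream; in the following sketch, @code{d_ptr}, @code{h_ptr} and @code{size}
are hypothetical device pointer, host pointer and transfer size:
@cartouche
@example
cudaStream_t *stream = starpu_cuda_get_local_stream();

/* Asynchronous copy on the stream of the current worker */
cudaMemcpyAsync(d_ptr, h_ptr, size, cudaMemcpyHostToDevice, *stream);
cudaStreamSynchronize(*stream);
@end example
@end cartouche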
@node starpu_helper_cublas_init
@subsection @code{starpu_helper_cublas_init} -- Initialize CUBLAS on every CUDA device
@table @asis
@item @emph{Description}:
The CUBLAS library must be initialized prior to any CUBLAS call. Calling
@code{starpu_helper_cublas_init} will initialize CUBLAS on every CUDA device
controlled by StarPU. This call blocks until CUBLAS has been properly
initialized on every device.
@item @emph{Prototype}:
@code{void starpu_helper_cublas_init(void);}
@end table
@node starpu_helper_cublas_shutdown
@subsection @code{starpu_helper_cublas_shutdown} -- Deinitialize CUBLAS on every CUDA device
@table @asis
@item @emph{Description}:
This function synchronously deinitializes the CUBLAS library on every CUDA device.
@item @emph{Prototype}:
@code{void starpu_helper_cublas_shutdown(void);}
@end table
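A typical call sequence (a sketch): initialize CUBLAS right after StarPU, and
shut it down before StarPU itself.
@cartouche
@example
starpu_init(NULL);
starpu_helper_cublas_init();

/* ... submit tasks whose CUDA implementations call CUBLAS ... */

starpu_helper_cublas_shutdown();
starpu_shutdown();
@end example
@end cartouche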
@node OpenCL extensions
@section OpenCL extensions
@menu
* Enabling OpenCL:: Enabling OpenCL
* Compiling OpenCL codelets:: Compiling OpenCL codelets
@end menu
@node Enabling OpenCL
@subsection Enabling OpenCL
On GPU devices which can run both CUDA and OpenCL, CUDA is enabled by
default. To enable OpenCL, you need either to disable CUDA when configuring
StarPU:
@example
% ./configure --disable-cuda
@end example
or when running applications:
@example
% STARPU_NCUDA=0 ./application
@end example
OpenCL will automatically be started on any device not yet used by
CUDA. On a machine with 4 GPUs, it is therefore possible to enable CUDA on 2
devices and OpenCL on the 2 other devices as follows:
@example
% STARPU_NCUDA=2 ./application
@end example
@node Compiling OpenCL codelets
@subsection Compiling OpenCL codelets
TODO
@node Cell extensions
@section Cell extensions
Nothing yet.
@node Miscellaneous
@section Miscellaneous helpers
@menu
* starpu_execute_on_each_worker:: Execute a function on a subset of workers
@end menu
@node starpu_execute_on_each_worker
@subsection @code{starpu_execute_on_each_worker} -- Execute a function on a subset of workers
@table @asis
@item @emph{Description}:
When calling this function, the offloaded function specified by the first
argument is executed by every StarPU worker that may execute it.
The second argument is passed to the offloaded function.
The last argument specifies on which types of processing units the function
should be executed. Similarly to the @code{where} field of the
@code{starpu_codelet} structure, it is possible to specify that the function
should be executed on every CUDA device and every CPU by passing
@code{STARPU_CPU|STARPU_CUDA}.
This function blocks until it has been executed on every appropriate
processing unit, so it must not be called from a callback function, for
instance.
@item @emph{Prototype}:
@code{void starpu_execute_on_each_worker(void (*func)(void *), void *arg, uint32_t where);}
@end table
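For instance (a sketch; the function name and message are illustrative), the
following runs a function once on every CPU and CUDA worker:
@cartouche
@example
static void init_func(void *arg)
@{
    fprintf(stderr, "%s\n", (char *)arg);
@}

/* Execute init_func once on every CPU and every CUDA worker */
starpu_execute_on_each_worker(init_func, (void *)"worker ready",
                              STARPU_CPU|STARPU_CUDA);
@end example
@end cartouche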
@c ---------------------------------------------------------------------
@c Basic Examples
@c ---------------------------------------------------------------------
@node Basic Examples
@chapter Basic Examples
@menu
* Compiling and linking:: Compiling and Linking Options
* Hello World:: Submitting Tasks
* Scaling a Vector:: Manipulating Data
* Scaling a Vector (hybrid):: Handling Heterogeneous Architectures
@end menu
@node Compiling and linking
@section Compiling and linking options
The Makefile could for instance contain the following lines to define which
options must be given to the compiler and to the linker:
@cartouche
@example
CFLAGS+=$$(pkg-config --cflags libstarpu)
LIBS+=$$(pkg-config --libs libstarpu)
@end example
@end cartouche
@node Hello World
@section Hello World
In this section, we show how to implement a simple program that submits a task to StarPU.
@subsection Required Headers
The @code{starpu.h} header should be included in any code using StarPU.
@cartouche
@example
#include <starpu.h>
@end example
@end cartouche
@subsection Defining a Codelet
@cartouche
@example
void cpu_func(void *buffers[], void *cl_arg)
@{
    float *array = cl_arg;

    printf("Hello world (array = @{%f, %f@} )\n", array[0], array[1]);
@}

starpu_codelet cl =
@{
    .where = STARPU_CPU,
    .cpu_func = cpu_func,
    .nbuffers = 0
@};
@end example
@end cartouche
A codelet is a structure that represents a computational kernel. Such a codelet
may contain an implementation of the same kernel on different architectures
(e.g. CUDA, Cell's SPU, x86, ...).
The @code{nbuffers} field specifies the number of data buffers that are
manipulated by the codelet: here the codelet does not access or modify any data
that is controlled by our data management library. Note that the argument
passed to the codelet (the @code{cl_arg} field of the @code{starpu_task}
structure) does not count as a buffer since it is not managed by our data
management library.
@c TODO need a crossref to the proper description of "where" see bla for more ...
We create a codelet which may only be executed on the CPUs. The @code{where}
field is a bitmask that defines where the codelet may be executed. Here, the
@code{STARPU_CPU} value means that only CPUs can execute this codelet
(@pxref{Codelets and Tasks} for more details on this field).
When a CPU core executes a codelet, it calls the @code{cpu_func} function,
which @emph{must} have the following prototype:
@cartouche
@example
void (*cpu_func)(void *buffers[], void *cl_arg);
@end example
@end cartouche
In this example, we can ignore the first argument of this function, which gives
a description of the input and output buffers (e.g. the size and the location
of the matrices). The second argument is a pointer to a buffer passed as an
argument to the codelet by means of the @code{cl_arg} field of the
@code{starpu_task} structure.
Be aware that this may be a pointer to a
@emph{copy} of the actual buffer, and not the pointer given by the programmer:
if the codelet modifies this buffer, there is no guarantee that the initial
buffer will be modified as well; this implies, for instance, that the buffer
cannot be used as a synchronization medium.
@subsection Submitting a Task
@cartouche
@example
void callback_func(void *callback_arg)
@{
    printf("Callback function (arg %p)\n", callback_arg);
@}

int main(int argc, char **argv)
@{
    /* initialize StarPU */
    starpu_init(NULL);

    struct starpu_task *task = starpu_task_create();

    task->cl = &cl;

    float array[2] = @{1.0f, -1.0f@};
    task->cl_arg = &array;
    task->cl_arg_size = 2*sizeof(float);

    task->callback_func = callback_func;
    task->callback_arg = (void *)0x42;

    /* starpu_task_submit will be a blocking call */
    task->synchronous = 1;

    /* submit the task to StarPU */
    starpu_task_submit(task);

    /* terminate StarPU */
    starpu_shutdown();

    return 0;
@}
@end example
@end cartouche
Before submitting any tasks to StarPU, @code{starpu_init} must be called. The
@code{NULL} argument specifies that we use the default configuration. Tasks
cannot be submitted after the termination of StarPU by a call to
@code{starpu_shutdown}.
In the example above, a task structure is allocated by a call to
@code{starpu_task_create}. This function only allocates and fills the
corresponding structure with the default settings (@pxref{starpu_task_create}),
but it does not submit the task to StarPU.
The @code{cl} field is a pointer to the codelet which the task will
execute: in other words, the codelet structure describes which computational
kernel should be offloaded on the different architectures, and the task
structure is a wrapper containing a codelet and the piece of data on which the
codelet should operate.
The optional @code{cl_arg} field is a pointer to a buffer (of size
@code{cl_arg_size}) with some parameters for the kernel
described by the codelet. For instance, if a codelet implements a computational
kernel that multiplies its input vector by a constant, the constant could be
specified by means of this buffer.
Once a task has been executed, an optional callback function can be called.
While the computational kernel could be offloaded on various architectures, the
callback function is always executed on a CPU. The @code{callback_arg}
pointer is passed as an argument of the callback. The prototype of a callback
function must be:
@cartouche
@example
void (*callback_function)(void *);
@end example
@end cartouche
If the @code{synchronous} field is non-zero, task submission will be
synchronous: the @code{starpu_task_submit} function will not return until the
task has been executed. Note that the @code{starpu_shutdown} method does not
guarantee that asynchronous tasks have been executed before it returns.
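For asynchronous submission, one possible pattern (a sketch) is to leave
@code{synchronous} set to 0 and to wait for all tasks before shutting StarPU
down:
@cartouche
@example
task->synchronous = 0;
starpu_task_submit(task);

/* ... submit more asynchronous tasks ... */

/* Wait for all submitted tasks before terminating StarPU */
starpu_task_wait_for_all();
starpu_shutdown();
@end example
@end cartouche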
@node Scaling a Vector
@section Manipulating Data: Scaling a Vector
The previous example has shown how to submit tasks. In this section, we show
how StarPU tasks can manipulate data.
Programmers can describe the data layout of their application so that StarPU is
responsible for enforcing data coherency and availability across the machine.
Instead of handling complex (and non-portable) mechanisms to perform data
movements, programmers only declare which piece of data is accessed and/or
modified by a task, and StarPU makes sure that when a computational kernel
starts somewhere (e.g. on a GPU), its data are available locally.
Before submitting such tasks, the programmer first needs to declare the
different pieces of data to StarPU using the @code{starpu_*_data_register}
functions. To ease the development of applications for StarPU, it is possible
to describe multiple types of data layout. A type of data layout is called an
@b{interface}. Several interfaces are available by default in StarPU: here we
will consider the @b{vector interface}.
The following lines show how to declare an array of @code{n} elements of type
@code{float} using the vector interface:
@cartouche
@example
float tab[n];
starpu_data_handle tab_handle;

starpu_vector_data_register(&tab_handle, 0, (uintptr_t)tab,
                            n, sizeof(float));
@end example
@end cartouche
The first argument, called the @b{data handle}, is an opaque pointer which
designates the array in StarPU. This is also the structure which is used to
describe which data is used by a task. The second argument is the node number
where the data currently resides. Here it is 0 since the @code{tab} array is in
the main memory. Then comes the pointer @code{tab} where the data can be found,
the number of elements in the vector and the size of each element.
It is possible to construct a StarPU
task that multiplies this vector by a constant factor:
@cartouche
@example
float factor = 3.0;
struct starpu_task *task = starpu_task_create();

task->cl = &cl;
task->buffers[0].handle = tab_handle;
task->buffers[0].mode = STARPU_RW;
task->cl_arg = &factor;
task->cl_arg_size = sizeof(float);
task->synchronous = 1;

starpu_task_submit(task);
@end example
@end cartouche
Since the factor is a constant, it does not need a preliminary declaration, and
can just be passed through the @code{cl_arg} pointer as in the previous
example. The vector parameter is described by its handle.
There are two fields in each element of the @code{buffers} array.
@code{handle} is the handle of the data, and @code{mode} specifies how the
kernel will access the data (@code{STARPU_R} for read-only, @code{STARPU_W} for
write-only and @code{STARPU_RW} for read and write access).
The definition of the codelet can be written as follows:
@cartouche
@example
void scal_func(void *buffers[], void *cl_arg)
@{
    unsigned i;
    float *factor = cl_arg;
    struct starpu_vector_interface_s *vector = buffers[0];

    /* length of the vector */
    unsigned n = STARPU_GET_VECTOR_NX(vector);

    /* local copy of the vector pointer */
    float *val = (float *)STARPU_GET_VECTOR_PTR(vector);

    for (i = 0; i < n; i++)
        val[i] *= *factor;
@}

starpu_codelet cl = @{
    .where = STARPU_CPU,
    .cpu_func = scal_func,
    .nbuffers = 1
@};
@end example
@end cartouche
The second argument of the @code{scal_func} function contains a pointer to the
parameters of the codelet (given in @code{task->cl_arg}), so that we read the
constant factor from this pointer. The first argument is an array that gives
a description of every buffer passed in the @code{task->buffers} array. The
size of this array is given by the @code{nbuffers} field of the codelet
structure. For the sake of generality, this array contains pointers to the
different interfaces describing each buffer. In the case of the @b{vector
interface}, the location of the vector (resp. its length) is accessible through
the @code{ptr} (resp. @code{nx}) field of this interface. Since the vector is
accessed in a read-write fashion, any modification will automatically affect
future accesses to this vector made by other tasks.
@node Scaling a Vector (hybrid)
@section Vector Scaling on a Hybrid CPU/GPU Machine
Contrary to the previous examples, the task submitted in this example may be
executed not only by the CPUs, but also by a CUDA device.
@menu
* Source code:: Source of the StarPU application
* Compilation and execution:: Executing the StarPU application
@end menu
@node Source code
@subsection Source code
The CUDA implementation can be written as follows. It needs to be
compiled with a CUDA compiler such as nvcc, the NVIDIA CUDA compiler
driver.
@cartouche
@example
#include <starpu.h>

static __global__ void vector_mult_cuda(float *val, unsigned n,
                                        float factor)
@{
    unsigned i;
    for(i = 0; i < n; i++)
        val[i] *= factor;
@}

extern "C" void scal_cuda_func(void *buffers[], void *_args)
@{
    float *factor = (float *)_args;
    struct starpu_vector_interface_s *vector =
        (struct starpu_vector_interface_s *)buffers[0];

    /* length of the vector */
    unsigned n = STARPU_GET_VECTOR_NX(vector);

    /* local copy of the vector pointer */
    float *val = (float *)STARPU_GET_VECTOR_PTR(vector);

    /* TODO: use more blocks and threads in blocks */
    vector_mult_cuda<<<1,1>>>(val, n, *factor);

    cudaThreadSynchronize();
@}
@end example
@end cartouche
The CPU implementation is the same as in the previous section.
Here is the source of the main application. You can notice the value of the
field @code{where} for the codelet. We specify
@code{STARPU_CPU|STARPU_CUDA} to indicate to StarPU that the codelet
can be executed either on a CPU or on a CUDA device.
@cartouche
@example
#include <starpu.h>

#define NX 5

extern void scal_cuda_func(void *buffers[], void *_args);
extern void scal_func(void *buffers[], void *_args);

/* @b{Definition of the codelet} */
static starpu_codelet cl = @{
    .where = STARPU_CPU|STARPU_CUDA, /* @b{It can be executed on a CPU} */
                                     /* @b{or on a CUDA device} */
    .cuda_func = scal_cuda_func,
    .cpu_func = scal_func,
    .nbuffers = 1
@};

int main(int argc, char **argv)
@{
    float *vector;
    int i, ret;
    float factor = 3.0;
    struct starpu_task *task;
    starpu_data_handle tab_handle;

    starpu_init(NULL);               /* @b{Initialising StarPU} */

    vector = (float *)malloc(NX*sizeof(float));
    assert(vector);
    for(i = 0; i < NX; i++) vector[i] = i;
@end example
@end cartouche
@cartouche
@example
    /* @b{Registering data within StarPU} */
    starpu_vector_data_register(&tab_handle, 0, (uintptr_t)vector,
                                NX, sizeof(float));

    /* @b{Definition of the task} */
    task = starpu_task_create();
    task->cl = &cl;
    task->callback_func = NULL;
    task->buffers[0].handle = tab_handle;
    task->buffers[0].mode = STARPU_RW;
    task->cl_arg = &factor;
    task->cl_arg_size = sizeof(float);
@end example
@end cartouche
@cartouche
@example
    /* @b{Submitting the task} */
    ret = starpu_task_submit(task);
    if (ret == -ENODEV) @{
        fprintf(stderr, "No worker may execute this task\n");
        return 1;
    @}

    /* @b{Waiting for its termination} */
    starpu_task_wait_for_all();

    /* @b{Update the vector in RAM} */
    starpu_data_sync_with_mem(tab_handle, STARPU_R);
@end example
@end cartouche
@cartouche
@example
    /* @b{Access the data} */
    for(i = 0; i < NX; i++) @{
        fprintf(stderr, "%f ", vector[i]);
    @}
    fprintf(stderr, "\n");

    /* @b{Release the data and shutdown StarPU} */
    starpu_data_release_from_mem(tab_handle);
    starpu_shutdown();

    return 0;
@}
@end example
@end cartouche
@node Compilation and execution
@subsection Compilation and execution
Let's suppose StarPU has been installed in the directory
@code{$STARPU_DIR}. As explained in @ref{pkg-config configuration},
the variable @code{PKG_CONFIG_PATH} needs to be set. It is also
necessary to set the variable @code{LD_LIBRARY_PATH} to locate dynamic
libraries at runtime.
@example
% PKG_CONFIG_PATH=$STARPU_DIR/lib/pkgconfig:$PKG_CONFIG_PATH
% LD_LIBRARY_PATH=$STARPU_DIR/lib:$LD_LIBRARY_PATH
@end example
It is then possible to compile the application using the following
makefile:
@cartouche
@example
CFLAGS += $(shell pkg-config --cflags libstarpu)
LDFLAGS += $(shell pkg-config --libs libstarpu)
CC = gcc

vector: vector.o vector_cpu.o vector_cuda.o

%.o: %.cu
	nvcc $(CFLAGS) $< -c -o $@@

clean:
	rm -f vector *.o
@end example
@end cartouche
@example
% make
@end example
and to execute it, with the default configuration:
@example
% ./vector
0.000000 3.000000 6.000000 9.000000 12.000000
@end example
or, for example, by disabling CPU devices:
@example
% STARPU_NCPUS=0 ./vector
0.000000 3.000000 6.000000 9.000000 12.000000
@end example
or by disabling CUDA devices:
@example
% STARPU_NCUDA=0 ./vector
0.000000 3.000000 6.000000 9.000000 12.000000
@end example
@c TODO: Add performance model example (and update basic_examples)
@c ---------------------------------------------------------------------
@c Advanced Topics
@c ---------------------------------------------------------------------
@node Advanced Topics
@chapter Advanced Topics
@bye