starpu.texi 51 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444
  1. \input texinfo @c -*-texinfo-*-
  2. @c %**start of header
  3. @setfilename starpu.info
  4. @settitle StarPU
  5. @c %**end of header
  6. @setchapternewpage odd
  7. @titlepage
  8. @title StarPU
  9. @page
  10. @vskip 0pt plus 1filll
  11. @comment For the @value{version-GCC} Version*
  12. @end titlepage
  13. @summarycontents
  14. @contents
  15. @page
  16. @node Top
  17. @top Preface
  18. @cindex Preface
  19. This manual documents the usage of StarPU
  20. @comment
  21. @comment When you add a new menu item, please keep the right hand
  22. @comment aligned to the same column. Do not use tabs. This provides
  23. @comment better formatting.
  24. @comment
  25. @menu
  26. * Introduction:: A basic introduction to using StarPU.
  27. * Installing StarPU:: How to configure, build and install StarPU.
  28. * Configuration options:: Configurations options
  29. * Environment variables:: Environment variables used by StarPU.
  30. * StarPU API:: The API to use StarPU.
  31. * Basic Examples:: Basic examples of the use of StarPU.
  32. * Advanced Topics:: Advanced use of StarPU.
  33. @end menu
  34. @c ---------------------------------------------------------------------
  35. @c Introduction to StarPU
  36. @c ---------------------------------------------------------------------
  37. @node Introduction
  38. @chapter Introduction to StarPU
  39. @menu
  40. * Motivation:: Why StarPU ?
  41. * StarPU in a Nutshell:: The Fundamentals of StarPU
  42. @end menu
  43. @node Motivation
  44. @section Motivation
  45. @c complex machines with heterogeneous cores/devices
  46. The use of specialized hardware such as accelerators or coprocessors offers an
  47. interesting approach to overcome the physical limits encountered by processor
  48. architects. As a result, many machines are now equipped with one or several
  49. accelerators (eg. a GPU), in addition to the usual processor(s). While a lot of
  50. efforts have been devoted to offload computation onto such accelerators, very
  51. little attention has been paid to portability concerns on the one hand, and to the
  52. possibility of having heterogeneous accelerators and processors to interact on the other hand.
  53. StarPU is a runtime system that offers support for heterogeneous multicore
  54. architectures. It not only offers a unified view of the computational resources
  55. (ie. CPUs and accelerators at the same time), but it also takes care to
  56. efficiently map and execute tasks onto a heterogeneous machine while
  57. transparently handling low-level issues in a portable fashion.
  58. @c this leads to a complicated distributed memory design
  59. @c which is not (easily) manageable by hand
  60. @c added value/benefits of StarPU
  61. @c - portability
  62. @c - scheduling, perf. portability
  63. @node StarPU in a Nutshell
  64. @section StarPU in a Nutshell
  65. From a programming point of view, StarPU is not a new language but a library
  66. that executes tasks explicitly submitted by the application. The data that a
  67. task manipulates are automatically transferred onto the accelerator so that the
  68. programmer does not have to take care of complex data movements. StarPU also
  69. takes particular care of scheduling those tasks efficiently and allows
  70. scheduling experts to implement custom scheduling policies in a portable
  71. fashion.
  72. @c explain the notion of codelet and task (ie. g(A, B)
  73. @subsection Codelet and Tasks
  74. One of StarPU's primary data structures is the @b{codelet}. A codelet describes a
  75. computational kernel that can possibly be implemented on multiple architectures
  76. such as a CPU, a CUDA device or a Cell's SPU.
  77. @c TODO insert illustration f : f_spu, f_cpu, ...
  78. Another important data structure is the @b{task}. Executing a StarPU task
  79. consists in applying a codelet on a data set, on one of the architectures on
  80. which the codelet is implemented. In addition to the codelet that a task
  81. implements, it also describes which data are accessed, and how they are
  82. accessed during the computation (read and/or write).
  83. StarPU tasks are asynchronous: submitting a task to StarPU is a non-blocking
  84. operation. The task structure can also specify a @b{callback} function that is
  85. called once StarPU has properly executed the task. It also contains optional
  86. fields that the application may use to give hints to the scheduler (such as
  87. priority levels).
  88. A task may be identified by a unique 64-bit number which we refer to as a @b{tag}.
  89. Task dependencies can be enforced either by the means of callback functions, or
  90. by expressing dependencies between tags.
  91. @c TODO insert illustration f(Ar, Brw, Cr) + ..
  92. @c DSM
  93. @subsection StarPU Data Management Library
  94. Because StarPU schedules tasks at runtime, data transfers have to be
  95. done automatically and ``just-in-time'' between processing units,
  96. relieving the application programmer from explicit data transfers.
  97. Moreover, to avoid unnecessary transfers, StarPU keeps data
  98. where it was last needed, even if it was modified there, and it
  99. allows multiple copies of the same data to reside at the same time on
  100. several processing units as long as it is not modified.
  101. @c ---------------------------------------------------------------------
  102. @c Installing StarPU
  103. @c ---------------------------------------------------------------------
  104. @node Installing StarPU
  105. @chapter Installing StarPU
  106. StarPU can be built and installed by the standard means of the GNU
  107. autotools. The following chapter is intended to briefly remind how these tools
  108. can be used to install StarPU.
  109. @section Configuring StarPU
  110. @subsection Generating Makefiles and configuration scripts
  111. This step is not necessary when using the tarball releases of StarPU. If you
  112. are using the source code from the svn repository, you first need to generate
  113. the configure scripts and the Makefiles.
  114. @example
  115. $ autoreconf -vfi
  116. @end example
  117. @subsection Configuring StarPU
  118. @example
  119. $ ./configure
  120. @end example
  121. @c TODO enumerate the list of interesting options: refer to a specific section
  122. @section Building and Installing StarPU
  123. @subsection Building
  124. @example
  125. $ make
  126. @end example
  127. @subsection Sanity Checks
  128. In order to make sure that StarPU is working properly on the system, it is also
  129. possible to run a test suite.
  130. @example
  131. $ make check
  132. @end example
  133. @subsection Installing
  134. In order to install StarPU at the location that was specified during
  135. configuration:
  136. @example
  137. $ make install
  138. @end example
  139. @subsection pkg-config configuration
  140. It is possible that compiling and linking an application against StarPU
  141. requires to use specific flags or libraries (for instance @code{CUDA} or
  142. @code{libspe2}). Therefore, it is possible to use the @code{pkg-config} tool.
  143. If StarPU was not installed at some standard location, the path of StarPU's
  144. library must be specified in the @code{PKG_CONFIG_PATH} environment variable so
  145. that @code{pkg-config} can find it. So if StarPU was installed in
  146. @code{$(prefix_dir)}:
  147. @example
  148. @c TODO: heu, c'est vraiment du shell ça ? :)
  149. $ PKG_CONFIG_PATH = @{PKG_CONFIG_PATH@}:$(prefix_dir)/lib/
  150. @end example
  151. The flags required to compile or link against StarPU are then
  152. accessible with the following commands:
  153. @example
  154. $ pkg-config --cflags libstarpu # options for the compiler
  155. $ pkg-config --libs libstarpu # options for the linker
  156. @end example
  157. @c ---------------------------------------------------------------------
  158. @c Configuration options
  159. @c ---------------------------------------------------------------------
  160. @node Configuration options
  161. @chapter Configuration options
  162. TODO
  163. @c ---------------------------------------------------------------------
  164. @c Environment variables
  165. @c ---------------------------------------------------------------------
  166. @node Environment variables
  167. @chapter Environment variables
  168. @menu
  169. * Workers:: Configuring workers
  170. * Scheduling:: Configuring the Scheduling engine
  171. * Misc:: Miscellaneous and debug
  172. @end menu
  173. TODO, explicit configuration (passed to starpu_init) overrides env variables.
  174. @node Workers
  175. @section Configuring workers
  176. @menu
  177. * STARPU_NCPUS :: Number of CPU workers
  178. * STARPU_NCUDA :: Number of CUDA workers
  179. * STARPU_NOPENCL :: Number of OpenCL workers
  180. * STARPU_NGORDON :: Number of SPU workers (Cell)
  181. * STARPU_WORKERS_CPUID :: Bind workers to specific CPUs
  182. * STARPU_WORKERS_CUDAID :: Select specific CUDA devices
  183. * STARPU_WORKERS_OPENCLID :: Select specific OpenCL devices
  184. @end menu
  185. @node STARPU_NCPUS
  186. @subsection @code{STARPU_NCPUS} -- Number of CPU workers
  187. @table @asis
  188. @item @emph{Description}:
  189. Specify the maximum number of CPU workers. Note that StarPU will not allocate
  190. more CPUs than there are physical CPUs, and that some CPUs are used to control
  191. the accelerators.
  192. @end table
  193. @node STARPU_NCUDA
  194. @subsection @code{STARPU_NCUDA} -- Number of CUDA workers
  195. @table @asis
  196. @item @emph{Description}:
  197. Specify the maximum number of CUDA devices that StarPU can use. In case there
  198. @code{STARPU_NCUDA} is lower than the number of physical devices, it is
  199. possible to select which CUDA devices should be used by the means of the
  200. @code{STARPU_WORKERS_CUDAID} environment variable.
  201. @end table
  202. @node STARPU_NOPENCL
  203. @subsection @code{STARPU_NOPENCL} -- Number of OpenCL workers
  204. @table @asis
  205. @item @emph{Description}:
  206. OpenCL equivalent of the @code{STARPU_NCUDA} environment variable.
  207. @end table
  208. @node STARPU_NGORDON
  209. @subsection @code{STARPU_NGORDON} -- Number of SPU workers (Cell)
  210. @table @asis
  211. @item @emph{Description}:
  212. Specify the maximum number of SPUs that StarPU can use.
  213. @end table
  214. @node STARPU_WORKERS_CPUID
  215. @subsection @code{STARPU_WORKERS_CPUID} -- Bind workers to specific CPUs
  216. @table @asis
  217. @item @emph{Description}:
  218. Passing an array of integers (starting from 0) in @code{STARPU_WORKERS_CPUID}
  219. specifies on which logical CPU the different workers should be
  220. bound. For instance, if @code{STARPU_WORKERS_CPUID = "1 3 0 2"}, the first
  221. worker will be bound to logical CPU #1, the second CPU worker will be bound to
  222. logical CPU #3 and so on. Note that the logical ordering of the CPUs is either
  223. determined by the OS, or provided by the @code{hwloc}  library in case it is
  224. available.
  225. Note that the first workers correspond to the CUDA workers, then come the
  226. OpenCL and the SPU, and finally the CPU workers. For example if
  227. we have @code{STARPU_NCUDA=1}, @code{STARPU_NOPENCL=1}, @code{STARPU_NCPUS=2}
  228. and @code{STARPU_WORKERS_CPUID = "0 2 1 3"}, the CUDA device will be controlled
  229. by logical CPU #0, the OpenCL device will be controlled by logical CPU #2, and
  230. the logical CPUs #1 and #3 will be used by the CPU workers.
  231. If the number of workers is larger than the array given in
  232. @code{STARPU_WORKERS_CPUID}, the workers are bound to the logical CPUs in a
  233. round-robin fashion: if @code{STARPU_WORKERS_CPUID = "0 1"}, the first and the
  234. third (resp. second and fourth) workers will be put on CPU #0 (resp. CPU #1).
  235. @end table
  236. @node STARPU_WORKERS_CUDAID
  237. @subsection @code{STARPU_WORKERS_CUDAID} -- Select specific CUDA devices
  238. @table @asis
  239. @item @emph{Description}:
  240. Similarly to the @code{STARPU_WORKERS_CPUID} environment variable, it is
  241. possible to select which CUDA devices should be used by StarPU. On a machine
  242. equipped with 4 GPUs, setting @code{STARPU_WORKERS_CUDAID = "1 3"} and
  243. @code{STARPU_NCUDA=2} specifies that 2 CUDA workers should be created, and that
  244. they should use CUDA devices #1 and #3 (the logical ordering of the devices is
  245. the one reported by CUDA).
  246. @end table
  247. @node STARPU_WORKERS_OPENCLID
  248. @subsection @code{STARPU_WORKERS_OPENCLID} -- Select specific OpenCL devices
  249. @table @asis
  250. @item @emph{Description}:
  251. OpenCL equivalent of the @code{STARPU_WORKERS_CUDAID} environment variable.
  252. @end table
  253. @node Scheduling
  254. @section Configuring the Scheduling engine
  255. @menu
  256. * STARPU_SCHED :: Scheduling policy
  257. * STARPU_CALIBRATE :: Calibrate performance models
  258. * STARPU_PREFETCH :: Use data prefetch
  259. * STARPU_SCHED_ALPHA :: Computation factor
  260. * STARPU_SCHED_BETA :: Communication factor
  261. @end menu
  262. @node STARPU_SCHED
  263. @subsection @code{STARPU_SCHED} -- Scheduling policy
  264. @table @asis
  265. @item @emph{Description}:
  266. TODO
  267. Use @code{STARPU_SCHED=help} to get the list of available schedulers
  268. @end table
  269. @node STARPU_CALIBRATE
  270. @subsection @code{STARPU_CALIBRATE} -- Calibrate performance models
  271. @table @asis
  272. @item @emph{Description}:
  273. If this variable is set, the performance models are calibrated during the execution.
  274. TODO
  275. Note: this currently only applies to dm and dmda scheduling policies.
  276. @end table
  277. @node STARPU_PREFETCH
  278. @subsection @code{STARPU_PREFETCH} -- Use data prefetch
  279. @table @asis
  280. @item @emph{Description}:
  281. TODO
  282. @end table
  283. @node STARPU_SCHED_ALPHA
  284. @subsection @code{STARPU_SCHED_ALPHA} -- Computation factor
  285. @table @asis
  286. @item @emph{Description}:
  287. TODO
  288. @end table
  289. @node STARPU_SCHED_BETA
  290. @subsection @code{STARPU_SCHED_BETA} -- Communication factor
  291. @table @asis
  292. @item @emph{Description}:
  293. TODO
  294. @end table
  295. @node Misc
  296. @section Miscellaneous and debug
  297. @menu
  298. * STARPU_LOGFILENAME :: Select debug file name
  299. @end menu
  300. @node STARPU_LOGFILENAME
  301. @subsection @code{STARPU_LOGFILENAME} -- Select debug file name
  302. @table @asis
  303. @item @emph{Description}:
  304. TODO
  305. @end table
  306. @c ---------------------------------------------------------------------
  307. @c StarPU API
  308. @c ---------------------------------------------------------------------
  309. @node StarPU API
  310. @chapter StarPU API
  311. @menu
  312. * Initialization and Termination:: Initialization and Termination methods
  313. * Workers' Properties:: Methods to enumerate workers' properties
  314. * Data Library:: Methods to manipulate data
  315. * Codelets and Tasks:: Methods to construct tasks
  316. * Tags:: Task dependencies
  317. * CUDA extensions:: CUDA extensions
  318. * Cell extensions:: Cell extensions
  319. * Miscellaneous:: Miscellaneous helpers
  320. @end menu
  321. @node Initialization and Termination
  322. @section Initialization and Termination
  323. @menu
  324. * starpu_init:: Initialize StarPU
  325. * struct starpu_conf:: StarPU runtime configuration
  326. * starpu_shutdown:: Terminate StarPU
  327. @end menu
  328. @node starpu_init
  329. @subsection @code{starpu_init} -- Initialize StarPU
  330. @table @asis
  331. @item @emph{Description}:
  332. This is StarPU initialization method, which must be called prior to any other
  333. StarPU call. It is possible to specify StarPU's configuration (eg. scheduling
  334. policy, number of cores, ...) by passing a non-null argument. Default
  335. configuration is used if the passed argument is @code{NULL}.
  336. @item @emph{Return value}:
  337. Upon successful completion, this function returns 0. Otherwise, @code{-ENODEV}
  338. indicates that no worker was available (so that StarPU could not be initialized).
  339. @item @emph{Prototype}:
  340. @code{int starpu_init(struct starpu_conf *conf);}
  341. @end table
  342. @node struct starpu_conf
  343. @subsection @code{struct starpu_conf} -- StarPU runtime configuration
  344. @table @asis
  345. @item @emph{Description}:
  346. This structure is passed to the @code{starpu_init} function in order to configure
  347. StarPU. When the default value is used, StarPU automatically selects the number
  348. of processing units and takes the default scheduling policy. These parameters
  349. overwrite the equivalent environment variables.
  350. @item @emph{Fields}:
  351. @table @asis
  352. @item @code{sched_policy} (default = NULL):
  353. This is the name of the scheduling policy. This can also be specified with the
  354. @code{STARPU_SCHED} environment variable.
  355. @item @code{ncpus} (default = -1):
  356. This is the maximum number of CPU cores that StarPU can use. This can also be
  357. specified with the @code{STARPU_NCPUS} environment variable.
  358. @item @code{ncuda} (default = -1):
  359. This is the maximum number of CUDA devices that StarPU can use. This can also be
  360. specified with the @code{STARPU_NCUDA} environment variable.
  361. @item @code{nspus} (default = -1):
  362. This is the maximum number of Cell SPUs that StarPU can use. This can also be
  363. specified with the @code{STARPU_NGORDON} environment variable.
  364. @item @code{calibrate} (default = 0):
  365. If this flag is set, StarPU will calibrate the performance models when
  366. executing tasks. This can also be specified with the @code{STARPU_CALIBRATE}
  367. environment variable.
  368. @end table
  369. @end table
  370. @node starpu_shutdown
  371. @subsection @code{starpu_shutdown} -- Terminate StarPU
  372. @table @asis
  373. @item @emph{Description}:
  374. This is StarPU termination method. It must be called at the end of the
  375. application: statistics and other post-mortem debugging information are not
  376. guaranteed to be available until this method has been called.
  377. @item @emph{Prototype}:
  378. @code{void starpu_shutdown(void);}
  379. @end table
  380. @node Workers' Properties
  381. @section Workers' Properties
  382. @menu
  383. * starpu_get_worker_count:: Get the number of processing units
  384. * starpu_get_cpu_worker_count:: Get the number of CPU controlled by StarPU
  385. * starpu_get_cuda_worker_count:: Get the number of CUDA devices controlled by StarPU
  386. * starpu_get_opencl_worker_count:: Get the number of OpenCL devices controlled by StarPU
  387. * starpu_get_spu_worker_count:: Get the number of Cell SPUs controlled by StarPU
  388. * starpu_get_worker_id:: Get the identifier of the current worker
  389. * starpu_get_worker_type:: Get the type of processing unit associated to a worker
  390. * starpu_get_worker_name:: Get the name of a worker
  391. @end menu
  392. @node starpu_get_worker_count
  393. @subsection @code{starpu_get_worker_count} -- Get the number of processing units
  394. @table @asis
  395. @item @emph{Description}:
  396. This function returns the number of workers (ie. processing units executing
  397. StarPU tasks). The returned value should be at most @code{STARPU_NMAXWORKERS}.
  398. @item @emph{Prototype}:
  399. @code{unsigned starpu_get_worker_count(void);}
  400. @end table
  401. @node starpu_get_cpu_worker_count
  402. @subsection @code{starpu_get_cpu_worker_count} -- Get the number of CPU controlled by StarPU
  403. @table @asis
  404. @item @emph{Description}:
  405. This function returns the number of CPUs controlled by StarPU. The returned
  406. value should be at most @code{STARPU_NMAXCPUS}.
  407. @item @emph{Prototype}:
  408. @code{unsigned starpu_get_cpu_worker_count(void);}
  409. @end table
  410. @node starpu_get_cuda_worker_count
  411. @subsection @code{starpu_get_cuda_worker_count} -- Get the number of CUDA devices controlled by StarPU
  412. @table @asis
  413. @item @emph{Description}:
  414. This function returns the number of CUDA devices controlled by StarPU. The returned
  415. value should be at most @code{STARPU_MAXCUDADEVS}.
  416. @item @emph{Prototype}:
  417. @code{unsigned starpu_get_cuda_worker_count(void);}
  418. @end table
  419. @node starpu_get_opencl_worker_count
  420. @subsection @code{starpu_get_opencl_worker_count} -- Get the number of OpenCL devices controlled by StarPU
  421. @table @asis
  422. @item @emph{Description}:
  423. This function returns the number of OpenCL devices controlled by StarPU. The returned
  424. value should be at most @code{STARPU_MAXOPENCLDEVS}.
  425. @item @emph{Prototype}:
  426. @code{unsigned starpu_get_opencl_worker_count(void);}
  427. @end table
  428. @node starpu_get_spu_worker_count
  429. @subsection @code{starpu_get_spu_worker_count} -- Get the number of Cell SPUs controlled by StarPU
  430. @table @asis
  431. @item @emph{Description}:
  432. This function returns the number of Cell SPUs controlled by StarPU.
  433. @item @emph{Prototype}:
  434. @code{unsigned starpu_get_spu_worker_count(void);}
  435. @end table
  436. @node starpu_get_worker_id
  437. @subsection @code{starpu_get_worker_id} -- Get the identifier of the current worker
  438. @table @asis
  439. @item @emph{Description}:
  440. This function returns the identifier of the worker associated to the calling
  441. thread. The returned value is either -1 if the current context is not a StarPU
  442. worker (ie. when called from the application outside a task or a callback), or
  443. an integer between 0 and @code{starpu_get_worker_count() - 1}.
  444. @item @emph{Prototype}:
  445. @code{int starpu_get_worker_id(void);}
  446. @end table
  447. @node starpu_get_worker_type
  448. @subsection @code{starpu_get_worker_type} -- Get the type of processing unit associated to a worker
  449. @table @asis
  450. @item @emph{Description}:
  451. This function returns the type of worker associated to an identifier (as
  452. returned by the @code{starpu_get_worker_id} function). The returned value
  453. indicates the architecture of the worker: @code{STARPU_CPU_WORKER} for a CPU
  454. core, @code{STARPU_CUDA_WORKER} for a CUDA device, and
  455. @code{STARPU_GORDON_WORKER} for a Cell SPU. The value returned for an invalid
  456. identifier is unspecified.
  457. @item @emph{Prototype}:
  458. @code{enum starpu_archtype starpu_get_worker_type(int id);}
  459. @end table
  460. @node starpu_get_worker_name
  461. @subsection @code{starpu_get_worker_name} -- Get the name of a worker
  462. @table @asis
  463. @item @emph{Description}:
  464. StarPU associates a unique human readable string to each processing unit. This
  465. function copies at most the @code{maxlen} first bytes of the unique string
  466. associated to a worker identified by its identifier @code{id} into the
  467. @code{dst} buffer. The caller is responsible for ensuring that the @code{dst}
  468. is a valid pointer to a buffer of @code{maxlen} bytes at least. Calling this
  469. function on an invalid identifier results in an unspecified behaviour.
  470. @item @emph{Prototype}:
  471. @code{void starpu_get_worker_name(int id, char *dst, size_t maxlen);}
  472. @end table
  473. @node Data Library
  474. @section Data Library
  475. This section describes the data management facilities provided by StarPU.
  476. TODO: We show how to use existing data interfaces in [ref], but developers can
  477. design their own data interfaces if required.
  478. @menu
  479. * starpu_data_handle:: StarPU opaque data handle
  480. * void *interface:: StarPU data interface
  481. @end menu
  482. @node starpu_data_handle
  483. @subsection @code{starpu_data_handle} -- StarPU opaque data handle
  484. @table @asis
  485. @item @emph{Description}:
  486. StarPU uses @code{starpu_data_handle} as an opaque handle to manage a piece of
  487. data. Once a piece of data has been registered to StarPU, it is associated to a
  488. @code{starpu_data_handle} which keeps track of the state of the piece of data
  489. over the entire machine, so that we can maintain data consistency and locate
  490. data replicates for instance.
  491. @end table
  492. @node void *interface
  493. @subsection @code{void *interface} -- StarPU data interface
  494. @table @asis
  495. @item @emph{Description}:
  496. Data management is done at a high-level in StarPU: rather than accessing a mere
  497. list of contiguous buffers, the tasks may manipulate data that are described by
  498. a high-level construct which we call data interface.
  499. TODO
  500. @end table
  501. @c void starpu_delete_data(struct starpu_data_state_t *state);
  502. @c starpu_get_worker_memory_node TODO
  503. @c
  504. @c user interaction with the DSM
  505. @c void starpu_sync_data_with_mem(struct starpu_data_state_t *state);
  506. @c void starpu_notify_data_modification(struct starpu_data_state_t *state, uint32_t modifying_node);
  507. @node Codelets and Tasks
  508. @section Codelets and Tasks
  509. @menu
  510. * struct starpu_codelet:: StarPU codelet structure
  511. * struct starpu_task:: StarPU task structure
  512. * starpu_task_init:: Initialize a Task
  513. * starpu_task_create:: Allocate and Initialize a Task
  514. * starpu_task_deinit:: Release all the resources used by a Task
  515. * starpu_task_destroy:: Destroy a dynamically allocated Task
  516. * starpu_submit_task:: Submit a Task
  517. * starpu_wait_task:: Wait for the termination of a Task
  518. * starpu_wait_all_tasks:: Wait for the termination of all Tasks
  519. @end menu
  520. @node struct starpu_codelet
  521. @subsection @code{struct starpu_codelet} -- StarPU codelet structure
  522. @table @asis
  523. @item @emph{Description}:
  524. The codelet structure describes a kernel that is possibly implemented on
  525. various targets.
  526. @item @emph{Fields}:
  527. @table @asis
  528. @item @code{where}:
  529. Indicates which types of processing units are able to execute that codelet.
  530. @code{STARPU_CPU|STARPU_CUDA} for instance indicates that the codelet is
  531. implemented for both CPU cores and CUDA devices while @code{STARPU_GORDON}
  532. indicates that it is only available on Cell SPUs.
  533. @item @code{cpu_func} (optional):
  534. Is a function pointer to the CPU implementation of the codelet. Its prototype
  535. must be: @code{void cpu_func(void *buffers[], void *cl_arg)}. The first
  536. argument being the array of data managed by the data management library, and
  537. the second argument is a pointer to the argument passed from the @code{.cl_arg}
  538. field of the @code{starpu_task} structure.
  539. The @code{cpu_func} field is ignored if @code{STARPU_CPU} does not appear in
  540. the @code{.where} field, it must be non-null otherwise.
  541. @item @code{cuda_func} (optional):
  542. Is a function pointer to the CUDA implementation of the codelet. @emph{This
  543. must be a host-function written in the CUDA runtime API}. Its prototype must
  544. be: @code{void cuda_func(void *buffers[], void *cl_arg);}. The @code{cuda_func}
  545. field is ignored if @code{STARPU_CUDA} does not appear in the @code{.where}
  546. field, it must be non-null otherwise.
  547. @item @code{gordon_func} (optional):
  548. This is the index of the Cell SPU implementation within the Gordon library.
  549. TODO
  550. @item @code{nbuffers}:
  551. Specifies the number of arguments taken by the codelet. These arguments are
  552. managed by the DSM and are accessed from the @code{void *buffers[]}
  553. array. The constant argument passed with the @code{.cl_arg} field of the
  554. @code{starpu_task} structure is not counted in this number. This value should
  555. not be above @code{STARPU_NMAXBUFS}.
  556. @item @code{model} (optional):
  557. This is a pointer to the performance model associated to this codelet. This
  558. optional field is ignored when null. TODO
  559. @end table
  560. @end table
  561. @node struct starpu_task
  562. @subsection @code{struct starpu_task} -- StarPU task structure
  563. @table @asis
  564. @item @emph{Description}:
  565. The starpu_task structure describes a task that can be offloaded on the various
  566. processing units managed by StarPU. It instantiates a codelet. It can either be
  567. allocated dynamically with the @code{starpu_task_create} method, or declared
  568. statically. In the latter case, the programmer has to zero the
  569. @code{starpu_task} structure and to fill the different fields properly. The
  570. indicated default values correspond to the configuration of a task allocated
  571. with @code{starpu_task_create}.
  572. @item @emph{Fields}:
  573. @table @asis
  574. @item @code{cl}:
  575. Is a pointer to the corresponding @code{starpu_codelet} data structure. This
  576. describes where the kernel should be executed, and supplies the appropriate
  577. implementations. When set to @code{NULL}, no code is executed during the tasks,
  578. such empty tasks can be useful for synchronization purposes.
  579. @item @code{buffers}:
  580. TODO
  581. @item @code{cl_arg} (optional) (default = NULL):
  582. This pointer is passed to the codelet through the second argument
  583. of the codelet implementation (eg. @code{cpu_func} or @code{cuda_func}).
  584. In the specific case of the Cell processor, see the @code{.cl_arg_size}
  585. argument.
  586. @item @code{cl_arg_size} (optional, Cell specific):
  587. In the case of the Cell processor, the @code{.cl_arg} pointer is not directly
  588. given to the SPU function. A buffer of size @code{cl_arg_size} is allocated on
  589. the SPU. This buffer is then filled with the @code{cl_arg_size} bytes starting
  590. at address @code{cl_arg}. In that case, the argument given to the SPU codelet
  591. is therefore not the @code{.cl_arg} pointer, but the address of the buffer in
  592. local store (LS) instead. This field is ignored for CPU, CUDA and OpenCL
  593. codelets.
  594. @item @code{callback_func} (optional) (default = @code{NULL}):
  595. This is a function pointer of prototype @code{void (*f)(void *)} which
  596. specifies a possible callback. If that pointer is non-null, the callback
  597. function is executed @emph{on the host} after the execution of the task. The
  598. callback is passed the value contained in the @code{callback_arg} field. No
  599. callback is executed if that field is null.
  600. @item @code{callback_arg} (optional) (default = @code{NULL}):
  601. This is the pointer passed to the callback function. This field is ignored if
  602. the @code{callback_func} is null.
  603. @item @code{use_tag} (optional) (default = 0):
  604. If set, this flag indicates that the task should be associated with the tag
  605. contained in the @code{tag_id} field. Tags allow the application to synchronize
  606. with the task and to express task dependencies easily.
  607. @item @code{tag_id}:
  608. This field contains the tag associated to the task if the @code{use_tag} field
  609. was set, it is ignored otherwise.
  610. @item @code{synchronous}:
  611. If this flag is set, the @code{starpu_submit_task} function is blocking and
  612. returns only when the task has been executed (or if no worker is able to
  613. process the task). Otherwise, @code{starpu_submit_task} returns immediately.
  614. @item @code{priority} (optional) (default = @code{STARPU_DEFAULT_PRIO}):
  615. This field indicates a level of priority for the task. This is an integer value
  616. that must be selected between @code{STARPU_MIN_PRIO} (for the least important
  617. tasks) and @code{STARPU_MAX_PRIO} (for the most important tasks) included.
  618. Default priority is @code{STARPU_DEFAULT_PRIO}. Scheduling strategies that
  619. take priorities into account can use this parameter to take better scheduling
  620. decisions, but the scheduling policy may also ignore it.
  621. @item @code{execute_on_a_specific_worker} (default = 0):
  622. If this flag is set, StarPU will bypass the scheduler and directly assign this
  623. task to the worker specified by the @code{workerid} field.
  624. @item @code{workerid} (optional):
  625. If the @code{execute_on_a_specific_worker} field is set, this field indicates
  626. which is the identifier of the worker that should process this task (as
  627. returned by @code{starpu_get_worker_id}). This field is ignored if
  628. @code{execute_on_a_specific_worker} field is set to 0.
  629. @item @code{detach} (optional) (default = 1):
  630. If this flag is set, it is not possible to synchronize with the task
  631. by the means of @code{starpu_wait_task} later on. Internal data structures
  632. are only guaranteed to be freed once @code{starpu_wait_task} is called
  633. if that flag is not set.
  634. @item @code{destroy} (optional) (default = 1):
  635. If that flag is set, the task structure will automatically be freed, either
  636. after the execution of the callback if the task is detached, or during
  637. @code{starpu_task_wait} otherwise. If this flag is not set, dynamically allocated data
  638. structures will not be freed until @code{starpu_task_destroy} is called
  639. explicitly. Setting this flag for a statically allocated task structure will
  640. result in undefined behaviour.
  641. @end table
  642. @end table
  643. @node starpu_task_init
  644. @subsection @code{starpu_task_init} -- Initialize a Task
  645. @table @asis
  646. @item @emph{Description}:
  647. Initialize a task structure with default values. This function is implicitly
  648. called by @code{starpu_task_create}. By default, tasks initialized with
  649. @code{starpu_task_init} must be deinitialized explicitly with
  650. @code{starpu_task_deinit}.
  651. @item @emph{Prototype}:
  652. @code{void starpu_task_init(struct starpu_task *task);}
  653. @end table
  654. @node starpu_task_create
  655. @subsection @code{starpu_task_create} -- Allocate and Initialize a Task
  656. @table @asis
  657. @item @emph{Description}:
  658. Allocate a task structure and initialize it with default values. Tasks
  659. allocated dynamically with starpu_task_create are automatically freed when
  660. the task is terminated. If the destroy flag is explicitly unset, the
  661. resources used by the task are freed by calling
  662. @code{starpu_task_destroy}.
  663. @item @emph{Prototype}:
  664. @code{struct starpu_task *starpu_task_create(void);}
  665. @end table
  666. @node starpu_task_deinit
  667. @subsection @code{starpu_task_deinit} -- Release all the resources used by a Task
  668. @table @asis
  669. @item @emph{Description}:
  670. Release all the structures automatically allocated to execute the task. This is
  671. called implicitly by starpu_task_destroy, but the task structure itself is not
  672. freed. This should be used for statically allocated tasks for instance.
  673. Note that this function is automatically called by @code{starpu_task_destroy}.
  674. @item @emph{Prototype}:
  675. @code{void starpu_task_deinit(struct starpu_task *task);}
  676. @end table
  677. @node starpu_task_destroy
  678. @subsection @code{starpu_task_destroy} -- Destroy a dynamically allocated Task
  679. @table @asis
  680. @item @emph{Description}:
  681. Free the resources allocated during starpu_task_create. This function can
  682. be called automatically after the execution of a task by setting the
  683. @code{.destroy} flag of the @code{starpu_task} structure (default behaviour).
  684. Calling this function on a statically allocated task results in an undefined
  685. behaviour.
  686. @item @emph{Prototype}:
  687. @code{void starpu_task_destroy(struct starpu_task *task);}
  688. @end table
  689. @node starpu_wait_task
  690. @subsection @code{starpu_wait_task} -- Wait for the termination of a Task
  691. @table @asis
  692. @item @emph{Description}:
  693. This function blocks until the task has been executed. It is not possible to
  694. synchronize with a task more than once. It is not possible to wait for
  695. synchronous or detached tasks.
  696. @item @emph{Return value}:
  697. Upon successful completion, this function returns 0. Otherwise, @code{-EINVAL}
  698. indicates that the waited task was either synchronous or detached.
  699. @item @emph{Prototype}:
  700. @code{int starpu_wait_task(struct starpu_task *task);}
  701. @end table
  702. @node starpu_submit_task
  703. @subsection @code{starpu_submit_task} -- Submit a Task
  704. @table @asis
  705. @item @emph{Description}:
  706. This function submits task @code{task} to StarPU. Calling this function does
  707. not mean that the task will be executed immediately as there can be data or task
  708. (tag) dependencies that are not fulfilled yet: StarPU will take care to
  709. schedule this task with respect to such dependencies.
  710. This function returns immediately if the @code{synchronous} field of the
  711. @code{starpu_task} structure was set to 0, and block until the termination of
  712. the task otherwise. It is also possible to synchronize the application with
  713. asynchronous tasks by the means of tags, using the @code{starpu_tag_wait}
  714. function for instance.
  715. In case of success, this function returns 0, a return value of @code{-ENODEV}
  716. means that there is no worker able to process that task (eg. there is no GPU
  717. available and this task is only implemented on top of CUDA).
  718. @item @emph{Prototype}:
  719. @code{int starpu_submit_task(struct starpu_task *task);}
  720. @end table
  721. @node starpu_wait_all_tasks
  722. @subsection @code{starpu_wait_all_tasks} -- Wait for the termination of all Tasks
  723. @table @asis
  724. @item @emph{Description}:
  725. This function blocks until all the tasks that were submitted are terminated.
  726. @item @emph{Prototype}:
  727. @code{void starpu_wait_all_tasks(void);}
  728. @end table
  729. @c Callbacks : what can we put in callbacks ?
  730. @node Tags
  731. @section Tags
  732. @menu
  733. * starpu_tag_t:: Task identifier
  734. * starpu_tag_declare_deps:: Declare the Dependencies of a Tag
  735. * starpu_tag_declare_deps_array:: Declare the Dependencies of a Tag
  736. * starpu_tag_wait:: Block until a Tag is terminated
  737. * starpu_tag_wait_array:: Block until a set of Tags is terminated
  738. * starpu_tag_remove:: Destroy a Tag
  739. * starpu_tag_notify_from_apps:: Feed a tag explicitly
  740. @end menu
  741. @node starpu_tag_t
  742. @subsection @code{starpu_tag_t} -- Task identifier
  743. @table @asis
  744. @item @emph{Description}:
  745. It is possible to associate a task with a unique "tag" and to express
  746. dependencies between tasks by the means of those tags. To do so, fill the
  747. @code{tag_id} field of the @code{starpu_task} structure with a tag number (can
  748. be arbitrary) and set the @code{use_tag} field to 1.
  749. If @code{starpu_tag_declare_deps} is called with that tag number, the task will
  750. not be started until the tasks which bear the declared dependency tags are
  751. complete.
  752. @end table
  753. @node starpu_tag_declare_deps
  754. @subsection @code{starpu_tag_declare_deps} -- Declare the Dependencies of a Tag
  755. @table @asis
  756. @item @emph{Description}:
  757. Specify the dependencies of the task identified by tag @code{id}. The first
  758. argument specifies the tag which is configured, the second argument gives the
  759. number of tag(s) on which @code{id} depends. The following arguments are the
  760. tags which have to be terminated to unlock the task.
  761. This function must be called before the associated task is submitted to StarPU
  762. with @code{starpu_submit_task}.
  763. @item @emph{Remark}
  764. Because of the variable arity of @code{starpu_tag_declare_deps}, note that the
  765. last arguments @emph{must} be of type @code{starpu_tag_t}: constant values
  766. typically need to be explicitly cast. Using the
  767. @code{starpu_tag_declare_deps_array} function avoids this hazard.
  768. @item @emph{Prototype}:
  769. @code{void starpu_tag_declare_deps(starpu_tag_t id, unsigned ndeps, ...);}
  770. @item @emph{Example}:
  771. @example
  772. @c @cartouche
  773. /* Tag 0x1 depends on tags 0x32 and 0x52 */
  774. starpu_tag_declare_deps((starpu_tag_t)0x1,
  775. 2, (starpu_tag_t)0x32, (starpu_tag_t)0x52);
  776. @c @end cartouche
  777. @end example
  778. @end table
  779. @node starpu_tag_declare_deps_array
  780. @subsection @code{starpu_tag_declare_deps_array} -- Declare the Dependencies of a Tag
  781. @table @asis
  782. @item @emph{Description}:
  783. This function is similar to @code{starpu_tag_declare_deps}, except that it
  784. does not take a variable number of arguments but an array of tags of size
  785. @code{ndeps}.
  786. @item @emph{Prototype}:
  787. @code{void starpu_tag_declare_deps_array(starpu_tag_t id, unsigned ndeps, starpu_tag_t *array);}
  788. @item @emph{Example}:
  789. @example
  790. /* Tag 0x1 depends on tags 0x32 and 0x52 */
  791. starpu_tag_t tag_array[2] = @{0x32, 0x52@};
  792. starpu_tag_declare_deps_array((starpu_tag_t)0x1, 2, tag_array);
  793. @end example
  794. @end table
  795. @node starpu_tag_wait
  796. @subsection @code{starpu_tag_wait} -- Block until a Tag is terminated
  797. @table @asis
  798. @item @emph{Description}:
  799. This function blocks until the task associated to tag @code{id} has been
  800. executed. This is a blocking call which must therefore not be called within
  801. tasks or callbacks, but only from the application directly. It is possible to
  802. synchronize with the same tag multiple times, as long as the
  803. @code{starpu_tag_remove} function is not called. Note that it is still
  804. possible to synchronize with a tag associated to a task whose @code{starpu_task}
  805. data structure was freed (eg. if the @code{destroy} flag of the
  806. @code{starpu_task} was enabled).
  807. @item @emph{Prototype}:
  808. @code{void starpu_tag_wait(starpu_tag_t id);}
  809. @end table
  810. @node starpu_tag_wait_array
  811. @subsection @code{starpu_tag_wait_array} -- Block until a set of Tags is terminated
  812. @table @asis
  813. @item @emph{Description}:
  814. This function is similar to @code{starpu_tag_wait} except that it blocks until
  815. @emph{all} the @code{ntags} tags contained in the @code{id} array are
  816. terminated.
  817. @item @emph{Prototype}:
  818. @code{void starpu_tag_wait_array(unsigned ntags, starpu_tag_t *id);}
  819. @end table
  820. @node starpu_tag_remove
  821. @subsection @code{starpu_tag_remove} -- Destroy a Tag
  822. @table @asis
  823. @item @emph{Description}:
  824. This function releases the resources associated to tag @code{id}. It can be
  825. called once the corresponding task has been executed and when there is no tag
  826. that depends on that one anymore.
  827. @item @emph{Prototype}:
  828. @code{void starpu_tag_remove(starpu_tag_t id);}
  829. @end table
  830. @node starpu_tag_notify_from_apps
  831. @subsection @code{starpu_tag_notify_from_apps} -- Feed a Tag explicitly
  832. @table @asis
  833. @item @emph{Description}:
  834. This function explicitly unlocks tag @code{id}. It may be useful in the
  835. case of applications which execute part of their computation outside StarPU
  836. tasks (eg. third-party libraries). It is also provided as a
  837. convenient tool for the programmer, for instance to entirely construct the task
  838. DAG before actually giving StarPU the opportunity to execute the tasks.
  839. @item @emph{Prototype}:
  840. @code{void starpu_tag_notify_from_apps(starpu_tag_t id);}
  841. @end table
  842. @node CUDA extensions
  843. @section CUDA extensions
  844. @c void starpu_malloc_pinned_if_possible(float **A, size_t dim);
  845. @c starpu_helper_init_cublas TODO
  846. @c starpu_helper_shutdown_cublas TODO
  847. @menu
  848. * starpu_get_local_cuda_stream:: Get current worker's CUDA stream
  849. * starpu_helper_init_cublas:: Initialize CUBLAS on every CUDA device
  850. * starpu_helper_shutdown_cublas:: Deinitialize CUBLAS on every CUDA device
  851. @end menu
  852. @node starpu_get_local_cuda_stream
  853. @subsection @code{starpu_get_local_cuda_stream} -- Get current worker's CUDA stream
  854. @table @asis
  855. @item @emph{Description}:
  856. StarPU provides a stream for every CUDA device controlled by StarPU. This
  857. function is only provided for convenience so that programmers can easily use
  858. asynchronous operations within codelets without having to create a stream by
  859. hand. Note that the application is not forced to use the stream provided by
  860. @code{starpu_get_local_cuda_stream} and may also create its own streams.
  861. @item @emph{Prototype}:
  862. @code{cudaStream_t *starpu_get_local_cuda_stream(void);}
  863. @end table
  864. @node starpu_helper_init_cublas
  865. @subsection @code{starpu_helper_init_cublas} -- Initialize CUBLAS on every CUDA device
  866. @table @asis
  867. @item @emph{Description}:
  868. The CUBLAS library must be initialized prior to any CUBLAS call. Calling
  869. @code{starpu_helper_init_cublas} will initialize CUBLAS on every CUDA device
  870. controlled by StarPU. This call blocks until CUBLAS has been properly
  871. initialized on every device.
  872. @item @emph{Prototype}:
  873. @code{void starpu_helper_init_cublas(void);}
  874. @end table
  875. @node starpu_helper_shutdown_cublas
  876. @subsection @code{starpu_helper_shutdown_cublas} -- Deinitialize CUBLAS on every CUDA device
  877. @table @asis
  878. @item @emph{Description}:
  879. This function synchronously deinitializes the CUBLAS library on every CUDA device.
  880. @item @emph{Prototype}:
  881. @code{void starpu_helper_shutdown_cublas(void);}
  882. @end table
  883. @node Cell extensions
  884. @section Cell extensions
  885. nothing yet.
  886. @node Miscellaneous
  887. @section Miscellaneous helpers
  888. @menu
  889. * starpu_execute_on_each_worker:: Execute a function on a subset of workers
  890. @end menu
  891. @node starpu_execute_on_each_worker
  892. @subsection @code{starpu_execute_on_each_worker} -- Execute a function on a subset of workers
  893. @table @asis
  894. @item @emph{Description}:
  895. When calling this method, the offloaded function specified by the first argument is
  896. executed by every StarPU worker that may execute the function.
  897. The second argument is passed to the offloaded function.
  898. The last argument specifies on which types of processing units the function
  899. should be executed. Similarly to the @code{.where} field of the
  900. @code{starpu_codelet} structure, it is possible to specify that the function
  901. should be executed on every CUDA device and every CPU by passing
  902. @code{STARPU_CPU|STARPU_CUDA}.
  903. This function blocks until the function has been executed on every appropriate
  904. processing units, so that it may not be called from a callback function for
  905. instance.
  906. @item @emph{Prototype}:
  907. @code{void starpu_execute_on_each_worker(void (*func)(void *), void *arg, uint32_t where);}
  908. @end table
  909. @c ---------------------------------------------------------------------
  910. @c Basic Examples
  911. @c ---------------------------------------------------------------------
  912. @node Basic Examples
  913. @chapter Basic Examples
  914. @menu
  915. * Compiling and linking:: Compiling and Linking Options
  916. * Hello World:: Submitting Tasks
  917. * Scaling a Vector:: Manipulating Data
  918. * Scaling a Vector (hybrid):: Handling Heterogeneous Architectures
  919. @end menu
  920. @node Compiling and linking
  921. @section Compiling and linking options
  922. The Makefile could for instance contain the following lines to define which
  923. options must be given to the compiler and to the linker:
  924. @example
  925. @c @cartouche
  926. CFLAGS+=$$(pkg-config --cflags libstarpu)
  927. LIBS+=$$(pkg-config --libs libstarpu)
  928. @c @end cartouche
  929. @end example
  930. @node Hello World
  931. @section Hello World
  932. In this section, we show how to implement a simple program that submits a task to StarPU.
  933. @subsection Required Headers
  934. The @code{starpu.h} header should be included in any code using StarPU.
  935. @example
  936. @c @cartouche
  937. #include <starpu.h>
  938. @c @end cartouche
  939. @end example
  940. @subsection Defining a Codelet
  941. @example
  942. @c @cartouche
  943. void cpu_func(void *buffers[], void *cl_arg)
  944. @{
  945. float *array = cl_arg;
  946. printf("Hello world (array = @{%f, %f@} )\n", array[0], array[1]);
  947. @}
  948. starpu_codelet cl =
  949. @{
  950. .where = STARPU_CPU,
  951. .cpu_func = cpu_func,
  952. .nbuffers = 0
  953. @};
  954. @c @end cartouche
  955. @end example
  956. A codelet is a structure that represents a computational kernel. Such a codelet
  957. may contain an implementation of the same kernel on different architectures
  958. (eg. CUDA, Cell's SPU, x86, ...).
  959. The ''@code{.nbuffers}'' field specifies the number of data buffers that are
  960. manipulated by the codelet: here the codelet does not access or modify any data
  961. that is controlled by our data management library. Note that the argument
  962. passed to the codelet (the ''@code{.cl_arg}'' field of the @code{starpu_task}
  963. structure) does not count as a buffer since it is not managed by our data
  964. management library.
  965. @c TODO need a crossref to the proper description of "where" see bla for more ...
  966. We create a codelet which may only be executed on the CPUs. The ''@code{.where}''
  967. field is a bitmask that defines where the codelet may be executed. Here, the
  968. @code{STARPU_CPU} value means that only CPUs can execute this codelet
  969. (@pxref{Codelets and Tasks} for more details on that field).
  970. When a CPU core executes a codelet, it calls the @code{.cpu_func} function,
  971. which @emph{must} have the following prototype:
  972. @code{void (*cpu_func)(void *buffers[], void *cl_arg)}
  973. In this example, we can ignore the first argument of this function which gives a
  974. description of the input and output buffers (eg. the size and the location of
  975. the matrices). The second argument is a pointer to a buffer passed as an
  976. argument to the codelet by the means of the ''@code{.cl_arg}'' field of the
  977. @code{starpu_task} structure.
  978. @c TODO rewrite so that it is a little clearer ?
  979. Be aware that this may be a pointer to a
  980. @emph{copy} of the actual buffer, and not the pointer given by the programmer:
  981. if the codelet modifies this buffer, there is no guarantee that the initial
  982. buffer will be modified as well: this for instance implies that the buffer
  983. cannot be used as a synchronization medium.
  984. @subsection Submitting a Task
  985. @example
  986. @c @cartouche
  987. void callback_func(void *callback_arg)
  988. @{
  989. printf("Callback function (arg %p)\n", callback_arg);
  990. @}
  991. int main(int argc, char **argv)
  992. @{
  993. /* initialize StarPU */
  994. starpu_init(NULL);
  995. struct starpu_task *task = starpu_task_create();
  996. task->cl = &cl;
  997. float array[2] = @{1.0f, -1.0f@};
  998. task->cl_arg = &array;
  999. task->cl_arg_size = 2*sizeof(float);
  1000. task->callback_func = callback_func;
  1001. task->callback_arg = (void *)0x42;
  1002. /* starpu_submit_task will be a blocking call */
  1003. task->synchronous = 1;
  1004. /* submit the task to StarPU */
  1005. starpu_submit_task(task);
  1006. /* terminate StarPU */
  1007. starpu_shutdown();
  1008. return 0;
  1009. @}
  1010. @c @end cartouche
  1011. @end example
  1012. Before submitting any tasks to StarPU, @code{starpu_init} must be called. The
  1013. @code{NULL} argument specifies that we use default configuration. Tasks cannot
  1014. be submitted after the termination of StarPU by a call to
  1015. @code{starpu_shutdown}.
  1016. In the example above, a task structure is allocated by a call to
  1017. @code{starpu_task_create}. This function only allocates and fills the
  1018. corresponding structure with the default settings (@pxref{starpu_task_create}),
  1019. but it does not submit the task to StarPU.
  1020. @c not really clear ;)
  1021. The ''@code{.cl}'' field is a pointer to the codelet which the task will
  1022. execute: in other words, the codelet structure describes which computational
  1023. kernel should be offloaded on the different architectures, and the task
  1024. structure is a wrapper containing a codelet and the piece of data on which the
  1025. codelet should operate.
  1026. The optional ''@code{.cl_arg}'' field is a pointer to a buffer (of size
  1027. @code{.cl_arg_size}) with some parameters for the kernel
  1028. described by the codelet. For instance, if a codelet implements a computational
  1029. kernel that multiplies its input vector by a constant, the constant could be
  1030. specified by the means of this buffer.
  1031. Once a task has been executed, an optional callback function can be called.
  1032. While the computational kernel could be offloaded on various architectures, the
  1033. callback function is always executed on a CPU. The ''@code{.callback_arg}''
  1034. pointer is passed as an argument of the callback. The prototype of a callback
  1035. function must be:
  1036. @example
  1037. void (*callback_function)(void *);
  1038. @end example
  1039. If the @code{.synchronous} field is non-null, task submission will be
  1040. synchronous: the @code{starpu_submit_task} function will not return until the
  1041. task was executed. Note that the @code{starpu_shutdown} method does not
  1042. guarantee that asynchronous tasks have been executed before it returns.
  1043. @node Scaling a Vector
  1044. @section Manipulating Data: Scaling a Vector
  1045. The previous example has shown how to submit tasks. In this section we show how
  1046. StarPU tasks can manipulate data.
  1047. Programmers can describe the data layout of their application so that StarPU is
  1048. responsible for enforcing data coherency and availability across the machine.
  1049. Instead of handling complex (and non-portable) mechanisms to perform data
  1050. movements, programmers only declare which piece of data is accessed and/or
  1051. modified by a task, and StarPU makes sure that when a computational kernel
  1052. starts somewhere (eg. on a GPU), its data are available locally.
  1053. Before submitting those tasks, the programmer first needs to declare the
  1054. different pieces of data to StarPU using the @code{starpu_register_*_data}
  1055. functions. To ease the development of applications for StarPU, it is possible
  1056. to describe multiple types of data layout. A type of data layout is called an
  1057. @b{interface}. By default, there are different interfaces available in StarPU:
  1058. here we will consider the @b{vector interface}.
  1059. The following lines show how to declare an array of @code{n} elements of type
  1060. @code{float} using the vector interface:
  1061. @example
  1062. float tab[n];
  1063. starpu_data_handle tab_handle;
  1064. starpu_register_vector_data(&tab_handle, 0, tab, n, sizeof(float));
  1065. @end example
  1066. The first argument, called the @b{data handle}, is an opaque pointer which
  1067. designates the array in StarPU. This is also the structure which is used to
  1068. describe which data is used by a task.
  1069. @c TODO: what is 0 ?
  1070. It is possible to construct a StarPU
  1071. task that multiplies this vector by a constant factor:
  1072. @example
  1073. float factor;
  1074. struct starpu_task *task = starpu_task_create();
  1075. task->cl = &cl;
  1076. task->buffers[0].handle = tab_handle;
  1077. task->buffers[0].mode = STARPU_RW;
  1078. task->cl_arg = &factor;
  1079. task->cl_arg_size = sizeof(float);
  1080. @end example
  1081. Since the factor is constant, it does not need a preliminary declaration, and
  1082. can just be passed through the @code{cl_arg} pointer like in the previous
  1083. example. The vector parameter is described by its handle.
  1084. There are two fields in each element of the @code{buffers} array.
  1085. @code{.handle} is the handle of the data, and @code{.mode} specifies how the
  1086. kernel will access the data (@code{STARPU_R} for read-only, @code{STARPU_W} for
  1087. write-only and @code{STARPU_RW} for read and write access).
  1088. The definition of the codelet can be written as follows:
  1089. @example
  1090. void scal_func(void *buffers[], void *cl_arg)
  1091. @{
  1092. unsigned i;
  1093. float *factor = cl_arg;
  1094. struct starpu_vector_interface_s *vector = buffers[0];
  1095. /* length of the vector */
  1096. unsigned n = vector->nx;
  1097. /* local copy of the vector pointer */
  1098. float *val = (float *)vector->ptr;
  1099. for (i = 0; i < n; i++)
  1100. val[i] *= *factor;
  1101. @}
  1102. starpu_codelet cl = @{
  1103. .where = STARPU_CPU,
  1104. .cpu_func = scal_func,
  1105. .nbuffers = 1
  1106. @};
  1107. @end example
  1108. The second argument of the @code{scal_func} function contains a pointer to the
  1109. parameters of the codelet (given in @code{task->cl_arg}), so that we read the
  1110. constant factor from this pointer. The first argument is an array that gives
  1111. a description of every buffer passed in the @code{task->buffers} array. The
  1112. size of this array is given by the @code{.nbuffers} field of the codelet
  1113. structure. For the sake of generality, this array contains pointers to the
  1114. different interfaces describing each buffer. In the case of the @b{vector
  1115. interface}, the location of the vector (resp. its length) is accessible in the
  1116. @code{ptr} (resp. @code{nx}) field of this interface. Since the vector is accessed in a
  1117. read-write fashion, any modification will automatically affect future accesses
  1118. to that vector made by other tasks.
  1119. @node Scaling a Vector (hybrid)
  1120. @section Vector Scaling on a Hybrid CPU/GPU Machine
  1121. Contrary to the previous examples, the task submitted in the example may not
  1122. only be executed by the CPUs, but also by a CUDA device.
  1123. TODO
  1124. @c ---------------------------------------------------------------------
  1125. @c Advanced Topics
  1126. @c ---------------------------------------------------------------------
  1127. @node Advanced Topics
  1128. @chapter Advanced Topics
  1129. @bye