/* StarPU --- Runtime system for heterogeneous multicore architectures.
 *
 * Copyright (C) 2009-2021 Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
 *
 * StarPU is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or (at
 * your option) any later version.
 *
 * StarPU is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 *
 * See the GNU Lesser General Public License in COPYING.LGPL for more details.
 */
/*! \page DataManagement Data Management

TODO: intro which mentions consistency among other things

\section DataInterface Data Interface

StarPU provides several data interfaces for programmers to describe
the data layout of their application. There are predefined interfaces
already available in StarPU. Users can define new data interfaces as
explained in \ref DefiningANewDataInterface. All functions provided by
StarPU are documented in \ref API_Data_Interfaces. You will find a
short list below.
\subsection VariableDataInterface Variable Data Interface

A variable is a given-size byte element, typically a scalar. Here is an
example of how to register a variable with StarPU by using
starpu_variable_data_register().

\code{.c}
float var = 42.0;
starpu_data_handle_t var_handle;
starpu_variable_data_register(&var_handle, STARPU_MAIN_RAM, (uintptr_t)&var, sizeof(var));
\endcode
\subsection VectorDataInterface Vector Data Interface

A vector is a fixed number of elements of a given size. Here is an
example of how to register a vector with StarPU by using
starpu_vector_data_register().

\code{.c}
float vector[NX];
starpu_data_handle_t vector_handle;
starpu_vector_data_register(&vector_handle, STARPU_MAIN_RAM, (uintptr_t)vector, NX, sizeof(vector[0]));
\endcode
Vectors can be partitioned into pieces by using
starpu_vector_filter_block(). They can also be partitioned with some overlapping
by using starpu_vector_filter_block_shadow(). By default StarPU
uses the same size for each piece. If different sizes are desired,
starpu_vector_filter_list() or starpu_vector_filter_list_long() can be used
instead. To just divide in two pieces, starpu_vector_filter_divide_in_2() can be used.
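For instance, a minimal sketch of splitting the vector registered above into
equal pieces (the <c>NPARTS</c> count is an assumption of this sketch; the
full workflow is described in \ref PartitioningData):

\code{.c}
/* Hypothetical sketch: split vector_handle into NPARTS equal sub-vectors. */
struct starpu_data_filter f =
{
    .filter_func = starpu_vector_filter_block,
    .nchildren = NPARTS
};
starpu_data_partition(vector_handle, &f);

/* Sub-vector i can then be retrieved with
 * starpu_data_get_sub_data(vector_handle, 1, i). */
\endcode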
\subsection MatrixDataInterface Matrix Data Interface

To register 2-D matrices with a potential padding, one can use the
matrix data interface. Here is an example of how to register a matrix
with StarPU by using starpu_matrix_data_register().

\code{.c}
float *matrix;
starpu_data_handle_t matrix_handle;
matrix = (float*)malloc(width * height * sizeof(float));
starpu_matrix_data_register(&matrix_handle, STARPU_MAIN_RAM, (uintptr_t)matrix, width, width, height, sizeof(float));
\endcode
2D matrices can be partitioned into 2D matrices along the x dimension by
using starpu_matrix_filter_block(), and along the y dimension by using
starpu_matrix_filter_vertical_block(). They can also be partitioned
with some overlapping by using starpu_matrix_filter_block_shadow() and
starpu_matrix_filter_vertical_block_shadow().
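As a hedged sketch of combining these filters (<c>PARTS</c> is an assumption
of this example), a matrix can be split into a 2D grid of tiles by applying
one filter per dimension, following the pattern of
<c>examples/basic_examples/mult.c</c>:

\code{.c}
/* Hypothetical sketch: split matrix_handle into a PARTS x PARTS grid of tiles. */
struct starpu_data_filter f_horiz =
{
    .filter_func = starpu_matrix_filter_block,
    .nchildren = PARTS
};
struct starpu_data_filter f_vert =
{
    .filter_func = starpu_matrix_filter_vertical_block,
    .nchildren = PARTS
};
starpu_data_map_filters(matrix_handle, 2, &f_horiz, &f_vert);

/* Tile (i, j) is then starpu_data_get_sub_data(matrix_handle, 2, i, j). */
\endcode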
\subsection BlockDataInterface Block Data Interface

To register 3-D matrices with potential paddings on Y and Z dimensions,
one can use the block data interface. Here is an example of how to
register a block with StarPU by using starpu_block_data_register().

\code{.c}
float *block;
starpu_data_handle_t block_handle;
block = (float*)malloc(nx*ny*nz*sizeof(float));
starpu_block_data_register(&block_handle, STARPU_MAIN_RAM, (uintptr_t)block, nx, nx*ny, nx, ny, nz, sizeof(float));
\endcode

3D matrices can be partitioned along the x dimension by
using starpu_block_filter_block(), or along the y dimension
by using starpu_block_filter_vertical_block(), or along the
z dimension by using starpu_block_filter_depth_block(). They
can also be partitioned with some overlapping by using
starpu_block_filter_block_shadow(), starpu_block_filter_vertical_block_shadow(),
or starpu_block_filter_depth_block_shadow().
\subsection TensorDataInterface Tensor Data Interface

To register 4-D matrices with potential paddings on Y, Z, and T dimensions,
one can use the tensor data interface. Here is an example of how to
register a tensor with StarPU by using starpu_tensor_data_register().

\code{.c}
float *block;
starpu_data_handle_t block_handle;
block = (float*)malloc(nx*ny*nz*nt*sizeof(float));
starpu_tensor_data_register(&block_handle, STARPU_MAIN_RAM, (uintptr_t)block, nx, nx*ny, nx*ny*nz, nx, ny, nz, nt, sizeof(float));
\endcode

Partitioning filters are not implemented yet.
\subsection BCSRDataInterface BCSR Data Interface

BCSR (Blocked Compressed Sparse Row Representation) sparse matrix data
can be registered to StarPU using the bcsr data interface. Here is an
example of how to do so by using starpu_bcsr_data_register().

\code{.c}
/*
 * We use the following matrix:
 *
 *   +----------------+
 *   |  0   1   0   0 |
 *   |  2   3   0   0 |
 *   |  4   5   8   9 |
 *   |  6   7  10  11 |
 *   +----------------+
 *
 * nzval  = [0, 1, 2, 3] ++ [4, 5, 6, 7] ++ [8, 9, 10, 11]
 * colind = [0, 0, 1]
 * rowptr = [0, 1, 3]
 * r = c = 2
 */

/* Size of the blocks */
int R = 2;
int C = 2;

int NROWS = 2;
int NNZ_BLOCKS = 3;     /* out of 4 */
int NZVAL_SIZE = (R*C*NNZ_BLOCKS);

int nzval[NZVAL_SIZE] =
{
    0, 1, 2, 3,    /* First block  */
    4, 5, 6, 7,    /* Second block */
    8, 9, 10, 11   /* Third block  */
};
uint32_t colind[NNZ_BLOCKS] =
{
    0, /* block-column index for first block in nzval */
    0, /* block-column index for second block in nzval */
    1  /* block-column index for third block in nzval */
};
uint32_t rowptr[NROWS+1] =
{
    0, /* block-index in nzval of the first block of the first row. */
    1, /* block-index in nzval of the first block of the second row. */
    NNZ_BLOCKS /* number of blocks, to allow an easier element's access for the kernels */
};

starpu_data_handle_t bcsr_handle;
starpu_bcsr_data_register(&bcsr_handle,
                          STARPU_MAIN_RAM,
                          NNZ_BLOCKS,
                          NROWS,
                          (uintptr_t) nzval,
                          colind,
                          rowptr,
                          0, /* firstentry */
                          R,
                          C,
                          sizeof(nzval[0]));
\endcode

StarPU provides an example of how to deal with such matrices in
<c>examples/spmv</c>.

BCSR data handles can be partitioned into their dense matrix blocks by using
starpu_bcsr_filter_canonical_block(), or split into other BCSR data handles by
using starpu_bcsr_filter_vertical_block() (only splitting along the leading
dimension, i.e. along adjacent nnz blocks, is supported).
\subsection CSRDataInterface CSR Data Interface

TODO

CSR data handles can be partitioned into vertical CSR matrices by using
starpu_csr_filter_vertical_block().
\subsection VariableSizeDataInterface Data Interface with Variable Size

Tasks are actually allowed to change the size of data interfaces.

The simplest case is just changing the amount of data actually used within the
allocated buffer. This is for instance implemented for the matrix interface: one
can set the new NX/NY values with STARPU_MATRIX_SET_NX(), STARPU_MATRIX_SET_NY(), and STARPU_MATRIX_SET_LD()
at the end of the task implementation. Data transfers achieved by StarPU will
then use these values instead of the whole allocated size. The new values of
course need to fit within the originally allocated size. To reserve room for
increasing the NX/NY values, one can use starpu_matrix_data_register_allocsize()
instead of starpu_matrix_data_register(), to specify the allocation size to be
used instead of the default NX*NY*ELEMSIZE. To support this, the data interface
has to implement the functions starpu_data_interface_ops::alloc_footprint and
starpu_data_interface_ops::alloc_compare, for proper StarPU allocation
management.
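For illustration, a hedged sketch of a CPU implementation which only produces
the top-left quarter of its output matrix and advertises the smaller size at
the end (the codelet function name is hypothetical):

\code{.c}
void shrink_matrix_cpu_func(void *buffers[], void *cl_arg)
{
    uint32_t nx = STARPU_MATRIX_GET_NX(buffers[0]);
    uint32_t ny = STARPU_MATRIX_GET_NY(buffers[0]);

    /* ... fill only the first nx/2 columns of the first ny/2 rows ... */

    /* Advertise the smaller size; the new values fit within the original
     * allocation, and the leading dimension is left unchanged since the
     * buffer is not repacked. */
    STARPU_MATRIX_SET_NX(buffers[0], nx / 2);
    STARPU_MATRIX_SET_NY(buffers[0], ny / 2);
}
\endcode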
A more involved case is changing the amount of allocated data.
The task implementation can just reallocate the buffer during its execution, and
set the proper new values in the interface structure, e.g. nx, ny, ld, etc. so
that the StarPU core knows the new data layout. The structure starpu_data_interface_ops
however then needs to have the field starpu_data_interface_ops::dontcache
set to 1, to prevent StarPU from trying to perform any cached allocation,
since the allocated size will vary. An example is available in
<c>tests/datawizard/variable_size.c</c>. The example uses its own data
interface so as to contain some simulation information for data growth, but the
principle can be applied to any data interface.

The principle is to use starpu_malloc_on_node_flags() to make the new
allocation, and use starpu_free_on_node_flags() to release any previous
allocation. The flags have to be precisely as in the example:

\code{.c}
unsigned workerid = starpu_worker_get_id_check();
unsigned dst_node = starpu_worker_get_memory_node(workerid);
interface->ptr = starpu_malloc_on_node_flags(dst_node, size + increase, STARPU_MALLOC_PINNED | STARPU_MALLOC_COUNT | STARPU_MEMORY_OVERFLOW);
starpu_free_on_node_flags(dst_node, old, size, STARPU_MALLOC_PINNED | STARPU_MALLOC_COUNT | STARPU_MEMORY_OVERFLOW);
interface->size += increase;
\endcode

so that the allocated area has the expected properties and the allocation is accounted for properly.

Depending on the interface (vector, CSR, etc.) you may have to fix several
members of the data interface: e.g. both <c>nx</c> and <c>allocsize</c> for
vectors, and store the pointer both in <c>ptr</c> and <c>dev_handle</c>.

Some interfaces make a distinction between the actual number of elements
stored in the data and the actually allocated buffer. For instance, the vector
interface uses the <c>nx</c> field for the former, and the <c>allocsize</c> field for
the latter. This allows for lazy reallocation, to avoid reallocating the buffer
every time to exactly match the actual number of elements. Computations and data
transfers will use the field <c>nx</c>, while allocation functions will use the field
<c>allocsize</c>. One just has to make sure that <c>allocsize</c> is always
greater than or equal to <c>nx</c>.

Important note: one cannot change the size of a partitioned data.
\section DataManagement Data Management

When the application allocates data, whenever possible it should use
the function starpu_malloc(), which will ask CUDA or OpenCL to make
the allocation itself and pin the corresponding allocated memory, or use the function
starpu_memory_pin() to pin memory allocated by other means, such as local arrays. This
is needed to permit asynchronous data transfers, i.e. to permit data
transfers to overlap with computations. Otherwise, the trace will show
that the state <c>DriverCopyAsync</c> takes a lot of time, which is
because CUDA or OpenCL then reverts to synchronous transfers.
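For instance, a minimal sketch of both options (sizes and names are arbitrary
here):

\code{.c}
/* Option 1: let StarPU allocate pinned memory. */
float *buffer;
starpu_malloc((void **)&buffer, NX * sizeof(*buffer));

/* Option 2: pin memory which was allocated by other means. */
static float local_array[NX];
starpu_memory_pin(local_array, sizeof(local_array));
\endcode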
The application can provide its own allocation function by calling
starpu_malloc_set_hooks(). StarPU will then use them for all data handle
allocations in the main memory.

By default, StarPU leaves replicates of data wherever they were used, in case they
will be re-used by other tasks, thus saving the data transfer time. When some
task modifies some data, all the other replicates are invalidated, and only the
processing unit which ran this task will have a valid replicate of the data. If the application knows
that this data will not be re-used by further tasks, it should advise StarPU to
immediately replicate it to a desired list of memory nodes (given through a
bitmask). This can be understood like the write-through mode of CPU caches.

\code{.c}
starpu_data_set_wt_mask(img_handle, 1<<0);
\endcode

will for instance request to always automatically transfer a replicate into the
main memory (node <c>0</c>), as bit <c>0</c> of the write-through bitmask is set.

\code{.c}
starpu_data_set_wt_mask(img_handle, ~0U);
\endcode

will request to always automatically broadcast the updated data to all memory
nodes.

Setting the write-through mask to <c>~0U</c> can also be useful to make sure all
memory nodes always have a copy of the data, so that it is never evicted when
memory gets scarce.
Implicit data dependency computation can become expensive if a lot
of tasks access the same piece of data. If no dependency is required
on some piece of data (e.g. because it is only accessed in read-only
mode, or because write accesses are actually commutative), use the
function starpu_data_set_sequential_consistency_flag() to disable
implicit dependencies on this data.
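For instance, a one-line sketch (with <c>handle</c> being some registered data):

\code{.c}
/* Disable implicit dependencies for this handle; any needed dependency
 * then has to be expressed explicitly by the application. */
starpu_data_set_sequential_consistency_flag(handle, 0);
\endcode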
In the same vein, accumulation of results in the same data can become a
bottleneck. The use of the mode ::STARPU_REDUX makes it possible to optimize such
accumulation (see \ref DataReduction). To a lesser extent, the use of
the flag ::STARPU_COMMUTE keeps the bottleneck (see \ref DataCommute), but at least permits
the accumulation to happen in any order.

Applications often need a piece of data just for temporary results. In such a case,
registration can be made without an initial value; for instance this produces a vector data:

\code{.c}
starpu_vector_data_register(&handle, -1, 0, n, sizeof(float));
\endcode

StarPU will then allocate the actual buffer only when it is actually needed,
e.g. directly on the GPU without allocating in main memory.

In the same vein, once the temporary results are not useful any more, the
data should be thrown away. If the handle is not to be reused, it can be
unregistered:

\code{.c}
starpu_data_unregister_submit(handle);
\endcode

actual unregistration will be done after all tasks working on the handle
terminate.

If the handle is to be reused, instead of unregistering it, it can simply be invalidated:

\code{.c}
starpu_data_invalidate_submit(handle);
\endcode

the buffers containing the current value will then be freed, and reallocated
only when another task writes some value to the handle.
\section DataPrefetch Data Prefetch

The scheduling policies <c>heft</c>, <c>dmda</c> and <c>pheft</c>
perform data prefetch (see \ref STARPU_PREFETCH):
as soon as a scheduling decision is taken for a task, requests are issued to
transfer its required data to the target processing unit, if needed, so that
when the processing unit actually starts the task, its data will hopefully be
already available and it will not have to wait for the transfer to finish.

The application may want to perform some manual prefetching, for several reasons
such as excluding initial data transfers from performance measurements, or
setting up an initial statically-computed data distribution on the machine
before submitting tasks, which will thus guide StarPU toward an initial task
distribution (since StarPU will try to avoid further transfers).

This can be achieved by giving the function starpu_data_prefetch_on_node() the
handle and the desired target memory node. The variant
starpu_data_idle_prefetch_on_node() can be used to issue the transfer
only when the bus is idle.

Conversely, one can advise StarPU that some data will not be useful in the
close future by calling starpu_data_wont_use(). StarPU will then write its value
back to its home node, and evict it from GPUs when room is needed.
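A minimal sketch of these calls (here the target node is derived from a worker
identifier <c>workerid</c>, which is an assumption of this example):

\code{.c}
/* Prefetch the data to the memory node of a given worker, asynchronously. */
unsigned node = starpu_worker_get_memory_node(workerid);
starpu_data_prefetch_on_node(handle, node, 1);

/* Later on, advise StarPU that the data will not be needed soon. */
starpu_data_wont_use(handle);
\endcode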
\section PartitioningData Partitioning Data

An existing piece of data can be partitioned in sub parts to be used by different tasks, for instance:

\code{.c}
#define NX 1048576
#define PARTS 16
int vector[NX];
starpu_data_handle_t handle;

/* Declare data to StarPU */
starpu_vector_data_register(&handle, STARPU_MAIN_RAM, (uintptr_t)vector, NX, sizeof(vector[0]));

/* Partition the vector in PARTS sub-vectors */
struct starpu_data_filter f =
{
    .filter_func = starpu_vector_filter_block,
    .nchildren = PARTS
};
starpu_data_partition(handle, &f);
\endcode

The task submission then uses the function starpu_data_get_sub_data()
to retrieve the sub-handles to be passed as task parameters.

\code{.c}
/* Submit a task on each sub-vector */
for (i=0; i<starpu_data_get_nb_children(handle); i++)
{
    /* Get subdata number i (there is only 1 dimension) */
    starpu_data_handle_t sub_handle = starpu_data_get_sub_data(handle, 1, i);
    struct starpu_task *task = starpu_task_create();

    task->handles[0] = sub_handle;
    task->cl = &cl;
    task->synchronous = 1;
    task->cl_arg = &factor;
    task->cl_arg_size = sizeof(factor);

    starpu_task_submit(task);
}
\endcode

Partitioning can be applied several times, see
<c>examples/basic_examples/mult.c</c> and <c>examples/filters/</c>.
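When the sub-handles are no longer needed, the pieces can be gathered back
into the whole handle; a minimal sketch:

\code{.c}
/* Gather the pieces back into the whole vector in main memory; the
 * sub-handles must not be used afterwards. */
starpu_data_unpartition(handle, STARPU_MAIN_RAM);
\endcode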
Wherever the whole piece of data is already available, the partitioning will
be done in-place, i.e. without allocating new buffers but just using pointers
inside the existing copy. This is particularly important to be aware of when
using OpenCL, where the kernel parameters are not pointers, but \c cl_mem handles. The
kernel thus also needs to be passed the offset within the OpenCL buffer:

\code{.c}
void opencl_func(void *buffers[], void *cl_arg)
{
    cl_mem vector = (cl_mem) STARPU_VECTOR_GET_DEV_HANDLE(buffers[0]);
    unsigned offset = STARPU_VECTOR_GET_OFFSET(buffers[0]);

    ...
    clSetKernelArg(kernel, 0, sizeof(vector), &vector);
    clSetKernelArg(kernel, 1, sizeof(offset), &offset);
    ...
}
\endcode
And the kernel has to shift from the pointer passed by the OpenCL driver:

\code{.c}
__kernel void opencl_kernel(__global int *vector, unsigned offset)
{
    vector = (__global int *)((__global char *)vector + offset);
    ...
}
\endcode
When the sub-data is not of the same type as the original data, the field
starpu_data_filter::get_child_ops needs to be set appropriately for StarPU
to know which type should be used.

StarPU provides various interfaces and filters for matrices, vectors, etc.,
but applications can also write their own data interfaces and filters, see
<c>examples/interface</c> and <c>examples/filters/custom_mf</c> for an example,
and see \ref DefiningANewDataInterface and \ref DefiningANewDataFilter
for documentation.
\section AsynchronousPartitioning Asynchronous Partitioning

The partitioning functions described in the previous section are synchronous:
starpu_data_partition() and starpu_data_unpartition() both wait for all the tasks
currently working on the data. This can be a bottleneck for the application.

An asynchronous API also exists; it works only on handles with sequential
consistency. The principle is to first plan the partitioning, which returns
data handles of the partition, which are not functional yet. When submitting
tasks, one can mix using the handles of the partition and of the whole data. One
can even partition recursively and mix using handles at different levels of the
recursion. Of course, StarPU will have to introduce coherency synchronization.

<c>fmultiple_submit_implicit</c> is a complete example using this technique.
One can also look at <c>fmultiple_submit_readonly</c>, which contains the
explicit coherency synchronizations which are automatically introduced by StarPU
for <c>fmultiple_submit_implicit</c>.
In short, we first register a matrix and plan the partitioning:

\code{.c}
starpu_matrix_data_register(&handle, STARPU_MAIN_RAM, (uintptr_t)matrix, NX, NX, NY, sizeof(matrix[0]));
struct starpu_data_filter f_vert =
{
    .filter_func = starpu_matrix_filter_block,
    .nchildren = PARTS
};
starpu_data_partition_plan(handle, &f_vert, vert_handle);
\endcode

starpu_data_partition_plan() returns the handles for the partition in <c>vert_handle</c>.

One can then submit tasks working on the main handle, and tasks working on the handles
<c>vert_handle</c>. Between using the main handle and the handles <c>vert_handle</c>, StarPU will automatically call starpu_data_partition_submit() and
starpu_data_unpartition_submit().

All this code is asynchronous: one just submits the tasks, and the partitioning and
unpartitioning will be done at runtime.
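A hedged sketch of what such a submission flow may look like (<c>cl_whole</c>
and <c>cl_piece</c> are hypothetical codelets):

\code{.c}
/* Work on the whole matrix... */
starpu_task_insert(&cl_whole, STARPU_RW, handle, 0);

/* ...then on the planned pieces: StarPU submits the partitioning itself. */
for (i = 0; i < PARTS; i++)
    starpu_task_insert(&cl_piece, STARPU_RW, vert_handle[i], 0);

/* ...and again on the whole matrix: StarPU unpartitions automatically. */
starpu_task_insert(&cl_whole, STARPU_RW, handle, 0);

/* Once the plan is not needed any more, release the planned handles. */
starpu_data_partition_clean(handle, PARTS, vert_handle);
\endcode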
Planning several partitionings of the same data is also possible; StarPU will
unpartition and repartition as needed when mixing accesses of different
partitions. If data access is done in read-only mode, StarPU will allow the
different partitionings to coexist. As soon as a data is accessed in read-write
mode, StarPU will automatically unpartition everything and activate only the
partitioning leading to the data being written to.

For instance, for a stencil application, one can split a subdomain into
its interior and halos, and then just submit a task updating the whole
subdomain, then submit MPI sends/receives to update the halos, then submit
again a task updating the whole subdomain, etc. and StarPU will automatically
partition/unpartition each time.
\section ManualPartitioning Manual Partitioning

One can also handle partitioning by hand, by registering several views on the
same piece of data. The idea is then to manage the coherency of the various
views through the common buffer in the main memory.

<c>fmultiple_manual</c> is a complete example using this technique.

In short, we first register the same matrix several times:

\code{.c}
starpu_matrix_data_register(&handle, STARPU_MAIN_RAM, (uintptr_t)matrix, NX, NX, NY, sizeof(matrix[0]));

for (i = 0; i < PARTS; i++)
    starpu_matrix_data_register(&vert_handle[i], STARPU_MAIN_RAM, (uintptr_t)&matrix[0][i*(NX/PARTS)], NX, NX/PARTS, NY, sizeof(matrix[0][0]));
\endcode

Since StarPU is not aware that the two handles are actually pointing to the same
data, there is a danger of inadvertently submitting tasks to both views, which
would bring a mess since StarPU would not guarantee any coherency between the two
views. To make sure we don't do this, we invalidate the view that we will not
use:

\code{.c}
for (i = 0; i < PARTS; i++)
    starpu_data_invalidate(vert_handle[i]);
\endcode

Then we can safely work on <c>handle</c>.

When we want to switch to the vertical slice view, all we need to do is bring
coherency between them by running an empty task on the home node of the data:

\code{.c}
struct starpu_codelet cl_switch =
{
    .where = STARPU_NOWHERE,
    .nbuffers = 3,
    .specific_nodes = 1,
    .nodes = { STARPU_MAIN_RAM, STARPU_MAIN_RAM, STARPU_MAIN_RAM },
};

ret = starpu_task_insert(&cl_switch, STARPU_RW, handle,
                         STARPU_W, vert_handle[0],
                         STARPU_W, vert_handle[1],
                         0);
\endcode

The execution of the <c>cl_switch</c> task will bring the matrix data back into
main memory, and thus the vertical slices will get the updated value there.

Again, we prefer to make sure that we don't accidentally access the matrix through the whole-matrix handle:

\code{.c}
starpu_data_invalidate_submit(handle);
\endcode

Note: when enabling a set of handles in this way, the set must not have any
overlapping, i.e. the handles of the set must not have any part of data in
common, otherwise StarPU will not properly handle concurrent accesses between
them.

And now we can start using the vertical slices, etc.
\section DataPointers Handles data buffer pointers

A simple understanding of a StarPU handle is that it is a collection of buffers on
each memory node of the machine, which contain the same data. The picture is
however made more complex with the OpenCL support and with partitioning.

When partitioning a handle, the data buffers of the subhandles will indeed
be inside the data buffers of the main handle (to save transferring data
back and forth between the main handle and the subhandles). But in OpenCL,
a <c>cl_mem</c> is not a pointer, but an opaque value on which pointer
arithmetic can not be used. That is why data interfaces contain three fields:
<c>dev_handle</c>, <c>offset</c>, and <c>ptr</c>.

<ul>
<li> The field <c>dev_handle</c> is what the allocation function
returned, and one can not do arithmetic on it.
</li>
<li> The field <c>offset</c> is the offset inside the allocated area,
most often it will be 0 because data start at the beginning of the
allocated area, but when the handle is partitioned, the subhandles
will have varying <c>offset</c> values, for each subpiece.
</li>
<li> The field <c>ptr</c>, in the non-OpenCL case, i.e. when pointer
arithmetic can be used on <c>dev_handle</c>, is just the sum of
<c>dev_handle</c> and <c>offset</c>, provided for convenience.
</li>
</ul>

This means that:
<ul>
<li>computation kernels can use <c>ptr</c> in non-OpenCL implementations.</li>
<li>computation kernels have to use <c>dev_handle</c> and <c>offset</c> in the OpenCL implementation.</li>
<li>allocation methods of data interfaces have to store the value returned by starpu_malloc_on_node() in <c>dev_handle</c> and <c>ptr</c>, and set <c>offset</c> to 0.</li>
<li>partitioning filters have to copy over <c>dev_handle</c> without modifying it, set in the child different values of <c>offset</c>, and set <c>ptr</c> accordingly as the sum of <c>dev_handle</c> and <c>offset</c>.</li>
</ul>
\section DefiningANewDataFilter Defining A New Data Filter

StarPU provides a series of predefined filters in \ref API_Data_Partition, but
additional filters can be defined by the application. The principle is that the
filter function just fills the memory location of the <c>i-th</c> subpart of a data.
Examples are provided in <c>src/datawizard/interfaces/*_filters.c</c>,
check \ref starpu_data_filter::filter_func for further details.
The helper function starpu_filter_nparts_compute_chunk_size_and_offset() can be used to
compute the division of pieces of data.
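For illustration, here is a hedged sketch of such a filter for the vector
interface, patterned on starpu_vector_filter_block(); the exact interface
fields to fill depend on the data interface, so the predefined filters in the
sources remain the reference:

\code{.c}
void my_vector_filter_block(void *father_interface, void *child_interface,
                            struct starpu_data_filter *f, unsigned id, unsigned nparts)
{
    struct starpu_vector_interface *father = (struct starpu_vector_interface *) father_interface;
    struct starpu_vector_interface *child = (struct starpu_vector_interface *) child_interface;

    size_t elemsize = father->elemsize;
    unsigned child_nx;
    size_t offset;

    /* Compute the size of piece 'id' and its byte offset within the father. */
    starpu_filter_nparts_compute_chunk_size_and_offset(father->nx, nparts, elemsize,
                                                       id, 1, &child_nx, &offset);

    child->id = father->id;
    child->nx = child_nx;
    child->elemsize = elemsize;
    child->allocsize = child_nx * elemsize;

    /* If the father is already allocated, point inside its buffer (see the
     * dev_handle/offset/ptr convention described in the DataPointers section). */
    if (father->dev_handle)
    {
        if (father->ptr)
            child->ptr = father->ptr + offset;
        child->dev_handle = father->dev_handle;
        child->offset = father->offset + offset;
    }
}
\endcode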
\section DataReduction Data Reduction

In various cases, some piece of data is used to accumulate intermediate
results. For instance, the dot product of a vector, maximum/minimum finding,
the histogram of a photograph, etc. When these results are produced across the
whole machine, it would not be efficient to accumulate them in only one place,
incurring data transmission and access concurrency each time.

StarPU provides a mode ::STARPU_REDUX, which makes it possible to optimize
this case: it will allocate a buffer on each worker (lazily), and accumulate
intermediate results there. When the data is eventually accessed in the normal
mode ::STARPU_R, StarPU will collect the intermediate results in just one
buffer.

For this to work, the user has to use the function
starpu_data_set_reduction_methods() to declare how to initialize these
buffers, and how to assemble partial results.

For instance, <c>cg</c> uses that to optimize its dot product: it first defines
the codelets for initialization and reduction:
\code{.c}
struct starpu_codelet bzero_variable_cl =
{
    .cpu_funcs = { bzero_variable_cpu },
    .cpu_funcs_name = { "bzero_variable_cpu" },
    .cuda_funcs = { bzero_variable_cuda },
    .nbuffers = 1,
};

static void accumulate_variable_cpu(void *descr[], void *cl_arg)
{
    double *v_dst = (double *)STARPU_VARIABLE_GET_PTR(descr[0]);
    double *v_src = (double *)STARPU_VARIABLE_GET_PTR(descr[1]);
    *v_dst = *v_dst + *v_src;
}

static void accumulate_variable_cuda(void *descr[], void *cl_arg)
{
    double *v_dst = (double *)STARPU_VARIABLE_GET_PTR(descr[0]);
    double *v_src = (double *)STARPU_VARIABLE_GET_PTR(descr[1]);
    cublasaxpy(1, (double)1.0, v_src, 1, v_dst, 1);
    cudaStreamSynchronize(starpu_cuda_get_local_stream());
}

struct starpu_codelet accumulate_variable_cl =
{
    .cpu_funcs = { accumulate_variable_cpu },
    .cpu_funcs_name = { "accumulate_variable_cpu" },
    .cuda_funcs = { accumulate_variable_cuda },
    .nbuffers = 2,
    .modes = {STARPU_RW|STARPU_COMMUTE, STARPU_R},
};
\endcode
and attaches them as reduction methods for its handle <c>dtq</c>:

\code{.c}
starpu_variable_data_register(&dtq_handle, -1, NULL, sizeof(type));
starpu_data_set_reduction_methods(dtq_handle, &accumulate_variable_cl, &bzero_variable_cl);
\endcode

and <c>dtq_handle</c> can now be used in mode ::STARPU_REDUX for the
dot products with partitioned vectors:

\code{.c}
for (b = 0; b < nblocks; b++)
    starpu_task_insert(&dot_kernel_cl,
                       STARPU_REDUX, dtq_handle,
                       STARPU_R, starpu_data_get_sub_data(v1, 1, b),
                       STARPU_R, starpu_data_get_sub_data(v2, 1, b),
                       0);
\endcode
During registration, we have here provided <c>NULL</c>, i.e. there is
no initial value to be taken into account during reduction. StarPU
will thus only take into account the contributions from the tasks
<c>dot_kernel_cl</c>. Also, it will not allocate any memory for
<c>dtq_handle</c> before tasks <c>dot_kernel_cl</c> are ready to run.

If another dot product has to be performed, one could unregister
<c>dtq_handle</c>, and re-register it. But one can also call
starpu_data_invalidate_submit() with the parameter <c>dtq_handle</c>,
which will clear all data from the handle, thus resetting it back to
the initial status <c>register(NULL)</c>.

The example <c>cg</c> also uses reduction for the blocked gemv kernel,
leading to yet more relaxed dependencies and more parallelism.

::STARPU_REDUX can also be passed to starpu_mpi_task_insert() in the MPI
case. This will however not produce any MPI communication, but just pass
::STARPU_REDUX to the underlying starpu_task_insert(). It is up to the
application to call starpu_mpi_redux_data(), which posts tasks which will
reduce the partial results among MPI nodes into the MPI node which owns the
data. For instance, some hypothetical application which collects partial results
into data <c>res</c>, then uses it for other computation, before looping again
with a new reduction:
\code{.c}
for (i = 0; i < 100; i++)
{
    starpu_mpi_task_insert(MPI_COMM_WORLD, &init_res, STARPU_W, res, 0);
    starpu_mpi_task_insert(MPI_COMM_WORLD, &work, STARPU_RW, A, STARPU_R, B, STARPU_REDUX, res, 0);
    starpu_mpi_redux_data(MPI_COMM_WORLD, res);
    starpu_mpi_task_insert(MPI_COMM_WORLD, &work2, STARPU_RW, B, STARPU_R, res, 0);
}
\endcode
\section DataCommute Commute Data Access

By default, the implicit dependencies computed from data accesses use the
sequential semantic. Notably, write accesses are always serialized in the order
of submission. In some application cases, the write contributions can actually
be performed in any order without affecting the eventual result. In this case
it is useful to drop the strictly sequential semantic, to improve parallelism
by allowing StarPU to reorder the write accesses. This can be done by using
the data access flag ::STARPU_COMMUTE. Accesses without this flag will however
properly be serialized against accesses with this flag. For instance:

\code{.c}
starpu_task_insert(&cl1, STARPU_R, h, STARPU_RW, handle, 0);
starpu_task_insert(&cl2, STARPU_R, handle1, STARPU_RW|STARPU_COMMUTE, handle, 0);
starpu_task_insert(&cl2, STARPU_R, handle2, STARPU_RW|STARPU_COMMUTE, handle, 0);
starpu_task_insert(&cl3, STARPU_R, g, STARPU_RW, handle, 0);
\endcode

The two tasks running <c>cl2</c> will be able to commute: depending on whether the
value of <c>handle1</c> or <c>handle2</c> becomes available first, the corresponding task
running <c>cl2</c> will start first. The task running <c>cl1</c> will however always be run
before them, and the task running <c>cl3</c> will always be run after them.

If a lot of tasks use the commute access on the same set of data and a lot of
them are ready at the same time, it may become interesting to use an arbiter,
see \ref ConcurrentDataAccess.
\section ConcurrentDataAccess Concurrent Data Accesses

When several tasks are ready and will work on several data, StarPU is faced with
the classical Dining Philosophers problem, and has to determine the order in
which it will run the tasks.

Data accesses usually use sequential ordering, so they are usually
already serialized, and thus by default StarPU uses the Dijkstra solution which
scales very well in terms of overhead: tasks will just acquire data one by one
by data handle pointer value order.

When sequential ordering is disabled or the flag ::STARPU_COMMUTE is used, there
may be a lot of concurrent accesses to the same data, and the Dijkstra solution
gets only poor parallelism, typically in some pathological cases which do happen
in various applications. In this case, one can use a data access arbiter, which
implements the classical centralized solution for the Dining Philosophers
problem. This is more expensive in terms of overhead since it is centralized,
but it opportunistically gets a lot of parallelism. The centralization can also
be avoided by using several arbiters, thus separating sets of data for which
arbitration will be done. If a task accesses data from different arbiters, it
will acquire them arbiter by arbiter, in arbiter pointer value order.

See the <c>tests/datawizard/test_arbiter.cpp</c> example.

Arbiters however do not support the flag ::STARPU_REDUX yet.
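A hedged sketch of setting up an arbiter over a set of handles (the
<c>handles</c> array and its size are assumptions of this example):

\code{.c}
/* Create an arbiter and let it arbitrate accesses to a set of handles. */
starpu_arbiter_t arbiter = starpu_arbiter_create();
for (i = 0; i < nhandles; i++)
    starpu_data_assign_arbiter(handles[i], arbiter);

/* ... submit tasks using these handles, e.g. with ::STARPU_COMMUTE ... */

/* The arbiter can be destroyed once the handles have been unregistered. */
starpu_arbiter_destroy(arbiter);
\endcode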
\section TemporaryBuffers Temporary Buffers

There are two kinds of temporary buffers: temporary data which just pass results
from a task to another, and scratch data which are needed only internally by
tasks.

\subsection TemporaryData Temporary Data

Data can sometimes be entirely produced by a task, and entirely consumed by
another task, without the need for other parts of the application to access
it. In such a case, registration can be done without prior allocation, by using
the special memory node number <c>-1</c>, and passing a zero pointer. StarPU will
actually allocate memory only when the task creating the content gets scheduled,
and destroy it on unregistration.

In addition to this, it can be tedious for the application to have to unregister
the data, since it will not use its content anyway. The unregistration can be
done lazily by using the function starpu_data_unregister_submit(),
which will record that no more tasks accessing the handle will be submitted, so
that it can be freed as soon as the last task accessing it is over.

The following code exemplifies both points: it registers the temporary
data, submits three tasks accessing it, and records the data for automatic
unregistration.

\code{.c}
starpu_vector_data_register(&handle, -1, 0, n, sizeof(float));
starpu_task_insert(&produce_data, STARPU_W, handle, 0);
starpu_task_insert(&compute_data, STARPU_RW, handle, 0);
starpu_task_insert(&summarize_data, STARPU_R, handle, STARPU_W, result_handle, 0);
starpu_data_unregister_submit(handle);
\endcode

The application may also want to see the temporary data initialized
on the fly before being used by the task. This can be done by using
starpu_data_set_reduction_methods() to set an initialization codelet (no redux
codelet is needed).
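A hedged one-liner for this case (<c>init_cl</c> is a hypothetical
initialization codelet, similar to <c>bzero_variable_cl</c> in \ref DataReduction):

\code{.c}
/* Only an initialization codelet is provided; no reduction codelet is needed. */
starpu_data_set_reduction_methods(handle, NULL, &init_cl);
\endcode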
\subsection ScratchData Scratch Data

Some kernels sometimes need temporary data to achieve the computations, i.e. a
workspace. The application could allocate it at the start of the codelet
function, and free it at the end, but this would be costly. It could also
allocate one buffer per worker (similarly to \ref HowToInitializeAComputationLibraryOnceForEachWorker),
but this would
make them systematic and permanent. A more optimized way is to use
the data access mode ::STARPU_SCRATCH, as exemplified below, which
provides per-worker buffers without content consistency. The buffer is
registered only once, using memory node <c>-1</c>, i.e. the application didn't allocate
memory for it, and StarPU will allocate it on demand at task execution.

\code{.c}
starpu_vector_data_register(&workspace, -1, 0, n, sizeof(float));
for (i = 0; i < N; i++)
    starpu_task_insert(&compute, STARPU_R, input[i], STARPU_SCRATCH, workspace, STARPU_W, output[i], 0);
\endcode

StarPU will make sure that the buffer is allocated before executing the task,
and make this allocation per-worker: for CPU workers, notably, each worker has
its own buffer. This means that each task submitted above will actually have its
own workspace, which will actually be the same for all tasks running one after
the other on the same worker. Also, if for instance memory becomes scarce,
StarPU will notice that it can free such buffers easily, since the content does
not matter.

The example <c>examples/pi</c> uses a scratch buffer for some temporary data.
\section TheMultiformatInterface The Multiformat Interface

It may be interesting to represent the same piece of data using two different
data structures: one only used on CPUs, and one only used on GPUs.
This can be done by using the multiformat interface. StarPU
will be able to convert data from one data structure to the other when needed.
Note that the scheduler <c>dmda</c> is the only one optimized for this
interface. The user must provide StarPU with conversion codelets:

\snippet multiformat.c To be included. You should update doxygen if you see this text.

Kernels can be written almost as for any other interface. Note that
::STARPU_MULTIFORMAT_GET_CPU_PTR shall only be used for CPU kernels. CUDA kernels
must use ::STARPU_MULTIFORMAT_GET_CUDA_PTR, and OpenCL kernels must use
::STARPU_MULTIFORMAT_GET_OPENCL_PTR. ::STARPU_MULTIFORMAT_GET_NX may
be used in any kind of kernel.

\code{.c}
static void
multiformat_scal_cpu_func(void *buffers[], void *args)
{
    struct point *aos;
    unsigned int n;

    aos = STARPU_MULTIFORMAT_GET_CPU_PTR(buffers[0]);
    n = STARPU_MULTIFORMAT_GET_NX(buffers[0]);
    ...
}

extern "C" void multiformat_scal_cuda_func(void *buffers[], void *_args)
{
    unsigned int n;
    struct struct_of_arrays *soa;

    soa = (struct struct_of_arrays *) STARPU_MULTIFORMAT_GET_CUDA_PTR(buffers[0]);
    n = STARPU_MULTIFORMAT_GET_NX(buffers[0]);
    ...
}
\endcode

A full example may be found in <c>examples/basic_examples/multiformat.c</c>.
\section DefiningANewDataInterface Defining A New Data Interface

This section shows an example of how to define your own interface, when the
interfaces provided by StarPU do not fit your needs. Here we take a dumb example of
an array of complex numbers represented by two arrays of double values.

Let's thus define a new data interface to manage arrays of complex numbers:

\code{.c}
/* interface for complex numbers */
struct starpu_complex_interface
{
    double *real;
    double *imaginary;
    int nx;
};
\endcode

That structure stores enough to describe <b>one</b> buffer of such kind of
data. It is used for the buffer stored in the main memory, another instance
is used for the buffer stored in a GPU, etc. A <i>data handle</i> is thus a
collection of such structures, to describe each buffer on each memory node.

Note: one should not take pointers into such structures, because StarPU needs
to be able to copy over their content to various places, for instance to
efficiently migrate a data buffer from one data handle to another data handle.
\subsection DefiningANewDataInterface_registration Data registration

Registering such a data to StarPU is easily done using the function
starpu_data_register(). The last
parameter of the function, <c>interface_complex_ops</c>, will be
described below.

\code{.c}
void starpu_complex_data_register(starpu_data_handle_t *handleptr,
                                  unsigned home_node, double *real, double *imaginary, int nx)
{
    struct starpu_complex_interface complex =
    {
        .real = real,
        .imaginary = imaginary,
        .nx = nx
    };

    if (interface_complex_ops.interfaceid == STARPU_UNKNOWN_INTERFACE_ID)
    {
        interface_complex_ops.interfaceid = starpu_data_interface_get_next_id();
    }

    starpu_data_register(handleptr, home_node, &complex, &interface_complex_ops);
}
\endcode

The <c>struct starpu_complex_interface complex</c> is here used just to store the
parameters that the user provided to <c>starpu_complex_data_register</c>.
starpu_data_register() will first allocate the handle, and
then pass the structure <c>starpu_complex_interface</c> to the method
starpu_data_interface_ops::register_data_handle, which records them
within the data handle (it is called once per node by starpu_data_register()):
\code{.c}
static void complex_register_data_handle(starpu_data_handle_t handle, unsigned home_node, void *data_interface)
{
    struct starpu_complex_interface *complex_interface = (struct starpu_complex_interface *) data_interface;

    unsigned node;
    for (node = 0; node < STARPU_MAXNODES; node++)
    {
        struct starpu_complex_interface *local_interface = (struct starpu_complex_interface *)
            starpu_data_get_interface_on_node(handle, node);

        local_interface->nx = complex_interface->nx;
        if (node == home_node)
        {
            local_interface->real = complex_interface->real;
            local_interface->imaginary = complex_interface->imaginary;
        }
        else
        {
            local_interface->real = NULL;
            local_interface->imaginary = NULL;
        }
    }
}
\endcode

If the application provided a home node, the corresponding pointers will be
recorded for that node. Others have no buffer allocated yet.

Possibly the interface needs some dynamic allocation (e.g. to store an array of
dimensions that can have variable size). The corresponding deallocation will then be
done in starpu_data_interface_ops::unregister_data_handle.

Different operations need to be defined for a data interface through
the type starpu_data_interface_ops. We only define here the basic
operations needed to run simple applications. The source code for the
different functions can be found in the file
<c>examples/interface/complex_interface.c</c>, the details of the hooks to be
provided are documented in \ref starpu_data_interface_ops .
\code{.c}
static struct starpu_data_interface_ops interface_complex_ops =
{
    .register_data_handle = complex_register_data_handle,
    .allocate_data_on_node = complex_allocate_data_on_node,
    .copy_methods = &complex_copy_methods,
    .get_size = complex_get_size,
    .footprint = complex_footprint,
    .interfaceid = STARPU_UNKNOWN_INTERFACE_ID,
    .interface_size = sizeof(struct starpu_complex_interface),
};
\endcode
Convenience functions can be defined to access the different fields of the
complex interface from a StarPU data handle after a call to starpu_data_acquire():

\code{.c}
double *starpu_complex_get_real(starpu_data_handle_t handle)
{
    struct starpu_complex_interface *complex_interface =
        (struct starpu_complex_interface *) starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);

    return complex_interface->real;
}

double *starpu_complex_get_imaginary(starpu_data_handle_t handle);
int starpu_complex_get_nx(starpu_data_handle_t handle);
\endcode

Similar functions need to be defined to access the different fields of the
complex interface from a <c>void *</c> pointer to be used within codelet
implementations.
\snippet complex.c To be included. You should update doxygen if you see this text.

Complex data interfaces can then be registered to StarPU.

\code{.c}
double real = 45.0;
double imaginary = 12.0;
starpu_complex_data_register(&handle1, STARPU_MAIN_RAM, &real, &imaginary, 1);
starpu_task_insert(&cl_display, STARPU_R, handle1, 0);
\endcode

and used by codelets.

\code{.c}
void display_complex_codelet(void *descr[], void *_args)
{
    int nx = STARPU_COMPLEX_GET_NX(descr[0]);
    double *real = STARPU_COMPLEX_GET_REAL(descr[0]);
    double *imaginary = STARPU_COMPLEX_GET_IMAGINARY(descr[0]);
    int i;

    for(i=0 ; i<nx ; i++)
    {
        fprintf(stderr, "Complex[%d] = %3.2f + %3.2f i\n", i, real[i], imaginary[i]);
    }
}
\endcode

The whole code for this complex data interface is available in the
directory <c>examples/interface/</c>.
\subsection DefiningANewDataInterface_allocation Data allocation

To be able to run tasks on GPUs etc. StarPU needs to know how to allocate a
buffer for the interface. In our example, two allocations are needed in the
allocation method \c complex_allocate_data_on_node(): one for the real part and one
for the imaginary part.

\code{.c}
static starpu_ssize_t complex_allocate_data_on_node(void *data_interface, unsigned node)
{
    struct starpu_complex_interface *complex_interface = (struct starpu_complex_interface *) data_interface;

    double *addr_real = NULL;
    double *addr_imaginary = NULL;
    starpu_ssize_t requested_memory = complex_interface->nx * sizeof(complex_interface->real[0]);

    addr_real = (double*) starpu_malloc_on_node(node, requested_memory);
    if (!addr_real)
        goto fail_real;
    addr_imaginary = (double*) starpu_malloc_on_node(node, requested_memory);
    if (!addr_imaginary)
        goto fail_imaginary;

    /* update the data properly in consequence */
    complex_interface->real = addr_real;
    complex_interface->imaginary = addr_imaginary;

    return 2*requested_memory;

fail_imaginary:
    starpu_free_on_node(node, (uintptr_t) addr_real, requested_memory);
fail_real:
    return -ENOMEM;
}
\endcode

Here we try to allocate the two parts. If either of them fails, we return
\c -ENOMEM. If they succeed, we can record the obtained pointers and return the
amount of allocated memory (for memory usage accounting).
Conversely, \c complex_free_data_on_node() frees the two parts:

\code{.c}
static void complex_free_data_on_node(void *data_interface, unsigned node)
{
    struct starpu_complex_interface *complex_interface = (struct starpu_complex_interface *) data_interface;
    starpu_ssize_t requested_memory = complex_interface->nx * sizeof(complex_interface->real[0]);

    starpu_free_on_node(node, (uintptr_t) complex_interface->real, requested_memory);
    starpu_free_on_node(node, (uintptr_t) complex_interface->imaginary, requested_memory);
}
\endcode

Note that we have not done anything particular for GPUs or whatsoever: it is
starpu_malloc_on_node() which knows how to actually perform the allocation, and
returns the resulting pointer, be it in main memory, in GPU memory, etc.
\subsection DefiningANewDataInterface_copy Data copy

Now that StarPU knows how to allocate/free a buffer, it needs to be able to
copy data into/from it. Defining a method \c copy_any_to_any() allows StarPU to
perform direct transfers between main memory and GPU memory.

\code{.c}
static int copy_any_to_any(void *src_interface, unsigned src_node,
                           void *dst_interface, unsigned dst_node,
                           void *async_data)
{
    struct starpu_complex_interface *src_complex = src_interface;
    struct starpu_complex_interface *dst_complex = dst_interface;
    int ret = 0;

    if (starpu_interface_copy((uintptr_t) src_complex->real, 0, src_node,
                              (uintptr_t) dst_complex->real, 0, dst_node,
                              src_complex->nx*sizeof(src_complex->real[0]),
                              async_data))
        ret = -EAGAIN;
    if (starpu_interface_copy((uintptr_t) src_complex->imaginary, 0, src_node,
                              (uintptr_t) dst_complex->imaginary, 0, dst_node,
                              src_complex->nx*sizeof(src_complex->imaginary[0]),
                              async_data))
        ret = -EAGAIN;
    return ret;
}
\endcode

We here again have no idea what is main memory or GPU memory, or even whether the
copy is synchronous or asynchronous: we just call starpu_interface_copy()
according to the interface, passing it the pointers, and checking whether it
returned \c -EAGAIN, which means the copy is asynchronous, and StarPU will
appropriately wait for it thanks to the pointer \c async_data.

This copy method is referenced in a structure \ref starpu_data_copy_methods

\code{.c}
static const struct starpu_data_copy_methods complex_copy_methods =
{
    .any_to_any = copy_any_to_any
};
\endcode

which was referenced in the structure \ref starpu_data_interface_ops above.
Other fields of \ref starpu_data_copy_methods make it possible to provide optimized
variants, notably for the case of 2D or 3D matrix tiles with a non-trivial ld.
\subsection DefiningANewDataInterface_pack Data pack/peek/unpack

The copy methods allow for RAM/GPU transfers, but are not enough for e.g.
transferring over MPI. That requires defining the pack/peek/unpack methods. The
principle is that the method starpu_data_interface_ops::pack_data concatenates
the buffer data into a newly-allocated contiguous bytes array; conversely
starpu_data_interface_ops::peek_data extracts from a bytes array into the
buffer data, and starpu_data_interface_ops::unpack_data does the same as
starpu_data_interface_ops::peek_data but also frees the bytes array.

\code{.c}
static int complex_pack_data(starpu_data_handle_t handle, unsigned node, void **ptr, starpu_ssize_t *count)
{
    STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));

    struct starpu_complex_interface *complex_interface = (struct starpu_complex_interface *)
        starpu_data_get_interface_on_node(handle, node);

    *count = complex_get_size(handle);
    if (ptr != NULL)
    {
        char *data;
        data = (void*) starpu_malloc_on_node_flags(node, *count, 0);
        *ptr = data;
        memcpy(data, complex_interface->real, complex_interface->nx*sizeof(double));
        memcpy(data+complex_interface->nx*sizeof(double), complex_interface->imaginary, complex_interface->nx*sizeof(double));
    }

    return 0;
}
\endcode

\c complex_pack_data() first computes the size to be allocated, then allocates it,
and copies over into it the content of the two real and imaginary arrays.
\code{.c}
static int complex_peek_data(starpu_data_handle_t handle, unsigned node, void *ptr, size_t count)
{
    char *data = ptr;
    STARPU_ASSERT(starpu_data_test_if_allocated_on_node(handle, node));

    struct starpu_complex_interface *complex_interface = (struct starpu_complex_interface *)
        starpu_data_get_interface_on_node(handle, node);

    STARPU_ASSERT(count == 2 * complex_interface->nx * sizeof(double));
    memcpy(complex_interface->real, data, complex_interface->nx*sizeof(double));
    memcpy(complex_interface->imaginary, data+complex_interface->nx*sizeof(double), complex_interface->nx*sizeof(double));

    return 0;
}
\endcode

\c complex_peek_data() simply uses \c memcpy() to copy over from the bytes array into the data buffer.

\code{.c}
static int complex_unpack_data(starpu_data_handle_t handle, unsigned node, void *ptr, size_t count)
{
    complex_peek_data(handle, node, ptr, count);

    starpu_free_on_node_flags(node, (uintptr_t) ptr, count, 0);

    return 0;
}
\endcode

And \c complex_unpack_data() just calls \c complex_peek_data() and releases the bytes array.
\section SpecifyingATargetNode Specifying A Target Node For Task Data

When executing a task on a GPU for instance, StarPU would normally copy all the
needed data for the task to the embedded memory of the GPU. It may however
happen that the task kernel would rather have some of the data kept in
main memory instead of copied to the GPU, a pivoting vector for instance.
This can be achieved by setting the flag starpu_codelet::specific_nodes to
<c>1</c>, and then filling the array starpu_codelet::nodes (or starpu_codelet::dyn_nodes when
starpu_codelet::nbuffers is greater than \ref STARPU_NMAXBUFS) with the node numbers
where data should be copied to, or ::STARPU_SPECIFIC_NODE_LOCAL to let
StarPU copy it to the memory node where the task will be executed.
::STARPU_SPECIFIC_NODE_CPU can also be used to request data to be
put in CPU-accessible memory (and let StarPU choose the NUMA node).
::STARPU_SPECIFIC_NODE_FAST and ::STARPU_SPECIFIC_NODE_SLOW can also be
used.

For instance,
with the following codelet:

\code{.c}
struct starpu_codelet cl =
{
    .cuda_funcs = { kernel },
    .nbuffers = 2,
    .modes = {STARPU_RW, STARPU_RW},
    .specific_nodes = 1,
    .nodes = {STARPU_SPECIFIC_NODE_CPU, STARPU_SPECIFIC_NODE_LOCAL},
};
\endcode

the first data of the task will be kept in the CPU memory, while the second
data will be copied to the CUDA GPU as usual. A working example is available in
<c>tests/datawizard/specific_node.c</c>.
With the following codelet:

\code{.c}
struct starpu_codelet cl =
{
    .cuda_funcs = { kernel },
    .nbuffers = 2,
    .modes = {STARPU_RW, STARPU_RW},
    .specific_nodes = 1,
    .nodes = {STARPU_SPECIFIC_NODE_LOCAL, STARPU_SPECIFIC_NODE_SLOW},
};
\endcode

the first data will be copied into fast (but probably size-limited) local memory
while the second data will be left in slow (but large) memory. This makes sense
when the kernel does not make that many accesses to the second data, so that having
it remote, e.g. over a PCI bus, is not a performance problem, and it avoids
filling the fast local memory with data which does not need the performance.

In cases where the kernel is fine with some data being either local or in the
main memory, ::STARPU_SPECIFIC_NODE_LOCAL_OR_CPU can be used. StarPU will then
be free to leave the data in the main memory and let the kernel access it from
accelerators, or to move it to the accelerator before starting the kernel, for
instance:

\code{.c}
struct starpu_codelet cl =
{
    .cuda_funcs = { kernel },
    .nbuffers = 2,
    .modes = {STARPU_RW, STARPU_R},
    .specific_nodes = 1,
    .nodes = {STARPU_SPECIFIC_NODE_LOCAL, STARPU_SPECIFIC_NODE_LOCAL_OR_CPU},
};
\endcode

*/