
/*
 * This file is part of the StarPU Handbook.
 * Copyright (C) 2009--2011  Université de Bordeaux
 * Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017  CNRS
 * Copyright (C) 2011, 2012  INRIA
 * See the file version.doxy for copying conditions.
 */

/*! \page DataManagement Data Management

TODO: intro discussing coherency, among other things

\section DataInterface Data Interface

StarPU provides several data interfaces for programmers to describe the
data layout of their application. There are predefined interfaces already
available in StarPU. Users can also define new data interfaces, as explained
in \ref DefiningANewDataInterface. All functions provided by StarPU are
documented in \ref API_Data_Interfaces. You will find a short list below.
\subsection VariableDataInterface Variable Data Interface

A variable is an element of a given size in bytes, typically a scalar. Here is an
example of how to register a variable with StarPU by using
starpu_variable_data_register().

\code{.c}
float var = 42.0;
starpu_data_handle_t var_handle;
starpu_variable_data_register(&var_handle, STARPU_MAIN_RAM, (uintptr_t)&var, sizeof(var));
\endcode
\subsection VectorDataInterface Vector Data Interface

A vector is a fixed number of elements of a given size. Here is an
example of how to register a vector with StarPU by using
starpu_vector_data_register().

\code{.c}
float vector[NX];
starpu_data_handle_t vector_handle;
starpu_vector_data_register(&vector_handle, STARPU_MAIN_RAM, (uintptr_t)vector, NX, sizeof(vector[0]));
\endcode
\subsection MatrixDataInterface Matrix Data Interface

To register 2-D matrices with a potential padding, one can use the
matrix data interface. Here is an example of how to register a matrix
with StarPU by using starpu_matrix_data_register().

\code{.c}
float *matrix;
starpu_data_handle_t matrix_handle;

matrix = (float*)malloc(width * height * sizeof(float));
starpu_matrix_data_register(&matrix_handle, STARPU_MAIN_RAM, (uintptr_t)matrix, width, width, height, sizeof(float));
\endcode
\subsection BlockDataInterface Block Data Interface

To register 3-D blocks with potential padding on the Y and Z dimensions,
one can use the block data interface. Here is an example of how to
register a block with StarPU by using starpu_block_data_register().

\code{.c}
float *block;
starpu_data_handle_t block_handle;

block = (float*)malloc(nx*ny*nz*sizeof(float));
starpu_block_data_register(&block_handle, STARPU_MAIN_RAM, (uintptr_t)block, nx, nx*ny, nx, ny, nz, sizeof(float));
\endcode
\subsection BCSRDataInterface BCSR Data Interface

BCSR (Blocked Compressed Sparse Row Representation) sparse matrix data
can be registered to StarPU using the BCSR data interface. Here is an
example of how to do so by using starpu_bcsr_data_register().

\code{.c}
/*
 * We use the following matrix:
 *
 *   +----------------+
 *   |  0   1   0   0 |
 *   |  2   3   0   0 |
 *   |  4   5   8   9 |
 *   |  6   7  10  11 |
 *   +----------------+
 *
 * nzval  = [0, 1, 2, 3] ++ [4, 5, 6, 7] ++ [8, 9, 10, 11]
 * colind = [0, 0, 1]
 * rowptr = [0, 1, 3]
 * r = c = 2
 */

/* Size of the blocks */
#define R          2
#define C          2
#define NROWS      2
#define NNZ_BLOCKS 3               /* out of 4 */
#define NZVAL_SIZE (R*C*NNZ_BLOCKS)

int nzval[NZVAL_SIZE] =
{
	0, 1, 2, 3,    /* First block  */
	4, 5, 6, 7,    /* Second block */
	8, 9, 10, 11   /* Third block  */
};
uint32_t colind[NNZ_BLOCKS] =
{
	0, /* block-column index for first block in nzval */
	0, /* block-column index for second block in nzval */
	1  /* block-column index for third block in nzval */
};
uint32_t rowptr[NROWS+1] =
{
	0,         /* block-index in nzval of the first block of the first row */
	1,         /* block-index in nzval of the first block of the second row */
	NNZ_BLOCKS /* number of blocks, to allow an easier element access for the kernels */
};

starpu_data_handle_t bcsr_handle;
starpu_bcsr_data_register(&bcsr_handle,
			  STARPU_MAIN_RAM,
			  NNZ_BLOCKS,
			  NROWS,
			  (uintptr_t) nzval,
			  colind,
			  rowptr,
			  0, /* firstentry */
			  R,
			  C,
			  sizeof(nzval[0]));
\endcode

StarPU provides an example of how to deal with such matrices in
<c>examples/spmv</c>.
\subsection CSRDataInterface CSR Data Interface

TODO
\section DataManagement Data Management

When the application allocates data, whenever possible it should use
the starpu_malloc() function, which will ask CUDA or OpenCL to make
the allocation itself and pin the corresponding allocated memory, or use the
starpu_memory_pin() function to pin memory allocated by other means, such as
local arrays. This is needed to permit asynchronous data transfers, i.e. to
permit data transfers to overlap with computations. Otherwise, the trace will
show that the <c>DriverCopyAsync</c> state takes a lot of time: this is
because CUDA or OpenCL then reverts to synchronous transfers.
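
For instance, a pinned buffer can be obtained or registered as follows (a
minimal sketch; the buffer size <c>NX</c> is just an example):

\code{.c}
float *buffer;

/* pinned allocation, performed by CUDA or OpenCL when available */
starpu_malloc((void **)&buffer, NX * sizeof(float));
/* ... register and use buffer as usual ... */
starpu_free(buffer);

/* or pin memory that was allocated by other means */
static float array[NX];
starpu_memory_pin(array, sizeof(array));
\endcode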
By default, StarPU leaves replicates of data wherever they were used, in case they
will be re-used by other tasks, thus saving the data transfer time. When some
task modifies some data, all the other replicates are invalidated, and only the
processing unit which ran this task will have a valid replicate of the data. If the application knows
that this data will not be re-used by further tasks, it should advise StarPU to
immediately replicate it to a desired list of memory nodes (given through a
bitmask). This can be understood like the write-through mode of CPU caches.

\code{.c}
starpu_data_set_wt_mask(img_handle, 1<<0);
\endcode

will for instance request to always automatically transfer a replicate into the
main memory (node <c>0</c>), as bit <c>0</c> of the write-through bitmask is being set.

\code{.c}
starpu_data_set_wt_mask(img_handle, ~0U);
\endcode

will request to always automatically broadcast the updated data to all memory
nodes.

Setting the write-through mask to <c>~0U</c> can also be useful to make sure all
memory nodes always have a copy of the data, so that it is never evicted when
memory gets scarce.
Implicit data dependency computation can become expensive if a lot
of tasks access the same piece of data. If no dependency is required
on some piece of data (e.g. because it is only accessed in read-only
mode, or because write accesses are actually commutative), use the
function starpu_data_set_sequential_consistency_flag() to disable
implicit dependencies on that data.
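
For instance, a minimal sketch:

\code{.c}
/* no implicit dependencies will be computed for this handle */
starpu_data_set_sequential_consistency_flag(handle, 0);
\endcode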
In the same vein, accumulation of results in the same data can become a
bottleneck. The use of the mode ::STARPU_REDUX makes it possible to optimize such
accumulation (see \ref DataReduction). To a lesser extent, the use of
the flag ::STARPU_COMMUTE keeps the bottleneck (see \ref DataCommute), but at least allows
the accumulation to happen in any order.

Applications often need a piece of data just for temporary results. In such a case,
registration can be made without an initial value, for instance this produces a vector data:

\code{.c}
starpu_vector_data_register(&handle, -1, 0, n, sizeof(float));
\endcode

StarPU will then allocate the actual buffer only when it is actually needed,
e.g. directly on the GPU without allocating in main memory.

In the same vein, once the temporary results are not useful any more, the
data should be thrown away. If the handle is not to be reused, it can be
unregistered:

\code{.c}
starpu_data_unregister_submit(handle);
\endcode

the actual unregistration will be done after all tasks working on the handle
terminate.

If the handle is to be reused, instead of unregistering it, it can simply be invalidated:

\code{.c}
starpu_data_invalidate_submit(handle);
\endcode

the buffers containing the current value will then be freed, and reallocated
only when another task writes some value to the handle.
\section DataPrefetch Data Prefetch

The scheduling policies <c>heft</c>, <c>dmda</c> and <c>pheft</c>
perform data prefetch (see \ref STARPU_PREFETCH):
as soon as a scheduling decision is taken for a task, requests are issued to
transfer its required data to the target processing unit, if needed, so that
when the processing unit actually starts the task, its data will hopefully be
already available and it will not have to wait for the transfer to finish.

The application may want to perform some manual prefetching, for several reasons
such as excluding initial data transfers from performance measurements, or
setting up an initial statically-computed data distribution on the machine
before submitting tasks, which will thus guide StarPU toward an initial task
distribution (since StarPU will try to avoid further transfers).

This can be achieved by giving the function starpu_data_prefetch_on_node() the
handle and the desired target memory node. The
starpu_data_idle_prefetch_on_node() variant can be used to issue the transfer
only when the bus is idle.

Conversely, one can advise StarPU that some data will not be useful in the
close future by calling starpu_data_wont_use(). StarPU will then write its value
back to its home node, and evict it from GPUs when room is needed.
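
For instance, a minimal sketch (assuming <c>node</c> is the memory node of the
target GPU, which can be obtained with starpu_worker_get_memory_node()):

\code{.c}
/* asynchronously push the data to the target node before tasks need it */
starpu_data_prefetch_on_node(handle, node, 1);

/* ... submit and run tasks ... */

/* advise StarPU that the data will not be needed any time soon */
starpu_data_wont_use(handle);
\endcode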
\section PartitioningData Partitioning Data

An existing piece of data can be partitioned in sub parts to be used by different tasks, for instance:

\code{.c}
#define NX 1048576
#define PARTS 16
int vector[NX];
starpu_data_handle_t handle;

/* Declare data to StarPU */
starpu_vector_data_register(&handle, STARPU_MAIN_RAM, (uintptr_t)vector,
                            NX, sizeof(vector[0]));

/* Partition the vector in PARTS sub-vectors */
struct starpu_data_filter f =
{
	.filter_func = starpu_vector_filter_block,
	.nchildren = PARTS
};
starpu_data_partition(handle, &f);
\endcode

The task submission then uses the function starpu_data_get_sub_data()
to retrieve the sub-handles to be passed as tasks parameters.

\code{.c}
/* Submit a task on each sub-vector */
for (i=0; i<starpu_data_get_nb_children(handle); i++)
{
	/* Get subdata number i (there is only 1 dimension) */
	starpu_data_handle_t sub_handle = starpu_data_get_sub_data(handle, 1, i);
	struct starpu_task *task = starpu_task_create();

	task->handles[0] = sub_handle;
	task->cl = &cl;
	task->synchronous = 1;
	task->cl_arg = &factor;
	task->cl_arg_size = sizeof(factor);

	starpu_task_submit(task);
}
\endcode

Partitioning can be applied several times, see
<c>examples/basic_examples/mult.c</c> and <c>examples/filters/</c>, as well as
the sketch below.
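
For instance, a matrix can be partitioned along both dimensions at once with
starpu_data_map_filters(); here is a minimal sketch, assuming an
already-registered matrix <c>handle</c>:

\code{.c}
struct starpu_data_filter f_vert =
{
	.filter_func = starpu_matrix_filter_vertical_block,
	.nchildren = 2
};
struct starpu_data_filter f_horiz =
{
	.filter_func = starpu_matrix_filter_block,
	.nchildren = 2
};

/* Apply both filters recursively, yielding 2x2 sub-matrices */
starpu_data_map_filters(handle, 2, &f_vert, &f_horiz);

/* Retrieve sub-matrix (i, j): two dimensions of indexes */
starpu_data_handle_t sub_handle = starpu_data_get_sub_data(handle, 2, i, j);
\endcode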
Whenever the whole piece of data is already available, the partitioning will
be done in-place, i.e. without allocating new buffers but just using pointers
inside the existing copy. This is particularly important to be aware of when
using OpenCL, where the kernel parameters are not pointers, but <c>cl_mem</c> handles. The
kernel thus needs to be also passed the offset within the OpenCL buffer:

\code{.c}
void opencl_func(void *buffers[], void *cl_arg)
{
	cl_mem vector = (cl_mem) STARPU_VECTOR_GET_DEV_HANDLE(buffers[0]);
	unsigned offset = STARPU_VECTOR_GET_OFFSET(buffers[0]);

	...
	clSetKernelArg(kernel, 0, sizeof(vector), &vector);
	clSetKernelArg(kernel, 1, sizeof(offset), &offset);
	...
}
\endcode

And the kernel has to shift from the pointer passed by the OpenCL driver:

\code{.c}
__kernel void opencl_kernel(__global int *vector, unsigned offset)
{
	vector = (__global int *)((__global char *)vector + offset);
	...
}
\endcode

StarPU provides various interfaces and filters for matrices, vectors, etc.,
but applications can also write their own data interfaces and filters, see
<c>examples/interface</c> and <c>examples/filters/custom_mf</c> for examples,
and see \ref DefiningANewDataInterface and \ref DefiningANewDataFilter
for documentation.
\section AsynchronousPartitioning Asynchronous Partitioning

The partitioning functions described in the previous section are synchronous:
starpu_data_partition() and starpu_data_unpartition() both wait for all the tasks
currently working on the data. This can be a bottleneck for the application.

An asynchronous API also exists; it works only on handles with sequential
consistency. The principle is to first plan the partitioning, which returns
data handles of the partition, which are not functional yet. Along with other task
submissions, one can submit the actual partitioning, and then use the handles
of the partition. Before using the handle of the whole data again, one has to submit
the unpartitioning. <c>fmultiple_submit</c> is a complete example using this
technique.

In short, we first register a matrix and plan the partitioning:

\code{.c}
starpu_matrix_data_register(&handle, STARPU_MAIN_RAM, (uintptr_t)matrix, NX, NX, NY, sizeof(matrix[0]));

starpu_data_handle_t vert_handle[PARTS];
struct starpu_data_filter f_vert =
{
	.filter_func = starpu_matrix_filter_block,
	.nchildren = PARTS
};
starpu_data_partition_plan(handle, &f_vert, vert_handle);
\endcode

starpu_data_partition_plan() returns the handles for the partition in <c>vert_handle</c>.

One can submit tasks working on the main handle, but not yet on the <c>vert_handle</c>
handles. Now we submit the partitioning:

\code{.c}
starpu_data_partition_submit(handle, PARTS, vert_handle);
\endcode

And now we can submit tasks working on the <c>vert_handle</c> handles (and not on the main
handle any more). Eventually we want to work on the main handle again, so we
submit the unpartitioning:

\code{.c}
starpu_data_unpartition_submit(handle, PARTS, vert_handle, -1);
\endcode

And now we can submit tasks working on the main handle again.

All this code is asynchronous: it merely submits the tasks and the partitioning
and unpartitioning operations, which will be executed at runtime.

Planning several partitionings of the same data is also possible; one just has
to submit the unpartitioning (to get back to the initial handle) before submitting
another partitioning, as sketched below.
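
A minimal sketch of switching between two planned partitionings of the same
matrix (the <c>f_horiz</c> filter and <c>horiz_handle</c> handles are
hypothetical names):

\code{.c}
starpu_data_handle_t horiz_handle[PARTS];
struct starpu_data_filter f_horiz =
{
	.filter_func = starpu_matrix_filter_vertical_block,
	.nchildren = PARTS
};
starpu_data_partition_plan(handle, &f_horiz, horiz_handle);

/* ... tasks working on the vert_handle handles ... */

/* get back to the initial handle, then switch to the other partitioning */
starpu_data_unpartition_submit(handle, PARTS, vert_handle, -1);
starpu_data_partition_submit(handle, PARTS, horiz_handle);

/* ... tasks working on the horiz_handle handles ... */
\endcode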
It is also possible to activate several partitionings at the same time, in
read-only mode, by using starpu_data_partition_readonly_submit(). A complete
example is available in <c>examples/filters/fmultiple_submit_readonly.c</c>.
\section ManualPartitioning Manual Partitioning

One can also handle partitioning by hand, by registering several views on the
same piece of data. The idea is then to manage the coherency of the various
views through the common buffer in the main memory.
<c>fmultiple_manual</c> is a complete example using this technique.

In short, we first register the same matrix several times:

\code{.c}
starpu_matrix_data_register(&handle, STARPU_MAIN_RAM, (uintptr_t)matrix, NX, NX, NY, sizeof(matrix[0]));

for (i = 0; i < PARTS; i++)
	starpu_matrix_data_register(&vert_handle[i], STARPU_MAIN_RAM, (uintptr_t)&matrix[0][i*(NX/PARTS)], NX, NX/PARTS, NY, sizeof(matrix[0][0]));
\endcode

Since StarPU is not aware that the two kinds of handles are actually pointing to the same
data, there is a danger of inadvertently submitting tasks to both views, which
would create a mess since StarPU would not guarantee any coherency between the two
views. To make sure we don't do this, we invalidate the view that we will not
use:

\code{.c}
for (i = 0; i < PARTS; i++)
	starpu_data_invalidate(vert_handle[i]);
\endcode

Then we can safely work on <c>handle</c>.

When we want to switch to the vertical slice view, all we need to do is bring
coherency between them by running an empty task on the home node of the data:

\code{.c}
void empty(void *buffers[], void *cl_arg)
{ }

struct starpu_codelet cl_switch =
{
	.cpu_funcs = {empty},
	.nbuffers = STARPU_VARIABLE_NBUFFERS,
};

ret = starpu_task_insert(&cl_switch, STARPU_RW, handle,
			 STARPU_W, vert_handle[0],
			 STARPU_W, vert_handle[1],
			 0);
\endcode

The execution of the <c>switch</c> task will get the matrix data back into the
main memory, and thus the vertical slices will get the updated value there.

Again, we prefer to make sure that we don't accidentally access the matrix through the whole-matrix handle:

\code{.c}
starpu_data_invalidate_submit(handle);
\endcode

And now we can start using the vertical slices, etc.
\section DefiningANewDataFilter Defining A New Data Filter

StarPU provides a series of predefined filters in \ref API_Data_Partition, but
additional filters can be defined by the application. The principle is that the
filter function just fills the memory location of the i-th subpart of a data.
Examples are provided in <c>src/datawizard/interfaces/*_filters.c</c>,
and \ref starpu_data_filter::filter_func documents the details of the hooks to
be provided.
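
As an illustration, here is a minimal sketch of a custom vector filter which
splits a vector into <c>nparts</c> contiguous chunks, much like the predefined
starpu_vector_filter_block does (see <c>src/datawizard/interfaces/vector_filters.c</c>
for the real implementation); it assumes the standard layout of
struct starpu_vector_interface:

\code{.c}
void my_vector_filter_block(void *father_interface, void *child_interface,
			    struct starpu_data_filter *f,
			    unsigned id, unsigned nparts)
{
	struct starpu_vector_interface *father = (struct starpu_vector_interface *) father_interface;
	struct starpu_vector_interface *child = (struct starpu_vector_interface *) child_interface;

	uint32_t nx = father->nx;
	size_t elemsize = father->elemsize;

	/* size of the id-th chunk, spreading the remainder over the first chunks */
	uint32_t chunk = nx / nparts;
	uint32_t rem = nx % nparts;
	uint32_t child_nx = chunk + (id < rem ? 1 : 0);
	size_t offset = ((size_t)id * chunk + (id < rem ? id : rem)) * elemsize;

	child->id = father->id;
	child->nx = child_nx;
	child->elemsize = elemsize;

	/* the child view simply points inside the father buffer */
	if (father->ptr)
		child->ptr = father->ptr + offset;
	child->offset = father->offset + offset;
	child->dev_handle = father->dev_handle;
}
\endcode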
\section DataReduction Data Reduction

In various cases, some piece of data is used to accumulate intermediate
results. For instance, the dot product of a vector, maximum/minimum finding,
the histogram of a photograph, etc. When these results are produced all over
the machine, it would not be efficient to accumulate them in only one place,
incurring data transmissions each time as well as access concurrency.

StarPU provides a mode ::STARPU_REDUX, which makes it possible to optimize
this case: it will allocate a buffer on each memory node, and accumulate
intermediate results there. When the data is eventually accessed in the normal
mode ::STARPU_R, StarPU will collect the intermediate results in just one
buffer.

For this to work, the user has to use the function
starpu_data_set_reduction_methods() to declare how to initialize these
buffers, and how to assemble partial results.

For instance, <c>cg</c> uses that to optimize its dot product: it first defines
the codelets for initialization and reduction:

\code{.c}
struct starpu_codelet bzero_variable_cl =
{
	.cpu_funcs = { bzero_variable_cpu },
	.cpu_funcs_name = { "bzero_variable_cpu" },
	.cuda_funcs = { bzero_variable_cuda },
	.nbuffers = 1,
};

static void accumulate_variable_cpu(void *descr[], void *cl_arg)
{
	double *v_dst = (double *)STARPU_VARIABLE_GET_PTR(descr[0]);
	double *v_src = (double *)STARPU_VARIABLE_GET_PTR(descr[1]);

	*v_dst = *v_dst + *v_src;
}

static void accumulate_variable_cuda(void *descr[], void *cl_arg)
{
	double *v_dst = (double *)STARPU_VARIABLE_GET_PTR(descr[0]);
	double *v_src = (double *)STARPU_VARIABLE_GET_PTR(descr[1]);

	cublasDaxpy(1, (double)1.0, v_src, 1, v_dst, 1);
	cudaStreamSynchronize(starpu_cuda_get_local_stream());
}

struct starpu_codelet accumulate_variable_cl =
{
	.cpu_funcs = { accumulate_variable_cpu },
	.cpu_funcs_name = { "accumulate_variable_cpu" },
	.cuda_funcs = { accumulate_variable_cuda },
	.nbuffers = 2,
};
\endcode
and attaches them as reduction methods for its handle <c>dtq</c>:

\code{.c}
starpu_variable_data_register(&dtq_handle, -1, NULL, sizeof(type));
starpu_data_set_reduction_methods(dtq_handle,
				  &accumulate_variable_cl, &bzero_variable_cl);
\endcode

and <c>dtq_handle</c> can now be used in mode ::STARPU_REDUX for the
dot products with partitioned vectors:

\code{.c}
for (b = 0; b < nblocks; b++)
	starpu_task_insert(&dot_kernel_cl,
			   STARPU_REDUX, dtq_handle,
			   STARPU_R, starpu_data_get_sub_data(v1, 1, b),
			   STARPU_R, starpu_data_get_sub_data(v2, 1, b),
			   0);
\endcode

During registration, we have here provided <c>NULL</c>, i.e. there is
no initial value to be taken into account during reduction. StarPU
will thus only take into account the contributions from the
<c>dot_kernel_cl</c> tasks. Also, it will not allocate any memory for
<c>dtq_handle</c> before the <c>dot_kernel_cl</c> tasks are ready to run.

If another dot product has to be performed, one could unregister
<c>dtq_handle</c>, and re-register it. But one can also call
starpu_data_invalidate_submit() with the parameter <c>dtq_handle</c>,
which will clear all data from the handle, thus resetting it back to
the initial status <c>register(NULL)</c>, as shown below.
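
\code{.c}
/* reset the accumulator to the uninitialized, memory-free state */
starpu_data_invalidate_submit(dtq_handle);

/* dtq_handle can now be used for another reduction */
\endcode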
The example <c>cg</c> also uses reduction for the blocked gemv kernel,
leading to yet more relaxed dependencies and more parallelism.

::STARPU_REDUX can also be passed to starpu_mpi_task_insert() in the MPI
case. This will however not produce any MPI communication, but just pass
::STARPU_REDUX to the underlying starpu_task_insert(). It is up to the
application to call starpu_mpi_redux_data(), which posts tasks which will
reduce the partial results among MPI nodes into the MPI node which owns the
data. For instance, some hypothetical application which collects partial results
into data <c>res</c>, then uses it for other computation, before looping again
with a new reduction:

\code{.c}
for (i = 0; i < 100; i++)
{
	starpu_mpi_task_insert(MPI_COMM_WORLD, &init_res, STARPU_W, res, 0);
	starpu_mpi_task_insert(MPI_COMM_WORLD, &work, STARPU_RW, A,
			       STARPU_R, B, STARPU_REDUX, res, 0);
	starpu_mpi_redux_data(MPI_COMM_WORLD, res);
	starpu_mpi_task_insert(MPI_COMM_WORLD, &work2, STARPU_RW, B, STARPU_R, res, 0);
}
\endcode
\section DataCommute Commute Data Access

By default, the implicit dependencies computed from data accesses use the
sequential semantic. Notably, write accesses are always serialized in the order
of submission. In some applications, however, the write contributions can actually
be performed in any order without affecting the eventual result. In that case
it is useful to drop the strictly sequential semantic, to improve parallelism
by allowing StarPU to reorder the write accesses. This can be done by using
the ::STARPU_COMMUTE data access flag. Accesses without this flag will however
properly be serialized against accesses with this flag. For instance:

\code{.c}
starpu_task_insert(&cl1,
		   STARPU_R, h,
		   STARPU_RW, handle,
		   0);
starpu_task_insert(&cl2,
		   STARPU_R, handle1,
		   STARPU_RW|STARPU_COMMUTE, handle,
		   0);
starpu_task_insert(&cl2,
		   STARPU_R, handle2,
		   STARPU_RW|STARPU_COMMUTE, handle,
		   0);
starpu_task_insert(&cl3,
		   STARPU_R, g,
		   STARPU_RW, handle,
		   0);
\endcode

The two tasks running <c>cl2</c> will be able to commute: depending on whether the
value of <c>handle1</c> or <c>handle2</c> becomes available first, the corresponding task
running <c>cl2</c> will start first. The task running <c>cl1</c> will however always be run
before them, and the task running <c>cl3</c> will always be run after them.

If a lot of tasks use the commute access on the same set of data and a lot of
them are ready at the same time, it may become interesting to use an arbiter,
see \ref ConcurrentDataAccess.
\section ConcurrentDataAccess Concurrent Data Accesses

When several tasks are ready and will work on several data, StarPU is faced with
the classical Dining Philosophers problem, and has to determine the order in
which it will run the tasks.

Data accesses usually use sequential ordering, so data accesses are usually
already serialized, and thus by default StarPU uses the Dijkstra solution which
scales very well in terms of overhead: tasks will just acquire data one by one
by data handle pointer value order.

When sequential ordering is disabled or the ::STARPU_COMMUTE flag is used, there
may be a lot of concurrent accesses to the same data, and the Dijkstra solution
gets only poor parallelism, typically in some pathological cases which do happen
in various applications. In that case, one can use a data access arbiter, which
implements the classical centralized solution for the Dining Philosophers
problem. This is more expensive in terms of overhead since it is centralized,
but it opportunistically gets a lot of parallelism. The centralization can also
be avoided by using several arbiters, thus separating sets of data for which
arbitration will be done. If a task accesses data from different arbiters, it
will acquire them arbiter by arbiter, in arbiter pointer value order.

See the <c>tests/datawizard/test_arbiter.cpp</c> example.

Arbiters however do not support the ::STARPU_REDUX flag yet.
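
A minimal sketch of setting up an arbiter for a set of handles:

\code{.c}
/* route concurrent accesses to these handles through one arbiter */
starpu_arbiter_t arbiter = starpu_arbiter_create();
starpu_data_assign_arbiter(handle1, arbiter);
starpu_data_assign_arbiter(handle2, arbiter);

/* ... submit tasks, wait for their completion ... */

starpu_arbiter_destroy(arbiter);
\endcode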
\section TemporaryBuffers Temporary Buffers

There are two kinds of temporary buffers: temporary data which just pass results
from a task to another, and scratch data which are needed only internally by
tasks.

\subsection TemporaryData Temporary Data

Data can sometimes be entirely produced by a task, and entirely consumed by
another task, without the need for other parts of the application to access
it. In such a case, registration can be done without prior allocation, by using
the special memory node number <c>-1</c>, and passing a zero pointer. StarPU will
actually allocate memory only when the task creating the content gets scheduled,
and destroy it on unregistration.

In addition to that, it can be tedious for the application to have to unregister
the data, since it will not use its content anyway. The unregistration can be
done lazily by using the function starpu_data_unregister_submit(),
which will record that no more tasks accessing the handle will be submitted, so
that it can be freed as soon as the last task accessing it is over.

The following code exemplifies both points: it registers the temporary
data, submits three tasks accessing it, and records the data for automatic
unregistration.

\code{.c}
starpu_vector_data_register(&handle, -1, 0, n, sizeof(float));
starpu_task_insert(&produce_data, STARPU_W, handle, 0);
starpu_task_insert(&compute_data, STARPU_RW, handle, 0);
starpu_task_insert(&summarize_data, STARPU_R, handle, STARPU_W, result_handle, 0);
starpu_data_unregister_submit(handle);
\endcode

The application may also want to see the temporary data initialized
on the fly before being used by the task. This can be done by using
starpu_data_set_reduction_methods() to set an initialization codelet (no redux
codelet is needed), as sketched below.
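
A minimal sketch, reusing the <c>bzero_variable_cl</c> initialization codelet
from \ref DataReduction:

\code{.c}
/* the handle will be zero-initialized on the fly; no redux codelet needed */
starpu_data_set_reduction_methods(handle, NULL, &bzero_variable_cl);
\endcode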
\subsection ScratchData Scratch Data

Some kernels sometimes need temporary data to achieve the computations, i.e. a
workspace. The application could allocate it at the start of the codelet
function, and free it at the end, but this would be costly. It could also
allocate one buffer per worker (similarly to \ref HowToInitializeAComputationLibraryOnceForEachWorker),
but this would
make them systematic and permanent. A more optimized way is to use
the data access mode ::STARPU_SCRATCH, as exemplified below, which
provides per-worker buffers without content consistency. The buffer is
registered only once, using memory node <c>-1</c>, i.e. the application didn't allocate
memory for it, and StarPU will allocate it on demand at task execution.

\code{.c}
starpu_vector_data_register(&workspace, -1, 0, NX, sizeof(float));
for (i = 0; i < N; i++)
	starpu_task_insert(&compute, STARPU_R, input[i],
			   STARPU_SCRATCH, workspace, STARPU_W, output[i], 0);
\endcode

StarPU will make sure that the buffer is allocated before executing the task,
and make this allocation per-worker: for CPU workers, notably, each worker has
its own buffer. This means that each task submitted above will actually have its
own workspace, which will actually be the same for all tasks running one after
the other on the same worker. Also, if for instance memory becomes scarce,
StarPU will notice that it can free such buffers easily, since the content does
not matter.
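
Inside the codelet, the scratch buffer is accessed like any other vector
buffer (a minimal sketch):

\code{.c}
static void compute(void *buffers[], void *cl_arg)
{
	float *input = (float *)STARPU_VECTOR_GET_PTR(buffers[0]);
	float *workspace = (float *)STARPU_VECTOR_GET_PTR(buffers[1]); /* scratch */
	float *output = (float *)STARPU_VECTOR_GET_PTR(buffers[2]);

	/* ... use workspace freely; its previous content is undefined ... */
}
\endcode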
The example <c>examples/pi</c> uses scratch data for its temporary buffer.
\section TheMultiformatInterface The Multiformat Interface

It may be interesting to represent the same piece of data using two different
data structures: one that would only be used on CPUs, and one that would only
be used on GPUs. This can be done by using the multiformat interface. StarPU
will be able to convert data from one data structure to the other when needed.
Note that the scheduler <c>dmda</c> is the only one optimized for this
interface. The user must provide StarPU with conversion codelets:

\snippet multiformat.c To be included. You should update doxygen if you see this text.

Kernels can be written almost as for any other interface. Note that
::STARPU_MULTIFORMAT_GET_CPU_PTR shall only be used for CPU kernels. CUDA kernels
must use ::STARPU_MULTIFORMAT_GET_CUDA_PTR, and OpenCL kernels must use
::STARPU_MULTIFORMAT_GET_OPENCL_PTR. ::STARPU_MULTIFORMAT_GET_NX may
be used in any kind of kernel.

\code{.c}
static void
multiformat_scal_cpu_func(void *buffers[], void *args)
{
	struct point *aos;
	unsigned int n;

	aos = STARPU_MULTIFORMAT_GET_CPU_PTR(buffers[0]);
	n = STARPU_MULTIFORMAT_GET_NX(buffers[0]);
	...
}

extern "C" void multiformat_scal_cuda_func(void *buffers[], void *_args)
{
	unsigned int n;
	struct struct_of_arrays *soa;

	soa = (struct struct_of_arrays *) STARPU_MULTIFORMAT_GET_CUDA_PTR(buffers[0]);
	n = STARPU_MULTIFORMAT_GET_NX(buffers[0]);

	...
}
\endcode

A full example may be found in <c>examples/basic_examples/multiformat.c</c>.
\section DefiningANewDataInterface Defining A New Data Interface

Let's define a new data interface to manage complex numbers.

\code{.c}
/* interface for complex numbers */
struct starpu_complex_interface
{
	double *real;
	double *imaginary;
	int nx;
};
\endcode

Registering such a data to StarPU is easily done using the function
starpu_data_register(). The last
parameter of the function, <c>interface_complex_ops</c>, will be
described below.

\code{.c}
void starpu_complex_data_register(starpu_data_handle_t *handle,
				  unsigned home_node, double *real, double *imaginary, int nx)
{
	struct starpu_complex_interface complex =
	{
		.real = real,
		.imaginary = imaginary,
		.nx = nx
	};

	if (interface_complex_ops.interfaceid == STARPU_UNKNOWN_INTERFACE_ID)
	{
		interface_complex_ops.interfaceid = starpu_data_interface_get_next_id();
	}

	starpu_data_register(handle, home_node, &complex, &interface_complex_ops);
}
\endcode
Different operations need to be defined for a data interface through
the type starpu_data_interface_ops. We only define here the basic
operations needed to run simple applications. The source code for the
different functions can be found in the file
<c>examples/interface/complex_interface.c</c>; the details of the hooks to be
provided are documented in \ref starpu_data_interface_ops.

\code{.c}
static struct starpu_data_interface_ops interface_complex_ops =
{
	.register_data_handle = complex_register_data_handle,
	.allocate_data_on_node = complex_allocate_data_on_node,
	.copy_methods = &complex_copy_methods,
	.get_size = complex_get_size,
	.footprint = complex_footprint,
	.interfaceid = STARPU_UNKNOWN_INTERFACE_ID,
	.interface_size = sizeof(struct starpu_complex_interface),
};
\endcode

Functions need to be defined to access the different fields of the
complex interface from a StarPU data handle.

\code{.c}
double *starpu_complex_get_real(starpu_data_handle_t handle)
{
	struct starpu_complex_interface *complex_interface =
		(struct starpu_complex_interface *) starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);

	return complex_interface->real;
}

double *starpu_complex_get_imaginary(starpu_data_handle_t handle);
int starpu_complex_get_nx(starpu_data_handle_t handle);
\endcode

Similar functions need to be defined to access the different fields of the
complex interface from a <c>void *</c> pointer to be used within codelet
implementations.

\snippet complex.c To be included. You should update doxygen if you see this text.

Complex data interfaces can then be registered to StarPU.

\code{.c}
double real = 45.0;
double imaginary = 12.0;
starpu_complex_data_register(&handle1, STARPU_MAIN_RAM, &real, &imaginary, 1);
starpu_task_insert(&cl_display, STARPU_R, handle1, 0);
\endcode

and used by codelets.

\code{.c}
void display_complex_codelet(void *descr[], void *_args)
{
	int nx = STARPU_COMPLEX_GET_NX(descr[0]);
	double *real = STARPU_COMPLEX_GET_REAL(descr[0]);
	double *imaginary = STARPU_COMPLEX_GET_IMAGINARY(descr[0]);
	int i;

	for(i=0 ; i<nx ; i++)
	{
		fprintf(stderr, "Complex[%d] = %3.2f + %3.2f i\n", i, real[i], imaginary[i]);
	}
}
\endcode

The whole code for this complex data interface is available in the
directory <c>examples/interface/</c>.
\section SpecifyingATargetNode Specifying A Target Node For Task Data

When executing a task on a GPU for instance, StarPU would normally copy all the
needed data for the task to the embedded memory of the GPU. It may however
happen that the task kernel would rather have some of the data kept in
main memory instead of copied to the GPU, a pivoting vector for instance.
This can be achieved by setting the starpu_codelet::specific_nodes flag to
<c>1</c>, and then filling the starpu_codelet::nodes array (or starpu_codelet::dyn_nodes when
starpu_codelet::nbuffers is greater than \ref STARPU_NMAXBUFS) with the node numbers
where data should be copied to, or <c>-1</c> to let StarPU copy it to the memory node
where the task will be executed. For instance, with the following codelet:

\code{.c}
struct starpu_codelet cl =
{
	.cuda_funcs = { kernel },
	.nbuffers = 2,
	.modes = {STARPU_RW, STARPU_RW},
	.specific_nodes = 1,
	.nodes = {STARPU_MAIN_RAM, -1},
};
\endcode

the first data of the task will be kept in main memory, while the second
data will be copied to the CUDA GPU as usual.

*/