advanced_examples.doxy
  1. /*
  2. * This file is part of the StarPU Handbook.
  3. * Copyright (C) 2009--2011 Universit@'e de Bordeaux 1
  4. * Copyright (C) 2010, 2011, 2012, 2013 Centre National de la Recherche Scientifique
  5. * Copyright (C) 2011, 2012 Institut National de Recherche en Informatique et Automatique
  6. * See the file version.doxy for copying conditions.
  7. */
  8. /*! \page advancedExamples Advanced Examples
  9. \section Using_multiple_implementations_of_a_codelet Using multiple implementations of a codelet
  10. One may want to write multiple implementations of a codelet for a single type of
  11. device and let StarPU choose which one to run. As an example, we will show how
  12. to use SSE to scale a vector. The codelet can be written as follows:
  13. \code{.c}
  14. #include <xmmintrin.h>
  15. void scal_sse_func(void *buffers[], void *cl_arg)
  16. {
  17. float *vector = (float *) STARPU_VECTOR_GET_PTR(buffers[0]);
  18. unsigned int n = STARPU_VECTOR_GET_NX(buffers[0]);
  19. unsigned int n_iterations = n/4;
  20. if (n % 4 != 0)
  21. n_iterations++;
  22. __m128 *VECTOR = (__m128*) vector;
  23. __m128 factor __attribute__((aligned(16)));
  24. factor = _mm_set1_ps(*(float *) cl_arg);
  25. unsigned int i;
  26. for (i = 0; i < n_iterations; i++)
  27. VECTOR[i] = _mm_mul_ps(factor, VECTOR[i]);
  28. }
  29. \endcode
  30. \code{.c}
  31. struct starpu_codelet cl = {
  32. .where = STARPU_CPU,
  33. .cpu_funcs = { scal_cpu_func, scal_sse_func, NULL },
  34. .nbuffers = 1,
  35. .modes = { STARPU_RW }
  36. };
  37. \endcode
  38. Schedulers which are multi-implementation aware (only <c>dmda</c> and
  39. <c>pheft</c> for now) will use the performance models of all the
  40. implementations they were given, and pick the one that seems to be the fastest.
  41. \section Enabling_implementation_according_to_capabilities Enabling implementation according to capabilities
  42. Some implementations may not run on some devices. For instance, some CUDA
  43. devices do not support double floating point precision, and thus the kernel
  44. execution would just fail; or the device may not have enough shared memory for
  45. the implementation being used. The <c>can_execute</c> field of the <c>struct
  46. starpu_codelet</c> structure permits to express this. For instance:
  47. \code{.c}
  48. static int can_execute(unsigned workerid, struct starpu_task *task, unsigned nimpl)
  49. {
  50. const struct cudaDeviceProp *props;
  51. if (starpu_worker_get_type(workerid) == STARPU_CPU_WORKER)
  52. return 1;
  53. /* Cuda device */
  54. props = starpu_cuda_get_device_properties(workerid);
  55. if (props->major >= 2 || props->minor >= 3)
  56. /* At least compute capability 1.3, supports doubles */
  57. return 1;
  58. /* Old card, does not support doubles */
  59. return 0;
  60. }
  61. struct starpu_codelet cl = {
  62. .where = STARPU_CPU|STARPU_CUDA,
  63. .can_execute = can_execute,
  64. .cpu_funcs = { cpu_func, NULL },
  65. .cuda_funcs = { gpu_func, NULL },
  66. .nbuffers = 1,
  67. .modes = { STARPU_RW }
  68. };
  69. \endcode
  70. This can be essential e.g. when running on a machine which mixes various models
  71. of CUDA devices, to take benefit from the new models without crashing on old models.
  72. Note: the <c>can_execute</c> function is called by the scheduler each time it
  73. tries to match a task with a worker, and should thus be very fast. The
  74. starpu_cuda_get_device_properties() provides a quick access to CUDA
  75. properties of CUDA devices to achieve such efficiency.
  76. Another example is compiling CUDA code for various compute capabilities,
  77. resulting with two CUDA functions, e.g. <c>scal_gpu_13</c> for compute capability
  78. 1.3, and <c>scal_gpu_20</c> for compute capability 2.0. Both functions can be
  79. provided to StarPU by using <c>cuda_funcs</c>, and <c>can_execute</c> can then be
  80. used to rule out the <c>scal_gpu_20</c> variant on a CUDA device which
  81. will not be able to execute it:
  82. \code{.c}
  83. static int can_execute(unsigned workerid, struct starpu_task *task, unsigned nimpl)
  84. {
  85. const struct cudaDeviceProp *props;
  86. if (starpu_worker_get_type(workerid) == STARPU_CPU_WORKER)
  87. return 1;
  88. /* Cuda device */
  89. if (nimpl == 0)
  90. /* Trying to execute the 1.3 capability variant, we assume it is ok in all cases. */
  91. return 1;
  92. /* Trying to execute the 2.0 capability variant, check that the card can do it. */
  93. props = starpu_cuda_get_device_properties(workerid);
  94. if (props->major >= 2)
  95. /* At least compute capability 2.0, can run it */
  96. return 1;
  97. /* Old card, does not support 2.0, will not be able to execute the 2.0 variant. */
  98. return 0;
  99. }
  100. struct starpu_codelet cl = {
  101. .where = STARPU_CPU|STARPU_CUDA,
  102. .can_execute = can_execute,
  103. .cpu_funcs = { cpu_func, NULL },
  104. .cuda_funcs = { scal_gpu_13, scal_gpu_20, NULL },
  105. .nbuffers = 1,
  106. .modes = { STARPU_RW }
  107. };
  108. \endcode
  109. Note: the most generic variant should be provided first, as some schedulers are
  110. not able to try the different variants.
  111. \section Task_and_Worker_Profiling Task and Worker Profiling
  112. A full example showing how to use the profiling API is available in
  113. the StarPU sources in the directory <c>examples/profiling/</c>.
  114. \code{.c}
  115. struct starpu_task *task = starpu_task_create();
  116. task->cl = &cl;
  117. task->synchronous = 1;
  118. /* We will destroy the task structure by hand so that we can
  119. * query the profiling info before the task is destroyed. */
  120. task->destroy = 0;
  121. /* Submit and wait for completion (since synchronous was set to 1) */
  122. starpu_task_submit(task);
  123. /* The task is finished, get profiling information */
  124. struct starpu_profiling_task_info *info = task->profiling_info;
  125. /* How much time did it take before the task started ? */
  126. double delay = starpu_timing_timespec_delay_us(&info->submit_time, &info->start_time);
  127. /* How long was the task execution ? */
  128. double length = starpu_timing_timespec_delay_us(&info->start_time, &info->end_time);
  129. /* We don't need the task structure anymore */
  130. starpu_task_destroy(task);
  131. \endcode
  132. \code{.c}
  133. /* Display the occupancy of all workers during the test */
  134. int worker;
  135. for (worker = 0; worker < starpu_worker_get_count(); worker++)
  136. {
  137. struct starpu_profiling_worker_info worker_info;
  138. int ret = starpu_profiling_worker_get_info(worker, &worker_info);
  139. STARPU_ASSERT(!ret);
  140. double total_time = starpu_timing_timespec_to_us(&worker_info.total_time);
  141. double executing_time = starpu_timing_timespec_to_us(&worker_info.executing_time);
  142. double sleeping_time = starpu_timing_timespec_to_us(&worker_info.sleeping_time);
  143. double overhead_time = total_time - executing_time - sleeping_time;
  144. float executing_ratio = 100.0*executing_time/total_time;
  145. float sleeping_ratio = 100.0*sleeping_time/total_time;
  146. float overhead_ratio = 100.0 - executing_ratio - sleeping_ratio;
  147. char workername[128];
  148. starpu_worker_get_name(worker, workername, 128);
  149. fprintf(stderr, "Worker %s:\n", workername);
  150. fprintf(stderr, "\ttotal time: %.2lf ms\n", total_time*1e-3);
  151. fprintf(stderr, "\texec time: %.2lf ms (%.2f %%)\n",
  152. executing_time*1e-3, executing_ratio);
  153. fprintf(stderr, "\tblocked time: %.2lf ms (%.2f %%)\n",
  154. sleeping_time*1e-3, sleeping_ratio);
  155. fprintf(stderr, "\toverhead time: %.2lf ms (%.2f %%)\n",
  156. overhead_time*1e-3, overhead_ratio);
  157. }
  158. \endcode
  159. \section Partitioning_Data Partitioning Data
  160. An existing piece of data can be partitioned in sub parts to be used by different tasks, for instance:
  161. \code{.c}
  162. int vector[NX];
  163. starpu_data_handle_t handle;
  164. /* Declare data to StarPU */
  165. starpu_vector_data_register(&handle, 0, (uintptr_t)vector,
  166. NX, sizeof(vector[0]));
  167. /* Partition the vector in PARTS sub-vectors */
  168. starpu_data_filter f =
  169. {
  170. .filter_func = starpu_vector_filter_block,
  171. .nchildren = PARTS
  172. };
  173. starpu_data_partition(handle, &f);
  174. \endcode
  175. The task submission then uses starpu_data_get_sub_data() to retrieve the
  176. sub-handles to be passed as tasks parameters.
  177. \code{.c}
  178. /* Submit a task on each sub-vector */
  179. for (i=0; i<starpu_data_get_nb_children(handle); i++) {
  180. /* Get subdata number i (there is only 1 dimension) */
  181. starpu_data_handle_t sub_handle = starpu_data_get_sub_data(handle, 1, i);
  182. struct starpu_task *task = starpu_task_create();
  183. task->handles[0] = sub_handle;
  184. task->cl = &cl;
  185. task->synchronous = 1;
  186. task->cl_arg = &factor;
  187. task->cl_arg_size = sizeof(factor);
  188. starpu_task_submit(task);
  189. }
  190. \endcode
  191. Partitioning can be applied several times, see
  192. <c>examples/basic_examples/mult.c</c> and <c>examples/filters/</c>.
  193. Wherever the whole piece of data is already available, the partitioning will
  194. be done in-place, i.e. without allocating new buffers but just using pointers
  195. inside the existing copy. This is particularly important to be aware of when
  196. using OpenCL, where the kernel parameters are not pointers, but handles. The
  197. kernel thus needs to be also passed the offset within the OpenCL buffer:
  198. \code{.c}
  199. void opencl_func(void *buffers[], void *cl_arg)
  200. {
  201. cl_mem vector = (cl_mem) STARPU_VECTOR_GET_DEV_HANDLE(buffers[0]);
  202. unsigned offset = STARPU_VECTOR_GET_OFFSET(buffers[0]);
  203. ...
  204. clSetKernelArg(kernel, 0, sizeof(vector), &vector);
  205. clSetKernelArg(kernel, 1, sizeof(offset), &offset);
  206. ...
  207. }
  208. \endcode
  209. And the kernel has to shift from the pointer passed by the OpenCL driver:
  210. \code{.c}
  211. __kernel void opencl_kernel(__global int *vector, unsigned offset)
  212. {
  213. vector = (__global int *)((__global char *)vector + offset);
  214. ...
  215. }
  216. \endcode
  217. StarPU provides various interfaces and filters for matrices, vectors, etc.,
  218. but applications can also write their own data interfaces and filters, see
  219. <c>examples/interface</c> and <c>examples/filters/custom_mf</c> for an example.
  220. \section Performance_model_example Performance model example
  221. To achieve good scheduling, StarPU scheduling policies need to be able to
  222. estimate in advance the duration of a task. This is done by giving to codelets
  223. a performance model, by defining a <c>starpu_perfmodel</c> structure and
  224. providing its address in the <c>model</c> field of the <c>struct starpu_codelet</c>
  225. structure. The <c>symbol</c> and <c>type</c> fields of <c>starpu_perfmodel</c>
  226. are mandatory, to give a name to the model, and the type of the model, since
  227. there are several kinds of performance models. For compatibility, make sure to
  228. initialize the whole structure to zero, either by using explicit memset, or by
  229. letting the compiler implicitly do it as exemplified below.
  230. <ul>
  231. <li>
  232. Measured at runtime (<c>STARPU_HISTORY_BASED</c> model type). This assumes that for a
  233. given set of data input/output sizes, the performance will always be about the
  234. same. This is very true for regular kernels on GPUs for instance (<0.1% error),
  235. and just a bit less true on CPUs (~=1% error). This also assumes that there are
  236. few different sets of data input/output sizes. StarPU will then keep record of
  237. the average time of previous executions on the various processing units, and use
  238. it as an estimation. History is done per task size, by using a hash of the input
  239. and output sizes as an index.
  240. It will also save it in <c>$STARPU_HOME/.starpu/sampling/codelets</c>
  241. for further executions, and can be observed by using the
  242. <c>starpu_perfmodel_display</c> command, or drawn by using
  243. the <c>starpu_perfmodel_plot</c> (\ref Performance_model_calibration). The
  244. models are indexed by machine name. To
  245. share the models between machines (e.g. for a homogeneous cluster), use
  246. <c>export STARPU_HOSTNAME=some_global_name</c>. Measurements are only done
  247. when using a task scheduler which makes use of it, such as
  248. <c>dmda</c>. Measurements can also be provided explicitly by the application, by
  249. using the starpu_perfmodel_update_history() function.
  250. The following is a small code example.
  251. If e.g. the code is recompiled with other compilation options, or several
  252. variants of the code are used, the symbol string should be changed to reflect
  253. that, in order to recalibrate a new model from zero. The symbol string can even
  254. be constructed dynamically at execution time, as long as this is done before
  255. submitting any task using it.
  256. \code{.c}
  257. static struct starpu_perfmodel mult_perf_model = {
  258. .type = STARPU_HISTORY_BASED,
  259. .symbol = "mult_perf_model"
  260. };
  261. struct starpu_codelet cl = {
  262. .where = STARPU_CPU,
  263. .cpu_funcs = { cpu_mult, NULL },
  264. .nbuffers = 3,
  265. .modes = { STARPU_R, STARPU_R, STARPU_W },
  266. /* for the scheduling policy to be able to use performance models */
  267. .model = &mult_perf_model
  268. };
  269. \endcode
  270. </li>
  271. <li>
  272. Measured at runtime and refined by regression (<c>STARPU_*REGRESSION_BASED</c>
  273. model type). This still assumes performance regularity, but works
  274. with various data input sizes, by applying regression over observed
  275. execution times. STARPU_REGRESSION_BASED uses an a*n^b regression
  276. form, STARPU_NL_REGRESSION_BASED uses an a*n^b+c (more precise than
  277. STARPU_REGRESSION_BASED, but costs a lot more to compute).
  278. For instance,
  279. <c>tests/perfmodels/regression_based.c</c> uses a regression-based performance
  280. model for the <c>memset</c> operation.
  281. Of course, the application has to issue
  282. tasks with varying size so that the regression can be computed. StarPU will not
  283. trust the regression unless there is at least 10% difference between the minimum
  284. and maximum observed input size. It can be useful to set the
  285. <c>STARPU_CALIBRATE</c> environment variable to <c>1</c> and run the application
  286. on varying input sizes with <c>STARPU_SCHED</c> set to <c>eager</c> scheduler,
  287. so as to feed the performance model for a variety of
  288. inputs. The application can also provide the measurements explicitly by using
  289. starpu_perfmodel_update_history(). The <c>starpu_perfmodel_display</c> and
  290. <c>starpu_perfmodel_plot</c>
  291. tools can be used to observe how much the performance model is calibrated (\ref Performance_model_calibration); when
  292. their output look good, <c>STARPU_CALIBRATE</c> can be reset to <c>0</c> to let
  293. StarPU use the resulting performance model without recording new measures, and
  294. <c>STARPU_SCHED</c> can be set to <c>dmda</c> to benefit from the performance models. If
  295. the data input sizes vary a lot, it is really important to set
  296. <c>STARPU_CALIBRATE</c> to <c>0</c>, otherwise StarPU will continue adding the
  297. measures, and result with a very big performance model, which will take a
  298. lot of time to load and save.
  299. For non-linear regression, since computing it
  300. is quite expensive, it is only done at termination of the application. This
  301. means that the first execution of the application will use only history-based
  302. performance model to perform scheduling, without using regression.
  303. </li>
  304. <li>
  305. Provided as an estimation from the application itself (<c>STARPU_COMMON</c> model type and <c>cost_function</c> field),
  306. see for instance
  307. <c>examples/common/blas_model.h</c> and <c>examples/common/blas_model.c</c>.
  308. </li>
  309. <li>
  310. Provided explicitly by the application (<c>STARPU_PER_ARCH</c> model type): the
  311. <c>.per_arch[arch][nimpl].cost_function</c> fields have to be filled with pointers to
  312. functions which return the expected duration of the task in micro-seconds, one
  313. per architecture.
  314. </li>
  315. </ul>
  316. For the <c>STARPU_HISTORY_BASED</c> and <c>STARPU_*REGRESSION_BASED</c>,
  317. the total size of task data (both input and output) is used as an index by
  318. default. The <c>size_base</c> field of <c>struct starpu_perfmodel</c> however
  319. permits the application to override that, when for instance some of the data
  320. do not matter for task cost (e.g. mere reference table), or when using sparse
  321. structures (in which case it is the number of non-zeros which matter), or when
  322. there is some hidden parameter such as the number of iterations, etc. The
  323. <c>examples/pi</c> example uses this to include the number of iterations in the
  324. base.
  325. How to use schedulers which can benefit from such performance model is explained
  326. in \ref Task_scheduling_policy.
  327. The same can be done for task power consumption estimation, by setting the
  328. <c>power_model</c> field the same way as the <c>model</c> field. Note: for
  329. now, the application has to give to the power consumption performance model
  330. a name which is different from the execution time performance model.
  331. The application can request time estimations from the StarPU performance
  332. models by filling a task structure as usual without actually submitting
  333. it. The data handles can be created by calling <c>starpu_*_data_register</c>
  334. functions with a <c>NULL</c> pointer and <c>-1</c> node and the
  335. desired data sizes, and need to be unregistered as usual. The
  336. starpu_task_expected_length() and starpu_task_expected_power() functions can then be called to get an estimation of the task cost on a given
  337. arch. starpu_task_footprint() can also be used to get the footprint used
  338. for indexing history-based performance models.
  339. starpu_task_destroy()
  340. needs to be called to destroy the dummy task afterwards. See
  341. <c>tests/perfmodels/regression_based.c</c> for an example.
  342. \section Theoretical_lower_bound_on_execution_time_example Theoretical lower bound on execution time
  343. For kernels with history-based performance models (and provided that they are completely calibrated), StarPU can very easily provide a theoretical lower
  344. bound for the execution time of a whole set of tasks. See for
  345. instance <c>examples/lu/lu_example.c</c>: before submitting tasks,
  346. call <c>starpu_bound_start</c>, and after complete execution, call
  347. <c>starpu_bound_stop</c>. <c>starpu_bound_print_lp</c> or
  348. <c>starpu_bound_print_mps</c> can then be used to output a Linear Programming
  349. problem corresponding to the schedule of your tasks. Run it through
  350. <c>lp_solve</c> or any other linear programming solver, and that will give you a
  351. lower bound for the total execution time of your tasks. If StarPU was compiled
  352. with the glpk library installed, <c>starpu_bound_compute</c> can be used to
  353. solve it immediately and get the optimized minimum, in ms. Its <c>integer</c>
  354. parameter allows to decide whether integer resolution should be computed
  355. and returned too.
  356. The <c>deps</c> parameter tells StarPU whether to take tasks, implicit data, and tag
  357. dependencies into account. Tags released in a callback or similar
  358. are not taken into account, only tags associated with a task are.
  359. It must be understood that the linear programming
  360. problem size is quadratic with the number of tasks and thus the time to solve it
  361. will be very long, it could be minutes for just a few dozen tasks. You should
  362. probably use <c>lp_solve -timeout 1 test.pl -wmps test.mps</c> to convert the
  363. problem to MPS format and then use a better solver, <c>glpsol</c> might be
  364. better than <c>lp_solve</c> for instance (the <c>--pcost</c> option may be
  365. useful), but sometimes doesn't manage to converge. <c>cbc</c> might look
  366. slower, but it is parallel. For <c>lp_solve</c>, be sure to try at least all the
  367. <c>-B</c> options. For instance, we often just use <c>lp_solve -cc -B1 -Bb
  368. -Bg -Bp -Bf -Br -BG -Bd -Bs -BB -Bo -Bc -Bi</c> , and the <c>-gr</c> option can
  369. also be quite useful. The resulting schedule can be observed by using the
  370. <c>starpu_lp2paje</c> tool, which converts it into the Paje format.
  371. Data transfer time can only be taken into account when <c>deps</c> is set. Only
  372. data transfers inferred from implicit data dependencies between tasks are taken
  373. into account. Other data transfers are assumed to be completely overlapped.
  374. Setting <c>deps</c> to 0 will only take into account the actual computations
  375. on processing units. It however still properly takes into account the varying
  376. performances of kernels and processing units, which is quite more accurate than
  377. just comparing StarPU performances with the fastest of the kernels being used.
  378. The <c>prio</c> parameter tells StarPU whether to simulate taking into account
  379. the priorities as the StarPU scheduler would, i.e. schedule prioritized
  380. tasks before less prioritized tasks, to check to which extent this results
  381. in a less optimal solution. This increases even more computation time.
  382. \section Insert_Task_Utility Insert Task Utility
  383. StarPU provides the wrapper function <c>starpu_insert_task</c> to ease
  384. the creation and submission of tasks. See the definition of the
  385. functions in \ref Insert_Task.
  386. Here the implementation of the codelet:
  387. \code{.c}
  388. void func_cpu(void *descr[], void *_args)
  389. {
  390. int *x0 = (int *)STARPU_VARIABLE_GET_PTR(descr[0]);
  391. float *x1 = (float *)STARPU_VARIABLE_GET_PTR(descr[1]);
  392. int ifactor;
  393. float ffactor;
  394. starpu_codelet_unpack_args(_args, &ifactor, &ffactor);
  395. *x0 = *x0 * ifactor;
  396. *x1 = *x1 * ffactor;
  397. }
  398. struct starpu_codelet mycodelet = {
  399. .where = STARPU_CPU,
  400. .cpu_funcs = { func_cpu, NULL },
  401. .nbuffers = 2,
  402. .modes = { STARPU_RW, STARPU_RW }
  403. };
  404. \endcode
  405. And the call to the <c>starpu_insert_task</c> wrapper:
  406. \code{.c}
  407. starpu_insert_task(&mycodelet,
  408. STARPU_VALUE, &ifactor, sizeof(ifactor),
  409. STARPU_VALUE, &ffactor, sizeof(ffactor),
  410. STARPU_RW, data_handles[0], STARPU_RW, data_handles[1],
  411. 0);
  412. \endcode
  413. The call to <c>starpu_insert_task</c> is equivalent to the following
  414. code:
  415. \code{.c}
  416. struct starpu_task *task = starpu_task_create();
  417. task->cl = &mycodelet;
  418. task->handles[0] = data_handles[0];
  419. task->handles[1] = data_handles[1];
  420. char *arg_buffer;
  421. size_t arg_buffer_size;
  422. starpu_codelet_pack_args(&arg_buffer, &arg_buffer_size,
  423. STARPU_VALUE, &ifactor, sizeof(ifactor),
  424. STARPU_VALUE, &ffactor, sizeof(ffactor),
  425. 0);
  426. task->cl_arg = arg_buffer;
  427. task->cl_arg_size = arg_buffer_size;
  428. int ret = starpu_task_submit(task);
  429. \endcode
  430. Here a similar call using <c>STARPU_DATA_ARRAY</c>.
  431. \code{.c}
  432. starpu_insert_task(&mycodelet,
  433. STARPU_DATA_ARRAY, data_handles, 2,
  434. STARPU_VALUE, &ifactor, sizeof(ifactor),
  435. STARPU_VALUE, &ffactor, sizeof(ffactor),
  436. 0);
  437. \endcode
  438. If some part of the task insertion depends on the value of some computation,
  439. the <c>STARPU_DATA_ACQUIRE_CB</c> macro can be very convenient. For
  440. instance, assuming that the index variable <c>i</c> was registered as handle
  441. <c>i_handle</c>:
  442. \code{.c}
  443. /* Compute which portion we will work on, e.g. pivot */
  444. starpu_insert_task(&which_index, STARPU_W, i_handle, 0);
  445. /* And submit the corresponding task */
  446. STARPU_DATA_ACQUIRE_CB(i_handle, STARPU_R,
  447. starpu_insert_task(&work, STARPU_RW, A_handle[i], 0));
  448. \endcode
  449. The <c>STARPU_DATA_ACQUIRE_CB</c> macro submits an asynchronous request for
  450. acquiring data <c>i</c> for the main application, and will execute the code
  451. given as third parameter when it is acquired. In other words, as soon as the
  452. value of <c>i</c> computed by the <c>which_index</c> codelet can be read, the
  453. portion of code passed as third parameter of <c>STARPU_DATA_ACQUIRE_CB</c> will
  454. be executed, and is allowed to read from <c>i</c> to use it e.g. as an
  455. index. Note that this macro is only available when compiling StarPU with
  456. the compiler <c>gcc</c>.
  457. \section Data_reduction Data reduction
  458. In various cases, some piece of data is used to accumulate intermediate
  459. results. For instances, the dot product of a vector, maximum/minimum finding,
  460. the histogram of a photograph, etc. When these results are produced along the
  461. whole machine, it would not be efficient to accumulate them in only one place,
  462. incurring data transmission for each contribution and access concurrency.
  463. StarPU provides a <c>STARPU_REDUX</c> mode, which permits to optimize
  464. that case: it will allocate a buffer on each memory node, and accumulate
  465. intermediate results there. When the data is eventually accessed in the normal
  466. <c>STARPU_R</c> mode, StarPU will collect the intermediate results in just one
  467. buffer.
  468. For this to work, the user has to use the
  469. <c>starpu_data_set_reduction_methods</c> to declare how to initialize these
  470. buffers, and how to assemble partial results.
  471. For instance, <c>cg</c> uses that to optimize its dot product: it first defines
  472. the codelets for initialization and reduction:
  473. \code{.c}
  474. struct starpu_codelet bzero_variable_cl =
  475. {
  476. .cpu_funcs = { bzero_variable_cpu, NULL },
  477. .cuda_funcs = { bzero_variable_cuda, NULL },
  478. .nbuffers = 1,
  479. };
  480. static void accumulate_variable_cpu(void *descr[], void *cl_arg)
  481. {
  482. double *v_dst = (double *)STARPU_VARIABLE_GET_PTR(descr[0]);
  483. double *v_src = (double *)STARPU_VARIABLE_GET_PTR(descr[1]);
  484. *v_dst = *v_dst + *v_src;
  485. }
  486. static void accumulate_variable_cuda(void *descr[], void *cl_arg)
  487. {
  488. double *v_dst = (double *)STARPU_VARIABLE_GET_PTR(descr[0]);
  489. double *v_src = (double *)STARPU_VARIABLE_GET_PTR(descr[1]);
  490. cublasaxpy(1, (double)1.0, v_src, 1, v_dst, 1);
  491. cudaStreamSynchronize(starpu_cuda_get_local_stream());
  492. }
  493. struct starpu_codelet accumulate_variable_cl =
  494. {
  495. .cpu_funcs = { accumulate_variable_cpu, NULL },
  496. .cuda_funcs = { accumulate_variable_cuda, NULL },
  497. .nbuffers = 1,
  498. };
  499. \endcode
  500. and attaches them as reduction methods for its dtq handle:
  501. \code{.c}
  502. starpu_variable_data_register(&dtq_handle, -1, NULL, sizeof(type));
  503. starpu_data_set_reduction_methods(dtq_handle,
  504. &accumulate_variable_cl, &bzero_variable_cl);
  505. \endcode
  506. and <c>dtq_handle</c> can now be used in <c>STARPU_REDUX</c> mode for the dot products
  507. with partitioned vectors:
  508. \code{.c}
  509. for (b = 0; b < nblocks; b++)
  510. starpu_insert_task(&dot_kernel_cl,
  511. STARPU_REDUX, dtq_handle,
  512. STARPU_R, starpu_data_get_sub_data(v1, 1, b),
  513. STARPU_R, starpu_data_get_sub_data(v2, 1, b),
  514. 0);
  515. \endcode
  516. During registration, we have here provided NULL, i.e. there is no initial value
  517. to be taken into account during reduction. StarPU will thus only take into
  518. account the contributions from the <c>dot_kernel_cl</c> tasks. Also, it will not
  519. allocate any memory for <c>dtq_handle</c> before <c>dot_kernel_cl</c> tasks are
  520. ready to run.
  521. If another dot product has to be performed, one could unregister
  522. <c>dtq_handle</c>, and re-register it. But one can also use
  523. <c>starpu_data_invalidate_submit(dtq_handle)</c>, which will clear all data from the handle,
  524. thus resetting it back to the initial <c>register(NULL)</c> state.
  525. The <c>cg</c> example also uses reduction for the blocked gemv kernel, leading
  526. to yet more relaxed dependencies and more parallelism.
  527. STARPU_REDUX can also be passed to <c>starpu_mpi_insert_task</c> in the MPI
  528. case. That will however not produce any MPI communication, but just pass
  529. STARPU_REDUX to the underlying <c>starpu_insert_task</c>. It is up to the
  530. application to call <c>starpu_mpi_redux_data</c>, which posts tasks that will
  531. reduce the partial results among MPI nodes into the MPI node which owns the
  532. data. For instance, some hypothetical application which collects partial results
  533. into data <c>res</c>, then uses it for other computation, before looping again
  534. with a new reduction:
  535. \code{.c}
  536. for (i = 0; i < 100; i++) {
  537. starpu_mpi_insert_task(MPI_COMM_WORLD, &init_res, STARPU_W, res, 0);
  538. starpu_mpi_insert_task(MPI_COMM_WORLD, &work, STARPU_RW, A,
  539. STARPU_R, B, STARPU_REDUX, res, 0);
  540. starpu_mpi_redux_data(MPI_COMM_WORLD, res);
  541. starpu_mpi_insert_task(MPI_COMM_WORLD, &work2, STARPU_RW, B, STARPU_R, res, 0);
  542. }
  543. \endcode
  544. \section Temporary_buffers Temporary buffers
  545. There are two kinds of temporary buffers: temporary data which just pass results
  546. from a task to another, and scratch data which are needed only internally by
  547. tasks.
  548. \subsection Temporary_data Temporary data
  549. Data can sometimes be entirely produced by a task, and entirely consumed by
  550. another task, without the need for other parts of the application to access
  551. it. In such case, registration can be done without prior allocation, by using
  552. the special -1 memory node number, and passing a zero pointer. StarPU will
  553. actually allocate memory only when the task creating the content gets scheduled,
  554. and destroy it on unregistration.
  555. In addition to that, it can be tedious for the application to have to unregister
  556. the data, since it will not use its content anyway. The unregistration can be
  557. done lazily by using the <c>starpu_data_unregister_submit(handle)</c> function,
  558. which will record that no more tasks accessing the handle will be submitted, so
  559. that it can be freed as soon as the last task accessing it is over.
  560. The following code exemplifies both points: it registers the temporary
  561. data, submits three tasks accessing it, and records the data for automatic
  562. unregistration.
  563. \code{.c}
  564. starpu_vector_data_register(&handle, -1, 0, n, sizeof(float));
  565. starpu_insert_task(&produce_data, STARPU_W, handle, 0);
  566. starpu_insert_task(&compute_data, STARPU_RW, handle, 0);
  567. starpu_insert_task(&summarize_data, STARPU_R, handle, STARPU_W, result_handle, 0);
  568. starpu_data_unregister_submit(handle);
  569. \endcode
  570. \subsection Scratch_data Scratch data
  571. Some kernels sometimes need temporary data to achieve the computations, i.e. a
  572. workspace. The application could allocate it at the start of the codelet
  573. function, and free it at the end, but that would be costly. It could also
  574. allocate one buffer per worker (similarly to \ref
  575. Per-worker_library_initialization), but that would make them
  576. systematic and permanent. A more optimized way is to use the
  577. ::STARPU_SCRATCH data access mode, as exemplified below,
  578. which provides per-worker buffers without content consistency.
  579. \code{.c}
  580. starpu_vector_data_register(&workspace, -1, 0, workspace_size, sizeof(float));
  581. for (i = 0; i < N; i++)
  582. starpu_insert_task(&compute, STARPU_R, input[i],
  583. STARPU_SCRATCH, workspace, STARPU_W, output[i], 0);
  584. \endcode
  585. StarPU will make sure that the buffer is allocated before executing the task,
  586. and make this allocation per-worker: for CPU workers, notably, each worker has
  587. its own buffer. This means that each task submitted above will actually have its
  588. own workspace, which will actually be the same for all tasks running one after
  589. the other on the same worker. Also, if for instance GPU memory becomes scarce,
  590. StarPU will notice that it can free such buffers easily, since the content does
  591. not matter.
  592. The <c>examples/pi</c> example uses scratches for some temporary buffer.
  593. \section Parallel_Tasks Parallel Tasks
  594. StarPU can leverage existing parallel computation libraries by the means of
  595. parallel tasks. A parallel task is a task which gets worked on by a set of CPUs
  596. (called a parallel or combined worker) at the same time, by using an existing
  597. parallel CPU implementation of the computation to be achieved. This can also be
  598. useful to improve the load balance between slow CPUs and fast GPUs: since CPUs
  599. work collectively on a single task, the completion time of tasks on CPUs becomes
  600. comparable to the completion time on GPUs, thus relieving from granularity
  601. discrepancy concerns. Hwloc support needs to be enabled to get good performance,
  602. otherwise StarPU will not know how to better group cores.
  603. Two modes of execution exist to accommodate existing usages.
  604. \subsection Fork-mode_parallel_tasks Fork-mode parallel tasks
  605. In the Fork mode, StarPU will call the codelet function on one
  606. of the CPUs of the combined worker. The codelet function can use
  607. <c>starpu_combined_worker_get_size()</c> to get the number of threads it is
  608. allowed to start to achieve the computation. The CPU binding mask for the whole
  609. set of CPUs is already enforced, so that threads created by the function will
  610. inherit the mask, and thus execute where StarPU expected, the OS being in charge
  611. of choosing how to schedule threads on the corresponding CPUs. The application
  612. can also choose to bind threads by hand, using e.g. sched_getaffinity to know
  613. the CPU binding mask that StarPU chose.
  614. For instance, using OpenMP (full source is available in
  615. <c>examples/openmp/vector_scal.c</c>):
  616. \code{.c}
  617. void scal_cpu_func(void *buffers[], void *_args)
  618. {
  619. unsigned i;
  620. float *factor = _args;
  621. struct starpu_vector_interface *vector = buffers[0];
  622. unsigned n = STARPU_VECTOR_GET_NX(vector);
  623. float *val = (float *)STARPU_VECTOR_GET_PTR(vector);
  624. #pragma omp parallel for num_threads(starpu_combined_worker_get_size())
  625. for (i = 0; i < n; i++)
  626. val[i] *= *factor;
  627. }
  628. static struct starpu_codelet cl =
  629. {
  630. .modes = { STARPU_RW },
  631. .where = STARPU_CPU,
  632. .type = STARPU_FORKJOIN,
  633. .max_parallelism = INT_MAX,
  634. .cpu_funcs = {scal_cpu_func, NULL},
  635. .nbuffers = 1,
  636. };
  637. \endcode
  638. Other examples include for instance calling a BLAS parallel CPU implementation
  639. (see <c>examples/mult/xgemm.c</c>).
  640. \subsection SPMD-mode_parallel_tasks SPMD-mode parallel tasks
  641. In the SPMD mode, StarPU will call the codelet function on
  642. each CPU of the combined worker. The codelet function can use
  643. <c>starpu_combined_worker_get_size()</c> to get the total number of CPUs
  644. involved in the combined worker, and thus the number of calls that are made in
  645. parallel to the function, and <c>starpu_combined_worker_get_rank()</c> to get
  646. the rank of the current CPU within the combined worker. For instance:
  647. \code{.c}
  648. static void func(void *buffers[], void *_args)
  649. {
  650. unsigned i;
  651. float *factor = _args;
  652. struct starpu_vector_interface *vector = buffers[0];
  653. unsigned n = STARPU_VECTOR_GET_NX(vector);
  654. float *val = (float *)STARPU_VECTOR_GET_PTR(vector);
  655. /* Compute slice to compute */
  656. unsigned m = starpu_combined_worker_get_size();
  657. unsigned j = starpu_combined_worker_get_rank();
  658. unsigned slice = (n+m-1)/m;
  659. for (i = j * slice; i < (j+1) * slice && i < n; i++)
  660. val[i] *= *factor;
  661. }
  662. static struct starpu_codelet cl =
  663. {
  664. .modes = { STARPU_RW },
  665. .where = STARPU_CPU,
  666. .type = STARPU_SPMD,
  667. .max_parallelism = INT_MAX,
  668. .cpu_funcs = { func, NULL },
  669. .nbuffers = 1,
  670. };
  671. \endcode
  672. Of course, this trivial example will not really benefit from parallel task
  673. execution, and was only meant to be simple to understand. The benefit comes
  674. when the computation to be done is so that threads have to e.g. exchange
  675. intermediate results, or write to the data in a complex but safe way in the same
  676. buffer.
  677. \subsection Parallel_tasks_performance Parallel tasks performance
  678. To benefit from parallel tasks, a parallel-task-aware StarPU scheduler has to
  679. be used. When exposed to codelets with a Fork or SPMD flag, the <c>pheft</c>
  680. (parallel-heft) and <c>peager</c> (parallel eager) schedulers will indeed also
  681. try to execute tasks with several CPUs. They will automatically try the various
  682. available combined worker sizes and thus be able to avoid choosing a large
  683. combined worker if the codelet does not actually scale so much.
  684. \subsection Combined_workers Combined workers
  685. By default, StarPU creates combined workers according to the architecture
  686. structure as detected by hwloc. It means that for each object of the hwloc
  687. topology (NUMA node, socket, cache, ...) a combined worker will be created. If
  688. some nodes of the hierarchy have a big arity (e.g. many cores in a socket
  689. without a hierarchy of shared caches), StarPU will create combined workers of
  690. intermediate sizes. The <c>STARPU_SYNTHESIZE_ARITY_COMBINED_WORKER</c> variable
  691. permits to tune the maximum arity between levels of combined workers.
  692. The combined workers actually produced can be seen in the output of the
  693. <c>starpu_machine_display</c> tool (the <c>STARPU_SCHED</c> environment variable
  694. has to be set to a combined worker-aware scheduler such as <c>pheft</c> or
  695. <c>peager</c>).
  696. \subsection Concurrent_parallel_tasks Concurrent parallel tasks
  697. Unfortunately, many environments and libraries do not support concurrent
  698. calls.
  699. For instance, most OpenMP implementations (including the main ones) do not
  700. support concurrent <c>pragma omp parallel</c> statements without nesting them in
  701. another <c>pragma omp parallel</c> statement, but StarPU does not yet support
  702. creating its CPU workers by using such pragma.
  703. Other parallel libraries are also not safe when being invoked concurrently
  704. from different threads, due to the use of global variables in their sequential
  705. sections for instance.
  706. The solution is then to use only one combined worker at a time. This can be
  707. done by setting <c>single_combined_worker</c> to 1 in the <c>starpu_conf</c>
  708. structure, or setting the <c>STARPU_SINGLE_COMBINED_WORKER</c> environment
  709. variable to 1. StarPU will then run only one parallel task at a time (but other
  710. CPU and GPU tasks are not affected and can be run concurrently). The parallel
  711. task scheduler will however still try varying combined worker
  712. sizes to look for the most efficient ones.
  713. \section Debugging Debugging
  714. StarPU provides several tools to help debugging applications. Execution traces
  715. can be generated and displayed graphically, see \ref Generating_traces_with_FxT. Some
  716. gdb helpers are also provided to show the whole StarPU state:
  717. \verbatim
  718. (gdb) source tools/gdbinit
  719. (gdb) help starpu
  720. \endverbatim
  721. The Temanejo task debugger can also be used, see \ref Using_the_Temanejo_task_debugger.
  722. \section The_multiformat_interface The multiformat interface
  723. It may be interesting to represent the same piece of data using two different
  724. data structures: one that would only be used on CPUs, and one that would only
  725. be used on GPUs. This can be done by using the multiformat interface. StarPU
  726. will be able to convert data from one data structure to the other when needed.
  727. Note that the dmda scheduler is the only one optimized for this interface. The
  728. user must provide StarPU with conversion codelets:
  729. \code{.c}
  730. #define NX 1024
  731. struct point array_of_structs[NX];
  732. starpu_data_handle_t handle;
  733. /*
  734. * The conversion of a piece of data is itself a task, though it is created,
  735. * submitted and destroyed by StarPU internals and not by the user. Therefore,
  736. * we have to define two codelets.
  737. * Note that for now the conversion from the CPU format to the GPU format has to
  738. * be executed on the GPU, and the conversion from the GPU to the CPU has to be
  739. * executed on the CPU.
  740. */
  741. #ifdef STARPU_USE_OPENCL
  742. void cpu_to_opencl_opencl_func(void *buffers[], void *args);
  743. struct starpu_codelet cpu_to_opencl_cl = {
  744. .where = STARPU_OPENCL,
  745. .opencl_funcs = { cpu_to_opencl_opencl_func, NULL },
  746. .nbuffers = 1,
  747. .modes = { STARPU_RW }
  748. };
  749. void opencl_to_cpu_func(void *buffers[], void *args);
  750. struct starpu_codelet opencl_to_cpu_cl = {
  751. .where = STARPU_CPU,
  752. .cpu_funcs = { opencl_to_cpu_func, NULL },
  753. .nbuffers = 1,
  754. .modes = { STARPU_RW }
  755. };
  756. #endif
  757. struct starpu_multiformat_data_interface_ops format_ops = {
  758. #ifdef STARPU_USE_OPENCL
  759. .opencl_elemsize = 2 * sizeof(float),
  760. .cpu_to_opencl_cl = &cpu_to_opencl_cl,
  761. .opencl_to_cpu_cl = &opencl_to_cpu_cl,
  762. #endif
  763. .cpu_elemsize = 2 * sizeof(float),
  764. ...
  765. };
  766. starpu_multiformat_data_register(&handle, 0, &array_of_structs, NX, &format_ops);
  767. \endcode
  768. Kernels can be written almost as for any other interface. Note that
  769. STARPU_MULTIFORMAT_GET_CPU_PTR shall only be used for CPU kernels. CUDA kernels
  770. must use STARPU_MULTIFORMAT_GET_CUDA_PTR, and OpenCL kernels must use
  771. STARPU_MULTIFORMAT_GET_OPENCL_PTR. STARPU_MULTIFORMAT_GET_NX may be used in any
  772. kind of kernel.
  773. \code{.c}
  774. static void
  775. multiformat_scal_cpu_func(void *buffers[], void *args)
  776. {
  777. struct point *aos;
  778. unsigned int n;
  779. aos = STARPU_MULTIFORMAT_GET_CPU_PTR(buffers[0]);
  780. n = STARPU_MULTIFORMAT_GET_NX(buffers[0]);
  781. ...
  782. }
  783. extern "C" void multiformat_scal_cuda_func(void *buffers[], void *_args)
  784. {
  785. unsigned int n;
  786. struct struct_of_arrays *soa;
  787. soa = (struct struct_of_arrays *) STARPU_MULTIFORMAT_GET_CUDA_PTR(buffers[0]);
  788. n = STARPU_MULTIFORMAT_GET_NX(buffers[0]);
  789. ...
  790. }
  791. \endcode
  792. A full example may be found in <c>examples/basic_examples/multiformat.c</c>.
  793. \section Using_the_Driver_API Using the Driver API
  794. \ref Running_drivers
  795. \code{.c}
  796. int ret;
  797. struct starpu_driver d = {
  798. .type = STARPU_CUDA_WORKER,
  799. .id.cuda_id = 0
  800. };
  801. ret = starpu_driver_init(&d);
  802. if (ret != 0)
  803. error();
  804. while (some_condition) {
  805. ret = starpu_driver_run_once(&d);
  806. if (ret != 0)
  807. error();
  808. }
  809. ret = starpu_driver_deinit(&d);
  810. if (ret != 0)
  811. error();
  812. \endcode
  813. \section Defining_a_New_Scheduling_Policy Defining a New Scheduling Policy
  814. A full example showing how to define a new scheduling policy is available in
  815. the StarPU sources in the directory <c>examples/scheduler/</c>.
  816. @pxref{Scheduling Policy}
  817. \code{.c}
  818. static struct starpu_sched_policy dummy_sched_policy = {
  819. .init_sched = init_dummy_sched,
  820. .deinit_sched = deinit_dummy_sched,
  821. .add_workers = dummy_sched_add_workers,
  822. .remove_workers = dummy_sched_remove_workers,
  823. .push_task = push_task_dummy,
  824. .push_prio_task = NULL,
  825. .pop_task = pop_task_dummy,
  826. .post_exec_hook = NULL,
  827. .pop_every_task = NULL,
  828. .policy_name = "dummy",
  829. .policy_description = "dummy scheduling strategy"
  830. };
  831. \endcode
  832. \section On-GPU_rendering On-GPU rendering
  833. Graphical-oriented applications need to draw the result of their computations,
  834. typically on the very GPU where these happened. Technologies such as OpenGL/CUDA
  835. interoperability permit to let CUDA directly work on the OpenGL buffers, making
  836. them thus immediately ready for drawing, by mapping OpenGL buffer, textures or
  837. renderbuffer objects into CUDA. CUDA however imposes some technical
  838. constraints: peer memcpy has to be disabled, and the thread that runs OpenGL has
  839. to be the one that runs CUDA computations for that GPU.
  840. To achieve this with StarPU, pass the <c>--disable-cuda-memcpy-peer</c> option
  841. to <c>./configure</c> (TODO: make it dynamic), OpenGL/GLUT has to be initialized
  842. first, and the interoperability mode has to
  843. be enabled by using the <c>cuda_opengl_interoperability</c> field of the
  844. <c>starpu_conf</c> structure, and the driver loop has to be run by
  845. the application, by using the <c>not_launched_drivers</c> field of
  846. <c>starpu_conf</c> to prevent StarPU from running it in a separate thread, and
  847. by using <c>starpu_driver_run</c> to run the loop. The <c>gl_interop</c> and
  848. <c>gl_interop_idle</c> examples show how it articulates in a simple case, where
  849. rendering is done in task callbacks. The former uses <c>glutMainLoopEvent</c>
  850. to make GLUT progress from the StarPU driver loop, while the latter uses
  851. <c>glutIdleFunc</c> to make StarPU progress from the GLUT main loop.
  852. Then, to use an OpenGL buffer as a CUDA data, StarPU simply needs to be given
  853. the CUDA pointer at registration, for instance:
  854. \code{.c}
  855. /* Get the CUDA worker id */
  856. for (workerid = 0; workerid < starpu_worker_get_count(); workerid++)
  857. if (starpu_worker_get_type(workerid) == STARPU_CUDA_WORKER)
  858. break;
  859. /* Build a CUDA pointer pointing at the OpenGL buffer */
  860. cudaGraphicsResourceGetMappedPointer((void**)&output, &num_bytes, resource);
  861. /* And register it to StarPU */
  862. starpu_vector_data_register(&handle, starpu_worker_get_memory_node(workerid),
  863. output, num_bytes / sizeof(float4), sizeof(float4));
  864. /* The handle can now be used as usual */
  865. starpu_insert_task(&cl, STARPU_RW, handle, 0);
  866. /* ... */
  867. /* This gets back data into the OpenGL buffer */
  868. starpu_data_unregister(handle);
  869. \endcode
  870. and display it e.g. in the callback function.
  871. \section Defining_a_New_Data_Interface Defining a New Data Interface
  872. Let's define a new data interface to manage complex numbers.
  873. \code{.c}
  874. /* interface for complex numbers */
  875. struct starpu_complex_interface
  876. {
  877. double *real;
  878. double *imaginary;
  879. int nx;
  880. };
  881. \endcode
  882. Registering such a data to StarPU is easily done using the function
  883. <c>starpu_data_register</c> (@pxref{Basic Data Management API}). The last
  884. parameter of the function, <c>interface_complex_ops</c>, will be
  885. described below.
  886. \code{.c}
  887. void starpu_complex_data_register(starpu_data_handle_t *handle,
  888. unsigned home_node, double *real, double *imaginary, int nx)
  889. {
  890. struct starpu_complex_interface complex =
  891. {
  892. .real = real,
  893. .imaginary = imaginary,
  894. .nx = nx
  895. };
  896. if (interface_complex_ops.interfaceid == STARPU_UNKNOWN_INTERFACE_ID)
  897. {
  898. interface_complex_ops.interfaceid = starpu_data_interface_get_next_id();
  899. }
  900. starpu_data_register(handle, home_node, &complex, &interface_complex_ops);
  901. }
  902. \endcode
  903. Different operations need to be defined for a data interface through
  904. the type <c>struct starpu_data_interface_ops</c> (@pxref{Defining
  905. Interface}). We only define here the basic operations needed to
  906. run simple applications. The source code for the different functions
  907. can be found in the file
  908. <c>examples/interface/complex_interface.c</c>.
  909. \code{.c}
  910. static struct starpu_data_interface_ops interface_complex_ops =
  911. {
  912. .register_data_handle = complex_register_data_handle,
  913. .allocate_data_on_node = complex_allocate_data_on_node,
  914. .copy_methods = &complex_copy_methods,
  915. .get_size = complex_get_size,
  916. .footprint = complex_footprint,
  917. .interfaceid = STARPU_UNKNOWN_INTERFACE_ID,
  918. .interface_size = sizeof(struct starpu_complex_interface),
  919. };
  920. \endcode
  921. Functions need to be defined to access the different fields of the
  922. complex interface from a StarPU data handle.
  923. \code{.c}
  924. double *starpu_complex_get_real(starpu_data_handle_t handle)
  925. {
  926. struct starpu_complex_interface *complex_interface =
  927. (struct starpu_complex_interface *) starpu_data_get_interface_on_node(handle, 0);
  928. return complex_interface->real;
  929. }
  930. double *starpu_complex_get_imaginary(starpu_data_handle_t handle);
  931. int starpu_complex_get_nx(starpu_data_handle_t handle);
  932. \endcode
  933. Similar functions need to be defined to access the different fields of the
  934. complex interface from a <c>void *</c> pointer to be used within codelet
  935. implementations.
  936. \code{.c}
  937. #define STARPU_COMPLEX_GET_REAL(interface) \
  938. (((struct starpu_complex_interface *)(interface))->real)
  939. #define STARPU_COMPLEX_GET_IMAGINARY(interface) \
  940. (((struct starpu_complex_interface *)(interface))->imaginary)
  941. #define STARPU_COMPLEX_GET_NX(interface) \
  942. (((struct starpu_complex_interface *)(interface))->nx)
  943. \endcode
  944. Complex data interfaces can then be registered to StarPU.
  945. \code{.c}
  946. double real = 45.0;
  947. double imaginary = 12.0;
  948. starpu_complex_data_register(&handle1, 0, &real, &imaginary, 1);
  949. starpu_insert_task(&cl_display, STARPU_R, handle1, 0);
  950. \endcode
  951. and used by codelets.
  952. \code{.c}
  953. void display_complex_codelet(void *descr[], __attribute__ ((unused)) void *_args)
  954. {
  955. int nx = STARPU_COMPLEX_GET_NX(descr[0]);
  956. double *real = STARPU_COMPLEX_GET_REAL(descr[0]);
  957. double *imaginary = STARPU_COMPLEX_GET_IMAGINARY(descr[0]);
  958. int i;
  959. for(i=0 ; i<nx ; i++)
  960. {
  961. fprintf(stderr, "Complex[%d] = %3.2f + %3.2f i\n", i, real[i], imaginary[i]);
  962. }
  963. }
  964. \endcode
  965. The whole code for this complex data interface is available in the
  966. directory <c>examples/interface/</c>.
  967. \section Setting_the_Data_Handles_for_a_Task Setting the Data Handles for a Task
  968. The number of data a task can manage is fixed by the
  969. <c>STARPU_NMAXBUFS</c> which has a default value which can be changed
  970. through the configure option <c>--enable-maxbuffers</c> (see
  971. @ref{--enable-maxbuffers}).
  972. However, it is possible to define tasks managing more data by using
  973. the field <c>dyn_handles</c> when defining a task and the field
  974. <c>dyn_modes</c> when defining the corresponding codelet.
  975. \code{.c}
  976. enum starpu_data_access_mode modes[STARPU_NMAXBUFS+1] = {
  977. STARPU_R, STARPU_R, ...
  978. };
  979. struct starpu_codelet dummy_big_cl =
  980. {
  981. .cuda_funcs = {dummy_big_kernel, NULL},
  982. .opencl_funcs = {dummy_big_kernel, NULL},
  983. .cpu_funcs = {dummy_big_kernel, NULL},
  984. .nbuffers = STARPU_NMAXBUFS+1,
  985. .dyn_modes = modes
  986. };
  987. task = starpu_task_create();
  988. task->cl = &dummy_big_cl;
  989. task->dyn_handles = malloc(task->cl->nbuffers * sizeof(starpu_data_handle_t));
  990. for(i=0 ; i<task->cl->nbuffers ; i++)
  991. {
  992. task->dyn_handles[i] = handle;
  993. }
  994. starpu_task_submit(task);
  995. \endcode
  996. \code{.c}
  997. starpu_data_handle_t *handles = malloc(dummy_big_cl.nbuffers * sizeof(starpu_data_handle_t));
  998. for(i=0 ; i<dummy_big_cl.nbuffers ; i++)
  999. {
  1000. handles[i] = handle;
  1001. }
  1002. starpu_insert_task(&dummy_big_cl,
  1003. STARPU_VALUE, &dummy_big_cl.nbuffers, sizeof(dummy_big_cl.nbuffers),
  1004. STARPU_DATA_ARRAY, handles, dummy_big_cl.nbuffers,
  1005. 0);
  1006. \endcode
  1007. The whole code for this example is available in the
  1008. file <c>examples/basic_examples/dynamic_handles.c</c>.
  1009. \section More_examples More examples
  1010. More examples are available in the StarPU sources in the <c>examples/</c>
  1011. directory. Simple examples include:
  1012. <dl>
  1013. <dt> <c>incrementer/</c> </dt>
  1014. <dd> Trivial incrementation test. </dd>
  1015. <dt> <c>basic_examples/</c> </dt>
  1016. <dd>
  1017. Simple documented Hello world and vector/scalar product (as
  1018. shown in \ref basicExamples), matrix
  1019. product examples (as shown in \ref Performance_model_example), an example using the blocked matrix data
  1020. interface, an example using the variable data interface, and an example
  1021. using different formats on CPUs and GPUs.
  1022. </dd>
  1023. <dt> <c>matvecmult/</c></dt>
  1024. <dd>
  1025. OpenCL example from NVidia, adapted to StarPU.
  1026. </dd>
  1027. <dt> <c>axpy/</c></dt>
  1028. <dd>
  1029. AXPY CUBLAS operation adapted to StarPU.
  1030. </dd>
  1031. <dt> <c>fortran/</c> </dt>
  1032. <dd>
  1033. Example of Fortran bindings.
  1034. </dd>
  1035. </dl>
  1036. More advanced examples include:
  1037. <dl>
  1038. <dt><c>filters/</c></dt>
  1039. <dd>
  1040. Examples using filters, as shown in \ref Partitioning_Data.
  1041. </dd>
  1042. <dt><c>lu/</c></dt>
  1043. <dd>
  1044. LU matrix factorization, see for instance <c>xlu_implicit.c</c>
  1045. </dd>
  1046. <dt><c>cholesky/</c></dt>
  1047. <dd>
  1048. Cholesky matrix factorization, see for instance <c>cholesky_implicit.c</c>.
  1049. </dd>
  1050. </dl>
  1051. */