@c -*-texinfo-*-
@c This file is part of the StarPU Handbook.
@c Copyright (C) 2009--2011 Universit@'e de Bordeaux 1
@c Copyright (C) 2010, 2011, 2012 Centre National de la Recherche Scientifique
@c Copyright (C) 2011 Institut National de Recherche en Informatique et Automatique
@c See the file starpu.texi for copying conditions.

@menu
* Using multiple implementations of a codelet::
* Enabling implementation according to capabilities::
* Task and Worker Profiling::
* Partitioning Data::           Partitioning Data
* Performance model example::
* Theoretical lower bound on execution time::
* Insert Task Utility::
* Parallel Tasks::
* Debugging::
* The multiformat interface::
* On-GPU rendering::
* More examples::               More examples shipped with StarPU
@end menu
@node Using multiple implementations of a codelet
@section Using multiple implementations of a codelet

One may want to write multiple implementations of a codelet for a single type of
device and let StarPU choose which one to run. As an example, we will show how
to use SSE to scale a vector. The codelet can be written as follows:
@cartouche
@smallexample
#include <xmmintrin.h>

void scal_sse_func(void *buffers[], void *cl_arg)
@{
    float *vector = (float *) STARPU_VECTOR_GET_PTR(buffers[0]);
    unsigned int n = STARPU_VECTOR_GET_NX(buffers[0]);
    unsigned int n_iterations = n/4;
    if (n % 4 != 0)
        n_iterations++;

    __m128 *VECTOR = (__m128*) vector;
    __m128 factor __attribute__((aligned(16)));
    factor = _mm_set1_ps(*(float *) cl_arg);

    unsigned int i;
    for (i = 0; i < n_iterations; i++)
        VECTOR[i] = _mm_mul_ps(factor, VECTOR[i]);
@}
@end smallexample
@end cartouche
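The codelet structure below also references a plain C implementation,
@code{scal_cpu_func}, which is not shown above. A minimal sketch of it,
consistent with the SSE variant, could be:

@cartouche
@smallexample
void scal_cpu_func(void *buffers[], void *cl_arg)
@{
    float *vector = (float *) STARPU_VECTOR_GET_PTR(buffers[0]);
    unsigned int n = STARPU_VECTOR_GET_NX(buffers[0]);
    float factor = *(float *) cl_arg;

    unsigned int i;
    for (i = 0; i < n; i++)
        vector[i] *= factor;
@}
@end smallexample
@end cartouche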
@cartouche
@smallexample
struct starpu_codelet cl = @{
    .where = STARPU_CPU,
    .cpu_funcs = @{ scal_cpu_func, scal_sse_func, NULL @},
    .nbuffers = 1,
    .modes = @{ STARPU_RW @}
@};
@end smallexample
@end cartouche
Schedulers which are multi-implementation aware (only @code{dmda}, @code{heft}
and @code{pheft} for now) will use the performance models of all the
implementations they were given, and pick the one that seems to be the fastest.
@node Enabling implementation according to capabilities
@section Enabling implementation according to capabilities

Some implementations may not run on some devices. For instance, some CUDA
devices do not support double precision floating point, so the kernel
execution would simply fail; or the device may not have enough shared memory
for the implementation being used. The @code{can_execute} field of the
@code{struct starpu_codelet} structure makes it possible to express this. For
instance:
@cartouche
@smallexample
static int can_execute(unsigned workerid, struct starpu_task *task, unsigned nimpl)
@{
    const struct cudaDeviceProp *props;
    if (starpu_worker_get_type(workerid) == STARPU_CPU_WORKER)
        return 1;
    /* CUDA device */
    props = starpu_cuda_get_device_properties(workerid);
    if (props->major >= 2 || props->minor >= 3)
        /* At least compute capability 1.3, supports doubles */
        return 1;
    /* Old card, does not support doubles */
    return 0;
@}

struct starpu_codelet cl = @{
    .where = STARPU_CPU|STARPU_CUDA,
    .can_execute = can_execute,
    .cpu_funcs = @{ cpu_func, NULL @},
    .cuda_funcs = @{ gpu_func, NULL @},
    .nbuffers = 1,
    .modes = @{ STARPU_RW @}
@};
@end smallexample
@end cartouche
This can be essential e.g. when running on a machine which mixes various models
of CUDA devices, to benefit from the new models without crashing on old ones.

Note: the @code{can_execute} function is called by the scheduler each time it
tries to match a task with a worker, and should thus be very fast. The
@code{starpu_cuda_get_device_properties} function provides quick access to the
CUDA properties of CUDA devices to achieve such efficiency.

Another example is compiling CUDA code for various compute capabilities,
resulting in two CUDA functions, e.g. @code{scal_gpu_13} for compute capability
1.3, and @code{scal_gpu_20} for compute capability 2.0. Both functions can be
provided to StarPU by using @code{cuda_funcs}, and @code{can_execute} can then be
used to rule out the @code{scal_gpu_20} variant on a CUDA device which
will not be able to execute it:
@cartouche
@smallexample
static int can_execute(unsigned workerid, struct starpu_task *task, unsigned nimpl)
@{
    const struct cudaDeviceProp *props;
    if (starpu_worker_get_type(workerid) == STARPU_CPU_WORKER)
        return 1;
    /* CUDA device */
    if (nimpl == 0)
        /* Trying to execute the 1.3 capability variant,
         * we assume it is ok in all cases. */
        return 1;
    /* Trying to execute the 2.0 capability variant,
     * check that the card can do it. */
    props = starpu_cuda_get_device_properties(workerid);
    if (props->major >= 2)
        /* At least compute capability 2.0, can run it */
        return 1;
    /* Old card, does not support 2.0,
     * will not be able to execute the 2.0 variant. */
    return 0;
@}

struct starpu_codelet cl = @{
    .where = STARPU_CPU|STARPU_CUDA,
    .can_execute = can_execute,
    .cpu_funcs = @{ cpu_func, NULL @},
    .cuda_funcs = @{ scal_gpu_13, scal_gpu_20, NULL @},
    .nbuffers = 1,
    .modes = @{ STARPU_RW @}
@};
@end smallexample
@end cartouche
Note: the most generic variant should be provided first, as some schedulers are
not able to try the different variants.

@node Task and Worker Profiling
@section Task and Worker Profiling

A full example showing how to use the profiling API is available in
the StarPU sources in the directory @code{examples/profiling/}.
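Profiling information is only gathered when profiling has been enabled, either
by setting the @code{STARPU_PROFILING} environment variable to 1, or
programmatically. A minimal sketch of the latter:

@cartouche
@smallexample
/* Enable profiling, typically right after starpu_init() */
starpu_profiling_status_set(STARPU_PROFILING_ENABLE);
@end smallexample
@end cartouche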
@cartouche
@smallexample
struct starpu_task *task = starpu_task_create();
task->cl = &cl;
task->synchronous = 1;
/* We will destroy the task structure by hand so that we can
 * query the profiling info before the task is destroyed. */
task->destroy = 0;

/* Submit and wait for completion (since synchronous was set to 1) */
starpu_task_submit(task);

/* The task is finished, get profiling information */
struct starpu_task_profiling_info *info = task->profiling_info;

/* How much time did it take before the task started? */
double delay = starpu_timing_timespec_delay_us(&info->submit_time, &info->start_time);

/* How long was the task execution? */
double length = starpu_timing_timespec_delay_us(&info->start_time, &info->end_time);

/* We no longer need the task structure */
starpu_task_destroy(task);
@end smallexample
@end cartouche
@cartouche
@smallexample
/* Display the occupancy of all workers during the test */
int worker;
for (worker = 0; worker < starpu_worker_get_count(); worker++)
@{
    struct starpu_worker_profiling_info worker_info;
    int ret = starpu_worker_get_profiling_info(worker, &worker_info);
    STARPU_ASSERT(!ret);

    double total_time = starpu_timing_timespec_to_us(&worker_info.total_time);
    double executing_time = starpu_timing_timespec_to_us(&worker_info.executing_time);
    double sleeping_time = starpu_timing_timespec_to_us(&worker_info.sleeping_time);

    float executing_ratio = 100.0*executing_time/total_time;
    float sleeping_ratio = 100.0*sleeping_time/total_time;

    char workername[128];
    starpu_worker_get_name(worker, workername, 128);
    fprintf(stderr, "Worker %s:\n", workername);
    fprintf(stderr, "\ttotal time: %.2lf ms\n", total_time*1e-3);
    fprintf(stderr, "\texec time: %.2lf ms (%.2f %%)\n", executing_time*1e-3,
            executing_ratio);
    fprintf(stderr, "\tblocked time: %.2lf ms (%.2f %%)\n", sleeping_time*1e-3,
            sleeping_ratio);
@}
@end smallexample
@end cartouche
@node Partitioning Data
@section Partitioning Data

An existing piece of data can be partitioned into sub-parts to be used by
different tasks, for instance:

@cartouche
@smallexample
int vector[NX];
starpu_data_handle_t handle;

/* Declare data to StarPU */
starpu_vector_data_register(&handle, 0, (uintptr_t)vector, NX, sizeof(vector[0]));

/* Partition the vector in PARTS sub-vectors */
struct starpu_data_filter f =
@{
    .filter_func = starpu_block_filter_func_vector,
    .nchildren = PARTS
@};
starpu_data_partition(handle, &f);
@end smallexample
@end cartouche
The task submission then uses @code{starpu_data_get_sub_data} to retrieve the
sub-handles to be passed as task parameters.
@cartouche
@smallexample
/* Submit a task on each sub-vector */
for (i=0; i<starpu_data_get_nb_children(handle); i++) @{
    /* Get subdata number i (there is only 1 dimension) */
    starpu_data_handle_t sub_handle = starpu_data_get_sub_data(handle, 1, i);
    struct starpu_task *task = starpu_task_create();

    task->handles[0] = sub_handle;
    task->cl = &cl;
    task->synchronous = 1;
    task->cl_arg = &factor;
    task->cl_arg_size = sizeof(factor);

    starpu_task_submit(task);
@}
@end smallexample
@end cartouche
Partitioning can be applied several times, see
@code{examples/basic_examples/mult.c} and @code{examples/filters/}.
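Once all the tasks working on the sub-handles have completed, the pieces can be
gathered back into the initial handle. A minimal sketch:

@cartouche
@smallexample
/* Gather the sub-vectors back into the main handle, in main memory (node 0) */
starpu_data_unpartition(handle, 0);

/* The whole vector can now be used again through the original handle,
 * or unregistered to hand the data back to the application */
starpu_data_unregister(handle);
@end smallexample
@end cartouche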
Whenever the whole piece of data is already available, the partitioning will
be done in-place, i.e. without allocating new buffers but just using pointers
inside the existing copy. This is particularly important to be aware of when
using OpenCL, where the kernel parameters are not pointers, but handles. The
kernel thus also needs to be passed the offset within the OpenCL buffer:
@cartouche
@smallexample
void opencl_func(void *buffers[], void *cl_arg)
@{
    cl_mem vector = (cl_mem) STARPU_VECTOR_GET_DEV_HANDLE(buffers[0]);
    unsigned offset = STARPU_VECTOR_GET_OFFSET(buffers[0]);

    ...
    clSetKernelArg(kernel, 0, sizeof(vector), &vector);
    clSetKernelArg(kernel, 1, sizeof(offset), &offset);
    ...
@}
@end smallexample
@end cartouche
And the kernel has to shift the pointer passed by the OpenCL driver accordingly:

@cartouche
@smallexample
__kernel void opencl_kernel(__global int *vector, unsigned offset)
@{
    vector = (__global int *)((__global char *)vector + offset);
    ...
@}
@end smallexample
@end cartouche
@node Performance model example
@section Performance model example

To achieve good scheduling, StarPU scheduling policies need to be able to
estimate in advance the duration of a task. This is done by giving codelets
a performance model, i.e. by defining a @code{starpu_perfmodel} structure and
providing its address in the @code{model} field of the @code{struct starpu_codelet}
structure. The @code{symbol} and @code{type} fields of @code{starpu_perfmodel}
are mandatory: they give the model a name and specify its type, since
there are several kinds of performance models.
@itemize
@item
Measured at runtime (@code{STARPU_HISTORY_BASED} model type). This assumes that for a
given set of data input/output sizes, the performance will always be about the
same. This is very true for regular kernels on GPUs for instance (<0.1% error),
and just a bit less true on CPUs (~=1% error). This also assumes that there are
few different sets of data input/output sizes. StarPU will then keep record of
the average time of previous executions on the various processing units, and use
it as an estimation. History is done per task size, by using a hash of the input
and output sizes as an index.
It will also save it in @code{~/.starpu/sampling/codelets}
for further executions, and it can be observed by using the
@code{starpu_perfmodel_display} command, or drawn by using
the @code{starpu_perfmodel_plot} command. The models are indexed by machine name. To
share the models between machines (e.g. for a homogeneous cluster), use
@code{export STARPU_HOSTNAME=some_global_name}. Measurements are only done
when using a task scheduler which makes use of them, such as @code{heft} or
@code{dmda}.

If e.g. the code is recompiled with other compilation options, or several
variants of the code are used, the symbol string should be changed to reflect
that, in order to recalibrate a new model from zero. The symbol string can even
be constructed dynamically at execution time, as long as this is done before
submitting any task using it. The following is a small code example.
@cartouche
@smallexample
static struct starpu_perfmodel mult_perf_model = @{
    .type = STARPU_HISTORY_BASED,
    .symbol = "mult_perf_model"
@};

struct starpu_codelet cl = @{
    .where = STARPU_CPU,
    .cpu_funcs = @{ cpu_mult, NULL @},
    .nbuffers = 3,
    .modes = @{ STARPU_R, STARPU_R, STARPU_W @},
    /* for the scheduling policy to be able to use performance models */
    .model = &mult_perf_model
@};
@end smallexample
@end cartouche
@item
Measured at runtime and refined by regression (@code{STARPU_*REGRESSION_BASED}
model type). This still assumes performance regularity, but can work
with various data input sizes, by applying regression over observed
execution times. @code{STARPU_REGRESSION_BASED} uses an @code{a*n^b} regression
form, while @code{STARPU_NL_REGRESSION_BASED} uses an @code{a*n^b+c} form (more
precise than @code{STARPU_REGRESSION_BASED}, but a lot more expensive to
compute). For instance,
@code{tests/perfmodels/regression_based.c} uses a regression-based performance
model for the @code{memset} operation. Of course, the application has to issue
tasks with varying size so that the regression can be computed. StarPU will not
trust the regression unless there is at least 10% difference between the minimum
and maximum observed input size. For non-linear regression, since computing it
is quite expensive, it is only done at termination of the application. This
means that the first execution uses a history-based performance model to perform
scheduling.
@item
Provided as an estimation from the application itself (@code{STARPU_COMMON}
model type and @code{cost_function} field), see for instance
@code{examples/common/blas_model.h} and @code{examples/common/blas_model.c}.
@item
Provided explicitly by the application (@code{STARPU_PER_ARCH} model type): the
@code{.per_arch[arch][nimpl].cost_function} fields have to be filled with pointers to
functions which return the expected duration of the task in micro-seconds, one
per architecture; a sketch is given after this list.
@end itemize
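A minimal sketch of a @code{STARPU_PER_ARCH} model for a vector kernel could
look as follows. The exact cost function signature shown here is an assumption
to be checked against the installed StarPU headers, and the linear cost formula
is purely hypothetical:

@cartouche
@smallexample
static double cpu_cost_function(struct starpu_task *task,
                                enum starpu_perf_archtype arch, unsigned nimpl)
@{
    /* Purely hypothetical linear cost, in micro-seconds,
     * derived from the size of the first task data */
    uint32_t n = starpu_vector_get_nx(task->handles[0]);
    return 0.001 * n;
@}

static struct starpu_perfmodel perarch_model = @{
    .type = STARPU_PER_ARCH,
    .symbol = "perarch_model"
@};

/* To be done once, before submitting tasks, e.g. in main() */
perarch_model.per_arch[STARPU_CPU_DEFAULT][0].cost_function = cpu_cost_function;
@end smallexample
@end cartouche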
For the @code{STARPU_HISTORY_BASED} and @code{STARPU_*REGRESSION_BASED} models,
the total size of task data (both input and output) is used as an index by
default. The @code{size_base} field of @code{struct starpu_perfmodel} however
permits the application to override that, when for instance some of the data
do not matter for the task cost (e.g. a mere reference table), or when using
sparse structures (in which case it is the number of non-zeros which matters),
or when there is some hidden parameter such as the number of iterations, etc.

How to use schedulers which can benefit from such performance models is
explained in @ref{Task scheduling policy}.
The same can be done for task power consumption estimation, by setting the
@code{power_model} field the same way as the @code{model} field. Note: for
now, the application has to give the power consumption performance model
a name which is different from that of the execution time performance model.
The application can request time estimations from the StarPU performance
models by filling a task structure as usual without actually submitting
it. The data handles can be created by calling @code{starpu_data_register}
functions with a @code{NULL} pointer (and need to be unregistered as usual)
and the desired data sizes. The @code{starpu_task_expected_length} and
@code{starpu_task_expected_power} functions can then be called to get an
estimation of the task duration on a given arch. @code{starpu_task_destroy}
needs to be called to destroy the dummy task afterwards. See
@code{tests/perfmodels/regression_based.c} for an example.
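A minimal sketch of such a query follows; the exact parameters of
@code{starpu_task_expected_length} shown here (performance archtype and
implementation number) are an assumption to be checked against the installed
StarPU headers:

@cartouche
@smallexample
starpu_data_handle_t handle;
/* Register a dummy vector of n floats, without any actual buffer */
starpu_vector_data_register(&handle, -1, (uintptr_t)NULL, n, sizeof(float));

struct starpu_task *task = starpu_task_create();
task->cl = &cl;
task->handles[0] = handle;
task->destroy = 0;

/* Expected duration (in micro-seconds) on a CPU core, for implementation 0 */
double length = starpu_task_expected_length(task, STARPU_CPU_DEFAULT, 0);

starpu_task_destroy(task);
starpu_data_unregister(handle);
@end smallexample
@end cartouche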
@node Theoretical lower bound on execution time
@section Theoretical lower bound on execution time

For kernels with history-based performance models, StarPU can very easily
provide a theoretical lower bound for the execution time of a whole set of
tasks. See for instance @code{examples/lu/lu_example.c}: before submitting
tasks, call @code{starpu_bound_start}, and after complete execution, call
@code{starpu_bound_stop}. @code{starpu_bound_print_lp} or
@code{starpu_bound_print_mps} can then be used to output a Linear Programming
problem corresponding to the schedule of your tasks. Run it through
@code{lp_solve} or any other linear programming solver, and that will give you a
lower bound for the total execution time of your tasks. If StarPU was compiled
with the glpk library installed, @code{starpu_bound_compute} can be used to
solve it immediately and get the optimized minimum, in ms. Its @code{integer}
parameter makes it possible to decide whether integer resolution should be
computed and returned too.
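A minimal sketch of this usage; the exact parameters of the
@code{starpu_bound_*} functions shown here are assumptions to be checked
against the installed StarPU headers:

@cartouche
@smallexample
starpu_bound_start(0, 0);  /* deps = 0, prio = 0, see below */

/* ... submit the tasks ... */
starpu_task_wait_for_all();

starpu_bound_stop();

double min, min_integer;
/* Requires StarPU to be built with glpk support */
starpu_bound_compute(&min, &min_integer, 0);
fprintf(stderr, "theoretical lower bound: %f ms\n", min);
@end smallexample
@end cartouche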
The @code{deps} parameter tells StarPU whether to take tasks and implicit data
dependencies into account. Note that the linear programming problem size is
quadratic in the number of tasks, so the time to solve it can become very
long; it can take minutes for just a few dozen tasks. You should
probably use @code{lp_solve -timeout 1 test.lp -wmps test.mps} to convert the
problem to MPS format and then use a better solver: @code{glpsol} might be
better than @code{lp_solve} for instance (the @code{--pcost} option may be
useful), but sometimes does not manage to converge. @code{cbc} might look
slower, but it is parallel. Be sure to try at least all the @code{-B} options
of @code{lp_solve}. For instance, we often just use
@code{lp_solve -cc -B1 -Bb -Bg -Bp -Bf -Br -BG -Bd -Bs -BB -Bo -Bc -Bi}, and
the @code{-gr} option can also be quite useful.
Setting @code{deps} to 0 will only take into account the actual computations
on processing units. It however still properly takes into account the varying
performances of kernels and processing units, which is much more accurate than
just comparing StarPU performances with the fastest of the kernels being used.

The @code{prio} parameter tells StarPU whether to simulate taking into account
the priorities as the StarPU scheduler would, i.e. schedule prioritized
tasks before less prioritized tasks, to check to what extent this results
in a less optimal solution. This increases the computation time even more.

Note that for simplicity, all this does not take data transfers into account;
they are assumed to be completely overlapped.
@node Insert Task Utility
@section Insert Task Utility

StarPU provides the wrapper function @code{starpu_insert_task} to ease
the creation and submission of tasks.

@deftypefun int starpu_insert_task (struct starpu_codelet *@var{cl}, ...)
Create and submit a task corresponding to @var{cl} with the following
arguments. The argument list must be zero-terminated.

The arguments following the codelet can be of the following types:
@itemize
@item
@code{STARPU_R}, @code{STARPU_W}, @code{STARPU_RW}, @code{STARPU_SCRATCH},
@code{STARPU_REDUX}: an access mode followed by a data handle;
@item
the specific values @code{STARPU_VALUE}, @code{STARPU_CALLBACK},
@code{STARPU_CALLBACK_ARG}, @code{STARPU_CALLBACK_WITH_ARG},
@code{STARPU_PRIORITY}, followed by the appropriate objects as
defined below.
@end itemize

Parameters to be passed to the codelet implementation are defined
through the type @code{STARPU_VALUE}. The function
@code{starpu_codelet_unpack_args} must be called within the codelet
implementation to retrieve them.
@end deftypefun
@defmac STARPU_VALUE
This macro is used when calling @code{starpu_insert_task}, and must be
followed by a pointer to a constant value and the size of the constant.
@end defmac

@defmac STARPU_CALLBACK
This macro is used when calling @code{starpu_insert_task}, and must be
followed by a pointer to a callback function.
@end defmac

@defmac STARPU_CALLBACK_ARG
This macro is used when calling @code{starpu_insert_task}, and must be
followed by a pointer to be given as an argument to the callback
function.
@end defmac

@defmac STARPU_CALLBACK_WITH_ARG
This macro is used when calling @code{starpu_insert_task}, and must be
followed by two pointers: one to a callback function, and the other to
be given as an argument to the callback function; this is equivalent
to using both @code{STARPU_CALLBACK} and
@code{STARPU_CALLBACK_ARG}.
@end defmac

@defmac STARPU_PRIORITY
This macro is used when calling @code{starpu_insert_task}, and must be
followed by an integer defining a priority level.
@end defmac
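For instance, a hypothetical call combining several of these macros could look
as follows; @code{my_callback} and the priority value @code{42} are
application-provided, not part of the StarPU API:

@smallexample
void my_callback(void *arg)
@{
    /* Executed once the task completes; arg is the pointer that was
     * given after STARPU_CALLBACK_WITH_ARG */
    fprintf(stderr, "task for %s is done\n", (char *) arg);
@}

starpu_insert_task(&mycodelet,
                   STARPU_VALUE, &ifactor, sizeof(ifactor),
                   STARPU_CALLBACK_WITH_ARG, my_callback, "vector A",
                   STARPU_PRIORITY, 42,
                   STARPU_RW, data_handles[0],
                   0);
@end smallexample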
@deftypefun void starpu_codelet_pack_args ({char **}@var{arg_buffer}, {size_t *}@var{arg_buffer_size}, ...)
Pack arguments of type @code{STARPU_VALUE} into a buffer which can be
given to a codelet and later unpacked with the function
@code{starpu_codelet_unpack_args} defined below.
@end deftypefun

@deftypefun void starpu_codelet_unpack_args ({void *}@var{cl_arg}, ...)
Retrieve the arguments of type @code{STARPU_VALUE} associated to a
task automatically created using the function
@code{starpu_insert_task} defined above.
@end deftypefun
Here is the implementation of the codelet:

@smallexample
void func_cpu(void *descr[], void *_args)
@{
    int *x0 = (int *)STARPU_VARIABLE_GET_PTR(descr[0]);
    float *x1 = (float *)STARPU_VARIABLE_GET_PTR(descr[1]);
    int ifactor;
    float ffactor;

    starpu_codelet_unpack_args(_args, &ifactor, &ffactor);
    *x0 = *x0 * ifactor;
    *x1 = *x1 * ffactor;
@}

struct starpu_codelet mycodelet = @{
    .where = STARPU_CPU,
    .cpu_funcs = @{ func_cpu, NULL @},
    .nbuffers = 2,
    .modes = @{ STARPU_RW, STARPU_RW @}
@};
@end smallexample
And the call to the @code{starpu_insert_task} wrapper:

@smallexample
starpu_insert_task(&mycodelet,
                   STARPU_VALUE, &ifactor, sizeof(ifactor),
                   STARPU_VALUE, &ffactor, sizeof(ffactor),
                   STARPU_RW, data_handles[0], STARPU_RW, data_handles[1],
                   0);
@end smallexample
The call to @code{starpu_insert_task} is equivalent to the following
code:

@smallexample
struct starpu_task *task = starpu_task_create();
task->cl = &mycodelet;
task->handles[0] = data_handles[0];
task->handles[1] = data_handles[1];

char *arg_buffer;
size_t arg_buffer_size;
starpu_codelet_pack_args(&arg_buffer, &arg_buffer_size,
                         STARPU_VALUE, &ifactor, sizeof(ifactor),
                         STARPU_VALUE, &ffactor, sizeof(ffactor),
                         0);
task->cl_arg = arg_buffer;
task->cl_arg_size = arg_buffer_size;

int ret = starpu_task_submit(task);
@end smallexample
If some part of the task insertion depends on the value of some computation,
the @code{STARPU_DATA_ACQUIRE_CB} macro can be very convenient. For
instance, assuming that the index variable @code{i} was registered as handle
@code{i_handle}:

@smallexample
/* Compute which portion we will work on, e.g. pivot */
starpu_insert_task(&which_index, STARPU_W, i_handle, 0);

/* And submit the corresponding task */
STARPU_DATA_ACQUIRE_CB(i_handle, STARPU_R,
                       starpu_insert_task(&work, STARPU_RW, A_handle[i], 0));
@end smallexample

The @code{STARPU_DATA_ACQUIRE_CB} macro submits an asynchronous request for
acquiring data @code{i} for the main application, and will execute the code
given as third parameter when it is acquired. In other words, as soon as the
value of @code{i} computed by the @code{which_index} codelet can be read, the
portion of code passed as third parameter of @code{STARPU_DATA_ACQUIRE_CB} will
be executed, and is allowed to read from @code{i} to use it e.g. as an
index. Note that this macro is only available when compiling StarPU with
the @code{gcc} compiler.
@node Parallel Tasks
@section Parallel Tasks

StarPU can leverage existing parallel computation libraries by the means of
parallel tasks. A parallel task is a task which is worked on by a set of CPUs
(called a parallel or combined worker) at the same time, by using an existing
parallel CPU implementation of the computation to be achieved. This can also be
useful to improve the load balance between slow CPUs and fast GPUs: since CPUs
work collectively on a single task, the completion time of tasks on CPUs becomes
comparable to the completion time on GPUs, thus relieving concerns about
granularity discrepancy.

Two modes of execution exist, to accommodate existing usages.
@subsection Fork-mode parallel tasks

In the Fork mode, StarPU will call the codelet function on one
of the CPUs of the combined worker. The codelet function can use
@code{starpu_combined_worker_get_size()} to get the number of threads it is
allowed to start to achieve the computation. The CPU binding mask is already
enforced, so that threads created by the function will inherit the mask, and
thus execute where StarPU expected. For instance, using OpenMP (full source is
available in @code{examples/openmp/vector_scal.c}):

@example
void scal_cpu_func(void *buffers[], void *_args)
@{
    unsigned i;
    float *factor = _args;
    struct starpu_vector_interface *vector = buffers[0];
    unsigned n = STARPU_VECTOR_GET_NX(vector);
    float *val = (float *)STARPU_VECTOR_GET_PTR(vector);

#pragma omp parallel for num_threads(starpu_combined_worker_get_size())
    for (i = 0; i < n; i++)
        val[i] *= *factor;
@}

static struct starpu_codelet cl =
@{
    .modes = @{ STARPU_RW @},
    .where = STARPU_CPU,
    .type = STARPU_FORKJOIN,
    .max_parallelism = INT_MAX,
    .cpu_funcs = @{scal_cpu_func, NULL@},
    .nbuffers = 1,
@};
@end example
Other examples include for instance calling a BLAS parallel CPU implementation
(see @code{examples/mult/xgemm.c}).

@subsection SPMD-mode parallel tasks

In the SPMD mode, StarPU will call the codelet function on
each CPU of the combined worker. The codelet function can use
@code{starpu_combined_worker_get_size()} to get the total number of CPUs
involved in the combined worker, and thus the number of calls that are made in
parallel to the function, and @code{starpu_combined_worker_get_rank()} to get
the rank of the current CPU within the combined worker. For instance:
@example
static void func(void *buffers[], void *_args)
@{
    unsigned i;
    float *factor = _args;
    struct starpu_vector_interface *vector = buffers[0];
    unsigned n = STARPU_VECTOR_GET_NX(vector);
    float *val = (float *)STARPU_VECTOR_GET_PTR(vector);

    /* Compute slice to compute */
    unsigned m = starpu_combined_worker_get_size();
    unsigned j = starpu_combined_worker_get_rank();
    unsigned slice = (n+m-1)/m;

    for (i = j * slice; i < (j+1) * slice && i < n; i++)
        val[i] *= *factor;
@}

static struct starpu_codelet cl =
@{
    .modes = @{ STARPU_RW @},
    .where = STARPU_CPU,
    .type = STARPU_SPMD,
    .max_parallelism = INT_MAX,
    .cpu_funcs = @{ func, NULL @},
    .nbuffers = 1,
@};
@end example
Of course, this trivial example will not really benefit from parallel task
execution, and was only meant to be simple to understand. The benefit comes
when the computation to be done is such that threads have to e.g. exchange
intermediate results, or write to the data in a complex but safe way in the
same buffer.
@subsection Parallel tasks performance

To benefit from parallel tasks, a parallel-task-aware StarPU scheduler has to
be used. When exposed to codelets with a Fork or SPMD flag, the @code{pheft}
(parallel-heft) and @code{pgreedy} (parallel greedy) schedulers will indeed also
try to execute tasks with several CPUs. They will automatically try the various
available combined worker sizes, and thus be able to avoid choosing a large
combined worker if the codelet does not actually scale that much.
@subsection Combined worker sizes

By default, StarPU creates combined workers according to the architecture
structure as detected by hwloc. This means that for each object of the hwloc
topology (NUMA node, socket, cache, ...) a combined worker will be created. If
some nodes of the hierarchy have a big arity (e.g. many cores in a socket
without a hierarchy of shared caches), StarPU will create combined workers of
intermediate sizes.
@subsection Concurrent parallel tasks

Unfortunately, many environments and libraries do not support concurrent
calls.

For instance, most OpenMP implementations (including the main ones) do not
support concurrent @code{pragma omp parallel} statements without nesting them in
another @code{pragma omp parallel} statement, but StarPU does not yet support
creating its CPU workers by using such a pragma.

Other parallel libraries are also not safe when being invoked concurrently
from different threads, due to the use of global variables in their sequential
sections for instance.

The solution is then to use only a single combined worker, spanning all
the CPUs. This can be done by setting @code{single_combined_worker}
to 1 in the @code{starpu_conf} structure, or by setting the
@code{STARPU_SINGLE_COMBINED_WORKER} environment variable to 1. StarPU will then
use parallel tasks only over all the CPUs at the same time.
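A minimal sketch of the programmatic variant:

@example
struct starpu_conf conf;
starpu_conf_init(&conf);
/* Only use a single combined worker, spanning all the CPUs */
conf.single_combined_worker = 1;

int ret = starpu_init(&conf);
@end example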
@node Debugging
@section Debugging

StarPU provides several tools to help debugging applications. Execution traces
can be generated and displayed graphically, see @ref{Generating traces}. Some
gdb helpers are also provided to show the whole StarPU state:

@smallexample
(gdb) source tools/gdbinit
(gdb) help starpu
@end smallexample
@node The multiformat interface
@section The multiformat interface

It may be interesting to represent the same piece of data using two different
data structures: one that would only be used on CPUs, and one that would only
be used on GPUs. This can be done by using the multiformat interface. StarPU
will be able to convert data from one data structure to the other when needed.
Note that the @code{heft} scheduler is the only one optimized for this
interface. The user must provide StarPU with conversion codelets:
@cartouche
@smallexample
#define NX 1024
struct point array_of_structs[NX];
starpu_data_handle_t handle;

/*
 * The conversion of a piece of data is itself a task, though it is created,
 * submitted and destroyed by StarPU internals and not by the user. Therefore,
 * we have to define two codelets.
 * Note that for now the conversion from the CPU format to the GPU format has to
 * be executed on the GPU, and the conversion from the GPU to the CPU has to be
 * executed on the CPU.
 */
#ifdef STARPU_USE_OPENCL
void cpu_to_opencl_opencl_func(void *buffers[], void *args);
struct starpu_codelet cpu_to_opencl_cl = @{
    .where = STARPU_OPENCL,
    .opencl_funcs = @{ cpu_to_opencl_opencl_func, NULL @},
    .nbuffers = 1,
    .modes = @{ STARPU_RW @}
@};

void opencl_to_cpu_func(void *buffers[], void *args);
struct starpu_codelet opencl_to_cpu_cl = @{
    .where = STARPU_CPU,
    .cpu_funcs = @{ opencl_to_cpu_func, NULL @},
    .nbuffers = 1,
    .modes = @{ STARPU_RW @}
@};
#endif

struct starpu_multiformat_data_interface_ops format_ops = @{
#ifdef STARPU_USE_OPENCL
    .opencl_elemsize = 2 * sizeof(float),
    .cpu_to_opencl_cl = &cpu_to_opencl_cl,
    .opencl_to_cpu_cl = &opencl_to_cpu_cl,
#endif
    .cpu_elemsize = 2 * sizeof(float),
    ...
@};

starpu_multiformat_data_register(&handle, 0, &array_of_structs, NX, &format_ops);
@end smallexample
@end cartouche
Kernels can be written almost as for any other interface. Note that
@code{STARPU_MULTIFORMAT_GET_CPU_PTR} shall only be used for CPU kernels. CUDA
kernels must use @code{STARPU_MULTIFORMAT_GET_CUDA_PTR}, and OpenCL kernels must
use @code{STARPU_MULTIFORMAT_GET_OPENCL_PTR}. @code{STARPU_MULTIFORMAT_GET_NX}
may be used in any kind of kernel.
@cartouche
@smallexample
static void
multiformat_scal_cpu_func(void *buffers[], void *args)
@{
    struct point *aos;
    unsigned int n;

    aos = STARPU_MULTIFORMAT_GET_CPU_PTR(buffers[0]);
    n = STARPU_MULTIFORMAT_GET_NX(buffers[0]);
    ...
@}

extern "C" void multiformat_scal_cuda_func(void *buffers[], void *_args)
@{
    unsigned int n;
    struct struct_of_arrays *soa;

    soa = (struct struct_of_arrays *) STARPU_MULTIFORMAT_GET_CUDA_PTR(buffers[0]);
    n = STARPU_MULTIFORMAT_GET_NX(buffers[0]);
    ...
@}
@end smallexample
@end cartouche

A full example may be found in @code{examples/basic_examples/multiformat.c}.
@node On-GPU rendering
@section On-GPU rendering

Graphics-oriented applications need to draw the result of their computations,
typically on the very GPU where these happened. Technologies such as OpenGL/CUDA
interoperability make it possible for CUDA to work directly on the OpenGL
buffers, thus making them immediately ready for drawing, by mapping OpenGL
buffer, texture or renderbuffer objects into CUDA. To achieve this with StarPU,
it simply needs to be given the CUDA pointer at registration, for instance:
@cartouche
@smallexample
for (workerid = 0; workerid < starpu_worker_get_count(); workerid++)
    if (starpu_worker_get_type(workerid) == STARPU_CUDA_WORKER)
        break;

cudaSetDevice(starpu_worker_get_devid(workerid));
cudaGraphicsResourceGetMappedPointer((void**)&output, &num_bytes, resource);
starpu_vector_data_register(&handle, starpu_worker_get_memory_node(workerid),
                            output, num_bytes / sizeof(float4), sizeof(float4));

starpu_insert_task(&cl, STARPU_RW, handle, 0);

starpu_data_unregister(handle);

cudaSetDevice(starpu_worker_get_devid(workerid));
cudaGraphicsUnmapResources(1, &resource, 0);

/* Now display it */
@end smallexample
@end cartouche
@node More examples
@section More examples

More examples are available in the StarPU sources in the @code{examples/}
directory. Simple examples include:

@table @asis
@item @code{incrementer/}:
Trivial incrementation test.
@item @code{basic_examples/}:
Simple documented Hello world (as shown in @ref{Hello World}), vector/scalar product (as shown
in @ref{Vector Scaling on an Hybrid CPU/GPU Machine}), matrix
product examples (as shown in @ref{Performance model example}), an example using the blocked matrix data
interface, an example using the variable data interface, and an example
using different formats on CPUs and GPUs.
@item @code{matvecmult/}:
OpenCL example from NVidia, adapted to StarPU.
@item @code{axpy/}:
AXPY CUBLAS operation adapted to StarPU.
@item @code{fortran/}:
Example of Fortran bindings.
@end table
More advanced examples include:

@table @asis
@item @code{filters/}:
Examples using filters, as shown in @ref{Partitioning Data}.
@item @code{lu/}:
LU matrix factorization, see for instance @code{xlu_implicit.c}.
@item @code{cholesky/}:
Cholesky matrix factorization, see for instance @code{cholesky_implicit.c}.
@end table