
@c -*-texinfo-*-
@c This file is part of the StarPU Handbook.
@c Copyright (C) 2009--2011 Universit@'e de Bordeaux 1
@c Copyright (C) 2010, 2011 Centre National de la Recherche Scientifique
@c Copyright (C) 2011 Institut National de Recherche en Informatique et Automatique
@c See the file starpu.texi for copying conditions.

@node Advanced Examples
@chapter Advanced Examples

@menu
* Using multiple implementations of a codelet::
* Enabling implementation according to capabilities::
* Task and Worker Profiling::
* Partitioning Data::
* Performance model example::
* Theoretical lower bound on execution time::
* Insert Task Utility::
* Debugging::                    When things go wrong.
* The multiformat interface::
* More examples::                More examples shipped with StarPU
@end menu

@node Using multiple implementations of a codelet
@section Using multiple implementations of a codelet

One may want to write multiple implementations of a codelet for a single type of
device and let StarPU choose which one to run. As an example, we will show how
to use SSE to scale a vector. The codelet can be written as follows:

@cartouche
@smallexample
#include <xmmintrin.h>

void scal_sse_func(void *buffers[], void *cl_arg)
@{
    float *vector = (float *) STARPU_VECTOR_GET_PTR(buffers[0]);
    unsigned int n = STARPU_VECTOR_GET_NX(buffers[0]);
    unsigned int n_iterations = n/4;
    if (n % 4 != 0)
        n_iterations++;

    __m128 *VECTOR = (__m128*) vector;
    __m128 factor __attribute__((aligned(16)));
    factor = _mm_set1_ps(*(float *) cl_arg);

    unsigned int i;
    for (i = 0; i < n_iterations; i++)
        VECTOR[i] = _mm_mul_ps(factor, VECTOR[i]);
@}
@end smallexample
@end cartouche
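
The plain C implementation @code{scal_cpu_func} referenced by the codelet
below is not shown in this section; a minimal sketch of it, following the
vector-scaling examples used throughout this manual, could be:

@cartouche
@smallexample
void scal_cpu_func(void *buffers[], void *cl_arg)
@{
    unsigned int i;
    float *vector = (float *) STARPU_VECTOR_GET_PTR(buffers[0]);
    unsigned int n = STARPU_VECTOR_GET_NX(buffers[0]);
    float factor = *(float *) cl_arg;

    /* Scale each element of the vector in turn */
    for (i = 0; i < n; i++)
        vector[i] *= factor;
@}
@end smallexample
@end cartouche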

@cartouche
@smallexample
struct starpu_codelet cl = @{
    .where = STARPU_CPU,
    .cpu_funcs = @{ scal_cpu_func, scal_sse_func, NULL @},
    .nbuffers = 1
@};
@end smallexample
@end cartouche

Schedulers which are multi-implementation aware (only @code{dmda}, @code{heft}
and @code{pheft} for now) will use the performance models of all the
implementations they were given, and pick the one that seems to be the fastest.

@node Enabling implementation according to capabilities
@section Enabling implementation according to capabilities

Some implementations may not run on some devices. For instance, some CUDA
devices do not support double floating point precision, and thus the kernel
execution would just fail; or the device may not have enough shared memory for
the implementation being used. The @code{can_execute} field of the @code{struct
starpu_codelet} structure makes it possible to express this. For instance:

@cartouche
@smallexample
static int can_execute(unsigned workerid, struct starpu_task *task, unsigned nimpl)
@{
    const struct cudaDeviceProp *props;
    if (starpu_worker_get_type(workerid) == STARPU_CPU_WORKER)
        return 1;
    /* CUDA device */
    props = starpu_cuda_get_device_properties(workerid);
    if (props->major >= 2 || props->minor >= 3)
        /* At least compute capability 1.3, supports doubles */
        return 1;
    /* Old card, does not support doubles */
    return 0;
@}

struct starpu_codelet cl = @{
    .where = STARPU_CPU|STARPU_CUDA,
    .can_execute = can_execute,
    .cpu_funcs = @{ cpu_func, NULL @},
    .cuda_funcs = @{ gpu_func, NULL @},
    .nbuffers = 1
@};
@end smallexample
@end cartouche

This can be essential e.g. when running on a machine which mixes various models
of CUDA devices, to benefit from the newer models without crashing on old ones.

Note: the @code{can_execute} function is called by the scheduler each time it
tries to match a task with a worker, and should thus be very fast. The
@code{starpu_cuda_get_device_properties} function provides quick access to the
properties of CUDA devices to achieve such efficiency.

Another example is compiling CUDA code for various compute capabilities,
resulting in two CUDA functions, e.g. @code{scal_gpu_13} for compute capability
1.3, and @code{scal_gpu_20} for compute capability 2.0. Both functions can be
provided to StarPU by using @code{cuda_funcs}, and @code{can_execute} can then be
used to rule out the @code{scal_gpu_20} variant on a CUDA device which
will not be able to execute it:

@cartouche
@smallexample
static int can_execute(unsigned workerid, struct starpu_task *task, unsigned nimpl)
@{
    const struct cudaDeviceProp *props;
    if (starpu_worker_get_type(workerid) == STARPU_CPU_WORKER)
        return 1;
    /* CUDA device */
    if (nimpl == 0)
        /* Trying to execute the 1.3 capability variant, we assume it is ok in all cases. */
        return 1;
    /* Trying to execute the 2.0 capability variant, check that the card can do it. */
    props = starpu_cuda_get_device_properties(workerid);
    if (props->major >= 2)
        /* At least compute capability 2.0, can run it */
        return 1;
    /* Old card, does not support 2.0, will not be able to execute the 2.0 variant. */
    return 0;
@}

struct starpu_codelet cl = @{
    .where = STARPU_CPU|STARPU_CUDA,
    .can_execute = can_execute,
    .cpu_funcs = @{ cpu_func, NULL @},
    .cuda_funcs = @{ scal_gpu_13, scal_gpu_20, NULL @},
    .nbuffers = 1
@};
@end smallexample
@end cartouche

Note: the most generic variant should be provided first, as some schedulers are
not able to try the different variants.

@node Task and Worker Profiling
@section Task and Worker Profiling

A full example showing how to use the profiling API is available in
the StarPU sources in the directory @code{examples/profiling/}.
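
Profiling information is only collected once profiling has been enabled; a
minimal sketch, assuming this is done right after @code{starpu_init}:

@cartouche
@smallexample
/* Enable profiling before submitting the tasks to be measured */
starpu_profiling_status_set(STARPU_PROFILING_ENABLE);
@end smallexample
@end cartouche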

@cartouche
@smallexample
struct starpu_task *task = starpu_task_create();
task->cl = &cl;
task->synchronous = 1;
/* We will destroy the task structure by hand so that we can
 * query the profiling info before the task is destroyed. */
task->destroy = 0;

/* Submit and wait for completion (since synchronous was set to 1) */
starpu_task_submit(task);

/* The task is finished, get profiling information */
struct starpu_task_profiling_info *info = task->profiling_info;

/* How much time did it take before the task started? */
double delay = starpu_timing_timespec_delay_us(&info->submit_time, &info->start_time);

/* How long was the task execution? */
double length = starpu_timing_timespec_delay_us(&info->start_time, &info->end_time);

/* We don't need the task structure anymore */
starpu_task_destroy(task);
@end smallexample
@end cartouche

@cartouche
@smallexample
/* Display the occupancy of all workers during the test */
int worker;
for (worker = 0; worker < starpu_worker_get_count(); worker++)
@{
    struct starpu_worker_profiling_info worker_info;
    int ret = starpu_worker_get_profiling_info(worker, &worker_info);
    STARPU_ASSERT(!ret);

    double total_time = starpu_timing_timespec_to_us(&worker_info.total_time);
    double executing_time = starpu_timing_timespec_to_us(&worker_info.executing_time);
    double sleeping_time = starpu_timing_timespec_to_us(&worker_info.sleeping_time);

    float executing_ratio = 100.0*executing_time/total_time;
    float sleeping_ratio = 100.0*sleeping_time/total_time;

    char workername[128];
    starpu_worker_get_name(worker, workername, 128);
    fprintf(stderr, "Worker %s:\n", workername);
    fprintf(stderr, "\ttotal time: %.2lf ms\n", total_time*1e-3);
    fprintf(stderr, "\texec time: %.2lf ms (%.2f %%)\n", executing_time*1e-3,
            executing_ratio);
    fprintf(stderr, "\tblocked time: %.2lf ms (%.2f %%)\n", sleeping_time*1e-3,
            sleeping_ratio);
@}
@end smallexample
@end cartouche

@node Partitioning Data
@section Partitioning Data

An existing piece of data can be partitioned into sub-parts to be used by different tasks, for instance:

@cartouche
@smallexample
int vector[NX];
starpu_data_handle_t handle;

/* Declare data to StarPU */
starpu_vector_data_register(&handle, 0, (uintptr_t)vector, NX, sizeof(vector[0]));

/* Partition the vector in PARTS sub-vectors */
struct starpu_data_filter f =
@{
    .filter_func = starpu_block_filter_func_vector,
    .nchildren = PARTS
@};
starpu_data_partition(handle, &f);
@end smallexample
@end cartouche

@cartouche
@smallexample
/* Submit a task on each sub-vector */
for (i=0; i<starpu_data_get_nb_children(handle); i++) @{
    /* Get subdata number i (there is only 1 dimension) */
    starpu_data_handle_t sub_handle = starpu_data_get_sub_data(handle, 1, i);
    struct starpu_task *task = starpu_task_create();

    task->buffers[0].handle = sub_handle;
    task->buffers[0].mode = STARPU_RW;
    task->cl = &cl;
    task->synchronous = 1;
    task->cl_arg = &factor;
    task->cl_arg_size = sizeof(factor);

    starpu_task_submit(task);
@}
@end smallexample
@end cartouche

Partitioning can be applied several times, see
@code{examples/basic_examples/mult.c} and @code{examples/filters/}.
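
Once the tasks working on the sub-vectors have completed, the pieces can be
gathered back into the original handle with @code{starpu_data_unpartition}; a
minimal sketch:

@cartouche
@smallexample
/* Gather the sub-vectors back into the original handle, on memory node 0 */
starpu_data_unpartition(handle, 0);
@end smallexample
@end cartouche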

@node Performance model example
@section Performance model example

To achieve good scheduling, StarPU scheduling policies need to be able to
estimate in advance the duration of a task. This is done by giving to codelets
a performance model, by defining a @code{starpu_perfmodel} structure and
providing its address in the @code{model} field of the @code{struct starpu_codelet}
structure. The @code{symbol} and @code{type} fields of @code{starpu_perfmodel}
are mandatory: they give the model a name and specify its type, since
there are several kinds of performance models.

@itemize
@item
Measured at runtime (@code{STARPU_HISTORY_BASED} model type). This assumes that for a
given set of data input/output sizes, the performance will always be about the
same. This is very true for regular kernels on GPUs for instance (<0.1% error),
and just a bit less true on CPUs (~=1% error). This also assumes that there are
few different sets of data input/output sizes. StarPU will then keep record of
the average time of previous executions on the various processing units, and use
it as an estimation. History is done per task size, by using a hash of the input
and output sizes as an index.
StarPU will also save the history in @code{~/.starpu/sampling/codelets}
for further executions, and it can be observed by using the
@code{starpu_perfmodel_display} command, or drawn by using
@code{starpu_perfmodel_plot}. The models are indexed by machine name. To
share the models between machines (e.g. for a homogeneous cluster), use
@code{export STARPU_HOSTNAME=some_global_name}. Measurements are only done
when using a task scheduler which makes use of them, such as @code{heft} or
@code{dmda}.

If e.g. the code is recompiled with other compilation options, or several
variants of the code are used, the symbol string should be changed to reflect
that, in order to recalibrate a new model from zero. The symbol string can even
be constructed dynamically at execution time, as long as this is done before
submitting any task using it. The following is a small code example.

@cartouche
@smallexample
static struct starpu_perfmodel mult_perf_model = @{
    .type = STARPU_HISTORY_BASED,
    .symbol = "mult_perf_model"
@};

struct starpu_codelet cl = @{
    .where = STARPU_CPU,
    .cpu_funcs = @{ cpu_mult, NULL @},
    .nbuffers = 3,
    /* for the scheduling policy to be able to use performance models */
    .model = &mult_perf_model
@};
@end smallexample
@end cartouche
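
As a sketch of the dynamic symbol construction mentioned above
(@code{variant} is a hypothetical string standing for whatever distinguishes
the code variants):

@cartouche
@smallexample
/* Construct the model symbol at execution time, before any task
 * using this model is submitted */
char symbol[64];
snprintf(symbol, sizeof(symbol), "mult_perf_model_%s", variant);
mult_perf_model.symbol = strdup(symbol);
@end smallexample
@end cartouche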

@item
Measured at runtime and refined by regression (@code{STARPU_REGRESSION_*_BASED}
model type). This still assumes performance regularity, but can work
with various data input sizes, by applying regression over observed
execution times. @code{STARPU_REGRESSION_BASED} uses an a*n^b regression
form, @code{STARPU_NL_REGRESSION_BASED} uses an a*n^b+c form (more precise than
@code{STARPU_REGRESSION_BASED}, but costs a lot more to compute). For instance,
@code{tests/perfmodels/regression_based.c} uses a regression-based performance
model for the @code{memset} operation. Of course, the application has to issue
tasks with varying sizes so that the regression can be computed. StarPU will not
trust the regression unless there is at least a 10% difference between the minimum
and maximum observed input sizes. Since computing the non-linear regression
is quite expensive, it is only done at the termination of the application. This
means that the first execution uses a history-based performance model to perform
scheduling.

@item
Provided as an estimation from the application itself (@code{STARPU_COMMON} model type and @code{cost_model} field),
see for instance
@code{examples/common/blas_model.h} and @code{examples/common/blas_model.c}.

@item
Provided explicitly by the application (@code{STARPU_PER_ARCH} model type): the
@code{.per_arch[i].cost_model} fields have to be filled with pointers to
functions which return the expected duration of the task in micro-seconds, one
per architecture, as sketched after this list.
@end itemize
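
A minimal sketch of this last option follows. It is only illustrative:
@code{cpu_mult_cost} is a hypothetical function, the linear formula is made
up, and the exact name and prototype of the per-architecture cost field may
differ between StarPU versions.

@cartouche
@smallexample
/* Hypothetical cost function: assume the duration (in micro-seconds)
 * grows linearly with the size of the first data */
static double cpu_mult_cost(struct starpu_buffer_descr *descr)
@{
    size_t size = starpu_data_get_size(descr[0].handle);
    return 0.001 * size;
@}

static struct starpu_perfmodel mult_cost_model = @{
    .type = STARPU_PER_ARCH,
@};

/* To be done at runtime, once per architecture, before submitting tasks */
mult_cost_model.per_arch[STARPU_CPU_DEFAULT].cost_model = cpu_mult_cost;
@end smallexample
@end cartouche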

How to use schedulers which can benefit from such performance models is explained
in @ref{Task scheduling policy}.

The same can be done for task power consumption estimation, by setting the
@code{power_model} field the same way as the @code{model} field. Note: for
now, the application has to give the power consumption performance model
a name which is different from that of the execution time performance model.
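
For instance, reusing the history-based codelet above (a sketch; the power
model's symbol just has to differ from the time model's):

@cartouche
@smallexample
static struct starpu_perfmodel mult_power_model = @{
    .type = STARPU_HISTORY_BASED,
    /* must differ from the execution time model's symbol */
    .symbol = "mult_power_model"
@};

struct starpu_codelet cl = @{
    .where = STARPU_CPU,
    .cpu_funcs = @{ cpu_mult, NULL @},
    .nbuffers = 3,
    .model = &mult_perf_model,
    .power_model = &mult_power_model
@};
@end smallexample
@end cartouche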

The application can request time estimations from the StarPU performance
models by filling a task structure as usual without actually submitting
it. The data handles can be created by calling any of the
@code{starpu_*_data_register} functions with a @code{NULL} pointer and the
desired data sizes (the handles need to be unregistered as usual). The
@code{starpu_task_expected_length} and
@code{starpu_task_expected_power} functions can then be called to get an
estimation of the task duration on a given arch. @code{starpu_task_destroy}
needs to be called to destroy the dummy task afterwards. See
@code{tests/perfmodels/regression_based.c} for an example.
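
A minimal sketch of this procedure (assuming a single-buffer codelet
@code{cl} with a performance model, and a hypothetical size @code{NX}; the
exact prototype of @code{starpu_task_expected_length} may differ between
StarPU versions):

@cartouche
@smallexample
starpu_data_handle_t handle;
/* Register a dummy vector: NULL pointer, but a real size */
starpu_vector_data_register(&handle, -1, (uintptr_t)NULL, NX, sizeof(float));

struct starpu_task *task = starpu_task_create();
task->cl = &cl;
task->buffers[0].handle = handle;
task->buffers[0].mode = STARPU_RW;

/* Estimated duration (in micro-seconds) of implementation 0 on a CPU */
double duration = starpu_task_expected_length(task, STARPU_CPU_DEFAULT, 0);

starpu_task_destroy(task);
starpu_data_unregister(handle);
@end smallexample
@end cartouche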

@node Theoretical lower bound on execution time
@section Theoretical lower bound on execution time

For kernels with history-based performance models, StarPU can very easily provide a theoretical lower
bound for the execution time of a whole set of tasks. See for
instance @code{examples/lu/lu_example.c}: before submitting tasks,
call @code{starpu_bound_start}, and after complete execution, call
@code{starpu_bound_stop}. @code{starpu_bound_print_lp} or
@code{starpu_bound_print_mps} can then be used to output a Linear Programming
problem corresponding to the schedule of your tasks. Run it through
@code{lp_solve} or any other linear programming solver, and that will give you a
lower bound for the total execution time of your tasks. If StarPU was compiled
with the glpk library installed, @code{starpu_bound_compute} can be used to
solve it immediately and get the optimized minimum, in ms. Its @code{integer}
parameter allows deciding whether integer resolution should be computed
and returned too.
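
A minimal sketch of this workflow (the @code{deps} and @code{prio} flags are
described below):

@cartouche
@smallexample
starpu_bound_start(deps, prio);
/* ... submit the tasks ... */
starpu_task_wait_for_all();
starpu_bound_stop();

/* Solve immediately with glpk, or dump the LP problem to a file */
double min_time, integer_min_time;
starpu_bound_compute(&min_time, &integer_min_time, 1);

FILE *out = fopen("test.lp", "w");
starpu_bound_print_lp(out);
fclose(out);
@end smallexample
@end cartouche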

The @code{deps} parameter tells StarPU whether to take tasks and implicit data
dependencies into account. Note that the linear programming problem size is
quadratic in the number of tasks, so the time to solve it can be very long: it
can take minutes for just a few dozen tasks. You should
probably use @code{lp_solve -timeout 1 test.lp -wmps test.mps} to convert the
problem to MPS format and then use a better solver; @code{glpsol} might be
better than @code{lp_solve} for instance (the @code{--pcost} option may be
useful), but sometimes doesn't manage to converge. @code{cbc} might look
slower, but it is parallel. Be sure to try at least all the @code{-B} options
of @code{lp_solve}. For instance, we often just use
@code{lp_solve -cc -B1 -Bb -Bg -Bp -Bf -Br -BG -Bd -Bs -BB -Bo -Bc -Bi}, and
the @code{-gr} option can also be quite useful.

Setting @code{deps} to 0 will only take into account the actual computations
on processing units. It however still properly takes into account the varying
performances of kernels and processing units, which is much more accurate than
just comparing StarPU performances with the fastest of the kernels being used.

The @code{prio} parameter tells StarPU whether to simulate taking into account
the priorities as the StarPU scheduler would, i.e. schedule prioritized
tasks before less prioritized tasks, to check to what extent this results
in a less optimal solution. This increases the computation time even more.

Note that for simplicity, all this does not take data transfers into account,
which are assumed to be completely overlapped.

@node Insert Task Utility
@section Insert Task Utility

StarPU provides the wrapper function @code{starpu_insert_task} to ease
the creation and submission of tasks.

@deftypefun int starpu_insert_task (struct starpu_codelet *@var{cl}, ...)
Create and submit a task corresponding to @var{cl} with the following
arguments. The argument list must be zero-terminated.

The arguments following the codelet can be of the following types:
@itemize
@item
@code{STARPU_R}, @code{STARPU_W}, @code{STARPU_RW}, @code{STARPU_SCRATCH}, @code{STARPU_REDUX}: an access mode followed by a data handle;
@item
@code{STARPU_VALUE} followed by a pointer to a constant value and
the size of the constant;
@item
@code{STARPU_CALLBACK} followed by a pointer to a callback function;
@item
@code{STARPU_CALLBACK_ARG} followed by a pointer to be given as an
argument to the callback function;
@item
@code{STARPU_CALLBACK_WITH_ARG} followed by two pointers: one to a callback
function, and the other to be given as an argument to the callback
function; this is equivalent to using both @code{STARPU_CALLBACK} and
@code{STARPU_CALLBACK_ARG};
@item
@code{STARPU_PRIORITY} followed by an integer defining a priority level.
@end itemize

Parameters to be passed to the codelet implementation are defined
through the type @code{STARPU_VALUE}. The function
@code{starpu_unpack_cl_args} must be called within the codelet
implementation to retrieve them.
@end deftypefun

Here is the implementation of the codelet:

@smallexample
void func_cpu(void *descr[], void *_args)
@{
    int *x0 = (int *)STARPU_VARIABLE_GET_PTR(descr[0]);
    float *x1 = (float *)STARPU_VARIABLE_GET_PTR(descr[1]);
    int ifactor;
    float ffactor;

    starpu_unpack_cl_args(_args, &ifactor, &ffactor);
    *x0 = *x0 * ifactor;
    *x1 = *x1 * ffactor;
@}

struct starpu_codelet mycodelet = @{
    .where = STARPU_CPU,
    .cpu_funcs = @{ func_cpu, NULL @},
    .nbuffers = 2
@};
@end smallexample

And the call to the @code{starpu_insert_task} wrapper:

@smallexample
starpu_insert_task(&mycodelet,
                   STARPU_VALUE, &ifactor, sizeof(ifactor),
                   STARPU_VALUE, &ffactor, sizeof(ffactor),
                   STARPU_RW, data_handles[0], STARPU_RW, data_handles[1],
                   0);
@end smallexample

The call to @code{starpu_insert_task} is equivalent to the following
code:

@smallexample
struct starpu_task *task = starpu_task_create();
task->cl = &mycodelet;
task->buffers[0].handle = data_handles[0];
task->buffers[0].mode = STARPU_RW;
task->buffers[1].handle = data_handles[1];
task->buffers[1].mode = STARPU_RW;

char *arg_buffer;
size_t arg_buffer_size;
starpu_pack_cl_args(&arg_buffer, &arg_buffer_size,
                    STARPU_VALUE, &ifactor, sizeof(ifactor),
                    STARPU_VALUE, &ffactor, sizeof(ffactor),
                    0);

task->cl_arg = arg_buffer;
task->cl_arg_size = arg_buffer_size;

int ret = starpu_task_submit(task);
@end smallexample

If some part of the task insertion depends on the value of some computation,
the @code{STARPU_DATA_ACQUIRE_CB} macro can be very convenient. For
instance, assuming that the index variable @code{i} was registered as handle
@code{i_handle}:

@smallexample
/* Compute which portion we will work on, e.g. pivot */
starpu_insert_task(&which_index, STARPU_W, i_handle, 0);

/* And submit the corresponding task */
STARPU_DATA_ACQUIRE_CB(i_handle, STARPU_R, starpu_insert_task(&work, STARPU_RW, A_handle[i], 0));
@end smallexample

The @code{STARPU_DATA_ACQUIRE_CB} macro submits an asynchronous request for
acquiring data @code{i} for the main application, and will execute the code
given as third parameter when it is acquired. In other words, as soon as the
value of @code{i} computed by the @code{which_index} codelet can be read, the
portion of code passed as third parameter of @code{STARPU_DATA_ACQUIRE_CB} will
be executed, and is allowed to read from @code{i} to use it e.g. as an
index. Note that this macro is only available when compiling StarPU with
the compiler @code{gcc}.

@node Debugging
@section Debugging

StarPU provides several tools to help debugging applications. Execution traces
can be generated and displayed graphically, see @ref{Generating traces}. Some
gdb helpers are also provided to show the whole StarPU state:

@smallexample
(gdb) source tools/gdbinit
(gdb) help starpu
@end smallexample

@node The multiformat interface
@section The multiformat interface

It may be interesting to represent the same piece of data using two different
data structures: one that would only be used on CPUs, and one that would only
be used on GPUs. This can be done by using the multiformat interface. StarPU
will be able to convert data from one data structure to the other when needed.
Note that the heft scheduler is the only one optimized for this interface. The
user must provide StarPU with conversion codelets:

@example
#define NX 1024
struct point array_of_structs[NX];
starpu_data_handle_t handle;

/*
 * The conversion of a piece of data is itself a task, though it is created,
 * submitted and destroyed by StarPU internals and not by the user. Therefore,
 * we have to define two codelets.
 * Note that for now the conversion from the CPU format to the GPU format has to
 * be executed on the GPU, and the conversion from the GPU to the CPU has to be
 * executed on the CPU.
 */
#ifdef STARPU_USE_OPENCL
void cpu_to_opencl_opencl_func(void *buffers[], void *args);
struct starpu_codelet cpu_to_opencl_cl = @{
    .where = STARPU_OPENCL,
    .opencl_funcs = @{ cpu_to_opencl_opencl_func, NULL @},
    .nbuffers = 1
@};

void opencl_to_cpu_func(void *buffers[], void *args);
struct starpu_codelet opencl_to_cpu_cl = @{
    .where = STARPU_CPU,
    .cpu_funcs = @{ opencl_to_cpu_func, NULL @},
    .nbuffers = 1
@};
#endif

struct starpu_multiformat_data_interface_ops format_ops = @{
#ifdef STARPU_USE_OPENCL
    .opencl_elemsize = 2 * sizeof(float),
    .cpu_to_opencl_cl = &cpu_to_opencl_cl,
    .opencl_to_cpu_cl = &opencl_to_cpu_cl,
#endif
    .cpu_elemsize = 2 * sizeof(float),
    ...
@};

starpu_multiformat_data_register(&handle, 0, &array_of_structs, NX, &format_ops);
@end example

Kernels can be written almost as for any other interface. Note that
@code{STARPU_MULTIFORMAT_GET_PTR} shall only be used for CPU kernels. CUDA kernels
must use @code{STARPU_MULTIFORMAT_GET_CUDA_PTR}, and OpenCL kernels must use
@code{STARPU_MULTIFORMAT_GET_OPENCL_PTR}. @code{STARPU_MULTIFORMAT_GET_NX} may
be used in any kind of kernel.

@example
static void
multiformat_scal_cpu_func(void *buffers[], void *args)
@{
    struct point *aos;
    unsigned int n;

    aos = STARPU_MULTIFORMAT_GET_PTR(buffers[0]);
    n = STARPU_MULTIFORMAT_GET_NX(buffers[0]);
    ...
@}

extern "C" void multiformat_scal_cuda_func(void *buffers[], void *_args)
@{
    unsigned int n;
    struct struct_of_arrays *soa;

    soa = (struct struct_of_arrays *) STARPU_MULTIFORMAT_GET_CUDA_PTR(buffers[0]);
    n = STARPU_MULTIFORMAT_GET_NX(buffers[0]);
    ...
@}
@end example

A full example may be found in @code{examples/basic_examples/multiformat.c}.

@node More examples
@section More examples

More examples are available in the StarPU sources in the @code{examples/}
directory. Simple examples include:

@table @asis
@item @code{incrementer/}:
Trivial incrementation test.
@item @code{basic_examples/}:
Simple documented Hello world (as shown in @ref{Hello World}), vector/scalar product (as shown
in @ref{Vector Scaling on an Hybrid CPU/GPU Machine}), matrix
product examples (as shown in @ref{Performance model example}), an example using the blocked matrix data
interface, an example using the variable data interface, and an example
using different formats on CPUs and GPUs.
@item @code{matvecmult/}:
OpenCL example from NVidia, adapted to StarPU.
@item @code{axpy/}:
AXPY CUBLAS operation adapted to StarPU.
@item @code{fortran/}:
Example of Fortran bindings.
@end table

More advanced examples include:

@table @asis
@item @code{filters/}:
Examples using filters, as shown in @ref{Partitioning Data}.
@item @code{lu/}:
LU matrix factorization, see for instance @code{xlu_implicit.c}.
@item @code{cholesky/}:
Cholesky matrix factorization, see for instance @code{cholesky_implicit.c}.
@end table