/*
 * This file is part of the StarPU Handbook.
 * Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
 * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
 * Copyright (C) 2011, 2012 Institut National de Recherche en Informatique et Automatique
 * See the file version.doxy for copying conditions.
 */

/*! \page AdvancedExamples Advanced Examples

\section UsingMultipleImplementationsOfACodelet Using Multiple Implementations Of A Codelet
One may want to write multiple implementations of a codelet for a single type of
device and let StarPU choose which one to run. As an example, we will show how
to use SSE to scale a vector. The codelet can be written as follows:

\code{.c}
#include <xmmintrin.h>

void scal_sse_func(void *buffers[], void *cl_arg)
{
    float *vector = (float *) STARPU_VECTOR_GET_PTR(buffers[0]);
    unsigned int n = STARPU_VECTOR_GET_NX(buffers[0]);
    unsigned int n_iterations = n/4;
    if (n % 4 != 0)
        n_iterations++;

    __m128 *VECTOR = (__m128*) vector;
    __m128 factor __attribute__((aligned(16)));
    factor = _mm_set1_ps(*(float *) cl_arg);

    unsigned int i;
    for (i = 0; i < n_iterations; i++)
        VECTOR[i] = _mm_mul_ps(factor, VECTOR[i]);
}
\endcode

The implementations are then all provided to StarPU in the fields
starpu_codelet::cpu_funcs and starpu_codelet::cpu_funcs_name:

\code{.c}
struct starpu_codelet cl = {
    .where = STARPU_CPU,
    .cpu_funcs = { scal_cpu_func, scal_sse_func, NULL },
    .cpu_funcs_name = { "scal_cpu_func", "scal_sse_func", NULL },
    .nbuffers = 1,
    .modes = { STARPU_RW }
};
\endcode

Schedulers which are multi-implementation aware (only <c>dmda</c> and
<c>pheft</c> for now) will use the performance models of all the
implementations they were given, and pick the one that seems to be the fastest.
\section EnablingImplementationAccordingToCapabilities Enabling Implementation According To Capabilities

Some implementations may not run on some devices. For instance, some CUDA
devices do not support double floating point precision, and thus the kernel
execution would just fail; or the device may not have enough shared memory for
the implementation being used. The field starpu_codelet::can_execute
permits expressing this. For instance:

\code{.c}
static int can_execute(unsigned workerid, struct starpu_task *task, unsigned nimpl)
{
    const struct cudaDeviceProp *props;
    if (starpu_worker_get_type(workerid) == STARPU_CPU_WORKER)
        return 1;
    /* Cuda device */
    props = starpu_cuda_get_device_properties(workerid);
    if (props->major >= 2 || props->minor >= 3)
        /* At least compute capability 1.3, supports doubles */
        return 1;
    /* Old card, does not support doubles */
    return 0;
}

struct starpu_codelet cl = {
    .where = STARPU_CPU|STARPU_CUDA,
    .can_execute = can_execute,
    .cpu_funcs = { cpu_func, NULL },
    .cpu_funcs_name = { "cpu_func", NULL },
    .cuda_funcs = { gpu_func, NULL },
    .nbuffers = 1,
    .modes = { STARPU_RW }
};
\endcode

This can be essential e.g. when running on a machine which mixes various models
of CUDA devices, to take benefit from the new models without crashing on old models.

Note: the function starpu_codelet::can_execute is called by the
scheduler each time it tries to match a task with a worker, and should
thus be very fast. The function starpu_cuda_get_device_properties()
provides quick access to CUDA properties of CUDA devices to achieve
such efficiency.

Another example is to compile CUDA code for various compute capabilities,
resulting in two CUDA functions, e.g. <c>scal_gpu_13</c> for compute capability
1.3, and <c>scal_gpu_20</c> for compute capability 2.0. Both functions can be
provided to StarPU by using starpu_codelet::cuda_funcs, and
starpu_codelet::can_execute can then be used to rule out the
<c>scal_gpu_20</c> variant on a CUDA device which will not be able to execute it:

\code{.c}
static int can_execute(unsigned workerid, struct starpu_task *task, unsigned nimpl)
{
    const struct cudaDeviceProp *props;
    if (starpu_worker_get_type(workerid) == STARPU_CPU_WORKER)
        return 1;
    /* Cuda device */
    if (nimpl == 0)
        /* Trying to execute the 1.3 capability variant, we assume it is ok in all cases. */
        return 1;
    /* Trying to execute the 2.0 capability variant, check that the card can do it. */
    props = starpu_cuda_get_device_properties(workerid);
    if (props->major >= 2)
        /* At least compute capability 2.0, can run it */
        return 1;
    /* Old card, does not support 2.0, will not be able to execute the 2.0 variant. */
    return 0;
}

struct starpu_codelet cl = {
    .where = STARPU_CPU|STARPU_CUDA,
    .can_execute = can_execute,
    .cpu_funcs = { cpu_func, NULL },
    .cpu_funcs_name = { "cpu_func", NULL },
    .cuda_funcs = { scal_gpu_13, scal_gpu_20, NULL },
    .nbuffers = 1,
    .modes = { STARPU_RW }
};
\endcode

Note: the most generic variant should be provided first, as some schedulers are
not able to try the different variants.
\section TaskAndWorkerProfiling Task And Worker Profiling

A full example showing how to use the profiling API is available in
the StarPU sources in the directory <c>examples/profiling/</c>.
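Note that profiling information is only collected while profiling is enabled, so it has to be turned on before the tasks of interest are submitted; a minimal sketch using the actual StarPU call:

\code{.c}
/* Enable profiling for all subsequently submitted tasks. */
starpu_profiling_status_set(STARPU_PROFILING_ENABLE);
\endcode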
\code{.c}
struct starpu_task *task = starpu_task_create();
task->cl = &cl;
task->synchronous = 1;
/* We will destroy the task structure by hand so that we can
 * query the profiling info before the task is destroyed. */
task->destroy = 0;
/* Submit and wait for completion (since synchronous was set to 1) */
starpu_task_submit(task);
/* The task is finished, get profiling information */
struct starpu_profiling_task_info *info = task->profiling_info;
/* How much time did it take before the task started? */
double delay = starpu_timing_timespec_delay_us(&info->submit_time, &info->start_time);
/* How long was the task execution? */
double length = starpu_timing_timespec_delay_us(&info->start_time, &info->end_time);
/* We don't need the task structure anymore */
starpu_task_destroy(task);
\endcode
\code{.c}
/* Display the occupancy of all workers during the test */
int worker;
for (worker = 0; worker < starpu_worker_get_count(); worker++)
{
    struct starpu_profiling_worker_info worker_info;
    int ret = starpu_profiling_worker_get_info(worker, &worker_info);
    STARPU_ASSERT(!ret);

    double total_time = starpu_timing_timespec_to_us(&worker_info.total_time);
    double executing_time = starpu_timing_timespec_to_us(&worker_info.executing_time);
    double sleeping_time = starpu_timing_timespec_to_us(&worker_info.sleeping_time);
    double overhead_time = total_time - executing_time - sleeping_time;

    float executing_ratio = 100.0*executing_time/total_time;
    float sleeping_ratio = 100.0*sleeping_time/total_time;
    float overhead_ratio = 100.0 - executing_ratio - sleeping_ratio;

    char workername[128];
    starpu_worker_get_name(worker, workername, 128);
    fprintf(stderr, "Worker %s:\n", workername);
    fprintf(stderr, "\ttotal time: %.2lf ms\n", total_time*1e-3);
    fprintf(stderr, "\texec time: %.2lf ms (%.2f %%)\n",
            executing_time*1e-3, executing_ratio);
    fprintf(stderr, "\tblocked time: %.2lf ms (%.2f %%)\n",
            sleeping_time*1e-3, sleeping_ratio);
    fprintf(stderr, "\toverhead time: %.2lf ms (%.2f %%)\n",
            overhead_time*1e-3, overhead_ratio);
}
\endcode
\section PartitioningData Partitioning Data

An existing piece of data can be partitioned in sub parts to be used by different tasks, for instance:

\code{.c}
int vector[NX];
starpu_data_handle_t handle;

/* Declare data to StarPU */
starpu_vector_data_register(&handle, STARPU_MAIN_RAM, (uintptr_t)vector,
                            NX, sizeof(vector[0]));

/* Partition the vector in PARTS sub-vectors */
struct starpu_data_filter f =
{
    .filter_func = starpu_vector_filter_block,
    .nchildren = PARTS
};
starpu_data_partition(handle, &f);
\endcode

The task submission then uses the function starpu_data_get_sub_data()
to retrieve the sub-handles to be passed as tasks parameters.

\code{.c}
/* Submit a task on each sub-vector */
for (i=0; i<starpu_data_get_nb_children(handle); i++)
{
    /* Get subdata number i (there is only 1 dimension) */
    starpu_data_handle_t sub_handle = starpu_data_get_sub_data(handle, 1, i);
    struct starpu_task *task = starpu_task_create();

    task->handles[0] = sub_handle;
    task->cl = &cl;
    task->synchronous = 1;
    task->cl_arg = &factor;
    task->cl_arg_size = sizeof(factor);

    starpu_task_submit(task);
}
\endcode

Partitioning can be applied several times, see
<c>examples/basic_examples/mult.c</c> and <c>examples/filters/</c>.
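When the whole vector is needed again as a single piece, it can be unpartitioned, which gathers the pieces back; a minimal sketch using the actual StarPU call:

\code{.c}
/* Gather the sub-vectors back into the initial buffer in main memory;
 * the sub-handles must not be used afterwards. */
starpu_data_unpartition(handle, STARPU_MAIN_RAM);
\endcode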
Whenever the whole piece of data is already available, the partitioning will
be done in-place, i.e. without allocating new buffers but just using pointers
inside the existing copy. This is particularly important to be aware of when
using OpenCL, where the kernel parameters are not pointers, but handles. The
kernel thus needs to be also passed the offset within the OpenCL buffer:

\code{.c}
void opencl_func(void *buffers[], void *cl_arg)
{
    cl_mem vector = (cl_mem) STARPU_VECTOR_GET_DEV_HANDLE(buffers[0]);
    unsigned offset = STARPU_VECTOR_GET_OFFSET(buffers[0]);

    ...
    clSetKernelArg(kernel, 0, sizeof(vector), &vector);
    clSetKernelArg(kernel, 1, sizeof(offset), &offset);
    ...
}
\endcode

And the kernel has to shift from the pointer passed by the OpenCL driver:

\code{.c}
__kernel void opencl_kernel(__global int *vector, unsigned offset)
{
    vector = (__global int *)((__global char *)vector + offset);
    ...
}
\endcode

StarPU provides various interfaces and filters for matrices, vectors, etc.,
but applications can also write their own data interfaces and filters, see
<c>examples/interface</c> and <c>examples/filters/custom_mf</c> for an example.
\section PerformanceModelExample Performance Model Example

To achieve good scheduling, StarPU scheduling policies need to be able to
estimate in advance the duration of a task. This is done by giving to codelets
a performance model, by defining a structure starpu_perfmodel and
providing its address in the field starpu_codelet::model. The fields
starpu_perfmodel::symbol and starpu_perfmodel::type are mandatory, to
give a name to the model, and the type of the model, since there are
several kinds of performance models. For compatibility, make sure to
initialize the whole structure to zero, either by using explicit
memset(), or by letting the compiler implicitly do it as exemplified
below.
<ul>
<li>
Measured at runtime (model type ::STARPU_HISTORY_BASED). This assumes that for a
given set of data input/output sizes, the performance will always be about the
same. This is very true for regular kernels on GPUs for instance (<0.1% error),
and just a bit less true on CPUs (~=1% error). This also assumes that there are
few different sets of data input/output sizes. StarPU will then keep record of
the average time of previous executions on the various processing units, and use
it as an estimation. History is done per task size, by using a hash of the input
and output sizes as an index.
It will also save it in <c>$STARPU_HOME/.starpu/sampling/codelets</c>
for further executions, and can be observed by using the tool
<c>starpu_perfmodel_display</c>, or drawn by using
the tool <c>starpu_perfmodel_plot</c> (\ref PerformanceModelCalibration). The
models are indexed by machine name. To
share the models between machines (e.g. for a homogeneous cluster), use
<c>export STARPU_HOSTNAME=some_global_name</c>. Measurements are only done
when using a task scheduler which makes use of it, such as
<c>dmda</c>. Measurements can also be provided explicitly by the application, by
using the function starpu_perfmodel_update_history().

The following is a small code example.

If e.g. the code is recompiled with other compilation options, or several
variants of the code are used, the symbol string should be changed to reflect
that, in order to recalibrate a new model from zero. The symbol string can even
be constructed dynamically at execution time, as long as this is done before
submitting any task using it.

\code{.c}
static struct starpu_perfmodel mult_perf_model = {
    .type = STARPU_HISTORY_BASED,
    .symbol = "mult_perf_model"
};

struct starpu_codelet cl = {
    .where = STARPU_CPU,
    .cpu_funcs = { cpu_mult, NULL },
    .cpu_funcs_name = { "cpu_mult", NULL },
    .nbuffers = 3,
    .modes = { STARPU_R, STARPU_R, STARPU_W },
    /* for the scheduling policy to be able to use performance models */
    .model = &mult_perf_model
};
\endcode
</li>
<li>
Measured at runtime and refined by regression (model types
::STARPU_REGRESSION_BASED and ::STARPU_NL_REGRESSION_BASED). This
still assumes performance regularity, but works
with various data input sizes, by applying regression over observed
execution times. ::STARPU_REGRESSION_BASED uses an <c>a*n^b</c> regression
form, ::STARPU_NL_REGRESSION_BASED uses an <c>a*n^b+c</c> form (more precise than
::STARPU_REGRESSION_BASED, but costs a lot more to compute).

For instance,
<c>tests/perfmodels/regression_based.c</c> uses a regression-based performance
model for the function memset().

Of course, the application has to issue
tasks with varying size so that the regression can be computed. StarPU will not
trust the regression unless there is at least 10% difference between the minimum
and maximum observed input size. It can be useful to set the
environment variable \ref STARPU_CALIBRATE to <c>1</c> and run the application
on varying input sizes with \ref STARPU_SCHED set to the <c>eager</c> scheduler,
so as to feed the performance model for a variety of
inputs. The application can also provide the measurements explicitly by
using the function starpu_perfmodel_update_history(). The tools
<c>starpu_perfmodel_display</c> and <c>starpu_perfmodel_plot</c> can
be used to observe how much the performance model is calibrated (\ref
PerformanceModelCalibration); when their output looks good,
\ref STARPU_CALIBRATE can be reset to <c>0</c> to let
StarPU use the resulting performance model without recording new measures, and
\ref STARPU_SCHED can be set to <c>dmda</c> to benefit from the performance models. If
the data input sizes vary a lot, it is really important to set
\ref STARPU_CALIBRATE to <c>0</c>, otherwise StarPU will keep adding the
measures, and result in a very big performance model, which will take a
lot of time to load and save.

For non-linear regression, since computing it
is quite expensive, it is only done at termination of the application. This
means that the first execution of the application will use only history-based
performance model to perform scheduling, without using regression.
</li>
<li>
Provided as an estimation from the application itself (model type
::STARPU_COMMON and field starpu_perfmodel::cost_function),
see for instance
<c>examples/common/blas_model.h</c> and <c>examples/common/blas_model.c</c>,
as well as the sketch after this list.
</li>
<li>
Provided explicitly by the application (model type ::STARPU_PER_ARCH):
the fields <c>.per_arch[arch][nimpl].cost_function</c> have to be
filled with pointers to functions which return the expected duration
of the task in micro-seconds, one per architecture.
</li>
</ul>
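As an illustration of the ::STARPU_COMMON case, here is a minimal sketch; the cubic formula and its constant factor are made-up values for a hypothetical matrix product, not an actual measurement:

\code{.c}
/* Application-provided estimation, returning a duration in micro-seconds.
 * The n^3 cost and the 1e-6 factor are illustrative assumptions. */
static double mult_cost_function(struct starpu_task *task, unsigned nimpl)
{
    unsigned n = starpu_matrix_get_nx(task->handles[0]);
    return (double) n * n * n * 1e-6;
}

static struct starpu_perfmodel mult_common_model = {
    .type = STARPU_COMMON,
    .cost_function = mult_cost_function
};
\endcode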
For ::STARPU_HISTORY_BASED, ::STARPU_REGRESSION_BASED, and
::STARPU_NL_REGRESSION_BASED, the total size of task data (both input
and output) is used as an index by default. The field
starpu_perfmodel::size_base however permits the application to
override that, when for instance some of the data do not matter for
task cost (e.g. a mere reference table), or when using sparse
structures (in which case it is the number of non-zeros which matters), or when
there is some hidden parameter such as the number of iterations, or when the application
actually has a very good idea of the complexity of the algorithm, but just not
the speed of the processor, etc.
The example in the directory <c>examples/pi</c> uses this to include
the number of iterations in the base.
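As an illustration, a hypothetical sparse kernel could index its model by the number of non-zeros instead of the registered data size; this sketch assumes the task's <c>cl_arg</c> starts with that count, which is purely a convention of this example:

\code{.c}
/* Assumption of this sketch: cl_arg begins with the non-zero count. */
static size_t spmv_size_base(struct starpu_task *task, unsigned nimpl)
{
    unsigned nnz = *(unsigned *) task->cl_arg;
    return nnz;
}

static struct starpu_perfmodel spmv_model = {
    .type = STARPU_HISTORY_BASED,
    .symbol = "spmv",
    .size_base = spmv_size_base
};
\endcode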
StarPU will automatically determine when the performance model is calibrated,
or rather, it will assume the performance model is calibrated until the
application submits a task for which the performance cannot be predicted. For
::STARPU_HISTORY_BASED, StarPU will require 10 (::_STARPU_CALIBRATION_MINIMUM)
measurements for a given size before estimating that an average can be taken as
estimation for further executions with the same size. For
::STARPU_REGRESSION_BASED and ::STARPU_NL_REGRESSION_BASED, StarPU will require
10 (::_STARPU_CALIBRATION_MINIMUM) measurements, and that the minimum measured
data size is smaller than 90% of the maximum measured data size (i.e. the
measurement interval is large enough for a regression to have a meaning).
Calibration can also be forced by setting the \ref STARPU_CALIBRATE environment
variable to <c>1</c>, or even reset by setting it to <c>2</c>.

How to use schedulers which can benefit from such performance model is explained
in \ref TaskSchedulingPolicy.
The same can be done for task power consumption estimation, by setting
the field starpu_codelet::power_model the same way as the field
starpu_codelet::model. Note: for now, the application has to give the
power consumption performance model a name which is different from
the execution time performance model.
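For instance, a minimal sketch with two separate history-based models; the symbol names are arbitrary, they just have to differ:

\code{.c}
static struct starpu_perfmodel mult_time_model = {
    .type = STARPU_HISTORY_BASED,
    .symbol = "mult_time"
};

static struct starpu_perfmodel mult_power_model = {
    .type = STARPU_HISTORY_BASED,
    .symbol = "mult_power" /* must differ from the time model's symbol */
};

struct starpu_codelet mult_cl = {
    .where = STARPU_CPU,
    .cpu_funcs = { cpu_mult, NULL },
    .cpu_funcs_name = { "cpu_mult", NULL },
    .nbuffers = 3,
    .modes = { STARPU_R, STARPU_R, STARPU_W },
    .model = &mult_time_model,
    .power_model = &mult_power_model
};
\endcode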
The application can request time estimations from the StarPU performance
models by filling a task structure as usual without actually submitting
it. The data handles can be created by calling any of the functions
<c>starpu_*_data_register</c> with a <c>NULL</c> pointer and <c>-1</c>
node and the desired data sizes, and need to be unregistered as usual.
The functions starpu_task_expected_length() and
starpu_task_expected_power() can then be called to get an estimation
of the task cost on a given arch. starpu_task_footprint() can also be
used to get the footprint used for indexing history-based performance
models. starpu_task_destroy() needs to be called to destroy the dummy
task afterwards. See <c>tests/perfmodels/regression_based.c</c> for an example.
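A sketch of such a query; it assumes the StarPU 1.1 API, where starpu_task_expected_length() takes a starpu_perfmodel_arch obtained from a worker, and that worker 0 exists:

\code{.c}
/* Register a dummy vector of n floats: NULL pointer, -1 node. */
unsigned n = 1024; /* arbitrary size for this sketch */
starpu_data_handle_t handle;
starpu_vector_data_register(&handle, -1, (uintptr_t) NULL, n, sizeof(float));

/* Fill a task structure as usual, but do not submit it. */
struct starpu_task *task = starpu_task_create();
task->cl = &cl;
task->handles[0] = handle;
task->destroy = 0;

/* Ask the model for an estimation on the architecture of worker 0. */
struct starpu_perfmodel_arch *arch = starpu_worker_get_perf_archtype(0);
double expected_us = starpu_task_expected_length(task, arch, 0);

starpu_task_destroy(task);
starpu_data_unregister(handle);
\endcode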
\section TheoreticalLowerBoundOnExecutionTimeExample Theoretical Lower Bound On Execution Time Example

For kernels with history-based performance models (and provided that
they are completely calibrated), StarPU can very easily provide a
theoretical lower bound for the execution time of a whole set of
tasks. See for instance <c>examples/lu/lu_example.c</c>: before
submitting tasks, call the function starpu_bound_start(), and after
complete execution, call starpu_bound_stop().
starpu_bound_print_lp() or starpu_bound_print_mps() can then be used
to output a Linear Programming problem corresponding to the schedule
of your tasks. Run it through <c>lp_solve</c> or any other linear
programming solver, and that will give you a lower bound for the total
execution time of your tasks. If StarPU was compiled with the library
<c>glpk</c> installed, starpu_bound_compute() can be used to solve it
immediately and get the optimized minimum, in ms. Its parameter
<c>integer</c> allows deciding whether integer resolution should be
computed and returned.
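A minimal usage sketch of these calls, with error checking omitted:

\code{.c}
double min_ms, min_integer_ms;

starpu_bound_start(0 /* deps */, 0 /* prio */);
/* ... submit the whole set of tasks ... */
starpu_task_wait_for_all();
starpu_bound_stop();

/* Solve the bound immediately (needs glpk support compiled in). */
starpu_bound_compute(&min_ms, &min_integer_ms, 0);
fprintf(stderr, "theoretical lower bound: %.2lf ms\n", min_ms);
\endcode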
The <c>deps</c> parameter tells StarPU whether to take tasks, implicit
data, and tag dependencies into account. Tags released in a callback
or similar are not taken into account, only tags associated with a task are.

It must be understood that the linear programming
problem size is quadratic with the number of tasks and thus the time to solve it
will be very long; it could be minutes for just a few dozen tasks. You should
probably use <c>lp_solve -timeout 1 test.pl -wmps test.mps</c> to convert the
problem to MPS format and then use a better solver, <c>glpsol</c> might be
better than <c>lp_solve</c> for instance (the <c>--pcost</c> option may be
useful), but sometimes doesn't manage to converge. <c>cbc</c> might look
slower, but it is parallel. For <c>lp_solve</c>, be sure to try at least all the
<c>-B</c> options. For instance, we often just use <c>lp_solve -cc -B1 -Bb
-Bg -Bp -Bf -Br -BG -Bd -Bs -BB -Bo -Bc -Bi</c>, and the <c>-gr</c> option can
also be quite useful. The resulting schedule can be observed by using
the tool <c>starpu_lp2paje</c>, which converts it into the Paje
format.

Data transfer time can only be taken into account when <c>deps</c> is set. Only
data transfers inferred from implicit data dependencies between tasks are taken
into account. Other data transfers are assumed to be completely overlapped.

Setting <c>deps</c> to 0 will only take into account the actual computations
on processing units. It however still properly takes into account the varying
performances of kernels and processing units, which is quite more accurate than
just comparing StarPU performances with the fastest of the kernels being used.

The <c>prio</c> parameter tells StarPU whether to simulate taking into account
the priorities as the StarPU scheduler would, i.e. schedule prioritized
tasks before less prioritized tasks, to check to what extent this results
in a less optimal solution. This increases even more computation time.
\section InsertTaskUtility Insert Task Utility

StarPU provides the wrapper function starpu_insert_task() to ease
the creation and submission of tasks.

Here is the implementation of the codelet:

\code{.c}
void func_cpu(void *descr[], void *_args)
{
    int *x0 = (int *)STARPU_VARIABLE_GET_PTR(descr[0]);
    float *x1 = (float *)STARPU_VARIABLE_GET_PTR(descr[1]);
    int ifactor;
    float ffactor;

    starpu_codelet_unpack_args(_args, &ifactor, &ffactor);
    *x0 = *x0 * ifactor;
    *x1 = *x1 * ffactor;
}

struct starpu_codelet mycodelet = {
    .where = STARPU_CPU,
    .cpu_funcs = { func_cpu, NULL },
    .cpu_funcs_name = { "func_cpu", NULL },
    .nbuffers = 2,
    .modes = { STARPU_RW, STARPU_RW }
};
\endcode

And the call to the function starpu_insert_task():

\code{.c}
starpu_insert_task(&mycodelet,
                   STARPU_VALUE, &ifactor, sizeof(ifactor),
                   STARPU_VALUE, &ffactor, sizeof(ffactor),
                   STARPU_RW, data_handles[0], STARPU_RW, data_handles[1],
                   0);
\endcode

The call to starpu_insert_task() is equivalent to the following
code:

\code{.c}
struct starpu_task *task = starpu_task_create();
task->cl = &mycodelet;
task->handles[0] = data_handles[0];
task->handles[1] = data_handles[1];

char *arg_buffer;
size_t arg_buffer_size;
starpu_codelet_pack_args(&arg_buffer, &arg_buffer_size,
                         STARPU_VALUE, &ifactor, sizeof(ifactor),
                         STARPU_VALUE, &ffactor, sizeof(ffactor),
                         0);
task->cl_arg = arg_buffer;
task->cl_arg_size = arg_buffer_size;

int ret = starpu_task_submit(task);
\endcode

Here is a similar call using ::STARPU_DATA_ARRAY.

\code{.c}
starpu_insert_task(&mycodelet,
                   STARPU_DATA_ARRAY, data_handles, 2,
                   STARPU_VALUE, &ifactor, sizeof(ifactor),
                   STARPU_VALUE, &ffactor, sizeof(ffactor),
                   0);
\endcode

If some part of the task insertion depends on the value of some computation,
the macro ::STARPU_DATA_ACQUIRE_CB can be very convenient. For
instance, assuming that the index variable <c>i</c> was registered as handle
<c>A_handle[i]</c>:

\code{.c}
/* Compute which portion we will work on, e.g. pivot */
starpu_insert_task(&which_index, STARPU_W, i_handle, 0);

/* And submit the corresponding task */
STARPU_DATA_ACQUIRE_CB(i_handle, STARPU_R,
                       starpu_insert_task(&work, STARPU_RW, A_handle[i], 0));
\endcode

The macro ::STARPU_DATA_ACQUIRE_CB submits an asynchronous request for
acquiring data <c>i</c> for the main application, and will execute the code
given as third parameter when it is acquired. In other words, as soon as the
value of <c>i</c> computed by the codelet <c>which_index</c> can be read, the
portion of code passed as third parameter of ::STARPU_DATA_ACQUIRE_CB will
be executed, and is allowed to read from <c>i</c> to use it e.g. as an
index. Note that this macro is only available when compiling StarPU with
the compiler <c>gcc</c>.
\section DataReduction Data Reduction

In various cases, some piece of data is used to accumulate intermediate
results. For instance, the dot product of a vector, maximum/minimum finding,
the histogram of a photograph, etc. When these results are produced all over
the machine, it would not be efficient to accumulate them in only one place,
incurring data transfers for each contribution, as well as access concurrency.

StarPU provides a mode ::STARPU_REDUX, which permits optimizing
that case: it will allocate a buffer on each memory node, and accumulate
intermediate results there. When the data is eventually accessed in the normal
mode ::STARPU_R, StarPU will collect the intermediate results in just one
buffer.

For this to work, the user has to use the function
starpu_data_set_reduction_methods() to declare how to initialize these
buffers, and how to assemble partial results.

For instance, <c>cg</c> uses that to optimize its dot product: it first defines
the codelets for initialization and reduction:

\code{.c}
struct starpu_codelet bzero_variable_cl =
{
    .cpu_funcs = { bzero_variable_cpu, NULL },
    .cpu_funcs_name = { "bzero_variable_cpu", NULL },
    .cuda_funcs = { bzero_variable_cuda, NULL },
    .nbuffers = 1,
};

static void accumulate_variable_cpu(void *descr[], void *cl_arg)
{
    double *v_dst = (double *)STARPU_VARIABLE_GET_PTR(descr[0]);
    double *v_src = (double *)STARPU_VARIABLE_GET_PTR(descr[1]);
    *v_dst = *v_dst + *v_src;
}

static void accumulate_variable_cuda(void *descr[], void *cl_arg)
{
    double *v_dst = (double *)STARPU_VARIABLE_GET_PTR(descr[0]);
    double *v_src = (double *)STARPU_VARIABLE_GET_PTR(descr[1]);
    cublasDaxpy(1, (double)1.0, v_src, 1, v_dst, 1);
    cudaStreamSynchronize(starpu_cuda_get_local_stream());
}

struct starpu_codelet accumulate_variable_cl =
{
    .cpu_funcs = { accumulate_variable_cpu, NULL },
    .cpu_funcs_name = { "accumulate_variable_cpu", NULL },
    .cuda_funcs = { accumulate_variable_cuda, NULL },
    .nbuffers = 1,
};
\endcode

and attaches them as reduction methods for its handle <c>dtq</c>:

\code{.c}
starpu_variable_data_register(&dtq_handle, -1, NULL, sizeof(type));
starpu_data_set_reduction_methods(dtq_handle,
                                  &accumulate_variable_cl, &bzero_variable_cl);
\endcode

and <c>dtq_handle</c> can now be used in mode ::STARPU_REDUX for the
dot products with partitioned vectors:

\code{.c}
for (b = 0; b < nblocks; b++)
    starpu_insert_task(&dot_kernel_cl,
                       STARPU_REDUX, dtq_handle,
                       STARPU_R, starpu_data_get_sub_data(v1, 1, b),
                       STARPU_R, starpu_data_get_sub_data(v2, 1, b),
                       0);
\endcode

During registration, we have here provided <c>NULL</c>, i.e. there is
no initial value to be taken into account during reduction. StarPU
will thus only take into account the contributions from the tasks
<c>dot_kernel_cl</c>. Also, it will not allocate any memory for
<c>dtq_handle</c> before tasks <c>dot_kernel_cl</c> are ready to run.

If another dot product has to be performed, one could unregister
<c>dtq_handle</c>, and re-register it. But one can also call
starpu_data_invalidate_submit() with the parameter <c>dtq_handle</c>,
which will clear all data from the handle, thus resetting it back to
the initial status <c>register(NULL)</c>.
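For instance, between two reduction phases:

\code{.c}
/* Reset dtq_handle to its just-registered state before reusing it
 * for the next dot product. */
starpu_data_invalidate_submit(dtq_handle);
\endcode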
The example <c>cg</c> also uses reduction for the blocked gemv kernel,
leading to yet more relaxed dependencies and more parallelism.

::STARPU_REDUX can also be passed to starpu_mpi_insert_task() in the MPI
case. That will however not produce any MPI communication, but just pass
::STARPU_REDUX to the underlying starpu_insert_task(). It is up to the
application to call starpu_mpi_redux_data(), which posts tasks that will
reduce the partial results among MPI nodes into the MPI node which owns the
data. For instance, some hypothetical application which collects partial results
into data <c>res</c>, then uses it for other computation, before looping again
with a new reduction:

\code{.c}
for (i = 0; i < 100; i++)
{
    starpu_mpi_insert_task(MPI_COMM_WORLD, &init_res, STARPU_W, res, 0);
    starpu_mpi_insert_task(MPI_COMM_WORLD, &work, STARPU_RW, A,
                           STARPU_R, B, STARPU_REDUX, res, 0);
    starpu_mpi_redux_data(MPI_COMM_WORLD, res);
    starpu_mpi_insert_task(MPI_COMM_WORLD, &work2, STARPU_RW, B, STARPU_R, res, 0);
}
\endcode
\section TemporaryBuffers Temporary Buffers

There are two kinds of temporary buffers: temporary data which just pass results
from one task to another, and scratch data which are needed only internally by
tasks.

\subsection TemporaryData Temporary Data

Data can sometimes be entirely produced by a task, and entirely consumed by
another task, without the need for other parts of the application to access
it. In such case, registration can be done without prior allocation, by using
the special memory node number <c>-1</c>, and passing a zero pointer. StarPU will
actually allocate memory only when the task creating the content gets scheduled,
and destroy it on unregistration.

In addition to that, it can be tedious for the application to have to unregister
the data, since it will not use its content anyway. The unregistration can be
done lazily by using the function starpu_data_unregister_submit(),
which will record that no more tasks accessing the handle will be submitted, so
that it can be freed as soon as the last task accessing it is over.

The following code exemplifies both points: it registers the temporary
data, submits three tasks accessing it, and records the data for automatic
unregistration.

\code{.c}
starpu_vector_data_register(&handle, -1, 0, n, sizeof(float));
starpu_insert_task(&produce_data, STARPU_W, handle, 0);
starpu_insert_task(&compute_data, STARPU_RW, handle, 0);
starpu_insert_task(&summarize_data, STARPU_R, handle, STARPU_W, result_handle, 0);
starpu_data_unregister_submit(handle);
\endcode

\subsection ScratchData Scratch Data

Some kernels sometimes need temporary data to achieve the computations, i.e. a
workspace. The application could allocate it at the start of the codelet
function, and free it at the end, but that would be costly. It could also
allocate one buffer per worker (similarly to \ref
HowToInitializeAComputationLibraryOnceForEachWorker), but that would
make them systematic and permanent. A more optimized way is to use
the data access mode ::STARPU_SCRATCH, as exemplified below, which
provides per-worker buffers without content consistency.

\code{.c}
starpu_vector_data_register(&workspace, -1, 0, workspace_size, sizeof(float));
for (i = 0; i < N; i++)
    starpu_insert_task(&compute, STARPU_R, input[i],
                       STARPU_SCRATCH, workspace, STARPU_W, output[i], 0);
\endcode

StarPU will make sure that the buffer is allocated before executing the task,
and make this allocation per-worker: for CPU workers, notably, each worker has
its own buffer. This means that each task submitted above will actually have its
own workspace, which will actually be the same for all tasks running one after
the other on the same worker. Also, if for instance GPU memory becomes scarce,
StarPU will notice that it can free such buffers easily, since the content does
not matter.

The example <c>examples/pi</c> uses a scratch buffer for some temporary data.
\section ParallelTasks Parallel Tasks

StarPU can leverage existing parallel computation libraries by the means of
parallel tasks. A parallel task is a task which gets worked on by a set of CPUs
(called a parallel or combined worker) at the same time, by using an existing
parallel CPU implementation of the computation to be achieved. This can also be
useful to improve the load balance between slow CPUs and fast GPUs: since CPUs
work collectively on a single task, the completion time of tasks on CPUs becomes
comparable to the completion time on GPUs, thus relieving from granularity
discrepancy concerns. <c>hwloc</c> support needs to be enabled to get
good performance, otherwise StarPU will not know how to better group
cores.

Two modes of execution exist to accommodate existing usages.

\subsection Fork-modeParallelTasks Fork-mode Parallel Tasks

In the Fork mode, StarPU will call the codelet function on one
of the CPUs of the combined worker. The codelet function can use
starpu_combined_worker_get_size() to get the number of threads it is
allowed to start to achieve the computation. The CPU binding mask for the whole
set of CPUs is already enforced, so that threads created by the function will
inherit the mask, and thus execute where StarPU expected, the OS being in charge
of choosing how to schedule threads on the corresponding CPUs. The application
can also choose to bind threads by hand, using e.g. <c>sched_getaffinity</c> to know
the CPU binding mask that StarPU chose.

For instance, using OpenMP (full source is available in
<c>examples/openmp/vector_scal.c</c>):

\snippet forkmode.c To be included

Other examples include for instance calling a BLAS parallel CPU implementation
(see <c>examples/mult/xgemm.c</c>).
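Declaring a fork-mode codelet is then a matter of setting starpu_codelet::type; a sketch, assuming a parallel CPU implementation <c>scal_omp_func</c> along the lines of the snippet above:

\code{.c}
static struct starpu_codelet scal_cl =
{
    .modes = { STARPU_RW },
    .where = STARPU_CPU,
    .type = STARPU_FORKJOIN, /* the default is STARPU_SEQ */
    .max_parallelism = INT_MAX,
    .cpu_funcs = { scal_omp_func, NULL },
    .cpu_funcs_name = { "scal_omp_func", NULL },
    .nbuffers = 1,
};
\endcode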
\subsection SPMD-modeParallelTasks SPMD-mode Parallel Tasks

In the SPMD mode, StarPU will call the codelet function on
each CPU of the combined worker. The codelet function can use
starpu_combined_worker_get_size() to get the total number of CPUs
involved in the combined worker, and thus the number of calls that are made in
parallel to the function, and starpu_combined_worker_get_rank() to get
the rank of the current CPU within the combined worker. For instance:

\code{.c}
static void func(void *buffers[], void *_args)
{
    unsigned i;
    float *factor = _args;
    struct starpu_vector_interface *vector = buffers[0];
    unsigned n = STARPU_VECTOR_GET_NX(vector);
    float *val = (float *)STARPU_VECTOR_GET_PTR(vector);

    /* Compute slice to compute */
    unsigned m = starpu_combined_worker_get_size();
    unsigned j = starpu_combined_worker_get_rank();
    unsigned slice = (n+m-1)/m;

    for (i = j * slice; i < (j+1) * slice && i < n; i++)
        val[i] *= *factor;
}

static struct starpu_codelet cl =
{
    .modes = { STARPU_RW },
    .where = STARPU_CPU,
    .type = STARPU_SPMD,
    .max_parallelism = INT_MAX,
    .cpu_funcs = { func, NULL },
    .cpu_funcs_name = { "func", NULL },
    .nbuffers = 1,
};
\endcode

Of course, this trivial example will not really benefit from parallel task
execution, and was only meant to be simple to understand. The benefit comes
when the computation to be done is such that threads have to e.g. exchange
intermediate results, or write to the data in a complex but safe way in the same
buffer.
\subsection ParallelTasksPerformance Parallel Tasks Performance

To benefit from parallel tasks, a parallel-task-aware StarPU scheduler has to
be used. When exposed to codelets with a flag ::STARPU_FORKJOIN or
::STARPU_SPMD, the schedulers <c>pheft</c> (parallel-heft) and <c>peager</c>
(parallel eager) will indeed also try to execute tasks with
several CPUs. They will automatically try the various available combined
worker sizes (making several measurements for each worker size) and
thus be able to avoid choosing a large combined worker if the codelet
does not actually scale so much.

\subsection CombinedWorkers Combined Workers

By default, StarPU creates combined workers according to the architecture
structure as detected by <c>hwloc</c>. It means that for each object of the <c>hwloc</c>
topology (NUMA node, socket, cache, ...) a combined worker will be created. If
some nodes of the hierarchy have a big arity (e.g. many cores in a socket
without a hierarchy of shared caches), StarPU will create combined workers of
intermediate sizes. The variable \ref
STARPU_SYNTHESIZE_ARITY_COMBINED_WORKER permits tuning the maximum
arity between levels of combined workers.

The combined workers actually produced can be seen in the output of the
tool <c>starpu_machine_display</c> (the environment variable \ref
STARPU_SCHED has to be set to a combined worker-aware scheduler such
as <c>pheft</c> or <c>peager</c>).

\subsection ConcurrentParallelTasks Concurrent Parallel Tasks

Unfortunately, many environments and libraries do not support concurrent
calls.

For instance, most OpenMP implementations (including the main ones) do not
support concurrent <c>pragma omp parallel</c> statements without nesting them in
another <c>pragma omp parallel</c> statement, but StarPU does not yet support
creating its CPU workers by using such a pragma.

Other parallel libraries are also not safe when being invoked concurrently
from different threads, due to the use of global variables in their sequential
sections for instance.

The solution is then to use only one combined worker at a time. This can be
done by setting the field starpu_conf::single_combined_worker to <c>1</c>, or
setting the environment variable \ref STARPU_SINGLE_COMBINED_WORKER
to <c>1</c>. StarPU will then run only one parallel task at a time (but other
CPU and GPU tasks are not affected and can be run concurrently). The parallel
task scheduler will however still try varying combined worker
sizes to look for the most efficient ones.
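A sketch of selecting this behaviour programmatically, using the actual starpu_conf field:

\code{.c}
struct starpu_conf conf;
starpu_conf_init(&conf);
/* Run at most one parallel task at a time. */
conf.single_combined_worker = 1;
int ret = starpu_init(&conf);
\endcode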
\section Debugging Debugging

StarPU provides several tools to help debugging applications. Execution traces
can be generated and displayed graphically, see \ref
GeneratingTracesWithFxT. Some gdb helpers are also provided to show
the whole StarPU state:

\verbatim
(gdb) source tools/gdbinit
(gdb) help starpu
\endverbatim

The Temanejo task debugger can also be used, see \ref UsingTheTemanejoTaskDebugger.

\section TheMultiformatInterface The Multiformat Interface

It may be interesting to represent the same piece of data using two different
data structures: one that would only be used on CPUs, and one that would only
be used on GPUs. This can be done by using the multiformat interface. StarPU
will be able to convert data from one data structure to the other when needed.
Note that the scheduler <c>dmda</c> is the only one optimized for this
interface. The user must provide StarPU with conversion codelets:

\snippet multiformat.c To be included

Kernels can be written almost as for any other interface. Note that
::STARPU_MULTIFORMAT_GET_CPU_PTR shall only be used for CPU kernels. CUDA kernels
must use ::STARPU_MULTIFORMAT_GET_CUDA_PTR, and OpenCL kernels must use
::STARPU_MULTIFORMAT_GET_OPENCL_PTR. ::STARPU_MULTIFORMAT_GET_NX may
be used in any kind of kernel.
\code{.c}
static void
multiformat_scal_cpu_func(void *buffers[], void *args)
{
    struct point *aos;
    unsigned int n;

    aos = STARPU_MULTIFORMAT_GET_CPU_PTR(buffers[0]);
    n = STARPU_MULTIFORMAT_GET_NX(buffers[0]);
    ...
}

extern "C" void multiformat_scal_cuda_func(void *buffers[], void *_args)
{
    unsigned int n;
    struct struct_of_arrays *soa;

    soa = (struct struct_of_arrays *) STARPU_MULTIFORMAT_GET_CUDA_PTR(buffers[0]);
    n = STARPU_MULTIFORMAT_GET_NX(buffers[0]);
    ...
}
\endcode

A full example may be found in <c>examples/basic_examples/multiformat.c</c>.
\section UsingTheDriverAPI Using The Driver API

\ref API_Running_Drivers

The application can run a driver by itself, instead of letting StarPU run it in a separate thread, for instance:

\code{.c}
int ret;
struct starpu_driver d = {
    .type = STARPU_CUDA_WORKER,
    .id.cuda_id = 0
};

ret = starpu_driver_init(&d);
if (ret != 0)
    error();
while (some_condition)
{
    ret = starpu_driver_run_once(&d);
    if (ret != 0)
        error();
}
ret = starpu_driver_deinit(&d);
if (ret != 0)
    error();
\endcode

To add a new kind of device to the structure starpu_driver, one needs to:
<ol>
<li> Add a member to the union starpu_driver::id
</li>
<li> Modify the internal function <c>_starpu_launch_drivers()</c> to
make sure the driver is not always launched.
</li>
<li> Modify the function starpu_driver_run() so that it can handle
another kind of architecture.
</li>
<li> Write the new function <c>_starpu_run_foobar()</c> in the
corresponding driver.
</li>
</ol>
\section DefiningANewSchedulingPolicy Defining A New Scheduling Policy

A full example showing how to define a new scheduling policy is available in
the StarPU sources in the directory <c>examples/scheduler/</c>.

See \ref API_Scheduling_Policy

\code{.c}
static struct starpu_sched_policy dummy_sched_policy = {
    .init_sched = init_dummy_sched,
    .deinit_sched = deinit_dummy_sched,
    .add_workers = dummy_sched_add_workers,
    .remove_workers = dummy_sched_remove_workers,
    .push_task = push_task_dummy,
    .push_prio_task = NULL,
    .pop_task = pop_task_dummy,
    .post_exec_hook = NULL,
    .pop_every_task = NULL,
    .policy_name = "dummy",
    .policy_description = "dummy scheduling strategy"
};
\endcode
\section On-GPURendering On-GPU Rendering

Graphical-oriented applications need to draw the result of their computations,
typically on the very GPU where these happened. Technologies such as OpenGL/CUDA
interoperability permit to let CUDA directly work on the OpenGL buffers, making
them thus immediately ready for drawing, by mapping OpenGL buffers, textures or
renderbuffer objects into CUDA. CUDA however imposes some technical
constraints: peer memcpy has to be disabled, and the thread that runs OpenGL has
to be the one that runs CUDA computations for that GPU.

To achieve this with StarPU, pass the option
\ref disable-cuda-memcpy-peer "--disable-cuda-memcpy-peer"
to <c>./configure</c> (TODO: make it dynamic), OpenGL/GLUT has to be initialized
first, and the interoperability mode has to
be enabled by using the field
starpu_conf::cuda_opengl_interoperability, and the driver loop has to
be run by the application, by using the field
starpu_conf::not_launched_drivers to prevent StarPU from running it in
a separate thread, and by using starpu_driver_run() to run the loop.

The examples <c>gl_interop</c> and <c>gl_interop_idle</c> show how it
articulates in a simple case, where rendering is done in task
callbacks. The former uses <c>glutMainLoopEvent</c> to make GLUT
progress from the StarPU driver loop, while the latter uses
<c>glutIdleFunc</c> to make StarPU progress from the GLUT main loop.
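A sketch of the corresponding initialization, assuming that CUDA device 0 is the one to be driven by the application thread:

\code{.c}
struct starpu_conf conf;
starpu_conf_init(&conf);

/* Enable CUDA/OpenGL interoperability on CUDA device 0. */
unsigned interop_devices[] = { 0 };
conf.cuda_opengl_interoperability = interop_devices;
conf.n_cuda_opengl_interoperability = 1;

/* Do not launch the CUDA 0 driver: the application will run it
 * itself, from the thread which owns the OpenGL context. */
static struct starpu_driver drivers[] = {
    { .type = STARPU_CUDA_WORKER, .id.cuda_id = 0 }
};
conf.not_launched_drivers = drivers;
conf.n_not_launched_drivers = 1;

starpu_init(&conf);

/* Later, from the OpenGL thread: */
starpu_driver_run(&drivers[0]);
\endcode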
Then, to use an OpenGL buffer as a CUDA data, StarPU simply needs to be given
the CUDA pointer at registration, for instance:

\code{.c}
/* Get the CUDA worker id */
for (workerid = 0; workerid < starpu_worker_get_count(); workerid++)
    if (starpu_worker_get_type(workerid) == STARPU_CUDA_WORKER)
        break;

/* Build a CUDA pointer pointing at the OpenGL buffer */
cudaGraphicsResourceGetMappedPointer((void**)&output, &num_bytes, resource);

/* And register it to StarPU */
starpu_vector_data_register(&handle, starpu_worker_get_memory_node(workerid),
                            (uintptr_t) output, num_bytes / sizeof(float4), sizeof(float4));

/* The handle can now be used as usual */
starpu_insert_task(&cl, STARPU_RW, handle, 0);

/* ... */

/* This gets back data into the OpenGL buffer */
starpu_data_unregister(handle);
\endcode

and display it e.g. in the callback function.
\section DefiningANewDataInterface Defining A New Data Interface

Let's define a new data interface to manage complex numbers.

\code{.c}
/* interface for complex numbers */
struct starpu_complex_interface
{
    double *real;
    double *imaginary;
    int nx;
};
\endcode

Registering such a data to StarPU is easily done using the function
starpu_data_register(). The last
parameter of the function, <c>interface_complex_ops</c>, will be
described below.

\code{.c}
void starpu_complex_data_register(starpu_data_handle_t *handle,
                                  unsigned home_node, double *real, double *imaginary, int nx)
{
    struct starpu_complex_interface complex =
    {
        .real = real,
        .imaginary = imaginary,
        .nx = nx
    };

    if (interface_complex_ops.interfaceid == STARPU_UNKNOWN_INTERFACE_ID)
    {
        interface_complex_ops.interfaceid = starpu_data_interface_get_next_id();
    }

    starpu_data_register(handle, home_node, &complex, &interface_complex_ops);
}
\endcode

Different operations need to be defined for a data interface through
the type starpu_data_interface_ops. We only define here the basic
operations needed to run simple applications. The source code for the
different functions can be found in the file
<c>examples/interface/complex_interface.c</c>.

\code{.c}
static struct starpu_data_interface_ops interface_complex_ops =
{
    .register_data_handle = complex_register_data_handle,
    .allocate_data_on_node = complex_allocate_data_on_node,
    .copy_methods = &complex_copy_methods,
    .get_size = complex_get_size,
    .footprint = complex_footprint,
    .interfaceid = STARPU_UNKNOWN_INTERFACE_ID,
    .interface_size = sizeof(struct starpu_complex_interface),
};
\endcode
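The size and footprint methods, for instance, can remain very simple; a sketch along the lines of the actual implementation in <c>complex_interface.c</c>:

\code{.c}
static size_t complex_get_size(starpu_data_handle_t handle)
{
    struct starpu_complex_interface *complex_interface =
        (struct starpu_complex_interface *) starpu_data_get_interface_on_node(handle, 0);
    /* Two arrays of nx doubles: real and imaginary parts. */
    return complex_interface->nx * 2 * sizeof(double);
}

static uint32_t complex_footprint(starpu_data_handle_t handle)
{
    /* Hash the number of elements, which is what determines task size. */
    return starpu_hash_crc32c_be(starpu_complex_get_nx(handle), 0);
}
\endcode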
Functions need to be defined to access the different fields of the
complex interface from a StarPU data handle.

\code{.c}
double *starpu_complex_get_real(starpu_data_handle_t handle)
{
    struct starpu_complex_interface *complex_interface =
        (struct starpu_complex_interface *) starpu_data_get_interface_on_node(handle, 0);
    return complex_interface->real;
}

double *starpu_complex_get_imaginary(starpu_data_handle_t handle);
int starpu_complex_get_nx(starpu_data_handle_t handle);
\endcode

Similar functions need to be defined to access the different fields of the
complex interface from a <c>void *</c> pointer to be used within codelet
implementations.

\snippet complex.c To be included

Complex data interfaces can then be registered to StarPU.

\code{.c}
double real = 45.0;
double imaginary = 12.0;

starpu_complex_data_register(&handle1, STARPU_MAIN_RAM, &real, &imaginary, 1);
starpu_insert_task(&cl_display, STARPU_R, handle1, 0);
\endcode

and used by codelets.

\code{.c}
void display_complex_codelet(void *descr[], __attribute__ ((unused)) void *_args)
{
    int nx = STARPU_COMPLEX_GET_NX(descr[0]);
    double *real = STARPU_COMPLEX_GET_REAL(descr[0]);
    double *imaginary = STARPU_COMPLEX_GET_IMAGINARY(descr[0]);
    int i;

    for(i=0 ; i<nx ; i++)
    {
        fprintf(stderr, "Complex[%d] = %3.2f + %3.2f i\n", i, real[i], imaginary[i]);
    }
}
\endcode

The whole code for this complex data interface is available in the
directory <c>examples/interface/</c>.
\section SettingTheDataHandlesForATask Setting The Data Handles For A Task

The maximum number of data a task can manage is fixed by the macro
\ref STARPU_NMAXBUFS, which has a default value which can be changed
through the configure option \ref enable-maxbuffers "--enable-maxbuffers".

However, it is possible to define tasks managing more data by using
the field starpu_task::dyn_handles when defining a task and the field
starpu_codelet::dyn_modes when defining the corresponding codelet.

\code{.c}
enum starpu_data_access_mode modes[STARPU_NMAXBUFS+1] = {
    STARPU_R, STARPU_R, ...
};

struct starpu_codelet dummy_big_cl =
{
    .cuda_funcs = { dummy_big_kernel, NULL },
    .opencl_funcs = { dummy_big_kernel, NULL },
    .cpu_funcs = { dummy_big_kernel, NULL },
    .cpu_funcs_name = { "dummy_big_kernel", NULL },
    .nbuffers = STARPU_NMAXBUFS+1,
    .dyn_modes = modes
};

task = starpu_task_create();
task->cl = &dummy_big_cl;
task->dyn_handles = malloc(task->cl->nbuffers * sizeof(starpu_data_handle_t));
for(i=0 ; i<task->cl->nbuffers ; i++)
{
    task->dyn_handles[i] = handle;
}
starpu_task_submit(task);
\endcode

The same can be done with starpu_insert_task(), by passing the handles
through ::STARPU_DATA_ARRAY:

\code{.c}
starpu_data_handle_t *handles = malloc(dummy_big_cl.nbuffers * sizeof(starpu_data_handle_t));
for(i=0 ; i<dummy_big_cl.nbuffers ; i++)
{
    handles[i] = handle;
}
starpu_insert_task(&dummy_big_cl,
                   STARPU_VALUE, &dummy_big_cl.nbuffers, sizeof(dummy_big_cl.nbuffers),
                   STARPU_DATA_ARRAY, handles, dummy_big_cl.nbuffers,
                   0);
\endcode

The whole code for this example is available in the file
<c>examples/basic_examples/dynamic_handles.c</c>.
\section MoreExamples More Examples

More examples are available in the StarPU sources in the directory
<c>examples/</c>. Simple examples include:

<dl>
<dt> <c>incrementer/</c> </dt>
<dd> Trivial incrementation test. </dd>
<dt> <c>basic_examples/</c> </dt>
<dd>
Simple documented Hello world and vector/scalar product (as
shown in \ref BasicExamples), matrix
product examples (as shown in \ref PerformanceModelExample), an example using the blocked matrix data
interface, an example using the variable data interface, and an example
using different formats on CPUs and GPUs.
</dd>
<dt> <c>matvecmult/</c></dt>
<dd>
OpenCL example from NVidia, adapted to StarPU.
</dd>
<dt> <c>axpy/</c></dt>
<dd>
AXPY CUBLAS operation adapted to StarPU.
</dd>
<dt> <c>fortran/</c> </dt>
<dd>
Example of Fortran bindings.
</dd>
</dl>

More advanced examples include:

<dl>
<dt><c>filters/</c></dt>
<dd>
Examples using filters, as shown in \ref PartitioningData.
</dd>
<dt><c>lu/</c></dt>
<dd>
LU matrix factorization, see for instance <c>xlu_implicit.c</c>.
</dd>
<dt><c>cholesky/</c></dt>
<dd>
Cholesky matrix factorization, see for instance <c>cholesky_implicit.c</c>.
</dd>
</dl>

*/