advanced-examples.texi 45 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142
  1. @c -*-texinfo-*-
  2. @c This file is part of the StarPU Handbook.
  3. @c Copyright (C) 2009--2011 Universit@'e de Bordeaux 1
  4. @c Copyright (C) 2010, 2011, 2012 Centre National de la Recherche Scientifique
  5. @c Copyright (C) 2011 Institut National de Recherche en Informatique et Automatique
  6. @c See the file starpu.texi for copying conditions.
  7. @menu
  8. * Using multiple implementations of a codelet::
  9. * Enabling implementation according to capabilities::
  10. * Task and Worker Profiling::
  11. * Partitioning Data::
  12. * Performance model example::
  13. * Theoretical lower bound on execution time::
  14. * Insert Task Utility::
  15. * Data reduction::
  16. * Temporary buffers::
  17. * Parallel Tasks::
  18. * Debugging::
  19. * The multiformat interface::
  20. * On-GPU rendering::
  21. * More examples:: More examples shipped with StarPU
  22. @end menu
  23. @node Using multiple implementations of a codelet
  24. @section Using multiple implementations of a codelet
  25. One may want to write multiple implementations of a codelet for a single type of
  26. device and let StarPU choose which one to run. As an example, we will show how
  27. to use SSE to scale a vector. The codelet can be written as follows:
  28. @cartouche
  29. @smallexample
  30. #include <xmmintrin.h>
  31. void scal_sse_func(void *buffers[], void *cl_arg)
  32. @{
  33. float *vector = (float *) STARPU_VECTOR_GET_PTR(buffers[0]);
  34. unsigned int n = STARPU_VECTOR_GET_NX(buffers[0]);
  35. unsigned int n_iterations = n/4;
  36. if (n % 4 != 0)
  37. n_iterations++;
  38. __m128 *VECTOR = (__m128*) vector;
  39. __m128 factor __attribute__((aligned(16)));
  40. factor = _mm_set1_ps(*(float *) cl_arg);
  41. unsigned int i;
  42. for (i = 0; i < n_iterations; i++)
  43. VECTOR[i] = _mm_mul_ps(factor, VECTOR[i]);
  44. @}
  45. @end smallexample
  46. @end cartouche
  47. @cartouche
  48. @smallexample
  49. struct starpu_codelet cl = @{
  50. .where = STARPU_CPU,
  51. .cpu_funcs = @{ scal_cpu_func, scal_sse_func, NULL @},
  52. .nbuffers = 1,
  53. .modes = @{ STARPU_RW @}
  54. @};
  55. @end smallexample
  56. @end cartouche
  57. Schedulers which are multi-implementation aware (only @code{dmda}, @code{heft}
  58. and @code{pheft} for now) will use the performance models of all the
  59. implementations they were given, and pick the one that seems to be the fastest.
  60. @node Enabling implementation according to capabilities
  61. @section Enabling implementation according to capabilities
  62. Some implementations may not run on some devices. For instance, some CUDA
  63. devices do not support double floating point precision, and thus the kernel
  64. execution would just fail; or the device may not have enough shared memory for
  65. the implementation being used. The @code{can_execute} field of the @code{struct
  66. starpu_codelet} structure permits to express this. For instance:
  67. @cartouche
  68. @smallexample
  69. static int can_execute(unsigned workerid, struct starpu_task *task, unsigned nimpl)
  70. @{
  71. const struct cudaDeviceProp *props;
  72. if (starpu_worker_get_type(workerid) == STARPU_CPU_WORKER)
  73. return 1;
  74. /* Cuda device */
  75. props = starpu_cuda_get_device_properties(workerid);
  76. if (props->major >= 2 || props->minor >= 3)
  77. /* At least compute capability 1.3, supports doubles */
  78. return 1;
  79. /* Old card, does not support doubles */
  80. return 0;
  81. @}
  82. struct starpu_codelet cl = @{
  83. .where = STARPU_CPU|STARPU_CUDA,
  84. .can_execute = can_execute,
  85. .cpu_funcs = @{ cpu_func, NULL @},
  86. .cuda_funcs = @{ gpu_func, NULL @}
  87. .nbuffers = 1,
  88. .modes = @{ STARPU_RW @}
  89. @};
  90. @end smallexample
  91. @end cartouche
  92. This can be essential e.g. when running on a machine which mixes various models
  93. of CUDA devices, to take benefit from the new models without crashing on old models.
  94. Note: the @code{can_execute} function is called by the scheduler each time it
  95. tries to match a task with a worker, and should thus be very fast. The
  96. @code{starpu_cuda_get_device_properties} provides a quick access to CUDA
  97. properties of CUDA devices to achieve such efficiency.
  98. Another example is compiling CUDA code for various compute capabilities,
  99. resulting in two CUDA functions, e.g. @code{scal_gpu_13} for compute capability
  100. 1.3, and @code{scal_gpu_20} for compute capability 2.0. Both functions can be
  101. provided to StarPU by using @code{cuda_funcs}, and @code{can_execute} can then be
  102. used to rule out the @code{scal_gpu_20} variant on a CUDA device which
  103. will not be able to execute it:
  104. @cartouche
  105. @smallexample
  106. static int can_execute(unsigned workerid, struct starpu_task *task, unsigned nimpl)
  107. @{
  108. const struct cudaDeviceProp *props;
  109. if (starpu_worker_get_type(workerid) == STARPU_CPU_WORKER)
  110. return 1;
  111. /* Cuda device */
  112. if (nimpl == 0)
  113. /* Trying to execute the 1.3 capability variant, we assume it is ok in all cases. */
  114. return 1;
  115. /* Trying to execute the 2.0 capability variant, check that the card can do it. */
  116. props = starpu_cuda_get_device_properties(workerid);
  117. if (props->major >= 2 || props->minor >= 0)
  118. /* At least compute capability 2.0, can run it */
  119. return 1;
  120. /* Old card, does not support 2.0, will not be able to execute the 2.0 variant. */
  121. return 0;
  122. @}
  123. struct starpu_codelet cl = @{
  124. .where = STARPU_CPU|STARPU_CUDA,
  125. .can_execute = can_execute,
  126. .cpu_funcs = @{ cpu_func, NULL @},
  127. .cuda_funcs = @{ scal_gpu_13, scal_gpu_20, NULL @},
  128. .nbuffers = 1,
  129. .modes = @{ STARPU_RW @}
  130. @};
  131. @end smallexample
  132. @end cartouche
  133. Note: the most generic variant should be provided first, as some schedulers are
  134. not able to try the different variants.
  135. @node Task and Worker Profiling
  136. @section Task and Worker Profiling
  137. A full example showing how to use the profiling API is available in
  138. the StarPU sources in the directory @code{examples/profiling/}.
  139. @cartouche
  140. @smallexample
  141. struct starpu_task *task = starpu_task_create();
  142. task->cl = &cl;
  143. task->synchronous = 1;
  144. /* We will destroy the task structure by hand so that we can
  145. * query the profiling info before the task is destroyed. */
  146. task->destroy = 0;
  147. /* Submit and wait for completion (since synchronous was set to 1) */
  148. starpu_task_submit(task);
  149. /* The task is finished, get profiling information */
  150. struct starpu_task_profiling_info *info = task->profiling_info;
  151. /* How much time did it take before the task started ? */
  152. double delay += starpu_timing_timespec_delay_us(&info->submit_time, &info->start_time);
  153. /* How long was the task execution ? */
  154. double length += starpu_timing_timespec_delay_us(&info->start_time, &info->end_time);
  155. /* We don't need the task structure anymore */
  156. starpu_task_destroy(task);
  157. @end smallexample
  158. @end cartouche
  159. @cartouche
  160. @smallexample
  161. /* Display the occupancy of all workers during the test */
  162. int worker;
  163. for (worker = 0; worker < starpu_worker_get_count(); worker++)
  164. @{
  165. struct starpu_worker_profiling_info worker_info;
  166. int ret = starpu_worker_get_profiling_info(worker, &worker_info);
  167. STARPU_ASSERT(!ret);
  168. double total_time = starpu_timing_timespec_to_us(&worker_info.total_time);
  169. double executing_time = starpu_timing_timespec_to_us(&worker_info.executing_time);
  170. double sleeping_time = starpu_timing_timespec_to_us(&worker_info.sleeping_time);
  171. double overhead_time = total_time - executing_time - sleeping_time;
  172. float executing_ratio = 100.0*executing_time/total_time;
  173. float sleeping_ratio = 100.0*sleeping_time/total_time;
  174. float overhead_ratio = 100.0 - executing_ratio - sleeping_ratio;
  175. char workername[128];
  176. starpu_worker_get_name(worker, workername, 128);
  177. fprintf(stderr, "Worker %s:\n", workername);
  178. fprintf(stderr, "\ttotal time: %.2lf ms\n", total_time*1e-3);
  179. fprintf(stderr, "\texec time: %.2lf ms (%.2f %%)\n", executing_time*1e-3,
  180. executing_ratio);
  181. fprintf(stderr, "\tblocked time: %.2lf ms (%.2f %%)\n", sleeping_time*1e-3,
  182. sleeping_ratio);
  183. fprintf(stderr, "\toverhead time: %.2lf ms (%.2f %%)\n", overhead_time*1e-3,
  184. overhead_ratio);
  185. @}
  186. @end smallexample
  187. @end cartouche
  188. @node Partitioning Data
  189. @section Partitioning Data
  190. An existing piece of data can be partitioned in sub parts to be used by different tasks, for instance:
  191. @cartouche
  192. @smallexample
  193. int vector[NX];
  194. starpu_data_handle_t handle;
  195. /* Declare data to StarPU */
  196. starpu_vector_data_register(&handle, 0, (uintptr_t)vector,
  197. NX, sizeof(vector[0]));
  198. /* Partition the vector in PARTS sub-vectors */
  199. starpu_filter f =
  200. @{
  201. .filter_func = starpu_block_filter_func_vector,
  202. .nchildren = PARTS
  203. @};
  204. starpu_data_partition(handle, &f);
  205. @end smallexample
  206. @end cartouche
  207. The task submission then uses @code{starpu_data_get_sub_data} to retrieve the
  208. sub-handles to be passed as tasks parameters.
  209. @cartouche
  210. @smallexample
  211. /* Submit a task on each sub-vector */
  212. for (i=0; i<starpu_data_get_nb_children(handle); i++) @{
  213. /* Get subdata number i (there is only 1 dimension) */
  214. starpu_data_handle_t sub_handle = starpu_data_get_sub_data(handle, 1, i);
  215. struct starpu_task *task = starpu_task_create();
  216. task->handles[0] = sub_handle;
  217. task->cl = &cl;
  218. task->synchronous = 1;
  219. task->cl_arg = &factor;
  220. task->cl_arg_size = sizeof(factor);
  221. starpu_task_submit(task);
  222. @}
  223. @end smallexample
  224. @end cartouche
  225. Partitioning can be applied several times, see
  226. @code{examples/basic_examples/mult.c} and @code{examples/filters/}.
  227. Wherever the whole piece of data is already available, the partitioning will
  228. be done in-place, i.e. without allocating new buffers but just using pointers
  229. inside the existing copy. This is particularly important to be aware of when
  230. using OpenCL, where the kernel parameters are not pointers, but handles. The
  231. kernel thus needs to be also passed the offset within the OpenCL buffer:
  232. @cartouche
  233. @smallexample
  234. void opencl_func(void *buffers[], void *cl_arg)
  235. @{
  236. cl_mem vector = (cl_mem) STARPU_VECTOR_GET_DEV_HANDLE(buffers[0]);
  237. unsigned offset = STARPU_BLOCK_GET_OFFSET(buffers[0]);
  238. ...
  239. clSetKernelArg(kernel, 0, sizeof(vector), &vector);
  240. clSetKernelArg(kernel, 1, sizeof(offset), &offset);
  241. ...
  242. @}
  243. @end smallexample
  244. @end cartouche
  245. And the kernel has to shift from the pointer passed by the OpenCL driver:
  246. @cartouche
  247. @smallexample
  248. __kernel void opencl_kernel(__global int *vector, unsigned offset)
  249. @{
  250. block = (__global void *)block + offset;
  251. ...
  252. @}
  253. @end smallexample
  254. @end cartouche
  255. StarPU provides various interfaces and filters for matrices, vectors, etc.,
  256. but applications can also write their own data interfaces and filters, see
  257. @code{examples/interface} and @code{examples/filters/custom_mf} for an example.
  258. @node Performance model example
  259. @section Performance model example
  260. To achieve good scheduling, StarPU scheduling policies need to be able to
  261. estimate in advance the duration of a task. This is done by giving to codelets
  262. a performance model, by defining a @code{starpu_perfmodel} structure and
  263. providing its address in the @code{model} field of the @code{struct starpu_codelet}
  264. structure. The @code{symbol} and @code{type} fields of @code{starpu_perfmodel}
  265. are mandatory, to give a name to the model, and the type of the model, since
  266. there are several kinds of performance models. For compatibility, make sure to
  267. initialize the whole structure to zero, either by using explicit memset, or by
  268. letting the compiler implicitly do it as exemplified below.
  269. @itemize
  270. @item
  271. Measured at runtime (@code{STARPU_HISTORY_BASED} model type). This assumes that for a
  272. given set of data input/output sizes, the performance will always be about the
  273. same. This is very true for regular kernels on GPUs for instance (<0.1% error),
  274. and just a bit less true on CPUs (~=1% error). This also assumes that there are
  275. few different sets of data input/output sizes. StarPU will then keep record of
  276. the average time of previous executions on the various processing units, and use
  277. it as an estimation. History is done per task size, by using a hash of the input
  278. and output sizes as an index.
  279. It will also save it in @code{$HOME/.starpu/sampling/codelets}
  280. for further executions (@code{$USERPROFILE/.starpu/sampling/codelets} in windows
  281. environments), and can be observed by using the
  282. @code{starpu_perfmodel_display} command, or drawn by using
  283. the @code{starpu_perfmodel_plot} (@pxref{Performance model calibration}). The
  284. models are indexed by machine name. To
  285. share the models between machines (e.g. for a homogeneous cluster), use
  286. @code{export STARPU_HOSTNAME=some_global_name}. Measurements are only done
  287. when using a task scheduler which makes use of it, such as @code{heft} or
  288. @code{dmda}. Measurements can also be provided explicitly by the application, by
  289. using the @code{starpu_perfmodel_update_history} function.
  290. The following is a small code example.
  291. If e.g. the code is recompiled with other compilation options, or several
  292. variants of the code are used, the symbol string should be changed to reflect
  293. that, in order to recalibrate a new model from zero. The symbol string can even
  294. be constructed dynamically at execution time, as long as this is done before
  295. submitting any task using it.
  296. @cartouche
  297. @smallexample
  298. static struct starpu_perfmodel mult_perf_model = @{
  299. .type = STARPU_HISTORY_BASED,
  300. .symbol = "mult_perf_model"
  301. @};
  302. struct starpu_codelet cl = @{
  303. .where = STARPU_CPU,
  304. .cpu_funcs = @{ cpu_mult, NULL @},
  305. .nbuffers = 3,
  306. .modes = @{ STARPU_R, STARPU_R, STARPU_W @},
  307. /* for the scheduling policy to be able to use performance models */
  308. .model = &mult_perf_model
  309. @};
  310. @end smallexample
  311. @end cartouche
  312. @item
  313. Measured at runtime and refined by regression (@code{STARPU_*REGRESSION_BASED}
  314. model type). This still assumes performance regularity, but works
  315. with various data input sizes, by applying regression over observed
  316. execution times. STARPU_REGRESSION_BASED uses an a*n^b regression
  317. form, STARPU_NL_REGRESSION_BASED uses an a*n^b+c (more precise than
  318. STARPU_REGRESSION_BASED, but costs a lot more to compute).
  319. For instance,
  320. @code{tests/perfmodels/regression_based.c} uses a regression-based performance
  321. model for the @code{memset} operation.
  322. Of course, the application has to issue
  323. tasks with varying size so that the regression can be computed. StarPU will not
  324. trust the regression unless there is at least 10% difference between the minimum
  325. and maximum observed input size. It can be useful to set the
  326. @code{STARPU_CALIBRATE} environment variable to @code{1} and run the application
  327. on varying input sizes with @code{STARPU_SCHED} set to @code{eager} scheduler,
  328. so as to feed the performance model for a variety of
  329. inputs. The application can also provide the measurements explicitly by using
  330. @code{starpu_perfmodel_update_history}. The @code{starpu_perfmodel_display} and
  331. @code{starpu_perfmodel_plot}
  332. tools can be used to observe how much the performance model is calibrated (@pxref{Performance model calibration}); when
  333. their output looks good, @code{STARPU_CALIBRATE} can be reset to @code{0} to let
  334. StarPU use the resulting performance model without recording new measures, and
  335. @code{STARPU_SCHED} can be set to @code{heft} to benefit from the performance models. If
  336. the data input sizes vary a lot, it is really important to set
  337. @code{STARPU_CALIBRATE} to @code{0}, otherwise StarPU will continue adding the
  338. measures, and result in a very big performance model, which will take a
  339. lot of time to load and save.
  340. For non-linear regression, since computing it
  341. is quite expensive, it is only done at termination of the application. This
  342. means that the first execution of the application will use only history-based
  343. performance model to perform scheduling, without using regression.
  344. @item
  345. Provided as an estimation from the application itself (@code{STARPU_COMMON} model type and @code{cost_function} field),
  346. see for instance
  347. @code{examples/common/blas_model.h} and @code{examples/common/blas_model.c}.
  348. @item
  349. Provided explicitly by the application (@code{STARPU_PER_ARCH} model type): the
  350. @code{.per_arch[arch][nimpl].cost_function} fields have to be filled with pointers to
  351. functions which return the expected duration of the task in micro-seconds, one
  352. per architecture.
  353. @end itemize
  354. For the @code{STARPU_HISTORY_BASED} and @code{STARPU_*REGRESSION_BASED} model types,
  355. the total size of task data (both input and output) is used as an index by
  356. default. The @code{size_base} field of @code{struct starpu_perfmodel} however
  357. permits the application to override that, when for instance some of the data
  358. do not matter for task cost (e.g. mere reference table), or when using sparse
  359. structures (in which case it is the number of non-zeros which matter), or when
  360. there is some hidden parameter such as the number of iterations, etc.
  361. How to use schedulers which can benefit from such performance model is explained
  362. in @ref{Task scheduling policy}.
  363. The same can be done for task power consumption estimation, by setting the
  364. @code{power_model} field the same way as the @code{model} field. Note: for
  365. now, the application has to give to the power consumption performance model
  366. a name which is different from the execution time performance model.
  367. The application can request time estimations from the StarPU performance
  368. models by filling a task structure as usual without actually submitting
  369. it. The data handles can be created by calling @code{starpu_data_register}
  370. functions with a @code{NULL} pointer (and need to be unregistered as usual)
  371. and the desired data sizes. The @code{starpu_task_expected_length} and
  372. @code{starpu_task_expected_power} functions can then be called to get an
  373. estimation of the task duration on a given arch. @code{starpu_task_destroy}
  374. needs to be called to destroy the dummy task afterwards. See
  375. @code{tests/perfmodels/regression_based.c} for an example.
  376. @node Theoretical lower bound on execution time
  377. @section Theoretical lower bound on execution time
  378. For kernels with history-based performance models (and provided that they are completely calibrated), StarPU can very easily provide a theoretical lower
  379. bound for the execution time of a whole set of tasks. See for
  380. instance @code{examples/lu/lu_example.c}: before submitting tasks,
  381. call @code{starpu_bound_start}, and after complete execution, call
  382. @code{starpu_bound_stop}. @code{starpu_bound_print_lp} or
  383. @code{starpu_bound_print_mps} can then be used to output a Linear Programming
  384. problem corresponding to the schedule of your tasks. Run it through
  385. @code{lp_solve} or any other linear programming solver, and that will give you a
  386. lower bound for the total execution time of your tasks. If StarPU was compiled
  387. with the glpk library installed, @code{starpu_bound_compute} can be used to
  388. solve it immediately and get the optimized minimum, in ms. Its @code{integer}
  389. parameter allows to decide whether integer resolution should be computed
  390. and returned too.
  391. The @code{deps} parameter tells StarPU whether to take tasks and implicit data
  392. dependencies into account. It must be understood that the linear programming
  393. problem size is quadratic with the number of tasks and thus the time to solve it
  394. will be very long, it could be minutes for just a few dozen tasks. You should
  395. probably use @code{lp_solve -timeout 1 test.pl -wmps test.mps} to convert the
  396. problem to MPS format and then use a better solver, @code{glpsol} might be
  397. better than @code{lp_solve} for instance (the @code{--pcost} option may be
  398. useful), but sometimes doesn't manage to converge. @code{cbc} might look
  399. slower, but it is parallel. Be sure to try at least all the @code{-B} options
  400. of @code{lp_solve}. For instance, we often just use
  401. @code{lp_solve -cc -B1 -Bb -Bg -Bp -Bf -Br -BG -Bd -Bs -BB -Bo -Bc -Bi} , and
  402. the @code{-gr} option can also be quite useful.
  403. Setting @code{deps} to 0 will only take into account the actual computations
  404. on processing units. It however still properly takes into account the varying
  405. performances of kernels and processing units, which is quite more accurate than
  406. just comparing StarPU performances with the fastest of the kernels being used.
  407. The @code{prio} parameter tells StarPU whether to simulate taking into account
  408. the priorities as the StarPU scheduler would, i.e. schedule prioritized
  409. tasks before less prioritized tasks, to check to what extent this results
  410. in a less optimal solution. This increases even more computation time.
  411. Note that for simplicity, all this however doesn't take into account data
  412. transfers, which are assumed to be completely overlapped.
  413. @node Insert Task Utility
  414. @section Insert Task Utility
  415. StarPU provides the wrapper function @code{starpu_insert_task} to ease
  416. the creation and submission of tasks.
  417. @deftypefun int starpu_insert_task (struct starpu_codelet *@var{cl}, ...)
  418. Create and submit a task corresponding to @var{cl} with the following
  419. arguments. The argument list must be zero-terminated.
  420. The arguments following the codelets can be of the following types:
  421. @itemize
  422. @item
  423. @code{STARPU_R}, @code{STARPU_W}, @code{STARPU_RW}, @code{STARPU_SCRATCH}, @code{STARPU_REDUX} an access mode followed by a data handle;
  424. @item
  425. @code{STARPU_DATA_ARRAY} followed by an array of data handles and its number of elements;
  426. @item
  427. the specific values @code{STARPU_VALUE}, @code{STARPU_CALLBACK},
  428. @code{STARPU_CALLBACK_ARG}, @code{STARPU_CALLBACK_WITH_ARG},
  429. @code{STARPU_PRIORITY}, @code{STARPU_TAG}, followed by the appropriate objects
  430. as defined below.
  431. @end itemize
  432. When using @code{STARPU_DATA_ARRAY}, the access mode of the data
  433. handles is not defined.
  434. Parameters to be passed to the codelet implementation are defined
  435. through the type @code{STARPU_VALUE}. The function
  436. @code{starpu_codelet_unpack_args} must be called within the codelet
  437. implementation to retrieve them.
  438. @end deftypefun
  439. @defmac STARPU_VALUE
  440. this macro is used when calling @code{starpu_insert_task}, and must be
  441. followed by a pointer to a constant value and the size of the constant
  442. @end defmac
  443. @defmac STARPU_CALLBACK
  444. this macro is used when calling @code{starpu_insert_task}, and must be
  445. followed by a pointer to a callback function
  446. @end defmac
  447. @defmac STARPU_CALLBACK_ARG
  448. this macro is used when calling @code{starpu_insert_task}, and must be
  449. followed by a pointer to be given as an argument to the callback
  450. function
  451. @end defmac
  452. @defmac STARPU_CALLBACK_WITH_ARG
  453. this macro is used when calling @code{starpu_insert_task}, and must be
  454. followed by two pointers: one to a callback function, and the other to
  455. be given as an argument to the callback function; this is equivalent
  456. to using both @code{STARPU_CALLBACK} and
  457. @code{STARPU_CALLBACK_ARG}
  458. @end defmac
  459. @defmac STARPU_PRIORITY
  460. this macro is used when calling @code{starpu_insert_task}, and must be
  461. followed by an integer defining a priority level
  462. @end defmac
  463. @defmac STARPU_TAG
  464. this macro is used when calling @code{starpu_insert_task}, and must be
  465. followed by a tag.
  466. @end defmac
  467. @deftypefun void starpu_codelet_pack_args ({char **}@var{arg_buffer}, {size_t *}@var{arg_buffer_size}, ...)
  468. Pack arguments of type @code{STARPU_VALUE} into a buffer which can be
  469. given to a codelet and later unpacked with the function
  470. @code{starpu_codelet_unpack_args} defined below.
  471. @end deftypefun
  472. @deftypefun void starpu_codelet_unpack_args ({void *}@var{cl_arg}, ...)
  473. Retrieve the arguments of type @code{STARPU_VALUE} associated to a
  474. task automatically created using the function
  475. @code{starpu_insert_task} defined above.
  476. @end deftypefun
  477. Here is the implementation of the codelet:
  478. @smallexample
  479. void func_cpu(void *descr[], void *_args)
  480. @{
  481. int *x0 = (int *)STARPU_VARIABLE_GET_PTR(descr[0]);
  482. float *x1 = (float *)STARPU_VARIABLE_GET_PTR(descr[1]);
  483. int ifactor;
  484. float ffactor;
  485. starpu_codelet_unpack_args(_args, &ifactor, &ffactor);
  486. *x0 = *x0 * ifactor;
  487. *x1 = *x1 * ffactor;
  488. @}
  489. struct starpu_codelet mycodelet = @{
  490. .where = STARPU_CPU,
  491. .cpu_funcs = @{ func_cpu, NULL @},
  492. .nbuffers = 2,
  493. .modes = @{ STARPU_RW, STARPU_RW @}
  494. @};
  495. @end smallexample
  496. And the call to the @code{starpu_insert_task} wrapper:
  497. @smallexample
  498. starpu_insert_task(&mycodelet,
  499. STARPU_VALUE, &ifactor, sizeof(ifactor),
  500. STARPU_VALUE, &ffactor, sizeof(ffactor),
  501. STARPU_RW, data_handles[0], STARPU_RW, data_handles[1],
  502. 0);
  503. @end smallexample
  504. The call to @code{starpu_insert_task} is equivalent to the following
  505. code:
  506. @smallexample
  507. struct starpu_task *task = starpu_task_create();
  508. task->cl = &mycodelet;
  509. task->handles[0] = data_handles[0];
  510. task->handles[1] = data_handles[1];
  511. char *arg_buffer;
  512. size_t arg_buffer_size;
  513. starpu_codelet_pack_args(&arg_buffer, &arg_buffer_size,
  514. STARPU_VALUE, &ifactor, sizeof(ifactor),
  515. STARPU_VALUE, &ffactor, sizeof(ffactor),
  516. 0);
  517. task->cl_arg = arg_buffer;
  518. task->cl_arg_size = arg_buffer_size;
  519. int ret = starpu_task_submit(task);
  520. @end smallexample
  521. Here is a similar call using @code{STARPU_DATA_ARRAY}.
  522. @smallexample
  523. starpu_insert_task(&mycodelet,
  524. STARPU_DATA_ARRAY, data_handles, 2,
  525. STARPU_VALUE, &ifactor, sizeof(ifactor),
  526. STARPU_VALUE, &ffactor, sizeof(ffactor),
  527. 0);
  528. @end smallexample
  529. If some part of the task insertion depends on the value of some computation,
  530. the @code{STARPU_DATA_ACQUIRE_CB} macro can be very convenient. For
  531. instance, assuming that the index variable @code{i} was registered as handle
  532. @code{i_handle}:
  533. @smallexample
  534. /* Compute which portion we will work on, e.g. pivot */
  535. starpu_insert_task(&which_index, STARPU_W, i_handle, 0);
  536. /* And submit the corresponding task */
  537. STARPU_DATA_ACQUIRE_CB(i_handle, STARPU_R,
  538. starpu_insert_task(&work, STARPU_RW, A_handle[i], 0));
  539. @end smallexample
  540. The @code{STARPU_DATA_ACQUIRE_CB} macro submits an asynchronous request for
  541. acquiring data @code{i} for the main application, and will execute the code
  542. given as third parameter when it is acquired. In other words, as soon as the
  543. value of @code{i} computed by the @code{which_index} codelet can be read, the
  544. portion of code passed as third parameter of @code{STARPU_DATA_ACQUIRE_CB} will
  545. be executed, and is allowed to read from @code{i} to use it e.g. as an
  546. index. Note that this macro is only available when compiling StarPU with
  547. the compiler @code{gcc}.
  548. @node Data reduction
  549. @section Data reduction
  550. In various cases, some piece of data is used to accumulate intermediate
  551. results. For instance, the dot product of a vector, maximum/minimum finding,
  552. the histogram of a photograph, etc. When these results are produced across the
  553. whole machine, it would not be efficient to accumulate them in only one place,
  554. incurring data transmission each time and access concurrency.
  555. StarPU provides a @code{STARPU_REDUX} mode, which makes it possible to optimize
  556. that case: it will allocate a buffer on each memory node, and accumulate
  557. intermediate results there. When the data is eventually accessed in the normal
  558. @code{STARPU_R} mode, StarPU will collect the intermediate results in just one
  559. buffer.
  560. For this to work, the user has to call
  561. @code{starpu_data_set_reduction_methods} to declare how to initialize these
  562. buffers, and how to assemble partial results.
  563. For instance, @code{cg} uses that to optimize its dot product: it first defines
  564. the codelets for initialization and reduction:
  565. @smallexample
  566. struct starpu_codelet bzero_variable_cl =
  567. @{
  568. .cpu_funcs = @{ bzero_variable_cpu, NULL @},
  569. .cuda_funcs = @{ bzero_variable_cuda, NULL @},
  570. .nbuffers = 1,
  571. @};
  572. static void accumulate_variable_cpu(void *descr[], void *cl_arg)
  573. @{
  574. double *v_dst = (double *)STARPU_VARIABLE_GET_PTR(descr[0]);
  575. double *v_src = (double *)STARPU_VARIABLE_GET_PTR(descr[1]);
  576. *v_dst = *v_dst + *v_src;
  577. @}
  578. static void accumulate_variable_cuda(void *descr[], void *cl_arg)
  579. @{
  580. double *v_dst = (double *)STARPU_VARIABLE_GET_PTR(descr[0]);
  581. double *v_src = (double *)STARPU_VARIABLE_GET_PTR(descr[1]);
  582. cublasaxpy(1, (double)1.0, v_src, 1, v_dst, 1);
  583. cudaStreamSynchronize(starpu_cuda_get_local_stream());
  584. @}
  585. struct starpu_codelet accumulate_variable_cl =
  586. @{
  587. .cpu_funcs = @{ accumulate_variable_cpu, NULL @},
  588. .cuda_funcs = @{ accumulate_variable_cuda, NULL @},
  589. .nbuffers = 2,
  590. @};
  591. @end smallexample
  592. and attaches them as reduction methods for its dtq handle:
  593. @smallexample
  594. starpu_data_set_reduction_methods(dtq_handle,
  595. &accumulate_variable_cl, &bzero_variable_cl);
  596. @end smallexample
  597. and @code{dtq_handle} can now be used in @code{STARPU_REDUX} mode for the dot products
  598. with partitioned vectors:
  599. @smallexample
  600. int dots(starpu_data_handle_t v1, starpu_data_handle_t v2,
  601. starpu_data_handle_t s, unsigned nblocks)
  602. @{
  603. starpu_insert_task(&bzero_variable_cl, STARPU_W, s, 0);
  604. for (b = 0; b < nblocks; b++)
  605. starpu_insert_task(&dot_kernel_cl,
  606. STARPU_REDUX, s,
  607. STARPU_R, starpu_data_get_sub_data(v1, 1, b),
  608. STARPU_R, starpu_data_get_sub_data(v2, 1, b),
  609. 0);
  610. @}
  611. @end smallexample
  612. The @code{cg} example also uses reduction for the blocked gemv kernel, leading
  613. to yet more relaxed dependencies and more parallelism.
  614. STARPU_REDUX can also be passed to @code{starpu_mpi_insert_task} in the MPI
  615. case. That will however not produce any MPI communication, but just pass
  616. STARPU_REDUX to the underlying @code{starpu_insert_task}. It is up to the
  617. application to call @code{starpu_mpi_redux_data}, which posts tasks that will
  618. reduce the partial results among MPI nodes into the MPI node which owns the
  619. data. For instance, some hypothetical application which collects partial results
  620. into data @code{res}, then uses it for other computation, before looping again
  621. with a new reduction:
  622. @smallexample
  623. @{
  624. for (i = 0; i < 100; i++) @{
  625. starpu_mpi_insert_task(MPI_COMM_WORLD, &init_res, STARPU_W, res, 0);
  626. starpu_mpi_insert_task(MPI_COMM_WORLD, &work, STARPU_RW, A, STARPU_R, B, STARPU_REDUX, res, 0);
  627. starpu_mpi_redux_data(MPI_COMM_WORLD, res);
  628. starpu_mpi_insert_task(MPI_COMM_WORLD, &work2, STARPU_RW, B, STARPU_R, res, 0);
  629. @}
  630. @}
  631. @end smallexample
  632. @node Temporary buffers
  633. @section Temporary buffers
  634. There are two kinds of temporary buffers: temporary data which just pass results
  635. from a task to another, and scratch data which are needed only internally by
  636. tasks.
  637. @subsection Temporary data
  638. Data can sometimes be entirely produced by a task, and entirely consumed by
  639. another task, without the need for other parts of the application to access
  640. it. In such case, registration can be done without prior allocation, by using
  641. the special -1 memory node number, and passing a zero pointer. StarPU will
  642. actually allocate memory only when the task creating the content gets scheduled,
  643. and destroy it on unregistration.
  644. In addition to that, it can be tedious for the application to have to unregister
  645. the data, since it will not use its content anyway. The unregistration can be
  646. done lazily by using the @code{starpu_data_unregister_submit(handle)} function,
  647. which will record that no more tasks accessing the handle will be submitted, so
  648. that it can be freed as soon as the last task accessing it is over.
  649. The following code exemplifies both points: it registers the temporary
  650. data, submits three tasks accessing it, and records the data for automatic
  651. unregistration.
  652. @smallexample
  653. starpu_vector_data_register(&handle, -1, 0, n, sizeof(float));
  654. starpu_insert_task(&produce_data, STARPU_W, handle, 0);
  655. starpu_insert_task(&compute_data, STARPU_RW, handle, 0);
  656. starpu_insert_task(&summarize_data, STARPU_R, handle, STARPU_W, result_handle, 0);
  657. starpu_data_unregister_submit(handle);
  658. @end smallexample
  659. @subsection Scratch data
  660. Some kernels sometimes need temporary data to achieve the computations, i.e. a
  661. workspace. The application could allocate it at the start of the codelet
  662. function, and free it at the end, but that would be costly. It could also
  663. allocate one buffer per worker (similarly to @ref{Per-worker library
  664. initialization}), but that would make them systematic and permanent. A more
  665. optimized way is to use the SCRATCH data access mode, as exemplified below,
  666. which provides per-worker buffers without content consistency.
  667. @smallexample
  668. starpu_vector_data_register(&workspace, -1, 0, sizeof(float));
  669. for (i = 0; i < N; i++)
  670. starpu_insert_task(&compute, STARPU_R, input[i],
  671. STARPU_SCRATCH, workspace, STARPU_W, output[i], 0);
  672. @end smallexample
  673. StarPU will make sure that the buffer is allocated before executing the task,
  674. and make this allocation per-worker: for CPU workers, notably, each worker has
  675. its own buffer. This means that each task submitted above will actually have its
  676. own workspace, which will actually be the same for all tasks running one after
  677. the other on the same worker. Also, if for instance GPU memory becomes scarce,
  678. StarPU will notice that it can free such buffers easily, since the content does
  679. not matter.
  680. @node Parallel Tasks
  681. @section Parallel Tasks
  682. StarPU can leverage existing parallel computation libraries by the means of
  683. parallel tasks. A parallel task is a task which gets worked on by a set of CPUs
  684. (called a parallel or combined worker) at the same time, by using an existing
  685. parallel CPU implementation of the computation to be achieved. This can also be
  686. useful to improve the load balance between slow CPUs and fast GPUs: since CPUs
  687. work collectively on a single task, the completion time of tasks on CPUs becomes
  688. comparable to the completion time on GPUs, thus relieving from granularity
  689. discrepancy concerns. Hwloc support needs to be enabled to get good performance,
  690. otherwise StarPU will not know how to better group cores.
  691. Two modes of execution exist to accommodate existing usages.
  692. @subsection Fork-mode parallel tasks
  693. In the Fork mode, StarPU will call the codelet function on one
  694. of the CPUs of the combined worker. The codelet function can use
  695. @code{starpu_combined_worker_get_size()} to get the number of threads it is
  696. allowed to start to achieve the computation. The CPU binding mask for the whole
  697. set of CPUs is already enforced, so that threads created by the function will
  698. inherit the mask, and thus execute where StarPU expected, the OS being in charge
  699. of choosing how to schedule threads on the corresponding CPUs. The application
  700. can also choose to bind threads by hand, using e.g. sched_getaffinity to know
  701. the CPU binding mask that StarPU chose.
  702. For instance, using OpenMP (full source is available in
  703. @code{examples/openmp/vector_scal.c}):
  704. @example
  705. void scal_cpu_func(void *buffers[], void *_args)
  706. @{
  707. unsigned i;
  708. float *factor = _args;
  709. struct starpu_vector_interface *vector = buffers[0];
  710. unsigned n = STARPU_VECTOR_GET_NX(vector);
  711. float *val = (float *)STARPU_VECTOR_GET_PTR(vector);
  712. #pragma omp parallel for num_threads(starpu_combined_worker_get_size())
  713. for (i = 0; i < n; i++)
  714. val[i] *= *factor;
  715. @}
  716. static struct starpu_codelet cl =
  717. @{
  718. .modes = @{ STARPU_RW @},
  719. .where = STARPU_CPU,
  720. .type = STARPU_FORKJOIN,
  721. .max_parallelism = INT_MAX,
  722. .cpu_funcs = @{scal_cpu_func, NULL@},
  723. .nbuffers = 1,
  724. @};
  725. @end example
  726. Other examples include for instance calling a BLAS parallel CPU implementation
  727. (see @code{examples/mult/xgemm.c}).
  728. @subsection SPMD-mode parallel tasks
  729. In the SPMD mode, StarPU will call the codelet function on
  730. each CPU of the combined worker. The codelet function can use
  731. @code{starpu_combined_worker_get_size()} to get the total number of CPUs
  732. involved in the combined worker, and thus the number of calls that are made in
  733. parallel to the function, and @code{starpu_combined_worker_get_rank()} to get
  734. the rank of the current CPU within the combined worker. For instance:
  735. @example
  736. static void func(void *buffers[], void *_args)
  737. @{
  738. unsigned i;
  739. float *factor = _args;
  740. struct starpu_vector_interface *vector = buffers[0];
  741. unsigned n = STARPU_VECTOR_GET_NX(vector);
  742. float *val = (float *)STARPU_VECTOR_GET_PTR(vector);
  743. /* Compute slice to compute */
  744. unsigned m = starpu_combined_worker_get_size();
  745. unsigned j = starpu_combined_worker_get_rank();
  746. unsigned slice = (n+m-1)/m;
  747. for (i = j * slice; i < (j+1) * slice && i < n; i++)
  748. val[i] *= *factor;
  749. @}
  750. static struct starpu_codelet cl =
  751. @{
  752. .modes = @{ STARPU_RW @},
  753. .where = STARPU_CPU,
  754. .type = STARPU_SPMD,
  755. .max_parallelism = INT_MAX,
  756. .cpu_funcs = @{ func, NULL @},
  757. .nbuffers = 1,
  758. @};
  759. @end example
  760. Of course, this trivial example will not really benefit from parallel task
  761. execution, and was only meant to be simple to understand. The benefit comes
  762. when the computation to be done is such that threads have to e.g. exchange
  763. intermediate results, or write to the data in a complex but safe way in the same
  764. buffer.
  765. @subsection Parallel tasks performance
  766. To benefit from parallel tasks, a parallel-task-aware StarPU scheduler has to
  767. be used. When exposed to codelets with a Fork or SPMD flag, the @code{pheft}
  768. (parallel-heft) and @code{pgreedy} (parallel greedy) schedulers will indeed also
  769. try to execute tasks with several CPUs. They will automatically try the various
  770. available combined worker sizes and thus be able to avoid choosing a large
  771. combined worker if the codelet does not actually scale so much.
  772. @subsection Combined workers
  773. By default, StarPU creates combined workers according to the architecture
  774. structure as detected by hwloc. It means that for each object of the hwloc
  775. topology (NUMA node, socket, cache, ...) a combined worker will be created. If
  776. some nodes of the hierarchy have a big arity (e.g. many cores in a socket
  777. without a hierarchy of shared caches), StarPU will create combined workers of
  778. intermediate sizes. The @code{STARPU_SYNTHESIZE_ARITY_COMBINED_WORKER} variable
  779. permits to tune the maximum arity between levels of combined workers.
  780. The combined workers actually produced can be seen in the output of the
  781. @code{starpu_machine_display} tool (the @code{STARPU_SCHED} environment variable
  782. has to be set to a combined worker-aware scheduler such as @code{pheft} or
  783. @code{pgreedy}).
  784. @subsection Concurrent parallel tasks
  785. Unfortunately, many environments and libraries do not support concurrent
  786. calls.
  787. For instance, most OpenMP implementations (including the main ones) do not
  788. support concurrent @code{pragma omp parallel} statements without nesting them in
  789. another @code{pragma omp parallel} statement, but StarPU does not yet support
  790. creating its CPU workers by using such pragma.
  791. Other parallel libraries are also not safe when being invoked concurrently
  792. from different threads, due to the use of global variables in their sequential
  793. sections for instance.
  794. The solution is then to use only one combined worker at a time. This can be
  795. done by setting @code{single_combined_worker} to 1 in the @code{starpu_conf}
  796. structure, or setting the @code{STARPU_SINGLE_COMBINED_WORKER} environment
  797. variable to 1. StarPU will then run only one parallel task at a time (but other
  798. CPU and GPU tasks are not affected and can be run concurrently). The parallel
  799. task scheduler will however still try varying combined worker
  800. sizes to look for the most efficient ones.
  801. @node Debugging
  802. @section Debugging
  803. StarPU provides several tools to help debug applications. Execution traces
  804. can be generated and displayed graphically, see @ref{Generating traces}. Some
  805. gdb helpers are also provided to show the whole StarPU state:
  806. @smallexample
  807. (gdb) source tools/gdbinit
  808. (gdb) help starpu
  809. @end smallexample
  810. @node The multiformat interface
  811. @section The multiformat interface
  812. It may be interesting to represent the same piece of data using two different
  813. data structures: one that would only be used on CPUs, and one that would only
  814. be used on GPUs. This can be done by using the multiformat interface. StarPU
  815. will be able to convert data from one data structure to the other when needed.
  816. Note that the heft scheduler is the only one optimized for this interface. The
  817. user must provide StarPU with conversion codelets:
  818. @cartouche
  819. @smallexample
  820. #define NX 1024
  821. struct point array_of_structs[NX];
  822. starpu_data_handle_t handle;
  823. /*
  824. * The conversion of a piece of data is itself a task, though it is created,
  825. * submitted and destroyed by StarPU internals and not by the user. Therefore,
  826. * we have to define two codelets.
  827. * Note that for now the conversion from the CPU format to the GPU format has to
  828. * be executed on the GPU, and the conversion from the GPU to the CPU has to be
  829. * executed on the CPU.
  830. */
  831. #ifdef STARPU_USE_OPENCL
  832. void cpu_to_opencl_opencl_func(void *buffers[], void *args);
  833. struct starpu_codelet cpu_to_opencl_cl = @{
  834. .where = STARPU_OPENCL,
  835. .opencl_funcs = @{ cpu_to_opencl_opencl_func, NULL @},
  836. .nbuffers = 1,
  837. .modes = @{ STARPU_RW @}
  838. @};
  839. void opencl_to_cpu_func(void *buffers[], void *args);
  840. struct starpu_codelet opencl_to_cpu_cl = @{
  841. .where = STARPU_CPU,
  842. .cpu_funcs = @{ opencl_to_cpu_func, NULL @},
  843. .nbuffers = 1,
  844. .modes = @{ STARPU_RW @}
  845. @};
  846. #endif
  847. struct starpu_multiformat_data_interface_ops format_ops = @{
  848. #ifdef STARPU_USE_OPENCL
  849. .opencl_elemsize = 2 * sizeof(float),
  850. .cpu_to_opencl_cl = &cpu_to_opencl_cl,
  851. .opencl_to_cpu_cl = &opencl_to_cpu_cl,
  852. #endif
  853. .cpu_elemsize = 2 * sizeof(float),
  854. ...
  855. @};
  856. starpu_multiformat_data_register(&handle, 0, &array_of_structs, NX, &format_ops);
  857. @end smallexample
  858. @end cartouche
  859. Kernels can be written almost as for any other interface. Note that
  860. STARPU_MULTIFORMAT_GET_CPU_PTR shall only be used for CPU kernels. CUDA kernels
  861. must use STARPU_MULTIFORMAT_GET_CUDA_PTR, and OpenCL kernels must use
  862. STARPU_MULTIFORMAT_GET_OPENCL_PTR. STARPU_MULTIFORMAT_GET_NX may be used in any
  863. kind of kernel.
  864. @cartouche
  865. @smallexample
  866. static void
  867. multiformat_scal_cpu_func(void *buffers[], void *args)
  868. @{
  869. struct point *aos;
  870. unsigned int n;
  871. aos = STARPU_MULTIFORMAT_GET_CPU_PTR(buffers[0]);
  872. n = STARPU_MULTIFORMAT_GET_NX(buffers[0]);
  873. ...
  874. @}
  875. extern "C" void multiformat_scal_cuda_func(void *buffers[], void *_args)
  876. @{
  877. unsigned int n;
  878. struct struct_of_arrays *soa;
  879. soa = (struct struct_of_arrays *) STARPU_MULTIFORMAT_GET_CUDA_PTR(buffers[0]);
  880. n = STARPU_MULTIFORMAT_GET_NX(buffers[0]);
  881. ...
  882. @}
  883. @end smallexample
  884. @end cartouche
  885. A full example may be found in @code{examples/basic_examples/multiformat.c}.
  886. @node On-GPU rendering
  887. @section On-GPU rendering
  888. Graphical-oriented applications need to draw the result of their computations,
  889. typically on the very GPU where these happened. Technologies such as OpenGL/CUDA
  890. interoperability permit to let CUDA directly work on the OpenGL buffers, making
  891. them thus immediately ready for drawing, by mapping OpenGL buffer, textures or
  892. renderbuffer objects into CUDA. CUDA however imposes some technical
  893. constraints: peer memcpy has to be disabled, and the thread that runs OpenGL has
  894. to be the one that runs CUDA computations for that GPU.
  895. To achieve this with StarPU, pass the @code{--disable-cuda-memcpy-peer} option
  896. to @code{./configure} (TODO: make it dynamic). OpenGL/GLUT has to be initialized
  897. first, and the interoperability mode has to
  898. be enabled by using the @code{cuda_opengl_interoperability} field of the
  899. @code{starpu_conf} structure, and the driver loop has to be run by
  900. the application, by using the @code{not_launched_drivers} field of
  901. @code{starpu_conf} to prevent StarPU from running it in a separate thread, and
  902. by using @code{starpu_driver_run} to run the loop. The @code{gl_interop} and
  903. @code{gl_interop_idle} examples show how this fits together in a simple case, where
  904. rendering is done in task callbacks. The former uses @code{glutMainLoopEvent}
  905. to make GLUT progress from the StarPU driver loop, while the latter uses
  906. @code{glutIdleFunc} to make StarPU progress from the GLUT main loop.
  907. Then, to use an OpenGL buffer as a CUDA data, StarPU simply needs to be given
  908. the CUDA pointer at registration, for instance:
  909. @cartouche
  910. @smallexample
  911. /* Get the CUDA worker id */
  912. for (workerid = 0; workerid < starpu_worker_get_count(); workerid++)
  913. if (starpu_worker_get_type(workerid) == STARPU_CUDA_WORKER)
  914. break;
  915. /* Build a CUDA pointer pointing at the OpenGL buffer */
  916. cudaGraphicsResourceGetMappedPointer((void**)&output, &num_bytes, resource);
  917. /* And register it to StarPU */
  918. starpu_vector_data_register(&handle, starpu_worker_get_memory_node(workerid),
  919. output, num_bytes / sizeof(float4), sizeof(float4));
  920. /* The handle can now be used as usual */
  921. starpu_insert_task(&cl, STARPU_RW, handle, 0);
  922. /* ... */
  923. /* This gets back data into the OpenGL buffer */
  924. starpu_data_unregister(handle);
  925. @end smallexample
  926. @end cartouche
  927. and display it e.g. in the callback function.
  928. @node More examples
  929. @section More examples
  930. More examples are available in the StarPU sources in the @code{examples/}
  931. directory. Simple examples include:
  932. @table @asis
  933. @item @code{incrementer/}:
  934. Trivial incrementation test.
  935. @item @code{basic_examples/}:
  936. Simple documented Hello world (as shown in @ref{Hello World}), vector/scalar product (as shown
  937. in @ref{Vector Scaling on an Hybrid CPU/GPU Machine}), matrix
  938. product examples (as shown in @ref{Performance model example}), an example using the blocked matrix data
  939. interface, an example using the variable data interface, and an example
  940. using different formats on CPUs and GPUs.
  941. @item @code{matvecmult/}:
  942. OpenCL example from NVidia, adapted to StarPU.
  943. @item @code{axpy/}:
  944. AXPY CUBLAS operation adapted to StarPU.
  945. @item @code{fortran/}:
  946. Example of Fortran bindings.
  947. @end table
  948. More advanced examples include:
  949. @table @asis
  950. @item @code{filters/}:
  951. Examples using filters, as shown in @ref{Partitioning Data}.
  952. @item @code{lu/}:
  953. LU matrix factorization, see for instance @code{xlu_implicit.c}.
  954. @item @code{cholesky/}:
  955. Cholesky matrix factorization, see for instance @code{cholesky_implicit.c}.
  956. @end table