- @c -*-texinfo-*-
 
- @c This file is part of the StarPU Handbook.
 
- @c Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
 
- @c Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
 
- @c Copyright (C) 2011 Institut National de Recherche en Informatique et Automatique
 
- @c See the file starpu.texi for copying conditions.
 
- @menu
 
- * Using multiple implementations of a codelet::
 
- * Enabling implementation according to capabilities::
 
- * Task and Worker Profiling::
 
- * Partitioning Data::
 
- * Performance model example::
 
- * Theoretical lower bound on execution time example::
 
- * Insert Task Utility::
 
- * Data reduction::
 
- * Temporary buffers::
 
- * Parallel Tasks::
 
- * Debugging::
 
- * The multiformat interface::
 
- * Using the Driver API::
 
- * Defining a New Scheduling Policy::
 
- * On-GPU rendering::
 
- * Defining a New Data Interface::
 
- * Setting the Data Handles for a Task::
 
- * More examples::               More examples shipped with StarPU
 
- @end menu
 
- @node Using multiple implementations of a codelet
 
- @section Using multiple implementations of a codelet
 
- One may want to write multiple implementations of a codelet for a single type of
 
- device and let StarPU choose which one to run. As an example, we will show how
 
- to use SSE to scale a vector. The codelet can be written as follows:
 
- @cartouche
 
- @smallexample
 
- #include <xmmintrin.h>
 
- void scal_sse_func(void *buffers[], void *cl_arg)
 
- @{
 
-     float *vector = (float *) STARPU_VECTOR_GET_PTR(buffers[0]);
 
-     unsigned int n = STARPU_VECTOR_GET_NX(buffers[0]);
 
-     unsigned int n_iterations = n/4;
 
-     if (n % 4 != 0)
 
-         n_iterations++;
 
-     __m128 *VECTOR = (__m128*) vector;
 
-     __m128 factor __attribute__((aligned(16)));
 
-     factor = _mm_set1_ps(*(float *) cl_arg);
 
-     unsigned int i;
 
-     for (i = 0; i < n_iterations; i++)
 
-         VECTOR[i] = _mm_mul_ps(factor, VECTOR[i]);
 
- @}
 
- @end smallexample
 
- @end cartouche
 
- @cartouche
 
- @smallexample
 
- struct starpu_codelet cl = @{
 
-     .where = STARPU_CPU,
 
-     .cpu_funcs = @{ scal_cpu_func, scal_sse_func, NULL @},
 
-     .cpu_funcs_name = @{ "scal_cpu_func", "scal_sse_func", NULL @},
 
-     .nbuffers = 1,
 
-     .modes = @{ STARPU_RW @}
 
- @};
 
- @end smallexample
 
- @end cartouche
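
- For reference, @code{scal_cpu_func} can be a plain C implementation along these

- lines (a minimal sketch of the non-SSE variant):

- @cartouche

- @smallexample

- void scal_cpu_func(void *buffers[], void *cl_arg)

- @{

-     float *vector = (float *) STARPU_VECTOR_GET_PTR(buffers[0]);

-     unsigned int n = STARPU_VECTOR_GET_NX(buffers[0]);

-     float factor = *(float *) cl_arg;

-     unsigned int i;

-     /* Scale each element; StarPU picks this or the SSE variant at runtime */

-     for (i = 0; i < n; i++)

-         vector[i] *= factor;

- @}

- @end smallexample

- @end cartouche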
 
- Schedulers which are multi-implementation aware (only @code{dmda} and
 
- @code{pheft} for now) will use the performance models of all the
 
- implementations they were given, and pick the one that seems to be the fastest.
 
- @node Enabling implementation according to capabilities
 
- @section Enabling implementation according to capabilities
 
- Some implementations may not run on some devices. For instance, some CUDA
 
- devices do not support double floating point precision, and thus the kernel
 
- execution would just fail; or the device may not have enough shared memory for
 
- the implementation being used. The @code{can_execute} field of the @code{struct
 
- starpu_codelet} structure makes it possible to express this. For instance:
 
- @cartouche
 
- @smallexample
 
- static int can_execute(unsigned workerid, struct starpu_task *task, unsigned nimpl)
 
- @{
 
-   const struct cudaDeviceProp *props;
 
-   if (starpu_worker_get_type(workerid) == STARPU_CPU_WORKER)
 
-     return 1;
 
-   /* Cuda device */
 
-   props = starpu_cuda_get_device_properties(workerid);
 
-   if (props->major >= 2 || (props->major == 1 && props->minor >= 3))
 
-     /* At least compute capability 1.3, supports doubles */
 
-     return 1;
 
-   /* Old card, does not support doubles */
 
-   return 0;
 
- @}
 
- struct starpu_codelet cl = @{
 
-     .where = STARPU_CPU|STARPU_CUDA,
 
-     .can_execute = can_execute,
 
-     .cpu_funcs = @{ cpu_func, NULL @},
 
-     .cpu_funcs_name = @{ "cpu_func", NULL @},
 
-     .cuda_funcs = @{ gpu_func, NULL @},
 
-     .nbuffers = 1,
 
-     .modes = @{ STARPU_RW @}
 
- @};
 
- @end smallexample
 
- @end cartouche
 
- This can be essential e.g. when running on a machine which mixes various models
 
- of CUDA devices, to benefit from the newer models without crashing on older ones.
 
- Note: the @code{can_execute} function is called by the scheduler each time it
 
- tries to match a task with a worker, and should thus be very fast. The
 
- @code{starpu_cuda_get_device_properties} function provides quick access to the

- properties of CUDA devices to achieve such efficiency.
 
- Another example is compiling CUDA code for various compute capabilities,
 
- resulting in two CUDA functions, e.g. @code{scal_gpu_13} for compute capability
 
- 1.3, and @code{scal_gpu_20} for compute capability 2.0. Both functions can be
 
- provided to StarPU by using @code{cuda_funcs}, and @code{can_execute} can then be
 
- used to rule out the @code{scal_gpu_20} variant on a CUDA device which
 
- will not be able to execute it:
 
- @cartouche
 
- @smallexample
 
- static int can_execute(unsigned workerid, struct starpu_task *task, unsigned nimpl)
 
- @{
 
-   const struct cudaDeviceProp *props;
 
-   if (starpu_worker_get_type(workerid) == STARPU_CPU_WORKER)
 
-     return 1;
 
-   /* Cuda device */
 
-   if (nimpl == 0)
 
-     /* Trying to execute the 1.3 capability variant, we assume it is ok in all cases.  */
 
-     return 1;
 
-   /* Trying to execute the 2.0 capability variant, check that the card can do it.  */
 
-   props = starpu_cuda_get_device_properties(workerid);
 
-   if (props->major >= 2)
 
-     /* At least compute capability 2.0, can run it */
 
-     return 1;
 
-   /* Old card, does not support 2.0, will not be able to execute the 2.0 variant.  */
 
-   return 0;
 
- @}
 
- struct starpu_codelet cl = @{
 
-     .where = STARPU_CPU|STARPU_CUDA,
 
-     .can_execute = can_execute,
 
-     .cpu_funcs = @{ cpu_func, NULL @},
 
-     .cpu_funcs_name = @{ "cpu_func", NULL @},
 
-     .cuda_funcs = @{ scal_gpu_13, scal_gpu_20, NULL @},
 
-     .nbuffers = 1,
 
-     .modes = @{ STARPU_RW @}
 
- @};
 
- @end smallexample
 
- @end cartouche
 
- Note: the most generic variant should be provided first, as some schedulers are
 
- not able to try the different variants.
 
- @node Task and Worker Profiling
 
- @section Task and Worker Profiling
 
- A full example showing how to use the profiling API is available in
 
- the StarPU sources in the directory @code{examples/profiling/}.
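
- Profiling has to be activated first, either by setting the

- @code{STARPU_PROFILING} environment variable to @code{1}, or by calling,

- before submitting tasks:

- @cartouche

- @smallexample

- starpu_profiling_status_set(STARPU_PROFILING_ENABLE);

- @end smallexample

- @end cartouche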
 
- @cartouche
 
- @smallexample
 
- struct starpu_task *task = starpu_task_create();
 
- task->cl = &cl;
 
- task->synchronous = 1;
 
- /* We will destroy the task structure by hand so that we can
 
-  * query the profiling info before the task is destroyed. */
 
- task->destroy = 0;
 
- /* Submit and wait for completion (since synchronous was set to 1) */
 
- starpu_task_submit(task);
 
- /* The task is finished, get profiling information */
 
- struct starpu_profiling_task_info *info = task->profiling_info;
 
- /* How much time did it take before the task started ? */
 
- double delay = starpu_timing_timespec_delay_us(&info->submit_time, &info->start_time);
 
- /* How long was the task execution ? */
 
- double length = starpu_timing_timespec_delay_us(&info->start_time, &info->end_time);
 
- /* We don't need the task structure anymore */
 
- starpu_task_destroy(task);
 
- @end smallexample
 
- @end cartouche
 
- @cartouche
 
- @smallexample
 
- /* Display the occupancy of all workers during the test */
 
- int worker;
 
- for (worker = 0; worker < starpu_worker_get_count(); worker++)
 
- @{
 
-         struct starpu_profiling_worker_info worker_info;
 
-         int ret = starpu_profiling_worker_get_info(worker, &worker_info);
 
-         STARPU_ASSERT(!ret);
 
-         double total_time = starpu_timing_timespec_to_us(&worker_info.total_time);
 
-         double executing_time = starpu_timing_timespec_to_us(&worker_info.executing_time);
 
-         double sleeping_time = starpu_timing_timespec_to_us(&worker_info.sleeping_time);
 
-         double overhead_time = total_time - executing_time - sleeping_time;
 
-         float executing_ratio = 100.0*executing_time/total_time;
 
-         float sleeping_ratio = 100.0*sleeping_time/total_time;
 
-         float overhead_ratio = 100.0 - executing_ratio - sleeping_ratio;
 
-         char workername[128];
 
-         starpu_worker_get_name(worker, workername, 128);
 
-         fprintf(stderr, "Worker %s:\n", workername);
 
-         fprintf(stderr, "\ttotal time: %.2lf ms\n", total_time*1e-3);
 
-         fprintf(stderr, "\texec time: %.2lf ms (%.2f %%)\n", executing_time*1e-3,
 
-                 executing_ratio);
 
-         fprintf(stderr, "\tblocked time: %.2lf ms (%.2f %%)\n", sleeping_time*1e-3,
 
-                 sleeping_ratio);
 
-         fprintf(stderr, "\toverhead time: %.2lf ms (%.2f %%)\n", overhead_time*1e-3,
 
-                 overhead_ratio);
 
- @}
 
- @end smallexample
 
- @end cartouche
 
- @node Partitioning Data
 
- @section Partitioning Data
 
- An existing piece of data can be partitioned in sub parts to be used by different tasks, for instance:
 
- @cartouche
 
- @smallexample
 
- int vector[NX];
 
- starpu_data_handle_t handle;
 
- /* Declare data to StarPU */
 
- starpu_vector_data_register(&handle, 0, (uintptr_t)vector,
 
-                             NX, sizeof(vector[0]));
 
- /* Partition the vector in PARTS sub-vectors */
 
- starpu_data_filter f =
 
- @{
 
-     .filter_func = starpu_vector_filter_block,
 
-     .nchildren = PARTS
 
- @};
 
- starpu_data_partition(handle, &f);
 
- @end smallexample
 
- @end cartouche
 
- The task submission then uses @code{starpu_data_get_sub_data} to retrieve the
 
- sub-handles to be passed as tasks parameters.
 
- @cartouche
 
- @smallexample
 
- /* Submit a task on each sub-vector */
 
- for (i=0; i<starpu_data_get_nb_children(handle); i++) @{
 
-     /* Get subdata number i (there is only 1 dimension) */
 
-     starpu_data_handle_t sub_handle = starpu_data_get_sub_data(handle, 1, i);
 
-     struct starpu_task *task = starpu_task_create();
 
-     task->handles[0] = sub_handle;
 
-     task->cl = &cl;
 
-     task->synchronous = 1;
 
-     task->cl_arg = &factor;
 
-     task->cl_arg_size = sizeof(factor);
 
-     starpu_task_submit(task);
 
- @}
 
- @end smallexample
 
- @end cartouche
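
- Once the tasks on the sub-vectors are finished, the pieces can be gathered back

- into the whole vector with @code{starpu_data_unpartition}:

- @cartouche

- @smallexample

- /* Gather the sub-vectors back into the main handle, in main memory (node 0) */

- starpu_data_unpartition(handle, 0);

- @end smallexample

- @end cartouche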
 
- Partitioning can be applied several times, see
 
- @code{examples/basic_examples/mult.c} and @code{examples/filters/}.
 
- Whenever the whole piece of data is already available, the partitioning will
 
- be done in-place, i.e. without allocating new buffers but just using pointers
 
- inside the existing copy. This is particularly important to be aware of when
 
- using OpenCL, where the kernel parameters are not pointers, but handles. The
 
- kernel thus needs to be also passed the offset within the OpenCL buffer:
 
- @cartouche
 
- @smallexample
 
- void opencl_func(void *buffers[], void *cl_arg)
 
- @{
 
-     cl_mem vector = (cl_mem) STARPU_VECTOR_GET_DEV_HANDLE(buffers[0]);
 
-     unsigned offset = STARPU_VECTOR_GET_OFFSET(buffers[0]);
 
-     ...
 
-     clSetKernelArg(kernel, 0, sizeof(vector), &vector);
 
-     clSetKernelArg(kernel, 1, sizeof(offset), &offset);
 
-     ...
 
- @}
 
- @end smallexample
 
- @end cartouche
 
- And the kernel has to shift from the pointer passed by the OpenCL driver:
 
- @cartouche
 
- @smallexample
 
- __kernel void opencl_kernel(__global int *vector, unsigned offset)
 
- @{
 
-     vector = (__global int *)((__global char *)vector + offset);
 
-     ...
 
- @}
 
- @end smallexample
 
- @end cartouche
 
- StarPU provides various interfaces and filters for matrices, vectors, etc.,
 
- but applications can also write their own data interfaces and filters, see
 
- @code{examples/interface} and @code{examples/filters/custom_mf} for an example.
 
- @node Performance model example
 
- @section Performance model example
 
- To achieve good scheduling, StarPU scheduling policies need to be able to
 
- estimate in advance the duration of a task. This is done by giving to codelets
 
- a performance model, by defining a @code{starpu_perfmodel} structure and
 
- providing its address in the @code{model} field of the @code{struct starpu_codelet}
 
- structure. The @code{symbol} and @code{type} fields of @code{starpu_perfmodel}
 
- are mandatory: they give a name to the model and specify its type, since

- there are several kinds of performance models. For compatibility, make sure to

- initialize the whole structure to zero, either by using an explicit memset, or by

- letting the compiler implicitly do it as exemplified below.
 
- @itemize
 
- @item
 
- Measured at runtime (@code{STARPU_HISTORY_BASED} model type). This assumes that for a
 
- given set of data input/output sizes, the performance will always be about the
 
- same. This is very true for regular kernels on GPUs for instance (<0.1% error),
 
- and just a bit less true on CPUs (~=1% error). This also assumes that there are
 
- few different sets of data input/output sizes. StarPU will then keep record of
 
- the average time of previous executions on the various processing units, and use
 
- it as an estimation. History is done per task size, by using a hash of the input
 
- and output sizes as an index.
 
- It will also save it in @code{$STARPU_HOME/.starpu/sampling/codelets}
 
- for further executions, and can be observed by using the
 
- @code{starpu_perfmodel_display} command, or drawn by using
 
- the @code{starpu_perfmodel_plot} tool (@pxref{Performance model calibration}). The
 
- models are indexed by machine name. To
 
- share the models between machines (e.g. for a homogeneous cluster), use
 
- @code{export STARPU_HOSTNAME=some_global_name}. Measurements are only done
 
- when using a task scheduler which makes use of it, such as 
 
- @code{dmda}. Measurements can also be provided explicitly by the application, by
 
- using the @code{starpu_perfmodel_update_history} function.
 
- The following is a small code example.
 
- If e.g. the code is recompiled with other compilation options, or several
 
- variants of the code are used, the symbol string should be changed to reflect
 
- that, in order to recalibrate a new model from zero. The symbol string can even
 
- be constructed dynamically at execution time, as long as this is done before
 
- submitting any task using it.
 
- @cartouche
 
- @smallexample
 
- static struct starpu_perfmodel mult_perf_model = @{
 
-     .type = STARPU_HISTORY_BASED,
 
-     .symbol = "mult_perf_model"
 
- @};
 
- struct starpu_codelet cl = @{
 
-     .where = STARPU_CPU,
 
-     .cpu_funcs = @{ cpu_mult, NULL @},
 
-     .cpu_funcs_name = @{ "cpu_mult", NULL @},
 
-     .nbuffers = 3,
 
-     .modes = @{ STARPU_R, STARPU_R, STARPU_W @},
 
-     /* for the scheduling policy to be able to use performance models */
 
-     .model = &mult_perf_model
 
- @};
 
- @end smallexample
 
- @end cartouche
 
- @item
 
- Measured at runtime and refined by regression (@code{STARPU_*REGRESSION_BASED}
 
- model type). This still assumes performance regularity, but works
 
- with various data input sizes, by applying regression over observed
 
- execution times. @code{STARPU_REGRESSION_BASED} uses an @code{a*n^b} regression

- form, @code{STARPU_NL_REGRESSION_BASED} uses an @code{a*n^b+c} form (more precise than

- @code{STARPU_REGRESSION_BASED}, but costs a lot more to compute).
 
- For instance,
 
- @code{tests/perfmodels/regression_based.c} uses a regression-based performance
 
- model for the @code{memset} operation.
 
- Of course, the application has to issue
 
- tasks with varying size so that the regression can be computed. StarPU will not
 
- trust the regression unless there is at least 10% difference between the minimum
 
- and maximum observed input size. It can be useful to set the
 
- @code{STARPU_CALIBRATE} environment variable to @code{1} and run the application
 
- on varying input sizes with @code{STARPU_SCHED} set to @code{eager} scheduler,
 
- so as to feed the performance model for a variety of
 
- inputs. The application can also provide the measurements explicitly by using
 
- @code{starpu_perfmodel_update_history}. The @code{starpu_perfmodel_display} and
 
- @code{starpu_perfmodel_plot}
 
- tools can be used to observe how much the performance model is calibrated (@pxref{Performance model calibration}); when
 
- their output looks good, @code{STARPU_CALIBRATE} can be reset to @code{0} to let
 
- StarPU use the resulting performance model without recording new measures, and
 
- @code{STARPU_SCHED} can be set to @code{dmda} to benefit from the performance models. If
 
- the data input sizes vary a lot, it is really important to set
 
- @code{STARPU_CALIBRATE} to @code{0}, otherwise StarPU will continue adding the
 
- measures, and end up with a very large performance model, which will take a

- lot of time to load and save.
 
- For non-linear regression, since computing it
 
- is quite expensive, it is only done at termination of the application. This
 
- means that the first execution of the application will use only history-based
 
- performance model to perform scheduling, without using regression.
 
- @item
 
- Provided as an estimation from the application itself (@code{STARPU_COMMON} model type and @code{cost_function} field),
 
- see for instance
 
- @code{examples/common/blas_model.h} and @code{examples/common/blas_model.c};

- a minimal sketch is also shown right after this list.
 
- @item
 
- Provided explicitly by the application (@code{STARPU_PER_ARCH} model type): the
 
- @code{.per_arch[arch][nimpl].cost_function} fields have to be filled with pointers to
 
- functions which return the expected duration of the task in micro-seconds, one
 
- per architecture.
 
- @end itemize
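
- As announced in the @code{STARPU_COMMON} item above, here is a minimal sketch of

- an application-provided cost function (the @code{blas_cost} name and the cubic

- formula are made up for illustration):

- @cartouche

- @smallexample

- static double blas_cost(struct starpu_task *task, unsigned nimpl)

- @{

-     /* Expected duration in micro-seconds, estimated from the matrix size */

-     uint32_t n = starpu_matrix_get_nx(task->handles[0]);

-     return (double) n * n * n / 1000.0;

- @}

- static struct starpu_perfmodel blas_model =

- @{

-     .type = STARPU_COMMON,

-     .cost_function = blas_cost

- @};

- @end smallexample

- @end cartouche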
 
- For the @code{STARPU_HISTORY_BASED} and @code{STARPU_*REGRESSION_BASED} models,
 
- the total size of task data (both input and output) is used as an index by
 
- default. The @code{size_base} field of @code{struct starpu_perfmodel} however
 
- permits the application to override that, when for instance some of the data
 
- do not matter for the task cost (e.g. a mere reference table), or when using sparse

- structures (in which case it is the number of non-zeros which matters), or when
 
- there is some hidden parameter such as the number of iterations, etc. The
 
- @code{examples/pi} example uses this to include the number of iterations in the
 
- base.
 
- How to use schedulers which can benefit from such performance model is explained
 
- in @ref{Task scheduling policy}.
 
- The same can be done for task power consumption estimation, by setting the
 
- @code{power_model} field the same way as the @code{model} field. Note: for
 
- now, the application has to give the power consumption performance model
 
- a name which is different from the execution time performance model.
 
- The application can request time estimations from the StarPU performance
 
- models by filling a task structure as usual without actually submitting
 
- it. The data handles can be created by calling @code{starpu_*_data_register}
 
- functions with a @code{NULL} pointer and @code{-1} node and the
 
- desired data sizes, and need to be unregistered as usual. The
 
- @code{starpu_task_expected_length} and @code{starpu_task_expected_power}
 
- functions can then be called to get an estimation of the task cost on a given
 
- arch. @code{starpu_task_footprint} can also be used to get the footprint used
 
- for indexing history-based performance models.
 
- @code{starpu_task_destroy}
 
- needs to be called to destroy the dummy task afterwards. See
 
- @code{tests/perfmodels/regression_based.c} for an example.
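
- A minimal sketch, assuming a one-buffer vector codelet @code{cl}, a size

- @code{n}, and an architecture descriptor @code{arch} (whose exact type depends

- on the StarPU version):

- @cartouche

- @smallexample

- starpu_data_handle_t handle;

- /* Register a size-only handle: NULL pointer and -1 memory node */

- starpu_vector_data_register(&handle, -1, (uintptr_t) NULL, n, sizeof(float));

- struct starpu_task *task = starpu_task_create();

- task->cl = &cl;

- task->handles[0] = handle;

- task->destroy = 0;

- /* Estimated duration (in micro-seconds) of implementation 0 on arch */

- double expected = starpu_task_expected_length(task, arch, 0);

- starpu_task_destroy(task);

- starpu_data_unregister(handle);

- @end smallexample

- @end cartouche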
 
- @node Theoretical lower bound on execution time example
 
- @section Theoretical lower bound on execution time
 
- For kernels with history-based performance models (and provided that they are completely calibrated), StarPU can very easily provide a theoretical lower
 
- bound for the execution time of a whole set of tasks. See for
 
- instance @code{examples/lu/lu_example.c}: before submitting tasks,
 
- call @code{starpu_bound_start}, and after complete execution, call
 
- @code{starpu_bound_stop}. @code{starpu_bound_print_lp} or
 
- @code{starpu_bound_print_mps} can then be used to output a Linear Programming
 
- problem corresponding to the schedule of your tasks. Run it through
 
- @code{lp_solve} or any other linear programming solver, and that will give you a
 
- lower bound for the total execution time of your tasks. If StarPU was compiled
 
- with the glpk library installed, @code{starpu_bound_compute} can be used to
 
- solve it immediately and get the optimized minimum, in ms. Its @code{integer}
 
- parameter lets one decide whether integer resolution should be computed
 
- and returned too.
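
- A typical use, sketched with the @code{deps} and @code{prio} parameters

- discussed below:

- @cartouche

- @smallexample

- starpu_bound_start(deps, prio);

- /* ... submit tasks ... */

- starpu_task_wait_for_all();

- starpu_bound_stop();

- /* Either solve immediately (requires glpk)... */

- double min_ms;

- starpu_bound_compute(&min_ms, NULL, 0);

- /* ... or export the Linear Programming problem for an external solver */

- FILE *f = fopen("bound.lp", "w");

- starpu_bound_print_lp(f);

- fclose(f);

- @end smallexample

- @end cartouche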
 
- The @code{deps} parameter tells StarPU whether to take task, implicit data, and tag
 
- dependencies into account. Tags released in a callback or similar
 
- are not taken into account, only tags associated with a task are.
 
- It must be understood that the linear programming
 
- problem size is quadratic with the number of tasks and thus the time to solve it
 
- will be very long; it could take minutes for just a few dozen tasks. You should
 
- probably use @code{lp_solve -timeout 1 test.lp -wmps test.mps} to convert the
 
- problem to MPS format and then use a better solver, @code{glpsol} might be
 
- better than @code{lp_solve} for instance (the @code{--pcost} option may be
 
- useful), but sometimes doesn't manage to converge. @code{cbc} might look
 
- slower, but it is parallel. For @code{lp_solve}, be sure to try at least all the
 
- @code{-B} options. For instance, we often just use @code{lp_solve -cc -B1 -Bb
 
- -Bg -Bp -Bf -Br -BG -Bd -Bs -BB -Bo -Bc -Bi} , and the @code{-gr} option can
 
- also be quite useful. The resulting schedule can be observed by using the
 
- @code{starpu_lp2paje} tool, which converts it into the Paje format.
 
- Data transfer time can only be taken into account when @code{deps} is set. Only
 
- data transfers inferred from implicit data dependencies between tasks are taken
 
- into account. Other data transfers are assumed to be completely overlapped.
 
- Setting @code{deps} to 0 will only take into account the actual computations
 
- on processing units. It however still properly takes into account the varying
 
- performances of kernels and processing units, which is quite more accurate than
 
- just comparing StarPU performances with the fastest of the kernels being used.
 
- The @code{prio} parameter tells StarPU whether to simulate taking into account
 
- the priorities as the StarPU scheduler would, i.e. schedule prioritized
 
- tasks before less prioritized tasks, to check to what extent this results

- in a less optimal solution. This increases the computation time even more.
 
- @node Insert Task Utility
 
- @section Insert Task Utility
 
- StarPU provides the wrapper function @code{starpu_insert_task} to ease
 
- the creation and submission of tasks. See the definition of the
 
- functions in @ref{Insert Task}.
 
- Here is the implementation of the codelet:
 
- @cartouche
 
- @smallexample
 
- void func_cpu(void *descr[], void *_args)
 
- @{
 
-         int *x0 = (int *)STARPU_VARIABLE_GET_PTR(descr[0]);
 
-         float *x1 = (float *)STARPU_VARIABLE_GET_PTR(descr[1]);
 
-         int ifactor;
 
-         float ffactor;
 
-         starpu_codelet_unpack_args(_args, &ifactor, &ffactor);
 
-         *x0 = *x0 * ifactor;
 
-         *x1 = *x1 * ffactor;
 
- @}
 
- struct starpu_codelet mycodelet = @{
 
-         .where = STARPU_CPU,
 
-         .cpu_funcs = @{ func_cpu, NULL @},
 
-         .cpu_funcs_name = @{ "func_cpu", NULL @},
 
-         .nbuffers = 2,
 
-         .modes = @{ STARPU_RW, STARPU_RW @}
 
- @};
 
- @end smallexample
 
- @end cartouche
 
- And the call to the @code{starpu_insert_task} wrapper:
 
- @cartouche
 
- @smallexample
 
- starpu_insert_task(&mycodelet,
 
-                    STARPU_VALUE, &ifactor, sizeof(ifactor),
 
-                    STARPU_VALUE, &ffactor, sizeof(ffactor),
 
-                    STARPU_RW, data_handles[0], STARPU_RW, data_handles[1],
 
-                    0);
 
- @end smallexample
 
- @end cartouche
 
- The call to @code{starpu_insert_task} is equivalent to the following
 
- code:
 
- @cartouche
 
- @smallexample
 
- struct starpu_task *task = starpu_task_create();
 
- task->cl = &mycodelet;
 
- task->handles[0] = data_handles[0];
 
- task->handles[1] = data_handles[1];
 
- char *arg_buffer;
 
- size_t arg_buffer_size;
 
- starpu_codelet_pack_args(&arg_buffer, &arg_buffer_size,
 
-                     STARPU_VALUE, &ifactor, sizeof(ifactor),
 
-                     STARPU_VALUE, &ffactor, sizeof(ffactor),
 
-                     0);
 
- task->cl_arg = arg_buffer;
 
- task->cl_arg_size = arg_buffer_size;
 
- int ret = starpu_task_submit(task);
 
- @end smallexample
 
- @end cartouche
 
- Here is a similar call using @code{STARPU_DATA_ARRAY}.
 
- @cartouche
 
- @smallexample
 
- starpu_insert_task(&mycodelet,
 
-                    STARPU_DATA_ARRAY, data_handles, 2,
 
-                    STARPU_VALUE, &ifactor, sizeof(ifactor),
 
-                    STARPU_VALUE, &ffactor, sizeof(ffactor),
 
-                    0);
 
- @end smallexample
 
- @end cartouche
 
- If some part of the task insertion depends on the value of some computation,
 
- the @code{STARPU_DATA_ACQUIRE_CB} macro can be very convenient. For
 
- instance, assuming that the index variable @code{i} was registered as handle
 
- @code{i_handle}:
 
- @cartouche
 
- @smallexample
 
- /* Compute which portion we will work on, e.g. pivot */
 
- starpu_insert_task(&which_index, STARPU_W, i_handle, 0);
 
- /* And submit the corresponding task */
 
- STARPU_DATA_ACQUIRE_CB(i_handle, STARPU_R,
 
-                        starpu_insert_task(&work, STARPU_RW, A_handle[i], 0));
 
- @end smallexample
 
- @end cartouche
 
- The @code{STARPU_DATA_ACQUIRE_CB} macro submits an asynchronous request for
 
- acquiring data @code{i} for the main application, and will execute the code
 
- given as third parameter when it is acquired. In other words, as soon as the
 
- value of @code{i} computed by the @code{which_index} codelet can be read, the
 
- portion of code passed as third parameter of @code{STARPU_DATA_ACQUIRE_CB} will
 
- be executed, and is allowed to read from @code{i} to use it e.g. as an
 
- index. Note that this macro is only available when compiling StarPU with
 
- the compiler @code{gcc}.
 
- @node Data reduction
 
- @section Data reduction
 
- In various cases, some piece of data is used to accumulate intermediate
 
- results. For instance, the dot product of a vector, maximum/minimum finding,

- the histogram of a photograph, etc. When these results are produced all over the

- machine, it would not be efficient to accumulate them in only one place,

- incurring a data transfer for each contribution and contention on concurrent accesses.
 
- StarPU provides a @code{STARPU_REDUX} mode, which permits optimizing
 
- that case: it will allocate a buffer on each memory node, and accumulate
 
- intermediate results there. When the data is eventually accessed in the normal
 
- @code{STARPU_R} mode, StarPU will collect the intermediate results in just one
 
- buffer.
 
- For this to work, the user has to use the
 
- @code{starpu_data_set_reduction_methods} function to declare how to initialize these
 
- buffers, and how to assemble partial results.
 
- For instance, @code{cg} uses that to optimize its dot product: it first defines
 
- the codelets for initialization and reduction:
 
- @cartouche
 
- @smallexample
 
- struct starpu_codelet bzero_variable_cl =
 
- @{
 
-         .cpu_funcs = @{ bzero_variable_cpu, NULL @},
 
-         .cpu_funcs_name = @{ "bzero_variable_cpu", NULL @},
 
-         .cuda_funcs = @{ bzero_variable_cuda, NULL @},
 
-         .nbuffers = 1,
 
- @};
 
- static void accumulate_variable_cpu(void *descr[], void *cl_arg)
 
- @{
 
-         double *v_dst = (double *)STARPU_VARIABLE_GET_PTR(descr[0]);
 
-         double *v_src = (double *)STARPU_VARIABLE_GET_PTR(descr[1]);
 
-         *v_dst = *v_dst + *v_src;
 
- @}
 
- static void accumulate_variable_cuda(void *descr[], void *cl_arg)
 
- @{
 
-         double *v_dst = (double *)STARPU_VARIABLE_GET_PTR(descr[0]);
 
-         double *v_src = (double *)STARPU_VARIABLE_GET_PTR(descr[1]);
 
-         cublasDaxpy(1, (double)1.0, v_src, 1, v_dst, 1);
 
-         cudaStreamSynchronize(starpu_cuda_get_local_stream());
 
- @}
 
- struct starpu_codelet accumulate_variable_cl =
 
- @{
 
-         .cpu_funcs = @{ accumulate_variable_cpu, NULL @},
 
-         .cpu_funcs_name = @{ "accumulate_variable_cpu", NULL @},
 
-         .cuda_funcs = @{ accumulate_variable_cuda, NULL @},
 
-         .nbuffers = 1,
 
- @};
 
- @end smallexample
 
- @end cartouche
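
- The initialization kernel itself is straightforward; a minimal CPU version (not

- shown in the excerpt above) could be:

- @cartouche

- @smallexample

- static void bzero_variable_cpu(void *descr[], void *cl_arg)

- @{

-     double *v = (double *)STARPU_VARIABLE_GET_PTR(descr[0]);

-     /* The neutral element of the reduction: zero */

-     *v = 0.0;

- @}

- @end smallexample

- @end cartouche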
 
- and attaches them as reduction methods for its dtq handle:
 
- @cartouche
 
- @smallexample
 
- starpu_variable_data_register(&dtq_handle, -1, NULL, sizeof(type));
 
- starpu_data_set_reduction_methods(dtq_handle,
 
-         &accumulate_variable_cl, &bzero_variable_cl);
 
- @end smallexample
 
- @end cartouche
 
- and @code{dtq_handle} can now be used in @code{STARPU_REDUX} mode for the dot products
 
- with partitioned vectors:
 
- @cartouche
 
- @smallexample
 
- for (b = 0; b < nblocks; b++)
 
-     starpu_insert_task(&dot_kernel_cl,
 
-         STARPU_REDUX, dtq_handle,
 
-         STARPU_R, starpu_data_get_sub_data(v1, 1, b),
 
-         STARPU_R, starpu_data_get_sub_data(v2, 1, b),
 
-         0);
 
- @end smallexample
 
- @end cartouche
 
- During registration, we have here provided NULL, i.e. there is no initial value
 
- to be taken into account during reduction. StarPU will thus only take into
 
- account the contributions from the @code{dot_kernel_cl} tasks. Also, it will not
 
- allocate any memory for @code{dtq_handle} before @code{dot_kernel_cl} tasks are
 
- ready to run.
 
- If another dot product has to be performed, one could unregister
 
- @code{dtq_handle}, and re-register it. But one can also use
 
- @code{starpu_data_invalidate_submit(dtq_handle)}, which will clear all data from the handle,
 
- thus resetting it back to the initial @code{register(NULL)} state.
 
- The @code{cg} example also uses reduction for the blocked gemv kernel, leading
 
- to yet more relaxed dependencies and more parallelism.
 
- @code{STARPU_REDUX} can also be passed to @code{starpu_mpi_insert_task} in the MPI

- case. That will however not produce any MPI communication, but just pass

- @code{STARPU_REDUX} to the underlying @code{starpu_insert_task}. It is up to the
 
- application to call @code{starpu_mpi_redux_data}, which posts tasks that will
 
- reduce the partial results among MPI nodes into the MPI node which owns the
 
- data. For instance, some hypothetical application which collects partial results
 
- into data @code{res}, then uses it for other computation, before looping again
 
- with a new reduction:
 
- @cartouche
 
- @smallexample
 
- for (i = 0; i < 100; i++) @{
 
-     starpu_mpi_insert_task(MPI_COMM_WORLD, &init_res, STARPU_W, res, 0);
 
-     starpu_mpi_insert_task(MPI_COMM_WORLD, &work, STARPU_RW, A,
 
-                STARPU_R, B, STARPU_REDUX, res, 0);
 
-     starpu_mpi_redux_data(MPI_COMM_WORLD, res);
 
-     starpu_mpi_insert_task(MPI_COMM_WORLD, &work2, STARPU_RW, B, STARPU_R, res, 0);
 
- @}
 
- @end smallexample
 
- @end cartouche
 
- @node Temporary buffers
 
- @section Temporary buffers
 
- There are two kinds of temporary buffers: temporary data which just pass results
 
- from one task to another, and scratch data which are needed only internally by
 
- tasks.
 
- @subsection Temporary data
 
- Data can sometimes be entirely produced by a task, and entirely consumed by
 
- another task, without the need for other parts of the application to access
 
- it. In such a case, registration can be done without prior allocation, by using
 
- the special -1 memory node number, and passing a zero pointer. StarPU will
 
- actually allocate memory only when the task creating the content gets scheduled,
 
- and destroy it on unregistration.
 
- In addition to that, it can be tedious for the application to have to unregister
 
- the data, since it will not use its content anyway. The unregistration can be
 
- done lazily by using the @code{starpu_data_unregister_submit(handle)} function,
 
- which will record that no more tasks accessing the handle will be submitted, so
 
- that it can be freed as soon as the last task accessing it is over.
 
- The following code exemplifies both points: it registers the temporary
 
- data, submits three tasks accessing it, and records the data for automatic
 
- unregistration.
 
- @cartouche
 
- @smallexample
 
- starpu_vector_data_register(&handle, -1, 0, n, sizeof(float));
 
- starpu_insert_task(&produce_data, STARPU_W, handle, 0);
 
- starpu_insert_task(&compute_data, STARPU_RW, handle, 0);
 
- starpu_insert_task(&summarize_data, STARPU_R, handle, STARPU_W, result_handle, 0);
 
- starpu_data_unregister_submit(handle);
 
- @end smallexample
 
- @end cartouche
 
- @subsection Scratch data
 
- Some kernels sometimes need temporary data to achieve the computations, i.e. a
 
- workspace. The application could allocate it at the start of the codelet
 
- function, and free it at the end, but that would be costly. It could also
 
- allocate one buffer per worker (similarly to @ref{Per-worker library
 
- initialization}), but that would make them systematic and permanent. A more
 
- optimized way is to use the @code{STARPU_SCRATCH} data access mode, as exemplified below,
 
- which provides per-worker buffers without content consistency.
 
- @cartouche
 
- @smallexample
 
- /* workspace_size: number of floats needed by one task (hypothetical) */

- starpu_vector_data_register(&workspace, -1, 0, workspace_size, sizeof(float));
 
- for (i = 0; i < N; i++)
 
-     starpu_insert_task(&compute, STARPU_R, input[i],
 
-                        STARPU_SCRATCH, workspace, STARPU_W, output[i], 0);
 
- @end smallexample
 
- @end cartouche
 
- StarPU will make sure that the buffer is allocated before executing the task,
 
- and make this allocation per-worker: for CPU workers, notably, each worker has
 
- its own buffer. This means that each task submitted above will actually have its
 
- own workspace, which will in practice be the same for all tasks running one after
 
- the other on the same worker. Also, if for instance GPU memory becomes scarce,
 
- StarPU will notice that it can free such buffers easily, since the content does
 
- not matter.
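
- Within the codelet, the scratch buffer is retrieved like any other buffer; a

- sketch with a hypothetical @code{compute_func} matching the submission above:

- @cartouche

- @smallexample

- void compute_func(void *buffers[], void *cl_arg)

- @{

-     float *input   = (float *) STARPU_VECTOR_GET_PTR(buffers[0]);

-     /* Per-worker workspace, with no content consistency between tasks */

-     float *scratch = (float *) STARPU_VECTOR_GET_PTR(buffers[1]);

-     float *output  = (float *) STARPU_VECTOR_GET_PTR(buffers[2]);

-     /* ... compute output from input, using scratch as workspace ... */

- @}

- @end smallexample

- @end cartouche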
 
- The @code{examples/pi} example uses scratch buffers for some temporary data.
 
- @node Parallel Tasks
 
- @section Parallel Tasks
 
- StarPU can leverage existing parallel computation libraries by the means of
 
- parallel tasks. A parallel task is a task which gets worked on by a set of CPUs
 
- (called a parallel or combined worker) at the same time, by using an existing
 
- parallel CPU implementation of the computation to be achieved. This can also be
 
- useful to improve the load balance between slow CPUs and fast GPUs: since CPUs
 
- work collectively on a single task, the completion time of tasks on CPUs becomes

- comparable to the completion time on GPUs, thus alleviating granularity

- discrepancy concerns. Hwloc support needs to be enabled to get good performance,
 
- otherwise StarPU will not know how to better group cores.
 
- Two modes of execution exist to accommodate existing usages.
 
- @subsection Fork-mode parallel tasks
 
- In the Fork mode, StarPU will call the codelet function on one
 
- of the CPUs of the combined worker. The codelet function can use
 
- @code{starpu_combined_worker_get_size()} to get the number of threads it is
 
- allowed to start to achieve the computation. The CPU binding mask for the whole
 
- set of CPUs is already enforced, so that threads created by the function will
 
- inherit the mask, and thus execute where StarPU expects, the OS being in charge
 
- of choosing how to schedule threads on the corresponding CPUs. The application
 
- can also choose to bind threads by hand, using e.g. @code{sched_getaffinity} to know
 
- the CPU binding mask that StarPU chose.
 
- For instance, using OpenMP (full source is available in
 
- @code{examples/openmp/vector_scal.c}):
 
- @cartouche
 
- @smallexample
 
- void scal_cpu_func(void *buffers[], void *_args)
 
- @{
 
-     unsigned i;
 
-     float *factor = _args;
 
-     struct starpu_vector_interface *vector = buffers[0];
 
-     unsigned n = STARPU_VECTOR_GET_NX(vector);
 
-     float *val = (float *)STARPU_VECTOR_GET_PTR(vector);
 
- #pragma omp parallel for num_threads(starpu_combined_worker_get_size())
 
-     for (i = 0; i < n; i++)
 
-         val[i] *= *factor;
 
- @}
 
- static struct starpu_codelet cl =
 
- @{
 
-     .modes = @{ STARPU_RW @},
 
-     .where = STARPU_CPU,
 
-     .type = STARPU_FORKJOIN,
 
-     .max_parallelism = INT_MAX,
 
-     .cpu_funcs = @{scal_cpu_func, NULL@},
 
-     .cpu_funcs_name = @{"scal_cpu_func", NULL@},
 
-     .nbuffers = 1,
 
- @};
 
- @end smallexample
 
- @end cartouche
 
- Other examples include for instance calling a BLAS parallel CPU implementation
 
- (see @code{examples/mult/xgemm.c}).
 
- @subsection SPMD-mode parallel tasks
 
- In the SPMD mode, StarPU will call the codelet function on
 
- each CPU of the combined worker. The codelet function can use
 
- @code{starpu_combined_worker_get_size()} to get the total number of CPUs
 
- involved in the combined worker, and thus the number of calls that are made in
 
- parallel to the function, and @code{starpu_combined_worker_get_rank()} to get
 
- the rank of the current CPU within the combined worker. For instance:
 
- @cartouche
 
- @smallexample
 
- static void func(void *buffers[], void *_args)
 
- @{
 
-     unsigned i;
 
-     float *factor = _args;
 
-     struct starpu_vector_interface *vector = buffers[0];
 
-     unsigned n = STARPU_VECTOR_GET_NX(vector);
 
-     float *val = (float *)STARPU_VECTOR_GET_PTR(vector);
 
-     /* Compute slice to compute */
 
-     unsigned m = starpu_combined_worker_get_size();
 
-     unsigned j = starpu_combined_worker_get_rank();
 
-     unsigned slice = (n+m-1)/m;
 
-     for (i = j * slice; i < (j+1) * slice && i < n; i++)
 
-         val[i] *= *factor;
 
- @}
 
- static struct starpu_codelet cl =
 
- @{
 
-     .modes = @{ STARPU_RW @},
 
-     .where = STARPU_CPU,
 
-     .type = STARPU_SPMD,
 
-     .max_parallelism = INT_MAX,
 
-     .cpu_funcs = @{ func, NULL @},
 
-     .cpu_funcs_name = @{ "func", NULL @},
 
-     .nbuffers = 1,
 
- @};
 
- @end smallexample
 
- @end cartouche
 
- Of course, this trivial example will not really benefit from parallel task
 
- execution, and was only meant to be simple to understand.  The benefit comes
 
- when the computation to be done is so that threads have to e.g. exchange
 
- intermediate results, or write to the data in a complex but safe way in the same
 
- buffer.
 
- @subsection Parallel tasks performance
 
- To benefit from parallel tasks, a parallel-task-aware StarPU scheduler has to
 
- be used. When exposed to codelets with a Fork or SPMD flag, the @code{pheft}
 
- (parallel-heft) and @code{peager} (parallel eager) schedulers will indeed also
 
- try to execute tasks with several CPUs. It will automatically try the various
 
- available combined worker sizes and thus be able to avoid choosing a large
 
- combined worker if the codelet does not actually scale so much.
 
- @subsection Combined workers
 
- By default, StarPU creates combined workers according to the architecture
 
- structure as detected by hwloc. It means that for each object of the hwloc
 
- topology (NUMA node, socket, cache, ...) a combined worker will be created. If
 
- some nodes of the hierarchy have a big arity (e.g. many cores in a socket
 
- without a hierarchy of shared caches), StarPU will create combined workers of
 
- intermediate sizes. The @code{STARPU_SYNTHESIZE_ARITY_COMBINED_WORKER} variable
 
- permits tuning the maximum arity between levels of combined workers.
 
- The combined workers actually produced can be seen in the output of the
 
- @code{starpu_machine_display} tool (the @code{STARPU_SCHED} environment variable
 
- has to be set to a combined worker-aware scheduler such as @code{pheft} or
 
- @code{peager}).
 
- @subsection Concurrent parallel tasks
 
- Unfortunately, many environments and libraries do not support concurrent
 
- calls.
 
- For instance, most OpenMP implementations (including the main ones) do not
 
- support concurrent @code{pragma omp parallel} statements without nesting them in
 
- another @code{pragma omp parallel} statement, but StarPU does not yet support
 
- creating its CPU workers by using such a pragma.
 
- Other parallel libraries are also not safe when being invoked concurrently
 
- from different threads, due to the use of global variables in their sequential
 
- sections for instance.
 
- The solution is then to use only one combined worker at a time.  This can be
 
- done by setting @code{single_combined_worker} to 1 in the @code{starpu_conf}
 
- structure, or setting the @code{STARPU_SINGLE_COMBINED_WORKER} environment
 
- variable to 1. StarPU will then run only one parallel task at a time (but other
 
- CPU and GPU tasks are not affected and can be run concurrently). The parallel
 
- task scheduler will however still try varying combined worker
 
- sizes to look for the most efficient ones.
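
- For instance, to select this mode programmatically:

- @cartouche

- @smallexample

- struct starpu_conf conf;

- starpu_conf_init(&conf);

- /* Run at most one parallel task at a time */

- conf.single_combined_worker = 1;

- starpu_init(&conf);

- @end smallexample

- @end cartouche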
 
- @node Debugging
 
- @section Debugging
 
- StarPU provides several tools to help debugging applications. Execution traces
 
- can be generated and displayed graphically, see @ref{Generating traces}. Some
 
- gdb helpers are also provided to show the whole StarPU state:
 
- @smallexample
 
- (gdb) source tools/gdbinit
 
- (gdb) help starpu
 
- @end smallexample
 
- The Temanejo task debugger can also be used, see @ref{Task debugger}.
 
- @node The multiformat interface
 
- @section The multiformat interface
 
- It may be interesting to represent the same piece of data using two different
 
- data structures: one that would only be used on CPUs, and one that would only
 
- be used on GPUs. This can be done by using the multiformat interface. StarPU
 
- will be able to convert data from one data structure to the other when needed.
 
- Note that the @code{dmda} scheduler is the only one optimized for this interface. The
 
- user must provide StarPU with conversion codelets:
 
- @cartouche
 
- @smallexample
 
- #define NX 1024
 
- struct point array_of_structs[NX];
 
- starpu_data_handle_t handle;
 
- /*
 
-  * The conversion of a piece of data is itself a task, though it is created,
 
-  * submitted and destroyed by StarPU internals and not by the user. Therefore,
 
-  * we have to define two codelets.
 
-  * Note that for now the conversion from the CPU format to the GPU format has to
 
-  * be executed on the GPU, and the conversion from the GPU to the CPU has to be
 
-  * executed on the CPU.
 
-  */
 
- #ifdef STARPU_USE_OPENCL
 
- void cpu_to_opencl_opencl_func(void *buffers[], void *args);
 
- struct starpu_codelet cpu_to_opencl_cl = @{
 
-     .where = STARPU_OPENCL,
 
-     .opencl_funcs = @{ cpu_to_opencl_opencl_func, NULL @},
 
-     .nbuffers = 1,
 
-     .modes = @{ STARPU_RW @}
 
- @};
 
- void opencl_to_cpu_func(void *buffers[], void *args);
 
- struct starpu_codelet opencl_to_cpu_cl = @{
 
-     .where = STARPU_CPU,
 
-     .cpu_funcs = @{ opencl_to_cpu_func, NULL @},
 
-     .cpu_funcs_name = @{ "opencl_to_cpu_func", NULL @},
 
-     .nbuffers = 1,
 
-     .modes = @{ STARPU_RW @}
 
- @};
 
- #endif
 
- struct starpu_multiformat_data_interface_ops format_ops = @{
 
- #ifdef STARPU_USE_OPENCL
 
-     .opencl_elemsize = 2 * sizeof(float),
 
-     .cpu_to_opencl_cl = &cpu_to_opencl_cl,
 
-     .opencl_to_cpu_cl = &opencl_to_cpu_cl,
 
- #endif
 
-     .cpu_elemsize = 2 * sizeof(float),
 
-     ...
 
- @};
 
- starpu_multiformat_data_register(&handle, 0, &array_of_structs, NX, &format_ops);
 
- @end smallexample
 
- @end cartouche
 
- Kernels can be written almost as for any other interface. Note that
 
- @code{STARPU_MULTIFORMAT_GET_CPU_PTR} shall only be used for CPU kernels. CUDA kernels

- must use @code{STARPU_MULTIFORMAT_GET_CUDA_PTR}, and OpenCL kernels must use

- @code{STARPU_MULTIFORMAT_GET_OPENCL_PTR}. @code{STARPU_MULTIFORMAT_GET_NX} may be used in any
 
- kind of kernel.
 
- @cartouche
 
- @smallexample
 
- static void
 
- multiformat_scal_cpu_func(void *buffers[], void *args)
 
- @{
 
-     struct point *aos;
 
-     unsigned int n;
 
-     aos = STARPU_MULTIFORMAT_GET_CPU_PTR(buffers[0]);
 
-     n = STARPU_MULTIFORMAT_GET_NX(buffers[0]);
 
-     ...
 
- @}
 
- extern "C" void multiformat_scal_cuda_func(void *buffers[], void *_args)
 
- @{
 
-     unsigned int n;
 
-     struct struct_of_arrays *soa;
 
-     soa = (struct struct_of_arrays *) STARPU_MULTIFORMAT_GET_CUDA_PTR(buffers[0]);
 
-     n = STARPU_MULTIFORMAT_GET_NX(buffers[0]);
 
-     ...
 
- @}
 
- @end smallexample
 
- @end cartouche
 
- A full example may be found in @code{examples/basic_examples/multiformat.c}.
 
- @node Using the Driver API
 
- @section Using the Driver API
 
- The application can run StarPU drivers by itself (@pxref{Running drivers}), for instance:
 
- @cartouche
 
- @smallexample
 
- int ret;
 
- struct starpu_driver d = @{
 
-     .type = STARPU_CUDA_WORKER,
 
-     .id.cuda_id = 0
 
- @};
 
- ret = starpu_driver_init(&d);
 
- if (ret != 0)
 
-     error();
 
- while (some_condition) @{
 
-     ret = starpu_driver_run_once(&d);
 
-     if (ret != 0)
 
-         error();
 
- @}
 
- ret = starpu_driver_deinit(&d);
 
- if (ret != 0)
 
-     error();
 
- @end smallexample
 
- @end cartouche
 
- @node Defining a New Scheduling Policy
 
- @section Defining a New Scheduling Policy
 
- A full example showing how to define a new scheduling policy is available in
 
- the StarPU sources in the directory @code{examples/scheduler/}.
 
- @pxref{Scheduling Policy}
 
- @cartouche
 
- @smallexample
 
- static struct starpu_sched_policy dummy_sched_policy = @{
 
-     .init_sched = init_dummy_sched,
 
-     .deinit_sched = deinit_dummy_sched,
 
-     .add_workers = dummy_sched_add_workers,
 
-     .remove_workers = dummy_sched_remove_workers,
 
-     .push_task = push_task_dummy,
 
-     .push_prio_task = NULL,
 
-     .pop_task = pop_task_dummy,
 
-     .post_exec_hook = NULL,
 
-     .pop_every_task = NULL,
 
-     .policy_name = "dummy",
 
-     .policy_description = "dummy scheduling strategy"
 
- @};
 
- @end smallexample
 
- @end cartouche
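
- A sketch of what @code{push_task_dummy} and @code{pop_task_dummy} may look

- like, assuming a single global list protected by a mutex (the exact prototypes

- depend on the StarPU version; the full code is in @code{examples/scheduler/}):

- @cartouche

- @smallexample

- /* Global list, initialized with starpu_task_list_init() in init_dummy_sched */

- static struct starpu_task_list sched_list;

- static starpu_pthread_mutex_t sched_mutex;

- static starpu_pthread_cond_t sched_cond;

- static int push_task_dummy(struct starpu_task *task)

- @{

-     starpu_pthread_mutex_lock(&sched_mutex);

-     starpu_task_list_push_front(&sched_list, task);

-     /* Wake up a worker waiting for work */

-     starpu_pthread_cond_signal(&sched_cond);

-     starpu_pthread_mutex_unlock(&sched_mutex);

-     return 0;

- @}

- static struct starpu_task *pop_task_dummy(unsigned sched_ctx_id)

- @{

-     /* Returns NULL when the list is empty */

-     struct starpu_task *task;

-     starpu_pthread_mutex_lock(&sched_mutex);

-     task = starpu_task_list_pop_back(&sched_list);

-     starpu_pthread_mutex_unlock(&sched_mutex);

-     return task;

- @}

- @end smallexample

- @end cartouche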
 
- @node On-GPU rendering
 
- @section On-GPU rendering
 
- Graphical-oriented applications need to draw the result of their computations,
 
- typically on the very GPU where these happened. Technologies such as OpenGL/CUDA
 
- interoperability permit CUDA to work directly on the OpenGL buffers, making
 
- them thus immediately ready for drawing, by mapping OpenGL buffer, textures or
 
- renderbuffer objects into CUDA.  CUDA however imposes some technical
 
- constraints: peer memcpy has to be disabled, and the thread that runs OpenGL has
 
- to be the one that runs CUDA computations for that GPU.
 
- To achieve this with StarPU, pass the @code{--disable-cuda-memcpy-peer} option

- to @code{./configure} (TODO: make it dynamic) and initialize OpenGL/GLUT

- first. The interoperability mode then has to

- be enabled by using the @code{cuda_opengl_interoperability} field of the

- @code{starpu_conf} structure, and the driver loop has to be run by

- the application, by using the @code{not_launched_drivers} field of

- @code{starpu_conf} to prevent StarPU from running it in a separate thread, and

- by using @code{starpu_driver_run} to run the loop. The @code{gl_interop} and

- @code{gl_interop_idle} examples show how this fits together in a simple case, where
 
- rendering is done in task callbacks. The former uses @code{glutMainLoopEvent}
 
- to make GLUT progress from the StarPU driver loop, while the latter uses
 
- @code{glutIdleFunc} to make StarPU progress from the GLUT main loop.
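
- A sketch of the corresponding initialization (field names as in recent StarPU

- versions; check the @code{starpu_conf} definition for your release):

- @cartouche

- @smallexample

- unsigned interop_devices[] = @{ 0 @};  /* CUDA device 0 does the rendering */

- struct starpu_driver drivers[] = @{ @{ .type = STARPU_CUDA_WORKER, .id.cuda_id = 0 @} @};

- struct starpu_conf conf;

- starpu_conf_init(&conf);

- conf.cuda_opengl_interoperability = interop_devices;

- conf.n_cuda_opengl_interoperability = 1;

- /* Do not launch this driver in a StarPU thread; the application runs it */

- conf.not_launched_drivers = drivers;

- conf.n_not_launched_drivers = 1;

- starpu_init(&conf);

- /* ... later, from the application thread that owns the OpenGL context: */

- starpu_driver_run(&drivers[0]);

- @end smallexample

- @end cartouche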
 
- Then, to use an OpenGL buffer as a CUDA data, StarPU simply needs to be given
 
- the CUDA pointer at registration, for instance:
 
- @cartouche
 
- @smallexample
 
- /* Get the CUDA worker id */
 
- for (workerid = 0; workerid < starpu_worker_get_count(); workerid++)
 
-         if (starpu_worker_get_type(workerid) == STARPU_CUDA_WORKER)
 
-                 break;
 
- /* Build a CUDA pointer pointing at the OpenGL buffer */
 
- cudaGraphicsResourceGetMappedPointer((void**)&output, &num_bytes, resource);
 
- /* And register it to StarPU */
 
- starpu_vector_data_register(&handle, starpu_worker_get_memory_node(workerid),
 
-                             output, num_bytes / sizeof(float4), sizeof(float4));
 
- /* The handle can now be used as usual */
 
- starpu_insert_task(&cl, STARPU_RW, handle, 0);
 
- /* ... */
 
- /* This gets back data into the OpenGL buffer */
 
- starpu_data_unregister(handle);
 
- @end smallexample
 
- @end cartouche
 
- and display it e.g. in the callback function.
 
- @node Defining a New Data Interface
 
- @section Defining a New Data Interface
 
- Let's define a new data interface to manage complex numbers.
 
- @cartouche
 
- @smallexample
 
- /* interface for complex numbers */
 
- struct starpu_complex_interface
 
- @{
 
-         double *real;
 
-         double *imaginary;
 
-         int nx;
 
- @};
 
- @end smallexample
 
- @end cartouche
 
- Registering such data with StarPU is easily done using the function
 
- @code{starpu_data_register} (@pxref{Basic Data Management API}). The last
 
- parameter of the function, @code{interface_complex_ops}, will be
 
- described below.
 
- @cartouche
 
- @smallexample
 
- void starpu_complex_data_register(starpu_data_handle_t *handle,
 
-      unsigned home_node, double *real, double *imaginary, int nx)
 
- @{
 
-         struct starpu_complex_interface complex =
 
-         @{
 
-                 .real = real,
 
-                 .imaginary = imaginary,
 
-                 .nx = nx
 
-         @};
 
-         if (interface_complex_ops.interfaceid == STARPU_UNKNOWN_INTERFACE_ID)
 
-         @{
 
-                 interface_complex_ops.interfaceid = starpu_data_interface_get_next_id();
 
-         @}
 
-         starpu_data_register(handle, home_node, &complex, &interface_complex_ops);
 
- @}
 
- @end smallexample
 
- @end cartouche
 
- Different operations need to be defined for a data interface through
 
- the type @code{struct starpu_data_interface_ops} (@pxref{Defining
 
- Interface}). We only define here the basic operations needed to
 
- run simple applications. The source code for the different functions
 
- can be found in the file
 
- @code{examples/interface/complex_interface.c}.
 
- @cartouche
 
- @smallexample
 
- static struct starpu_data_interface_ops interface_complex_ops =
 
- @{
 
-         .register_data_handle = complex_register_data_handle,
 
-         .allocate_data_on_node = complex_allocate_data_on_node,
 
-         .copy_methods = &complex_copy_methods,
 
-         .get_size = complex_get_size,
 
-         .footprint = complex_footprint,
 
-         .interfaceid = STARPU_UNKNOWN_INTERFACE_ID,
 
-         .interface_size = sizeof(struct starpu_complex_interface),
 
- @};
 
- @end smallexample
 
- @end cartouche
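
- For instance, the size and footprint methods can be written as follows (a

- sketch close to the actual @code{complex_interface.c}):

- @cartouche

- @smallexample

- static size_t complex_get_size(starpu_data_handle_t handle)

- @{

-     struct starpu_complex_interface *complex_interface =

-       (struct starpu_complex_interface *) starpu_data_get_interface_on_node(handle, 0);

-     /* Real and imaginary parts: nx doubles each */

-     return 2 * complex_interface->nx * sizeof(double);

- @}

- static uint32_t complex_footprint(starpu_data_handle_t handle)

- @{

-     /* Hash the number of elements, which determines the task size */

-     return starpu_hash_crc32c_be(starpu_complex_get_nx(handle), 0);

- @}

- @end smallexample

- @end cartouche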
 
- Functions need to be defined to access the different fields of the
 
- complex interface from a StarPU data handle.
 
- @cartouche
 
- @smallexample
 
- double *starpu_complex_get_real(starpu_data_handle_t handle)
 
- @{
 
-         struct starpu_complex_interface *complex_interface =
 
-           (struct starpu_complex_interface *) starpu_data_get_interface_on_node(handle, 0);
 
-         return complex_interface->real;
 
- @}
 
- double *starpu_complex_get_imaginary(starpu_data_handle_t handle);
 
- int starpu_complex_get_nx(starpu_data_handle_t handle);
 
- @end smallexample
 
- @end cartouche
 
- Similar functions need to be defined to access the different fields of the
 
- complex interface from a @code{void *} pointer to be used within codelet
 
- implementations.
 
- @cartouche
 
- @smallexample
 
- #define STARPU_COMPLEX_GET_REAL(interface)	\
 
-         (((struct starpu_complex_interface *)(interface))->real)
 
- #define STARPU_COMPLEX_GET_IMAGINARY(interface)	\
 
-         (((struct starpu_complex_interface *)(interface))->imaginary)
 
- #define STARPU_COMPLEX_GET_NX(interface)	\
 
-         (((struct starpu_complex_interface *)(interface))->nx)
 
- @end smallexample
 
- @end cartouche
 
- Complex data interfaces can then be registered to StarPU.
 
- @cartouche
 
- @smallexample
 
- double real = 45.0;
 
- double imaginary = 12.0;
 
- starpu_complex_data_register(&handle1, 0, &real, &imaginary, 1);
 
- starpu_insert_task(&cl_display, STARPU_R, handle1, 0);
 
- @end smallexample
 
- @end cartouche
 
- and used by codelets.
 
- @cartouche
 
- @smallexample
 
- void display_complex_codelet(void *descr[], __attribute__ ((unused)) void *_args)
 
- @{
 
-         int nx = STARPU_COMPLEX_GET_NX(descr[0]);
 
-         double *real = STARPU_COMPLEX_GET_REAL(descr[0]);
 
-         double *imaginary = STARPU_COMPLEX_GET_IMAGINARY(descr[0]);
 
-         int i;
 
-         for(i=0 ; i<nx ; i++)
 
-         @{
 
-                 fprintf(stderr, "Complex[%d] = %3.2f + %3.2f i\n", i, real[i], imaginary[i]);
 
-         @}
 
- @}
 
- @end smallexample
 
- @end cartouche
 
- The whole code for this complex data interface is available in the
 
- directory @code{examples/interface/}.
 
- @node Setting the Data Handles for a Task
 
- @section Setting the Data Handles for a Task
 
- The number of data a task can manage is fixed by

- @code{STARPU_NMAXBUFS}, whose default value can be changed
 
- through the configure option @code{--enable-maxbuffers} (see
 
- @ref{--enable-maxbuffers}).
 
- However, it is possible to define tasks managing more data by using
 
- the field @code{dyn_handles} when defining a task and the field
 
- @code{dyn_modes} when defining the corresponding codelet.
 
- @cartouche
 
- @smallexample
 
- enum starpu_data_access_mode modes[STARPU_NMAXBUFS+1] = @{
 
- 	STARPU_R, STARPU_R, ...
 
- @};
 
- struct starpu_codelet dummy_big_cl =
 
- @{
 
- 	.cuda_funcs = @{dummy_big_kernel, NULL@},
 
- 	.opencl_funcs = @{dummy_big_kernel, NULL@},
 
- 	.cpu_funcs = @{dummy_big_kernel, NULL@},
 
- 	.cpu_funcs_name = @{"dummy_big_kernel", NULL@},
 
- 	.nbuffers = STARPU_NMAXBUFS+1,
 
- 	.dyn_modes = modes
 
- @};
 
- task = starpu_task_create();
 
- task->cl = &dummy_big_cl;
 
- task->dyn_handles = malloc(task->cl->nbuffers * sizeof(starpu_data_handle_t));
 
- for(i=0 ; i<task->cl->nbuffers ; i++)
 
- @{
 
- 	task->dyn_handles[i] = handle;
 
- @}
 
- starpu_task_submit(task);
 
- @end smallexample
 
- @end cartouche
 
- @cartouche
 
- @smallexample
 
- starpu_data_handle_t *handles = malloc(dummy_big_cl.nbuffers * sizeof(starpu_data_handle_t));
 
- for(i=0 ; i<dummy_big_cl.nbuffers ; i++)
 
- @{
 
- 	handles[i] = handle;
 
- @}
 
- starpu_insert_task(&dummy_big_cl,
 
-         	 STARPU_VALUE, &dummy_big_cl.nbuffers, sizeof(dummy_big_cl.nbuffers),
 
- 		 STARPU_DATA_ARRAY, handles, dummy_big_cl.nbuffers,
 
- 		 0);
 
- @end smallexample
 
- @end cartouche
 
- The whole code for this example is available in the

- file @code{examples/basic_examples/dynamic_handles.c}.
 
- @node More examples
 
- @section More examples
 
- More examples are available in the StarPU sources in the @code{examples/}
 
- directory. Simple examples include:
 
- @table @asis
 
- @item @code{incrementer/}:
 
-     Trivial incrementation test.
 
- @item @code{basic_examples/}:
 
-         Simple documented Hello world and vector/scalar product (as
 
-         shown in @ref{Basic Examples}), matrix
 
-         product examples (as shown in @ref{Performance model example}), an example using the blocked matrix data
 
-         interface, an example using the variable data interface, and an example
 
-         using different formats on CPUs and GPUs.
 
- @item @code{matvecmult/}:
 
-     OpenCL example from NVidia, adapted to StarPU.
 
- @item @code{axpy/}:
 
-     AXPY CUBLAS operation adapted to StarPU.
 
- @item @code{fortran/}:
 
-     Example of Fortran bindings.
 
- @end table
 
- More advanced examples include:
 
- @table @asis
 
- @item @code{filters/}:
 
-     Examples using filters, as shown in @ref{Partitioning Data}.
 
- @item @code{lu/}:
 
-     LU matrix factorization, see for instance @code{xlu_implicit.c}
 
- @item @code{cholesky/}:
 
-     Cholesky matrix factorization, see for instance @code{cholesky_implicit.c}.
 
- @end table
 
 