| 1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150 | @c -*-texinfo-*-@c This file is part of the StarPU Handbook.@c Copyright (C) 2009--2011  Universit@'e de Bordeaux 1@c Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique@c Copyright (C) 2011 Institut National de Recherche en Informatique et Automatique@c See the file starpu.texi for copying conditions.@menu* Using multiple implementations of a codelet::* Enabling implementation according to capabilities::* Task and Worker Profiling::* Partitioning Data::* Performance model example::* Theoretical lower bound on execution time::* Insert Task Utility::* Data reduction::* Temporary buffers::* Parallel Tasks::* Debugging::* The multiformat interface::* On-GPU rendering::* More examples::               More examples shipped with StarPU@end menu@node Using multiple implementations of a codelet@section Using multiple implementations of a codeletOne may want to write multiple implementations of a codelet for a single type ofdevice and let StarPU choose which one to run. As an example, we will show howto use SSE to scale a vector. The codelet can be written as follows:@cartouche@smallexample#include <xmmintrin.h>void scal_sse_func(void *buffers[], void *cl_arg)@{    float *vector = (float *) STARPU_VECTOR_GET_PTR(buffers[0]);    unsigned int n = STARPU_VECTOR_GET_NX(buffers[0]);    unsigned int n_iterations = n/4;    if (n % 4 != 0)        n_iterations++;    __m128 *VECTOR = (__m128*) vector;    __m128 factor __attribute__((aligned(16)));    factor = _mm_set1_ps(*(float *) cl_arg);    unsigned int i;    for (i = 0; i < n_iterations; i++)        VECTOR[i] = _mm_mul_ps(factor, VECTOR[i]);@}@end smallexample@end cartouche@cartouche@smallexamplestruct starpu_codelet cl = @{    .where = STARPU_CPU,    .cpu_funcs = @{ scal_cpu_func, scal_sse_func, NULL @},    .nbuffers = 1,    .modes = @{ STARPU_RW @}@};@end smallexample@end cartoucheSchedulers which are multi-implementation aware (only @code{dmda}, @code{heft}and @code{pheft} for now) will use the performance models of all theimplementations it was given, and pick the one that seems to be the fastest.@node Enabling implementation according to capabilities@section Enabling implementation according to capabilitiesSome implementations may not run on some devices. For instance, some CUDAdevices do not support double floating point precision, and thus the kernelexecution would just fail; or the device may not have enough shared memory forthe implementation being used. The @code{can_execute} field of the @code{structstarpu_codelet} structure permits to express this. For instance:@cartouche@smallexamplestatic int can_execute(unsigned workerid, struct starpu_task *task, unsigned nimpl)@{  const struct cudaDeviceProp *props;  if (starpu_worker_get_type(workerid) == STARPU_CPU_WORKER)    return 1;  /* Cuda device */  props = starpu_cuda_get_device_properties(workerid);  if (props->major >= 2 || props->minor >= 3)    /* At least compute capability 1.3, supports doubles */    return 1;  /* Old card, does not support doubles */  return 0;@}struct starpu_codelet cl = @{    .where = STARPU_CPU|STARPU_CUDA,    .can_execute = can_execute,    .cpu_funcs = @{ cpu_func, NULL @},    .cuda_funcs = @{ gpu_func, NULL @}    .nbuffers = 1,    .modes = @{ STARPU_RW @}@};@end smallexample@end cartoucheThis can be essential e.g. when running on a machine which mixes various modelsof CUDA devices, to take benefit from the new models without crashing on old models.Note: the @code{can_execute} function is called by the scheduler each time ittries to match a task with a worker, and should thus be very fast. The@code{starpu_cuda_get_device_properties} provides a quick access to CUDAproperties of CUDA devices to achieve such efficiency.Another example is compiling CUDA code for various compute capabilities,resulting with two CUDA functions, e.g. @code{scal_gpu_13} for compute capability1.3, and @code{scal_gpu_20} for compute capability 2.0. Both functions can beprovided to StarPU by using @code{cuda_funcs}, and @code{can_execute} can then beused to rule out the @code{scal_gpu_20} variant on a CUDA device whichwill not be able to execute it:@cartouche@smallexamplestatic int can_execute(unsigned workerid, struct starpu_task *task, unsigned nimpl)@{  const struct cudaDeviceProp *props;  if (starpu_worker_get_type(workerid) == STARPU_CPU_WORKER)    return 1;  /* Cuda device */  if (nimpl == 0)    /* Trying to execute the 1.3 capability variant, we assume it is ok in all cases.  */    return 1;  /* Trying to execute the 2.0 capability variant, check that the card can do it.  */  props = starpu_cuda_get_device_properties(workerid);  if (props->major >= 2 || props->minor >= 0)    /* At least compute capability 2.0, can run it */    return 1;  /* Old card, does not support 2.0, will not be able to execute the 2.0 variant.  */  return 0;@}struct starpu_codelet cl = @{    .where = STARPU_CPU|STARPU_CUDA,    .can_execute = can_execute,    .cpu_funcs = @{ cpu_func, NULL @},    .cuda_funcs = @{ scal_gpu_13, scal_gpu_20, NULL @},    .nbuffers = 1,    .modes = @{ STARPU_RW @}@};@end smallexample@end cartoucheNote: the most generic variant should be provided first, as some schedulers arenot able to try the different variants.@node Task and Worker Profiling@section Task and Worker ProfilingA full example showing how to use the profiling API is available inthe StarPU sources in the directory @code{examples/profiling/}.@cartouche@smallexamplestruct starpu_task *task = starpu_task_create();task->cl = &cl;task->synchronous = 1;/* We will destroy the task structure by hand so that we can * query the profiling info before the task is destroyed. */task->destroy = 0;/* Submit and wait for completion (since synchronous was set to 1) */starpu_task_submit(task);/* The task is finished, get profiling information */struct starpu_task_profiling_info *info = task->profiling_info;/* How much time did it take before the task started ? */double delay += starpu_timing_timespec_delay_us(&info->submit_time, &info->start_time);/* How long was the task execution ? */double length += starpu_timing_timespec_delay_us(&info->start_time, &info->end_time);/* We don't need the task structure anymore */starpu_task_destroy(task);@end smallexample@end cartouche@cartouche@smallexample/* Display the occupancy of all workers during the test */int worker;for (worker = 0; worker < starpu_worker_get_count(); worker++)@{        struct starpu_worker_profiling_info worker_info;        int ret = starpu_worker_get_profiling_info(worker, &worker_info);        STARPU_ASSERT(!ret);        double total_time = starpu_timing_timespec_to_us(&worker_info.total_time);        double executing_time = starpu_timing_timespec_to_us(&worker_info.executing_time);        double sleeping_time = starpu_timing_timespec_to_us(&worker_info.sleeping_time);        double overhead_time = total_time - executing_time - sleeping_time;        float executing_ratio = 100.0*executing_time/total_time;        float sleeping_ratio = 100.0*sleeping_time/total_time;	float overhead_ratio = 100.0 - executing_ratio - sleeping_ratio;        char workername[128];        starpu_worker_get_name(worker, workername, 128);        fprintf(stderr, "Worker %s:\n", workername);        fprintf(stderr, "\ttotal time: %.2lf ms\n", total_time*1e-3);        fprintf(stderr, "\texec time: %.2lf ms (%.2f %%)\n", executing_time*1e-3,                executing_ratio);        fprintf(stderr, "\tblocked time: %.2lf ms (%.2f %%)\n", sleeping_time*1e-3,                sleeping_ratio);        fprintf(stderr, "\toverhead time: %.2lf ms (%.2f %%)\n", overhead_time*1e-3,                overhead_ratio);@}@end smallexample@end cartouche@node Partitioning Data@section Partitioning DataAn existing piece of data can be partitioned in sub parts to be used by different tasks, for instance:@cartouche@smallexampleint vector[NX];starpu_data_handle_t handle;/* Declare data to StarPU */starpu_vector_data_register(&handle, 0, (uintptr_t)vector,                            NX, sizeof(vector[0]));/* Partition the vector in PARTS sub-vectors */starpu_data_filter f =@{    .filter_func = starpu_block_filter_func_vector,    .nchildren = PARTS@};starpu_data_partition(handle, &f);@end smallexample@end cartoucheThe task submission then uses @code{starpu_data_get_sub_data} to retrieve thesub-handles to be passed as tasks parameters.@cartouche@smallexample/* Submit a task on each sub-vector */for (i=0; i<starpu_data_get_nb_children(handle); i++) @{    /* Get subdata number i (there is only 1 dimension) */    starpu_data_handle_t sub_handle = starpu_data_get_sub_data(handle, 1, i);    struct starpu_task *task = starpu_task_create();    task->handles[0] = sub_handle;    task->cl = &cl;    task->synchronous = 1;    task->cl_arg = &factor;    task->cl_arg_size = sizeof(factor);    starpu_task_submit(task);@}@end smallexample@end cartouchePartitioning can be applied several times, see@code{examples/basic_examples/mult.c} and @code{examples/filters/}.Wherever the whole piece of data is already available, the partitioning willbe done in-place, i.e. without allocating new buffers but just using pointersinside the existing copy. This is particularly important to be aware of whenusing OpenCL, where the kernel parameters are not pointers, but handles. Thekernel thus needs to be also passed the offset within the OpenCL buffer:@cartouche@smallexamplevoid opencl_func(void *buffers[], void *cl_arg)@{    cl_mem vector = (cl_mem) STARPU_VECTOR_GET_DEV_HANDLE(buffers[0]);    unsigned offset = STARPU_BLOCK_GET_OFFSET(buffers[0]);    ...    clSetKernelArg(kernel, 0, sizeof(vector), &vector);    clSetKernelArg(kernel, 1, sizeof(offset), &offset);    ...@}@end smallexample@end cartoucheAnd the kernel has to shift from the pointer passed by the OpenCL driver:@cartouche@smallexample__kernel void opencl_kernel(__global int *vector, unsigned offset)@{    block = (__global void *)block + offset;    ...@}@end smallexample@end cartoucheStarPU provides various interfaces and filters for matrices, vectors, etc.,but applications can also write their own data interfaces and filters, see@code{examples/interface} and @code{examples/filters/custom_mf} for an example.@node Performance model example@section Performance model exampleTo achieve good scheduling, StarPU scheduling policies need to be able toestimate in advance the duration of a task. This is done by giving to codeletsa performance model, by defining a @code{starpu_perfmodel} structure andproviding its address in the @code{model} field of the @code{struct starpu_codelet}structure. The @code{symbol} and @code{type} fields of @code{starpu_perfmodel}are mandatory, to give a name to the model, and the type of the model, sincethere are several kinds of performance models. For compatibility, make sure toinitialize the whole structure to zero, either by using explicit memset, or byletting the compiler implicitly do it as examplified below.@itemize@itemMeasured at runtime (@code{STARPU_HISTORY_BASED} model type). This assumes that for agiven set of data input/output sizes, the performance will always be about thesame. This is very true for regular kernels on GPUs for instance (<0.1% error),and just a bit less true on CPUs (~=1% error). This also assumes that there arefew different sets of data input/output sizes. StarPU will then keep record ofthe average time of previous executions on the various processing units, and useit as an estimation. History is done per task size, by using a hash of the inputand ouput sizes as an index.It will also save it in @code{$HOME/.starpu/sampling/codelets}for further executions (@code{$USERPROFILE/.starpu/sampling/codelets} in windowsenvironments), and can be observed by using the@code{starpu_perfmodel_display} command, or drawn by usingthe @code{starpu_perfmodel_plot} (@pxref{Performance model calibration}).  Themodels are indexed by machine name. Toshare the models between machines (e.g. for a homogeneous cluster), use@code{export STARPU_HOSTNAME=some_global_name}. Measurements are only donewhen using a task scheduler which makes use of it, such as @code{heft} or@code{dmda}. Measurements can also be provided explicitly by the application, byusing the @code{starpu_perfmodel_update_history} function.The following is a small code example.If e.g. the code is recompiled with other compilation options, or severalvariants of the code are used, the symbol string should be changed to reflectthat, in order to recalibrate a new model from zero. The symbol string can evenbe constructed dynamically at execution time, as long as this is done beforesubmitting any task using it.@cartouche@smallexamplestatic struct starpu_perfmodel mult_perf_model = @{    .type = STARPU_HISTORY_BASED,    .symbol = "mult_perf_model"@};struct starpu_codelet cl = @{    .where = STARPU_CPU,    .cpu_funcs = @{ cpu_mult, NULL @},    .nbuffers = 3,    .modes = @{ STARPU_R, STARPU_R, STARPU_W @},    /* for the scheduling policy to be able to use performance models */    .model = &mult_perf_model@};@end smallexample@end cartouche@itemMeasured at runtime and refined by regression (@code{STARPU_*REGRESSION_BASED}model type). This still assumes performance regularity, but workswith various data input sizes, by applying regression over observedexecution times. STARPU_REGRESSION_BASED uses an a*n^b regressionform, STARPU_NL_REGRESSION_BASED uses an a*n^b+c (more precise thanSTARPU_REGRESSION_BASED, but costs a lot more to compute).For instance,@code{tests/perfmodels/regression_based.c} uses a regression-based performancemodel for the @code{memset} operation.Of course, the application has to issuetasks with varying size so that the regression can be computed. StarPU will nottrust the regression unless there is at least 10% difference between the minimumand maximum observed input size. It can be useful to set the@code{STARPU_CALIBRATE} environment variable to @code{1} and run the applicationon varying input sizes with @code{STARPU_SCHED} set to @code{eager} scheduler,so as to feed the performance model for a variety ofinputs. The application can also provide the measurements explictly by using@code{starpu_perfmodel_update_history}. The @code{starpu_perfmodel_display} and@code{starpu_perfmodel_plot}tools can be used to observe how much the performance model is calibrated (@pxref{Performance model calibration}); whentheir output look good, @code{STARPU_CALIBRATE} can be reset to @code{0} to letStarPU use the resulting performance model without recording new measures, and@code{STARPU_SCHED} can be set to @code{heft} to benefit from the performance models. Ifthe data input sizes vary a lot, it is really important to set@code{STARPU_CALIBRATE} to @code{0}, otherwise StarPU will continue adding themeasures, and result with a very big performance model, which will take time alot of time to load and save.For non-linear regression, since computing itis quite expensive, it is only done at termination of the application. Thismeans that the first execution of the application will use only history-basedperformance model to perform scheduling, without using regression.@itemProvided as an estimation from the application itself (@code{STARPU_COMMON} model type and @code{cost_function} field),see for instance@code{examples/common/blas_model.h} and @code{examples/common/blas_model.c}.@itemProvided explicitly by the application (@code{STARPU_PER_ARCH} model type): the@code{.per_arch[arch][nimpl].cost_function} fields have to be filled with pointers tofunctions which return the expected duration of the task in micro-seconds, oneper architecture.@end itemizeFor the @code{STARPU_HISTORY_BASED} and @code{STARPU_*REGRESSION_BASE},the total size of task data (both input and output) is used as an index bydefault. The @code{size_base} field of @code{struct starpu_perfmodel} howeverpermits the application to override that, when for instance some of the datado not matter for task cost (e.g. mere reference table), or when using sparsestructures (in which case it is the number of non-zeros which matter), or whenthere is some hidden parameter such as the number of iterations, etc.How to use schedulers which can benefit from such performance model is explainedin @ref{Task scheduling policy}.The same can be done for task power consumption estimation, by setting the@code{power_model} field the same way as the @code{model} field. Note: fornow, the application has to give to the power consumption performance modela name which is different from the execution time performance model.The application can request time estimations from the StarPU performancemodels by filling a task structure as usual without actually submittingit. The data handles can be created by calling @code{starpu_data_register}functions with a @code{NULL} pointer (and need to be unregistered as usual)and the desired data sizes. The @code{starpu_task_expected_length} and@code{starpu_task_expected_power} functions can then be called to get anestimation of the task duration on a given arch. @code{starpu_task_destroy}needs to be called to destroy the dummy task afterwards. See@code{tests/perfmodels/regression_based.c} for an example.@node Theoretical lower bound on execution time@section Theoretical lower bound on execution timeFor kernels with history-based performance models (and provided that they are completely calibrated), StarPU can very easily provide a theoretical lowerbound for the execution time of a whole set of tasks. See forinstance @code{examples/lu/lu_example.c}: before submitting tasks,call @code{starpu_bound_start}, and after complete execution, call@code{starpu_bound_stop}. @code{starpu_bound_print_lp} or@code{starpu_bound_print_mps} can then be used to output a Linear Programmingproblem corresponding to the schedule of your tasks. Run it through@code{lp_solve} or any other linear programming solver, and that will give you alower bound for the total execution time of your tasks. If StarPU was compiledwith the glpk library installed, @code{starpu_bound_compute} can be used tosolve it immediately and get the optimized minimum, in ms. Its @code{integer}parameter allows to decide whether integer resolution should be computedand returned too.The @code{deps} parameter tells StarPU whether to take tasks, implicit data, and tagdependencies into account. Tags released in a callback or similarare not taken into account, only tags associated with a task are.It must be understood that the linear programmingproblem size is quadratic with the number of tasks and thus the time to solve itwill be very long, it could be minutes for just a few dozen tasks. You shouldprobably use @code{lp_solve -timeout 1 test.pl -wmps test.mps} to convert theproblem to MPS format and then use a better solver, @code{glpsol} might bebetter than @code{lp_solve} for instance (the @code{--pcost} option may beuseful), but sometimes doesn't manage to converge. @code{cbc} might lookslower, but it is parallel. Be sure to try at least all the @code{-B} optionsof @code{lp_solve}. For instance, we often just use@code{lp_solve -cc -B1 -Bb -Bg -Bp -Bf -Br -BG -Bd -Bs -BB -Bo -Bc -Bi} , andthe @code{-gr} option can also be quite useful.Data transfer time can only be taken into account when @code{deps} is set. Onlydata transfers inferred from implicit data dependencies between tasks are takeninto account.Setting @code{deps} to 0 will only take into account the actual computationson processing units. It however still properly takes into account the varyingperformances of kernels and processing units, which is quite more accurate thanjust comparing StarPU performances with the fastest of the kernels being used.The @code{prio} parameter tells StarPU whether to simulate taking into accountthe priorities as the StarPU scheduler would, i.e. schedule prioritizedtasks before less prioritized tasks, to check to which extend this resultsto a less optimal solution. This increases even more computation time.Note that for simplicity, all this however doesn't take into account datatransfers, which are assumed to be completely overlapped.@node Insert Task Utility@section Insert Task UtilityStarPU provides the wrapper function @code{starpu_insert_task} to easethe creation and submission of tasks.@deftypefun int starpu_insert_task (struct starpu_codelet *@var{cl}, ...)Create and submit a task corresponding to @var{cl} with the followingarguments.  The argument list must be zero-terminated.The arguments following the codelets can be of the following types:@itemize@item@code{STARPU_R}, @code{STARPU_W}, @code{STARPU_RW}, @code{STARPU_SCRATCH}, @code{STARPU_REDUX} an access mode followed by a data handle;@item@code{STARPU_DATA_ARRAY} followed by an array of data handles and its number of elements;@itemthe specific values @code{STARPU_VALUE}, @code{STARPU_CALLBACK},@code{STARPU_CALLBACK_ARG}, @code{STARPU_CALLBACK_WITH_ARG},@code{STARPU_PRIORITY}, @code{STARPU_TAG}, followed by the appropriated objectsas defined below.@end itemizeWhen using @code{STARPU_DATA_ARRAY}, the access mode of the datahandles is not defined.Parameters to be passed to the codelet implementation are definedthrough the type @code{STARPU_VALUE}. The function@code{starpu_codelet_unpack_args} must be called within the codeletimplementation to retrieve them.@end deftypefun@defmac STARPU_VALUEthis macro is used when calling @code{starpu_insert_task}, and must befollowed by a pointer to a constant value and the size of the constant@end defmac@defmac STARPU_CALLBACKthis macro is used when calling @code{starpu_insert_task}, and must befollowed by a pointer to a callback function@end defmac@defmac STARPU_CALLBACK_ARGthis macro is used when calling @code{starpu_insert_task}, and must befollowed by a pointer to be given as an argument to the callbackfunction@end defmac@defmac  STARPU_CALLBACK_WITH_ARGthis macro is used when calling @code{starpu_insert_task}, and must befollowed by two pointers: one to a callback function, and the other tobe given as an argument to the callback function; this is equivalentto using both @code{STARPU_CALLBACK} and@code{STARPU_CALLBACK_WITH_ARG}@end defmac@defmac STARPU_PRIORITYthis macro is used when calling @code{starpu_insert_task}, and must befollowed by a integer defining a priority level@end defmac@defmac STARPU_TAGthis macro is used when calling @code{starpu_insert_task}, and must befollowed by a tag.@end defmac@deftypefun void starpu_codelet_pack_args ({char **}@var{arg_buffer}, {size_t *}@var{arg_buffer_size}, ...)Pack arguments of type @code{STARPU_VALUE} into a buffer which can begiven to a codelet and later unpacked with the function@code{starpu_codelet_unpack_args} defined below.@end deftypefun@deftypefun void starpu_codelet_unpack_args ({void *}@var{cl_arg}, ...)Retrieve the arguments of type @code{STARPU_VALUE} associated to atask automatically created using the function@code{starpu_insert_task} defined above.@end deftypefunHere the implementation of the codelet:@smallexamplevoid func_cpu(void *descr[], void *_args)@{        int *x0 = (int *)STARPU_VARIABLE_GET_PTR(descr[0]);        float *x1 = (float *)STARPU_VARIABLE_GET_PTR(descr[1]);        int ifactor;        float ffactor;        starpu_codelet_unpack_args(_args, &ifactor, &ffactor);        *x0 = *x0 * ifactor;        *x1 = *x1 * ffactor;@}struct starpu_codelet mycodelet = @{        .where = STARPU_CPU,        .cpu_funcs = @{ func_cpu, NULL @},        .nbuffers = 2,        .modes = @{ STARPU_RW, STARPU_RW @}@};@end smallexampleAnd the call to the @code{starpu_insert_task} wrapper:@smallexamplestarpu_insert_task(&mycodelet,                   STARPU_VALUE, &ifactor, sizeof(ifactor),                   STARPU_VALUE, &ffactor, sizeof(ffactor),                   STARPU_RW, data_handles[0], STARPU_RW, data_handles[1],                   0);@end smallexampleThe call to @code{starpu_insert_task} is equivalent to the followingcode:@smallexamplestruct starpu_task *task = starpu_task_create();task->cl = &mycodelet;task->handles[0] = data_handles[0];task->handles[1] = data_handles[1];char *arg_buffer;size_t arg_buffer_size;starpu_codelet_pack_args(&arg_buffer, &arg_buffer_size,                    STARPU_VALUE, &ifactor, sizeof(ifactor),                    STARPU_VALUE, &ffactor, sizeof(ffactor),                    0);task->cl_arg = arg_buffer;task->cl_arg_size = arg_buffer_size;int ret = starpu_task_submit(task);@end smallexampleHere a similar call using @code{STARPU_DATA_ARRAY}.@smallexamplestarpu_insert_task(&mycodelet,                   STARPU_DATA_ARRAY, data_handles, 2,                   STARPU_VALUE, &ifactor, sizeof(ifactor),                   STARPU_VALUE, &ffactor, sizeof(ffactor),                   0);@end smallexampleIf some part of the task insertion depends on the value of some computation,the @code{STARPU_DATA_ACQUIRE_CB} macro can be very convenient. Forinstance, assuming that the index variable @code{i} was registered as handle@code{i_handle}:@smallexample/* Compute which portion we will work on, e.g. pivot */starpu_insert_task(&which_index, STARPU_W, i_handle, 0);/* And submit the corresponding task */STARPU_DATA_ACQUIRE_CB(i_handle, STARPU_R,                       starpu_insert_task(&work, STARPU_RW, A_handle[i], 0));@end smallexampleThe @code{STARPU_DATA_ACQUIRE_CB} macro submits an asynchronous request foracquiring data @code{i} for the main application, and will execute the codegiven as third parameter when it is acquired. In other words, as soon as thevalue of @code{i} computed by the @code{which_index} codelet can be read, theportion of code passed as third parameter of @code{STARPU_DATA_ACQUIRE_CB} willbe executed, and is allowed to read from @code{i} to use it e.g. as anindex. Note that this macro is only avaible when compiling StarPU withthe compiler @code{gcc}.@node Data reduction@section Data reductionIn various cases, some piece of data is used to accumulate intermediateresults. For instances, the dot product of a vector, maximum/minimum finding,the histogram of a photograph, etc. When these results are produced along thewhole machine, it would not be efficient to accumulate them in only one place,incurring data transmission each and access concurrency.StarPU provides a @code{STARPU_REDUX} mode, which permits to optimizethat case: it will allocate a buffer on each memory node, and accumulateintermediate results there. When the data is eventually accessed in the normal@code{STARPU_R} mode, StarPU will collect the intermediate results in just onebuffer.For this to work, the user has to use the@code{starpu_data_set_reduction_methods} to declare how to initialize thesebuffers, and how to assemble partial results.For instance, @code{cg} uses that to optimize its dot product: it first definesthe codelets for initialization and reduction:@smallexamplestruct starpu_codelet bzero_variable_cl =@{        .cpu_funcs = @{ bzero_variable_cpu, NULL @},        .cuda_funcs = @{ bzero_variable_cuda, NULL @},        .nbuffers = 1,@}static void accumulate_variable_cpu(void *descr[], void *cl_arg)@{        double *v_dst = (double *)STARPU_VARIABLE_GET_PTR(descr[0]);        double *v_src = (double *)STARPU_VARIABLE_GET_PTR(descr[1]);        *v_dst = *v_dst + *v_src;@}static void accumulate_variable_cuda(void *descr[], void *cl_arg)@{        double *v_dst = (double *)STARPU_VARIABLE_GET_PTR(descr[0]);        double *v_src = (double *)STARPU_VARIABLE_GET_PTR(descr[1]);        cublasaxpy(1, (double)1.0, v_src, 1, v_dst, 1);        cudaStreamSynchronize(starpu_cuda_get_local_stream());@}struct starpu_codelet accumulate_variable_cl =@{        .cpu_funcs = @{ accumulate_variable_cpu, NULL @},        .cuda_funcs = @{ accumulate_variable_cuda, NULL @},        .nbuffers = 1,@}@end smallexampleand attaches them as reduction methods for its dtq handle:@smallexamplestarpu_data_set_reduction_methods(dtq_handle,        &accumulate_variable_cl, &bzero_variable_cl);@end smallexampleand dtq_handle can now be used in @code{STARPU_REDUX} mode for the dot productswith partitioned vectors:@smallexampleint dots(starpu_data_handle_t v1, starpu_data_handle_t v2,         starpu_data_handle_t s, unsigned nblocks)@{    starpu_insert_task(&bzero_variable_cl, STARPU_W, s, 0);    for (b = 0; b < nblocks; b++)        starpu_insert_task(&dot_kernel_cl,            STARPU_RW, s,            STARPU_R, starpu_data_get_sub_data(v1, 1, b),            STARPU_R, starpu_data_get_sub_data(v2, 1, b),            0);@}@end smallexampleThe @code{cg} example also uses reduction for the blocked gemv kernel, leadingto yet more relaxed dependencies and more parallelism.STARPU_REDUX can also be passed to @code{starpu_mpi_insert_task} in the MPIcase. That will however not produce any MPI communication, but just passSTARPU_REDUX to the underlying @code{starpu_insert_task}. It is up to theapplication to call @code{starpu_mpi_redux_data}, which posts tasks that willreduce the partial results among MPI nodes into the MPI node which owns thedata. For instance, some hypothetical application which collects partial resultsinto data @code{res}, then uses it for other computation, before looping againwith a new reduction:@smallexample@{    for (i = 0; i < 100; i++) @{        starpu_mpi_insert_task(MPI_COMM_WORLD, &init_res, STARPU_W, res, 0);        starpu_mpi_insert_task(MPI_COMM_WORLD, &work, STARPU_RW, A, STARPU_R, B, STARPU_REDUX, res, 0);        starpu_mpi_redux_data(MPI_COMM_WORLD, res);        starpu_mpi_insert_task(MPI_COMM_WORLD, &work2, STARPU_RW, B, STARPU_R, res, 0);    @}@}@end smallexample@node Temporary buffers@section Temporary buffersThere are two kinds of temporary buffers: temporary data which just pass resultsfrom a task to another, and scratch data which are needed only internally bytasks.@subsection Temporary dataData can sometimes be entirely produced by a task, and entirely consumed byanother task, without the need for other parts of the application to accessit. In such case, registration can be done without prior allocation, by usingthe special -1 memory node number, and passing a zero pointer. StarPU willactually allocate memory only when the task creating the content gets scheduled,and destroy it on unregistration.In addition to that, it can be tedious for the application to have to unregisterthe data, since it will not use its content anyway. The unregistration can bedone lazily by using the @code{starpu_data_unregister_submit(handle)} function,which will record that no more tasks accessing the handle will be submitted, sothat it can be freed as soon as the last task accessing it is over.The following code examplifies both points: it registers the temporarydata, submits three tasks accessing it, and records the data for automaticunregistration.@smallexamplestarpu_vector_data_register(&handle, -1, 0, n, sizeof(float));starpu_insert_task(&produce_data, STARPU_W, handle, 0);starpu_insert_task(&compute_data, STARPU_RW, handle, 0);starpu_insert_task(&summarize_data, STARPU_R, handle, STARPU_W, result_handle, 0);starpu_data_unregister_submit(handle);@end smallexample@subsection Scratch dataSome kernels sometimes need temporary data to achieve the computations, i.e. aworkspace. The application could allocate it at the start of the codeletfunction, and free it at the end, but that would be costly. It could alsoallocate one buffer per worker (similarly to @ref{Per-worker libraryinitialization}), but that would make them systematic and permanent. A moreoptimized way is to use the SCRATCH data access mode, as examplified below,which provides per-worker buffers without content consistency.@smallexamplestarpu_vector_data_register(&workspace, -1, 0, sizeof(float));for (i = 0; i < N; i++)    starpu_insert_task(&compute, STARPU_R, input[i],                       STARPU_SCRATCH, workspace, STARPU_W, output[i], 0);@end smallexampleStarPU will make sure that the buffer is allocated before executing the task,and make this allocation per-worker: for CPU workers, notably, each worker hasits own buffer. This means that each task submitted above will actually have itsown workspace, which will actually be the same for all tasks running one afterthe other on the same worker. Also, if for instance GPU memory becomes scarce,StarPU will notice that it can free such buffers easily, since the content doesnot matter.@node Parallel Tasks@section Parallel TasksStarPU can leverage existing parallel computation libraries by the means ofparallel tasks. A parallel task is a task which gets worked on by a set of CPUs(called a parallel or combined worker) at the same time, by using an existingparallel CPU implementation of the computation to be achieved. This can also beuseful to improve the load balance between slow CPUs and fast GPUs: since CPUswork collectively on a single task, the completion time of tasks on CPUs becomecomparable to the completion time on GPUs, thus relieving from granularitydiscrepancy concerns. Hwloc support needs to be enabled to get good performance,otherwise StarPU will not know how to better group cores.Two modes of execution exist to accomodate with existing usages.@subsection Fork-mode parallel tasksIn the Fork mode, StarPU will call the codelet function on oneof the CPUs of the combined worker. The codelet function can use@code{starpu_combined_worker_get_size()} to get the number of threads it isallowed to start to achieve the computation. The CPU binding mask for the wholeset of CPUs is already enforced, so that threads created by the function willinherit the mask, and thus execute where StarPU expected, the OS being in chargeof choosing how to schedule threads on the corresponding CPUs. The applicationcan also choose to bind threads by hand, using e.g. sched_getaffinity to knowthe CPU binding mask that StarPU chose.For instance, using OpenMP (full source is available in@code{examples/openmp/vector_scal.c}):@examplevoid scal_cpu_func(void *buffers[], void *_args)@{    unsigned i;    float *factor = _args;    struct starpu_vector_interface *vector = buffers[0];    unsigned n = STARPU_VECTOR_GET_NX(vector);    float *val = (float *)STARPU_VECTOR_GET_PTR(vector);#pragma omp parallel for num_threads(starpu_combined_worker_get_size())    for (i = 0; i < n; i++)        val[i] *= *factor;@}static struct starpu_codelet cl =@{    .modes = @{ STARPU_RW @},    .where = STARPU_CPU,    .type = STARPU_FORKJOIN,    .max_parallelism = INT_MAX,    .cpu_funcs = @{scal_cpu_func, NULL@},    .nbuffers = 1,@};@end exampleOther examples include for instance calling a BLAS parallel CPU implementation(see @code{examples/mult/xgemm.c}).@subsection SPMD-mode parallel tasksIn the SPMD mode, StarPU will call the codelet function oneach CPU of the combined worker. The codelet function can use@code{starpu_combined_worker_get_size()} to get the total number of CPUsinvolved in the combined worker, and thus the number of calls that are made inparallel to the function, and @code{starpu_combined_worker_get_rank()} to getthe rank of the current CPU within the combined worker. For instance:@examplestatic void func(void *buffers[], void *args)@{    unsigned i;    float *factor = _args;    struct starpu_vector_interface *vector = buffers[0];    unsigned n = STARPU_VECTOR_GET_NX(vector);    float *val = (float *)STARPU_VECTOR_GET_PTR(vector);    /* Compute slice to compute */    unsigned m = starpu_combined_worker_get_size();    unsigned j = starpu_combined_worker_get_rank();    unsigned slice = (n+m-1)/m;    for (i = j * slice; i < (j+1) * slice && i < n; i++)        val[i] *= *factor;@}static struct starpu_codelet cl =@{    .modes = @{ STARPU_RW @},    .where = STARP_CPU,    .type = STARPU_SPMD,    .max_parallelism = INT_MAX,    .cpu_funcs = @{ func, NULL @},    .nbuffers = 1,@}@end exampleOf course, this trivial example will not really benefit from parallel taskexecution, and was only meant to be simple to understand.  The benefit comeswhen the computation to be done is so that threads have to e.g. exchangeintermediate results, or write to the data in a complex but safe way in the samebuffer.@subsection Parallel tasks performanceTo benefit from parallel tasks, a parallel-task-aware StarPU scheduler has tobe used. When exposed to codelets with a Fork or SPMD flag, the @code{pheft}(parallel-heft) and @code{pgreedy} (parallel greedy) schedulers will indeed alsotry to execute tasks with several CPUs. It will automatically try the variousavailable combined worker sizes and thus be able to avoid choosing a largecombined worker if the codelet does not actually scale so much.@subsection Combined workersBy default, StarPU creates combined workers according to the architecturestructure as detected by hwloc. It means that for each object of the hwloctopology (NUMA node, socket, cache, ...) a combined worker will be created. Ifsome nodes of the hierarchy have a big arity (e.g. many cores in a socketwithout a hierarchy of shared caches), StarPU will create combined workers ofintermediate sizes. The @code{STARPU_SYNTHESIZE_ARITY_COMBINED_WORKER} variablepermits to tune the maximum arity between levels of combined workers.The combined workers actually produced can be seen in the output of the@code{starpu_machine_display} tool (the @code{STARPU_SCHED} environment variablehas to be set to a combined worker-aware scheduler such as @code{pheft} or@code{pgreedy}).@subsection Concurrent parallel tasksUnfortunately, many environments and librairies do not support concurrentcalls.For instance, most OpenMP implementations (including the main ones) do notsupport concurrent @code{pragma omp parallel} statements without nesting them inanother @code{pragma omp parallel} statement, but StarPU does not yet supportcreating its CPU workers by using such pragma.Other parallel libraries are also not safe when being invoked concurrentlyfrom different threads, due to the use of global variables in their sequentialsections for instance.The solution is then to use only one combined worker at a time.  This can bedone by setting @code{single_combined_worker} to 1 in the @code{starpu_conf}structure, or setting the @code{STARPU_SINGLE_COMBINED_WORKER} environmentvariable to 1. StarPU will then run only one parallel task at a time (but otherCPU and GPU tasks are not affected and can be run concurrently). The paralleltask scheduler will however still however still try varying combined workersizes to look for the most efficient ones.@node Debugging@section DebuggingStarPU provides several tools to help debugging aplications. Execution tracescan be generated and displayed graphically, see @ref{Generating traces}. Somegdb helpers are also provided to show the whole StarPU state:@smallexample(gdb) source tools/gdbinit(gdb) help starpu@end smallexampleThe Temanejo task debugger can also be used, see @ref{Task debugger}.@node The multiformat interface@section The multiformat interfaceIt may be interesting to represent the same piece of data using two differentdata structures: one that would only be used on CPUs, and one that would onlybe used on GPUs. This can be done by using the multiformat interface. StarPUwill be able to convert data from one data structure to the other when needed.Note that the heft scheduler is the only one optimized for this interface. Theuser must provide StarPU with conversion codelets:@cartouche@smallexample#define NX 1024struct point array_of_structs[NX];starpu_data_handle_t handle;/* * The conversion of a piece of data is itself a task, though it is created, * submitted and destroyed by StarPU internals and not by the user. Therefore, * we have to define two codelets. * Note that for now the conversion from the CPU format to the GPU format has to * be executed on the GPU, and the conversion from the GPU to the CPU has to be * executed on the CPU. */#ifdef STARPU_USE_OPENCLvoid cpu_to_opencl_opencl_func(void *buffers[], void *args);struct starpu_codelet cpu_to_opencl_cl = @{    .where = STARPU_OPENCL,    .opencl_funcs = @{ cpu_to_opencl_opencl_func, NULL @},    .nbuffers = 1,    .modes = @{ STARPU_RW @}@};void opencl_to_cpu_func(void *buffers[], void *args);struct starpu_codelet opencl_to_cpu_cl = @{    .where = STARPU_CPU,    .cpu_funcs = @{ opencl_to_cpu_func, NULL @},    .nbuffers = 1,    .modes = @{ STARPU_RW @}@};#endifstruct starpu_multiformat_data_interface_ops format_ops = @{#ifdef STARPU_USE_OPENCL    .opencl_elemsize = 2 * sizeof(float),    .cpu_to_opencl_cl = &cpu_to_opencl_cl,    .opencl_to_cpu_cl = &opencl_to_cpu_cl,#endif    .cpu_elemsize = 2 * sizeof(float),    ...@};starpu_multiformat_data_register(handle, 0, &array_of_structs, NX, &format_ops);@end smallexample@end cartoucheKernels can be written almost as for any other interface. Note thatSTARPU_MULTIFORMAT_GET_CPU_PTR shall only be used for CPU kernels. CUDA kernelsmust use STARPU_MULTIFORMAT_GET_CUDA_PTR, and OpenCL kernels must useSTARPU_MULTIFORMAT_GET_OPENCL_PTR. STARPU_MULTIFORMAT_GET_NX may be used in anykind of kernel.@cartouche@smallexamplestatic voidmultiformat_scal_cpu_func(void *buffers[], void *args)@{    struct point *aos;    unsigned int n;    aos = STARPU_MULTIFORMAT_GET_CPU_PTR(buffers[0]);    n = STARPU_MULTIFORMAT_GET_NX(buffers[0]);    ...@}extern "C" void multiformat_scal_cuda_func(void *buffers[], void *_args)@{    unsigned int n;    struct struct_of_arrays *soa;    soa = (struct struct_of_arrays *) STARPU_MULTIFORMAT_GET_CUDA_PTR(buffers[0]);    n = STARPU_MULTIFORMAT_GET_NX(buffers[0]);    ...@}@end smallexample@end cartoucheA full example may be found in @code{examples/basic_examples/multiformat.c}.@node On-GPU rendering@section On-GPU renderingGraphical-oriented applications need to draw the result of their computations,typically on the very GPU where these happened. Technologies such as OpenGL/CUDAinteroperability permit to let CUDA directly work on the OpenGL buffers, makingthem thus immediately ready for drawing, by mapping OpenGL buffer, textures orrenderbuffer objects into CUDA.  CUDA however imposes some technicalconstraints: peer memcpy has to be disabled, and the thread that runs OpenGL hasto be the one that runs CUDA computations for that GPU.To achieve this with StarPU, pass the @code{--disable-cuda-memcpy-peer} optionto @code{./configure} (TODO: make it dynamic), OpenGL/GLUT has to be initializedfirst, and the interoperability mode has tobe enabled by using the @code{cuda_opengl_interoperability} field of the@code{starpu_conf} structure, and the driver loop has to be run bythe application, by using the @code{not_launched_drivers} field of@code{starpu_conf} to prevent StarPU from running it in a separate thread, andby using @code{starpu_driver_run} to run the loop. The @code{gl_interop} and@code{gl_interop_idle} examples shows how it articulates in a simple case, whererendering is done in task callbacks. The former uses @code{glutMainLoopEvent}to make GLUT progress from the StarPU driver loop, while the latter uses@code{glutIdleFunc} to make StarPU progress from the GLUT main loop.Then, to use an OpenGL buffer as a CUDA data, StarPU simply needs to be giventhe CUDA pointer at registration, for instance:@cartouche@smallexample/* Get the CUDA worker id */for (workerid = 0; workerid < starpu_worker_get_count(); workerid++)        if (starpu_worker_get_type(workerid) == STARPU_CUDA_WORKER)                break;/* Build a CUDA pointer pointing at the OpenGL buffer */cudaGraphicsResourceGetMappedPointer((void**)&output, &num_bytes, resource);/* And register it to StarPU */starpu_vector_data_register(&handle, starpu_worker_get_memory_node(workerid),                            output, num_bytes / sizeof(float4), sizeof(float4));/* The handle can now be used as usual */starpu_insert_task(&cl, STARPU_RW, handle, 0);/* ... *//* This gets back data into the OpenGL buffer */starpu_data_unregister(handle);@end smallexample@end cartoucheand display it e.g. in the callback function.@node More examples@section More examplesMore examples are available in the StarPU sources in the @code{examples/}directory. Simple examples include:@table @asis@item @code{incrementer/}:    Trivial incrementation test.@item @code{basic_examples/}:        Simple documented Hello world (as shown in @ref{Hello World}), vector/scalar product (as shown        in @ref{Vector Scaling on an Hybrid CPU/GPU Machine}), matrix        product examples (as shown in @ref{Performance model example}), an example using the blocked matrix data        interface, an example using the variable data interface, and an example        using different formats on CPUs and GPUs.@item @code{matvecmult/}:    OpenCL example from NVidia, adapted to StarPU.@item @code{axpy/}:    AXPY CUBLAS operation adapted to StarPU.@item @code{fortran/}:    Example of Fortran bindings.@end tableMore advanced examples include:@table @asis@item @code{filters/}:    Examples using filters, as shown in @ref{Partitioning Data}.@item @code{lu/}:    LU matrix factorization, see for instance @code{xlu_implicit.c}@item @code{cholesky/}:    Cholesky matrix factorization, see for instance @code{cholesky_implicit.c}.@end table
 |