/*
 * This file is part of the StarPU Handbook.
 * Copyright (C) 2009--2011  Université de Bordeaux 1
 * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
 * Copyright (C) 2011, 2012  Institut National de Recherche en Informatique et Automatique
 * See the file version.doxy for copying conditions.
 */

/*! \page AdvancedExamples Advanced Examples

\section UsingMultipleImplementationsOfACodelet Using Multiple Implementations Of A Codelet

One may want to write multiple implementations of a codelet for a single type of
device and let StarPU choose which one to run. As an example, we will show how
to use SSE to scale a vector. The codelet can be written as follows:

\code{.c}
#include <xmmintrin.h>

void scal_sse_func(void *buffers[], void *cl_arg)
{
    float *vector = (float *) STARPU_VECTOR_GET_PTR(buffers[0]);
    unsigned int n = STARPU_VECTOR_GET_NX(buffers[0]);
    unsigned int n_iterations = n/4;
    if (n % 4 != 0)
        n_iterations++;

    __m128 *VECTOR = (__m128*) vector;
    __m128 factor __attribute__((aligned(16)));
    factor = _mm_set1_ps(*(float *) cl_arg);

    unsigned int i;
    for (i = 0; i < n_iterations; i++)
        VECTOR[i] = _mm_mul_ps(factor, VECTOR[i]);
}
\endcode

\code{.c}
struct starpu_codelet cl = {
    .where = STARPU_CPU,
    .cpu_funcs = { scal_cpu_func, scal_sse_func, NULL },
    .nbuffers = 1,
    .modes = { STARPU_RW }
};
\endcode

Schedulers which are multi-implementation aware (only dmda and pheft for now)
will use the performance models of all the implementations they were given, and
pick the one which seems to be the fastest.

\section EnablingImplementationAccordingToCapabilities Enabling Implementation According To Capabilities

Some implementations may not run on some devices. For instance, some CUDA
devices do not support double floating point precision, and thus the kernel
execution would just fail; or the device may not have enough shared memory for
the implementation being used. The field starpu_codelet::can_execute permits
expressing this. For instance:

\code{.c}
static int can_execute(unsigned workerid, struct starpu_task *task, unsigned nimpl)
{
    const struct cudaDeviceProp *props;
    if (starpu_worker_get_type(workerid) == STARPU_CPU_WORKER)
        return 1;
    /* CUDA device */
    props = starpu_cuda_get_device_properties(workerid);
    if (props->major >= 2 || props->minor >= 3)
        /* At least compute capability 1.3, supports doubles */
        return 1;
    /* Old card, does not support doubles */
    return 0;
}

struct starpu_codelet cl = {
    .where = STARPU_CPU|STARPU_CUDA,
    .can_execute = can_execute,
    .cpu_funcs = { cpu_func, NULL },
    .cuda_funcs = { gpu_func, NULL },
    .nbuffers = 1,
    .modes = { STARPU_RW }
};
\endcode

This can be essential e.g. when running on a machine which mixes various models
of CUDA devices, to take benefit from the new models without crashing on old
models.

Note: the function starpu_codelet::can_execute is called by the scheduler each
time it tries to match a task with a worker, and should thus be very fast. The
function starpu_cuda_get_device_properties() provides quick access to CUDA
properties of CUDA devices to achieve such efficiency.

Another example is to compile CUDA code for various compute capabilities,
resulting in two CUDA functions, e.g. scal_gpu_13 for compute capability
1.3, and scal_gpu_20 for compute capability 2.0.
Both functions can be provided to StarPU by using the field
starpu_codelet::cuda_funcs, and starpu_codelet::can_execute can then be used to
rule out the scal_gpu_20 variant on a CUDA device which will not be able to
execute it:

\code{.c}
static int can_execute(unsigned workerid, struct starpu_task *task, unsigned nimpl)
{
    const struct cudaDeviceProp *props;
    if (starpu_worker_get_type(workerid) == STARPU_CPU_WORKER)
        return 1;
    /* CUDA device */
    if (nimpl == 0)
        /* Trying to execute the 1.3 capability variant, we assume it is ok in all cases. */
        return 1;
    /* Trying to execute the 2.0 capability variant, check that the card can do it. */
    props = starpu_cuda_get_device_properties(workerid);
    if (props->major >= 2)
        /* At least compute capability 2.0, can run it */
        return 1;
    /* Old card, does not support 2.0, will not be able to execute the 2.0 variant. */
    return 0;
}

struct starpu_codelet cl = {
    .where = STARPU_CPU|STARPU_CUDA,
    .can_execute = can_execute,
    .cpu_funcs = { cpu_func, NULL },
    .cuda_funcs = { scal_gpu_13, scal_gpu_20, NULL },
    .nbuffers = 1,
    .modes = { STARPU_RW }
};
\endcode

Note: the most generic variant should be provided first, as some schedulers are
not able to try the different variants.

\section TaskAndWorkerProfiling Task And Worker Profiling

A full example showing how to use the profiling API is available in the StarPU
sources in the directory examples/profiling/.

\code{.c}
/* Note: profiling must have been enabled beforehand, either by calling
 * starpu_profiling_status_set(STARPU_PROFILING_ENABLE) or by setting the
 * environment variable STARPU_PROFILING to 1. */
struct starpu_task *task = starpu_task_create();
task->cl = &cl;
task->synchronous = 1;
/* We will destroy the task structure by hand so that we can
 * query the profiling info before the task is destroyed. */
task->destroy = 0;

/* Submit and wait for completion (since synchronous was set to 1) */
starpu_task_submit(task);

/* The task is finished, get profiling information */
struct starpu_profiling_task_info *info = task->profiling_info;

/* How much time did it take before the task started? */
double delay = starpu_timing_timespec_delay_us(&info->submit_time, &info->start_time);
/* How long did the task execution take? */
double length = starpu_timing_timespec_delay_us(&info->start_time, &info->end_time);

/* We don't need the task structure anymore */
starpu_task_destroy(task);
\endcode

\code{.c}
/* Display the occupancy of all workers during the test */
int worker;
for (worker = 0; worker < starpu_worker_get_count(); worker++)
{
    struct starpu_profiling_worker_info worker_info;
    int ret = starpu_profiling_worker_get_info(worker, &worker_info);
    STARPU_ASSERT(!ret);

    double total_time = starpu_timing_timespec_to_us(&worker_info.total_time);
    double executing_time = starpu_timing_timespec_to_us(&worker_info.executing_time);
    double sleeping_time = starpu_timing_timespec_to_us(&worker_info.sleeping_time);
    double overhead_time = total_time - executing_time - sleeping_time;

    float executing_ratio = 100.0*executing_time/total_time;
    float sleeping_ratio = 100.0*sleeping_time/total_time;
    float overhead_ratio = 100.0 - executing_ratio - sleeping_ratio;

    char workername[128];
    starpu_worker_get_name(worker, workername, 128);
    fprintf(stderr, "Worker %s:\n", workername);
    fprintf(stderr, "\ttotal time: %.2lf ms\n", total_time*1e-3);
    fprintf(stderr, "\texec time: %.2lf ms (%.2f %%)\n", executing_time*1e-3, executing_ratio);
    fprintf(stderr, "\tblocked time: %.2lf ms (%.2f %%)\n", sleeping_time*1e-3, sleeping_ratio);
    fprintf(stderr, "\toverhead time: %.2lf ms (%.2f %%)\n", overhead_time*1e-3, overhead_ratio);
}
\endcode

\section PartitioningData Partitioning Data

An existing piece of data can be partitioned in sub parts to be used by
different tasks, for instance:

\code{.c}
int vector[NX];
starpu_data_handle_t handle;

/* Declare data to StarPU */
starpu_vector_data_register(&handle, 0, (uintptr_t)vector, NX, sizeof(vector[0]));

/* Partition the vector in PARTS sub-vectors */
struct starpu_data_filter f =
{
    .filter_func = starpu_vector_filter_block,
    .nchildren = PARTS
};
starpu_data_partition(handle, &f);
\endcode

The task submission then uses the function starpu_data_get_sub_data() to
retrieve the sub-handles to be passed as task parameters.

\code{.c}
/* Submit a task on each sub-vector */
for (i=0; i<starpu_data_get_nb_children(handle); i++) {
    /* Get subdata number i (there is only 1 dimension) */
    starpu_data_handle_t sub_handle = starpu_data_get_sub_data(handle, 1, i);
    struct starpu_task *task = starpu_task_create();

    task->handles[0] = sub_handle;
    task->cl = &cl;
    task->synchronous = 1;
    task->cl_arg = &factor;
    task->cl_arg_size = sizeof(factor);

    starpu_task_submit(task);
}
\endcode

Partitioning can be applied several times, see
examples/basic_examples/mult.c and examples/filters/.

Wherever the whole piece of data is already available, the partitioning will be
done in-place, i.e. without allocating new buffers but just using pointers
inside the existing copy. This is particularly important to be aware of when
using OpenCL, where the kernel parameters are not pointers, but handles. The
kernel thus needs to also be passed the offset within the OpenCL buffer:

\code{.c}
void opencl_func(void *buffers[], void *cl_arg)
{
    cl_mem vector = (cl_mem) STARPU_VECTOR_GET_DEV_HANDLE(buffers[0]);
    unsigned offset = STARPU_VECTOR_GET_OFFSET(buffers[0]);

    ...
    clSetKernelArg(kernel, 0, sizeof(vector), &vector);
    clSetKernelArg(kernel, 1, sizeof(offset), &offset);
    ...
}
\endcode

And the kernel has to shift from the pointer passed by the OpenCL driver:

\code{.c}
__kernel void opencl_kernel(__global int *vector, unsigned offset)
{
    vector = (__global int *)((__global char *)vector + offset);
    ...
}
\endcode

StarPU provides various interfaces and filters for matrices, vectors, etc., but
applications can also write their own data interfaces and filters, see
examples/interface and examples/filters/custom_mf for an example.
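Coming back to the partitioned vector above: when the application needs to
access the whole vector again (or to unregister it), the data must first be
unpartitioned. A minimal sketch, gathering the pieces back to the main memory
node (node 0):

\code{.c}
/* Gather the PARTS sub-vectors back into the whole vector */
starpu_data_unpartition(handle, 0);

/* The root handle can now be used as a whole again, or unregistered to
 * get the up-to-date content back into the vector array */
starpu_data_unregister(handle);
\endcode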
\section PerformanceModelExample Performance Model Example

To achieve good scheduling, StarPU scheduling policies need to be able to
estimate in advance the duration of a task. This is done by giving to codelets
a performance model, by defining a structure starpu_perfmodel and providing its
address in the field starpu_codelet::model. The fields
starpu_perfmodel::symbol and starpu_perfmodel::type are mandatory, to give a
name to the model, and the type of the model, since there are several kinds of
performance models. For compatibility, make sure to initialize the whole
structure to zero, either by using explicit memset(), or by letting the
compiler implicitly do it as exemplified below.

For ::STARPU_HISTORY_BASED, ::STARPU_REGRESSION_BASED, and
::STARPU_NL_REGRESSION_BASED, the total size of task data (both input and
output) is used as an index by default. The field starpu_perfmodel::size_base
however permits the application to override that, when for instance some of the
data do not matter for task cost (e.g. a mere reference table), or when using
sparse structures (in which case it is the number of non-zeros which matters),
or when there is some hidden parameter such as the number of iterations, etc.
The examples/pi example uses this to include the number of iterations in the
base.

How to use schedulers which can benefit from such performance models is
explained in \ref TaskSchedulingPolicy.

The same can be done for task power consumption estimation, by setting the
field starpu_codelet::power_model the same way as the field
starpu_codelet::model. Note: for now, the application has to give the power
consumption performance model a name which is different from the execution time
performance model.

The application can request time estimations from the StarPU performance models
by filling a task structure as usual without actually submitting it. The data
handles can be created by calling any of the functions
starpu_*_data_register with a NULL pointer and -1 node and the desired data
sizes, and need to be unregistered as usual. The functions
starpu_task_expected_length() and starpu_task_expected_power() can then be
called to get an estimation of the task cost on a given arch.
starpu_task_footprint() can also be used to get the footprint used for indexing
history-based performance models. starpu_task_destroy() needs to be called to
destroy the dummy task afterwards. See tests/perfmodels/regression_based.c for
an example.
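For instance, a history-based model can be declared and attached to a codelet
as follows (a minimal sketch; the symbol and the codelet shown are
illustrative):

\code{.c}
static struct starpu_perfmodel vector_scal_model =
{
    .type = STARPU_HISTORY_BASED,
    .symbol = "vector_scal"  /* must be unique among the application's models */
};

struct starpu_codelet cl =
{
    .where = STARPU_CPU,
    .cpu_funcs = { scal_cpu_func, NULL },
    .nbuffers = 1,
    .modes = { STARPU_RW },
    .model = &vector_scal_model
};
\endcode

Since the structure is static, the remaining fields are implicitly
zero-initialized, as recommended above.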
\section TheoreticalLowerBoundOnExecutionTimeExample Theoretical Lower Bound On Execution Time Example

For kernels with history-based performance models (and provided that they are
completely calibrated), StarPU can very easily provide a theoretical lower
bound for the execution time of a whole set of tasks. See for instance
examples/lu/lu_example.c: before submitting tasks, call the function
starpu_bound_start(), and after complete execution, call starpu_bound_stop().
starpu_bound_print_lp() or starpu_bound_print_mps() can then be used to output
a Linear Programming problem corresponding to the schedule of your tasks. Run
it through lp_solve or any other linear programming solver, and that will give
you a lower bound for the total execution time of your tasks. If StarPU was
compiled with the glpk library installed, starpu_bound_compute() can be used to
solve it immediately and get the optimized minimum, in ms. Its parameter
integer allows deciding whether integer resolution should be computed and
returned too.

The deps parameter tells StarPU whether to take tasks, implicit data, and tag
dependencies into account. Tags released in a callback or similar are not taken
into account, only tags associated with a task are.

It must be understood that the linear programming problem size is quadratic
with the number of tasks and thus the time to solve it will be very long; it
could be minutes for just a few dozen tasks. You should probably use
lp_solve -timeout 1 test.pl -wmps test.mps to convert the problem to MPS format
and then use a better solver: glpsol might be better than lp_solve for instance
(the --pcost option may be useful), but sometimes doesn't manage to converge.
cbc might look slower, but it is parallel. For lp_solve, be sure to try at
least all the -B options. For instance, we often just use
lp_solve -cc -B1 -Bb -Bg -Bp -Bf -Br -BG -Bd -Bs -BB -Bo -Bc -Bi, and the -gr
option can also be quite useful. The resulting schedule can be observed by
using the tool starpu_lp2paje, which converts it into the Paje format.

Data transfer time can only be taken into account when deps is set. Only data
transfers inferred from implicit data dependencies between tasks are taken into
account. Other data transfers are assumed to be completely overlapped.

Setting deps to 0 will only take into account the actual computations on
processing units. It however still properly takes into account the varying
performances of kernels and processing units, which is quite more accurate than
just comparing StarPU performances with the fastest of the kernels being used.

The prio parameter tells StarPU whether to simulate taking into account the
priorities as the StarPU scheduler would, i.e. schedule prioritized tasks
before less prioritized tasks, to check to which extent this results in a less
optimal solution. This increases even more computation time.
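Putting it all together, a typical use looks as follows. This is a minimal
sketch, assuming StarPU was compiled with glpk so that starpu_bound_compute()
can solve the problem directly:

\code{.c}
/* Start recording the submitted tasks, taking dependencies and
 * priorities into account */
starpu_bound_start(1 /* deps */, 1 /* prio */);

/* ... submit the tasks as usual ... */

starpu_task_wait_for_all();
starpu_bound_stop();

/* Solve the linear programming problem directly (needs glpk); the
 * results are in ms */
double res, integer_res;
starpu_bound_compute(&res, &integer_res, 1);

/* Alternatively, dump the problem for an external solver */
FILE *output = fopen("tasks.lp", "w");
starpu_bound_print_lp(output);
fclose(output);
\endcode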
\section InsertTaskUtility Insert Task Utility

StarPU provides the wrapper function starpu_insert_task() to ease the creation
and submission of tasks. Here is the implementation of the codelet:

\code{.c}
void func_cpu(void *descr[], void *_args)
{
    int *x0 = (int *)STARPU_VARIABLE_GET_PTR(descr[0]);
    float *x1 = (float *)STARPU_VARIABLE_GET_PTR(descr[1]);
    int ifactor;
    float ffactor;

    starpu_codelet_unpack_args(_args, &ifactor, &ffactor);
    *x0 = *x0 * ifactor;
    *x1 = *x1 * ffactor;
}

struct starpu_codelet mycodelet = {
    .where = STARPU_CPU,
    .cpu_funcs = { func_cpu, NULL },
    .nbuffers = 2,
    .modes = { STARPU_RW, STARPU_RW }
};
\endcode

And the call to the function starpu_insert_task():

\code{.c}
starpu_insert_task(&mycodelet,
                   STARPU_VALUE, &ifactor, sizeof(ifactor),
                   STARPU_VALUE, &ffactor, sizeof(ffactor),
                   STARPU_RW, data_handles[0],
                   STARPU_RW, data_handles[1],
                   0);
\endcode

The call to starpu_insert_task() is equivalent to the following code:

\code{.c}
struct starpu_task *task = starpu_task_create();
task->cl = &mycodelet;
task->handles[0] = data_handles[0];
task->handles[1] = data_handles[1];
char *arg_buffer;
size_t arg_buffer_size;
starpu_codelet_pack_args(&arg_buffer, &arg_buffer_size,
                         STARPU_VALUE, &ifactor, sizeof(ifactor),
                         STARPU_VALUE, &ffactor, sizeof(ffactor),
                         0);
task->cl_arg = arg_buffer;
task->cl_arg_size = arg_buffer_size;
int ret = starpu_task_submit(task);
\endcode

Here is a similar call using ::STARPU_DATA_ARRAY.

\code{.c}
starpu_insert_task(&mycodelet,
                   STARPU_DATA_ARRAY, data_handles, 2,
                   STARPU_VALUE, &ifactor, sizeof(ifactor),
                   STARPU_VALUE, &ffactor, sizeof(ffactor),
                   0);
\endcode

If some part of the task insertion depends on the value of some computation,
the macro ::STARPU_DATA_ACQUIRE_CB can be very convenient. For instance,
assuming that the index variable i was registered as handle A_handle[i]:

\code{.c}
/* Compute which portion we will work on, e.g. pivot */
starpu_insert_task(&which_index, STARPU_W, i_handle, 0);

/* And submit the corresponding task */
STARPU_DATA_ACQUIRE_CB(i_handle, STARPU_R,
                       starpu_insert_task(&work, STARPU_RW, A_handle[i], 0));
\endcode

The macro ::STARPU_DATA_ACQUIRE_CB submits an asynchronous request for
acquiring the data i for the main application, and will execute the code given
as third parameter when it is acquired. In other words, as soon as the value of
i computed by the codelet which_index can be read, the portion of code passed
as third parameter of ::STARPU_DATA_ACQUIRE_CB will be executed, and is allowed
to read from i to use it e.g. as an index. Note that this macro is only
available when compiling StarPU with the compiler gcc.

\section DataReduction Data Reduction

In various cases, some piece of data is used to accumulate intermediate
results. For instance, the dot product of a vector, maximum/minimum finding,
the histogram of a photograph, etc. When these results are produced all over
the machine, it would not be efficient to accumulate them in only one place,
incurring a data transmission and access concurrency for each contribution.

StarPU provides a mode ::STARPU_REDUX, which permits optimizing that case: it
will allocate a buffer on each memory node, and accumulate intermediate results
there. When the data is eventually accessed in the normal mode ::STARPU_R,
StarPU will collect the intermediate results in just one buffer.

For this to work, the user has to use the function
starpu_data_set_reduction_methods() to declare how to initialize these buffers,
and how to assemble partial results.
For instance, the cg example uses that to optimize its dot product: it first
defines the codelets for initialization and reduction:

\code{.c}
struct starpu_codelet bzero_variable_cl =
{
    .cpu_funcs = { bzero_variable_cpu, NULL },
    .cuda_funcs = { bzero_variable_cuda, NULL },
    .nbuffers = 1,
};

static void accumulate_variable_cpu(void *descr[], void *cl_arg)
{
    double *v_dst = (double *)STARPU_VARIABLE_GET_PTR(descr[0]);
    double *v_src = (double *)STARPU_VARIABLE_GET_PTR(descr[1]);
    *v_dst = *v_dst + *v_src;
}

static void accumulate_variable_cuda(void *descr[], void *cl_arg)
{
    double *v_dst = (double *)STARPU_VARIABLE_GET_PTR(descr[0]);
    double *v_src = (double *)STARPU_VARIABLE_GET_PTR(descr[1]);
    cublasDaxpy(1, (double)1.0, v_src, 1, v_dst, 1);
    cudaStreamSynchronize(starpu_cuda_get_local_stream());
}

struct starpu_codelet accumulate_variable_cl =
{
    .cpu_funcs = { accumulate_variable_cpu, NULL },
    .cuda_funcs = { accumulate_variable_cuda, NULL },
    .nbuffers = 1,
};
\endcode

and attaches them as reduction methods for its dtq handle:

\code{.c}
starpu_variable_data_register(&dtq_handle, -1, NULL, sizeof(type));
starpu_data_set_reduction_methods(dtq_handle,
        &accumulate_variable_cl, &bzero_variable_cl);
\endcode

and dtq_handle can now be used in mode ::STARPU_REDUX for the dot products with
partitioned vectors:

\code{.c}
for (b = 0; b < nblocks; b++)
    starpu_insert_task(&dot_kernel_cl,
        STARPU_REDUX, dtq_handle,
        STARPU_R, starpu_data_get_sub_data(v1, 1, b),
        STARPU_R, starpu_data_get_sub_data(v2, 1, b),
        0);
\endcode

During registration, we have here provided NULL, i.e. there is no initial value
to be taken into account during reduction. StarPU will thus only take into
account the contributions from the tasks dot_kernel_cl. Also, it will not
allocate any memory for dtq_handle before the tasks dot_kernel_cl are ready to
run.

If another dot product has to be performed, one could unregister dtq_handle,
and re-register it. But one can also call starpu_data_invalidate_submit() with
the parameter dtq_handle, which will clear all data from the handle, thus
resetting it back to the initial status register(NULL).

The example cg also uses reduction for the blocked gemv kernel, leading to yet
more relaxed dependencies and more parallelism.

::STARPU_REDUX can also be passed to starpu_mpi_insert_task() in the MPI case.
That will however not produce any MPI communication, but just pass
::STARPU_REDUX to the underlying starpu_insert_task(). It is up to the
application to call starpu_mpi_redux_data(), which posts tasks that will reduce
the partial results among MPI nodes into the MPI node which owns the data. For
instance, some hypothetical application which collects partial results into
data res, then uses it for other computation, before looping again with a new
reduction:

\code{.c}
for (i = 0; i < 100; i++) {
    starpu_mpi_insert_task(MPI_COMM_WORLD, &init_res, STARPU_W, res, 0);
    starpu_mpi_insert_task(MPI_COMM_WORLD, &work, STARPU_RW, A,
               STARPU_R, B, STARPU_REDUX, res, 0);
    starpu_mpi_redux_data(MPI_COMM_WORLD, res);
    starpu_mpi_insert_task(MPI_COMM_WORLD, &work2, STARPU_RW, B,
               STARPU_R, res, 0);
}
\endcode

\section TemporaryBuffers Temporary Buffers

There are two kinds of temporary buffers: temporary data which just pass
results from a task to another, and scratch data which are needed only
internally by tasks.

\subsection TemporaryData Temporary Data

Data can sometimes be entirely produced by a task, and entirely consumed by
another task, without the need for other parts of the application to access it.
In such a case, registration can be done without prior allocation, by using the
special memory node number -1, and passing a zero pointer. StarPU will actually
allocate memory only when the task creating the content gets scheduled, and
destroy it on unregistration.

In addition to that, it can be tedious for the application to have to
unregister the data, since it will not use its content anyway. The
unregistration can be done lazily by using the function
starpu_data_unregister_submit(), which will record that no more tasks accessing
the handle will be submitted, so that it can be freed as soon as the last task
accessing it is over.

The following code exemplifies both points: it registers the temporary data,
submits three tasks accessing it, and records the data for automatic
unregistration.

\code{.c}
starpu_vector_data_register(&handle, -1, 0, n, sizeof(float));
starpu_insert_task(&produce_data, STARPU_W, handle, 0);
starpu_insert_task(&compute_data, STARPU_RW, handle, 0);
starpu_insert_task(&summarize_data, STARPU_R, handle, STARPU_W, result_handle, 0);
starpu_data_unregister_submit(handle);
\endcode

\subsection ScratchData Scratch Data

Some kernels need temporary data to achieve their computations, i.e. a
workspace. The application could allocate it at the start of the codelet
function, and free it at the end, but that would be costly. It could also
allocate one buffer per worker (similarly to
\ref HowToInitializeAComputationLibraryOnceForEachWorker), but that would make
them systematic and permanent. A more optimized way is to use the data access
mode ::STARPU_SCRATCH, as exemplified below, which provides per-worker buffers
without content consistency.

\code{.c}
starpu_vector_data_register(&workspace, -1, 0, NX, sizeof(float));
for (i = 0; i < N; i++)
    starpu_insert_task(&compute, STARPU_R, input[i],
                       STARPU_SCRATCH, workspace, STARPU_W, output[i], 0);
\endcode

StarPU will make sure that the buffer is allocated before executing the task,
and make this allocation per-worker: for CPU workers, notably, each worker has
its own buffer. This means that each task submitted above will actually have
its own workspace, which will actually be the same for all tasks running one
after the other on the same worker. Also, if for instance GPU memory becomes
scarce, StarPU will notice that it can free such buffers easily, since the
content does not matter.

The examples/pi example uses scratches for some temporary buffer.

\section ParallelTasks Parallel Tasks

StarPU can leverage existing parallel computation libraries by the means of
parallel tasks. A parallel task is a task which gets worked on by a set of CPUs
(called a parallel or combined worker) at the same time, by using an existing
parallel CPU implementation of the computation to be achieved. This can also be
useful to improve the load balance between slow CPUs and fast GPUs: since CPUs
work collectively on a single task, the completion time of tasks on CPUs
becomes comparable to the completion time on GPUs, thus relieving from
granularity discrepancy concerns. Hwloc support needs to be enabled to get good
performance, otherwise StarPU will not know how to better group cores.

Two modes of execution exist to accommodate existing usages.

\subsection Fork-modeParallelTasks Fork-mode Parallel Tasks

In the Fork mode, StarPU will call the codelet function on one of the CPUs of
the combined worker. The codelet function can use
starpu_combined_worker_get_size() to get the number of threads it is allowed to
start to achieve the computation.
The CPU binding mask for the whole set of CPUs is already enforced, so that
threads created by the function will inherit the mask, and thus execute where
StarPU expected, the OS being in charge of choosing how to schedule threads on
the corresponding CPUs. The application can also choose to bind threads by
hand, using e.g. sched_getaffinity to know the CPU binding mask that StarPU
chose.

For instance, using OpenMP (full source is available in
examples/openmp/vector_scal.c):

\snippet forkmode.c To be included

Other examples include for instance calling a BLAS parallel CPU implementation
(see examples/mult/xgemm.c).

\subsection SPMD-modeParallelTasks SPMD-mode Parallel Tasks

In the SPMD mode, StarPU will call the codelet function on each CPU of the
combined worker. The codelet function can use
starpu_combined_worker_get_size() to get the total number of CPUs involved in
the combined worker, and thus the number of calls that are made in parallel to
the function, and starpu_combined_worker_get_rank() to get the rank of the
current CPU within the combined worker. For instance:

\code{.c}
static void func(void *buffers[], void *_args)
{
    unsigned i;
    float *factor = _args;
    struct starpu_vector_interface *vector = buffers[0];
    unsigned n = STARPU_VECTOR_GET_NX(vector);
    float *val = (float *)STARPU_VECTOR_GET_PTR(vector);

    /* Compute slice to compute */
    unsigned m = starpu_combined_worker_get_size();
    unsigned j = starpu_combined_worker_get_rank();
    unsigned slice = (n+m-1)/m;

    for (i = j * slice; i < (j+1) * slice && i < n; i++)
        val[i] *= *factor;
}

static struct starpu_codelet cl =
{
    .modes = { STARPU_RW },
    .where = STARPU_CPU,
    .type = STARPU_SPMD,
    .max_parallelism = INT_MAX,
    .cpu_funcs = { func, NULL },
    .nbuffers = 1,
};
\endcode

Of course, this trivial example will not really benefit from parallel task
execution, and was only meant to be simple to understand. The benefit comes
when the computation to be done is such that threads have to e.g. exchange
intermediate results, or write to the same buffer in a complex but safe way.

\subsection ParallelTasksPerformance Parallel Tasks Performance

To benefit from parallel tasks, a parallel-task-aware StarPU scheduler has to
be used. When exposed to codelets with a flag ::STARPU_FORKJOIN or
::STARPU_SPMD, the schedulers pheft (parallel-heft) and peager (parallel eager)
will indeed also try to execute tasks with several CPUs. They will
automatically try the various available combined worker sizes (making several
measurements for each worker size) and thus be able to avoid choosing a large
combined worker if the codelet does not actually scale so much.

\subsection CombinedWorkers Combined Workers

By default, StarPU creates combined workers according to the architecture
structure as detected by hwloc. It means that for each object of the hwloc
topology (NUMA node, socket, cache, ...) a combined worker will be created. If
some nodes of the hierarchy have a big arity (e.g. many cores in a socket
without a hierarchy of shared caches), StarPU will create combined workers of
intermediate sizes. The variable \ref STARPU_SYNTHESIZE_ARITY_COMBINED_WORKER
permits tuning the maximum arity between levels of combined workers.

The combined workers actually produced can be seen in the output of the tool
starpu_machine_display (the environment variable \ref STARPU_SCHED has to be
set to a combined worker-aware scheduler such as pheft or peager).

\subsection ConcurrentParallelTasks Concurrent Parallel Tasks

Unfortunately, many environments and libraries do not support concurrent calls.
For instance, most OpenMP implementations (including the main ones) do not
support concurrent pragma omp parallel statements without nesting them in
another pragma omp parallel statement, but StarPU does not yet support creating
its CPU workers by using such a pragma. Other parallel libraries are also not
safe when being invoked concurrently from different threads, due to the use of
global variables in their sequential sections, for instance.

The solution is then to use only one combined worker at a time. This can be
done by setting the field starpu_conf::single_combined_worker to 1, or setting
the environment variable \ref STARPU_SINGLE_COMBINED_WORKER to 1. StarPU will
then run only one parallel task at a time (but other CPU and GPU tasks are not
affected and can be run concurrently). The parallel task scheduler will however
still try varying combined worker sizes to look for the most efficient ones.
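Concretely, the field can be set when initializing StarPU. A minimal sketch:

\code{.c}
struct starpu_conf conf;
starpu_conf_init(&conf);

/* Run at most one parallel (combined-worker) task at a time */
conf.single_combined_worker = 1;

int ret = starpu_init(&conf);
STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
\endcode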
\section Debugging Debugging

StarPU provides several tools to help debugging applications. Execution traces
can be generated and displayed graphically, see \ref GeneratingTracesWithFxT.
Some gdb helpers are also provided to show the whole StarPU state:

\verbatim
(gdb) source tools/gdbinit
(gdb) help starpu
\endverbatim

The Temanejo task debugger can also be used, see
\ref UsingTheTemanejoTaskDebugger.

\section TheMultiformatInterface The Multiformat Interface

It may be interesting to represent the same piece of data using two different
data structures: one that would only be used on CPUs, and one that would only
be used on GPUs. This can be done by using the multiformat interface. StarPU
will be able to convert data from one data structure to the other when needed.
Note that the scheduler dmda is the only one optimized for this interface. The
user must provide StarPU with conversion codelets:

\snippet multiformat.c To be included

Kernels can be written almost as for any other interface. Note that
::STARPU_MULTIFORMAT_GET_CPU_PTR shall only be used for CPU kernels. CUDA
kernels must use ::STARPU_MULTIFORMAT_GET_CUDA_PTR, and OpenCL kernels must use
::STARPU_MULTIFORMAT_GET_OPENCL_PTR. ::STARPU_MULTIFORMAT_GET_NX may be used in
any kind of kernel.

\code{.c}
static void multiformat_scal_cpu_func(void *buffers[], void *args)
{
    struct point *aos;
    unsigned int n;

    aos = STARPU_MULTIFORMAT_GET_CPU_PTR(buffers[0]);
    n = STARPU_MULTIFORMAT_GET_NX(buffers[0]);
    ...
}

extern "C" void multiformat_scal_cuda_func(void *buffers[], void *_args)
{
    unsigned int n;
    struct struct_of_arrays *soa;

    soa = (struct struct_of_arrays *) STARPU_MULTIFORMAT_GET_CUDA_PTR(buffers[0]);
    n = STARPU_MULTIFORMAT_GET_NX(buffers[0]);
    ...
}
\endcode

A full example may be found in examples/basic_examples/multiformat.c.

\section UsingTheDriverAPI Using The Driver API

\ref API_Running_Drivers

\code{.c}
int ret;
struct starpu_driver d =
{
    .type = STARPU_CUDA_WORKER,
    .id.cuda_id = 0
};

ret = starpu_driver_init(&d);
if (ret != 0)
    error();

while (some_condition)
{
    ret = starpu_driver_run_once(&d);
    if (ret != 0)
        error();
}

ret = starpu_driver_deinit(&d);
if (ret != 0)
    error();
\endcode

To add a new kind of device to the structure starpu_driver, one needs to:
  1. Add a member to the union starpu_driver::id
  2. Modify the internal function _starpu_launch_drivers() to make sure the driver is not launched automatically by StarPU.
  3. Modify the function starpu_driver_run() so that it can handle another kind of architecture.
  4. Write the new function _starpu_run_foobar() in the corresponding driver.
\section DefiningANewSchedulingPolicy Defining A New Scheduling Policy

A full example showing how to define a new scheduling policy is available in
the StarPU sources in the directory examples/scheduler/.

See \ref API_Scheduling_Policy

\code{.c}
static struct starpu_sched_policy dummy_sched_policy = {
    .init_sched = init_dummy_sched,
    .deinit_sched = deinit_dummy_sched,
    .add_workers = dummy_sched_add_workers,
    .remove_workers = dummy_sched_remove_workers,
    .push_task = push_task_dummy,
    .push_prio_task = NULL,
    .pop_task = pop_task_dummy,
    .post_exec_hook = NULL,
    .pop_every_task = NULL,
    .policy_name = "dummy",
    .policy_description = "dummy scheduling strategy"
};
\endcode

\section On-GPURendering On-GPU Rendering

Graphics-oriented applications need to draw the result of their computations,
typically on the very GPU where these happened. Technologies such as
OpenGL/CUDA interoperability permit CUDA to work directly on the OpenGL
buffers, thus making them immediately ready for drawing, by mapping OpenGL
buffer, texture or renderbuffer objects into CUDA. CUDA however imposes some
technical constraints: peer memcpy has to be disabled, and the thread that runs
OpenGL has to be the one that runs CUDA computations for that GPU.

To achieve this with StarPU, the option
\ref disable-cuda-memcpy-peer "--disable-cuda-memcpy-peer" has to be passed to
./configure (TODO: make it dynamic), OpenGL/GLUT has to be initialized first,
the interoperability mode has to be enabled by using the field
starpu_conf::cuda_opengl_interoperability, and the driver loop has to be run by
the application, by using the field starpu_conf::not_launched_drivers to
prevent StarPU from running it in a separate thread, and by using
starpu_driver_run() to run the loop. The examples gl_interop and
gl_interop_idle show how it articulates in a simple case, where rendering is
done in task callbacks. The former uses glutMainLoopEvent to make GLUT progress
from the StarPU driver loop, while the latter uses glutIdleFunc to make StarPU
progress from the GLUT main loop.

Then, to use an OpenGL buffer as a CUDA data, StarPU simply needs to be given
the CUDA pointer at registration, for instance:

\code{.c}
/* Get the CUDA worker id */
for (workerid = 0; workerid < starpu_worker_get_count(); workerid++)
    if (starpu_worker_get_type(workerid) == STARPU_CUDA_WORKER)
        break;

/* Build a CUDA pointer pointing at the OpenGL buffer */
cudaGraphicsResourceGetMappedPointer((void**)&output, &num_bytes, resource);

/* And register it to StarPU */
starpu_vector_data_register(&handle, starpu_worker_get_memory_node(workerid),
                            output, num_bytes / sizeof(float4), sizeof(float4));

/* The handle can now be used as usual */
starpu_insert_task(&cl, STARPU_RW, handle, 0);

/* ... */

/* This gets back data into the OpenGL buffer */
starpu_data_unregister(handle);
\endcode

and display it e.g. in the callback function.

\section DefiningANewDataInterface Defining A New Data Interface

Let's define a new data interface to manage complex numbers.

\code{.c}
/* interface for complex numbers */
struct starpu_complex_interface
{
    double *real;
    double *imaginary;
    int nx;
};
\endcode

Registering such a data to StarPU is easily done using the function
starpu_data_register(). The last parameter of the function,
interface_complex_ops, will be described below.
\code{.c}
void starpu_complex_data_register(starpu_data_handle_t *handleptr,
     unsigned home_node, double *real, double *imaginary, int nx)
{
    struct starpu_complex_interface complex =
    {
        .real = real,
        .imaginary = imaginary,
        .nx = nx
    };

    if (interface_complex_ops.interfaceid == STARPU_UNKNOWN_INTERFACE_ID)
    {
        interface_complex_ops.interfaceid = starpu_data_interface_get_next_id();
    }

    starpu_data_register(handleptr, home_node, &complex, &interface_complex_ops);
}
\endcode

Different operations need to be defined for a data interface through the type
starpu_data_interface_ops. We only define here the basic operations needed to
run simple applications. The source code for the different functions can be
found in the file examples/interface/complex_interface.c.

\code{.c}
static struct starpu_data_interface_ops interface_complex_ops =
{
    .register_data_handle = complex_register_data_handle,
    .allocate_data_on_node = complex_allocate_data_on_node,
    .copy_methods = &complex_copy_methods,
    .get_size = complex_get_size,
    .footprint = complex_footprint,
    .interfaceid = STARPU_UNKNOWN_INTERFACE_ID,
    .interface_size = sizeof(struct starpu_complex_interface),
};
\endcode

Functions need to be defined to access the different fields of the complex
interface from a StarPU data handle.

\code{.c}
double *starpu_complex_get_real(starpu_data_handle_t handle)
{
    struct starpu_complex_interface *complex_interface =
        (struct starpu_complex_interface *) starpu_data_get_interface_on_node(handle, 0);
    return complex_interface->real;
}

double *starpu_complex_get_imaginary(starpu_data_handle_t handle);
int starpu_complex_get_nx(starpu_data_handle_t handle);
\endcode

Similar functions need to be defined to access the different fields of the
complex interface from a void * pointer to be used within codelet
implementations.

\snippet complex.c To be included

Complex data interfaces can then be registered to StarPU.

\code{.c}
double real = 45.0;
double imaginary = 12.0;
starpu_complex_data_register(&handle1, 0, &real, &imaginary, 1);
starpu_insert_task(&cl_display, STARPU_R, handle1, 0);
\endcode

and used by codelets.

\code{.c}
void display_complex_codelet(void *descr[], __attribute__ ((unused)) void *_args)
{
    int nx = STARPU_COMPLEX_GET_NX(descr[0]);
    double *real = STARPU_COMPLEX_GET_REAL(descr[0]);
    double *imaginary = STARPU_COMPLEX_GET_IMAGINARY(descr[0]);
    int i;

    for(i=0 ; i<nx ; i++)
        fprintf(stderr, "Complex[%d] = %3.2f + %3.2f i\n", i, real[i], imaginary[i]);
}
\endcode

The whole code for this complex data interface is available in the directory
examples/interface/.

\section SettingTheDataHandlesForATask Setting The Data Handles For A Task

The maximum number of data a task can manage is fixed by the macro
\ref STARPU_NMAXBUFS, which has a default value which can be changed through
the configure option \ref enable-maxbuffers "--enable-maxbuffers".

However, it is possible to define tasks managing more data by using the field
starpu_task::dyn_handles when defining a task and the field
starpu_codelet::dyn_modes when defining the corresponding codelet.

\code{.c}
enum starpu_data_access_mode modes[STARPU_NMAXBUFS+1] = {
    STARPU_R, STARPU_R, ...
};

struct starpu_codelet dummy_big_cl =
{
    .cuda_funcs = {dummy_big_kernel, NULL},
    .opencl_funcs = {dummy_big_kernel, NULL},
    .cpu_funcs = {dummy_big_kernel, NULL},
    .nbuffers = STARPU_NMAXBUFS+1,
    .dyn_modes = modes
};

task = starpu_task_create();
task->cl = &dummy_big_cl;
task->dyn_handles = malloc(task->cl->nbuffers * sizeof(starpu_data_handle_t));
for(i=0 ; i<task->cl->nbuffers ; i++)
{
    task->dyn_handles[i] = handle;
}
starpu_task_submit(task);
\endcode

\code{.c}
starpu_data_handle_t *handles = malloc(dummy_big_cl.nbuffers * sizeof(starpu_data_handle_t));
for(i=0 ; i<dummy_big_cl.nbuffers ; i++)
{
    handles[i] = handle;
}
starpu_insert_task(&dummy_big_cl,
                   STARPU_DATA_ARRAY, handles, dummy_big_cl.nbuffers,
                   0);
\endcode

The whole code for this example is available in the file
examples/basic_examples/dynamic_handles.c.

\section MoreExamples More Examples

More examples are available in the StarPU sources in the examples/ directory.
Simple examples include:
incrementer/
Trivial incrementation test.
basic_examples/
Simple documented Hello world and vector/scalar product (as shown in \ref BasicExamples), matrix product examples (as shown in \ref PerformanceModelExample), an example using the blocked matrix data interface, an example using the variable data interface, and an example using different formats on CPUs and GPUs.
matvecmult/
OpenCL example from NVidia, adapted to StarPU.
axpy/
AXPY CUBLAS operation adapted to StarPU.
fortran/
Example of Fortran bindings.
More advanced examples include:
filters/
Examples using filters, as shown in \ref PartitioningData.
lu/
LU matrix factorization, see for instance xlu_implicit.c.
cholesky/
Cholesky matrix factorization, see for instance cholesky_implicit.c.
*/