123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509 |
- starpu_vector_data_register(&handle, STARPU_MAIN_RAM, (uintptr_t)vector,
- NX, sizeof(vector[0]));
- starpu_data_filter f =
- {
- .filter_func = starpu_vector_filter_block,
- .nchildren = PARTS
- };
- starpu_data_partition(handle, &f);
- \endcode
- The task submission then uses the function starpu_data_get_sub_data()
- to retrieve the sub-handles to be passed as tasks parameters.
- \code{.c}
- for (i=0; i<starpu_data_get_nb_children(handle); i++) {
-
- starpu_data_handle_t sub_handle = starpu_data_get_sub_data(handle, 1, i);
- struct starpu_task *task = starpu_task_create();
- task->handles[0] = sub_handle;
- task->cl = &cl;
- task->synchronous = 1;
- task->cl_arg = &factor;
- task->cl_arg_size = sizeof(factor);
- starpu_task_submit(task);
- }
- \endcode
- Partitioning can be applied several times, see
- <c>examples/basic_examples/mult.c</c> and <c>examples/filters/</c>.
- Wherever the whole piece of data is already available, the partitioning will
- be done in-place, i.e. without allocating new buffers but just using pointers
- inside the existing copy. This is particularly important to be aware of when
- using OpenCL, where the kernel parameters are not pointers, but handles. The
- kernel thus needs to be also passed the offset within the OpenCL buffer:
- \code{.c}
- void opencl_func(void *buffers[], void *cl_arg)
- {
- cl_mem vector = (cl_mem) STARPU_VECTOR_GET_DEV_HANDLE(buffers[0]);
- unsigned offset = STARPU_BLOCK_GET_OFFSET(buffers[0]);
- ...
- clSetKernelArg(kernel, 0, sizeof(vector), &vector);
- clSetKernelArg(kernel, 1, sizeof(offset), &offset);
- ...
- }
- \endcode
- And the kernel has to shift from the pointer passed by the OpenCL driver:
- \code{.c}
- __kernel void opencl_kernel(__global int *vector, unsigned offset)
- {
- block = (__global void *)block + offset;
- ...
- }
- \endcode
- StarPU provides various interfaces and filters for matrices, vectors, etc.,
- but applications can also write their own data interfaces and filters, see
- <c>examples/interface</c> and <c>examples/filters/custom_mf</c> for an example.
- \section DataReduction Data Reduction
- In various cases, some piece of data is used to accumulate intermediate
- results. For instances, the dot product of a vector, maximum/minimum finding,
- the histogram of a photograph, etc. When these results are produced along the
- whole machine, it would not be efficient to accumulate them in only one place,
- incurring data transmission each and access concurrency.
- StarPU provides a mode ::STARPU_REDUX, which permits to optimize
- that case: it will allocate a buffer on each memory node, and accumulate
- intermediate results there. When the data is eventually accessed in the normal
- mode ::STARPU_R, StarPU will collect the intermediate results in just one
- buffer.
- For this to work, the user has to use the function
- starpu_data_set_reduction_methods() to declare how to initialize these
- buffers, and how to assemble partial results.
- For instance, <c>cg</c> uses that to optimize its dot product: it first defines
- the codelets for initialization and reduction:
- \code{.c}
- struct starpu_codelet bzero_variable_cl =
- {
- .cpu_funcs = { bzero_variable_cpu, NULL },
- .cpu_funcs_name = { "bzero_variable_cpu", NULL },
- .cuda_funcs = { bzero_variable_cuda, NULL },
- .nbuffers = 1,
- }
- static void accumulate_variable_cpu(void *descr[], void *cl_arg)
- {
- double *v_dst = (double *)STARPU_VARIABLE_GET_PTR(descr[0]);
- double *v_src = (double *)STARPU_VARIABLE_GET_PTR(descr[1]);
- *v_dst = *v_dst + *v_src;
- }
- static void accumulate_variable_cuda(void *descr[], void *cl_arg)
- {
- double *v_dst = (double *)STARPU_VARIABLE_GET_PTR(descr[0]);
- double *v_src = (double *)STARPU_VARIABLE_GET_PTR(descr[1]);
- cublasaxpy(1, (double)1.0, v_src, 1, v_dst, 1);
- cudaStreamSynchronize(starpu_cuda_get_local_stream());
- }
- struct starpu_codelet accumulate_variable_cl =
- {
- .cpu_funcs = { accumulate_variable_cpu, NULL },
- .cpu_funcs_name = { "accumulate_variable_cpu", NULL },
- .cuda_funcs = { accumulate_variable_cuda, NULL },
- .nbuffers = 1,
- }
- \endcode
- and attaches them as reduction methods for its handle <c>dtq</c>:
- \code{.c}
- starpu_variable_data_register(&dtq_handle, -1, NULL, sizeof(type));
- starpu_data_set_reduction_methods(dtq_handle,
- &accumulate_variable_cl, &bzero_variable_cl);
- \endcode
- and <c>dtq_handle</c> can now be used in mode ::STARPU_REDUX for the
- dot products with partitioned vectors:
- \code{.c}
- for (b = 0; b < nblocks; b++)
- starpu_task_insert(&dot_kernel_cl,
- STARPU_REDUX, dtq_handle,
- STARPU_R, starpu_data_get_sub_data(v1, 1, b),
- STARPU_R, starpu_data_get_sub_data(v2, 1, b),
- 0);
- \endcode
- During registration, we have here provided <c>NULL</c>, i.e. there is
- no initial value to be taken into account during reduction. StarPU
- will thus only take into account the contributions from the tasks
- <c>dot_kernel_cl</c>. Also, it will not allocate any memory for
- <c>dtq_handle</c> before tasks <c>dot_kernel_cl</c> are ready to run.
- If another dot product has to be performed, one could unregister
- <c>dtq_handle</c>, and re-register it. But one can also call
- starpu_data_invalidate_submit() with the parameter <c>dtq_handle</c>,
- which will clear all data from the handle, thus resetting it back to
- the initial status <c>register(NULL)</c>.
- The example <c>cg</c> also uses reduction for the blocked gemv kernel,
- leading to yet more relaxed dependencies and more parallelism.
- ::STARPU_REDUX can also be passed to starpu_mpi_task_insert() in the MPI
- case. That will however not produce any MPI communication, but just pass
- ::STARPU_REDUX to the underlying starpu_task_insert(). It is up to the
- application to call starpu_mpi_redux_data(), which posts tasks that will
- reduce the partial results among MPI nodes into the MPI node which owns the
- data. For instance, some hypothetical application which collects partial results
- into data <c>res</c>, then uses it for other computation, before looping again
- with a new reduction:
- \code{.c}
- for (i = 0; i < 100; i++) {
- starpu_mpi_task_insert(MPI_COMM_WORLD, &init_res, STARPU_W, res, 0);
- starpu_mpi_task_insert(MPI_COMM_WORLD, &work, STARPU_RW, A,
- STARPU_R, B, STARPU_REDUX, res, 0);
- starpu_mpi_redux_data(MPI_COMM_WORLD, res);
- starpu_mpi_task_insert(MPI_COMM_WORLD, &work2, STARPU_RW, B, STARPU_R, res, 0);
- }
- \endcode
- \section TemporaryBuffers Temporary Buffers
- There are two kinds of temporary buffers: temporary data which just pass results
- from a task to another, and scratch data which are needed only internally by
- tasks.
- \subsection TemporaryData Temporary Data
- Data can sometimes be entirely produced by a task, and entirely consumed by
- another task, without the need for other parts of the application to access
- it. In such case, registration can be done without prior allocation, by using
- the special memory node number <c>-1</c>, and passing a zero pointer. StarPU will
- actually allocate memory only when the task creating the content gets scheduled,
- and destroy it on unregistration.
- In addition to that, it can be tedious for the application to have to unregister
- the data, since it will not use its content anyway. The unregistration can be
- done lazily by using the function starpu_data_unregister_submit(),
- which will record that no more tasks accessing the handle will be submitted, so
- that it can be freed as soon as the last task accessing it is over.
- The following code examplifies both points: it registers the temporary
- data, submits three tasks accessing it, and records the data for automatic
- unregistration.
- \code{.c}
- starpu_vector_data_register(&handle, -1, 0, n, sizeof(float));
- starpu_task_insert(&produce_data, STARPU_W, handle, 0);
- starpu_task_insert(&compute_data, STARPU_RW, handle, 0);
- starpu_task_insert(&summarize_data, STARPU_R, handle, STARPU_W, result_handle, 0);
- starpu_data_unregister_submit(handle);
- \endcode
- \subsection ScratchData Scratch Data
- Some kernels sometimes need temporary data to achieve the computations, i.e. a
- workspace. The application could allocate it at the start of the codelet
- function, and free it at the end, but that would be costly. It could also
- allocate one buffer per worker (similarly to \ref
- HowToInitializeAComputationLibraryOnceForEachWorker), but that would
- make them systematic and permanent. A more optimized way is to use
- the data access mode ::STARPU_SCRATCH, as examplified below, which
- provides per-worker buffers without content consistency.
- \code{.c}
- starpu_vector_data_register(&workspace, -1, 0, sizeof(float));
- for (i = 0; i < N; i++)
- starpu_task_insert(&compute, STARPU_R, input[i],
- STARPU_SCRATCH, workspace, STARPU_W, output[i], 0);
- \endcode
- StarPU will make sure that the buffer is allocated before executing the task,
- and make this allocation per-worker: for CPU workers, notably, each worker has
- its own buffer. This means that each task submitted above will actually have its
- own workspace, which will actually be the same for all tasks running one after
- the other on the same worker. Also, if for instance GPU memory becomes scarce,
- StarPU will notice that it can free such buffers easily, since the content does
- not matter.
- The example <c>examples/pi</c> uses scratches for some temporary buffer.
- \section TheMultiformatInterface The Multiformat Interface
- It may be interesting to represent the same piece of data using two different
- data structures: one that would only be used on CPUs, and one that would only
- be used on GPUs. This can be done by using the multiformat interface. StarPU
- will be able to convert data from one data structure to the other when needed.
- Note that the scheduler <c>dmda</c> is the only one optimized for this
- interface. The user must provide StarPU with conversion codelets:
- \snippet multiformat.c To be included. You should update doxygen if you see this text.
- Kernels can be written almost as for any other interface. Note that
- ::STARPU_MULTIFORMAT_GET_CPU_PTR shall only be used for CPU kernels. CUDA kernels
- must use ::STARPU_MULTIFORMAT_GET_CUDA_PTR, and OpenCL kernels must use
- ::STARPU_MULTIFORMAT_GET_OPENCL_PTR. ::STARPU_MULTIFORMAT_GET_NX may
- be used in any kind of kernel.
- \code{.c}
- static void
- multiformat_scal_cpu_func(void *buffers[], void *args)
- {
- struct point *aos;
- unsigned int n;
- aos = STARPU_MULTIFORMAT_GET_CPU_PTR(buffers[0]);
- n = STARPU_MULTIFORMAT_GET_NX(buffers[0]);
- ...
- }
- extern "C" void multiformat_scal_cuda_func(void *buffers[], void *_args)
- {
- unsigned int n;
- struct struct_of_arrays *soa;
- soa = (struct struct_of_arrays *) STARPU_MULTIFORMAT_GET_CUDA_PTR(buffers[0]);
- n = STARPU_MULTIFORMAT_GET_NX(buffers[0]);
- ...
- }
- \endcode
- A full example may be found in <c>examples/basic_examples/multiformat.c</c>.
- \section DefiningANewDataInterface Defining A New Data Interface
- Let's define a new data interface to manage complex numbers.
- \code{.c}
- /* interface for complex numbers */
- struct starpu_complex_interface
- {
- double *real;
- double *imaginary;
- int nx;
- };
- \endcode
- Registering such a data to StarPU is easily done using the function
- starpu_data_register(). The last
- parameter of the function, <c>interface_complex_ops</c>, will be
- described below.
- \code{.c}
- void starpu_complex_data_register(starpu_data_handle_t *handle,
- unsigned home_node, double *real, double *imaginary, int nx)
- {
- struct starpu_complex_interface complex =
- {
- .real = real,
- .imaginary = imaginary,
- .nx = nx
- };
- if (interface_complex_ops.interfaceid == STARPU_UNKNOWN_INTERFACE_ID)
- {
- interface_complex_ops.interfaceid = starpu_data_interface_get_next_id();
- }
- starpu_data_register(handleptr, home_node, &complex, &interface_complex_ops);
- }
- \endcode
- Different operations need to be defined for a data interface through
- the type starpu_data_interface_ops. We only define here the basic
- operations needed to run simple applications. The source code for the
- different functions can be found in the file
- <c>examples/interface/complex_interface.c</c>.
- \code{.c}
- static struct starpu_data_interface_ops interface_complex_ops =
- {
- .register_data_handle = complex_register_data_handle,
- .allocate_data_on_node = complex_allocate_data_on_node,
- .copy_methods = &complex_copy_methods,
- .get_size = complex_get_size,
- .footprint = complex_footprint,
- .interfaceid = STARPU_UNKNOWN_INTERFACE_ID,
- .interface_size = sizeof(struct starpu_complex_interface),
- };
- \endcode
- Functions need to be defined to access the different fields of the
- complex interface from a StarPU data handle.
- \code{.c}
- double *starpu_complex_get_real(starpu_data_handle_t handle)
- {
- struct starpu_complex_interface *complex_interface =
- (struct starpu_complex_interface *) starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
- return complex_interface->real;
- }
- double *starpu_complex_get_imaginary(starpu_data_handle_t handle);
- int starpu_complex_get_nx(starpu_data_handle_t handle);
- \endcode
- Similar functions need to be defined to access the different fields of the
- complex interface from a <c>void *</c> pointer to be used within codelet
- implemetations.
- \snippet complex.c To be included. You should update doxygen if you see this text.
- Complex data interfaces can then be registered to StarPU.
- \code{.c}
- double real = 45.0;
- double imaginary = 12.0;starpu_complex_data_register(&handle1, STARPU_MAIN_RAM, &real, &imaginary, 1);
- starpu_task_insert(&cl_display, STARPU_R, handle1, 0);
- \endcode
- and used by codelets.
- \code{.c}
- void display_complex_codelet(void *descr[], __attribute__ ((unused)) void *_args)
- {
- int nx = STARPU_COMPLEX_GET_NX(descr[0]);
- double *real = STARPU_COMPLEX_GET_REAL(descr[0]);
- double *imaginary = STARPU_COMPLEX_GET_IMAGINARY(descr[0]);
- int i;
- for(i=0 ; i<nx ; i++)
- {
- fprintf(stderr, "Complex[%d] = %3.2f + %3.2f i\n", i, real[i], imaginary[i]);
- }
- }
- \endcode
- The whole code for this complex data interface is available in the
- directory <c>examples/interface/</c>.
- */
|