|
@@ -731,7 +731,7 @@ otherwise StarPU will not know how to better group cores.
|
|
|
|
|
|
|
|
Two modes of execution exist to accommodate existing usages.
|
|
Two modes of execution exist to accommodate existing usages.
|
|
|
|
|
|
|
|
-\subsection Fork-mode parallel tasks Fork-mode_parallel_tasks
|
|
|
|
|
|
|
+\subsection Fork-mode_parallel_tasks Fork-mode Parallel Tasks
|
|
|
|
|
|
|
|
In the Fork mode, StarPU will call the codelet function on one
|
|
In the Fork mode, StarPU will call the codelet function on one
|
|
|
of the CPUs of the combined worker. The codelet function can use
|
|
of the CPUs of the combined worker. The codelet function can use
|
|
@@ -746,30 +746,7 @@ the CPU binding mask that StarPU chose.
|
|
|
For instance, using OpenMP (full source is available in
|
|
For instance, using OpenMP (full source is available in
|
|
|
<c>examples/openmp/vector_scal.c</c>):
|
|
<c>examples/openmp/vector_scal.c</c>):
|
|
|
|
|
|
|
|
-\code{.c}
|
|
|
|
|
-void scal_cpu_func(void *buffers[], void *_args)
|
|
|
|
|
-{
|
|
|
|
|
- unsigned i;
|
|
|
|
|
- float *factor = _args;
|
|
|
|
|
- struct starpu_vector_interface *vector = buffers[0];
|
|
|
|
|
- unsigned n = STARPU_VECTOR_GET_NX(vector);
|
|
|
|
|
- float *val = (float *)STARPU_VECTOR_GET_PTR(vector);
|
|
|
|
|
-
|
|
|
|
|
-#pragma omp parallel for num_threads(starpu_combined_worker_get_size())
|
|
|
|
|
- for (i = 0; i < n; i++)
|
|
|
|
|
- val[i] *= *factor;
|
|
|
|
|
-}
|
|
|
|
|
-
|
|
|
|
|
-static struct starpu_codelet cl =
|
|
|
|
|
-{
|
|
|
|
|
- .modes = { STARPU_RW },
|
|
|
|
|
- .where = STARPU_CPU,
|
|
|
|
|
- .type = STARPU_FORKJOIN,
|
|
|
|
|
- .max_parallelism = INT_MAX,
|
|
|
|
|
- .cpu_funcs = {scal_cpu_func, NULL},
|
|
|
|
|
- .nbuffers = 1,
|
|
|
|
|
-};
|
|
|
|
|
-\endcode
|
|
|
|
|
|
|
+\include forkmode.c
|
|
|
|
|
|
|
|
Other examples include for instance calling a BLAS parallel CPU implementation
|
|
Other examples include for instance calling a BLAS parallel CPU implementation
|
|
|
(see <c>examples/mult/xgemm.c</c>).
|
|
(see <c>examples/mult/xgemm.c</c>).
|
|
@@ -886,48 +863,7 @@ will be able to convert data from one data structure to the other when needed.
|
|
|
Note that the dmda scheduler is the only one optimized for this interface. The
|
|
Note that the dmda scheduler is the only one optimized for this interface. The
|
|
|
user must provide StarPU with conversion codelets:
|
|
user must provide StarPU with conversion codelets:
|
|
|
|
|
|
|
|
-\code{.c}
|
|
|
|
|
-#define NX 1024
|
|
|
|
|
-struct point array_of_structs[NX];
|
|
|
|
|
-starpu_data_handle_t handle;
|
|
|
|
|
-
|
|
|
|
|
-/*
|
|
|
|
|
- * The conversion of a piece of data is itself a task, though it is created,
|
|
|
|
|
- * submitted and destroyed by StarPU internals and not by the user. Therefore,
|
|
|
|
|
- * we have to define two codelets.
|
|
|
|
|
- * Note that for now the conversion from the CPU format to the GPU format has to
|
|
|
|
|
- * be executed on the GPU, and the conversion from the GPU to the CPU has to be
|
|
|
|
|
- * executed on the CPU.
|
|
|
|
|
- */
|
|
|
|
|
-#ifdef STARPU_USE_OPENCL
|
|
|
|
|
-void cpu_to_opencl_opencl_func(void *buffers[], void *args);
|
|
|
|
|
-struct starpu_codelet cpu_to_opencl_cl = {
|
|
|
|
|
- .where = STARPU_OPENCL,
|
|
|
|
|
- .opencl_funcs = { cpu_to_opencl_opencl_func, NULL },
|
|
|
|
|
- .nbuffers = 1,
|
|
|
|
|
- .modes = { STARPU_RW }
|
|
|
|
|
-};
|
|
|
|
|
-
|
|
|
|
|
-void opencl_to_cpu_func(void *buffers[], void *args);
|
|
|
|
|
-struct starpu_codelet opencl_to_cpu_cl = {
|
|
|
|
|
- .where = STARPU_CPU,
|
|
|
|
|
- .cpu_funcs = { opencl_to_cpu_func, NULL },
|
|
|
|
|
- .nbuffers = 1,
|
|
|
|
|
- .modes = { STARPU_RW }
|
|
|
|
|
-};
|
|
|
|
|
-#endif
|
|
|
|
|
-
|
|
|
|
|
-struct starpu_multiformat_data_interface_ops format_ops = {
|
|
|
|
|
-#ifdef STARPU_USE_OPENCL
|
|
|
|
|
- .opencl_elemsize = 2 * sizeof(float),
|
|
|
|
|
- .cpu_to_opencl_cl = &cpu_to_opencl_cl,
|
|
|
|
|
- .opencl_to_cpu_cl = &opencl_to_cpu_cl,
|
|
|
|
|
-#endif
|
|
|
|
|
- .cpu_elemsize = 2 * sizeof(float),
|
|
|
|
|
- ...
|
|
|
|
|
-};
|
|
|
|
|
-starpu_multiformat_data_register(handle, 0, &array_of_structs, NX, &format_ops);
|
|
|
|
|
-\endcode
|
|
|
|
|
|
|
+\include multiformat.c
|
|
|
|
|
|
|
|
Kernels can be written almost as for any other interface. Note that
|
|
Kernels can be written almost as for any other interface. Note that
|
|
|
::STARPU_MULTIFORMAT_GET_CPU_PTR shall only be used for CPU kernels. CUDA kernels
|
|
::STARPU_MULTIFORMAT_GET_CPU_PTR shall only be used for CPU kernels. CUDA kernels
|
|
@@ -1150,14 +1086,7 @@ Similar functions need to be defined to access the different fields of the
|
|
|
complex interface from a <c>void *</c> pointer to be used within codelet
|
|
complex interface from a <c>void *</c> pointer to be used within codelet
|
|
|
implementations.
|
|
implementations.
|
|
|
|
|
|
|
|
-\code{.c}
|
|
|
|
|
-#define STARPU_COMPLEX_GET_REAL(interface) \
|
|
|
|
|
- (((struct starpu_complex_interface *)(interface))->real)
|
|
|
|
|
-#define STARPU_COMPLEX_GET_IMAGINARY(interface) \
|
|
|
|
|
- (((struct starpu_complex_interface *)(interface))->imaginary)
|
|
|
|
|
-#define STARPU_COMPLEX_GET_NX(interface) \
|
|
|
|
|
- (((struct starpu_complex_interface *)(interface))->nx)
|
|
|
|
|
-\endcode
|
|
|
|
|
|
|
+\include complex.c
|
|
|
|
|
|
|
|
Complex data interfaces can then be registered to StarPU.
|
|
Complex data interfaces can then be registered to StarPU.
|
|
|
|
|
|