|
@@ -1139,15 +1139,14 @@ typically need to be explicitly casted. Using the
|
|
|
@code{void starpu_tag_declare_deps(starpu_tag_t id, unsigned ndeps, ...);}
|
|
|
|
|
|
@item @emph{Example}:
|
|
|
+@cartouche
|
|
|
@example
|
|
|
-@c @cartouche
|
|
|
/* Tag 0x1 depends on tags 0x32 and 0x52 */
|
|
|
starpu_tag_declare_deps((starpu_tag_t)0x1,
|
|
|
2, (starpu_tag_t)0x32, (starpu_tag_t)0x52);
|
|
|
|
|
|
-@c @end cartouche
|
|
|
@end example
|
|
|
-
|
|
|
+@end cartouche
|
|
|
|
|
|
@end table
|
|
|
|
|
@@ -1161,11 +1160,13 @@ does not take a variable number of arguments but an array of tags of size
|
|
|
@item @emph{Prototype}:
|
|
|
@code{void starpu_tag_declare_deps_array(starpu_tag_t id, unsigned ndeps, starpu_tag_t *array);}
|
|
|
@item @emph{Example}:
|
|
|
+@cartouche
|
|
|
@example
|
|
|
/* Tag 0x1 depends on tags 0x32 and 0x52 */
|
|
|
starpu_tag_t tag_array[2] = @{0x32, 0x52@};
|
|
|
starpu_tag_declare_deps_array((starpu_tag_t)0x1, 2, tag_array);
|
|
|
@end example
|
|
|
+@end cartouche
|
|
|
|
|
|
|
|
|
@end table
|
|
@@ -1291,13 +1292,13 @@ enabled by default. To enable OpenCL, you need either to disable CUDA
|
|
|
when configuring StarPU:
|
|
|
|
|
|
@example
|
|
|
-./configure --disable-cuda
|
|
|
+$ ./configure --disable-cuda
|
|
|
@end example
|
|
|
|
|
|
or when running applications:
|
|
|
|
|
|
@example
|
|
|
-STARPU_NCUDA=0 ./application
|
|
|
+$ STARPU_NCUDA=0 ./application
|
|
|
@end example
|
|
|
|
|
|
OpenCL will automatically be started on any device not yet used by
|
|
@@ -1306,7 +1307,7 @@ enable CUDA on 2 devices, and OpenCL on the 2 other devices by doing
|
|
|
so:
|
|
|
|
|
|
@example
|
|
|
-STARPU_NCUDA=2 ./application
|
|
|
+$ STARPU_NCUDA=2 ./application
|
|
|
@end example
|
|
|
|
|
|
@node Compiling OpenCL codelets
|
|
@@ -1367,12 +1368,12 @@ instance.
|
|
|
The Makefile could for instance contain the following lines to define which
|
|
|
options must be given to the compiler and to the linker:
|
|
|
|
|
|
+@cartouche
|
|
|
@example
|
|
|
-@c @cartouche
|
|
|
CFLAGS+=$$(pkg-config --cflags libstarpu)
|
|
|
LIBS+=$$(pkg-config --libs libstarpu)
|
|
|
-@c @end cartouche
|
|
|
@end example
|
|
|
+@end cartouche
|
|
|
|
|
|
@node Hello World
|
|
|
@section Hello World
|
|
@@ -1383,17 +1384,17 @@ In this section, we show how to implement a simple program that submits a task t
|
|
|
|
|
|
The @code{starpu.h} header should be included in any code using StarPU.
|
|
|
|
|
|
-@example
|
|
|
-@c @cartouche
|
|
|
+@cartouche
|
|
|
+@example
|
|
|
#include <starpu.h>
|
|
|
-@c @end cartouche
|
|
|
@end example
|
|
|
+@end cartouche
|
|
|
|
|
|
|
|
|
@subsection Defining a Codelet
|
|
|
|
|
|
+@cartouche
|
|
|
@example
|
|
|
-@c @cartouche
|
|
|
void cpu_func(void *buffers[], void *cl_arg)
|
|
|
@{
|
|
|
float *array = cl_arg;
|
|
@@ -1407,8 +1408,8 @@ starpu_codelet cl =
|
|
|
.cpu_func = cpu_func,
|
|
|
.nbuffers = 0
|
|
|
@};
|
|
|
-@c @end cartouche
|
|
|
@end example
|
|
|
+@end cartouche
|
|
|
|
|
|
A codelet is a structure that represents a computational kernel. Such a codelet
|
|
|
may contain an implementation of the same kernel on different architectures
|
|
@@ -1446,8 +1447,8 @@ cannot be used as a synchronization medium.
|
|
|
|
|
|
@subsection Submitting a Task
|
|
|
|
|
|
+@cartouche
|
|
|
@example
|
|
|
-@c @cartouche
|
|
|
void callback_func(void *callback_arg)
|
|
|
@{
|
|
|
printf("Callback function (arg %x)\n", callback_arg);
|
|
@@ -1461,7 +1462,7 @@ int main(int argc, char **argv)
|
|
|
struct starpu_task *task = starpu_task_create();
|
|
|
|
|
|
task->cl = &cl;
|
|
|
-
|
|
|
+
|
|
|
float *array[2] = @{1.0f, -1.0f@};
|
|
|
task->cl_arg = &array;
|
|
|
task->cl_arg_size = 2*sizeof(float);
|
|
@@ -1480,8 +1481,8 @@ int main(int argc, char **argv)
|
|
|
|
|
|
return 0;
|
|
|
@}
|
|
|
-@c @end cartouche
|
|
|
@end example
|
|
|
+@end cartouche
|
|
|
|
|
|
Before submitting any tasks to StarPU, @code{starpu_init} must be called. The
|
|
|
@code{NULL} argument specifies that we use default configuration. Tasks cannot
|
|
@@ -1511,9 +1512,12 @@ While the computational kernel could be offloaded on various architectures, the
|
|
|
callback function is always executed on a CPU. The @code{callback_arg}
|
|
|
pointer is passed as an argument of the callback. The prototype of a callback
|
|
|
function must be:
|
|
|
+
|
|
|
+@cartouche
|
|
|
@example
|
|
|
void (*callback_function)(void *);
|
|
|
@end example
|
|
|
+@end cartouche
|
|
|
|
|
|
If the @code{synchronous} field is non-null, task submission will be
|
|
|
synchronous: the @code{starpu_task_submit} function will not return until the
|
|
@@ -1542,12 +1546,15 @@ here we will consider the @b{vector interface}.
|
|
|
|
|
|
The following lines show how to declare an array of @code{n} elements of type
|
|
|
@code{float} using the vector interface:
|
|
|
+
|
|
|
+@cartouche
|
|
|
@example
|
|
|
float tab[n];
|
|
|
|
|
|
starpu_data_handle tab_handle;
|
|
|
starpu_vector_data_register(&tab_handle, 0, tab, n, sizeof(float));
|
|
|
@end example
|
|
|
+@end cartouche
|
|
|
|
|
|
The first argument, called the @b{data handle}, is an opaque pointer which
|
|
|
designates the array in StarPU. This is also the structure which is used to
|
|
@@ -1557,6 +1564,8 @@ the main memory. Then comes the pointer @code{tab} where the data can be found,
|
|
|
the number of elements in the vector and the size of each element.
|
|
|
It is possible to construct a StarPU
|
|
|
task that multiplies this vector by a constant factor:
|
|
|
+
|
|
|
+@cartouche
|
|
|
@example
|
|
|
float factor;
|
|
|
struct starpu_task *task = starpu_task_create();
|
|
@@ -1569,6 +1578,7 @@ task->buffers[0].mode = STARPU_RW;
|
|
|
task->cl_arg = &factor;
|
|
|
task->cl_arg_size = sizeof(float);
|
|
|
@end example
|
|
|
+@end cartouche
|
|
|
|
|
|
Since the factor is constant, it does not need a preliminary declaration, and
|
|
|
can just be passed through the @code{cl_arg} pointer like in the previous
|
|
@@ -1580,6 +1590,7 @@ write-only and @code{STARPU_RW} for read and write access).
|
|
|
|
|
|
The definition of the codelet can be written as follows:
|
|
|
|
|
|
+@cartouche
|
|
|
@example
|
|
|
void scal_func(void *buffers[], void *cl_arg)
|
|
|
@{
|
|
@@ -1603,7 +1614,7 @@ starpu_codelet cl = @{
|
|
|
.nbuffers = 1
|
|
|
@};
|
|
|
@end example
|
|
|
-
|
|
|
+@end cartouche
|
|
|
|
|
|
The second argument of the @code{scal_func} function contains a pointer to the
|
|
|
parameters of the codelet (given in @code{task->cl_arg}), so that we read the
|
|
@@ -1635,10 +1646,12 @@ The CUDA implementation can be written as follows. It needs to be
|
|
|
compiled with a CUDA compiler such as nvcc, the NVIDIA CUDA compiler
|
|
|
driver.
|
|
|
|
|
|
+@cartouche
|
|
|
@example
|
|
|
#include <starpu.h>
|
|
|
|
|
|
-static __global__ void vector_mult_cuda(float *vector, int nx, float *multiplier)
|
|
|
+static __global__ void vector_mult_cuda(float *vector, int nx,
|
|
|
+ float *multiplier)
|
|
|
@{
|
|
|
int i;
|
|
|
for(i=0 ; i<nx ; i++) vector[i] *= *multiplier;
|
|
@@ -1653,8 +1666,11 @@ extern "C" void cuda_codelet(void *descr[], void *_args)
|
|
|
vector_mult_cuda<<<1,1>>>(vector, nx, multiplier);
|
|
|
@}
|
|
|
@end example
|
|
|
+@end cartouche
|
|
|
|
|
|
The CPU implementation can be written as follows.
|
|
|
+
|
|
|
+@cartouche
|
|
|
@example
|
|
|
#include <starpu.h>
|
|
|
|
|
@@ -1668,6 +1684,7 @@ void cpu_codelet(void *descr[], void *_args)
|
|
|
for(i=0 ; i<nx ; i++) vector[i] *= *multiplier;
|
|
|
@}
|
|
|
@end example
|
|
|
+@end cartouche
|
|
|
|
|
|
Here is the source of the application. You can notice the value of the
|
|
|
field @code{where} for the codelet. We specify
|
|
@@ -1761,7 +1778,8 @@ $ PKG_CONFIG_PATH=$STARPU_DIR/lib/pkgconfig:$PKG_CONFIG_PATH
|
|
|
$ LD_LIBRARY_PATH=$STARPU_DIR/lib:$LD_LIBRARY_PATH
|
|
|
@end example
|
|
|
|
|
|
-It is then possible the application using the following makefile:
|
|
|
+It is then possible to compile the application using the following
|
|
|
+makefile:
|
|
|
|
|
|
@cartouche
|
|
|
@example
|