|
@@ -60,64 +60,143 @@ The header starpu.h should be included in any code using StarPU.
|
|
|
|
|
|
\subsection DefiningACodelet Defining A Codelet
|
|
|
|
|
|
+A codelet is a structure that represents a computational kernel. Such a codelet
|
|
|
+may contain an implementation of the same kernel on different architectures
|
|
|
+(e.g. CUDA, x86, ...). For compatibility, make sure that the whole
|
|
|
+structure is properly initialized to zero, either by using the
|
|
|
+function starpu_codelet_init(), or by letting the
|
|
|
+compiler implicitly do it as exemplified above.
|
|
|
+
|
|
|
+The field starpu_codelet::nbuffers specifies the number of data buffers that are
|
|
|
+manipulated by the codelet: here the codelet does not access or modify any data
|
|
|
+that is controlled by our data management library.
|
|
|
+
|
|
|
+We create a codelet which may only be executed on the CPUs. When a CPU
|
|
|
+core executes a codelet, it calls the function
|
|
|
+<c>cpu_func</c>, which \em must have the following prototype:
|
|
|
+
|
|
|
+\code{.c}
|
|
|
+void (*cpu_func)(void *buffers[], void *cl_arg);
|
|
|
+\endcode
|
|
|
+
|
|
|
+In this example, we can ignore the first argument of this function which gives a
|
|
|
+description of the input and output buffers (e.g. the size and the location of
|
|
|
+the matrices) since there is none. We also ignore the second argument
|
|
|
+which is a pointer to optional arguments for the codelet.
|
|
|
+
|
|
|
\code{.c}
|
|
|
-struct params
|
|
|
-{
|
|
|
- int i;
|
|
|
- float f;
|
|
|
-};
|
|
|
void cpu_func(void *buffers[], void *cl_arg)
|
|
|
{
|
|
|
- struct params *params = cl_arg;
|
|
|
-
|
|
|
- printf("Hello world (params = {%i, %f} )\n", params->i, params->f);
|
|
|
+ printf("Hello world\n");
|
|
|
}
|
|
|
|
|
|
struct starpu_codelet cl =
|
|
|
{
|
|
|
- .where = STARPU_CPU,
|
|
|
.cpu_funcs = { cpu_func, NULL },
|
|
|
- .cpu_funcs_name = { "cpu_func", NULL },
|
|
|
.nbuffers = 0
|
|
|
};
|
|
|
\endcode
|
|
|
|
|
|
-A codelet is a structure that represents a computational kernel. Such a codelet
|
|
|
-may contain an implementation of the same kernel on different architectures
|
|
|
-(e.g. CUDA, x86, ...). For compatibility, make sure that the whole
|
|
|
-structure is properly initialized to zero, either by using the
|
|
|
-function starpu_codelet_init(), or by letting the
|
|
|
-compiler implicitly do it as examplified above.
|
|
|
+\subsection SubmittingATask Submitting A Task
|
|
|
|
|
|
-The field starpu_codelet::nbuffers specifies the number of data buffers that are
|
|
|
-manipulated by the codelet: here the codelet does not access or modify any data
|
|
|
-that is controlled by our data management library. Note that the argument
|
|
|
-passed to the codelet (the parameter <c>cl_arg</c> of the function
|
|
|
-<c>cpu_func</c>) does not count as a buffer since it is not managed by
|
|
|
-our data management library, but just contain trivial parameters.
|
|
|
+Before submitting any tasks to StarPU, starpu_init() must be called. The
|
|
|
+<c>NULL</c> argument specifies that we use the default configuration. Tasks cannot
|
|
|
+be submitted after the termination of StarPU by a call to
|
|
|
+starpu_shutdown().
|
|
|
+
|
|
|
+In the example above, a task structure is allocated by a call to
|
|
|
+starpu_task_create(). This function only allocates and fills the
|
|
|
+corresponding structure with the default settings, but it does not
|
|
|
+submit the task to StarPU.
|
|
|
|
|
|
\internal
|
|
|
-TODO need a crossref to the proper description of "where" see bla for more ...
|
|
|
+not really clear ;)
|
|
|
\endinternal
|
|
|
|
|
|
-We create a codelet which may only be executed on the CPUs. The field
|
|
|
-starpu_codelet::where is a bitmask that defines where the codelet may
|
|
|
-be executed. Here, the value ::STARPU_CPU means that only CPUs can
|
|
|
-execute this codelet. Note that field starpu_codelet::where is
|
|
|
-optional, when unset its value is automatically set based on the
|
|
|
-availability of the different fields <c>XXX_funcs</c>.
|
|
|
-When a CPU core executes a codelet, it calls the function
|
|
|
-<c>cpu_func</c>, which \em must have the following prototype:
|
|
|
+The field starpu_task::cl is a pointer to the codelet which the task will
|
|
|
+execute: in other words, the codelet structure describes which computational
|
|
|
+kernel should be offloaded on the different architectures, and the task
|
|
|
+structure is a wrapper containing a codelet and the piece of data on which the
|
|
|
+codelet should operate.
|
|
|
+
|
|
|
+If the field starpu_task::synchronous is non-zero, task submission
|
|
|
+will be synchronous: the function starpu_task_submit() will not return
|
|
|
+until the task has been executed. Note that the function starpu_shutdown()
|
|
|
+does not guarantee that asynchronous tasks have been executed before
|
|
|
+it returns; starpu_task_wait_for_all() can be used to that effect, or
|
|
|
+data can be unregistered (starpu_data_unregister()), which will
|
|
|
+implicitly wait for all the tasks scheduled to work on it, unless
|
|
|
+explicitly disabled thanks to
|
|
|
+starpu_data_set_default_sequential_consistency_flag() or
|
|
|
+starpu_data_set_sequential_consistency_flag().
|
|
|
|
|
|
\code{.c}
|
|
|
-void (*cpu_func)(void *buffers[], void *cl_arg);
|
|
|
+int main(int argc, char **argv)
|
|
|
+{
|
|
|
+ /* initialize StarPU */
|
|
|
+ starpu_init(NULL);
|
|
|
+
|
|
|
+ struct starpu_task *task = starpu_task_create();
|
|
|
+
|
|
|
+ task->cl = &cl; /* Pointer to the codelet defined above */
|
|
|
+
|
|
|
+ /* starpu_task_submit will be a blocking call. If unset,
|
|
|
+ starpu_task_wait() needs to be called after submitting the task. */
|
|
|
+ task->synchronous = 1;
|
|
|
+
|
|
|
+ /* submit the task to StarPU */
|
|
|
+ starpu_task_submit(task);
|
|
|
+
|
|
|
+ /* terminate StarPU */
|
|
|
+ starpu_shutdown();
|
|
|
+
|
|
|
+ return 0;
|
|
|
+}
|
|
|
\endcode
|
|
|
|
|
|
-In this example, we can ignore the first argument of this function which gives a
|
|
|
-description of the input and output buffers (e.g. the size and the location of
|
|
|
-the matrices) since there is none.
|
|
|
-The second argument is a pointer to a buffer passed as an
|
|
|
-argument to the codelet by the means of the field starpu_task::cl_arg.
|
|
|
+\subsection ExecutionOfHelloWorld Execution Of Hello World
|
|
|
+
|
|
|
+\verbatim
|
|
|
+$ make hello_world
|
|
|
+cc $(pkg-config --cflags starpu-1.2) $(pkg-config --libs starpu-1.2) hello_world.c -o hello_world
|
|
|
+$ ./hello_world
|
|
|
+Hello world
|
|
|
+\endverbatim
|
|
|
+
|
|
|
+\subsection PassingArgumentsToTheCodelet Passing Arguments To The Codelet
|
|
|
+
|
|
|
+The optional field starpu_task::cl_arg is a pointer to a buffer
|
|
|
+(of size starpu_task::cl_arg_size) with some parameters for the kernel
|
|
|
+described by the codelet. For instance, if a codelet implements a
|
|
|
+computational kernel that multiplies its input vector by a constant,
|
|
|
+the constant could be specified by the means of this buffer, instead
|
|
|
+of registering it as a StarPU data. It must however be noted that
|
|
|
+StarPU avoids making a copy whenever possible and rather passes the
|
|
|
+pointer as such, so the buffer which is pointed at must be kept allocated
|
|
|
+until the task terminates, and if several tasks are submitted with
|
|
|
+various parameters, each of them must be given a pointer to their
|
|
|
+own buffer.
|
|
|
+
|
|
|
+\code{.c}
|
|
|
+struct params
|
|
|
+{
|
|
|
+ int i;
|
|
|
+ float f;
|
|
|
+};
|
|
|
+
|
|
|
+void cpu_func(void *buffers[], void *cl_arg)
|
|
|
+{
|
|
|
+ struct params *params = cl_arg;
|
|
|
+
|
|
|
+ printf("Hello world (params = {%i, %f} )\n", params->i, params->f);
|
|
|
+}
|
|
|
+\endcode
|
|
|
+
|
|
|
+As said before, the field starpu_codelet::nbuffers specifies the
|
|
|
+number of data buffers that are manipulated by the codelet. It does
|
|
|
+not count the argument --- the parameter <c>cl_arg</c> of the function
|
|
|
+<c>cpu_func</c> --- since it is not managed by our data management
|
|
|
+library, but just contains trivial parameters.
|
|
|
|
|
|
\internal
|
|
|
TODO rewrite so that it is a little clearer ?
|
|
@@ -130,14 +209,7 @@ buffer will be modified as well: this for instance implies that the buffer
|
|
|
cannot be used as a synchronization medium. If synchronization is needed, data
|
|
|
has to be registered to StarPU, see \ref VectorScalingUsingStarPUAPI.
|
|
|
|
|
|
-\subsection SubmittingATask Submitting A Task
|
|
|
-
|
|
|
\code{.c}
|
|
|
-void callback_func(void *callback_arg)
|
|
|
-{
|
|
|
- printf("Callback function (arg %x)\n", callback_arg);
|
|
|
-}
|
|
|
-
|
|
|
int main(int argc, char **argv)
|
|
|
{
|
|
|
/* initialize StarPU */
|
|
@@ -151,9 +223,6 @@ int main(int argc, char **argv)
|
|
|
task->cl_arg = &params;
|
|
|
task->cl_arg_size = sizeof(params);
|
|
|
|
|
|
- task->callback_func = callback_func;
|
|
|
- task->callback_arg = 0x42;
|
|
|
-
|
|
|
/* starpu_task_submit will be a blocking call */
|
|
|
task->synchronous = 1;
|
|
|
|
|
@@ -167,37 +236,14 @@ int main(int argc, char **argv)
|
|
|
}
|
|
|
\endcode
|
|
|
|
|
|
-Before submitting any tasks to StarPU, starpu_init() must be called. The
|
|
|
-<c>NULL</c> argument specifies that we use the default configuration. Tasks cannot
|
|
|
-be submitted after the termination of StarPU by a call to
|
|
|
-starpu_shutdown().
|
|
|
-
|
|
|
-In the example above, a task structure is allocated by a call to
|
|
|
-starpu_task_create(). This function only allocates and fills the
|
|
|
-corresponding structure with the default settings, but it does not
|
|
|
-submit the task to StarPU.
|
|
|
-
|
|
|
-\internal
|
|
|
-not really clear ;)
|
|
|
-\endinternal
|
|
|
+\verbatim
|
|
|
+$ make hello_world
|
|
|
+cc $(pkg-config --cflags starpu-1.2) $(pkg-config --libs starpu-1.2) hello_world.c -o hello_world
|
|
|
+$ ./hello_world
|
|
|
+Hello world (params = {1, 2.000000} )
|
|
|
+\endverbatim
|
|
|
|
|
|
-The field starpu_task::cl is a pointer to the codelet which the task will
|
|
|
-execute: in other words, the codelet structure describes which computational
|
|
|
-kernel should be offloaded on the different architectures, and the task
|
|
|
-structure is a wrapper containing a codelet and the piece of data on which the
|
|
|
-codelet should operate.
|
|
|
-
|
|
|
-The optional field starpu_task::cl_arg field is a pointer to a buffer
|
|
|
-(of size starpu_task::cl_arg_size) with some parameters for the kernel
|
|
|
-described by the codelet. For instance, if a codelet implements a
|
|
|
-computational kernel that multiplies its input vector by a constant,
|
|
|
-the constant could be specified by the means of this buffer, instead
|
|
|
-of registering it as a StarPU data. It must however be noted that
|
|
|
-StarPU avoids making copy whenever possible and rather passes the
|
|
|
-pointer as such, so the buffer which is pointed at must be kept allocated
|
|
|
-until the task terminates, and if several tasks are submitted with
|
|
|
-various parameters, each of them must be given a pointer to their
|
|
|
-own buffer.
|
|
|
+\subsection DefiningACallback Defining A Callback
|
|
|
|
|
|
Once a task has been executed, an optional callback function
|
|
|
starpu_task::callback_func is called when defined.
|
|
@@ -210,27 +256,66 @@ function. The prototype of a callback function must be:
|
|
|
void (*callback_function)(void *);
|
|
|
\endcode
|
|
|
|
|
|
-If the field starpu_task::synchronous is non-zero, task submission
|
|
|
-will be synchronous: the function starpu_task_submit() will not return
|
|
|
-until the task has been executed. Note that the function starpu_shutdown()
|
|
|
-does not guarantee that asynchronous tasks have been executed before
|
|
|
-it returns, starpu_task_wait_for_all() can be used to that effect, or
|
|
|
-data can be unregistered (starpu_data_unregister()), which will
|
|
|
-implicitly wait for all the tasks scheduled to work on it, unless
|
|
|
-explicitly disabled thanks to
|
|
|
-starpu_data_set_default_sequential_consistency_flag() or
|
|
|
-starpu_data_set_sequential_consistency_flag().
|
|
|
+\code{.c}
|
|
|
+void callback_func(void *callback_arg)
|
|
|
+{
|
|
|
+ printf("Callback function (arg %x)\n", callback_arg);
|
|
|
+}
|
|
|
|
|
|
-\subsection ExecutionOfHelloWorld Execution Of Hello World
|
|
|
+int main(int argc, char **argv)
|
|
|
+{
|
|
|
+ /* initialize StarPU */
|
|
|
+ starpu_init(NULL);
|
|
|
+
|
|
|
+ struct starpu_task *task = starpu_task_create();
|
|
|
+
|
|
|
+ task->cl = &cl; /* Pointer to the codelet defined above */
|
|
|
+
|
|
|
+ task->callback_func = callback_func;
|
|
|
+ task->callback_arg = 0x42;
|
|
|
+
|
|
|
+ /* starpu_task_submit will be a blocking call */
|
|
|
+ task->synchronous = 1;
|
|
|
+
|
|
|
+ /* submit the task to StarPU */
|
|
|
+ starpu_task_submit(task);
|
|
|
+
|
|
|
+ /* terminate StarPU */
|
|
|
+ starpu_shutdown();
|
|
|
+
|
|
|
+ return 0;
|
|
|
+}
|
|
|
+\endcode
|
|
|
|
|
|
\verbatim
|
|
|
$ make hello_world
|
|
|
cc $(pkg-config --cflags starpu-1.2) $(pkg-config --libs starpu-1.2) hello_world.c -o hello_world
|
|
|
$ ./hello_world
|
|
|
-Hello world (params = {1, 2.000000} )
|
|
|
+Hello world
|
|
|
Callback function (arg 42)
|
|
|
\endverbatim
|
|
|
|
|
|
+\subsection WhereToExecuteACodelet Where To Execute A Codelet
|
|
|
+
|
|
|
+\code{.c}
|
|
|
+struct starpu_codelet cl =
|
|
|
+{
|
|
|
+ .where = STARPU_CPU,
|
|
|
+ .cpu_funcs = { cpu_func, NULL },
|
|
|
+ .cpu_funcs_name = { "cpu_func", NULL },
|
|
|
+ .nbuffers = 0
|
|
|
+};
|
|
|
+\endcode
|
|
|
+
|
|
|
+We create a codelet which may only be executed on the CPUs. The
|
|
|
+optional field starpu_codelet::where is a bitmask that defines where
|
|
|
+the codelet may be executed. Here, the value ::STARPU_CPU means that
|
|
|
+only CPUs can execute this codelet. When the optional field
|
|
|
+starpu_codelet::where is unset, its value is automatically set based
|
|
|
+on the availability of the different fields <c>XXX_funcs</c>.
|
|
|
+
|
|
|
+TODO: explain starpu_codelet::cpu_funcs_name
|
|
|
+
|
|
|
\section VectorScalingUsingTheCExtension Vector Scaling Using the C Extension
|
|
|
|
|
|
The previous example has shown how to submit tasks. In this section,
|
|
@@ -444,14 +529,14 @@ The following lines show how to declare an array of <c>NX</c> elements of type
|
|
|
float vector[NX];
|
|
|
|
|
|
starpu_data_handle_t vector_handle;
|
|
|
-starpu_vector_data_register(&vector_handle, 0, (uintptr_t)vector, NX,
|
|
|
+starpu_vector_data_register(&vector_handle, STARPU_MAIN_RAM, (uintptr_t)vector, NX,
|
|
|
sizeof(vector[0]));
|
|
|
\endcode
|
|
|
|
|
|
The first argument, called the <b>data handle</b>, is an opaque pointer which
|
|
|
designates the array in StarPU. This is also the structure which is used to
|
|
|
describe which data is used by a task. The second argument is the node number
|
|
|
-where the data originally resides. Here it is 0 since the array <c>vector</c> is in
|
|
|
+where the data originally resides. Here it is STARPU_MAIN_RAM since the array <c>vector</c> is in
|
|
|
the main memory. Then comes the pointer <c>vector</c> where the data can be found in main memory,
|
|
|
the number of elements in the vector and the size of each element.
|
|
|
The following shows how to construct a StarPU task that will manipulate the
|