|
@@ -2,7 +2,7 @@
|
|
|
|
|
|
@c This file is part of the StarPU Handbook.
|
|
|
@c Copyright (C) 2009--2011 Universit@'e de Bordeaux 1
|
|
|
-@c Copyright (C) 2010, 2011, 2012 Centre National de la Recherche Scientifique
|
|
|
+@c Copyright (C) 2010, 2011, 2012, 2013 Centre National de la Recherche Scientifique
|
|
|
@c Copyright (C) 2011 Institut National de Recherche en Informatique et Automatique
|
|
|
@c See the file starpu.texi for copying conditions.
|
|
|
|
|
@@ -201,7 +201,7 @@ for (worker = 0; worker < starpu_worker_get_count(); worker++)
|
|
|
|
|
|
float executing_ratio = 100.0*executing_time/total_time;
|
|
|
float sleeping_ratio = 100.0*sleeping_time/total_time;
|
|
|
- float overhead_ratio = 100.0 - executing_ratio - sleeping_ratio;
|
|
|
+ float overhead_ratio = 100.0 - executing_ratio - sleeping_ratio;
|
|
|
|
|
|
char workername[128];
|
|
|
starpu_worker_get_name(worker, workername, 128);
|
|
@@ -492,83 +492,12 @@ transfers, which are assumed to be completely overlapped.
|
|
|
@section Insert Task Utility
|
|
|
|
|
|
StarPU provides the wrapper function @code{starpu_insert_task} to ease
|
|
|
-the creation and submission of tasks.
|
|
|
-
|
|
|
-@deftypefun int starpu_insert_task (struct starpu_codelet *@var{cl}, ...)
|
|
|
-Create and submit a task corresponding to @var{cl} with the following
|
|
|
-arguments. The argument list must be zero-terminated.
|
|
|
-
|
|
|
-The arguments following the codelets can be of the following types:
|
|
|
-
|
|
|
-@itemize
|
|
|
-@item
|
|
|
-@code{STARPU_R}, @code{STARPU_W}, @code{STARPU_RW}, @code{STARPU_SCRATCH}, @code{STARPU_REDUX} an access mode followed by a data handle;
|
|
|
-@item
|
|
|
-@code{STARPU_DATA_ARRAY} followed by an array of data handles and its number of elements;
|
|
|
-@item
|
|
|
-the specific values @code{STARPU_VALUE}, @code{STARPU_CALLBACK},
|
|
|
-@code{STARPU_CALLBACK_ARG}, @code{STARPU_CALLBACK_WITH_ARG},
|
|
|
-@code{STARPU_PRIORITY}, @code{STARPU_TAG}, followed by the appropriated objects
|
|
|
-as defined below.
|
|
|
-@end itemize
|
|
|
-
|
|
|
-When using @code{STARPU_DATA_ARRAY}, the access mode of the data
|
|
|
-handles is not defined.
|
|
|
-
|
|
|
-Parameters to be passed to the codelet implementation are defined
|
|
|
-through the type @code{STARPU_VALUE}. The function
|
|
|
-@code{starpu_codelet_unpack_args} must be called within the codelet
|
|
|
-implementation to retrieve them.
|
|
|
-@end deftypefun
|
|
|
-
|
|
|
-@defmac STARPU_VALUE
|
|
|
-this macro is used when calling @code{starpu_insert_task}, and must be
|
|
|
-followed by a pointer to a constant value and the size of the constant
|
|
|
-@end defmac
|
|
|
-
|
|
|
-@defmac STARPU_CALLBACK
|
|
|
-this macro is used when calling @code{starpu_insert_task}, and must be
|
|
|
-followed by a pointer to a callback function
|
|
|
-@end defmac
|
|
|
-
|
|
|
-@defmac STARPU_CALLBACK_ARG
|
|
|
-this macro is used when calling @code{starpu_insert_task}, and must be
|
|
|
-followed by a pointer to be given as an argument to the callback
|
|
|
-function
|
|
|
-@end defmac
|
|
|
-
|
|
|
-@defmac STARPU_CALLBACK_WITH_ARG
|
|
|
-this macro is used when calling @code{starpu_insert_task}, and must be
|
|
|
-followed by two pointers: one to a callback function, and the other to
|
|
|
-be given as an argument to the callback function; this is equivalent
|
|
|
-to using both @code{STARPU_CALLBACK} and
|
|
|
-@code{STARPU_CALLBACK_WITH_ARG}
|
|
|
-@end defmac
|
|
|
-
|
|
|
-@defmac STARPU_PRIORITY
|
|
|
-this macro is used when calling @code{starpu_insert_task}, and must be
|
|
|
-followed by a integer defining a priority level
|
|
|
-@end defmac
|
|
|
-
|
|
|
-@defmac STARPU_TAG
|
|
|
-this macro is used when calling @code{starpu_insert_task}, and must be
|
|
|
-followed by a tag.
|
|
|
-@end defmac
|
|
|
-
|
|
|
-@deftypefun void starpu_codelet_pack_args ({char **}@var{arg_buffer}, {size_t *}@var{arg_buffer_size}, ...)
|
|
|
-Pack arguments of type @code{STARPU_VALUE} into a buffer which can be
|
|
|
-given to a codelet and later unpacked with the function
|
|
|
-@code{starpu_codelet_unpack_args} defined below.
|
|
|
-@end deftypefun
|
|
|
-
|
|
|
-@deftypefun void starpu_codelet_unpack_args ({void *}@var{cl_arg}, ...)
|
|
|
-Retrieve the arguments of type @code{STARPU_VALUE} associated to a
|
|
|
-task automatically created using the function
|
|
|
-@code{starpu_insert_task} defined above.
|
|
|
-@end deftypefun
|
|
|
+the creation and submission of tasks. See the definition of the
|
|
|
+functions in @ref{Insert Task}.
|
|
|
|
|
|
Here is the implementation of the codelet:
|
|
|
|
|
|
+@cartouche
|
|
|
@smallexample
|
|
|
void func_cpu(void *descr[], void *_args)
|
|
|
@{
|
|
@@ -589,9 +518,11 @@ struct starpu_codelet mycodelet = @{
|
|
|
.modes = @{ STARPU_RW, STARPU_RW @}
|
|
|
@};
|
|
|
@end smallexample
|
|
|
+@end cartouche
|
|
|
|
|
|
And the call to the @code{starpu_insert_task} wrapper:
|
|
|
|
|
|
+@cartouche
|
|
|
@smallexample
|
|
|
starpu_insert_task(&mycodelet,
|
|
|
STARPU_VALUE, &ifactor, sizeof(ifactor),
|
|
@@ -599,10 +530,12 @@ starpu_insert_task(&mycodelet,
|
|
|
STARPU_RW, data_handles[0], STARPU_RW, data_handles[1],
|
|
|
0);
|
|
|
@end smallexample
|
|
|
+@end cartouche
|
|
|
|
|
|
The call to @code{starpu_insert_task} is equivalent to the following
|
|
|
code:
|
|
|
|
|
|
+@cartouche
|
|
|
@smallexample
|
|
|
struct starpu_task *task = starpu_task_create();
|
|
|
task->cl = &mycodelet;
|
|
@@ -618,9 +551,11 @@ task->cl_arg = arg_buffer;
|
|
|
task->cl_arg_size = arg_buffer_size;
|
|
|
int ret = starpu_task_submit(task);
|
|
|
@end smallexample
|
|
|
+@end cartouche
|
|
|
|
|
|
Here is a similar call using @code{STARPU_DATA_ARRAY}.
|
|
|
|
|
|
+@cartouche
|
|
|
@smallexample
|
|
|
starpu_insert_task(&mycodelet,
|
|
|
STARPU_DATA_ARRAY, data_handles, 2,
|
|
@@ -628,12 +563,14 @@ starpu_insert_task(&mycodelet,
|
|
|
STARPU_VALUE, &ffactor, sizeof(ffactor),
|
|
|
0);
|
|
|
@end smallexample
|
|
|
+@end cartouche
|
|
|
|
|
|
If some part of the task insertion depends on the value of some computation,
|
|
|
the @code{STARPU_DATA_ACQUIRE_CB} macro can be very convenient. For
|
|
|
instance, assuming that the index variable @code{i} was registered as handle
|
|
|
@code{i_handle}:
|
|
|
|
|
|
+@cartouche
|
|
|
@smallexample
|
|
|
/* Compute which portion we will work on, e.g. pivot */
|
|
|
starpu_insert_task(&which_index, STARPU_W, i_handle, 0);
|
|
@@ -642,6 +579,7 @@ starpu_insert_task(&which_index, STARPU_W, i_handle, 0);
|
|
|
STARPU_DATA_ACQUIRE_CB(i_handle, STARPU_R,
|
|
|
starpu_insert_task(&work, STARPU_RW, A_handle[i], 0));
|
|
|
@end smallexample
|
|
|
+@end cartouche
|
|
|
|
|
|
The @code{STARPU_DATA_ACQUIRE_CB} macro submits an asynchronous request for
|
|
|
acquiring data @code{i} for the main application, and will execute the code
|
|
@@ -674,6 +612,7 @@ buffers, and how to assemble partial results.
|
|
|
For instance, @code{cg} uses that to optimize its dot product: it first defines
|
|
|
the codelets for initialization and reduction:
|
|
|
|
|
|
+@cartouche
|
|
|
@smallexample
|
|
|
struct starpu_codelet bzero_variable_cl =
|
|
|
@{
|
|
@@ -704,17 +643,21 @@ struct starpu_codelet accumulate_variable_cl =
|
|
|
.nbuffers = 1,
|
|
|
@}
|
|
|
@end smallexample
|
|
|
+@end cartouche
|
|
|
|
|
|
and attaches them as reduction methods for its dtq handle:
|
|
|
|
|
|
+@cartouche
|
|
|
@smallexample
|
|
|
starpu_data_set_reduction_methods(dtq_handle,
|
|
|
&accumulate_variable_cl, &bzero_variable_cl);
|
|
|
@end smallexample
|
|
|
+@end cartouche
|
|
|
|
|
|
-and dtq_handle can now be used in @code{STARPU_REDUX} mode for the dot products
|
|
|
+and @code{dtq_handle} can now be used in @code{STARPU_REDUX} mode for the dot products
|
|
|
with partitioned vectors:
|
|
|
|
|
|
+@cartouche
|
|
|
@smallexample
|
|
|
int dots(starpu_data_handle_t v1, starpu_data_handle_t v2,
|
|
|
starpu_data_handle_t s, unsigned nblocks)
|
|
@@ -728,6 +671,7 @@ int dots(starpu_data_handle_t v1, starpu_data_handle_t v2,
|
|
|
0);
|
|
|
@}
|
|
|
@end smallexample
|
|
|
+@end cartouche
|
|
|
|
|
|
The @code{cg} example also uses reduction for the blocked gemv kernel, leading
|
|
|
to yet more relaxed dependencies and more parallelism.
|
|
@@ -741,16 +685,17 @@ data. For instance, some hypothetical application which collects partial results
|
|
|
into data @code{res}, then uses it for other computation, before looping again
|
|
|
with a new reduction:
|
|
|
|
|
|
+@cartouche
|
|
|
@smallexample
|
|
|
-@{
|
|
|
- for (i = 0; i < 100; i++) @{
|
|
|
- starpu_mpi_insert_task(MPI_COMM_WORLD, &init_res, STARPU_W, res, 0);
|
|
|
- starpu_mpi_insert_task(MPI_COMM_WORLD, &work, STARPU_RW, A, STARPU_R, B, STARPU_REDUX, res, 0);
|
|
|
- starpu_mpi_redux_data(MPI_COMM_WORLD, res);
|
|
|
- starpu_mpi_insert_task(MPI_COMM_WORLD, &work2, STARPU_RW, B, STARPU_R, res, 0);
|
|
|
- @}
|
|
|
+for (i = 0; i < 100; i++) @{
|
|
|
+ starpu_mpi_insert_task(MPI_COMM_WORLD, &init_res, STARPU_W, res, 0);
|
|
|
+ starpu_mpi_insert_task(MPI_COMM_WORLD, &work, STARPU_RW, A,
|
|
|
+ STARPU_R, B, STARPU_REDUX, res, 0);
|
|
|
+ starpu_mpi_redux_data(MPI_COMM_WORLD, res);
|
|
|
+ starpu_mpi_insert_task(MPI_COMM_WORLD, &work2, STARPU_RW, B, STARPU_R, res, 0);
|
|
|
@}
|
|
|
@end smallexample
|
|
|
+@end cartouche
|
|
|
|
|
|
@node Temporary buffers
|
|
|
@section Temporary buffers
|
|
@@ -778,6 +723,7 @@ The following code examplifies both points: it registers the temporary
|
|
|
data, submits three tasks accessing it, and records the data for automatic
|
|
|
unregistration.
|
|
|
|
|
|
+@cartouche
|
|
|
@smallexample
|
|
|
starpu_vector_data_register(&handle, -1, 0, n, sizeof(float));
|
|
|
starpu_insert_task(&produce_data, STARPU_W, handle, 0);
|
|
@@ -785,6 +731,7 @@ starpu_insert_task(&compute_data, STARPU_RW, handle, 0);
|
|
|
starpu_insert_task(&summarize_data, STARPU_R, handle, STARPU_W, result_handle, 0);
|
|
|
starpu_data_unregister_submit(handle);
|
|
|
@end smallexample
|
|
|
+@end cartouche
|
|
|
|
|
|
@subsection Scratch data
|
|
|
|
|
@@ -796,12 +743,14 @@ initialization}), but that would make them systematic and permanent. A more
|
|
|
optimized way is to use the SCRATCH data access mode, as exemplified below,
|
|
|
which provides per-worker buffers without content consistency.
|
|
|
|
|
|
+@cartouche
|
|
|
@smallexample
|
|
|
starpu_vector_data_register(&workspace, -1, 0, sizeof(float));
|
|
|
for (i = 0; i < N; i++)
|
|
|
starpu_insert_task(&compute, STARPU_R, input[i],
|
|
|
STARPU_SCRATCH, workspace, STARPU_W, output[i], 0);
|
|
|
@end smallexample
|
|
|
+@end cartouche
|
|
|
|
|
|
StarPU will make sure that the buffer is allocated before executing the task,
|
|
|
and make this allocation per-worker: for CPU workers, notably, each worker has
|
|
@@ -841,7 +790,8 @@ the CPU binding mask that StarPU chose.
|
|
|
For instance, using OpenMP (full source is available in
|
|
|
@code{examples/openmp/vector_scal.c}):
|
|
|
|
|
|
-@example
|
|
|
+@cartouche
|
|
|
+@smallexample
|
|
|
void scal_cpu_func(void *buffers[], void *_args)
|
|
|
@{
|
|
|
unsigned i;
|
|
@@ -864,7 +814,8 @@ static struct starpu_codelet cl =
|
|
|
.cpu_funcs = @{scal_cpu_func, NULL@},
|
|
|
.nbuffers = 1,
|
|
|
@};
|
|
|
-@end example
|
|
|
+@end smallexample
|
|
|
+@end cartouche
|
|
|
|
|
|
Other examples include for instance calling a BLAS parallel CPU implementation
|
|
|
(see @code{examples/mult/xgemm.c}).
|
|
@@ -878,7 +829,8 @@ involved in the combined worker, and thus the number of calls that are made in
|
|
|
parallel to the function, and @code{starpu_combined_worker_get_rank()} to get
|
|
|
the rank of the current CPU within the combined worker. For instance:
|
|
|
|
|
|
-@example
|
|
|
+@cartouche
|
|
|
+@smallexample
|
|
|
static void func(void *buffers[], void *args)
|
|
|
@{
|
|
|
unsigned i;
|
|
@@ -905,7 +857,8 @@ static struct starpu_codelet cl =
|
|
|
.cpu_funcs = @{ func, NULL @},
|
|
|
.nbuffers = 1,
|
|
|
@}
|
|
|
-@end example
|
|
|
+@end smallexample
|
|
|
+@end cartouche
|
|
|
|
|
|
Of course, this trivial example will not really benefit from parallel task
|
|
|
execution, and was only meant to be simple to understand. The benefit comes
|