13 years ago · 516a21964d
--- a/doc/chapters/advanced-api.texi
+++ b/doc/chapters/advanced-api.texi
@@ -7,6 +7,7 @@
 
				 @c See the file starpu.texi for copying conditions.
			
 
				 
			
 
				 @menu
			
 
				+* Insert Task::
			
 
				 * MPI Interface::
			
 
				 * Defining a new data interface::
			
 
				 * Multiformat Data Interface::
			
@@ -19,6 +20,82 @@
 
				 * Expert mode::
			
 
				 @end menu
			
 
				 
			
 
				+@node Insert Task
			
 
				+@section Insert Task
			
 
				+
			
 
				+@deftypefun int starpu_insert_task (struct starpu_codelet *@var{cl}, ...)
			
 
				+Create and submit a task corresponding to @var{cl} with the following
			
 
				+arguments.  The argument list must be zero-terminated.
			
 
				+
			
 
				+The arguments following the codelets can be of the following types:
			
 
				+
			
 
				+@itemize
			
 
				+@item
			
 
				+@code{STARPU_R}, @code{STARPU_W}, @code{STARPU_RW}, @code{STARPU_SCRATCH}, @code{STARPU_REDUX} an access mode followed by a data handle;
			
 
				+@item
			
 
				+@code{STARPU_DATA_ARRAY} followed by an array of data handles and its number of elements;
			
 
				+@item
			
 
				+the specific values @code{STARPU_VALUE}, @code{STARPU_CALLBACK},
			
 
				+@code{STARPU_CALLBACK_ARG}, @code{STARPU_CALLBACK_WITH_ARG},
			
 
				+@code{STARPU_PRIORITY}, @code{STARPU_TAG}, followed by the appropriate objects
			
 
				+as defined below.
			
 
				+@end itemize
			
 
				+
			
 
				+When using @code{STARPU_DATA_ARRAY}, the access mode of the data
			
 
				+handles is not defined.
			
 
				+
			
 
				+Parameters to be passed to the codelet implementation are defined
			
 
				+through the type @code{STARPU_VALUE}. The function
			
 
				+@code{starpu_codelet_unpack_args} must be called within the codelet
			
 
				+implementation to retrieve them.
			
 
				+@end deftypefun
			
 
				+
			
 
				+@defmac STARPU_VALUE
			
 
				+this macro is used when calling @code{starpu_insert_task}, and must be
			
 
				+followed by a pointer to a constant value and the size of the constant.
			
 
				+@end defmac
			
 
				+
			
 
				+@defmac STARPU_CALLBACK
			
 
				+this macro is used when calling @code{starpu_insert_task}, and must be
			
 
				+followed by a pointer to a callback function
			
 
				+@end defmac
			
 
				+
			
 
				+@defmac STARPU_CALLBACK_ARG
			
 
				+this macro is used when calling @code{starpu_insert_task}, and must be
			
 
				+followed by a pointer to be given as an argument to the callback
			
 
				+function
			
 
				+@end defmac
			
 
				+
			
 
				+@defmac  STARPU_CALLBACK_WITH_ARG
			
 
				+this macro is used when calling @code{starpu_insert_task}, and must be
			
 
				+followed by two pointers: one to a callback function, and the other to
			
 
				+be given as an argument to the callback function; this is equivalent
			
 
				+to using both @code{STARPU_CALLBACK} and
			
 
				+@code{STARPU_CALLBACK_ARG}
			
 
				+@end defmac
			
 
				+
			
 
				+@defmac STARPU_PRIORITY
			
 
				+this macro is used when calling @code{starpu_insert_task}, and must be
			
 
				+followed by an integer defining a priority level
			
 
				+@end defmac
			
 
				+
			
 
				+@defmac STARPU_TAG
			
 
				+this macro is used when calling @code{starpu_insert_task}, and must be
			
 
				+followed by a tag.
			
 
				+@end defmac
			
 
				+
			
 
				+@deftypefun void starpu_codelet_pack_args ({char **}@var{arg_buffer}, {size_t *}@var{arg_buffer_size}, ...)
			
 
				+Pack arguments of type @code{STARPU_VALUE} into a buffer which can be
			
 
				+given to a codelet and later unpacked with the function
			
 
				+@code{starpu_codelet_unpack_args} defined below.
			
 
				+@end deftypefun
			
 
				+
			
 
				+@deftypefun void starpu_codelet_unpack_args ({void *}@var{cl_arg}, ...)
			
 
				+Retrieve the arguments of type @code{STARPU_VALUE} associated with a
			
 
				+task automatically created using the function
			
 
				+@code{starpu_insert_task} defined above.
			
 
				+@end deftypefun
			
 
				+
			
 
				 @node MPI Interface
			
 
				 @section MPI Interface
			
 
				 
			
--- a/doc/chapters/advanced-examples.texi
+++ b/doc/chapters/advanced-examples.texi
@@ -2,7 +2,7 @@
 
				 
			
 
				 @c This file is part of the StarPU Handbook.
			
 
				 @c Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
			
 
				-@c Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
			
 
				+@c Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
			
 
				 @c Copyright (C) 2011 Institut National de Recherche en Informatique et Automatique
			
 
				 @c See the file starpu.texi for copying conditions.
			
 
				 
			
@@ -201,7 +201,7 @@ for (worker = 0; worker < starpu_worker_get_count(); worker++)
 
				 
			
 
				         float executing_ratio = 100.0*executing_time/total_time;
			
 
				         float sleeping_ratio = 100.0*sleeping_time/total_time;
			
 
				-	float overhead_ratio = 100.0 - executing_ratio - sleeping_ratio;
			
 
				+        float overhead_ratio = 100.0 - executing_ratio - sleeping_ratio;
			
 
				 
			
 
				         char workername[128];
			
 
				         starpu_worker_get_name(worker, workername, 128);
			
@@ -492,83 +492,12 @@ transfers, which are assumed to be completely overlapped.
 
				 @section Insert Task Utility
			
 
				 
			
 
				 StarPU provides the wrapper function @code{starpu_insert_task} to ease
			
 
				-the creation and submission of tasks.
			
 
				-
			
 
				-@deftypefun int starpu_insert_task (struct starpu_codelet *@var{cl}, ...)
			
 
				-Create and submit a task corresponding to @var{cl} with the following
			
 
				-arguments.  The argument list must be zero-terminated.
			
 
				-
			
 
				-The arguments following the codelets can be of the following types:
			
 
				-
			
 
				-@itemize
			
 
				-@item
			
 
				-@code{STARPU_R}, @code{STARPU_W}, @code{STARPU_RW}, @code{STARPU_SCRATCH}, @code{STARPU_REDUX} an access mode followed by a data handle;
			
 
				-@item
			
 
				-@code{STARPU_DATA_ARRAY} followed by an array of data handles and its number of elements;
			
 
				-@item
			
 
				-the specific values @code{STARPU_VALUE}, @code{STARPU_CALLBACK},
			
 
				-@code{STARPU_CALLBACK_ARG}, @code{STARPU_CALLBACK_WITH_ARG},
			
 
				-@code{STARPU_PRIORITY}, @code{STARPU_TAG}, followed by the appropriated objects
			
 
				-as defined below.
			
 
				-@end itemize
			
 
				-
			
 
				-When using @code{STARPU_DATA_ARRAY}, the access mode of the data
			
 
				-handles is not defined.
			
 
				-
			
 
				-Parameters to be passed to the codelet implementation are defined
			
 
				-through the type @code{STARPU_VALUE}. The function
			
 
				-@code{starpu_codelet_unpack_args} must be called within the codelet
			
 
				-implementation to retrieve them.
			
 
				-@end deftypefun
			
 
				-
			
 
				-@defmac STARPU_VALUE
			
 
				-this macro is used when calling @code{starpu_insert_task}, and must be
			
 
				-followed by a pointer to a constant value and the size of the constant
			
 
				-@end defmac
			
 
				-
			
 
				-@defmac STARPU_CALLBACK
			
 
				-this macro is used when calling @code{starpu_insert_task}, and must be
			
 
				-followed by a pointer to a callback function
			
 
				-@end defmac
			
 
				-
			
 
				-@defmac STARPU_CALLBACK_ARG
			
 
				-this macro is used when calling @code{starpu_insert_task}, and must be
			
 
				-followed by a pointer to be given as an argument to the callback
			
 
				-function
			
 
				-@end defmac
			
 
				-
			
 
				-@defmac  STARPU_CALLBACK_WITH_ARG
			
 
				-this macro is used when calling @code{starpu_insert_task}, and must be
			
 
				-followed by two pointers: one to a callback function, and the other to
			
 
				-be given as an argument to the callback function; this is equivalent
			
 
				-to using both @code{STARPU_CALLBACK} and
			
 
				-@code{STARPU_CALLBACK_WITH_ARG}
			
 
				-@end defmac
			
 
				-
			
 
				-@defmac STARPU_PRIORITY
			
 
				-this macro is used when calling @code{starpu_insert_task}, and must be
			
 
				-followed by a integer defining a priority level
			
 
				-@end defmac
			
 
				-
			
 
				-@defmac STARPU_TAG
			
 
				-this macro is used when calling @code{starpu_insert_task}, and must be
			
 
				-followed by a tag.
			
 
				-@end defmac
			
 
				-
			
 
				-@deftypefun void starpu_codelet_pack_args ({char **}@var{arg_buffer}, {size_t *}@var{arg_buffer_size}, ...)
			
 
				-Pack arguments of type @code{STARPU_VALUE} into a buffer which can be
			
 
				-given to a codelet and later unpacked with the function
			
 
				-@code{starpu_codelet_unpack_args} defined below.
			
 
				-@end deftypefun
			
 
				-
			
 
				-@deftypefun void starpu_codelet_unpack_args ({void *}@var{cl_arg}, ...)
			
 
				-Retrieve the arguments of type @code{STARPU_VALUE} associated to a
			
 
				-task automatically created using the function
			
 
				-@code{starpu_insert_task} defined above.
			
 
				-@end deftypefun
			
 
				+the creation and submission of tasks. See the definition of the
			
 
				+functions in @ref{Insert Task}.
			
 
				 
			
 
				 Here the implementation of the codelet:
			
 
				 
			
 
				+@cartouche
			
 
				 @smallexample
			
 
				 void func_cpu(void *descr[], void *_args)
			
 
				 @{
			
@@ -589,9 +518,11 @@ struct starpu_codelet mycodelet = @{
 
				         .modes = @{ STARPU_RW, STARPU_RW @}
			
 
				 @};
			
 
				 @end smallexample
			
 
				+@end cartouche
			
 
				 
			
 
				 And the call to the @code{starpu_insert_task} wrapper:
			
 
				 
			
 
				+@cartouche
			
 
				 @smallexample
			
 
				 starpu_insert_task(&mycodelet,
			
 
				                    STARPU_VALUE, &ifactor, sizeof(ifactor),
			
@@ -599,10 +530,12 @@ starpu_insert_task(&mycodelet,
 
				                    STARPU_RW, data_handles[0], STARPU_RW, data_handles[1],
			
 
				                    0);
			
 
				 @end smallexample
			
 
				+@end cartouche
			
 
				 
			
 
				 The call to @code{starpu_insert_task} is equivalent to the following
			
 
				 code:
			
 
				 
			
 
				+@cartouche
			
 
				 @smallexample
			
 
				 struct starpu_task *task = starpu_task_create();
			
 
				 task->cl = &mycodelet;
			
@@ -618,9 +551,11 @@ task->cl_arg = arg_buffer;
 
				 task->cl_arg_size = arg_buffer_size;
			
 
				 int ret = starpu_task_submit(task);
			
 
				 @end smallexample
			
 
				+@end cartouche
			
 
				 
			
 
				 Here a similar call using @code{STARPU_DATA_ARRAY}.
			
 
				 
			
 
				+@cartouche
			
 
				 @smallexample
			
 
				 starpu_insert_task(&mycodelet,
			
 
				                    STARPU_DATA_ARRAY, data_handles, 2,
			
@@ -628,12 +563,14 @@ starpu_insert_task(&mycodelet,
 
				                    STARPU_VALUE, &ffactor, sizeof(ffactor),
			
 
				                    0);
			
 
				 @end smallexample
			
 
				+@end cartouche
			
 
				 
			
 
				 If some part of the task insertion depends on the value of some computation,
			
 
				 the @code{STARPU_DATA_ACQUIRE_CB} macro can be very convenient. For
			
 
				 instance, assuming that the index variable @code{i} was registered as handle
			
 
				 @code{i_handle}:
			
 
				 
			
 
				+@cartouche
			
 
				 @smallexample
			
 
				 /* Compute which portion we will work on, e.g. pivot */
			
 
				 starpu_insert_task(&which_index, STARPU_W, i_handle, 0);
			
@@ -642,6 +579,7 @@ starpu_insert_task(&which_index, STARPU_W, i_handle, 0);
 
				 STARPU_DATA_ACQUIRE_CB(i_handle, STARPU_R,
			
 
				                        starpu_insert_task(&work, STARPU_RW, A_handle[i], 0));
			
 
				 @end smallexample
			
 
				+@end cartouche
			
 
				 
			
 
				 The @code{STARPU_DATA_ACQUIRE_CB} macro submits an asynchronous request for
			
 
				 acquiring data @code{i} for the main application, and will execute the code
			
@@ -674,6 +612,7 @@ buffers, and how to assemble partial results.
 
				 For instance, @code{cg} uses that to optimize its dot product: it first defines
			
 
				 the codelets for initialization and reduction:
			
 
				 
			
 
				+@cartouche
			
 
				 @smallexample
			
 
				 struct starpu_codelet bzero_variable_cl =
			
 
				 @{
			
@@ -704,17 +643,21 @@ struct starpu_codelet accumulate_variable_cl =
 
				         .nbuffers = 1,
			
 
				 @}
			
 
				 @end smallexample
			
 
				+@end cartouche
			
 
				 
			
 
				 and attaches them as reduction methods for its dtq handle:
			
 
				 
			
 
				+@cartouche
			
 
				 @smallexample
			
 
				 starpu_data_set_reduction_methods(dtq_handle,
			
 
				         &accumulate_variable_cl, &bzero_variable_cl);
			
 
				 @end smallexample
			
 
				+@end cartouche
			
 
				 
			
 
				-and dtq_handle can now be used in @code{STARPU_REDUX} mode for the dot products
			
 
				+and @code{dtq_handle} can now be used in @code{STARPU_REDUX} mode for the dot products
			
 
				 with partitioned vectors:
			
 
				 
			
 
				+@cartouche
			
 
				 @smallexample
			
 
				 int dots(starpu_data_handle_t v1, starpu_data_handle_t v2,
			
 
				          starpu_data_handle_t s, unsigned nblocks)
			
@@ -728,6 +671,7 @@ int dots(starpu_data_handle_t v1, starpu_data_handle_t v2,
 
				             0);
			
 
				 @}
			
 
				 @end smallexample
			
 
				+@end cartouche
			
 
				 
			
 
				 The @code{cg} example also uses reduction for the blocked gemv kernel, leading
			
 
				 to yet more relaxed dependencies and more parallelism.
			
@@ -741,16 +685,17 @@ data. For instance, some hypothetical application which collects partial results
 
				 into data @code{res}, then uses it for other computation, before looping again
			
 
				 with a new reduction:
			
 
				 
			
 
				+@cartouche
			
 
				 @smallexample
			
 
				-@{
			
 
				-    for (i = 0; i < 100; i++) @{
			
 
				-        starpu_mpi_insert_task(MPI_COMM_WORLD, &init_res, STARPU_W, res, 0);
			
 
				-        starpu_mpi_insert_task(MPI_COMM_WORLD, &work, STARPU_RW, A, STARPU_R, B, STARPU_REDUX, res, 0);
			
 
				-        starpu_mpi_redux_data(MPI_COMM_WORLD, res);
			
 
				-        starpu_mpi_insert_task(MPI_COMM_WORLD, &work2, STARPU_RW, B, STARPU_R, res, 0);
			
 
				-    @}
			
 
				+for (i = 0; i < 100; i++) @{
			
 
				+    starpu_mpi_insert_task(MPI_COMM_WORLD, &init_res, STARPU_W, res, 0);
			
 
				+    starpu_mpi_insert_task(MPI_COMM_WORLD, &work, STARPU_RW, A,
			
 
				+               STARPU_R, B, STARPU_REDUX, res, 0);
			
 
				+    starpu_mpi_redux_data(MPI_COMM_WORLD, res);
			
 
				+    starpu_mpi_insert_task(MPI_COMM_WORLD, &work2, STARPU_RW, B, STARPU_R, res, 0);
			
 
				 @}
			
 
				 @end smallexample
			
 
				+@end cartouche
			
 
				 
			
 
				 @node Temporary buffers
			
 
				 @section Temporary buffers
			
@@ -778,6 +723,7 @@ The following code examplifies both points: it registers the temporary
 
				 data, submits three tasks accessing it, and records the data for automatic
			
 
				 unregistration.
			
 
				 
			
 
				+@cartouche
			
 
				 @smallexample
			
 
				 starpu_vector_data_register(&handle, -1, 0, n, sizeof(float));
			
 
				 starpu_insert_task(&produce_data, STARPU_W, handle, 0);
			
@@ -785,6 +731,7 @@ starpu_insert_task(&compute_data, STARPU_RW, handle, 0);
 
				 starpu_insert_task(&summarize_data, STARPU_R, handle, STARPU_W, result_handle, 0);
			
 
				 starpu_data_unregister_submit(handle);
			
 
				 @end smallexample
			
 
				+@end cartouche
			
 
				 
			
 
				 @subsection Scratch data
			
 
				 
			
@@ -796,12 +743,14 @@ initialization}), but that would make them systematic and permanent. A more
 
				 optimized way is to use the SCRATCH data access mode, as examplified below,
			
 
				 which provides per-worker buffers without content consistency.
			
 
				 
			
 
				+@cartouche
			
 
				 @smallexample
			
 
				 starpu_vector_data_register(&workspace, -1, 0, sizeof(float));
			
 
				 for (i = 0; i < N; i++)
			
 
				     starpu_insert_task(&compute, STARPU_R, input[i],
			
 
				                        STARPU_SCRATCH, workspace, STARPU_W, output[i], 0);
			
 
				 @end smallexample
			
 
				+@end cartouche
			
 
				 
			
 
				 StarPU will make sure that the buffer is allocated before executing the task,
			
 
				 and make this allocation per-worker: for CPU workers, notably, each worker has
			
@@ -841,7 +790,8 @@ the CPU binding mask that StarPU chose.
 
				 For instance, using OpenMP (full source is available in
			
 
				 @code{examples/openmp/vector_scal.c}):
			
 
				 
			
 
				-@example
			
 
				+@cartouche
			
 
				+@smallexample
			
 
				 void scal_cpu_func(void *buffers[], void *_args)
			
 
				 @{
			
 
				     unsigned i;
			
@@ -864,7 +814,8 @@ static struct starpu_codelet cl =
 
				     .cpu_funcs = @{scal_cpu_func, NULL@},
			
 
				     .nbuffers = 1,
			
 
				 @};
			
 
				-@end example
			
 
				+@end smallexample
			
 
				+@end cartouche
			
 
				 
			
 
				 Other examples include for instance calling a BLAS parallel CPU implementation
			
 
				 (see @code{examples/mult/xgemm.c}).
			
@@ -878,7 +829,8 @@ involved in the combined worker, and thus the number of calls that are made in
 
				 parallel to the function, and @code{starpu_combined_worker_get_rank()} to get
			
 
				 the rank of the current CPU within the combined worker. For instance:
			
 
				 
			
 
				-@example
			
 
				+@cartouche
			
 
				+@smallexample
			
 
				 static void func(void *buffers[], void *args)
			
 
				 @{
			
 
				     unsigned i;
			
@@ -905,7 +857,8 @@ static struct starpu_codelet cl =
 
				     .cpu_funcs = @{ func, NULL @},
			
 
				     .nbuffers = 1,
			
 
				 @}
			
 
				-@end example
			
 
				+@end smallexample
			
 
				+@end cartouche
			
 
				 
			
 
				 Of course, this trivial example will not really benefit from parallel task
			
 
				 execution, and was only meant to be simple to understand.  The benefit comes
			
--- a/doc/chapters/basic-examples.texi
+++ b/doc/chapters/basic-examples.texi
@@ -244,7 +244,11 @@ callback function is always executed on a CPU. The @code{callback_arg}
 
				 pointer is passed as an argument of the callback. The prototype of a callback
			
 
				 function must be:
			
 
				 
			
 
				-@code{void (*callback_function)(void *);}
			
 
				+@cartouche
			
 
				+@example
			
 
				+void (*callback_function)(void *);
			
 
				+@end example
			
 
				+@end cartouche
			
 
				 
			
 
				 If the @code{synchronous} field is non-zero, task submission will be
			
 
				 synchronous: the @code{starpu_task_submit} function will not return until the