
a lot of documentation fixes and additions here and there

Samuel Thibault 14 years ago
parent
commit
d87e58afc7
5 changed files with 245 additions and 130 deletions
  1. 1 0
      configure.ac
  2. 228 120
      doc/starpu.texi
  3. 4 3
      doc/vector_scal_c.texi
  4. 5 3
      doc/vector_scal_cuda.texi
  5. 7 4
      doc/vector_scal_opencl.texi

+ 1 - 0
configure.ac

@@ -147,6 +147,7 @@ AC_ARG_ENABLE(nmaxcpus, [AS_HELP_STRING([--enable-nmaxcpus=<number>],
 			[maximum number of CPUs])],
 			nmaxcpus=$enableval, nmaxcpus=16)
 AC_MSG_RESULT($nmaxcpus)
+# TODO: rename to STARPU_MAXCPUS for coherency with CUDA/OpenCL?
 AC_DEFINE_UNQUOTED(STARPU_NMAXCPUS, [$nmaxcpus], [Maximum number of CPUs supported])
 
 AC_MSG_CHECKING(whether CPUs should be used)

+ 228 - 120
doc/starpu.texi

@@ -35,6 +35,7 @@ This manual documents the usage of StarPU.
 * Installing StarPU::           How to configure, build and install StarPU
 * Using StarPU::                How to run StarPU application
 * Basic Examples::              Basic examples of the use of StarPU
+* Performance options::         Performance options worth knowing
 * Performance feedback::        Performance debugging tools
 * Configuring StarPU::          How to configure StarPU
 * StarPU API::                  The API to use StarPU
@@ -70,7 +71,8 @@ StarPU is a runtime system that offers support for heterogeneous multicore
 architectures, it not only offers a unified view of the computational resources
 (i.e. CPUs and accelerators at the same time), but it also takes care of
 efficiently mapping and executing tasks onto an heterogeneous machine while
-transparently handling low-level issues in a portable fashion.
+transparently handling low-level issues such as data transfers in a portable
+fashion.
 
 @c this leads to a complicated distributed memory design
 @c which is not (easily) manageable by hand
@@ -100,7 +102,7 @@ fashion.
 @node Codelet and Tasks
 @subsection Codelet and Tasks
 
-One of StarPU primary data structure is the @b{codelet}. A codelet describes a
+One of the StarPU primary data structures is the @b{codelet}. A codelet describes a
 computational kernel that can possibly be implemented on multiple architectures
 such as a CPU, a CUDA device or a Cell's SPU.
 
@@ -109,7 +111,7 @@ such as a CPU, a CUDA device or a Cell's SPU.
 Another important data structure is the @b{task}. Executing a StarPU task
 consists in applying a codelet on a data set, on one of the architectures on
 which the codelet is implemented. In addition to the codelet that a task
-implements, it also describes which data are accessed, and how they are
+uses, it also describes which data are accessed, and how they are
 accessed during the computation (read and/or write).
 StarPU tasks are asynchronous: submitting a task to StarPU is a non-blocking
 operation. The task structure can also specify a @b{callback} function that is
@@ -117,9 +119,12 @@ called once StarPU has properly executed the task. It also contains optional
 fields that the application may use to give hints to the scheduler (such as
 priority levels).
 
-A task may be identified by a unique 64-bit number which we refer as a @b{tag}.
-Task dependencies can be enforced either by the means of callback functions, or
-by expressing dependencies between tags.
+A task may be identified by a unique 64-bit number chosen by the application
+which we refer to as a @b{tag}.
+Task dependencies can be enforced either by means of callback functions, by
+expressing dependencies between explicit tasks, or by expressing dependencies
+between tags (which can thus correspond to tasks that have not been submitted
+yet).
 
 @c TODO insert illustration f(Ar, Brw, Cr) + ..
 
@@ -174,6 +179,14 @@ can be used to install StarPU.
 @node Getting Sources
 @subsection Getting Sources
 
+The simplest way to get StarPU sources is to download the latest official
+release tarball from @indicateurl{https://gforge.inria.fr/frs/?group_id=1570} ,
+or the latest nightly snapshot from
+@indicateurl{http://starpu.gforge.inria.fr/testing/} . The following documents
+how to get the very latest version from the Subversion repository itself; this
+should be needed only if you want the very latest changes (i.e. less than a
+day old!).
+
 The source code is managed by a Subversion server hosted by the
 InriaGforge. To get the source code, you need:
 
@@ -181,16 +194,18 @@ InriaGforge. To get the source code, you need:
 @item
 To install the client side of the software Subversion if it is
 not already available on your system. The software can be obtained from
-@indicateurl{http://subversion.tigris.org}.
+@indicateurl{http://subversion.tigris.org} . If you are running
+on Windows, you will probably prefer to use TortoiseSVN from
+@indicateurl{http://tortoisesvn.tigris.org/} .
 
 @item
 You can check out the project's SVN repository through anonymous
 access. This will provide you with a read access to the
 repository.
 
-You can also choose to become a member of the project @code{starpu}.
-For this, you first need to get an account to the gForge server. You
-can then send a request to join the project
+If you need write access to the StarPU project, you can also choose to
+become a member of the project @code{starpu}. For this, you first need to get
+an account on the gForge server. You can then send a request to join the project
 (@indicateurl{https://gforge.inria.fr/project/request.php?group_id=1570}).
 
 @item
@@ -225,7 +240,7 @@ using your gForge account
 These steps require to run autoconf and automake to generate the
 @code{./configure} script. This can be done by calling
 @code{./autogen.sh}. The required version for autoconf is 2.60 or
-higher.
+higher. You will also need makeinfo.
 
 @example
 % ./autogen.sh
@@ -377,9 +392,9 @@ installed. This step is done only once per user and per machine.
 @section Using accelerators
 
 When both CUDA and OpenCL drivers are enabled, StarPU will launch an
-OpenCL worker only if CUDA is not already running on the GPU.
+OpenCL worker for NVIDIA GPUs only if CUDA is not already running on them.
 This design choice was necessary as OpenCL and CUDA can not run at the
-same time on the same GPU, as there is currently no interoperability
+same time on the same NVIDIA GPU, as there is currently no interoperability
 between them.
 
 Details on how to specify devices running OpenCL and the ones running
@@ -487,7 +502,7 @@ manipulated by the codelet: here the codelet does not access or modify any data
 that is controlled by our data management library. Note that the argument
 passed to the codelet (the @code{cl_arg} field of the @code{starpu_task}
 structure) does not count as a buffer since it is not managed by our data
-management library.
+management library, but just contains trivial parameters.
 
 @c TODO need a crossref to the proper description of "where" see bla for more ...
 We create a codelet which may only be executed on the CPUs. The @code{where}
@@ -501,7 +516,8 @@ which @emph{must} have the following prototype:
 
 In this example, we can ignore the first argument of this function which gives a
 description of the input and output buffers (e.g. the size and the location of
-the matrices). The second argument is a pointer to a buffer passed as an
+the matrices) since there is none.
+The second argument is a pointer to a buffer passed as an
 argument to the codelet by the means of the @code{cl_arg} field of the
 @code{starpu_task} structure.
 
@@ -510,7 +526,8 @@ Be aware that this may be a pointer to a
 @emph{copy} of the actual buffer, and not the pointer given by the programmer:
 if the codelet modifies this buffer, there is no guarantee that the initial
 buffer will be modified as well: this for instance implies that the buffer
-cannot be used as a synchronization medium.
+cannot be used as a synchronization medium. If synchronization is needed, data
+has to be registered to StarPU, see @ref{Scaling a Vector}.
 
 @node Submitting a Task
 @subsection Submitting a Task
@@ -573,9 +590,10 @@ The optional @code{cl_arg} field is a pointer to a buffer (of size
 @code{cl_arg_size}) with some parameters for the kernel
 described by the codelet. For instance, if a codelet implements a computational
 kernel that multiplies its input vector by a constant, the constant could be
-specified by the means of this buffer, instead of registering it.
+specified by the means of this buffer, instead of registering it as a StarPU
+data.
 
-Once a task has been executed, an optional callback function can be called.
+Once a task has been executed, an optional callback function is called.
 While the computational kernel could be offloaded on various architectures, the
 callback function is always executed on a CPU. The @code{callback_arg}
 pointer is passed as an argument of the callback. The prototype of a callback
@@ -583,10 +601,11 @@ function must be:
 
 @code{void (*callback_function)(void *);}
 
-If the @code{synchronous} field is non-null, task submission will be
+If the @code{synchronous} field is non-zero, task submission will be
 synchronous: the @code{starpu_task_submit} function will not return until the
 task was executed. Note that the @code{starpu_shutdown} method does not
-guarantee that asynchronous tasks have been executed before it returns,
+guarantee that asynchronous tasks have been executed before it returns;
+@code{starpu_task_wait_for_all} can be used to that effect.
 
 @node Execution of Hello World
 @subsection Execution of Hello World
@@ -595,7 +614,7 @@ guarantee that asynchronous tasks have been executed before it returns.
 % make hello_world
 cc $(pkg-config --cflags libstarpu)  $(pkg-config --libs libstarpu) hello_world.c -o hello_world
 % ./hello_world
-Hello world (array = @{1.000000, -1.000000@} )
+Hello world (params = @{1, 2.000000@} )
 Callback function (arg 42)
 @end smallexample
 
@@ -625,7 +644,7 @@ Before submitting those tasks, the programmer first needs to declare the
 different pieces of data to StarPU using the @code{starpu_*_data_register}
 functions. To ease the development of applications for StarPU, it is possible
 to describe multiple types of data layout. A type of data layout is called an
-@b{interface}. By default, there are different interfaces available in StarPU:
+@b{interface}. There are different predefined interfaces available in StarPU:
 here we will consider the @b{vector interface}.
 
 The following lines show how to declare an array of @code{NX} elements of type
@@ -644,10 +663,10 @@ starpu_vector_data_register(&vector_handle, 0, (uintptr_t)vector, NX,
 The first argument, called the @b{data handle}, is an opaque pointer which
 designates the array in StarPU. This is also the structure which is used to
 describe which data is used by a task. The second argument is the node number
-where the data currently resides. Here it is 0 since the @code{vector} array is in
-the main memory. Then comes the pointer @code{vector} where the data can be found,
+where the data originally resides. Here it is 0 since the @code{vector} array is in
+the main memory. Then comes the pointer @code{vector} where the data can be found in main memory,
 the number of elements in the vector and the size of each element.
-It is possible to construct a StarPU task that will manipulate the
+The following shows how to construct a StarPU task that will manipulate the
 vector and a constant factor.
 
 @cartouche
@@ -666,7 +685,8 @@ starpu_task_submit(task);
 @end smallexample
 @end cartouche
 
-Since the factor is a mere float value parameter, it does not need a preliminary registration, and
+Since the factor is a mere constant float value parameter,
+it does not need a preliminary registration, and
 can just be passed through the @code{cl_arg} pointer like in the previous
 example.  The vector parameter is described by its handle.
 There are two fields in each element of the @code{buffers} array.
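For context, the computation these tasks ultimately perform is just an in-place scale of the vector. A self-contained plain-C sketch (the function name vector_scal_cpu is ours, for illustration; the actual codelet kernels are shown later in the manual):

```c
#include <assert.h>

/* Illustrative plain-C equivalent of the vector-scaling kernel discussed
 * here: multiply each of the n elements of val by factor, in place. */
static void vector_scal_cpu(float *val, unsigned n, float factor)
{
    unsigned i;
    for (i = 0; i < n; i++)
        val[i] *= factor;
}
```

In the StarPU versions, val and n come from the vector-interface accessors and factor arrives through the cl_arg pointer.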
@@ -703,7 +723,7 @@ starpu_codelet cl = @{
 The first argument is an array that gives
 a description of all the buffers passed in the @code{task->buffers}@ array. The
 size of this array is given by the @code{nbuffers} field of the codelet
-structure. For the sake of generality, this array contains pointers to the
+structure. For the sake of genericity, this array contains pointers to the
 different interfaces describing each buffer.  In the case of the @b{vector
 interface}, the location of the vector (resp. its length) is accessible in the
 @code{ptr} (resp. @code{nx}) of this array. Since the vector is accessed in a
@@ -752,7 +772,7 @@ static __global__ void vector_mult_cuda(float *val, unsigned n,
                                         float factor)
 @{
-    unsigned i;
-    for(i = 0 ; i < n ; i++)
+    unsigned i = blockIdx.x*blockDim.x + threadIdx.x;
+    if (i < n)
         val[i] *= factor;
 @}
 
@@ -764,8 +784,10 @@ extern "C" void scal_cuda_func(void *buffers[], void *_args)
     unsigned n = STARPU_VECTOR_GET_NX(buffers[0]);
     /* local copy of the vector pointer */
     float *val = (float *)STARPU_VECTOR_GET_PTR(buffers[0]);
+    unsigned threads_per_block = 64;
+    unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
 
-@i{    vector_mult_cuda<<<1,1>>>(val, n, *factor);}
+@i{    vector_mult_cuda<<<nblocks,threads_per_block>>>(val, n, *factor);}
 
 @i{    cudaThreadSynchronize();}
 @}
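The added launch configuration replaces the single-thread <<<1,1>>> launch with a grid sized by ceiling division, so that nblocks * threads_per_block always covers the n elements. That arithmetic in isolation (helper name ours, for illustration):

```c
#include <assert.h>

/* Smallest number of blocks of threads_per_block threads that covers
 * n elements: the ceiling division used in the hunk above. */
static unsigned nblocks_for(unsigned n, unsigned threads_per_block)
{
    return (n + threads_per_block - 1) / threads_per_block;
}
```

With 64 threads per block, 64 elements need one block and 65 elements need two; the per-thread `if (i < n)` guard in the kernel discards the excess threads of the last block.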
@@ -882,8 +904,8 @@ int main(int argc, char **argv)
     starpu_init(NULL);                            /* @b{Initialising StarPU} */
 
 #ifdef STARPU_USE_OPENCL
-    starpu_opencl_load_opencl_from_file("examples/basic_examples/vector_scal_opencl_codelet.cl",
-                                        &programs);
+    starpu_opencl_load_opencl_from_file(
+            "examples/basic_examples/vector_scal_opencl_codelet.cl", &programs);
 #endif
 
     vector = malloc(NX*sizeof(vector[0]));
@@ -917,6 +939,7 @@ int main(int argc, char **argv)
             return 1;
     @}
 
+@c TODO: Mmm, should rather be an unregistration with an implicit dependency, no?
     /* @b{Waiting for its termination} */
     starpu_task_wait_for_all();
 
@@ -985,7 +1008,8 @@ or for example, by disabling CPU devices:
 0.000000 3.000000 6.000000 9.000000 12.000000
 @end smallexample
 
-or by disabling CUDA devices:
+or by disabling CUDA devices (which may allow OpenCL to be used instead,
+see @ref{Using accelerators}):
 
 @smallexample
 % STARPU_NCUDA=0 ./vector_scal
@@ -1007,6 +1031,7 @@ task->synchronous = 1;
  * query the profiling info before the task is destroyed. */
 task->destroy = 0;
 
+/* Submit and wait for completion (since synchronous was set to 1) */
 starpu_task_submit(task);
 
 /* The task is finished, get profiling information */
@@ -1081,6 +1106,7 @@ starpu_data_partition(handle, &f);
 @smallexample
 /* Submit a task on each sub-vector */
 for (i=0; i<starpu_data_get_nb_children(handle); i++) @{
+    /* Get subdata number i (there is only 1 dimension) */
     starpu_data_handle sub_handle = starpu_data_get_sub_data(handle, 1, i);
     struct starpu_task *task = starpu_task_create();
 
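When the vector is partitioned into nparts children as above, each sub-vector covers a near-equal share of the elements. A sketch of one plausible sizing rule (our illustration only; StarPU's actual block filter may round differently):

```c
#include <assert.h>

/* Illustrative size of child i when splitting nx elements into nparts
 * near-equal chunks: the first nx % nparts children get one extra
 * element, so the sizes differ by at most one. Not StarPU filter code. */
static unsigned child_nx(unsigned nx, unsigned nparts, unsigned i)
{
    return nx / nparts + (i < nx % nparts ? 1u : 0u);
}
```

For example, 10 elements over 3 parts gives sub-vectors of 4, 3 and 3 elements, which sum back to the original length.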
@@ -1110,7 +1136,7 @@ performance model. There are several kinds of performance models.
 @item
 Providing an estimation from the application itself (@code{STARPU_COMMON} model type and @code{cost_model} field),
 see for instance
-@code{examples/common/blas_model.c} and @code{examples/common/blas_model.h}. It can also be provided for each architecture (@code{STARPU_PER_ARCH} model type and @code{per_arch} field)
+@code{examples/common/blas_model.h} and @code{examples/common/blas_model.c}. It can also be provided for each architecture (@code{STARPU_PER_ARCH} model type and @code{per_arch} field)
 @item
 Measured at runtime (STARPU_HISTORY_BASED model type). This assumes that for a
 given set of data input/output sizes, the performance will always be about the
@@ -1184,8 +1210,8 @@ the priorities as the StarPU scheduler would, i.e. schedule prioritized
 tasks before less prioritized tasks, to check to which extend this results
 to a less optimal solution. This increases even more computation time.
 
-Note that all this however doesn't take into account data transfer, which is
-assumed to be completely overlapped.
+Note that for simplicity, all this however doesn't take into account data
+transfers, which are assumed to be completely overlapped.
 
 @node More examples
 @section More examples
@@ -1222,6 +1248,18 @@ More advanced examples include:
 @c Performance feedback
 @c ---------------------------------------------------------------------
 
+@node Performance options
+@chapter Performance options worth knowing
+
+TODO: explain why execution should be tried with
+@code{STARPU_PREFETCH=1 STARPU_SCHED=dmda}, when to use
+@code{STARPU_CALIBRATE=2} to force re-calibration, and how to play with
+@code{STARPU_BETA=2} or more.
+
+@c ---------------------------------------------------------------------
+@c Performance feedback
+@c ---------------------------------------------------------------------
+
 @node Performance feedback
 @chapter Performance feedback
 
@@ -1381,9 +1419,9 @@ generate a trace in the Paje format by calling:
 @end example
 
 This will create a @code{paje.trace} file in the current directory that can be
-inspected with the Vite trace visualizing open-source tool. More information
-about Vite is available at @indicateurl{http://vite.gforge.inria.fr/}. It is
-possible to open the @code{paje.trace} file with Vite by using the following
+inspected with the ViTE trace visualizing open-source tool. More information
+about ViTE is available at @indicateurl{http://vite.gforge.inria.fr/}. It is
+possible to open the @code{paje.trace} file with ViTE by using the following
 command:
 @example
 % vite paje.trace
@@ -1497,7 +1535,7 @@ Augment the verbosity of the debugging messages.
 @subsubsection @code{--enable-coverage}
 @table @asis
 @item @emph{Description}:
-Enable flags for the coverage tool.
+Enable flags for the @code{gcov} coverage tool.
 @end table
 
 @node Configuring workers
@@ -1563,14 +1601,18 @@ Specify the directory where CUDA is installed. This directory should notably con
 @table @asis
 @item @emph{Description}:
 Specify the directory where CUDA headers are installed. This directory should
-notably contain @code{cuda.h}.
+notably contain @code{cuda.h}. This defaults to @code{/include} appended to the
+value given to @code{--with-cuda-dir}.
 @end table
 
 @node --with-cuda-lib-dir
 @subsubsection @code{--with-cuda-lib-dir=<path>}
 @table @asis
 @item @emph{Description}:
-Specify the directory where the CUDA library is installed.
+Specify the directory where the CUDA library is installed. This directory should
+notably contain the CUDA shared libraries (e.g. libcuda.so). This defaults to
+@code{/lib} appended to the value given to @code{--with-cuda-dir}.
+
 @end table
 
 @node --enable-maxopencldev
@@ -1601,15 +1643,18 @@ Specify the location of the OpenCL SDK. This directory should notably contain
 @table @asis
 @item @emph{Description}:
 Specify the location of OpenCL headers. This directory should notably contain
-@code{CL/cl.h}.
+@code{CL/cl.h}. This defaults to
+@code{/include} appended to the value given to @code{--with-opencl-dir}.
+
 @end table
 
 @node --with-opencl-lib-dir
 @subsubsection @code{--with-opencl-lib-dir=<path>}
 @table @asis
 @item @emph{Description}:
-Specify the location of the OpenCL library.
-@code{include/CL/cl.h}.
+Specify the location of the OpenCL library. This directory should notably
+contain the OpenCL shared libraries (e.g. libOpenCL.so). This defaults to
+@code{/lib} appended to the value given to @code{--with-opencl-dir}.
 @end table
 
 @node --enable-gordon
@@ -1705,7 +1750,8 @@ library has to be 'atlas' or 'goto'.
 @subsubsection @code{--with-magma=<path>}
 @table @asis
 @item @emph{Description}:
-Specify where magma is installed.
+Specify where magma is installed. This directory should notably contain
+@code{include/magmablas.h}.
 @end table
 
 @node --with-fxt
@@ -1843,9 +1889,9 @@ Specify the number of SPUs that StarPU can use.
 @item @emph{Description}:
 Passing an array of integers (starting from 0) in @code{STARPU_WORKERS_CPUID}
 specifies on which logical CPU the different workers should be
-bound. For instance, if @code{STARPU_WORKERS_CPUID = "1 3 0 2"}, the first
-worker will be bound to logical CPU #1, the second CPU worker will be bound to
-logical CPU #3 and so on.  Note that the logical ordering of the CPUs is either
+bound. For instance, if @code{STARPU_WORKERS_CPUID = "0 1 4 5"}, the first
+worker will be bound to logical CPU #0, the second CPU worker will be bound to
+logical CPU #1 and so on.  Note that the logical ordering of the CPUs is either
 determined by the OS, or provided by the @code{hwloc} library in case it is
 available.
 
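A sketch of how a STARPU_WORKERS_CPUID-style string such as "0 1 4 5" can be decoded into logical CPU numbers (illustrative only; this is not StarPU's actual parser, and the helper name is ours):

```c
#include <assert.h>
#include <stdlib.h>

/* Parse up to max whitespace-separated CPU numbers from s into out;
 * returns how many were found. strtol skips leading whitespace, and
 * parsing stops once no further number can be read. */
static unsigned parse_cpuid(const char *s, unsigned *out, unsigned max)
{
    unsigned n = 0;
    char *end;
    while (n < max) {
        long v = strtol(s, &end, 10);
        if (end == s)
            break;
        out[n++] = (unsigned)v;
        s = end;
    }
    return n;
}
```

For "0 1 4 5" this yields four entries, so worker 0 is bound to logical CPU #0, worker 1 to #1, worker 2 to #4, and worker 3 to #5, matching the example above.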
@@ -2480,7 +2526,7 @@ The application may access the requested data during the execution of this
 callback. The callback function must call @code{starpu_data_release} once the
 application does not need to access the piece of data anymore.
 Note that implicit data dependencies are also enforced by
-@code{starpu_data_acquire} in case they are enabled.
+@code{starpu_data_acquire_cb} in case they are enabled.
 Contrary to @code{starpu_data_acquire}, this function is non-blocking and may
 be called from task callbacks. Upon successful completion, this function
 returns 0.
@@ -2505,9 +2551,9 @@ This function releases the piece of data acquired by the application either by
 * Variable Interface::
 * Vector Interface::
 * Matrix Interface::
+* 3D Matrix Interface::
 * BCSR Interface for Sparse Matrices (Blocked Compressed Sparse Row Representation)::
 * CSR Interface for Sparse Matrices (Compressed Sparse Row Representation)::
-* Block Interface::
 @end menu
 
 @node Variable Interface
@@ -2582,33 +2628,45 @@ starpu_matrix_data_register(&matrix_handle, 0, (uintptr_t)matrix,
 @end cartouche
 @end table
 
-@node BCSR Interface for Sparse Matrices (Blocked Compressed Sparse Row Representation)
-@subsection BCSR Interface for Sparse Matrices (Blocked Compressed Sparse Row Representation)
+@node 3D Matrix Interface
+@subsection 3D Matrix Interface
 
 @table @asis
 @item @emph{Description}:
-This variant of @code{starpu_data_register} uses the BCSR sparse matrix interface.
-TODO
+This variant of @code{starpu_data_register} uses the 3D matrix interface.
+@code{ptr} is the address of the first element of the array in the home node.
+@code{ldy} is the number of elements between rows. @code{ldz} is the number
+of elements between z planes. @code{nx} is the number of elements in a row (this
+can be different from @code{ldy} if there are extra elements for alignment,
+for instance). @code{ny} is the number of rows in a z plane (likewise with
+@code{ldz}). @code{nz} is the number of z planes. @code{elemsize} is the size of
+each element.
 @item @emph{Prototype}:
-@code{void starpu_bcsr_data_register(starpu_data_handle *handle, uint32_t home_node, uint32_t nnz, uint32_t nrow,
-		uintptr_t nzval, uint32_t *colind, uint32_t *rowptr, uint32_t firstentry, uint32_t r, uint32_t c, size_t elemsize);}
+@code{void starpu_block_data_register(starpu_data_handle *handle, uint32_t home_node,
+                        uintptr_t ptr, uint32_t ldy, uint32_t ldz, uint32_t nx,
+                        uint32_t ny, uint32_t nz, size_t elemsize);}
 @item @emph{Example}:
 @cartouche
 @smallexample
+float *block;
+starpu_data_handle block_handle;
+block = (float*)malloc(nx*ny*nz*sizeof(float));
+starpu_block_data_register(&block_handle, 0, (uintptr_t)block,
+                           nx, nx*ny, nx, ny, nz, sizeof(float));
 @end smallexample
 @end cartouche
 @end table
 
-@node CSR Interface for Sparse Matrices (Compressed Sparse Row Representation)
-@subsection CSR Interface for Sparse Matrices (Compressed Sparse Row Representation)
+@node BCSR Interface for Sparse Matrices (Blocked Compressed Sparse Row Representation)
+@subsection BCSR Interface for Sparse Matrices (Blocked Compressed Sparse Row Representation)
 
 @table @asis
 @item @emph{Description}:
-This variant of @code{starpu_data_register} uses the CSR sparse matrix interface.
+This variant of @code{starpu_data_register} uses the BCSR sparse matrix interface.
 TODO
 @item @emph{Prototype}:
-@code{void starpu_csr_data_register(starpu_data_handle *handle, uint32_t home_node, uint32_t nnz, uint32_t nrow,
-		uintptr_t nzval, uint32_t *colind, uint32_t *rowptr, uint32_t firstentry, size_t elemsize);}
+@code{void starpu_bcsr_data_register(starpu_data_handle *handle, uint32_t home_node, uint32_t nnz, uint32_t nrow,
+		uintptr_t nzval, uint32_t *colind, uint32_t *rowptr, uint32_t firstentry, uint32_t r, uint32_t c, size_t elemsize);}
 @item @emph{Example}:
 @cartouche
 @smallexample
@@ -2616,31 +2674,19 @@ TODO
 @end cartouche
 @end table
 
-@node Block Interface
-@subsection Block Interface
+@node CSR Interface for Sparse Matrices (Compressed Sparse Row Representation)
+@subsection CSR Interface for Sparse Matrices (Compressed Sparse Row Representation)
 
 @table @asis
 @item @emph{Description}:
-This variant of @code{starpu_data_register} uses the 3D matrix interface.
-@code{ptr} is the address of the array of first element in the home node.
-@code{ldy} is the number of elements between rows. @code{ldz} is the number
-of rows between z planes. @code{nx} is the number of elements in a row (this
-can be different from @code{ldy} if there are extra elements for alignment
-for instance). @code{ny} is the number of rows in a z plane (likewise with
-@code{ldz}). @code{nz} is the number of z planes. @code{elemsize} is the size of
-each element.
+This variant of @code{starpu_data_register} uses the CSR sparse matrix interface.
+TODO
 @item @emph{Prototype}:
-@code{void starpu_block_data_register(starpu_data_handle *handle, uint32_t home_node,
-                        uintptr_t ptr, uint32_t ldy, uint32_t ldz, uint32_t nx,
-                        uint32_t ny, uint32_t nz, size_t elemsize);}
+@code{void starpu_csr_data_register(starpu_data_handle *handle, uint32_t home_node, uint32_t nnz, uint32_t nrow,
+		uintptr_t nzval, uint32_t *colind, uint32_t *rowptr, uint32_t firstentry, size_t elemsize);}
 @item @emph{Example}:
 @cartouche
 @smallexample
-float *block;
-starpu_data_handle block_handle;
-block = (float*)malloc(nx*ny*nz*sizeof(float));
-starpu_block_data_register(&block_handle, 0, (uintptr_t)block,
-                           nx, nx*ny, nx, ny, nz, sizeof(float));
 @end smallexample
 @end cartouche
 @end table
@@ -2661,24 +2707,28 @@ starpu_block_data_register(&block_handle, 0, (uintptr_t)block,
 @subsection @code{struct starpu_data_filter} -- StarPU filter structure
 @table @asis
 @item @emph{Description}:
-The filter structure describes a data partitioning function.
+The filter structure describes a data partitioning operation, to be given to the
+@code{starpu_data_partition} function; see @ref{starpu_data_partition} for an example.
 @item @emph{Fields}:
 @table @asis
 @item @code{filter_func}:
-TODO
+This function fills the @code{child_interface} structure with interface
+information for the @code{id}-th child of the parent @code{father_interface} (among @code{nparts}).
 @code{void (*filter_func)(void *father_interface, void* child_interface, struct starpu_data_filter *, unsigned id, unsigned nparts);}
 @item @code{get_nchildren}:
-TODO
+This returns the number of children.
 @code{unsigned (*get_nchildren)(struct starpu_data_filter *, starpu_data_handle initial_handle);}
 @item @code{get_child_ops}:
-TODO
+In case the resulting children use a different data interface, this function
+returns which interface is used by child number @code{id}.
 @code{struct starpu_data_interface_ops_t *(*get_child_ops)(struct starpu_data_filter *, unsigned id);}
-@item @code{filter_arg}:
-TODO
 @item @code{nchildren}:
-TODO
+This is the number of parts to partition the data into.
+@item @code{filter_arg}:
+Some filters take an additional parameter, but this is usually unused.
 @item @code{filter_arg_ptr}:
-TODO
+Some filters take an additional array parameter such as the sizes of the parts,
+but this is usually unused.
 @end table
 @end table
 
@@ -2687,9 +2737,22 @@ TODO
 
 @table @asis
 @item @emph{Description}:
-TODO
+This requests partitioning one StarPU data @code{initial_handle} into several
+subdata according to the filter @code{f}.
 @item @emph{Prototype}:
 @code{void starpu_data_partition(starpu_data_handle initial_handle, struct starpu_data_filter *f);}
+@item @emph{Example}:
+@cartouche
+@smallexample
+struct starpu_data_filter f = @{
+    .filter_func = starpu_vertical_block_filter_func,
+    .nchildren = nslicesx,
+    .get_nchildren = NULL,
+    .get_child_ops = NULL
+@};
+starpu_data_partition(A_handle, &f);
+@end smallexample
+@end cartouche
 @end table
 
 @node starpu_data_unpartition
@@ -2697,9 +2760,16 @@ TODO
 
 @table @asis
 @item @emph{Description}:
-TODO
+This unapplies one filter, thus unpartitioning the data. The pieces of data are
+collected back into one big piece in the @code{gathering_node} (usually 0).
 @item @emph{Prototype}:
 @code{void starpu_data_unpartition(starpu_data_handle root_data, uint32_t gathering_node);}
+@item @emph{Example}:
+@cartouche
+@smallexample
+starpu_data_unpartition(A_handle, 0);
+@end smallexample
+@end cartouche
 @end table
 
 @node starpu_data_get_nb_children
@@ -2707,9 +2777,9 @@ TODO
 
 @table @asis
 @item @emph{Description}:
-TODO
+This function returns the number of children.
 @item @emph{Return value}:
-This function returns returns the number of children.
+The number of children.
 @item @emph{Prototype}:
 @code{int starpu_data_get_nb_children(starpu_data_handle handle);}
 @end table
@@ -2721,11 +2791,22 @@ This function returns returns the number of children.
 
 @table @asis
 @item @emph{Description}:
-TODO
+After partitioning a StarPU data by applying a filter,
+@code{starpu_data_get_sub_data} can be used to get handles for each of the data
+portions. @code{root_data} is the parent data that was partitioned. @code{depth}
+is the number of filters to traverse (in case several filters have been applied,
+e.g. to partition in row blocks, and then in column blocks), and the subsequent
+parameters are the indexes.
 @item @emph{Return value}:
-TODO
+A handle to the subdata.
 @item @emph{Prototype}:
 @code{starpu_data_handle starpu_data_get_sub_data(starpu_data_handle root_data, unsigned depth, ... );}
+@item @emph{Example}:
+@cartouche
+@smallexample
+h = starpu_data_get_sub_data(A_handle, 1, taskx);
+@end smallexample
+@end cartouche
 @end table
 
 @node Predefined filter functions
@@ -2738,52 +2819,79 @@ TODO
 * Partitioning Block Data::     
 @end menu
 
-This section gives a list of the predefined partitioning functions.
-Examples on how to use them are shown in @ref{Partitioning Data}.
+This section gives a partial list of the predefined partitioning functions.
+Examples on how to use them are shown in @ref{Partitioning Data}. The complete
+list can be found in @code{starpu_data_filters.h}.
 
 @node Partitioning BCSR Data
 @subsubsection Partitioning BCSR Data
-@itemize
-@item
+
+@table @asis
+@item @emph{Description}:
 TODO
+@item @emph{Prototype}:
 @code{void starpu_canonical_block_filter_bcsr(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);}
-@item
+@end table
+
+@table @asis
+@item @emph{Description}:
 TODO
+@item @emph{Prototype}:
 @code{void starpu_vertical_block_filter_func_csr(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);}
-@end itemize
+@end table
 
 @node Partitioning BLAS interface
 @subsubsection Partitioning BLAS interface
-@itemize
-@item
-TODO
+
+@table @asis
+@item @emph{Description}:
+This partitions a dense matrix into horizontal blocks.
+@item @emph{Prototype}:
 @code{void starpu_block_filter_func(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);}
-@item
-TODO
+@end table
+
+@table @asis
+@item @emph{Description}:
+This partitions a dense matrix into vertical blocks.
+@item @emph{Prototype}:
 @code{void starpu_vertical_block_filter_func(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);}
-@end itemize
+@end table
 
 @node Partitioning Vector Data
 @subsubsection Partitioning Vector Data
-@itemize
-@item
-TODO
+
+@table @asis
+@item @emph{Description}:
+This partitions a vector into blocks of the same size.
+@item @emph{Prototype}:
 @code{void starpu_block_filter_func_vector(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);}
-@item
-TODO
+@end table
+
+@table @asis
+@item @emph{Description}:
+This partitions a vector into blocks of sizes given in @code{filter_arg_ptr}.
+@item @emph{Prototype}:
 @code{void starpu_vector_list_filter_func(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);}
-@item
-TODO
+@end table
+
+@table @asis
+@item @emph{Description}:
+This partitions a vector into two blocks, the first block size being given in @code{filter_arg}.
+@item @emph{Prototype}:
 @code{void starpu_vector_divide_in_2_filter_func(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);}
-@end itemize
+@end table
 
 @node Partitioning Block Data
 @subsubsection Partitioning Block Data
-@itemize
-@item
-TODO
+
+@table @asis
+@item @emph{Description}:
+This partitions a 3D matrix along the X axis.
+@item @emph{Prototype}:
 @code{void starpu_block_filter_func_block(void *father_interface, void *child_interface, struct starpu_data_filter *f, unsigned id, unsigned nparts);}
-@end itemize
+@end table
 
 @node Codelets and Tasks
 @section Codelets and Tasks
@@ -3086,7 +3194,7 @@ because there is no task being executed at the moment.
 @subsection @code{starpu_display_codelet_stats} -- Display statistics
 @table @asis
 @item @emph{Description}:
-TODO
+Output on @code{stderr} some statistics on the codelet @code{cl}.
 @item @emph{Prototype}:
 @code{void starpu_display_codelet_stats(struct starpu_codelet_t *cl);}
 @end table
@@ -3133,7 +3241,7 @@ redundancy in the task dependencies.
 @subsection @code{starpu_tag_t} -- Task logical identifier
 @table @asis
 @item @emph{Description}:
-It is possible to associate a task with a unique ``tag'' and to express
+It is possible to associate a task with a unique ``tag'' chosen by the application, and to express
 dependencies between tasks by the means of those tags. To do so, fill the
 @code{tag_id} field of the @code{starpu_task} structure with a tag number (can
 be arbitrary) and set the @code{use_tag} field to 1.
@@ -3342,7 +3450,7 @@ TODO
 @subsection @code{starpu_force_bus_sampling}
 @table @asis
 @item @emph{Description}:
-TODO
+This forces sampling the bus performance model again.
 @item @emph{Prototype}:
 @code{void starpu_force_bus_sampling(void);}
 @end table

+ 4 - 3
doc/vector_scal_c.texi

@@ -48,8 +48,8 @@ int main(int argc, char **argv)
     starpu_init(NULL);
 
 #ifdef STARPU_USE_OPENCL
-        starpu_opencl_load_opencl_from_file("examples/basic_examples/vector_scal_opencl_kernel.cl",
-                                            &programs);
+        starpu_opencl_load_opencl_from_file(
+               "examples/basic_examples/vector_scal_opencl_kernel.cl", &programs);
 #endif
 
     /* Tell StarPU to associate the "vector" vector with the "vector_handle"
@@ -66,7 +66,8 @@ int main(int argc, char **argv)
      *  - the fifth argument is the size of each element.
      */
     starpu_data_handle vector_handle;
-    starpu_vector_data_register(&vector_handle, 0, (uintptr_t)vector, NX, sizeof(vector[0]));
+    starpu_vector_data_register(&vector_handle, 0, (uintptr_t)vector,
+                                NX, sizeof(vector[0]));
 
     float factor = 3.14;
 

+ 5 - 3
doc/vector_scal_cuda.texi

@@ -4,7 +4,7 @@ static __global__ void vector_mult_cuda(float *val, unsigned n,
                                         float factor)
 @{
-        unsigned i;
-        for(i = 0 ; i < n ; i++)
+        unsigned i = blockIdx.x*blockDim.x + threadIdx.x;
+        if (i < n)
                val[i] *= factor;
 @}
 
@@ -16,8 +16,10 @@ extern "C" void scal_cuda_func(void *buffers[], void *_args)
         unsigned n = STARPU_VECTOR_GET_NX(buffers[0]);
         /* local copy of the vector pointer */
         float *val = (float *)STARPU_VECTOR_GET_PTR(buffers[0]);
+        unsigned threads_per_block = 64;
+        unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
 
-        vector_mult_cuda<<<1,1>>>(val, n, *factor);
+        vector_mult_cuda<<<nblocks,threads_per_block>>>(val, n, *factor);
 
-    cudaThreadSynchronize();
+        cudaThreadSynchronize();
 @}

+ 7 - 4
doc/vector_scal_opencl.texi

@@ -18,7 +18,8 @@ void scal_opencl_func(void *buffers[], void *_args)
     id = starpu_worker_get_id();
     devid = starpu_worker_get_devid(id);
 
-    err = starpu_opencl_load_kernel(&kernel, &queue, &programs, "vector_mult_opencl", devid);
+    err = starpu_opencl_load_kernel(&kernel, &queue, &programs, "vector_mult_opencl",
+                                    devid);
     if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
 
     err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &val);
@@ -28,16 +29,18 @@ void scal_opencl_func(void *buffers[], void *_args)
 
     @{
         size_t global=n;
-	size_t local;
+        size_t local;
         size_t s;
         cl_device_id device;
 
         starpu_opencl_get_device(devid, &device);
-        err = clGetKernelWorkGroupInfo (kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, &s);
+        err = clGetKernelWorkGroupInfo (kernel, device, CL_KERNEL_WORK_GROUP_SIZE,
+                                        sizeof(local), &local, &s);
         if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
         if (local > global) local=global;
 
-        err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
+        err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0,
+                                     NULL, NULL);
         if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
     @}