Parcourir la source

doc: always use the same environment (cartouche+smallexample) for source code

Nathalie Furmento il y a 13 ans
Parent
commit
d5b24e39d3

+ 15 - 8
doc/chapters/advanced-examples.texi

@@ -57,7 +57,8 @@ void scal_sse_func(void *buffers[], void *cl_arg)
 struct starpu_codelet cl = @{
     .where = STARPU_CPU,
     .cpu_funcs = @{ scal_cpu_func, scal_sse_func, NULL @},
-    .nbuffers = 1
+    .nbuffers = 1,
+    .modes = @{ STARPU_RW @}
 @};
 @end smallexample
 @end cartouche
@@ -142,7 +143,8 @@ struct starpu_codelet cl = @{
     .can_execute = can_execute,
     .cpu_funcs = @{ cpu_func, NULL @},
     .cuda_funcs = @{ scal_gpu_13, scal_gpu_20, NULL @},
-    .nbuffers = 1
+    .nbuffers = 1,
+    .modes = @{ STARPU_RW @}
 @};
 @end smallexample
 @end cartouche
@@ -339,6 +341,7 @@ struct starpu_codelet cl = @{
     .where = STARPU_CPU,
     .cpu_funcs = @{ cpu_mult, NULL @},
     .nbuffers = 3,
+    .modes = @{ STARPU_R, STARPU_R, STARPU_W @},
     /* for the scheduling policy to be able to use performance models */
     .model = &mult_perf_model
 @};
@@ -576,7 +579,8 @@ will be able to convert data from one data structure to the other when needed.
 Note that the heft scheduler is the only one optimized for this interface. The
 user must provide StarPU with conversion codelets:
 
-@example
+@cartouche
+@smallexample
 #define NX 1024
 struct point array_of_structs[NX];
 starpu_data_handle_t handle;
@@ -615,14 +619,16 @@ struct starpu_multiformat_data_interface_ops format_ops = @{
     ...
 @};
 starpu_multiformat_data_register(handle, 0, &array_of_structs, NX, &format_ops);
-@end example
+@end smallexample
+@end cartouche
 
 Kernels can be written almost as for any other interface. Note that
 STARPU_MULTIFORMAT_GET_PTR shall only be used for CPU kernels. CUDA kernels
 must use STARPU_MULTIFORMAT_GET_CUDA_PTR, and OpenCL kernels must use
 STARPU_MULTIFORMAT_GET_OPENCL_PTR. STARPU_MULTIFORMAT_GET_NX may be used in any
 kind of kernel.
-@example
+@cartouche
+@smallexample
 static void
 multiformat_scal_cpu_func(void *buffers[], void *args)
 @{
@@ -644,7 +650,8 @@ extern "C" void multiformat_scal_cuda_func(void *buffers[], void *_args)
 
 	...
 @}
-@end example
+@end smallexample
+@end cartouche
 
 A full example may be found in @code{examples/basic_examples/multiformat.c}.
 
@@ -659,7 +666,7 @@ renderbuffer objects into CUDA. To achieve this with StarPU, it simply needs to
 be given the CUDA pointer at registration, for instance:
 
 @cartouche
-@example
+@smallexample
 	for (workerid = 0; workerid < starpu_worker_get_count(); workerid++)
 		if (starpu_worker_get_type(workerid) == STARPU_CUDA_WORKER)
 			break;
@@ -677,7 +684,7 @@ be given the CUDA pointer at registration, for instance:
 	cudaGraphicsUnmapResources(1, &resource, 0);
 
 	/* Now display it */
-@end example
+@end smallexample
 @end cartouche
 
 @node More examples

+ 18 - 10
doc/chapters/basic-api.texi

@@ -454,23 +454,27 @@ Register the @var{size}-byte element pointed to by @var{ptr}, which is
 typically a scalar, and initialize @var{handle} to represent this data
 item.
 
+@cartouche
 @smallexample
 float var;
 starpu_data_handle_t var_handle;
 starpu_variable_data_register(&var_handle, 0, (uintptr_t)&var, sizeof(var));
 @end smallexample
+@end cartouche
 @end deftypefun
 
 @deftypefun void starpu_vector_data_register ({starpu_data_handle_t *}@var{handle}, uint32_t @var{home_node}, uintptr_t @var{ptr}, uint32_t @var{count}, size_t @var{size})
 Register the @var{count} @var{size}-byte elements pointed to by
 @var{ptr} and initialize @var{handle} to represent it.
 
-@example
+@cartouche
+@smallexample
 float vector[NX];
 starpu_data_handle_t vector_handle;
 starpu_vector_data_register(&vector_handle, 0, (uintptr_t)vector, NX,
                             sizeof(vector[0]));
-@end example
+@end smallexample
+@end cartouche
 @end deftypefun
 
 @deftypefun void starpu_matrix_data_register ({starpu_data_handle_t *}@var{handle}, uint32_t @var{home_node}, uintptr_t @var{ptr}, uint32_t @var{ld}, uint32_t @var{nx}, uint32_t @var{ny}, size_t @var{size})
@@ -480,13 +484,15 @@ pointed by @var{ptr} and initialize @var{handle} to represent it.
 each row; a non-zero @var{ld} adds padding, which can be useful for
 alignment purposes.
 
-@example
+@cartouche
+@smallexample
 float *matrix;
 starpu_data_handle_t matrix_handle;
 matrix = (float*)malloc(width * height * sizeof(float));
 starpu_matrix_data_register(&matrix_handle, 0, (uintptr_t)matrix,
                             width, width, height, sizeof(float));
-@end example
+@end smallexample
+@end cartouche
 @end deftypefun
 
 @deftypefun void starpu_block_data_register ({starpu_data_handle_t *}@var{handle}, uint32_t @var{home_node}, uintptr_t @var{ptr}, uint32_t @var{ldy}, uint32_t @var{ldz}, uint32_t @var{nx}, uint32_t @var{ny}, uint32_t @var{nz}, size_t @var{size})
@@ -495,13 +501,15 @@ elements pointed by @var{ptr} and initialize @var{handle} to represent
 it.  Again, @var{ldy} and @var{ldz} specify the number of extra elements
 present at the end of each row or column.
 
-@example
+@cartouche
+@smallexample
 float *block;
 starpu_data_handle_t block_handle;
 block = (float*)malloc(nx*ny*nz*sizeof(float));
 starpu_block_data_register(&block_handle, 0, (uintptr_t)block,
                            nx, nx*ny, nx, ny, nz, sizeof(float));
-@end example
+@end smallexample
+@end cartouche
 @end deftypefun
 
 @deftypefun void starpu_bcsr_data_register (starpu_data_handle_t *@var{handle}, uint32_t @var{home_node}, uint32_t @var{nnz}, uint32_t @var{nrow}, uintptr_t @var{nzval}, uint32_t *@var{colind}, uint32_t *@var{rowptr}, uint32_t @var{firstentry}, uint32_t @var{r}, uint32_t @var{c}, size_t @var{elemsize})
@@ -1351,11 +1359,11 @@ typically need to be explicitly casted. Using the
 @code{starpu_tag_declare_deps_array} function avoids this hazard.
 
 @cartouche
-@example
+@smallexample
 /*  Tag 0x1 depends on tags 0x32 and 0x52 */
 starpu_tag_declare_deps((starpu_tag_t)0x1,
         2, (starpu_tag_t)0x32, (starpu_tag_t)0x52);
-@end example
+@end smallexample
 @end cartouche
 @end deftypefun
 
 This function is similar to @code{starpu_tag_declare_deps}, except
 that it does not take a variable number of arguments but an array of
 tags of size @var{ndeps}.
 @cartouche
-@example
+@smallexample
 /*  Tag 0x1 depends on tags 0x32 and 0x52 */
 starpu_tag_t tag_array[2] = @{0x32, 0x52@};
 starpu_tag_declare_deps_array((starpu_tag_t)0x1, 2, tag_array);
-@end example
+@end smallexample
 @end cartouche
 @end deftypefun
 

+ 20 - 10
doc/chapters/c-extensions.texi

@@ -98,7 +98,8 @@ Declare the given function as an implementation of @var{task} to run on
 
 Here is an example:
 
-@example
+@cartouche
+@smallexample
 #define __output  __attribute__ ((output))
 
 static void matmul (const float *A, const float *B,
@@ -125,7 +126,8 @@ matmul_cpu (const float *A, const float *B, __output float *C,
           C[j * nx + i] += A[j * nz + k] * B[k * nx + i];
       @}
 @}
-@end example
+@end smallexample
+@end cartouche
 
 @noindent
 A @code{matmult} task is defined; it has only one implementation,
@@ -135,7 +137,8 @@ buffer.
 
 CUDA and OpenCL implementations can be declared in a similar way:
 
-@example
+@cartouche
+@smallexample
 static void matmul_cuda (const float *A, const float *B, float *C,
                          size_t nx, size_t ny, size_t nz)
   __attribute__ ((task_implementation ("cuda", matmul)));
@@ -143,7 +146,8 @@ static void matmul_cuda (const float *A, const float *B, float *C,
 static void matmul_opencl (const float *A, const float *B, float *C,
                            size_t nx, size_t ny, size_t nz)
   __attribute__ ((task_implementation ("opencl", matmul)));
-@end example
+@end smallexample
+@end cartouche
 
 @noindent
 The CUDA and OpenCL implementations typically either invoke a kernel
@@ -151,7 +155,8 @@ written in CUDA or OpenCL (for similar code, @pxref{CUDA Kernel}, and
 @pxref{OpenCL Kernel}), or call a library function that uses CUDA or
 OpenCL under the hood, such as CUBLAS functions:
 
-@example
+@cartouche
+@smallexample
 static void
 matmul_cuda (const float *A, const float *B, float *C,
              size_t nx, size_t ny, size_t nz)
@@ -161,16 +166,19 @@ matmul_cuda (const float *A, const float *B, float *C,
                0.0f, C, 0);
   cudaStreamSynchronize (starpu_cuda_get_local_stream ());
 @}
-@end example
+@end smallexample
+@end cartouche
 
 A task can be invoked like a regular C function:
 
-@example
+@cartouche
+@smallexample
 matmul (&A[i * zdim * bydim + k * bzdim * bydim],
         &B[k * xdim * bzdim + j * bxdim * bzdim],
         &C[i * xdim * bydim + j * bxdim * bydim],
         bxdim, bydim, bzdim);
-@end example
+@end smallexample
+@end cartouche
 
 @noindent
 This leads to an @dfn{asynchronous invocation}, whereby @code{matmult}'s
@@ -225,7 +233,8 @@ supported C extensions.
 The code below illustrates how to define a task and its implementations
 in a way that allows it to be compiled without the GCC plug-in:
 
-@example
+@cartouche
+@smallexample
 /* The macros below abstract over the attributes specific to
    StarPU-GCC and the name of the CPU implementation.  */
 #ifdef STARPU_GCC_PLUGIN
@@ -279,7 +288,8 @@ main (int argc, char *argv[])
 
   return EXIT_SUCCESS;
 @}
-@end example
+@end smallexample
+@end cartouche
 
 Note that attributes such as @code{task} are simply ignored by GCC when
 the StarPU plug-in is not loaded, so the @code{__task} macro could be

+ 25 - 13
doc/chapters/perf-feedback.texi

@@ -2,7 +2,7 @@
 
 @c This file is part of the StarPU Handbook.
 @c Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
-@c Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+@c Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
 @c Copyright (C) 2011 Institut National de Recherche en Informatique et Automatique
 @c See the file starpu.texi for copying conditions.
 
@@ -135,47 +135,59 @@ Variables to be monitored can be registered by calling the
 @code{starpu_top_add_data_boolean}, @code{starpu_top_add_data_integer},
 @code{starpu_top_add_data_float} functions, e.g.:
 
-@example
+@cartouche
+@smallexample
 starpu_top_data *data = starpu_top_add_data_integer("mynum", 0, 100, 1);
-@end example
+@end smallexample
+@end cartouche
 
 The application should then call @code{starpu_top_init_and_wait} to give its name
 and wait for StarPU-Top to get a start request from the user. The name is used
 by StarPU-Top to quickly reload a previously-saved layout of parameter display.
 
-@example
+@cartouche
+@smallexample
 starpu_top_init_and_wait("the application");
-@end example
+@end smallexample
+@end cartouche
 
 The new values can then be provided thanks to
 @code{starpu_top_update_data_boolean}, @code{starpu_top_update_data_integer},
 @code{starpu_top_update_data_float}, e.g.:
 
-@example
+@cartouche
+@smallexample
 starpu_top_update_data_integer(data, mynum);
-@end example
+@end smallexample
+@end cartouche
 
 Updateable parameters can be registered thanks to @code{starpu_top_register_parameter_boolean}, @code{starpu_top_register_parameter_integer}, @code{starpu_top_register_parameter_float}, e.g.:
 
-@example
+@cartouche
+@smallexample
 float alpha;
 starpu_top_register_parameter_float("alpha", &alpha, 0, 10, modif_hook);
-@end example
+@end smallexample
+@end cartouche
 
 @code{modif_hook} is a function which will be called when the parameter is being modified, it can for instance print the new value:
 
-@example
+@cartouche
+@smallexample
 void modif_hook(struct starpu_top_param *d) @{
     fprintf(stderr,"%s has been modified: %f\n", d->name, alpha);
 @}
-@end example
+@end smallexample
+@end cartouche
 
 Task schedulers should notify StarPU-Top when it has decided when a task will be
 scheduled, so that it can show it in its Gantt chart, for instance:
 
-@example
+@cartouche
+@smallexample
 starpu_top_task_prevision(task, workerid, begin, end);
-@end example
+@end smallexample
+@end cartouche
 
 Starting StarPU-Top and the application can be done two ways:
 

+ 13 - 7
doc/chapters/perf-optimization.texi

@@ -2,7 +2,7 @@
 
 @c This file is part of the StarPU Handbook.
 @c Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
-@c Copyright (C) 2010, 2011  Centre National de la Recherche Scientifique
+@c Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
 @c Copyright (C) 2011 Institut National de Recherche en Informatique et Automatique
 @c See the file starpu.texi for copying conditions.
 
@@ -47,16 +47,20 @@ that this data will not be re-used by further tasks, it should advise StarPU to
 immediately replicate it to a desired list of memory nodes (given through a
 bitmask). This can be understood like the write-through mode of CPU caches.
 
-@example
+@cartouche
+@smallexample
 starpu_data_set_wt_mask(img_handle, 1<<0);
-@end example
+@end smallexample
+@end cartouche
 
 will for instance request to always automatically transfer a replicate into the
 main memory (node 0), as bit 0 of the write-through bitmask is being set.
 
-@example
+@cartouche
+@smallexample
 starpu_data_set_wt_mask(img_handle, ~0U);
-@end example
+@end smallexample
+@end cartouche
 
 will request to always automatically broadcast the updated data to all memory
 nodes.
@@ -265,10 +269,12 @@ dedicated CUDA stream for its computations. StarPU provides one by the use of
 @code{starpu_cuda_get_local_stream()} which should be used by all CUDA codelet
 operations. For instance:
 
-@example
+@cartouche
+@smallexample
 func <<<grid,block,0,starpu_cuda_get_local_stream()>>> (foo, bar);
 cudaStreamSynchronize(starpu_cuda_get_local_stream());
-@end example
+@end smallexample
+@end cartouche
 
 StarPU already does appropriate calls for the CUBLAS library.