@@ -10,10 +10,11 @@
* Using multiple implementations of a codelet::
* Enabling implementation according to capabilities::
* Task and Worker Profiling::
-* Partitioning Data:: Partitioning Data
+* Partitioning Data::
* Performance model example::
* Theoretical lower bound on execution time::
* Insert Task Utility::
+* Data reduction::
* Parallel Tasks::
* Debugging::
* The multiformat interface::
@@ -587,6 +588,86 @@ be executed, and is allowed to read from @code{i} to use it e.g. as an
index. Note that this macro is only available when compiling StarPU with
the compiler @code{gcc}.

+@node Data reduction
+@section Data reduction
+
+In various cases, some piece of data is used to accumulate intermediate
+results: for instance, the dot product of a vector, maximum/minimum finding,
+the histogram of a photograph, etc. When these results are produced all over
+the machine, it would not be efficient to accumulate them in only one place,
+since that would incur a data transfer for each contribution as well as
+contention on concurrent accesses.
+
+StarPU provides a @code{STARPU_REDUX} access mode, which optimizes this case:
+StarPU allocates a buffer on each memory node and accumulates the
+intermediate results there. When the data is eventually accessed in the
+normal @code{STARPU_R} mode, StarPU collects the intermediate results into a
+single buffer.
+
+For this to work, the application has to call
+@code{starpu_data_set_reduction_methods} to declare how to initialize these
+buffers and how to assemble the partial results.
+
+For instance, the @code{cg} example uses this mechanism to optimize its dot
+product: it first defines the codelets for initialization and reduction:
+
+@smallexample
+struct starpu_codelet bzero_variable_cl =
+@{
+  .cpu_funcs = @{ bzero_variable_cpu, NULL @},
+  .cuda_funcs = @{ bzero_variable_cuda, NULL @},
+  .nbuffers = 1,
+@};
+
+static void accumulate_variable_cpu(void *descr[], void *cl_arg)
+@{
+  double *v_dst = (double *)STARPU_VARIABLE_GET_PTR(descr[0]);
+  double *v_src = (double *)STARPU_VARIABLE_GET_PTR(descr[1]);
+  *v_dst = *v_dst + *v_src;
+@}
+
+static void accumulate_variable_cuda(void *descr[], void *cl_arg)
+@{
+  double *v_dst = (double *)STARPU_VARIABLE_GET_PTR(descr[0]);
+  double *v_src = (double *)STARPU_VARIABLE_GET_PTR(descr[1]);
+  /* v_dst += v_src, performed on the CUDA worker's stream */
+  cublasDaxpy(1, (double)1.0, v_src, 1, v_dst, 1);
+  cudaStreamSynchronize(starpu_cuda_get_local_stream());
+@}
+
+struct starpu_codelet accumulate_variable_cl =
+@{
+  .cpu_funcs = @{ accumulate_variable_cpu, NULL @},
+  .cuda_funcs = @{ accumulate_variable_cuda, NULL @},
+  /* the reduction codelet accesses two buffers: destination and source */
+  .nbuffers = 2,
+@};
+@end smallexample
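+
+The initialization functions @code{bzero_variable_cpu} and
+@code{bzero_variable_cuda} referenced above are not reproduced here; as an
+illustration, a minimal sketch of the CPU version (not the actual @code{cg}
+source) simply resets the variable to the neutral element of the reduction:
+
+@smallexample
+static void bzero_variable_cpu(void *descr[], void *cl_arg)
+@{
+  /* Initialize the per-worker accumulation variable to zero. */
+  double *v = (double *)STARPU_VARIABLE_GET_PTR(descr[0]);
+  *v = 0.0;
+@}
+@end smallexample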
+
+It then attaches these codelets as the reduction methods for its
+@code{dtq_handle} data handle:
+
+@smallexample
+starpu_data_set_reduction_methods(dtq_handle,
+        &accumulate_variable_cl, &bzero_variable_cl);
+@end smallexample
+
+@code{dtq_handle} can now be used in @code{STARPU_REDUX} mode for the dot
+products with the partitioned vectors:
+
+@smallexample
+void dots(starpu_data_handle v1, starpu_data_handle v2,
+          starpu_data_handle s, unsigned nblocks)
+@{
+    unsigned b;
+    /* Reset the accumulation variable before summing the contributions. */
+    starpu_insert_task(&bzero_variable_cl, STARPU_W, s, 0);
+    for (b = 0; b < nblocks; b++)
+        starpu_insert_task(&dot_kernel_cl,
+            /* each task accumulates its partial dot product into s */
+            STARPU_REDUX, s,
+            STARPU_R, starpu_data_get_sub_data(v1, 1, b),
+            STARPU_R, starpu_data_get_sub_data(v2, 1, b),
+            0);
+@}
+@end smallexample
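+
+As an illustration (this retrieval code is a sketch and not part of the
+@code{cg} sources shown above; @code{v1}, @code{v2} and @code{nblocks} are
+assumed to be defined as before), the handle can be registered from a host
+variable, and reading it back in @code{STARPU_R} mode triggers the reduction
+of the per-worker contributions:
+
+@smallexample
+double dtq = 0.0;
+starpu_data_handle dtq_handle;
+
+starpu_variable_data_register(&dtq_handle, 0, (uintptr_t)&dtq, sizeof(dtq));
+starpu_data_set_reduction_methods(dtq_handle,
+        &accumulate_variable_cl, &bzero_variable_cl);
+
+dots(v1, v2, dtq_handle, nblocks);
+
+/* Acquiring the handle in STARPU_R mode waits for the tasks and reduces
+   the per-worker contributions; dtq then holds the full dot product. */
+starpu_data_acquire(dtq_handle, STARPU_R);
+/* ... use dtq ... */
+starpu_data_release(dtq_handle);
+@end smallexample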
+
+The @code{cg} example also uses reduction for its blocked gemv kernel, which
+leads to yet more relaxed dependencies and thus more parallelism.
+
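+As an illustration of that pattern (the codelet name @code{gemv_block_cl} and
+the partitioning layout below are assumptions of this sketch, not the actual
+@code{cg} code), each task accumulates its contribution to a block of the
+result vector through @code{STARPU_REDUX}:
+
+@smallexample
+unsigned i, j;
+for (i = 0; i < nblocks; i++)
+    for (j = 0; j < nblocks; j++)
+        starpu_insert_task(&gemv_block_cl,
+            /* partial result y[i] += A[i,j] * x[j], accumulated by reduction */
+            STARPU_REDUX, starpu_data_get_sub_data(y, 1, i),
+            STARPU_R, starpu_data_get_sub_data(A, 2, i, j),
+            STARPU_R, starpu_data_get_sub_data(x, 1, j),
+            0);
+@end smallexample
+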
@node Parallel Tasks
@section Parallel Tasks
