
Add reduction example

Samuel Thibault 13 years ago
commit d7cb5a1d9d
1 changed file with 82 additions and 1 deletion

doc/chapters/advanced-examples.texi (+82 −1)
@@ -10,10 +10,11 @@
 * Using multiple implementations of a codelet::
 * Enabling implementation according to capabilities::
 * Task and Worker Profiling::
-* Partitioning Data::           Partitioning Data
+* Partitioning Data::
 * Performance model example::
 * Theoretical lower bound on execution time::
 * Insert Task Utility::
+* Data reduction::
 * Parallel Tasks::
 * Debugging::
 * The multiformat interface::
@@ -587,6 +588,86 @@ be executed, and is allowed to read from @code{i} to use it e.g. as an
 index. Note that this macro is only available when compiling StarPU with
 the compiler @code{gcc}.
 
+@node Data reduction
+@section Data reduction
+
+In various cases, a piece of data is used to accumulate intermediate
+results: for instance, the dot product of two vectors, finding a maximum or
+minimum, computing the histogram of a photograph, etc. When these results are
+produced across the whole machine, it would not be efficient to accumulate
+them in a single place, since that would incur a data transfer and access
+contention for each contribution.
+
+StarPU provides a @code{STARPU_REDUX} mode which optimizes this case: it
+allocates a buffer on each memory node and accumulates intermediate results
+there. When the data is eventually accessed in the normal @code{STARPU_R}
+mode, StarPU collects the intermediate results into a single buffer.
+
+For this to work, the user has to call
+@code{starpu_data_set_reduction_methods} to declare how to initialize these
+buffers and how to assemble partial results.
+
+For instance, the @code{cg} example uses this to optimize its dot product: it
+first defines the codelets for initialization and accumulation:
+
+@smallexample
+struct starpu_codelet bzero_variable_cl =
+@{
+        .cpu_funcs = @{ bzero_variable_cpu, NULL @},
+        .cuda_funcs = @{ bzero_variable_cuda, NULL @},
+        .nbuffers = 1,
+@};
+
+static void accumulate_variable_cpu(void *descr[], void *cl_arg)
+@{
+        double *v_dst = (double *)STARPU_VARIABLE_GET_PTR(descr[0]);
+        double *v_src = (double *)STARPU_VARIABLE_GET_PTR(descr[1]);
+        *v_dst = *v_dst + *v_src;
+@}
+
+static void accumulate_variable_cuda(void *descr[], void *cl_arg)
+@{
+        double *v_dst = (double *)STARPU_VARIABLE_GET_PTR(descr[0]);
+        double *v_src = (double *)STARPU_VARIABLE_GET_PTR(descr[1]);
+        cublasDaxpy(1, (double)1.0, v_src, 1, v_dst, 1);
+        cudaStreamSynchronize(starpu_cuda_get_local_stream());
+@}
+
+struct starpu_codelet accumulate_variable_cl =
+@{
+        .cpu_funcs = @{ accumulate_variable_cpu, NULL @},
+        .cuda_funcs = @{ accumulate_variable_cuda, NULL @},
+        .nbuffers = 2,
+@};
+@end smallexample
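+
+The initialization functions @code{bzero_variable_cpu} and
+@code{bzero_variable_cuda} referenced by @code{bzero_variable_cl} are defined
+elsewhere in the example; assuming the variable interface holds a single
+@code{double}, they could be sketched as:
+
+@smallexample
+static void bzero_variable_cpu(void *descr[], void *cl_arg)
+@{
+        double *v = (double *)STARPU_VARIABLE_GET_PTR(descr[0]);
+        *v = 0.0;
+@}
+
+static void bzero_variable_cuda(void *descr[], void *cl_arg)
+@{
+        double *v = (double *)STARPU_VARIABLE_GET_PTR(descr[0]);
+        cudaMemsetAsync(v, 0, sizeof(double),
+                        starpu_cuda_get_local_stream());
+        cudaStreamSynchronize(starpu_cuda_get_local_stream());
+@}
+@end smallexample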
+
+It then attaches @code{accumulate_variable_cl} and @code{bzero_variable_cl}
+as reduction methods for its @code{dtq_handle}:
+
+@smallexample
+starpu_data_set_reduction_methods(dtq_handle,
+        &accumulate_variable_cl, &bzero_variable_cl);
+@end smallexample
+
+@code{dtq_handle} can now be used in @code{STARPU_REDUX} mode for the dot
+products with partitioned vectors:
+
+@smallexample
+void dots(starpu_data_handle v1, starpu_data_handle v2,
+          starpu_data_handle s, unsigned nblocks)
+@{
+    unsigned b;
+    starpu_insert_task(&bzero_variable_cl, STARPU_W, s, 0);
+    for (b = 0; b < nblocks; b++)
+        starpu_insert_task(&dot_kernel_cl,
+            STARPU_REDUX, s,
+            STARPU_R, starpu_data_get_sub_data(v1, 1, b),
+            STARPU_R, starpu_data_get_sub_data(v2, 1, b),
+            0);
+@}
+@end smallexample
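+
+The @code{dot_kernel_cl} codelet is likewise defined elsewhere in the
+example. Assuming the vector blocks hold @code{double} values, its CPU
+implementation could be sketched as follows; in @code{STARPU_REDUX} mode,
+each worker simply accumulates into its own local copy of @code{s}:
+
+@smallexample
+static void dot_kernel_cpu(void *descr[], void *cl_arg)
+@{
+        double *dot = (double *)STARPU_VARIABLE_GET_PTR(descr[0]);
+        double *v1 = (double *)STARPU_VECTOR_GET_PTR(descr[1]);
+        double *v2 = (double *)STARPU_VECTOR_GET_PTR(descr[2]);
+        unsigned n = STARPU_VECTOR_GET_NX(descr[1]);
+        unsigned i;
+        for (i = 0; i < n; i++)
+                *dot += v1[i] * v2[i];
+@}
+@end smallexample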
+
+The @code{cg} example also uses reduction for the blocked gemv kernel,
+further relaxing dependencies and exposing more parallelism.
+
 @node Parallel Tasks
 @section Parallel Tasks