doc: modify chapters outline; we are trying here to divide the whole documentation into distinct self-readable parts

Nathalie Furmento 11 years ago
parent
commit
4839a2cfce
28 changed files with 2230 additions and 2080 deletions
  1. doc/doxygen/Makefile.am (+24 -18)
  2. doc/doxygen/chapters/00introduction.doxy (+75 -12)
  3. doc/doxygen/chapters/01building.doxy (+1 -1)
  4. doc/doxygen/chapters/03advanced_examples.doxy (+2 -1234)
  5. doc/doxygen/chapters/04optimize_performance.doxy (+0 -552)
  6. doc/doxygen/chapters/05check_list_performance.doxy (+204 -0)
  7. doc/doxygen/chapters/06tasks.doxy (+443 -0)
  8. doc/doxygen/chapters/07data_management.doxy (+508 -0)
  9. doc/doxygen/chapters/08scheduling.doxy (+151 -0)
  10. doc/doxygen/chapters/09scheduling_contexts.doxy (+0 -0)
  11. doc/doxygen/chapters/10scheduling_context_hypervisor.doxy (+0 -0)
  12. doc/doxygen/chapters/11debugging_tools.doxy (+42 -0)
  13. doc/doxygen/chapters/12online_performance_tools.doxy (+432 -0)
  14. doc/doxygen/chapters/05performance_feedback.doxy (+80 -212)
  15. doc/doxygen/chapters/06tips_and_tricks.doxy (+100 -21)
  16. doc/doxygen/chapters/15out_of_core.doxy (+0 -0)
  17. doc/doxygen/chapters/16mpi_support.doxy (+0 -0)
  18. doc/doxygen/chapters/17fft_support.doxy (+0 -0)
  19. doc/doxygen/chapters/18mic_scc_support.doxy (+0 -0)
  20. doc/doxygen/chapters/19c_extensions.doxy (+0 -0)
  21. doc/doxygen/chapters/20socl_opencl_extensions.doxy (+0 -0)
  22. doc/doxygen/chapters/21simgrid.doxy (+104 -0)
  23. doc/doxygen/chapters/40environment_variables.doxy (+0 -0)
  24. doc/doxygen/chapters/41configure_options.doxy (+0 -0)
  25. doc/doxygen/chapters/45files.doxy (+0 -0)
  26. doc/doxygen/chapters/50scaling-vector-example.doxy (+0 -0)
  27. doc/doxygen/chapters/51fdl-1.3.doxy (+0 -0)
  28. doc/doxygen/refman.tex (+64 -30)

+ 24 - 18
doc/doxygen/Makefile.am

@@ -28,22 +28,28 @@ chapters =	\
 	chapters/01building.doxy \
 	chapters/02basic_examples.doxy \
 	chapters/03advanced_examples.doxy \
-	chapters/04optimize_performance.doxy \
-	chapters/05performance_feedback.doxy \
-	chapters/06tips_and_tricks.doxy \
-	chapters/07out_of_core.doxy \
-	chapters/08mpi_support.doxy \
-	chapters/09fft_support.doxy \
-	chapters/10mic_scc_support.doxy \
-	chapters/11c_extensions.doxy \
-	chapters/12socl_opencl_extensions.doxy \
-	chapters/13scheduling_contexts.doxy \
-	chapters/14scheduling_context_hypervisor.doxy \
-	chapters/15environment_variables.doxy \
-	chapters/16configure_options.doxy \
-	chapters/17files.doxy \
-	chapters/18scaling-vector-example.doxy \
-	chapters/19fdl-1.3.doxy \
+	chapters/05check_list_performance.doxy \
+	chapters/06tasks.doxy \
+	chapters/07data_management.doxy \
+	chapters/08scheduling.doxy \
+	chapters/09scheduling_contexts.doxy \
+	chapters/10scheduling_context_hypervisor.doxy \
+	chapters/11debugging_tools.doxy \
+	chapters/12online_performance_tools.doxy \
+	chapters/13offline_performance_tools.doxy \
+	chapters/14faq.doxy \
+	chapters/15out_of_core.doxy \
+	chapters/16mpi_support.doxy \
+	chapters/17fft_support.doxy \
+	chapters/18mic_scc_support.doxy \
+	chapters/19c_extensions.doxy \
+	chapters/20socl_opencl_extensions.doxy \
+	chapters/21simgrid.doxy \
+	chapters/40environment_variables.doxy \
+	chapters/41configure_options.doxy \
+	chapters/45files.doxy \
+	chapters/50scaling-vector-example.doxy \
+	chapters/51fdl-1.3.doxy \
 	chapters/code/hello_pragma2.c \
 	chapters/code/hello_pragma.c \
 	chapters/code/scal_pragma.cu \
@@ -218,8 +224,8 @@ $(DOX_TAG): $(dox_inputs)
 	$(DOXYGEN) $(DOX_CONFIG)
 	sed -i 's/ModuleDocumentation <\/li>/<a class="el" href="modules.html">Modules<\/a>/' html/index.html
 	sed -i 's/FileDocumentation <\/li>/<a class="el" href="files.html">Files<\/a>/' html/index.html
-        # comment for the line above: what we really want to do is to remove the line, but by doing so, it avoids opening the interactive menu when browsing files
-	if test -f html/navtree.js ; then sed -i 's/\[ "Files", "Files.html", null \]/\[ "", "Files.html", null \]/' html/navtree.js ; fi
+        # comment for the line below: what we really want to do is to remove the line, but by doing so, it avoids opening the interactive menu when browsing files
+#	if test -f html/navtree.js ; then sed -i 's/\[ "Files", "Files.html", null \]/\[ "", "Files.html", null \]/' html/navtree.js ; fi
 	sed -i 's/.*"Files.html".*//' html/pages.html
 	if test -f latex/main.tex ; then mv latex/main.tex latex/index.tex ; fi
 
 

+ 75 - 12
doc/doxygen/chapters/00introduction.doxy

@@ -1,7 +1,7 @@
 /*
  * This file is part of the StarPU Handbook.
  * Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
  * Copyright (C) 2011, 2012 Institut National de Recherche en Informatique et Automatique
  * See the file version.doxy for copying conditions.
 */
@@ -184,30 +184,94 @@ http://runtime.bordeaux.inria.fr/Publis/Keyword/STARPU.html.
 A good overview is available in the research report at
 http://hal.archives-ouvertes.fr/inria-00467677.
 
+\section StarPUApplications StarPU Applications
+
+You can first have a look at the chapters \ref BasicExamples and \ref AdvancedExamples.
+A tutorial is also installed in the directory <c>share/doc/starpu/tutorial/</c>.
+
+Many examples are also available in the StarPU sources in the directory
+<c>examples/</c>. Simple examples include:
+
+<dl>
+<dt> <c>incrementer/</c> </dt>
+<dd> Trivial incrementation test. </dd>
+<dt> <c>basic_examples/</c> </dt>
+<dd>
+        Simple documented Hello world and vector/scalar product (as
+        shown in \ref BasicExamples), matrix
+        product examples (as shown in \ref PerformanceModelExample), an example using the blocked matrix data
+        interface, an example using the variable data interface, and an example
+        using different formats on CPUs and GPUs.
+</dd>
+<dt> <c>matvecmult/</c></dt>
+<dd>
+    OpenCL example from NVidia, adapted to StarPU.
+</dd>
+<dt> <c>axpy/</c></dt>
+<dd>
+    AXPY CUBLAS operation adapted to StarPU.
+</dd>
+<dt> <c>fortran/</c> </dt>
+<dd>
+    Example of Fortran bindings.
+</dd>
+</dl>
+
+More advanced examples include:
+
+<dl>
+<dt><c>filters/</c></dt>
+<dd>
+    Examples using filters, as shown in \ref PartitioningData.
+</dd>
+<dt><c>lu/</c></dt>
+<dd>
+    LU matrix factorization, see for instance <c>xlu_implicit.c</c>
+</dd>
+<dt><c>cholesky/</c></dt>
+<dd>
+    Cholesky matrix factorization, see for instance <c>cholesky_implicit.c</c>.
+</dd>
+</dl>
+
 \section FurtherReading Further Reading
 
 The documentation chapters include
 
-<ol>
-<li> Part: Using StarPU
+<ul>
+<li> Part 1: StarPU Basics
 <ul>
 <li> \ref BuildingAndInstallingStarPU
 <li> \ref BasicExamples
+</ul>
+<li> Part 2: StarPU Quick Programming Guide
+<ul>
 <li> \ref AdvancedExamples
-<li> \ref HowToOptimizePerformanceWithStarPU
-<li> \ref PerformanceFeedback
-<li> \ref TipsAndTricksToKnowAbout
+<li> \ref CheckListWhenPerformanceAreNotThere
+</ul>
+<li> Part 3: StarPU Inside
+<ul>
+<li> \ref TasksInStarPU
+<li> \ref DataManagement
+<li> \ref Scheduling
+<li> \ref SchedulingContexts
+<li> \ref SchedulingContextHypervisor
+<li> \ref DebuggingTools
+<li> \ref OnlinePerformanceTools
+<li> \ref OfflinePerformanceTools
+<li> \ref FrequentlyAskedQuestions
+</ul>
+<li> Part 4: StarPU Extensions
+<ul>
 <li> \ref OutOfCore
 <li> \ref MPISupport
 <li> \ref FFTSupport
 <li> \ref MICSCCSupport
 <li> \ref cExtensions
 <li> \ref SOCLOpenclExtensions
-<li> \ref SchedulingContexts
-<li> \ref SchedulingContextHypervisor
+<li> \ref SimGridSupport
 </ul>
-</li>
-<li> Part: Inside StarPU
+<li> Part 5: StarPU Reference API
 <ul>
 <li> \ref ExecutionConfigurationThroughEnvironmentVariables
 <li> \ref CompilationConfiguration
@@ -220,8 +284,7 @@ The documentation chapters include
 <li> \ref FullSourceCodeVectorScal
 <li> \ref GNUFreeDocumentationLicense
 </ul>
-</ol>
-
+</ul>
 
 
 Make sure to have had a look at those too!
 

+ 1 - 1
doc/doxygen/chapters/01building.doxy

@@ -1,7 +1,7 @@
 /*
  * This file is part of the StarPU Handbook.
  * Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
  * Copyright (C) 2011, 2012 Institut National de Recherche en Informatique et Automatique
  * See the file version.doxy for copying conditions.
  */

File diff suppressed because it is too large
+ 2 - 1234
doc/doxygen/chapters/03advanced_examples.doxy


+ 0 - 552
doc/doxygen/chapters/04optimize_performance.doxy

@@ -1,552 +0,0 @@
-/*
- * This file is part of the StarPU Handbook.
- * Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
- * Copyright (C) 2011, 2012 Institut National de Recherche en Informatique et Automatique
- * See the file version.doxy for copying conditions.
- */
-
-/*! \page HowToOptimizePerformanceWithStarPU How To Optimize Performance With StarPU
-
-TODO: improve!
-
-Simply encapsulating application kernels into tasks already makes it possible
-to seamlessly support CPUs and GPUs at the same time. To achieve good performance, a
-few additional changes are needed.
-
-\section DataManagement Data Management
-
-When the application allocates data, whenever possible it should use
-the function starpu_malloc(), which will ask CUDA or OpenCL to make
-the allocation itself and pin the corresponding allocated memory. This
-is needed to permit asynchronous data transfer, i.e. permit data
-transfer to overlap with computations. Otherwise, the trace will show
-that the <c>DriverCopyAsync</c> state takes a lot of time, this is
-because CUDA or OpenCL then reverts to synchronous transfers.
-
-By default, StarPU leaves replicates of data wherever they were used, in case they
-will be re-used by other tasks, thus saving the data transfer time. When some
-task modifies some data, all the other replicates are invalidated, and only the
-processing unit which ran that task will have a valid replicate of the data. If the application knows
-that this data will not be re-used by further tasks, it should advise StarPU to
-immediately replicate it to a desired list of memory nodes (given through a
-bitmask). This can be understood like the write-through mode of CPU caches.
-
-\code{.c}
-starpu_data_set_wt_mask(img_handle, 1<<0);
-\endcode
-
-will for instance request to always automatically transfer a replicate into the
-main memory (node <c>0</c>), as bit <c>0</c> of the write-through bitmask is being set.
-
-\code{.c}
-starpu_data_set_wt_mask(img_handle, ~0U);
-\endcode
-
-will request to always automatically broadcast the updated data to all memory
-nodes.
-
-Setting the write-through mask to <c>~0U</c> can also be useful to make sure all
-memory nodes always have a copy of the data, so that it is never evicted when
-memory gets scarce.
-
-Implicit data dependency computation can become expensive if a lot
-of tasks access the same piece of data. If no dependency is required
-on some piece of data (e.g. because it is only accessed in read-only
-mode, or because write accesses are actually commutative), use the
-function starpu_data_set_sequential_consistency_flag() to disable
-implicit dependencies on that data.
-
-In the same vein, accumulation of results in the same data can become a
-bottleneck. The use of the mode ::STARPU_REDUX permits to optimize such
-accumulation (see \ref DataReduction). To a lesser extent, the use of
-the flag ::STARPU_COMMUTE keeps the bottleneck, but at least permits
-the accumulation to happen in any order.
-
-Applications often need a data just for temporary results.  In such a case,
-registration can be made without an initial value, for instance this produces a vector data:
-
-\code{.c}
-starpu_vector_data_register(&handle, -1, 0, n, sizeof(float));
-\endcode
-
-StarPU will then allocate the actual buffer only when it is actually needed,
-e.g. directly on the GPU without allocating in main memory.
-
-In the same vein, once the temporary results are not useful any more, the
-data should be thrown away. If the handle is not to be reused, it can be
-unregistered:
-
-\code{.c}
-starpu_unregister_submit(handle);
-\endcode
-
-actual unregistration will be done after all tasks working on the handle
-terminate.
-
-If the handle is to be reused, instead of unregistering it, it can simply be invalidated:
-
-\code{.c}
-starpu_invalidate_submit(handle);
-\endcode
-
-the buffers containing the current value will then be freed, and reallocated
-only when another task writes some value to the handle.
-
-\section TaskGranularity Task Granularity
-
-Like any other runtime, StarPU has some overhead to manage tasks. Since
-it does smart scheduling and data management, that overhead is not always
-negligible. The order of magnitude of the overhead is typically a couple of
-microseconds, which is actually much smaller than the CUDA overhead itself. The
-amount of work that a task should do should thus be somewhat
-bigger, to make sure that the overhead becomes negligible. The offline
-performance feedback can provide a measure of task length, which should thus be
-checked if bad performance is observed. To get a grasp of the scalability
-possibility according to task size, one can run
-<c>tests/microbenchs/tasks_size_overhead.sh</c> which draws curves of the
-speedup of independent tasks of very small sizes.
-
-The choice of scheduler also has impact over the overhead: for instance, the
- scheduler <c>dmda</c> takes time to make a decision, while <c>eager</c> does
-not. <c>tasks_size_overhead.sh</c> can again be used to get a grasp at how much
-impact that has on the target machine.
-
-\section TaskSubmission Task Submission
-
-To let StarPU make online optimizations, tasks should be submitted
-asynchronously as much as possible. Ideally, all the tasks should be
-submitted, with only calls to starpu_task_wait_for_all() or
-starpu_data_unregister() made to wait for
-termination. StarPU will then be able to rework the whole schedule, overlap
-computation with communication, manage accelerator local memory usage, etc.
-
-\section TaskPriorities Task Priorities
-
-By default, StarPU will consider the tasks in the order they are submitted by
-the application. If the application programmer knows that some tasks should
-be performed in priority (for instance because their output is needed by many
-other tasks and may thus be a bottleneck if not executed early
-enough), the field starpu_task::priority should be set to transmit the
-priority information to StarPU.
-
-\section TaskSchedulingPolicy Task Scheduling Policy
-
-By default, StarPU uses the simple greedy scheduler <c>eager</c>. This is
-because it provides correct load balance even if the application codelets do not
-have performance models. If your application codelets have performance models
-(\ref PerformanceModelExample), you should change the scheduler thanks
-to the environment variable \ref STARPU_SCHED. For instance <c>export
-STARPU_SCHED=dmda</c> . Use <c>help</c> to get the list of available schedulers.
-
-The <b>eager</b> scheduler uses a central task queue, from which workers draw tasks
-to work on. This however does not permit to prefetch data since the scheduling
-decision is taken late. If a task has a non-0 priority, it is put at the front of the queue.
-
-The <b>prio</b> scheduler also uses a central task queue, but sorts tasks by
-priority (between -5 and 5).
-
-The <b>random</b> scheduler distributes tasks randomly according to assumed worker
-overall performance.
-
-The <b>ws</b> (work stealing) scheduler schedules tasks on the local worker by
-default. When a worker becomes idle, it steals a task from the most loaded
-worker.
-
-The <b>dm</b> (deque model) scheduler takes task execution performance models into account to
-perform an HEFT-similar scheduling strategy: it schedules tasks where their
-termination time will be minimal.
-
-The <b>dmda</b> (deque model data aware) scheduler is similar to dm, it also takes
-into account data transfer time.
-
-The <b>dmdar</b> (deque model data aware ready) scheduler is similar to dmda,
-it also sorts tasks on per-worker queues by number of already-available data
-buffers.
-
-The <b>dmdas</b> (deque model data aware sorted) scheduler is similar to dmda, it
-also supports arbitrary priority values.
-
-The <b>heft</b> (heterogeneous earliest finish time) scheduler is deprecated. It
-is now just an alias for <b>dmda</b>.
-
-The <b>pheft</b> (parallel HEFT) scheduler is similar to heft, it also supports
-parallel tasks (still experimental). Should not be used when several contexts using
-it are being executed simultaneously.
-
-The <b>peager</b> (parallel eager) scheduler is similar to eager, it also
-supports parallel tasks (still experimental). Should not be used when several 
-contexts using it are being executed simultaneously.
-
-
-\section PerformanceModelCalibration Performance Model Calibration
-
-Most schedulers are based on an estimation of codelet duration on each kind
-of processing unit. For this to be possible, the application programmer needs
-to configure a performance model for the codelets of the application (see
-\ref PerformanceModelExample for instance). History-based performance models
-use on-line calibration.  StarPU will automatically calibrate codelets
-which have never been calibrated yet, and save the result in
-<c>$STARPU_HOME/.starpu/sampling/codelets</c>.
-The models are indexed by machine name. To share the models between
-machines (e.g. for a homogeneous cluster), use <c>export
-STARPU_HOSTNAME=some_global_name</c>. To force continuing calibration,
-use <c>export STARPU_CALIBRATE=1</c> . This may be necessary if your application
-has not-so-stable performance. StarPU will force calibration (and thus ignore
-the current result) until 10 (<c>_STARPU_CALIBRATION_MINIMUM</c>) measurements have been
-made on each architecture, to avoid badly scheduling tasks just because the
-first measurements were not so good. Details on the current performance model status
-can be obtained from the command <c>starpu_perfmodel_display</c>: the <c>-l</c>
-option lists the available performance models, and the <c>-s</c> option permits
-to choose the performance model to be displayed. The result looks like:
-
-\verbatim
-$ starpu_perfmodel_display -s starpu_slu_lu_model_11
-performance model for cpu_impl_0
-# hash    size     flops         mean          dev           n
-914f3bef  1048576  0.000000e+00  2.503577e+04  1.982465e+02  8
-3e921964  65536    0.000000e+00  5.527003e+02  1.848114e+01  7
-e5a07e31  4096     0.000000e+00  1.717457e+01  5.190038e+00  14
-...
-\endverbatim
-
-This shows that for the LU 11 kernel with a 1MiB matrix, the average
-execution time on CPUs was about 25ms, with a 0.2ms standard deviation, over
-8 samples. It is a good idea to check this before doing actual performance
-measurements.
-
-A graph can be drawn by using the tool <c>starpu_perfmodel_plot</c>:
-
-\verbatim
-$ starpu_perfmodel_plot -s starpu_slu_lu_model_11
-4096 16384 65536 262144 1048576 4194304 
-$ gnuplot starpu_starpu_slu_lu_model_11.gp
-$ gv starpu_starpu_slu_lu_model_11.eps
-\endverbatim
-
-\image html starpu_starpu_slu_lu_model_11.png
-\image latex starpu_starpu_slu_lu_model_11.eps "" width=\textwidth
-
-If a kernel source code was modified (e.g. performance improvement), the
-calibration information is stale and should be dropped, to re-calibrate from
-start. This can be done by using <c>export STARPU_CALIBRATE=2</c>.
-
-Note: due to CUDA limitations, to be able to measure kernel duration,
-calibration mode needs to disable asynchronous data transfers. Calibration thus
-disables data transfer / computation overlapping, and should thus not be used
-for actual benchmarks. Note 2: history-based performance models get calibrated
-only if a performance-model-based scheduler is chosen.
-
-The history-based performance models can also be explicitly filled by the
-application without execution, if e.g. the application already has a series of
-measurements. This can be done by using starpu_perfmodel_update_history(),
-for instance:
-
-\code{.c}
-static struct starpu_perfmodel perf_model = {
-    .type = STARPU_HISTORY_BASED,
-    .symbol = "my_perfmodel",
-};
-
-struct starpu_codelet cl = {
-    .where = STARPU_CUDA,
-    .cuda_funcs = { cuda_func1, cuda_func2, NULL },
-    .nbuffers = 1,
-    .modes = {STARPU_W},
-    .model = &perf_model
-};
-
-void feed(void) {
-    struct my_measure *measure;
-    struct starpu_task task;
-    starpu_task_init(&task);
-
-    task.cl = &cl;
-
-    for (measure = &measures[0]; measure < &measures[last]; measure++) {
-        starpu_data_handle_t handle;
-	starpu_vector_data_register(&handle, -1, 0, measure->size, sizeof(float));
-	task.handles[0] = handle;
-	starpu_perfmodel_update_history(&perf_model, &task,
-	                                STARPU_CUDA_DEFAULT + measure->cudadev, 0,
-	                                measure->implementation, measure->time);
-	starpu_task_clean(&task);
-	starpu_data_unregister(handle);
-    }
-}
-\endcode
-
-Measurements have to be provided in milliseconds for the completion time models,
-and in Joules for the energy consumption models.
-
-\section TaskDistributionVsDataTransfer Task Distribution Vs Data Transfer
-
-Distributing tasks to balance the load induces data transfer penalty. StarPU
-thus needs to find a balance between both. The target function that the
-scheduler <c>dmda</c> of StarPU
-tries to minimize is <c>alpha * T_execution + beta * T_data_transfer</c>, where
-<c>T_execution</c> is the estimated execution time of the codelet (usually
-accurate), and <c>T_data_transfer</c> is the estimated data transfer time. The
-latter is estimated based on bus calibration before execution start,
-i.e. with an idle machine, thus without contention. You can force bus
-re-calibration by running the tool <c>starpu_calibrate_bus</c>. The
-beta parameter defaults to <c>1</c>, but it can be worth trying to tweak it
-by using <c>export STARPU_SCHED_BETA=2</c> for instance, since during
-real application execution, contention makes transfer times bigger.
-This is of course imprecise, but in practice, a rough estimation
-already gives the good results that a precise estimation would give.
-
-\section DataPrefetch Data Prefetch
-
-The scheduling policies <c>heft</c>, <c>dmda</c> and <c>pheft</c>
-perform data prefetch (see \ref STARPU_PREFETCH):
-as soon as a scheduling decision is taken for a task, requests are issued to
-transfer its required data to the target processing unit, if needed, so that
-when the processing unit actually starts the task, its data will hopefully be
-already available and it will not have to wait for the transfer to finish.
-
-The application may want to perform some manual prefetching, for several reasons
-such as excluding initial data transfers from performance measurements, or
-setting up an initial statically-computed data distribution on the machine
-before submitting tasks, which will thus guide StarPU toward an initial task
-distribution (since StarPU will try to avoid further transfers).
-
-This can be achieved by giving the function starpu_data_prefetch_on_node()
-the handle and the desired target memory node.
-
-\section Power-basedScheduling Power-based Scheduling
-
-If the application can provide some power performance model (through
-the field starpu_codelet::power_model), StarPU will
-take it into account when distributing tasks. The target function that
-the scheduler <c>dmda</c> minimizes becomes <c>alpha * T_execution +
-beta * T_data_transfer + gamma * Consumption</c> , where <c>Consumption</c>
-is the estimated task consumption in Joules. To tune this parameter, use
-<c>export STARPU_SCHED_GAMMA=3000</c> for instance, to express that each Joule
-(i.e kW during 1000us) is worth 3000us execution time penalty. Setting
-<c>alpha</c> and <c>beta</c> to zero permits to only take into account power consumption.
-
-This is however not sufficient to correctly optimize power: the scheduler would
-simply tend to run all computations on the most energy-conservative processing
-unit. To account for the consumption of the whole machine (including idle
-processing units), the idle power of the machine should be given by setting
-<c>export STARPU_IDLE_POWER=200</c> for 200W, for instance. This value can often
-be obtained from the machine power supplier.
-
-The power actually consumed by the total execution can be displayed by setting
-<c>export STARPU_PROFILING=1 STARPU_WORKER_STATS=1</c> .
-
-On-line task consumption measurement is currently only supported through the
-<c>CL_PROFILING_POWER_CONSUMED</c> OpenCL extension, implemented in the MoviSim
-simulator. Applications can however provide explicit measurements by
-using the function starpu_perfmodel_update_history() (exemplified in \ref PerformanceModelExample
-with the <c>power_model</c> performance model). Fine-grain
-measurement is often not feasible with the feedback provided by the hardware, so
-the user can for instance run a given task a thousand times, measure the global
-consumption for that series of tasks, divide it by a thousand, repeat for
-varying kinds of tasks and task sizes, and eventually feed StarPU
-with these manual measurements through starpu_perfmodel_update_history().
-
-\section StaticScheduling Static Scheduling
-
-In some cases, one may want to force some scheduling, for instance force a given
-set of tasks to GPU0, another set to GPU1, etc. while letting some other tasks
-be scheduled on any other device. This can indeed be useful to guide StarPU into
-some work distribution, while still letting some degree of dynamism. For
-instance, to force execution of a task on CUDA0:
-
-\code{.c}
-task->execute_on_a_specific_worker = 1;
-task->worker = starpu_worker_get_by_type(STARPU_CUDA_WORKER, 0);
-\endcode
-
-Note however that using scheduling contexts while statically scheduling tasks on workers
-could be tricky. Be careful to schedule the tasks exactly on the workers of the corresponding
-contexts, otherwise the workers' corresponding scheduling structures may not be allocated or
-the execution of the application may deadlock. Moreover, the hypervisor should not be used when
-statically scheduling tasks.
-
-\section Profiling Profiling
-
-A quick view of how many tasks each worker has executed can be obtained by setting
-<c>export STARPU_WORKER_STATS=1</c>. This is a convenient way to check that
-execution did happen on accelerators without penalizing performance with
-the profiling overhead.
-
-A quick view of how many data transfers have been issued can be obtained by setting
-<c>export STARPU_BUS_STATS=1</c> .
-
-More detailed profiling information can be enabled by using <c>export STARPU_PROFILING=1</c> or by
-calling starpu_profiling_status_set() from the source code.
-Statistics on the execution can then be obtained by using <c>export
-STARPU_BUS_STATS=1</c> and <c>export STARPU_WORKER_STATS=1</c> .
-More details on performance feedback are provided in the next chapter.
-
-\section DetectionStuckConditions Detection Stuck Conditions
-
-It may happen that for some reason, StarPU does not make progress for a long
-period of time. This is sometimes due to contention inside StarPU, but
-sometimes to external causes, such as a stuck MPI or CUDA
-driver.
-
-<c>export STARPU_WATCHDOG_TIMEOUT=10000</c>
-
-makes StarPU print an error message whenever StarPU has not completed
-any task for 10ms. In addition to that,
-
-<c>export STARPU_WATCHDOG_CRASH=1</c>
-
-triggers a crash in that condition, thus making it possible to catch the
-situation in gdb, etc.
-
-\section CUDA-specificOptimizations CUDA-specific Optimizations
-
-Due to CUDA limitations, StarPU will have a hard time overlapping its own
-communications and the codelet computations if the application does not use a
-dedicated CUDA stream for its computations instead of the default stream,
-which synchronizes all operations of the GPU. StarPU provides one by the use
-of starpu_cuda_get_local_stream() which can be used by all CUDA codelet
-operations to avoid this issue. For instance:
-
-\code{.c}
-func <<<grid,block,0,starpu_cuda_get_local_stream()>>> (foo, bar);
-cudaStreamSynchronize(starpu_cuda_get_local_stream());
-\endcode
-
-StarPU already does appropriate calls for the CUBLAS library.
-
-Unfortunately, some CUDA libraries do not have stream variants of
-kernels. That will lower the potential for overlapping.
-
-\section PerformanceDebugging Performance Debugging
-
-To get an idea of what is happening, a lot of performance feedback is available,
-detailed in the next chapter. The various pieces of information should be checked:
-
-<ul>
-<li>
-What does the Gantt diagram look like? (see \ref CreatingAGanttDiagram)
-<ul>
-  <li> If it's mostly green (tasks running in the initial context) or context specific
-  color prevailing, then the machine is properly
-  utilized, and perhaps the codelets are just slow. Check their performance, see
-  \ref PerformanceOfCodelets.
-  </li>
-  <li> If it's mostly purple (FetchingInput), tasks keep waiting for data
-  transfers, do you perhaps have far more communication than computation? Did
-  you properly use CUDA streams to make sure communication can be
-  overlapped? Did you use data-locality aware schedulers to avoid transfers as
-  much as possible?
-  </li>
-  <li> If it's mostly red (Blocked), tasks keep waiting for dependencies,
-  do you have enough parallelism? It might be a good idea to check what the DAG
-  looks like (see \ref CreatingADAGWithGraphviz).
-  </li>
-  <li> If only some workers are completely red (Blocked), for some reason the
-  scheduler didn't assign tasks to them. Perhaps the performance model is bogus,
-  check it (see \ref PerformanceOfCodelets). Do all your codelets have a
-  performance model?  When some of them don't, the schedulers switches to a
-  greedy algorithm which thus performs badly.
-  </li>
-</ul>
-</li>
-</ul>
-
-You can also use the Temanejo task debugger (see \ref UsingTheTemanejoTaskDebugger) to
-visualize the task graph more easily.
-
-\section SimulatedPerformance Simulated Performance
-
-StarPU can use Simgrid in order to simulate execution on an arbitrary
-platform.
-
-\subsection Calibration Calibration
-
-The idea is to first compile StarPU normally, and run the application,
-so as to automatically benchmark the bus and the codelets.
-
-\verbatim
-$ ./configure && make
-$ STARPU_SCHED=dmda ./examples/matvecmult/matvecmult
-[starpu][_starpu_load_history_based_model] Warning: model matvecmult
-   is not calibrated, forcing calibration for this run. Use the
-   STARPU_CALIBRATE environment variable to control this.
-$ ...
-$ STARPU_SCHED=dmda ./examples/matvecmult/matvecmult
-TEST PASSED
-\endverbatim
-
-Note that we force to use the scheduler <c>dmda</c> to generate
-performance models for the application. The application may need to be
-run several times before the model is calibrated.
-
-\subsection Simulation Simulation
-
-Then, recompile StarPU, passing \ref enable-simgrid "--enable-simgrid"
-to <c>./configure</c>, and re-run the application:
-
-\verbatim
-$ ./configure --enable-simgrid && make
-$ STARPU_SCHED=dmda ./examples/matvecmult/matvecmult
-TEST FAILED !!!
-\endverbatim
-
-It is normal that the test fails: since the computations are not actually done
-(that is the whole point of simgrid), the result is wrong, of course.
-
-If the performance model is not calibrated enough, the following error
-message will be displayed
-
-\verbatim
-$ STARPU_SCHED=dmda ./examples/matvecmult/matvecmult
-[starpu][_starpu_load_history_based_model] Warning: model matvecmult
-    is not calibrated, forcing calibration for this run. Use the
-    STARPU_CALIBRATE environment variable to control this.
-[starpu][_starpu_simgrid_execute_job][assert failure] Codelet
-    matvecmult does not have a perfmodel, or is not calibrated enough
-\endverbatim
-
-The number of devices can be chosen as usual with \ref STARPU_NCPU,
-\ref STARPU_NCUDA, and \ref STARPU_NOPENCL.  For now, only the number of
-cpus can be arbitrarily chosen. The number of CUDA and OpenCL devices have to be
-lower than the real number on the current machine.
-
-The amount of simulated GPU memory is for now unbound by default, but
-it can be chosen by hand through the \ref STARPU_LIMIT_CUDA_MEM,
-\ref STARPU_LIMIT_CUDA_devid_MEM, \ref STARPU_LIMIT_OPENCL_MEM, and
-\ref STARPU_LIMIT_OPENCL_devid_MEM environment variables.
-
-The Simgrid default stack size is small; to increase it use the
-parameter <c>--cfg=contexts/stack_size</c>, for example:
-
-\verbatim
-$ ./example --cfg=contexts/stack_size:8192
-TEST FAILED !!!
-\endverbatim
-
-Note: of course, if the application uses <c>gettimeofday</c> to make its
-performance measurements, the real time will be used, which will be bogus. To
-get the simulated time, it has to use starpu_timing_now() which returns the
-virtual timestamp in ms.
-
-\subsection SimulationOnAnotherMachine Simulation On Another Machine
-
-The simgrid support even permits to perform simulations on another machine, your
-desktop, typically. To achieve this, one still needs to perform the Calibration
-step on the actual machine to be simulated, then copy them to your desktop
-machine (the <c>$STARPU_HOME/.starpu</c> directory). One can then perform the
-Simulation step on the desktop machine, by setting the environment
-variable \ref STARPU_HOSTNAME to the name of the actual machine, to
-make StarPU use the performance models of the simulated machine even
-on the desktop machine.
-
-If the desktop machine does not have CUDA or OpenCL, StarPU is still able to
-use simgrid to simulate execution with CUDA/OpenCL devices, but the application
-source code will probably disable the CUDA and OpenCL codelets in that
-case. Since during simgrid execution, the functions of the codelet are actually
-not called, one can use dummy functions such as the following to still permit
-CUDA or OpenCL execution:
-
-\snippet simgrid.c To be included. You should update doxygen if you see this text.
-
-*/

+ 204 - 0
doc/doxygen/chapters/05check_list_performance.doxy

@@ -0,0 +1,204 @@
+/*
+ * This file is part of the StarPU Handbook.
+ * Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011, 2012 Institut National de Recherche en Informatique et Automatique
+ * See the file version.doxy for copying conditions.
+ */
+
+/*! \page CheckListWhenPerformanceAreNotThere Check List When Performance Are Not There
+
+TODO: improve!
+
+Simply encapsulating application kernels into tasks already makes it possible
+to seamlessly support CPUs and GPUs at the same time. To achieve good
+performance, we give below a list of features which should be checked.
+
+\section DataRelatedFeaturesToImprovePerformance Data Related Features That May Improve Performance
+
+link to \ref DataManagement
+
+link to \ref DataPrefetch
+
+\section TaskRelatedFeaturesToImprovePerformance Task Related Features That May Improve Performance
+
+link to \ref TaskGranularity
+
+link to \ref TaskSubmission
+
+link to \ref TaskPriorities
+
+\section SchedulingRelatedFeaturesToImprovePerformance Scheduling Related Features That May Improve Performance
+
+link to \ref TaskSchedulingPolicy
+
+link to \ref TaskDistributionVsDataTransfer
+
+link to \ref Power-basedScheduling
+
+link to \ref StaticScheduling
+
+\section CUDA-specificOptimizations CUDA-specific Optimizations
+
+Due to CUDA limitations, StarPU will have a hard time overlapping its own
+communications and the codelet computations if the application does not use a
+dedicated CUDA stream for its computations instead of the default stream,
+which synchronizes all operations of the GPU. StarPU provides one by the use
+of starpu_cuda_get_local_stream() which can be used by all CUDA codelet
+operations to avoid this issue. For instance:
+
+\code{.c}
+func <<<grid,block,0,starpu_cuda_get_local_stream()>>> (foo, bar);
+cudaStreamSynchronize(starpu_cuda_get_local_stream());
+\endcode
+
+StarPU already does appropriate calls for the CUBLAS library.
+
+Unfortunately, some CUDA libraries do not have stream variants of
+kernels. That will lower the potential for overlapping.
+
+\section DetectionStuckConditions Detection Stuck Conditions
+
+It may happen that for some reason, StarPU does not make progress for a long
+period of time. This is sometimes due to contention inside StarPU, but
+sometimes to external causes, such as a stuck MPI or CUDA
+driver.
+
+<c>export STARPU_WATCHDOG_TIMEOUT=10000</c>
+
+makes StarPU print an error message whenever StarPU has not completed
+any task for 10ms. In addition to that,
+
+<c>export STARPU_WATCHDOG_CRASH=1</c>
+
+triggers a crash in that condition, thus making it possible to catch the
+situation in gdb, etc.
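+
+For instance, a minimal sketch combining both variables (the application name
+is illustrative):
+
+\verbatim
+$ export STARPU_WATCHDOG_TIMEOUT=10000
+$ export STARPU_WATCHDOG_CRASH=1
+$ ./my_application
+\endverbatim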
+
+\section HowToLimitMemoryPerNode How to limit memory per node
+
+TODO
+
+Talk about
+\ref STARPU_LIMIT_CUDA_devid_MEM, \ref STARPU_LIMIT_CUDA_MEM,
+\ref STARPU_LIMIT_OPENCL_devid_MEM, \ref STARPU_LIMIT_OPENCL_MEM
+and \ref STARPU_LIMIT_CPU_MEM
+
+starpu_memory_get_available()
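+
+As a minimal sketch of how these pieces fit together (the limit value and the
+application name are illustrative), one can bound the memory StarPU uses on
+CUDA device 0 and then, from the application, ask StarPU how much memory it
+still considers available on a given node:
+
+\verbatim
+$ STARPU_LIMIT_CUDA_0_MEM=1024 ./my_application
+\endverbatim
+
+\code{.c}
+/* Assuming at least one CUDA worker exists: find its memory node and query
+   how much memory StarPU considers still available there. */
+int workerid = starpu_worker_get_by_type(STARPU_CUDA_WORKER, 0);
+unsigned node = starpu_worker_get_memory_node(workerid);
+starpu_ssize_t available = starpu_memory_get_available(node);
+\endcode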
+
+\section PerformanceModelCalibration Performance Model Calibration
+
+Most schedulers are based on an estimation of codelet duration on each kind
+of processing unit. For this to be possible, the application programmer needs
+to configure a performance model for the codelets of the application (see
+\ref PerformanceModelExample for instance). History-based performance models
+use on-line calibration.  StarPU will automatically calibrate codelets
+which have never been calibrated yet, and save the result in
+<c>$STARPU_HOME/.starpu/sampling/codelets</c>.
+The models are indexed by machine name. To share the models between
+machines (e.g. for a homogeneous cluster), use <c>export
+STARPU_HOSTNAME=some_global_name</c>. To force continuing calibration,
+use <c>export STARPU_CALIBRATE=1</c> . This may be necessary if your application
+has not-so-stable performance. StarPU will force calibration (and thus ignore
+the current result) until 10 (<c>_STARPU_CALIBRATION_MINIMUM</c>) measurements have been
+made on each architecture, to avoid badly scheduling tasks just because the
+first measurements were not so good. Details on the current performance model status
+can be obtained from the command <c>starpu_perfmodel_display</c>: the <c>-l</c>
+option lists the available performance models, and the <c>-s</c> option permits
+to choose the performance model to be displayed. The result looks like:
+
+\verbatim
+$ starpu_perfmodel_display -s starpu_slu_lu_model_11
+performance model for cpu_impl_0
+# hash    size     flops         mean          dev           n
+914f3bef  1048576  0.000000e+00  2.503577e+04  1.982465e+02  8
+3e921964  65536    0.000000e+00  5.527003e+02  1.848114e+01  7
+e5a07e31  4096     0.000000e+00  1.717457e+01  5.190038e+00  14
+...
+\endverbatim
+
+This shows that for the LU 11 kernel with a 1MiB matrix, the average
+execution time on CPUs was about 25ms, with a 0.2ms standard deviation, over
+8 samples. It is a good idea to check this before doing actual performance
+measurements.
+
+A graph can be drawn by using the tool <c>starpu_perfmodel_plot</c>:
+
+\verbatim
+$ starpu_perfmodel_plot -s starpu_slu_lu_model_11
+4096 16384 65536 262144 1048576 4194304 
+$ gnuplot starpu_starpu_slu_lu_model_11.gp
+$ gv starpu_starpu_slu_lu_model_11.eps
+\endverbatim
+
+\image html starpu_starpu_slu_lu_model_11.png
+\image latex starpu_starpu_slu_lu_model_11.eps "" width=\textwidth
+
+If a kernel source code was modified (e.g. performance improvement), the
+calibration information is stale and should be dropped, to re-calibrate from
+start. This can be done by using <c>export STARPU_CALIBRATE=2</c>.
+
+Note: due to CUDA limitations, to be able to measure kernel duration,
+calibration mode needs to disable asynchronous data transfers. Calibration thus
+disables data transfer / computation overlapping, and should thus not be used
+for actual benchmarks. Note 2: history-based performance models get calibrated
+only if a performance-model-based scheduler is chosen.
+
+The history-based performance models can also be explicitly filled by the
+application without execution, if e.g. the application already has a series of
+measurements. This can be done by using starpu_perfmodel_update_history(),
+for instance:
+
+\code{.c}
+static struct starpu_perfmodel perf_model = {
+    .type = STARPU_HISTORY_BASED,
+    .symbol = "my_perfmodel",
+};
+
+struct starpu_codelet cl = {
+    .where = STARPU_CUDA,
+    .cuda_funcs = { cuda_func1, cuda_func2, NULL },
+    .nbuffers = 1,
+    .modes = {STARPU_W},
+    .model = &perf_model
+};
+
+void feed(void) {
+    struct my_measure *measure;
+    struct starpu_task task;
+    starpu_task_init(&task);
+
+    task.cl = &cl;
+
+    for (measure = &measures[0]; measure < &measures[last]; measure++) {
+        starpu_data_handle_t handle;
+	starpu_vector_data_register(&handle, -1, 0, measure->size, sizeof(float));
+	task.handles[0] = handle;
+	starpu_perfmodel_update_history(&perf_model, &task,
+	                                STARPU_CUDA_DEFAULT + measure->cudadev, 0,
+	                                measure->implementation, measure->time);
+	starpu_task_clean(&task);
+	starpu_data_unregister(handle);
+    }
+}
+\endcode
+
+Measurements have to be provided in milliseconds for the completion time models,
+and in Joules for the energy consumption models.
+
+\section Profiling Profiling
+
+A quick view of how many tasks each worker has executed can be obtained by setting
+<c>export STARPU_WORKER_STATS=1</c>. This is a convenient way to check that
+execution did happen on accelerators without penalizing performance with
+the profiling overhead.
+
+A quick view of how many data transfers have been issued can be obtained by setting
+<c>export STARPU_BUS_STATS=1</c> .
+
+More detailed profiling information can be enabled by using <c>export STARPU_PROFILING=1</c> or by
+calling starpu_profiling_status_set() from the source code.
+Statistics on the execution can then be obtained by using <c>export
+STARPU_BUS_STATS=1</c> and <c>export STARPU_WORKER_STATS=1</c> .
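+
+For instance, a minimal sketch of enabling profiling programmatically instead
+of through the environment:
+
+\code{.c}
+/* Enable profiling; to be called after starpu_init(). */
+starpu_profiling_status_set(STARPU_PROFILING_ENABLE);
+\endcode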
+More details on performance feedback are provided in the next chapter.
+
+*/

+ 443 - 0
doc/doxygen/chapters/06tasks.doxy

@@ -0,0 +1,443 @@
+/*
+ * This file is part of the StarPU Handbook.
+ * Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011, 2012 Institut National de Recherche en Informatique et Automatique
+ * See the file version.doxy for copying conditions.
+ */
+
+/*! \page TasksInStarPU Tasks In StarPU
+
+\section TaskGranularity Task Granularity
+
+Like any other runtime, StarPU has some overhead to manage tasks. Since
+it does smart scheduling and data management, that overhead is not always
+negligible. The order of magnitude of the overhead is typically a couple of
+microseconds, which is actually much smaller than the CUDA overhead itself. The
+amount of work that a task should do should thus be somewhat
+bigger, to make sure that the overhead becomes negligible. The offline
+performance feedback can provide a measure of task length, which should thus be
+checked if bad performance is observed. To get a grasp of the scalability
+possibility according to task size, one can run
+<c>tests/microbenchs/tasks_size_overhead.sh</c> which draws curves of the
+speedup of independent tasks of very small sizes.
+
+The choice of scheduler also has impact over the overhead: for instance, the
+ scheduler <c>dmda</c> takes time to make a decision, while <c>eager</c> does
+not. <c>tasks_size_overhead.sh</c> can again be used to get a grasp at how much
+impact that has on the target machine.
+
+\section TaskSubmission Task Submission
+
+To let StarPU make online optimizations, tasks should be submitted
+asynchronously as much as possible. Ideally, all the tasks should be
+submitted, with only calls to starpu_task_wait_for_all() or
+starpu_data_unregister() made to wait for
+termination. StarPU will then be able to rework the whole schedule, overlap
+computation with communication, manage accelerator local memory usage, etc.
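+
+For instance, a minimal sketch of this submission pattern (the codelet
+<c>cl</c>, the handle array and the task count <c>N</c> are illustrative):
+
+\code{.c}
+unsigned i;
+for (i = 0; i < N; i++)
+{
+    struct starpu_task *task = starpu_task_create();
+    task->cl = &cl;
+    task->handles[0] = handles[i];
+    /* Returns immediately: the task will run asynchronously. */
+    starpu_task_submit(task);
+}
+/* Wait only once, after everything has been submitted. */
+starpu_task_wait_for_all();
+\endcode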
+
+\section TaskPriorities Task Priorities
+
+By default, StarPU will consider the tasks in the order they are submitted by
+the application. If the application programmer knows that some tasks should
+be performed in priority (for instance because their output is needed by many
+other tasks and may thus be a bottleneck if not executed early
+enough), the field starpu_task::priority should be set to transmit the
+priority information to StarPU.
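+
+For instance, a minimal sketch (the codelet <c>cl</c> is illustrative):
+
+\code{.c}
+struct starpu_task *task = starpu_task_create();
+task->cl = &cl;
+/* Hint the scheduler that this task is on the critical path. */
+task->priority = STARPU_MAX_PRIO;
+starpu_task_submit(task);
+\endcode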
+
+\section SettingTheDataHandlesForATask Setting The Data Handles For A Task
+
+The number of data a task can manage is fixed by the environment variable
+\ref STARPU_NMAXBUFS which has a default value which can be changed
+through the configure option \ref enable-maxbuffers "--enable-maxbuffers".
+
+However, it is possible to define tasks managing more data by using
+the field starpu_task::dyn_handles when defining a task and the field
+starpu_codelet::dyn_modes when defining the corresponding codelet.
+
+\code{.c}
+enum starpu_data_access_mode modes[STARPU_NMAXBUFS+1] = {
+	STARPU_R, STARPU_R, ...
+};
+
+struct starpu_codelet dummy_big_cl =
+{
+	.cuda_funcs = { dummy_big_kernel, NULL },
+	.opencl_funcs = { dummy_big_kernel, NULL },
+	.cpu_funcs = { dummy_big_kernel, NULL },
+	.cpu_funcs_name = { "dummy_big_kernel", NULL },
+	.nbuffers = STARPU_NMAXBUFS+1,
+	.dyn_modes = modes
+};
+
+task = starpu_task_create();
+task->cl = &dummy_big_cl;
+task->dyn_handles = malloc(task->cl->nbuffers * sizeof(starpu_data_handle_t));
+for(i=0 ; i<task->cl->nbuffers ; i++)
+{
+	task->dyn_handles[i] = handle;
+}
+starpu_task_submit(task);
+\endcode
+
+\code{.c}
+starpu_data_handle_t *handles = malloc(dummy_big_cl.nbuffers * sizeof(starpu_data_handle_t));
+for(i=0 ; i<dummy_big_cl.nbuffers ; i++)
+{
+	handles[i] = handle;
+}
+starpu_task_insert(&dummy_big_cl,
+        	 STARPU_VALUE, &dummy_big_cl.nbuffers, sizeof(dummy_big_cl.nbuffers),
+		 STARPU_DATA_ARRAY, handles, dummy_big_cl.nbuffers,
+		 0);
+\endcode
+
+The whole code for this example is available in the
+file <c>examples/basic_examples/dynamic_handles.c</c>.
+
+\section UsingMultipleImplementationsOfACodelet Using Multiple Implementations Of A Codelet
+
+One may want to write multiple implementations of a codelet for a single type of
+device and let StarPU choose which one to run. As an example, we will show how
+to use SSE to scale a vector. The codelet can be written as follows:
+
+\code{.c}
+#include <xmmintrin.h>
+
+void scal_sse_func(void *buffers[], void *cl_arg)
+{
+    float *vector = (float *) STARPU_VECTOR_GET_PTR(buffers[0]);
+    unsigned int n = STARPU_VECTOR_GET_NX(buffers[0]);
+    unsigned int n_iterations = n/4;
+    if (n % 4 != 0)
+        n_iterations++;
+
+    __m128 *VECTOR = (__m128*) vector;
+    __m128 factor __attribute__((aligned(16)));
+    factor = _mm_set1_ps(*(float *) cl_arg);
+
+    unsigned int i;
+    for (i = 0; i < n_iterations; i++)
+        VECTOR[i] = _mm_mul_ps(factor, VECTOR[i]);
+}
+\endcode
+
+\code{.c}
+struct starpu_codelet cl = {
+    .where = STARPU_CPU,
+    .cpu_funcs = { scal_cpu_func, scal_sse_func, NULL },
+    .cpu_funcs_name = { "scal_cpu_func", "scal_sse_func", NULL },
+    .nbuffers = 1,
+    .modes = { STARPU_RW }
+};
+\endcode
+
+Schedulers which are multi-implementation aware (only <c>dmda</c> and
+<c>pheft</c> for now) will use the performance models of all the
+implementations they were given, and pick the one that seems to be the fastest.
+
+\section EnablingImplementationAccordingToCapabilities Enabling Implementation According To Capabilities
+
+Some implementations may not run on some devices. For instance, some CUDA
+devices do not support double floating point precision, and thus the kernel
+execution would just fail; or the device may not have enough shared memory for
+the implementation being used. The field starpu_codelet::can_execute
+permits to express this. For instance:
+
+\code{.c}
+static int can_execute(unsigned workerid, struct starpu_task *task, unsigned nimpl)
+{
+  const struct cudaDeviceProp *props;
+  if (starpu_worker_get_type(workerid) == STARPU_CPU_WORKER)
+    return 1;
+  /* Cuda device */
+  props = starpu_cuda_get_device_properties(workerid);
+  if (props->major > 1 || (props->major == 1 && props->minor >= 3))
+    /* At least compute capability 1.3, supports doubles */
+    return 1;
+  /* Old card, does not support doubles */
+  return 0;
+}
+
+struct starpu_codelet cl = {
+    .where = STARPU_CPU|STARPU_CUDA,
+    .can_execute = can_execute,
+    .cpu_funcs = { cpu_func, NULL },
+    .cpu_funcs_name = { "cpu_func", NULL },
+    .cuda_funcs = { gpu_func, NULL },
+    .nbuffers = 1,
+    .modes = { STARPU_RW }
+};
+\endcode
+
+This can be essential e.g. when running on a machine which mixes various models
+of CUDA devices, to take benefit from the new models without crashing on old models.
+
+Note: the function starpu_codelet::can_execute is called by the
+scheduler each time it tries to match a task with a worker, and should
+thus be very fast. The function starpu_cuda_get_device_properties()
+provides a quick access to CUDA properties of CUDA devices to achieve
+such efficiency.
+
+Another example is to compile CUDA code for various compute capabilities,
+resulting with two CUDA functions, e.g. <c>scal_gpu_13</c> for compute capability
+1.3, and <c>scal_gpu_20</c> for compute capability 2.0. Both functions can be
+provided to StarPU by using starpu_codelet::cuda_funcs, and
+starpu_codelet::can_execute can then be used to rule out the
+<c>scal_gpu_20</c> variant on a CUDA device which will not be able to execute it:
+
+\code{.c}
+static int can_execute(unsigned workerid, struct starpu_task *task, unsigned nimpl)
+{
+  const struct cudaDeviceProp *props;
+  if (starpu_worker_get_type(workerid) == STARPU_CPU_WORKER)
+    return 1;
+  /* Cuda device */
+  if (nimpl == 0)
+    /* Trying to execute the 1.3 capability variant, we assume it is ok in all cases.  */
+    return 1;
+  /* Trying to execute the 2.0 capability variant, check that the card can do it.  */
+  props = starpu_cuda_get_device_properties(workerid);
+  if (props->major >= 2)
+    /* At least compute capability 2.0, can run it */
+    return 1;
+  /* Old card, does not support 2.0, will not be able to execute the 2.0 variant.  */
+  return 0;
+}
+
+struct starpu_codelet cl = {
+    .where = STARPU_CPU|STARPU_CUDA,
+    .can_execute = can_execute,
+    .cpu_funcs = { cpu_func, NULL },
+    .cpu_funcs_name = { "cpu_func", NULL },
+    .cuda_funcs = { scal_gpu_13, scal_gpu_20, NULL },
+    .nbuffers = 1,
+    .modes = { STARPU_RW }
+};
+\endcode
+
+Note: the most generic variant should be provided first, as some schedulers are
+not able to try the different variants.
+
+\section InsertTaskUtility Insert Task Utility
+
+StarPU provides the wrapper function starpu_task_insert() to ease
+the creation and submission of tasks.
+
+Here the implementation of the codelet:
+
+\code{.c}
+void func_cpu(void *descr[], void *_args)
+{
+        int *x0 = (int *)STARPU_VARIABLE_GET_PTR(descr[0]);
+        float *x1 = (float *)STARPU_VARIABLE_GET_PTR(descr[1]);
+        int ifactor;
+        float ffactor;
+
+        starpu_codelet_unpack_args(_args, &ifactor, &ffactor);
+        *x0 = *x0 * ifactor;
+        *x1 = *x1 * ffactor;
+}
+
+struct starpu_codelet mycodelet = {
+        .where = STARPU_CPU,
+        .cpu_funcs = { func_cpu, NULL },
+        .cpu_funcs_name = { "func_cpu", NULL },
+        .nbuffers = 2,
+        .modes = { STARPU_RW, STARPU_RW }
+};
+\endcode
+
+And the call to the function starpu_task_insert():
+
+\code{.c}
+starpu_task_insert(&mycodelet,
+                   STARPU_VALUE, &ifactor, sizeof(ifactor),
+                   STARPU_VALUE, &ffactor, sizeof(ffactor),
+                   STARPU_RW, data_handles[0], STARPU_RW, data_handles[1],
+                   0);
+\endcode
+
+The call to starpu_task_insert() is equivalent to the following
+code:
+
+\code{.c}
+struct starpu_task *task = starpu_task_create();
+task->cl = &mycodelet;
+task->handles[0] = data_handles[0];
+task->handles[1] = data_handles[1];
+char *arg_buffer;
+size_t arg_buffer_size;
+starpu_codelet_pack_args(&arg_buffer, &arg_buffer_size,
+                    STARPU_VALUE, &ifactor, sizeof(ifactor),
+                    STARPU_VALUE, &ffactor, sizeof(ffactor),
+                    0);
+task->cl_arg = arg_buffer;
+task->cl_arg_size = arg_buffer_size;
+int ret = starpu_task_submit(task);
+\endcode
+
+Here a similar call using ::STARPU_DATA_ARRAY.
+
+\code{.c}
+starpu_task_insert(&mycodelet,
+                   STARPU_DATA_ARRAY, data_handles, 2,
+                   STARPU_VALUE, &ifactor, sizeof(ifactor),
+                   STARPU_VALUE, &ffactor, sizeof(ffactor),
+                   0);
+\endcode
+
+If some part of the task insertion depends on the value of some computation,
+the macro ::STARPU_DATA_ACQUIRE_CB can be very convenient. For
+instance, assuming that the index variable <c>i</c> was registered as handle
+<c>A_handle[i]</c>:
+
+\code{.c}
+/* Compute which portion we will work on, e.g. pivot */
+starpu_task_insert(&which_index, STARPU_W, i_handle, 0);
+
+/* And submit the corresponding task */
+STARPU_DATA_ACQUIRE_CB(i_handle, STARPU_R,
+                       starpu_task_insert(&work, STARPU_RW, A_handle[i], 0));
+\endcode
+
+The macro ::STARPU_DATA_ACQUIRE_CB submits an asynchronous request for
+acquiring data <c>i</c> for the main application, and will execute the code
+given as third parameter when it is acquired. In other words, as soon as the
+value of <c>i</c> computed by the codelet <c>which_index</c> can be read, the
+portion of code passed as third parameter of ::STARPU_DATA_ACQUIRE_CB will
+be executed, and is allowed to read from <c>i</c> to use it e.g. as an
+index. Note that this macro is only available when compiling StarPU with
+the compiler <c>gcc</c>.
+
+\section ParallelTasks Parallel Tasks
+
+StarPU can leverage existing parallel computation libraries by the means of
+parallel tasks. A parallel task is a task which gets worked on by a set of CPUs
+(called a parallel or combined worker) at the same time, by using an existing
+parallel CPU implementation of the computation to be achieved. This can also be
+useful to improve the load balance between slow CPUs and fast GPUs: since CPUs
+work collectively on a single task, the completion time of tasks on CPUs becomes
+comparable to the completion time on GPUs, thus alleviating granularity
+discrepancy concerns. <c>hwloc</c> support needs to be enabled to get
+good performance, otherwise StarPU will not know how best to group
+cores.
+
+Two modes of execution exist to accommodate existing usages.
+
+\subsection Fork-modeParallelTasks Fork-mode Parallel Tasks
+
+In the Fork mode, StarPU will call the codelet function on one
+of the CPUs of the combined worker. The codelet function can use
+starpu_combined_worker_get_size() to get the number of threads it is
+allowed to start to achieve the computation. The CPU binding mask for the whole
+set of CPUs is already enforced, so that threads created by the function will
+inherit the mask, and thus execute where StarPU expected, the OS being in charge
+of choosing how to schedule threads on the corresponding CPUs. The application
+can also choose to bind threads by hand, using e.g. <c>sched_getaffinity</c> to know
+the CPU binding mask that StarPU chose.
+
+For instance, using OpenMP (full source is available in
+<c>examples/openmp/vector_scal.c</c>):
+
+\snippet forkmode.c To be included. You should update doxygen if you see this text.
+
+Other examples include for instance calling a BLAS parallel CPU implementation
+(see <c>examples/mult/xgemm.c</c>).
+
+\subsection SPMD-modeParallelTasks SPMD-mode Parallel Tasks
+
+In the SPMD mode, StarPU will call the codelet function on
+each CPU of the combined worker. The codelet function can use
+starpu_combined_worker_get_size() to get the total number of CPUs
+involved in the combined worker, and thus the number of calls that are made in
+parallel to the function, and starpu_combined_worker_get_rank() to get
+the rank of the current CPU within the combined worker. For instance:
+
+\code{.c}
+static void func(void *buffers[], void *_args)
+{
+    unsigned i;
+    float *factor = _args;
+    struct starpu_vector_interface *vector = buffers[0];
+    unsigned n = STARPU_VECTOR_GET_NX(vector);
+    float *val = (float *)STARPU_VECTOR_GET_PTR(vector);
+
+    /* Compute slice to compute */
+    unsigned m = starpu_combined_worker_get_size();
+    unsigned j = starpu_combined_worker_get_rank();
+    unsigned slice = (n+m-1)/m;
+
+    for (i = j * slice; i < (j+1) * slice && i < n; i++)
+        val[i] *= *factor;
+}
+
+static struct starpu_codelet cl =
+{
+    .modes = { STARPU_RW },
+    .where = STARPU_CPU,
+    .type = STARPU_SPMD,
+    .max_parallelism = INT_MAX,
+    .cpu_funcs = { func, NULL },
+    .cpu_funcs_name = { "func", NULL },
+    .nbuffers = 1,
+};
+\endcode
+
+Of course, this trivial example will not really benefit from parallel task
+execution, and was only meant to be simple to understand.  The benefit comes
+when the computation to be done is such that threads have to e.g. exchange
+intermediate results, or write to the data in a complex but safe way in the same
+buffer.
+
+\subsection ParallelTasksPerformance Parallel Tasks Performance
+
+To benefit from parallel tasks, a parallel-task-aware StarPU scheduler has to
+be used. When exposed to codelets with a flag ::STARPU_FORKJOIN or
+::STARPU_SPMD, the schedulers <c>pheft</c> (parallel-heft) and <c>peager</c>
+(parallel eager) will indeed also try to execute tasks with
+several CPUs. They will automatically try the various available combined
+worker sizes (making several measurements for each worker size) and
+thus be able to avoid choosing a large combined worker if the codelet
+does not actually scale that well.
+
+\subsection CombinedWorkers Combined Workers
+
+By default, StarPU creates combined workers according to the architecture
+structure as detected by <c>hwloc</c>. It means that for each object of the <c>hwloc</c>
+topology (NUMA node, socket, cache, ...) a combined worker will be created. If
+some nodes of the hierarchy have a big arity (e.g. many cores in a socket
+without a hierarchy of shared caches), StarPU will create combined workers of
+intermediate sizes. The variable \ref
+STARPU_SYNTHESIZE_ARITY_COMBINED_WORKER permits tuning the maximum
+arity between levels of combined workers.
+
+The combined workers actually produced can be seen in the output of the
+tool <c>starpu_machine_display</c> (the environment variable \ref
+STARPU_SCHED has to be set to a combined worker-aware scheduler such
+as <c>pheft</c> or <c>peager</c>).
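+
+For instance, one may run:
+
+\verbatim
+$ STARPU_SCHED=pheft starpu_machine_display
+\endverbatim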
+
+\subsection ConcurrentParallelTasks Concurrent Parallel Tasks
+
+Unfortunately, many environments and libraries do not support concurrent
+calls.
+
+For instance, most OpenMP implementations (including the main ones) do not
+support concurrent <c>pragma omp parallel</c> statements without nesting them in
+another <c>pragma omp parallel</c> statement, but StarPU does not yet support
+creating its CPU workers by using such pragma.
+
+Other parallel libraries are also not safe when being invoked concurrently
+from different threads, due to the use of global variables in their sequential
+sections for instance.
+
+The solution is then to use only one combined worker at a time.  This can be
+done by setting the field starpu_conf::single_combined_worker to <c>1</c>, or
+setting the environment variable \ref STARPU_SINGLE_COMBINED_WORKER
+to <c>1</c>. StarPU will then run only one parallel task at a time (but other
+CPU and GPU tasks are not affected and can be run concurrently). The parallel
+task scheduler will however still try varying combined worker
+sizes to look for the most efficient ones.
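+
+As a minimal sketch, this field can be set before initializing StarPU:
+
+\code{.c}
+struct starpu_conf conf;
+starpu_conf_init(&conf);
+/* run only one parallel task at a time, since the underlying
+ * parallel library does not support concurrent calls */
+conf.single_combined_worker = 1;
+starpu_init(&conf);
+\endcode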
+
+
+*/

+ 508 - 0
doc/doxygen/chapters/07data_management.doxy

@@ -0,0 +1,508 @@
+/*
+ * This file is part of the StarPU Handbook.
+ * Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011, 2012 Institut National de Recherche en Informatique et Automatique
+ * See the file version.doxy for copying conditions.
+ */
+
+/*! \page DataManagement Data Management
+
+This chapter presents how StarPU manages data, and notably how coherency
+is maintained among the various replicates of a piece of data.
+
+\section DataManagement Data Management
+
+When the application allocates data, whenever possible it should use
+the function starpu_malloc(), which will ask CUDA or OpenCL to make
+the allocation itself and pin the corresponding allocated memory. This
+is needed to permit asynchronous data transfer, i.e. permit data
+transfer to overlap with computations. Otherwise, the trace will show
+that the <c>DriverCopyAsync</c> state takes a lot of time: this is
+because CUDA or OpenCL then reverts to synchronous transfers.
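+
+A minimal sketch, assuming a vector of <c>n</c> floats:
+
+\code{.c}
+float *buffer;
+/* allocate pinned memory, suitable for asynchronous transfers */
+starpu_malloc((void **)&buffer, n * sizeof(float));
+
+...
+
+starpu_free(buffer);
+\endcode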
+
+By default, StarPU leaves replicates of data wherever they were used, in case they
+will be re-used by other tasks, thus saving the data transfer time. When some
+task modifies some data, all the other replicates are invalidated, and only the
+processing unit which ran that task will have a valid replicate of the data. If the application knows
+that this data will not be re-used by further tasks, it should advise StarPU to
+immediately replicate it to a desired list of memory nodes (given through a
+bitmask). This can be understood like the write-through mode of CPU caches.
+
+\code{.c}
+starpu_data_set_wt_mask(img_handle, 1<<0);
+\endcode
+
+will for instance request to always automatically transfer a replicate into the
+main memory (node <c>0</c>), as bit <c>0</c> of the write-through bitmask is being set.
+
+\code{.c}
+starpu_data_set_wt_mask(img_handle, ~0U);
+\endcode
+
+will request to always automatically broadcast the updated data to all memory
+nodes.
+
+Setting the write-through mask to <c>~0U</c> can also be useful to make sure all
+memory nodes always have a copy of the data, so that it is never evicted when
+memory gets scarce.
+
+Implicit data dependency computation can become expensive if a lot
+of tasks access the same piece of data. If no dependency is required
+on some piece of data (e.g. because it is only accessed in read-only
+mode, or because write accesses are actually commutative), use the
+function starpu_data_set_sequential_consistency_flag() to disable
+implicit dependencies on that data.
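+
+For instance, for a piece of data which tasks will only read:
+
+\code{.c}
+/* no implicit dependencies will be computed for this handle */
+starpu_data_set_sequential_consistency_flag(handle, 0);
+\endcode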
+
+In the same vein, accumulation of results in the same data can become a
+bottleneck. The use of the mode ::STARPU_REDUX permits optimizing such
+accumulation (see \ref DataReduction). To a lesser extent, the use of
+the flag ::STARPU_COMMUTE keeps the bottleneck, but at least permits
+the accumulation to happen in any order.
+
+Applications often need a piece of data just for temporary results. In such a case,
+registration can be made without an initial value, for instance this produces a vector data:
+
+\code{.c}
+starpu_vector_data_register(&handle, -1, 0, n, sizeof(float));
+\endcode
+
+StarPU will then allocate the actual buffer only when it is actually needed,
+e.g. directly on the GPU without allocating in main memory.
+
+In the same vein, once the temporary results are not useful any more, the
+data should be thrown away. If the handle is not to be reused, it can be
+unregistered:
+
+\code{.c}
+starpu_data_unregister_submit(handle);
+\endcode
+
+actual unregistration will be done after all tasks working on the handle
+terminate.
+
+If the handle is to be reused, instead of unregistering it, it can simply be invalidated:
+
+\code{.c}
+starpu_data_invalidate_submit(handle);
+\endcode
+
+the buffers containing the current value will then be freed, and reallocated
+only when another task writes some value to the handle.
+
+\section DataPrefetch Data Prefetch
+
+The scheduling policies <c>heft</c>, <c>dmda</c> and <c>pheft</c>
+perform data prefetch (see \ref STARPU_PREFETCH):
+as soon as a scheduling decision is taken for a task, requests are issued to
+transfer its required data to the target processing unit, if needed, so that
+when the processing unit actually starts the task, its data will hopefully be
+already available and it will not have to wait for the transfer to finish.
+
+The application may want to perform some manual prefetching, for several reasons
+such as excluding initial data transfers from performance measurements, or
+setting up an initial statically-computed data distribution on the machine
+before submitting tasks, which will thus guide StarPU toward an initial task
+distribution (since StarPU will try to avoid further transfers).
+
+This can be achieved by giving the function starpu_data_prefetch_on_node()
+the handle and the desired target memory node.
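+
+A minimal sketch, prefetching a handle (registered as above) to the
+memory node of worker <c>0</c>:
+
+\code{.c}
+unsigned node = starpu_worker_get_memory_node(0);
+/* issue the transfer request asynchronously */
+starpu_data_prefetch_on_node(handle, node, 1);
+\endcode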
+
+\section PartitioningData Partitioning Data
+
+An existing piece of data can be partitioned in sub parts to be used by different tasks, for instance:
+
+\code{.c}
+int vector[NX];
+starpu_data_handle_t handle;
+
+/* Declare data to StarPU */
+starpu_vector_data_register(&handle, STARPU_MAIN_RAM, (uintptr_t)vector,
+                            NX, sizeof(vector[0]));
+
+/* Partition the vector in PARTS sub-vectors */
+starpu_data_filter f =
+{
+    .filter_func = starpu_vector_filter_block,
+    .nchildren = PARTS
+};
+starpu_data_partition(handle, &f);
+\endcode
+
+The task submission then uses the function starpu_data_get_sub_data()
+to retrieve the sub-handles to be passed as tasks parameters.
+
+\code{.c}
+/* Submit a task on each sub-vector */
+for (i=0; i<starpu_data_get_nb_children(handle); i++) {
+    /* Get subdata number i (there is only 1 dimension) */
+    starpu_data_handle_t sub_handle = starpu_data_get_sub_data(handle, 1, i);
+    struct starpu_task *task = starpu_task_create();
+
+    task->handles[0] = sub_handle;
+    task->cl = &cl;
+    task->synchronous = 1;
+    task->cl_arg = &factor;
+    task->cl_arg_size = sizeof(factor);
+
+    starpu_task_submit(task);
+}
+\endcode
+
+Partitioning can be applied several times, see
+<c>examples/basic_examples/mult.c</c> and <c>examples/filters/</c>.
+
+Wherever the whole piece of data is already available, the partitioning will
+be done in-place, i.e. without allocating new buffers but just using pointers
+inside the existing copy. This is particularly important to be aware of when
+using OpenCL, where the kernel parameters are not pointers, but handles. The
+kernel thus needs to be also passed the offset within the OpenCL buffer:
+
+\code{.c}
+void opencl_func(void *buffers[], void *cl_arg)
+{
+    cl_mem vector = (cl_mem) STARPU_VECTOR_GET_DEV_HANDLE(buffers[0]);
+    unsigned offset = STARPU_VECTOR_GET_OFFSET(buffers[0]);
+
+    ...
+    clSetKernelArg(kernel, 0, sizeof(vector), &vector);
+    clSetKernelArg(kernel, 1, sizeof(offset), &offset);
+    ...
+}
+\endcode
+
+And the kernel has to shift from the pointer passed by the OpenCL driver:
+
+\code{.c}
+__kernel void opencl_kernel(__global int *vector, unsigned offset)
+{
+    vector = (__global int *)((__global char *)vector + offset);
+    ...
+}
+\endcode
+
+StarPU provides various interfaces and filters for matrices, vectors, etc.,
+but applications can also write their own data interfaces and filters, see
+<c>examples/interface</c> and <c>examples/filters/custom_mf</c> for an example.
+
+\section DataReduction Data Reduction
+
+In various cases, some piece of data is used to accumulate intermediate
+results. For instance, the dot product of two vectors, maximum/minimum finding,
+the histogram of a photograph, etc. When these results are produced all over the
+machine, it would not be efficient to accumulate them in only one place,
+incurring a data transmission for each contribution, as well as access concurrency.
+
+StarPU provides a mode ::STARPU_REDUX, which permits optimizing
+that case: it will allocate a buffer on each memory node, and accumulate
+intermediate results there. When the data is eventually accessed in the normal
+mode ::STARPU_R, StarPU will collect the intermediate results in just one
+buffer.
+
+For this to work, the user has to use the function
+starpu_data_set_reduction_methods() to declare how to initialize these
+buffers, and how to assemble partial results.
+
+For instance, <c>cg</c> uses that to optimize its dot product: it first defines
+the codelets for initialization and reduction:
+
+\code{.c}
+struct starpu_codelet bzero_variable_cl =
+{
+        .cpu_funcs = { bzero_variable_cpu, NULL },
+        .cpu_funcs_name = { "bzero_variable_cpu", NULL },
+        .cuda_funcs = { bzero_variable_cuda, NULL },
+        .nbuffers = 1,
+};
+
+static void accumulate_variable_cpu(void *descr[], void *cl_arg)
+{
+        double *v_dst = (double *)STARPU_VARIABLE_GET_PTR(descr[0]);
+        double *v_src = (double *)STARPU_VARIABLE_GET_PTR(descr[1]);
+        *v_dst = *v_dst + *v_src;
+}
+
+static void accumulate_variable_cuda(void *descr[], void *cl_arg)
+{
+        double *v_dst = (double *)STARPU_VARIABLE_GET_PTR(descr[0]);
+        double *v_src = (double *)STARPU_VARIABLE_GET_PTR(descr[1]);
+        cublasaxpy(1, (double)1.0, v_src, 1, v_dst, 1);
+        cudaStreamSynchronize(starpu_cuda_get_local_stream());
+}
+
+struct starpu_codelet accumulate_variable_cl =
+{
+        .cpu_funcs = { accumulate_variable_cpu, NULL },
+        .cpu_funcs_name = { "accumulate_variable_cpu", NULL },
+        .cuda_funcs = { accumulate_variable_cuda, NULL },
+        .nbuffers = 1,
+};
+\endcode
+
+and attaches them as reduction methods for its handle <c>dtq</c>:
+
+\code{.c}
+starpu_variable_data_register(&dtq_handle, -1, NULL, sizeof(type));
+starpu_data_set_reduction_methods(dtq_handle,
+        &accumulate_variable_cl, &bzero_variable_cl);
+\endcode
+
+and <c>dtq_handle</c> can now be used in mode ::STARPU_REDUX for the
+dot products with partitioned vectors:
+
+\code{.c}
+for (b = 0; b < nblocks; b++)
+    starpu_task_insert(&dot_kernel_cl,
+        STARPU_REDUX, dtq_handle,
+        STARPU_R, starpu_data_get_sub_data(v1, 1, b),
+        STARPU_R, starpu_data_get_sub_data(v2, 1, b),
+        0);
+\endcode
+
+During registration, we have here provided <c>NULL</c>, i.e. there is
+no initial value to be taken into account during reduction. StarPU
+will thus only take into account the contributions from the tasks
+<c>dot_kernel_cl</c>. Also, it will not allocate any memory for
+<c>dtq_handle</c> before tasks <c>dot_kernel_cl</c> are ready to run.
+
+If another dot product has to be performed, one could unregister
+<c>dtq_handle</c>, and re-register it. But one can also call
+starpu_data_invalidate_submit() with the parameter <c>dtq_handle</c>,
+which will clear all data from the handle, thus resetting it back to
+the initial status <c>register(NULL)</c>.
+
+The example <c>cg</c> also uses reduction for the blocked gemv kernel,
+leading to yet more relaxed dependencies and more parallelism.
+
+::STARPU_REDUX can also be passed to starpu_mpi_task_insert() in the MPI
+case. That will however not produce any MPI communication, but just pass
+::STARPU_REDUX to the underlying starpu_task_insert(). It is up to the
+application to call starpu_mpi_redux_data(), which posts tasks that will
+reduce the partial results among MPI nodes into the MPI node which owns the
+data. For instance, some hypothetical application which collects partial results
+into data <c>res</c>, then uses it for other computation, before looping again
+with a new reduction:
+
+\code{.c}
+for (i = 0; i < 100; i++) {
+    starpu_mpi_task_insert(MPI_COMM_WORLD, &init_res, STARPU_W, res, 0);
+    starpu_mpi_task_insert(MPI_COMM_WORLD, &work, STARPU_RW, A,
+               STARPU_R, B, STARPU_REDUX, res, 0);
+    starpu_mpi_redux_data(MPI_COMM_WORLD, res);
+    starpu_mpi_task_insert(MPI_COMM_WORLD, &work2, STARPU_RW, B, STARPU_R, res, 0);
+}
+\endcode
+
+\section TemporaryBuffers Temporary Buffers
+
+There are two kinds of temporary buffers: temporary data which just pass results
+from one task to another, and scratch data which are needed only internally by
+tasks.
+
+\subsection TemporaryData Temporary Data
+
+Data can sometimes be entirely produced by a task, and entirely consumed by
+another task, without the need for other parts of the application to access
+it. In such case, registration can be done without prior allocation, by using
+the special memory node number <c>-1</c>, and passing a zero pointer. StarPU will
+actually allocate memory only when the task creating the content gets scheduled,
+and destroy it on unregistration.
+
+In addition to that, it can be tedious for the application to have to unregister
+the data, since it will not use its content anyway. The unregistration can be
+done lazily by using the function starpu_data_unregister_submit(),
+which will record that no more tasks accessing the handle will be submitted, so
+that it can be freed as soon as the last task accessing it is over.
+
+The following code exemplifies both points: it registers the temporary
+data, submits three tasks accessing it, and records the data for automatic
+unregistration.
+
+\code{.c}
+starpu_vector_data_register(&handle, -1, 0, n, sizeof(float));
+starpu_task_insert(&produce_data, STARPU_W, handle, 0);
+starpu_task_insert(&compute_data, STARPU_RW, handle, 0);
+starpu_task_insert(&summarize_data, STARPU_R, handle, STARPU_W, result_handle, 0);
+starpu_data_unregister_submit(handle);
+\endcode
+
+\subsection ScratchData Scratch Data
+
+Some kernels sometimes need temporary data to achieve the computations, i.e. a
+workspace. The application could allocate it at the start of the codelet
+function, and free it at the end, but that would be costly. It could also
+allocate one buffer per worker (similarly to \ref
+HowToInitializeAComputationLibraryOnceForEachWorker), but that would
+make them systematic and permanent. A more optimized way is to use
+the data access mode ::STARPU_SCRATCH, as exemplified below, which
+provides per-worker buffers without content consistency.
+
+\code{.c}
+starpu_vector_data_register(&workspace, -1, 0, n, sizeof(float));
+for (i = 0; i < N; i++)
+    starpu_task_insert(&compute, STARPU_R, input[i],
+                       STARPU_SCRATCH, workspace, STARPU_W, output[i], 0);
+\endcode
+
+StarPU will make sure that the buffer is allocated before executing the task,
+and make this allocation per-worker: for CPU workers, notably, each worker has
+its own buffer. This means that each task submitted above will actually have its
+own workspace, which will actually be the same for all tasks running one after
+the other on the same worker. Also, if for instance GPU memory becomes scarce,
+StarPU will notice that it can free such buffers easily, since the content does
+not matter.
+
+The example <c>examples/pi</c> uses scratch data for some temporary buffers.
+
+\section TheMultiformatInterface The Multiformat Interface
+
+It may be interesting to represent the same piece of data using two different
+data structures: one that would only be used on CPUs, and one that would only
+be used on GPUs. This can be done by using the multiformat interface. StarPU
+will be able to convert data from one data structure to the other when needed.
+Note that the scheduler <c>dmda</c> is the only one optimized for this
+interface. The user must provide StarPU with conversion codelets:
+
+\snippet multiformat.c To be included. You should update doxygen if you see this text.
+
+Kernels can be written almost as for any other interface. Note that
+::STARPU_MULTIFORMAT_GET_CPU_PTR shall only be used for CPU kernels. CUDA kernels
+must use ::STARPU_MULTIFORMAT_GET_CUDA_PTR, and OpenCL kernels must use
+::STARPU_MULTIFORMAT_GET_OPENCL_PTR. ::STARPU_MULTIFORMAT_GET_NX may
+be used in any kind of kernel.
+
+\code{.c}
+static void
+multiformat_scal_cpu_func(void *buffers[], void *args)
+{
+    struct point *aos;
+    unsigned int n;
+
+    aos = STARPU_MULTIFORMAT_GET_CPU_PTR(buffers[0]);
+    n = STARPU_MULTIFORMAT_GET_NX(buffers[0]);
+    ...
+}
+
+extern "C" void multiformat_scal_cuda_func(void *buffers[], void *_args)
+{
+    unsigned int n;
+    struct struct_of_arrays *soa;
+
+    soa = (struct struct_of_arrays *) STARPU_MULTIFORMAT_GET_CUDA_PTR(buffers[0]);
+    n = STARPU_MULTIFORMAT_GET_NX(buffers[0]);
+
+    ...
+}
+\endcode
+
+A full example may be found in <c>examples/basic_examples/multiformat.c</c>.
+
+\section DefiningANewDataInterface Defining A New Data Interface
+
+Let's define a new data interface to manage complex numbers.
+
+\code{.c}
+/* interface for complex numbers */
+struct starpu_complex_interface
+{
+        double *real;
+        double *imaginary;
+        int nx;
+};
+\endcode
+
+Registering such data with StarPU is easily done using the function
+starpu_data_register(). The last
+parameter of the function, <c>interface_complex_ops</c>, will be
+described below.
+
+\code{.c}
+void starpu_complex_data_register(starpu_data_handle_t *handle,
+     unsigned home_node, double *real, double *imaginary, int nx)
+{
+        struct starpu_complex_interface complex =
+        {
+                .real = real,
+                .imaginary = imaginary,
+                .nx = nx
+        };
+
+        if (interface_complex_ops.interfaceid == STARPU_UNKNOWN_INTERFACE_ID)
+        {
+                interface_complex_ops.interfaceid = starpu_data_interface_get_next_id();
+        }
+
+        starpu_data_register(handle, home_node, &complex, &interface_complex_ops);
+}
+\endcode
+
+Different operations need to be defined for a data interface through
+the type starpu_data_interface_ops. We only define here the basic
+operations needed to run simple applications. The source code for the
+different functions can be found in the file
+<c>examples/interface/complex_interface.c</c>.
+
+\code{.c}
+static struct starpu_data_interface_ops interface_complex_ops =
+{
+        .register_data_handle = complex_register_data_handle,
+        .allocate_data_on_node = complex_allocate_data_on_node,
+        .copy_methods = &complex_copy_methods,
+        .get_size = complex_get_size,
+        .footprint = complex_footprint,
+        .interfaceid = STARPU_UNKNOWN_INTERFACE_ID,
+        .interface_size = sizeof(struct starpu_complex_interface),
+};
+\endcode
+
+Functions need to be defined to access the different fields of the
+complex interface from a StarPU data handle.
+
+\code{.c}
+double *starpu_complex_get_real(starpu_data_handle_t handle)
+{
+        struct starpu_complex_interface *complex_interface =
+          (struct starpu_complex_interface *) starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
+        return complex_interface->real;
+}
+
+double *starpu_complex_get_imaginary(starpu_data_handle_t handle);
+int starpu_complex_get_nx(starpu_data_handle_t handle);
+\endcode
+
+Similar functions need to be defined to access the different fields of the
+complex interface from a <c>void *</c> pointer to be used within codelet
+implementations.
+
+\snippet complex.c To be included. You should update doxygen if you see this text.
+
+Complex data interfaces can then be registered to StarPU.
+
+\code{.c}
+double real = 45.0;
+double imaginary = 12.0;
+starpu_complex_data_register(&handle1, STARPU_MAIN_RAM, &real, &imaginary, 1);
+starpu_task_insert(&cl_display, STARPU_R, handle1, 0);
+\endcode
+
+and used by codelets.
+
+\code{.c}
+void display_complex_codelet(void *descr[], __attribute__ ((unused)) void *_args)
+{
+        int nx = STARPU_COMPLEX_GET_NX(descr[0]);
+        double *real = STARPU_COMPLEX_GET_REAL(descr[0]);
+        double *imaginary = STARPU_COMPLEX_GET_IMAGINARY(descr[0]);
+        int i;
+
+        for(i=0 ; i<nx ; i++)
+        {
+                fprintf(stderr, "Complex[%d] = %3.2f + %3.2f i\n", i, real[i], imaginary[i]);
+        }
+}
+\endcode
+
+The whole code for this complex data interface is available in the
+directory <c>examples/interface/</c>.
+
+
+
+*/

+ 151 - 0
doc/doxygen/chapters/08scheduling.doxy

@@ -0,0 +1,151 @@
+/*
+ * This file is part of the StarPU Handbook.
+ * Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011, 2012 Institut National de Recherche en Informatique et Automatique
+ * See the file version.doxy for copying conditions.
+ */
+
+/*! \page Scheduling Scheduling
+
+\section TaskSchedulingPolicy Task Scheduling Policy
+
+By default, StarPU uses the simple greedy scheduler <c>eager</c>. This is
+because it provides correct load balance even if the application codelets do not
+have performance models. If your application codelets have performance models
+(\ref PerformanceModelExample), you should change the scheduler by setting
+the environment variable \ref STARPU_SCHED. For instance <c>export
+STARPU_SCHED=dmda</c>. Setting it to <c>help</c> lists the available schedulers.
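+
+For instance (<c>./application</c> standing for any StarPU program):
+
+\verbatim
+$ STARPU_SCHED=help ./application
+\endverbatim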
+
+The <b>eager</b> scheduler uses a central task queue, from which workers draw tasks
+to work on. This however does not permit prefetching data since the scheduling
+decision is taken late. If a task has a non-0 priority, it is put at the front of the queue.
+
+The <b>prio</b> scheduler also uses a central task queue, but sorts tasks by
+priority (between -5 and 5).
+
+The <b>random</b> scheduler distributes tasks randomly according to assumed worker
+overall performance.
+
+The <b>ws</b> (work stealing) scheduler schedules tasks on the local worker by
+default. When a worker becomes idle, it steals a task from the most loaded
+worker.
+
+The <b>dm</b> (deque model) scheduler takes task execution performance models into account to
+perform a HEFT-like scheduling strategy: it schedules tasks where their
+termination time will be minimal.
+
+The <b>dmda</b> (deque model data aware) scheduler is similar to dm, it also takes
+into account data transfer time.
+
+The <b>dmdar</b> (deque model data aware ready) scheduler is similar to dmda,
+it also sorts tasks on per-worker queues by number of already-available data
+buffers.
+
+The <b>dmdas</b> (deque model data aware sorted) scheduler is similar to dmda, it
+also supports arbitrary priority values.
+
+The <b>heft</b> (heterogeneous earliest finish time) scheduler is deprecated. It
+is now just an alias for <b>dmda</b>.
+
+The <b>pheft</b> (parallel HEFT) scheduler is similar to heft, it also supports
+parallel tasks (still experimental). Should not be used when several contexts using
+it are being executed simultaneously.
+
+The <b>peager</b> (parallel eager) scheduler is similar to eager, it also
+supports parallel tasks (still experimental). Should not be used when several 
+contexts using it are being executed simultaneously.
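+
+For the schedulers which support priorities, these are set through the field
+starpu_task::priority; a minimal sketch, assuming a codelet <c>cl</c> as in
+the examples of the previous chapters:
+
+\code{.c}
+struct starpu_task *task = starpu_task_create();
+task->cl = &cl;
+/* hint that this task is on the critical path */
+task->priority = STARPU_MAX_PRIO;
+starpu_task_submit(task);
+\endcode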
+
+\section TaskDistributionVsDataTransfer Task Distribution Vs Data Transfer
+
+Distributing tasks to balance the load induces data transfer penalty. StarPU
+thus needs to find a balance between both. The target function that the
+scheduler <c>dmda</c> of StarPU
+tries to minimize is <c>alpha * T_execution + beta * T_data_transfer</c>, where
+<c>T_execution</c> is the estimated execution time of the codelet (usually
+accurate), and <c>T_data_transfer</c> is the estimated data transfer time. The
+latter is estimated based on bus calibration before execution start,
+i.e. with an idle machine, thus without contention. You can force bus
+re-calibration by running the tool <c>starpu_calibrate_bus</c>. The
+beta parameter defaults to <c>1</c>, but it can be worth trying to tweak it
+by using <c>export STARPU_SCHED_BETA=2</c> for instance, since during
+real application execution, contention makes transfer times bigger.
+This is of course imprecise, but in practice, a rough estimation
+already gives results about as good as a precise estimation would.
+
+\section Power-basedScheduling Power-based Scheduling
+
+If the application can provide some power performance model (through
+the field starpu_codelet::power_model), StarPU will
+take it into account when distributing tasks. The target function that
+the scheduler <c>dmda</c> minimizes becomes <c>alpha * T_execution +
+beta * T_data_transfer + gamma * Consumption</c> , where <c>Consumption</c>
+is the estimated task consumption in Joules. To tune this parameter, use
+<c>export STARPU_SCHED_GAMMA=3000</c> for instance, to express that each Joule
+(i.e. kW during 1000us) is worth 3000us execution time penalty. Setting
+<c>alpha</c> and <c>beta</c> to zero permits taking only power consumption into account.
+
+This is however not sufficient to correctly optimize power: the scheduler would
+simply tend to run all computations on the most energy-conservative processing
+unit. To account for the consumption of the whole machine (including idle
+processing units), the idle power of the machine should be given by setting
+<c>export STARPU_IDLE_POWER=200</c> for 200W, for instance. This value can often
+be obtained from the machine power supplier.
+
+The power actually consumed by the total execution can be displayed by setting
+<c>export STARPU_PROFILING=1 STARPU_WORKER_STATS=1</c> .
+
+On-line task consumption measurement is currently only supported through the
+<c>CL_PROFILING_POWER_CONSUMED</c> OpenCL extension, implemented in the MoviSim
+simulator. Applications can however provide explicit measurements by
+using the function starpu_perfmodel_update_history() (exemplified in \ref PerformanceModelExample
+with the <c>power_model</c> performance model). Fine-grain
+measurement is often not feasible with the feedback provided by the hardware, so
+the user can for instance run a given task a thousand times, measure the global
+consumption for that series of tasks, divide it by a thousand, repeat for
+varying kinds of tasks and task sizes, and eventually feed StarPU
+with these manual measurements through starpu_perfmodel_update_history().
+
+\section StaticScheduling Static Scheduling
+
+In some cases, one may want to force some scheduling, for instance force a given
+set of tasks to GPU0, another set to GPU1, etc. while letting some other tasks
+be scheduled on any other device. This can indeed be useful to guide StarPU into
+some work distribution, while still letting some degree of dynamism. For
+instance, to force execution of a task on CUDA0:
+
+\code{.c}
+task->execute_on_a_specific_worker = 1;
+task->worker = starpu_worker_get_by_type(STARPU_CUDA_WORKER, 0);
+\endcode
+
+Note however that using scheduling contexts while statically scheduling tasks on workers
+could be tricky. Be careful to schedule the tasks exactly on the workers of the corresponding
+contexts, otherwise the workers' corresponding scheduling structures may not be allocated or
+the execution of the application may deadlock. Moreover, the hypervisor should not be used when
+statically scheduling tasks.
+
+\section DefiningANewSchedulingPolicy Defining A New Scheduling Policy
+
+A full example showing how to define a new scheduling policy is available in
+the StarPU sources in the directory <c>examples/scheduler/</c>.
+
+See \ref API_Scheduling_Policy
+
+\code{.c}
+static struct starpu_sched_policy dummy_sched_policy = {
+    .init_sched = init_dummy_sched,
+    .deinit_sched = deinit_dummy_sched,
+    .add_workers = dummy_sched_add_workers,
+    .remove_workers = dummy_sched_remove_workers,
+    .push_task = push_task_dummy,
+    .push_prio_task = NULL,
+    .pop_task = pop_task_dummy,
+    .post_exec_hook = NULL,
+    .pop_every_task = NULL,
+    .policy_name = "dummy",
+    .policy_description = "dummy scheduling strategy"
+};
+\endcode
+
+*/

doc/doxygen/chapters/13scheduling_contexts.doxy → doc/doxygen/chapters/09scheduling_contexts.doxy


doc/doxygen/chapters/14scheduling_context_hypervisor.doxy → doc/doxygen/chapters/10scheduling_context_hypervisor.doxy


+ 42 - 0
doc/doxygen/chapters/11debugging_tools.doxy

@@ -0,0 +1,42 @@
+/*
+ * This file is part of the StarPU Handbook.
+ * Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011, 2012 Institut National de Recherche en Informatique et Automatique
+ * See the file version.doxy for copying conditions.
+ */
+
+/*! \page DebuggingTools Debugging Tools
+
+StarPU provides several tools to help debugging applications. Execution traces
+can be generated and displayed graphically, see \ref
+GeneratingTracesWithFxT. Some gdb helpers are also provided to show
+the whole StarPU state:
+
+\verbatim
+(gdb) source tools/gdbinit
+(gdb) help starpu
+\endverbatim
+
+The Temanejo task debugger can also be used, see \ref UsingTheTemanejoTaskDebugger.
+
+\section UsingTheTemanejoTaskDebugger Using The Temanejo Task Debugger
+
+StarPU can connect to Temanejo >= 1.0rc2 (see
+http://www.hlrs.de/temanejo), to permit
+nice visual task debugging. To do so, build Temanejo's <c>libayudame.so</c>,
+install <c>Ayudame.h</c> to e.g. <c>/usr/local/include</c>, apply the
+<c>tools/patch-ayudame</c> to it to fix the C build, re-run <c>./configure</c>, make
+sure that it finds it, and rebuild StarPU. Then run the Temanejo GUI, and give it the path
+to your application, any options you want to pass it, and the path to <c>libayudame.so</c>.
+
+Make sure to specify at least the same number of CPUs in the dialog box as your
+machine has, otherwise an error will happen during execution. Future versions
+of Temanejo should be able to tell StarPU the number of CPUs to use.
+
+Tag numbers have to be below <c>4000000000000000000ULL</c> to be usable for
+Temanejo (so as to distinguish them from tasks).
+
+
+
+*/

+ 432 - 0
doc/doxygen/chapters/12online_performance_tools.doxy

@@ -0,0 +1,432 @@
+/*
+ * This file is part of the StarPU Handbook.
+ * Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011, 2012 Institut National de Recherche en Informatique et Automatique
+ * See the file version.doxy for copying conditions.
+ */
+
+/*! \page OnlinePerformanceTools Online Performance Tools
+
+\section On-linePerformanceFeedback On-line Performance Feedback
+
+\subsection EnablingOn-linePerformanceMonitoring Enabling On-line Performance Monitoring
+
+In order to enable online performance monitoring, the application can
+call starpu_profiling_status_set() with the parameter
+::STARPU_PROFILING_ENABLE. It is possible to detect whether monitoring
+is already enabled or not by calling starpu_profiling_status_get().
+Enabling monitoring also reinitializes all previously collected
+feedback. The environment variable \ref STARPU_PROFILING can also be
+set to <c>1</c> to achieve the same effect. The function
+starpu_profiling_init() can also be called during the execution to
+reinitialize performance counters and to start the profiling if the
+environment variable \ref STARPU_PROFILING is set to <c>1</c>.
+
+Likewise, performance monitoring is stopped by calling
+starpu_profiling_status_set() with the parameter
+::STARPU_PROFILING_DISABLE. Note that this does not reset the
+performance counters so that the application may consult them later
+on.
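+
+A minimal sketch of this sequence:
+
+\code{.c}
+starpu_profiling_status_set(STARPU_PROFILING_ENABLE);
+
+/* ... submit tasks and wait for them ... */
+
+starpu_profiling_status_set(STARPU_PROFILING_DISABLE);
+/* the collected counters can still be consulted here */
+\endcode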
+
+More details about the performance monitoring API are available in \ref API_Profiling.
+
+\subsection Per-taskFeedback Per-task Feedback
+
+If profiling is enabled, a pointer to a structure
+starpu_profiling_task_info is put in the field
+starpu_task::profiling_info when a task terminates. This structure is
+automatically destroyed when the task structure is destroyed, either
+automatically or by calling starpu_task_destroy().
+
+The structure starpu_profiling_task_info indicates the date when the
+task was submitted (starpu_profiling_task_info::submit_time), started
+(starpu_profiling_task_info::start_time), and terminated
+(starpu_profiling_task_info::end_time), relative to the initialization
+of StarPU with starpu_init(). It also specifies the identifier of the worker
+that has executed the task (starpu_profiling_task_info::workerid).
+These dates are stored as <c>timespec</c> structures which the user may convert
+into micro-seconds using the helper function
+starpu_timing_timespec_to_us().
+
+It is worth noting that the application may directly access this structure from
+the callback executed at the end of the task. The structure starpu_task
+associated to the callback currently being executed is indeed accessible with
+the function starpu_task_get_current().
+
+\subsection Per-codeletFeedback Per-codelet Feedback
+
+The field starpu_codelet::per_worker_stats is
+an array of counters. The i-th entry of the array is incremented every time a
+task implementing the codelet is executed on the i-th worker.
+This array is not reinitialized when profiling is enabled or disabled.
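+
+For instance, a sketch which prints these counters, assuming a codelet
+<c>cl</c> as defined in the examples of the previous chapters:
+
+\code{.c}
+unsigned w;
+for (w = 0; w < starpu_worker_get_count(); w++)
+        /* entry w counts the executions of the codelet on worker w */
+        fprintf(stderr, "worker %u ran it %lu times\n", w, cl.per_worker_stats[w]);
+\endcode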
+
+\subsection Per-workerFeedback Per-worker Feedback
+
+The second argument returned by the function
+starpu_profiling_worker_get_info() is a structure
+starpu_profiling_worker_info that gives statistics about the specified
+worker. This structure specifies when StarPU started collecting
+profiling information for that worker
+(starpu_profiling_worker_info::start_time), the
+duration of the profiling measurement interval
+(starpu_profiling_worker_info::total_time), the time spent executing
+kernels (starpu_profiling_worker_info::executing_time), the time
+spent sleeping because there is no task to execute at all
+(starpu_profiling_worker_info::sleeping_time), and the number of tasks that were executed
+while profiling was enabled. These values give an estimation of the
+proportion of time spent do real work, and the time spent either
+sleeping because there are not enough executable tasks or simply
+wasted in pure StarPU overhead.
+
+Calling starpu_profiling_worker_get_info() resets the profiling
+information associated to a worker.
+
+When an FxT trace is generated (see \ref GeneratingTracesWithFxT), it is also
+possible to use the tool <c>starpu_workers_activity</c> (see \ref
+MonitoringActivity) to generate a graphic showing the evolution of
+these values over time, for the different workers.
+
+\subsection Bus-relatedFeedback Bus-related Feedback
+
+TODO: add \ref STARPU_BUS_STATS
+
+// how to enable/disable performance monitoring
+// what kind of information do we get ?
+
+The bus speed measured by StarPU can be displayed by using the tool
+<c>starpu_machine_display</c>, for instance:
+
+\verbatim
+StarPU has found:
+        3 CUDA devices
+                CUDA 0 (Tesla C2050 02:00.0)
+                CUDA 1 (Tesla C2050 03:00.0)
+                CUDA 2 (Tesla C2050 84:00.0)
+from    to RAM          to CUDA 0       to CUDA 1       to CUDA 2
+RAM     0.000000        5176.530428     5176.492994     5191.710722
+CUDA 0  4523.732446     0.000000        2414.074751     2417.379201
+CUDA 1  4523.718152     2414.078822     0.000000        2417.375119
+CUDA 2  4534.229519     2417.069025     2417.060863     0.000000
+\endverbatim
+
+\subsection StarPU-TopInterface StarPU-Top Interface
+
+StarPU-Top is an interface which remotely displays the on-line state of a StarPU
+application and permits the user to change parameters on the fly.
+
+Variables to be monitored can be registered by calling the functions
+starpu_top_add_data_boolean(), starpu_top_add_data_integer(),
+starpu_top_add_data_float(), e.g.:
+
+\code{.c}
+starpu_top_data *data = starpu_top_add_data_integer("mynum", 0, 100, 1);
+\endcode
+
+The application should then call starpu_top_init_and_wait() to give its name
+and wait for StarPU-Top to get a start request from the user. The name is used
+by StarPU-Top to quickly reload a previously-saved layout of parameter display.
+
+\code{.c}
+starpu_top_init_and_wait("the application");
+\endcode
+
+The new values can then be provided thanks to
+starpu_top_update_data_boolean(), starpu_top_update_data_integer(),
+starpu_top_update_data_float(), e.g.:
+
+\code{.c}
+starpu_top_update_data_integer(data, mynum);
+\endcode
+
+Updateable parameters can be registered thanks to starpu_top_register_parameter_boolean(), starpu_top_register_parameter_integer(), starpu_top_register_parameter_float(), e.g.:
+
+\code{.c}
+float alpha;
+starpu_top_register_parameter_float("alpha", &alpha, 0, 10, modif_hook);
+\endcode
+
+<c>modif_hook</c> is a function which will be called when the parameter is being modified, it can for instance print the new value:
+
+\code{.c}
+void modif_hook(struct starpu_top_param *d) {
+    fprintf(stderr,"%s has been modified: %f\n", d->name, alpha);
+}
+\endcode
+
+Task schedulers should notify StarPU-Top when they have decided when a task will be
+scheduled, so that it can show it in its Gantt chart, for instance:
+
+\code{.c}
+starpu_top_task_prevision(task, workerid, begin, end);
+\endcode
+
+Starting StarPU-Top (via the binary <c>starpu_top</c>) and the application
+can be done in two ways:
+
+<ul>
+<li> The application is started by hand on some machine (and thus already
+waiting for the start event). In the Preference dialog of StarPU-Top, the SSH
+checkbox should be unchecked, and the hostname and port (default is 2011) on
+which the application is already running should be specified. Clicking on the
+connection button will thus connect to the already-running application.
+</li>
+<li> StarPU-Top is started first, and clicking on the connection button will
+start the application itself (possibly on a remote machine). The SSH checkbox
+should be checked, and a command line provided, e.g.:
+
+\verbatim
+$ ssh myserver STARPU_SCHED=dmda ./application
+\endverbatim
+
+If port 2011 of the remote machine cannot be accessed directly, an SSH port forwarding should be added:
+
+\verbatim
+$ ssh -L 2011:localhost:2011 myserver STARPU_SCHED=dmda ./application
+\endverbatim
+
+and "localhost" should be used as IP Address to connect to.
+</li>
+</ul>
+
+\section TaskAndWorkerProfiling Task And Worker Profiling
+
+A full example showing how to use the profiling API is available in
+the StarPU sources in the directory <c>examples/profiling/</c>.
+
+\code{.c}
+struct starpu_task *task = starpu_task_create();
+task->cl = &cl;
+task->synchronous = 1;
+/* We will destroy the task structure by hand so that we can
+ * query the profiling info before the task is destroyed. */
+task->destroy = 0;
+
+/* Submit and wait for completion (since synchronous was set to 1) */
+starpu_task_submit(task);
+
+/* The task is finished, get profiling information */
+struct starpu_profiling_task_info *info = task->profiling_info;
+
+/* How much time did it take before the task started ? */
+double delay = starpu_timing_timespec_delay_us(&info->submit_time, &info->start_time);
+
+/* How long was the task execution ? */
+double length = starpu_timing_timespec_delay_us(&info->start_time, &info->end_time);
+
+/* We don't need the task structure anymore */
+starpu_task_destroy(task);
+\endcode
+
+\code{.c}
+/* Display the occupancy of all workers during the test */
+int worker;
+for (worker = 0; worker < starpu_worker_get_count(); worker++)
+{
+        struct starpu_profiling_worker_info worker_info;
+        int ret = starpu_profiling_worker_get_info(worker, &worker_info);
+        STARPU_ASSERT(!ret);
+
+        double total_time = starpu_timing_timespec_to_us(&worker_info.total_time);
+        double executing_time = starpu_timing_timespec_to_us(&worker_info.executing_time);
+        double sleeping_time = starpu_timing_timespec_to_us(&worker_info.sleeping_time);
+        double overhead_time = total_time - executing_time - sleeping_time;
+
+        float executing_ratio = 100.0*executing_time/total_time;
+        float sleeping_ratio = 100.0*sleeping_time/total_time;
+        float overhead_ratio = 100.0 - executing_ratio - sleeping_ratio;
+
+        char workername[128];
+        starpu_worker_get_name(worker, workername, 128);
+        fprintf(stderr, "Worker %s:\n", workername);
+        fprintf(stderr, "\ttotal time: %.2lf ms\n", total_time*1e-3);
+        fprintf(stderr, "\texec time: %.2lf ms (%.2f %%)\n",
+                executing_time*1e-3, executing_ratio);
+        fprintf(stderr, "\tblocked time: %.2lf ms (%.2f %%)\n",
+                sleeping_time*1e-3, sleeping_ratio);
+        fprintf(stderr, "\toverhead time: %.2lf ms (%.2f %%)\n",
+                overhead_time*1e-3, overhead_ratio);
+}
+\endcode
+
+\section PerformanceModelExample Performance Model Example
+
+To achieve good scheduling, StarPU scheduling policies need to be able to
+estimate in advance the duration of a task. This is done by giving to codelets
+a performance model, by defining a structure starpu_perfmodel and
+providing its address in the field starpu_codelet::model. The fields
+starpu_perfmodel::symbol and starpu_perfmodel::type are mandatory, to
+give a name to the model, and the type of the model, since there are
+several kinds of performance models. For compatibility, make sure to
+initialize the whole structure to zero, either by using explicit
+memset(), or by letting the compiler implicitly do it as exemplified
+below.
+
+<ul>
+<li>
+Measured at runtime (model type ::STARPU_HISTORY_BASED). This assumes that for a
+given set of data input/output sizes, the performance will always be about the
+same. This is very true for regular kernels on GPUs for instance (<0.1% error),
+and just a bit less true on CPUs (~=1% error). This also assumes that there are
+few different sets of data input/output sizes. StarPU will then keep record of
+the average time of previous executions on the various processing units, and use
+it as an estimation. History is done per task size, by using a hash of the input
+and output sizes as an index.
+It will also save it in <c>$STARPU_HOME/.starpu/sampling/codelets</c>
+for further executions, and can be observed by using the tool
+<c>starpu_perfmodel_display</c>, or drawn by using
+the tool <c>starpu_perfmodel_plot</c> (\ref PerformanceModelCalibration).  The
+models are indexed by machine name. To
+share the models between machines (e.g. for a homogeneous cluster), use
+<c>export STARPU_HOSTNAME=some_global_name</c>. Measurements are only done
+when using a task scheduler which makes use of it, such as
+<c>dmda</c>. Measurements can also be provided explicitly by the application, by
+using the function starpu_perfmodel_update_history().
+
+The following is a small code example.
+
+If e.g. the code is recompiled with other compilation options, or several
+variants of the code are used, the symbol string should be changed to reflect
+that, in order to recalibrate a new model from zero. The symbol string can even
+be constructed dynamically at execution time, as long as this is done before
+submitting any task using it.
+
+\code{.c}
+static struct starpu_perfmodel mult_perf_model = {
+    .type = STARPU_HISTORY_BASED,
+    .symbol = "mult_perf_model"
+};
+
+struct starpu_codelet cl = {
+    .where = STARPU_CPU,
+    .cpu_funcs = { cpu_mult, NULL },
+    .cpu_funcs_name = { "cpu_mult", NULL },
+    .nbuffers = 3,
+    .modes = { STARPU_R, STARPU_R, STARPU_W },
+    /* for the scheduling policy to be able to use performance models */
+    .model = &mult_perf_model
+};
+\endcode
+
+</li>
+<li>
+Measured at runtime and refined by regression (model types
+::STARPU_REGRESSION_BASED and ::STARPU_NL_REGRESSION_BASED). This
+still assumes performance regularity, but works 
+with various data input sizes, by applying regression over observed
+execution times. ::STARPU_REGRESSION_BASED uses an <c>a*n^b</c> regression
+form, ::STARPU_NL_REGRESSION_BASED uses an <c>a*n^b+c</c> one (more precise than
+::STARPU_REGRESSION_BASED, but costs a lot more to compute).
+
+For instance,
+<c>tests/perfmodels/regression_based.c</c> uses a regression-based performance
+model for the function memset().
+
+Of course, the application has to issue
+tasks with varying size so that the regression can be computed. StarPU will not
+trust the regression unless there is at least 10% difference between the minimum
+and maximum observed input size. It can be useful to set the
+environment variable \ref STARPU_CALIBRATE to <c>1</c> and run the application
+on varying input sizes with \ref STARPU_SCHED set to <c>dmda</c> scheduler,
+so as to feed the performance model for a variety of
+inputs. The application can also provide the measurements explicitly by
+using the function starpu_perfmodel_update_history(). The tools
+<c>starpu_perfmodel_display</c> and <c>starpu_perfmodel_plot</c> can
+be used to observe how much the performance model is calibrated (\ref
+PerformanceModelCalibration); when their output looks good,
+\ref STARPU_CALIBRATE can be reset to <c>0</c> to let
+StarPU use the resulting performance model without recording new measures, and
+\ref STARPU_SCHED can be set to <c>dmda</c> to benefit from the performance models. If
+the data input sizes vary a lot, it is really important to set
+\ref STARPU_CALIBRATE to <c>0</c>, otherwise StarPU will continue adding the
+measures, and end up with a very big performance model, which will take a
+lot of time to load and save.
+
+For non-linear regression, since computing it
+is quite expensive, it is only done at termination of the application. This
+means that the first execution of the application will use only history-based
+performance model to perform scheduling, without using regression.
+</li>
+
+<li>
+Provided as an estimation from the application itself (model type
+::STARPU_COMMON and field starpu_perfmodel::cost_function),
+see for instance
+<c>examples/common/blas_model.h</c> and <c>examples/common/blas_model.c</c>,
+or the sketch given after this list.
+</li>
+
+<li>
+Provided explicitly by the application (model type ::STARPU_PER_ARCH):
+the fields <c>.per_arch[arch][nimpl].cost_function</c> have to be
+filled with pointers to functions which return the expected duration
+of the task in micro-seconds, one per architecture.
+</li>
+</ul>
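+
+As a minimal sketch of the ::STARPU_COMMON case, assuming a hypothetical cost
+proportional to the size of the first data parameter, returned in
+micro-seconds:
+
+\code{.c}
+static double my_cost_function(struct starpu_task *task, unsigned nimpl)
+{
+        /* hypothetical: 1ms per megabyte of the first data parameter */
+        return 0.001 * starpu_data_get_size(task->handles[0]);
+}
+
+static struct starpu_perfmodel my_common_model = {
+        .type = STARPU_COMMON,
+        .cost_function = my_cost_function
+};
+\endcode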
+
+For ::STARPU_HISTORY_BASED, ::STARPU_REGRESSION_BASED, and
+::STARPU_NL_REGRESSION_BASED, the total size of task data (both input
+and output) is used as an index by default. The field
+starpu_perfmodel::size_base however permits the application to
+override that, when for instance some of the data do not matter for
+task cost (e.g. mere reference table), or when using sparse
+structures (in which case it is the number of non-zeros which matter), or when
+there is some hidden parameter such as the number of iterations, or when the application
+actually has a very good idea of the complexity of the algorithm, and just not
+the speed of the processor, etc.
+The example in the directory <c>examples/pi</c> uses this to include
+the number of iterations in the base.
+
+StarPU will automatically determine when the performance model is calibrated,
+or rather, it will assume the performance model is calibrated until the
+application submits a task for which the performance can not be predicted. For
+::STARPU_HISTORY_BASED, StarPU will require 10 (_STARPU_CALIBRATION_MINIMUM)
+measurements for a given size before estimating that an average can be taken as
+estimation for further executions with the same size. For
+::STARPU_REGRESSION_BASED and ::STARPU_NL_REGRESSION_BASED, StarPU will require
+10 (_STARPU_CALIBRATION_MINIMUM) measurements, and that the minimum measured
+data size is smaller than 90% of the maximum measured data size (i.e. the
+measurement interval is large enough for a regression to have a meaning).
+Calibration can also be forced by setting the \ref STARPU_CALIBRATE environment
+variable to <c>1</c>, or even reset by setting it to <c>2</c>.
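+
+For instance, to force recording measurements (<c>./application</c> standing
+for any StarPU program):
+
+\verbatim
+$ STARPU_CALIBRATE=1 STARPU_SCHED=dmda ./application
+\endverbatim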
+
+How to use schedulers which can benefit from such performance model is explained
+in \ref TaskSchedulingPolicy.
+
+The same can be done for task power consumption estimation, by setting
+the field starpu_codelet::power_model the same way as the field
+starpu_codelet::model. Note: for now, the application has to give to
+the power consumption performance model a name which is different from
+the execution time performance model.
+
+The application can request time estimations from the StarPU performance
+models by filling a task structure as usual without actually submitting
+it. The data handles can be created by calling any of the functions
+<c>starpu_*_data_register</c> with a <c>NULL</c> pointer and <c>-1</c>
+node and the desired data sizes, and need to be unregistered as usual.
+The functions starpu_task_expected_length() and
+starpu_task_expected_power() can then be called to get an estimation
+of the task cost on a given arch. starpu_task_footprint() can also be
+used to get the footprint used for indexing history-based performance
+models. starpu_task_destroy() needs to be called to destroy the dummy
+task afterwards. See <c>tests/perfmodels/regression_based.c</c> for an example.
+
+\section DataTrace Data Trace And Tasks Length
+
+It is possible to get statistics about task lengths and data sizes by using:
+\verbatim
+$ starpu_fxt_data_trace filename [codelet1 codelet2 ... codeletn]
+\endverbatim
+where <c>filename</c> is the FxT trace file and <c>codeletX</c> the names of the codelets you
+want to profile (if no names are specified, <c>starpu_fxt_data_trace</c> will profile them all).
+This will create a file, <c>data_trace.gp</c> which
+can be executed to get a <c>.eps</c> image of these results. On the image, each point represents a
+task, and each color corresponds to a codelet.
+
+\image html data_trace.png
+\image latex data_trace.eps "" width=\textwidth
+
+// TODO: data transfer stats are similar to the ones displayed when
+// setting STARPU_BUS_STATS
+
+
+
+*/

+ 80 - 212
doc/doxygen/chapters/05performance_feedback.doxy

@@ -1,211 +1,47 @@
 /*
  * This file is part of the StarPU Handbook.
  * Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
  * Copyright (C) 2011, 2012 Institut National de Recherche en Informatique et Automatique
  * See the file version.doxy for copying conditions.
  */
 
-/*! \page PerformanceFeedback Performance Feedback
-
-\section UsingTheTemanejoTaskDebugger Using The Temanejo Task Debugger
-
-StarPU can connect to Temanejo >= 1.0rc2 (see
-http://www.hlrs.de/temanejo), to permit
-nice visual task debugging. To do so, build Temanejo's <c>libayudame.so</c>,
-install <c>Ayudame.h</c> to e.g. <c>/usr/local/include</c>, apply the
-<c>tools/patch-ayudame</c> to it to fix C build, re-<c>./configure</c>, make
-sure that it found it, rebuild StarPU.  Run the Temanejo GUI, give it the path
-to your application, any options you want to pass it, the path to <c>libayudame.so</c>.
-
-Make sure to specify at least the same number of CPUs in the dialog box as your
-machine has, otherwise an error will happen during execution. Future versions
-of Temanejo should be able to tell StarPU the number of CPUs to use.
-
-Tag numbers have to be below <c>4000000000000000000ULL</c> to be usable for
-Temanejo (so as to distinguish them from tasks).
-
-\section On-linePerformanceFeedback On-line Performance Feedback
-
-\subsection EnablingOn-linePerformanceMonitoring Enabling On-line Performance Monitoring
-
-In order to enable online performance monitoring, the application can
-call starpu_profiling_status_set() with the parameter
-::STARPU_PROFILING_ENABLE. It is possible to detect whether monitoring
-is already enabled or not by calling starpu_profiling_status_get().
-Enabling monitoring also reinitialize all previously collected
-feedback. The environment variable \ref STARPU_PROFILING can also be
-set to <c>1</c> to achieve the same effect. The function
-starpu_profiling_init() can also be called during the execution to
-reinitialize performance counters and to start the profiling if the
-environment variable \ref STARPU_PROFILING is set to <c>1</c>.
-
-Likewise, performance monitoring is stopped by calling
-starpu_profiling_status_set() with the parameter
-::STARPU_PROFILING_DISABLE. Note that this does not reset the
-performance counters so that the application may consult them later
-on.
-
-More details about the performance monitoring API are available in \ref API_Profiling.
-
-\subsection Per-taskFeedback Per-task Feedback
-
-If profiling is enabled, a pointer to a structure
-starpu_profiling_task_info is put in the field
-starpu_task::profiling_info when a task terminates. This structure is
-automatically destroyed when the task structure is destroyed, either
-automatically or by calling starpu_task_destroy().
-
-The structure starpu_profiling_task_info indicates the date when the
-task was submitted (starpu_profiling_task_info::submit_time), started
-(starpu_profiling_task_info::start_time), and terminated
-(starpu_profiling_task_info::end_time), relative to the initialization
-of StarPU with starpu_init(). It also specifies the identifier of the worker
-that has executed the task (starpu_profiling_task_info::workerid).
-These date are stored as <c>timespec</c> structures which the user may convert
-into micro-seconds using the helper function
-starpu_timing_timespec_to_us().
-
-It it worth noting that the application may directly access this structure from
-the callback executed at the end of the task. The structure starpu_task
-associated to the callback currently being executed is indeed accessible with
-the function starpu_task_get_current().
-
-\subsection Per-codeletFeedback Per-codelet Feedback
-
-The field starpu_codelet::per_worker_stats is
-an array of counters. The i-th entry of the array is incremented every time a
-task implementing the codelet is executed on the i-th worker.
-This array is not reinitialized when profiling is enabled or disabled.
-
-\subsection Per-workerFeedback Per-worker Feedback
-
-The second argument returned by the function
-starpu_profiling_worker_get_info() is a structure
-starpu_profiling_worker_info that gives statistics about the specified
-worker. This structure specifies when StarPU started collecting
-profiling information for that worker
-(starpu_profiling_worker_info::start_time), the
-duration of the profiling measurement interval
-(starpu_profiling_worker_info::total_time), the time spent executing
-kernels (starpu_profiling_worker_info::executing_time), the time
-spent sleeping because there is no task to execute at all
-(starpu_profiling_worker_info::sleeping_time), and the number of tasks that were executed
-while profiling was enabled. These values give an estimation of the
-proportion of time spent do real work, and the time spent either
-sleeping because there are not enough executable tasks or simply
-wasted in pure StarPU overhead.
-
-Calling starpu_profiling_worker_get_info() resets the profiling
-information associated to a worker.
-
-When an FxT trace is generated (see \ref GeneratingTracesWithFxT), it is also
-possible to use the tool <c>starpu_workers_activity</c> (see \ref
-MonitoringActivity) to generate a graphic showing the evolution of
-these values during the time, for the different workers.
-
-\subsection Bus-relatedFeedback Bus-related Feedback
-
-TODO: ajouter \ref STARPU_BUS_STATS
-
-// how to enable/disable performance monitoring
-// what kind of information do we get ?
-
-The bus speed measured by StarPU can be displayed by using the tool
-<c>starpu_machine_display</c>, for instance:
+/*! \page OfflinePerformanceTools Offline Performance Tools
 
-\verbatim
-StarPU has found:
-        3 CUDA devices
-                CUDA 0 (Tesla C2050 02:00.0)
-                CUDA 1 (Tesla C2050 03:00.0)
-                CUDA 2 (Tesla C2050 84:00.0)
-from    to RAM          to CUDA 0       to CUDA 1       to CUDA 2
-RAM     0.000000        5176.530428     5176.492994     5191.710722
-CUDA 0  4523.732446     0.000000        2414.074751     2417.379201
-CUDA 1  4523.718152     2414.078822     0.000000        2417.375119
-CUDA 2  4534.229519     2417.069025     2417.060863     0.000000
-\endverbatim
-
-\subsection StarPU-TopInterface StarPU-Top Interface
-
-StarPU-Top is an interface which remotely displays the on-line state of a StarPU
-application and permits the user to change parameters on the fly.
-
-Variables to be monitored can be registered by calling the functions
-starpu_top_add_data_boolean(), starpu_top_add_data_integer(),
-starpu_top_add_data_float(), e.g.:
-
-\code{.c}
-starpu_top_data *data = starpu_top_add_data_integer("mynum", 0, 100, 1);
-\endcode
-
-The application should then call starpu_top_init_and_wait() to give its name
-and wait for StarPU-Top to get a start request from the user. The name is used
-by StarPU-Top to quickly reload a previously-saved layout of parameter display.
-
-\code{.c}
-starpu_top_init_and_wait("the application");
-\endcode
-
-The new values can then be provided thanks to
-starpu_top_update_data_boolean(), starpu_top_update_data_integer(),
-starpu_top_update_data_float(), e.g.:
-
-\code{.c}
-starpu_top_update_data_integer(data, mynum);
-\endcode
-
-Updateable parameters can be registered thanks to starpu_top_register_parameter_boolean(), starpu_top_register_parameter_integer(), starpu_top_register_parameter_float(), e.g.:
-
-\code{.c}
-float alpha;
-starpu_top_register_parameter_float("alpha", &alpha, 0, 10, modif_hook);
-\endcode
-
-<c>modif_hook</c> is a function which will be called when the parameter is being modified, it can for instance print the new value:
-
-\code{.c}
-void modif_hook(struct starpu_top_param *d) {
-    fprintf(stderr,"%s has been modified: %f\n", d->name, alpha);
-}
-\endcode
-
-Task schedulers should notify StarPU-Top when it has decided when a task will be
-scheduled, so that it can show it in its Gantt chart, for instance:
-
-\code{.c}
-starpu_top_task_prevision(task, workerid, begin, end);
-\endcode
-
-Starting StarPU-Top (StarPU-Top is started via the binary
-<c>starpu_top</c>.) and the application can be done two ways:
+To get an idea of what is happening, a lot of performance feedback is available,
+as detailed in this chapter. The following points should be checked:
 
 <ul>
-<li> The application is started by hand on some machine (and thus already
-waiting for the start event). In the Preference dialog of StarPU-Top, the SSH
-checkbox should be unchecked, and the hostname and port (default is 2011) on
-which the application is already running should be specified. Clicking on the
-connection button will thus connect to the already-running application.
-</li>
-<li> StarPU-Top is started first, and clicking on the connection button will
-start the application itself (possibly on a remote machine). The SSH checkbox
-should be checked, and a command line provided, e.g.:
-
-\verbatim
-$ ssh myserver STARPU_SCHED=dmda ./application
-\endverbatim
-
-If port 2011 of the remote machine can not be accessed directly, an ssh port bridge should be added:
-
-\verbatim
-$ ssh -L 2011:localhost:2011 myserver STARPU_SCHED=dmda ./application
-\endverbatim
-
-and "localhost" should be used as IP Address to connect to.
+<li>
+What does the Gantt diagram look like? (see \ref CreatingAGanttDiagram)
+<ul>
+  <li> If it's mostly green (tasks running in the initial context), or if a
+  context-specific color prevails, then the machine is properly
+  utilized, and perhaps the codelets are just slow. Check their performance, see
+  \ref PerformanceOfCodelets.
+  </li>
+  <li> If it's mostly purple (FetchingInput), tasks keep waiting for data
+  transfers, do you perhaps have far more communication than computation? Did
+  you properly use CUDA streams to make sure communication can be
+  overlapped? Did you use data-locality aware schedulers to avoid transfers as
+  much as possible?
+  </li>
+  <li> If it's mostly red (Blocked), tasks keep waiting for dependencies,
+  do you have enough parallelism? It might be a good idea to check what the DAG
+  looks like (see \ref CreatingADAGWithGraphviz).
+  </li>
+  <li> If only some workers are completely red (Blocked), for some reason the
+  scheduler didn't assign tasks to them. Perhaps the performance model is bogus,
+  check it (see \ref PerformanceOfCodelets). Do all your codelets have a
+  performance model? When some of them do not, the scheduler switches to a
+  greedy algorithm, which thus performs badly.
+  </li>
+</ul>
 </li>
 </ul>
 
+You can also use the Temanejo task debugger (see \ref UsingTheTemanejoTaskDebugger) to
+visualize the task graph more easily.
 \section Off-linePerformanceFeedback Off-line Performance Feedback
 
 \subsection GeneratingTracesWithFxT Generating Traces With FxT
@@ -492,6 +328,55 @@ execution time.
 \ref TheoreticalLowerBoundOnExecutionTimeExample provides an example on how to
 use this.
 
+\section TheoreticalLowerBoundOnExecutionTimeExample Theoretical Lower Bound On Execution Time Example
+
+For kernels with history-based performance models (and provided that
+they are completely calibrated), StarPU can very easily provide a
+theoretical lower bound for the execution time of a whole set of
+tasks. See for instance <c>examples/lu/lu_example.c</c>: before
+submitting tasks, call the function starpu_bound_start(), and after
+complete execution, call starpu_bound_stop().
+starpu_bound_print_lp() or starpu_bound_print_mps() can then be used
+to output a Linear Programming problem corresponding to the schedule
+of your tasks. Run it through <c>lp_solve</c> or any other linear
+programming solver, and that will give you a lower bound for the total
+execution time of your tasks. If StarPU was compiled with the library
+<c>glpk</c> installed, starpu_bound_compute() can be used to solve it
+immediately and get the optimized minimum, in ms. Its parameter
+<c>integer</c> allows deciding whether integer resolution should be
+computed and returned.
+
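+A possible sketch of the sequence (error checking omitted; see
+<c>examples/lu/lu_example.c</c> for a complete example):
+
+\code{.c}
+double min_ms;
+
+starpu_bound_start(0 /* deps */, 0 /* prio */);
+/* ... submit the whole set of tasks ... */
+starpu_task_wait_for_all();
+starpu_bound_stop();
+
+/* Solve immediately; only available when StarPU was built with glpk */
+starpu_bound_compute(&min_ms, NULL, 0 /* integer */);
+fprintf(stderr, "theoretical lower bound: %f ms\n", min_ms);
+\endcode
+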
+The <c>deps</c> parameter tells StarPU whether to take tasks, implicit
+data, and tag dependencies into account. Tags released in a callback
+or similar are not taken into account, only tags associated with a task are.
+It must be understood that the linear programming
+problem size is quadratic with the number of tasks and thus the time to solve it
+will be very long; it can take minutes for just a few dozen tasks. You should
+probably use <c>lp_solve -timeout 1 test.pl -wmps test.mps</c> to convert the
+problem to MPS format and then use a better solver, <c>glpsol</c> might be
+better than <c>lp_solve</c> for instance (the <c>--pcost</c> option may be
+useful), but sometimes doesn't manage to converge. <c>cbc</c> might look
+slower, but it is parallel. For <c>lp_solve</c>, be sure to try at least all the
+<c>-B</c> options. For instance, we often just use <c>lp_solve -cc -B1 -Bb
+-Bg -Bp -Bf -Br -BG -Bd -Bs -BB -Bo -Bc -Bi</c>, and the <c>-gr</c> option can
+also be quite useful. The resulting schedule can be observed by using
+the tool <c>starpu_lp2paje</c>, which converts it into the Paje
+format.
+
+Data transfer time can only be taken into account when <c>deps</c> is set. Only
+data transfers inferred from implicit data dependencies between tasks are taken
+into account. Other data transfers are assumed to be completely overlapped.
+
+Setting <c>deps</c> to 0 will only take into account the actual computations
+on processing units. It however still properly takes into account the varying
+performances of kernels and processing units, which is much more accurate than
+just comparing StarPU performances with the fastest of the kernels being used.
+
+The <c>prio</c> parameter tells StarPU whether to simulate taking into account
+the priorities as the StarPU scheduler would, i.e. schedule prioritized
+tasks before less prioritized tasks, to check to which extent this results
+in a less optimal solution. This increases the computation time even more.
+
 \section MemoryFeedback Memory Feedback
 
 It is possible to enable memory statistics. To do so, you need to pass
@@ -592,21 +477,4 @@ Computation took (in ms)
 Synthetic GFlops : 44.21
 \endverbatim
 
-// TODO: data transfer stats are similar to the ones displayed when
-// setting STARPU_BUS_STATS
-
-\section DataTrace Data trace and tasks length
-It is possible to get statistics about tasks length and data size by using :
-\verbatim
-$starpu_fxt_data_trace filename [codelet1 codelet2 ... codeletn]
-\endverbatim
-Where filename is the FxT trace file and codeletX the names of the codelets you 
-want to profile (if no names are specified, starpu_fxt_data_trace will use them all). 
-This will create a file, <c>data_trace.gp</c> which
-can be plotted to get a .eps image of these results. On the image, each point represents a 
-task, and each color corresponds to a codelet.
-
-\image html data_trace.png
-\image latex data_trace.eps "" width=\textwidth
-
 */

+ 100 - 21
doc/doxygen/chapters/06tips_and_tricks.doxy

@@ -1,12 +1,12 @@
 /*
  * This file is part of the StarPU Handbook.
  * Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
  * Copyright (C) 2011, 2012 Institut National de Recherche en Informatique et Automatique
  * See the file version.doxy for copying conditions.
  */
 
-/*! \page TipsAndTricksToKnowAbout Tips and Tricks To Know About
+/*! \page FrequentlyAskedQuestions Frequently Asked Questions
 
 \section HowToInitializeAComputationLibraryOnceForEachWorker How To Initialize A Computation Library Once For Each Worker?
 
@@ -69,33 +69,95 @@ void starpufft_plan(void)
 }
 \endcode
 
-\section HowToLimitMemoryPerNode How to limit memory per node
+\section UsingTheDriverAPI Using The Driver API
 
-TODO
+\ref API_Running_Drivers
 
-Talk about
-\ref STARPU_LIMIT_CUDA_devid_MEM, \ref STARPU_LIMIT_CUDA_MEM,
-\ref STARPU_LIMIT_OPENCL_devid_MEM, \ref STARPU_LIMIT_OPENCL_MEM
-and \ref STARPU_LIMIT_CPU_MEM
+\code{.c}
+int ret;
+struct starpu_driver d = {
+    .type = STARPU_CUDA_WORKER,
+    .id.cuda_id = 0
+};
+ret = starpu_driver_init(&d);
+if (ret != 0)
+    error();
+while (some_condition) {
+    ret = starpu_driver_run_once(&d);
+    if (ret != 0)
+        error();
+}
+ret = starpu_driver_deinit(&d);
+if (ret != 0)
+    error();
+\endcode
 
-starpu_memory_get_available()
+To add a new kind of device to the structure starpu_driver, one needs to:
+<ol>
+<li> Add a member to the union starpu_driver::id
+</li>
+<li> Modify the internal function <c>_starpu_launch_drivers()</c> to
+make sure the driver is not always launched.
+</li>
+<li> Modify the function starpu_driver_run() so that it can handle
+another kind of architecture.
+</li>
+<li> Write the new function <c>_starpu_run_foobar()</c> in the
+corresponding driver.
+</li>
+</ol>
+
+\section On-GPURendering On-GPU Rendering
+
+Graphics-oriented applications need to draw the result of their computations,
+typically on the very GPU where these happened. Technologies such as OpenGL/CUDA
+interoperability allow CUDA to work directly on the OpenGL buffers, thus making
+them immediately ready for drawing, by mapping OpenGL buffers, textures or
+renderbuffer objects into CUDA.  CUDA however imposes some technical
+constraints: peer memcpy has to be disabled, and the thread that runs OpenGL has
+to be the one that runs CUDA computations for that GPU.
+
+To achieve this with StarPU, pass the option
+\ref disable-cuda-memcpy-peer "--disable-cuda-memcpy-peer"
+to <c>./configure</c> (TODO: make it dynamic); OpenGL/GLUT has to be initialized
+first, the interoperability mode has to
+be enabled by using the field
+starpu_conf::cuda_opengl_interoperability, and the driver loop has to
+be run by the application, by using the field
+starpu_conf::not_launched_drivers to prevent StarPU from running it in
+a separate thread, and by using starpu_driver_run() to run the loop.
+The examples <c>gl_interop</c> and <c>gl_interop_idle</c> show how this
+works in a simple case, where rendering is done in task
+callbacks. The former uses <c>glutMainLoopEvent</c> to make GLUT
+progress from the StarPU driver loop, while the latter uses
+<c>glutIdleFunc</c> to make StarPU progress from the GLUT main loop.
+
+Then, to use an OpenGL buffer as a CUDA data, StarPU simply needs to be given
+the CUDA pointer at registration, for instance:
 
-\section ThreadBindingOnNetBSD Thread Binding on NetBSD
+\code{.c}
+/* Get the CUDA worker id */
+for (workerid = 0; workerid < starpu_worker_get_count(); workerid++)
+        if (starpu_worker_get_type(workerid) == STARPU_CUDA_WORKER)
+                break;
 
-When using StarPU on a NetBSD machine, if the topology
-discovery library <c>hwloc</c> is used, thread binding will fail. To
-prevent the problem, you should at least use the version 1.7 of
-<c>hwloc</c>, and also issue the following call:
+/* Build a CUDA pointer pointing at the OpenGL buffer */
+cudaGraphicsResourceGetMappedPointer((void**)&output, &num_bytes, resource);
 
-\verbatim
-$ sysctl -w security.models.extensions.user_set_cpu_affinity=1
-\endverbatim
+/* And register it to StarPU */
+starpu_vector_data_register(&handle, starpu_worker_get_memory_node(workerid),
+                            output, num_bytes / sizeof(float4), sizeof(float4));
 
-Or add the following line in the file <c>/etc/sysctl.conf</c>
+/* The handle can now be used as usual */
+starpu_task_insert(&cl, STARPU_RW, handle, 0);
 
-\verbatim
-security.models.extensions.user_set_cpu_affinity=1
-\endverbatim
+/* ... */
+
+/* This gets back data into the OpenGL buffer */
+starpu_data_unregister(handle);
+\endcode
+
+and display it e.g. in the callback function.
 
 \section UsingStarPUWithMKL Using StarPU With MKL 11 (Intel Composer XE 2013)
 
@@ -111,4 +173,21 @@ Using this configuration, StarPU uses only 1 core, no matter the value of
 The solution is to set the environment variable KMP_AFFINITY to <c>disabled</c>
 (http://software.intel.com/sites/products/documentation/studio/composer/en-us/2011Update/compiler_c/optaps/common/optaps_openmp_thread_affinity.htm).
 
+\section ThreadBindingOnNetBSD Thread Binding on NetBSD
+
+When using StarPU on a NetBSD machine, if the topology
+discovery library <c>hwloc</c> is used, thread binding will fail. To
+prevent the problem, you should use at least version 1.7 of
+<c>hwloc</c>, and also issue the following call:
+
+\verbatim
+$ sysctl -w security.models.extensions.user_set_cpu_affinity=1
+\endverbatim
+
+Or add the following line in the file <c>/etc/sysctl.conf</c>
+
+\verbatim
+security.models.extensions.user_set_cpu_affinity=1
+\endverbatim
+
 */

doc/doxygen/chapters/07out_of_core.doxy → doc/doxygen/chapters/15out_of_core.doxy


doc/doxygen/chapters/08mpi_support.doxy → doc/doxygen/chapters/16mpi_support.doxy


doc/doxygen/chapters/09fft_support.doxy → doc/doxygen/chapters/17fft_support.doxy


doc/doxygen/chapters/10mic_scc_support.doxy → doc/doxygen/chapters/18mic_scc_support.doxy


doc/doxygen/chapters/11c_extensions.doxy → doc/doxygen/chapters/19c_extensions.doxy


doc/doxygen/chapters/12socl_opencl_extensions.doxy → doc/doxygen/chapters/20socl_opencl_extensions.doxy


+ 104 - 0
doc/doxygen/chapters/21simgrid.doxy

@@ -0,0 +1,104 @@
+/*
+ * This file is part of the StarPU Handbook.
+ * Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011, 2012 Institut National de Recherche en Informatique et Automatique
+ * See the file version.doxy for copying conditions.
+ */
+
+/*! \page SimGridSupport SimGrid Support
+
+StarPU can use Simgrid in order to simulate execution on an arbitrary
+platform.
+
+\section Calibration Calibration
+
+The idea is to first compile StarPU normally, and run the application,
+so as to automatically benchmark the bus and the codelets.
+
+\verbatim
+$ ./configure && make
+$ STARPU_SCHED=dmda ./examples/matvecmult/matvecmult
+[starpu][_starpu_load_history_based_model] Warning: model matvecmult
+   is not calibrated, forcing calibration for this run. Use the
+   STARPU_CALIBRATE environment variable to control this.
+$ ...
+$ STARPU_SCHED=dmda ./examples/matvecmult/matvecmult
+TEST PASSED
+\endverbatim
+
+Note that we force the use of the scheduler <c>dmda</c> to generate
+performance models for the application. The application may need to be
+run several times before the model is calibrated.
+
+\section Simulation Simulation
+
+Then, recompile StarPU, passing \ref enable-simgrid "--enable-simgrid"
+to <c>./configure</c>, and re-run the application:
+
+\verbatim
+$ ./configure --enable-simgrid && make
+$ STARPU_SCHED=dmda ./examples/matvecmult/matvecmult
+TEST FAILED !!!
+\endverbatim
+
+It is normal that the test fails: since the computations are not actually done
+(that is the whole point of simgrid), the result is wrong, of course.
+
+If the performance model is not calibrated enough, the following error
+message will be displayed:
+
+\verbatim
+$ STARPU_SCHED=dmda ./examples/matvecmult/matvecmult
+[starpu][_starpu_load_history_based_model] Warning: model matvecmult
+    is not calibrated, forcing calibration for this run. Use the
+    STARPU_CALIBRATE environment variable to control this.
+[starpu][_starpu_simgrid_execute_job][assert failure] Codelet
+    matvecmult does not have a perfmodel, or is not calibrated enough
+\endverbatim
+
+The number of devices can be chosen as usual with \ref STARPU_NCPU,
+\ref STARPU_NCUDA, and \ref STARPU_NOPENCL.  For now, only the number of
+CPUs can be arbitrarily chosen. The number of CUDA and OpenCL devices has to be
+lower than the real number on the current machine.
+
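+For instance, to simulate the example above on 32 CPU cores:
+
+\verbatim
+$ STARPU_NCPU=32 STARPU_SCHED=dmda ./examples/matvecmult/matvecmult
+\endverbatim
+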
+The amount of simulated GPU memory is for now unbounded by default, but
+it can be chosen by hand through the \ref STARPU_LIMIT_CUDA_MEM,
+\ref STARPU_LIMIT_CUDA_devid_MEM, \ref STARPU_LIMIT_OPENCL_MEM, and
+\ref STARPU_LIMIT_OPENCL_devid_MEM environment variables.
+
+The Simgrid default stack size is small; to increase it use the
+parameter <c>--cfg=contexts/stack_size</c>, for example:
+
+\verbatim
+$ ./example --cfg=contexts/stack_size:8192
+TEST FAILED !!!
+\endverbatim
+
+Note: of course, if the application uses <c>gettimeofday</c> to make its
+performance measurements, the real time will be used, which will be bogus. To
+get the simulated time, it has to use starpu_timing_now() which returns the
+virtual timestamp in ms.
+
+\section SimulationOnAnotherMachine Simulation On Another Machine
+
+The simgrid support even makes it possible to perform simulations on another
+machine, typically your desktop. To achieve this, one still needs to perform the Calibration
+step on the actual machine to be simulated, then copy the resulting performance
+models to the desktop machine (the <c>$STARPU_HOME/.starpu</c> directory). One can then perform the
+Simulation step on the desktop machine, by setting the environment
+variable \ref STARPU_HOSTNAME to the name of the actual machine, to
+make StarPU use the performance models of the simulated machine even
+on the desktop machine.
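+
+For instance, assuming the calibrated machine is called <c>mymachine</c> (an
+illustrative name) and that <c>$STARPU_HOME</c> is the home directory:
+
+\verbatim
+$ scp -r mymachine:~/.starpu ~/
+$ STARPU_HOSTNAME=mymachine ./application
+\endverbatim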
+
+If the desktop machine does not have CUDA or OpenCL, StarPU is still able to
+use simgrid to simulate execution with CUDA/OpenCL devices, but the application
+source code will probably disable the CUDA and OpenCL codelets in that
+case. Since during simgrid execution the functions of the codelet are actually
+not called, one can use dummy functions such as the following to still permit
+CUDA or OpenCL execution:
+
+\snippet simgrid.c To be included. You should update doxygen if you see this text.
+
+
+*/

doc/doxygen/chapters/15environment_variables.doxy → doc/doxygen/chapters/40environment_variables.doxy


doc/doxygen/chapters/16configure_options.doxy → doc/doxygen/chapters/41configure_options.doxy


doc/doxygen/chapters/17files.doxy → doc/doxygen/chapters/45files.doxy


doc/doxygen/chapters/18scaling-vector-example.doxy → doc/doxygen/chapters/50scaling-vector-example.doxy


doc/doxygen/chapters/19fdl-1.3.doxy → doc/doxygen/chapters/51fdl-1.3.doxy


+ 64 - 30
doc/doxygen/refman.tex

@@ -68,7 +68,7 @@ was last updated on \STARPUUPDATED.\\
 
 Copyright © 2009–2013 Université de Bordeaux 1\\
 
-Copyright © 2010-2013 Centre National de la Recherche Scientifique\\
+Copyright © 2010-2014 Centre National de la Recherche Scientifique\\
 
 Copyright © 2011, 2012 Institut National de Recherche en Informatique et Automatique\\
 
@@ -94,7 +94,7 @@ Documentation License”.
 \hypertarget{index}{}
 \input{index}
 
-\part{Using StarPU}
+\part{StarPU Basics}
 
 \chapter{Building and Installing StarPU}
 \label{BuildingAndInstallingStarPU}
@@ -106,33 +106,72 @@ Documentation License”.
 \hypertarget{BasicExamples}{}
 \input{BasicExamples}
 
+\part{StarPU Quick Programming Guide}
+
 \chapter{Advanced Examples}
 \label{AdvancedExamples}
 \hypertarget{AdvancedExamples}{}
 \input{AdvancedExamples}
 
-\chapter{How To Optimize Performance With StarPU}
-\label{HowToOptimizePerformanceWithStarPU}
-\hypertarget{HowToOptimizePerformanceWithStarPU}{}
-\input{HowToOptimizePerformanceWithStarPU}
+\chapter{Check List When Performance Are Not There}
+\label{CheckListWhenPerformanceAreNotThere}
+\hypertarget{CheckListWhenPerformanceAreNotThere}{}
+\input{CheckListWhenPerformanceAreNotThere}
+
+\part{StarPU Inside}
+
+\chapter{Tasks In StarPU}
+\label{TasksInStarPU}
+\hypertarget{TasksInStarPU}{}
+\input{TasksInStarPU}
+
+\chapter{Data Management}
+\label{DataManagement}
+\hypertarget{DataManagement}{}
+\input{DataManagement}
+
+\chapter{Scheduling}
+\label{Scheduling}
+\hypertarget{Scheduling}{}
+\input{Scheduling}
+
+\chapter{Scheduling Contexts}
+\label{SchedulingContexts}
+\hypertarget{SchedulingContexts}{}
+\input{SchedulingContexts}
+
+\chapter{Scheduling Context Hypervisor}
+\label{SchedulingContextHypervisor}
+\hypertarget{SchedulingContextHypervisor}{}
+\input{SchedulingContextHypervisor}
+
+\chapter{Debugging Tools}
+\label{DebuggingTools}
+\hypertarget{DebuggingTools}{}
+\input{DebuggingTools}
+
+\chapter{Online Performance Tools}
+\label{OnlinePerformanceTools}
+\hypertarget{OnlinePerformanceTools}{}
+\input{OnlinePerformanceTools}
+
+\chapter{Offline Performance Tools}
+\label{OfflinePerformanceTools}
+\hypertarget{OfflinePerformanceTools}{}
+\input{OfflinePerformanceTools}
 
-\chapter{Performance Feedback}
-\label{PerformanceFeedback}
-\hypertarget{PerformanceFeedback}{}
-\input{PerformanceFeedback}
+\chapter{Frequently Asked Questions}
+\label{FrequentlyAskedQuestions}
+\hypertarget{FrequentlyAskedQuestions}{}
+\input{FrequentlyAskedQuestions}
 
-\chapter{Tips and Tricks To Know About}
-\label{TipsAndTricksToKnowAbout}
-\hypertarget{TipsAndTricksToKnowAbout}{}
-\input{TipsAndTricksToKnowAbout}
+\part{StarPU Extensions}
 
 \chapter{Out Of Core}
 \label{OutOfCore}
 \hypertarget{OutOfCore}{}
 \input{OutOfCore}
 
-
-
 \chapter{MPI Support}
 \label{MPISupport}
 \hypertarget{MPISupport}{}
 \hypertarget{MPISupport}{}
@@ -158,17 +197,12 @@ Documentation License”.
 \hypertarget{SOCLOpenclExtensions}{}
 \hypertarget{SOCLOpenclExtensions}{}
 \input{SOCLOpenclExtensions}
 \input{SOCLOpenclExtensions}
 
 
-\chapter{Scheduling Contexts}
-\label{SchedulingContexts}
-\hypertarget{SchedulingContexts}{}
-\input{SchedulingContexts}
-
-\chapter{Scheduling Context Hypervisor}
-\label{SchedulingContextHypervisor}
-\hypertarget{SchedulingContextHypervisor}{}
-\input{SchedulingContextHypervisor}
+\chapter{SimGrid Support}
+\label{SimGridSupport}
+\hypertarget{SimGridSupport}{}
+\input{SimGridSupport}
 
 
-\part{Inside StarPU}
+\part{StarPU Reference API}
 
 
 \chapter{Execution Configuration Through Environment Variables}
 \chapter{Execution Configuration Through Environment Variables}
 \label{ExecutionConfigurationThroughEnvironmentVariables}
 \label{ExecutionConfigurationThroughEnvironmentVariables}
@@ -277,10 +311,6 @@ Documentation License”.
 \hypertarget{deprecated}{}
 \hypertarget{deprecated}{}
 \input{deprecated}
 \input{deprecated}
 
 
-
-\addcontentsline{toc}{chapter}{Index}
-\printindex
-
 \part{Appendix}
 
 \chapter{Full Source Code for the ’Scaling a Vector’ Example}
@@ -293,4 +323,8 @@ Documentation License”.
 \hypertarget{GNUFreeDocumentationLicense}{}
 \input{GNUFreeDocumentationLicense}
 
+\part{Index}
+\addcontentsline{toc}{chapter}{Index}
+\printindex
+
 \end{document}