/*
 * This file is part of the StarPU Handbook.
 * Copyright (C) 2009--2011 Université de Bordeaux 1
 * Copyright (C) 2010, 2011, 2012, 2013 Centre National de la Recherche Scientifique
 * Copyright (C) 2011, 2012 Institut National de Recherche en Informatique et Automatique
 * See the file version.doxy for copying conditions.
 */

/*! \page HowToOptimizePerformanceWithStarPU How To Optimize Performance With StarPU

Simply encapsulating application kernels into tasks already makes it possible
to support CPUs and GPUs seamlessly at the same time. To achieve good
performance, a few additional changes are needed.

\section DataManagement Data Management

When the application allocates data, whenever possible it should use the
function starpu_malloc(), which will ask CUDA or OpenCL to make the allocation
itself and pin the corresponding allocated memory. This is needed to permit
asynchronous data transfers, i.e. to allow data transfers to overlap with
computations. Otherwise, the trace will show that the <c>DriverCopyAsync</c>
state takes a lot of time: this is because CUDA or OpenCL then reverts to
synchronous transfers.

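For instance, a minimal sketch (<c>NX</c> and the variable names are
illustrative):

\code{.c}
float *vector;
starpu_data_handle_t vector_handle;

/* Allocate pinned memory through CUDA/OpenCL, so that later
 * transfers to and from the devices can be asynchronous. */
starpu_malloc((void **)&vector, NX * sizeof(vector[0]));

/* ... fill vector, then register it and submit tasks ... */
starpu_vector_data_register(&vector_handle, 0, (uintptr_t)vector,
                            NX, sizeof(vector[0]));

/* ... */

starpu_data_unregister(vector_handle);
starpu_free(vector);
\endcode
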
By default, StarPU leaves replicates of data wherever they were used, in case
they will be re-used by other tasks, thus saving the data transfer time. When
some task modifies some data, all the other replicates are invalidated, and
only the processing unit which ran that task will have a valid replicate of
the data. If the application knows that this data will not be re-used by
further tasks, it should advise StarPU to immediately replicate it to a
desired list of memory nodes (given through a bitmask). This can be understood
as the write-through mode of CPU caches.

\code{.c}
starpu_data_set_wt_mask(img_handle, 1<<0);
\endcode

will for instance request to always automatically transfer a replicate into
the main memory (node <c>0</c>), as bit <c>0</c> of the write-through bitmask
is set.

\code{.c}
starpu_data_set_wt_mask(img_handle, ~0U);
\endcode

will request to always automatically broadcast the updated data to all memory
nodes.

Setting the write-through mask to <c>~0U</c> can also be useful to make sure
all memory nodes always have a copy of the data, so that it is never evicted
when memory gets scarce.

Implicit data dependency computation can become expensive if a lot of tasks
access the same piece of data. If no dependency is required on some piece of
data (e.g. because it is only accessed in read-only mode, or because write
accesses are actually commutative), use the function
starpu_data_set_sequential_consistency_flag() to disable implicit dependencies
on that data.

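For instance, a sketch for a handle that further tasks only read
(<c>stats_handle</c> is illustrative):

\code{.c}
/* No implicit dependencies will be computed for this data any more;
 * the application then has to take care of the ordering of accesses. */
starpu_data_set_sequential_consistency_flag(stats_handle, 0);
\endcode
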
In the same vein, accumulation of results in the same data can become a
bottleneck. The use of the mode ::STARPU_REDUX makes it possible to optimize
such accumulation (see \ref DataReduction). To a lesser extent, the use of the
flag ::STARPU_COMMUTE keeps the bottleneck, but at least permits the
accumulation to happen in any order.

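As a sketch, an accumulation codelet can relax the ordering of its write
accesses by adding ::STARPU_COMMUTE to its access mode (the codelet is
illustrative):

\code{.c}
struct starpu_codelet accumulate_cl = {
	.cpu_funcs = { accumulate_cpu_func, NULL },
	.nbuffers = 2,
	/* Accumulations into the first buffer may happen in any order,
	 * which avoids serializing the tasks on their submission order. */
	.modes = { STARPU_RW|STARPU_COMMUTE, STARPU_R },
};
\endcode
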
Applications often need a piece of data just for temporary results. In such a
case, registration can be made without an initial value; for instance, this
registers a vector without providing an initial buffer:

\code{.c}
starpu_vector_data_register(&handle, -1, 0, n, sizeof(float));
\endcode

StarPU will then allocate the actual buffer only when it is actually needed,
e.g. directly on the GPU without allocating in main memory.

In the same vein, once the temporary results are not useful any more, the
data should be thrown away. If the handle is not to be reused, it can be
unregistered:

\code{.c}
starpu_data_unregister_submit(handle);
\endcode

actual unregistration will be done after all tasks working on the handle
terminate.

If the handle is to be reused, instead of unregistering it, it can simply be
invalidated:

\code{.c}
starpu_data_invalidate_submit(handle);
\endcode

the buffers containing the current value will then be freed, and reallocated
only when another task writes some value to the handle.

\section TaskGranularity Task Granularity

Like any other runtime, StarPU has some overhead for managing tasks. Since it
does smart scheduling and data management, that overhead is not always
negligible. Its order of magnitude is typically a couple of microseconds,
which is actually notably smaller than the CUDA overhead itself. The amount of
work performed by a task should thus be somewhat larger, to make sure that the
overhead becomes negligible. The offline performance feedback can provide a
measure of task length, which should be checked if poor performance is
observed. To get an idea of the scalability achievable depending on task size,
one can run <c>tests/microbenchs/tasks_size_overhead.sh</c>, which draws
curves of the speedup of independent tasks of very small sizes.

The choice of scheduler also has an impact on the overhead: for instance, the
scheduler <c>dmda</c> takes time to make a decision, while <c>eager</c> does
not. <c>tasks_size_overhead.sh</c> can again be used to get an idea of how
much impact that has on the target machine.

\section TaskSubmission Task Submission

To let StarPU make online optimizations, tasks should be submitted
asynchronously as much as possible. Ideally, all the tasks should be
submitted, with mere calls to starpu_task_wait_for_all() or
starpu_data_unregister() made to wait for termination. StarPU will then be
able to rework the whole schedule, overlap computation with communication,
manage accelerator local memory usage, etc.

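A sketch of this pattern (<c>cl</c>, <c>handles</c> and <c>ntasks</c> are
illustrative):

\code{.c}
int i, ret;

/* Submit the whole task graph asynchronously... */
for (i = 0; i < ntasks; i++)
{
	struct starpu_task *task = starpu_task_create();
	task->cl = &cl;
	task->handles[0] = handles[i];
	ret = starpu_task_submit(task);
	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
}

/* ... and wait for termination only at the very end. */
starpu_task_wait_for_all();
\endcode
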
\section TaskPriorities Task Priorities

By default, StarPU will consider the tasks in the order in which they are
submitted by the application. If the application programmer knows that some
tasks should be given a higher priority (for instance because their output is
needed by many other tasks and may thus be a bottleneck if not executed early
enough), the field starpu_task::priority should be set to transmit the
priority information to StarPU.

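For instance, a sketch marking a task as urgent:

\code{.c}
struct starpu_task *task = starpu_task_create();
task->cl = &cl;
/* Priorities range from STARPU_MIN_PRIO to STARPU_MAX_PRIO;
 * schedulers which do not support priorities ignore the field. */
task->priority = STARPU_MAX_PRIO;
starpu_task_submit(task);
\endcode
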
\section TaskSchedulingPolicy Task Scheduling Policy

By default, StarPU uses the simple greedy scheduler <c>eager</c>. This is
because it provides correct load balance even if the application codelets do
not have performance models. If your application codelets have performance
models (\ref PerformanceModelExample), you should change the scheduler through
the environment variable \ref STARPU_SCHED, for instance <c>export
STARPU_SCHED=dmda</c>. Use <c>STARPU_SCHED=help</c> to get the list of
available schedulers.

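The scheduler can also be selected from the source code; a sketch, using the
field starpu_conf::sched_policy_name:

\code{.c}
struct starpu_conf conf;
starpu_conf_init(&conf);
/* Select the policy by name, equivalent to setting
 * STARPU_SCHED=dmda in the environment. */
conf.sched_policy_name = "dmda";
starpu_init(&conf);
\endcode
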
The <b>eager</b> scheduler uses a central task queue, from which workers draw
tasks to work on. This however does not allow prefetching data, since the
scheduling decision is taken late. If a task has a non-zero priority, it is
put at the front of the queue.

The <b>prio</b> scheduler also uses a central task queue, but sorts tasks by
priority (between -5 and 5).

The <b>random</b> scheduler distributes tasks randomly according to assumed
worker overall performance.

The <b>ws</b> (work stealing) scheduler schedules tasks on the local worker by
default. When a worker becomes idle, it steals a task from the most loaded
worker.

The <b>dm</b> (deque model) scheduler takes task execution performance models
into account to implement an HEFT-like scheduling strategy: it schedules tasks
where their termination time will be minimal.

The <b>dmda</b> (deque model data aware) scheduler is similar to <b>dm</b>,
but it also takes data transfer time into account.

The <b>dmdar</b> (deque model data aware ready) scheduler is similar to
<b>dmda</b>, but it also sorts tasks on per-worker queues by the number of
already-available data buffers.

The <b>dmdas</b> (deque model data aware sorted) scheduler is similar to
<b>dmda</b>, but it also supports arbitrary priority values.

The <b>heft</b> (heterogeneous earliest finish time) scheduler is deprecated.
It is now just an alias for <b>dmda</b>.

The <b>pheft</b> (parallel HEFT) scheduler is similar to <b>heft</b>, but it
also supports parallel tasks (still experimental). It should not be used when
several scheduling contexts using it are being executed simultaneously.

The <b>peager</b> (parallel eager) scheduler is similar to <b>eager</b>, but
it also supports parallel tasks (still experimental). It should not be used
when several scheduling contexts using it are being executed simultaneously.

\section PerformanceModelCalibration Performance Model Calibration

Most schedulers are based on an estimation of codelet duration on each kind of
processing unit. For this to be possible, the application programmer needs to
configure a performance model for the codelets of the application (see
\ref PerformanceModelExample for instance). History-based performance models
use on-line calibration. StarPU will automatically calibrate codelets
which have never been calibrated yet, and save the result in
<c>$STARPU_HOME/.starpu/sampling/codelets</c>.
The models are indexed by machine name. To share the models between
machines (e.g. for a homogeneous cluster), use <c>export
STARPU_HOSTNAME=some_global_name</c>. To force continuing calibration, use
<c>export STARPU_CALIBRATE=1</c>. This may be necessary if your application
has not-so-stable performance. StarPU will force calibration (and thus ignore
the current result) until 10 (<c>_STARPU_CALIBRATION_MINIMUM</c>) measurements
have been made on each architecture, to avoid scheduling tasks badly just
because the first measurements were not so good. Details on the current
performance model status can be obtained with the command
<c>starpu_perfmodel_display</c>: the <c>-l</c> option lists the available
performance models, and the <c>-s</c> option permits choosing the performance
model to be displayed. The result looks like:

\verbatim
$ starpu_perfmodel_display -s starpu_slu_lu_model_11
performance model for cpu_impl_0
# hash      size       flops         mean          dev           n
914f3bef    1048576    0.000000e+00  2.503577e+04  1.982465e+02  8
3e921964    65536      0.000000e+00  5.527003e+02  1.848114e+01  7
e5a07e31    4096       0.000000e+00  1.717457e+01  5.190038e+00  14
...
\endverbatim

This shows that for the LU 11 kernel with a 1MiB matrix, the average execution
time on CPUs was about 25ms, with a 0.2ms standard deviation, over 8 samples.
It is a good idea to check this before doing actual performance measurements.

A graph can be drawn by using the tool <c>starpu_perfmodel_plot</c>:

\verbatim
$ starpu_perfmodel_plot -s starpu_slu_lu_model_11
4096 16384 65536 262144 1048576 4194304
$ gnuplot starpu_starpu_slu_lu_model_11.gp
$ gv starpu_starpu_slu_lu_model_11.eps
\endverbatim

\image html starpu_starpu_slu_lu_model_11.png
\image latex starpu_starpu_slu_lu_model_11.eps "" width=\textwidth

If a kernel source code was modified (e.g. performance improvement), the
calibration information is stale and should be dropped, so as to re-calibrate
from scratch. This can be done by using <c>export STARPU_CALIBRATE=2</c>.

Note: due to CUDA limitations, to be able to measure kernel duration,
calibration mode needs to disable asynchronous data transfers. Calibration
thus disables data transfer / computation overlapping, and should thus not be
used for the final benchmarks. Note 2: history-based performance models get
calibrated only if a performance-model-based scheduler is chosen.

The history-based performance models can also be explicitly filled by the
application without execution, if e.g. the application already has a series of
measurements. This can be done by using starpu_perfmodel_update_history(),
for instance:

\code{.c}
static struct starpu_perfmodel perf_model = {
	.type = STARPU_HISTORY_BASED,
	.symbol = "my_perfmodel",
};

struct starpu_codelet cl = {
	.where = STARPU_CUDA,
	.cuda_funcs = { cuda_func1, cuda_func2, NULL },
	.nbuffers = 1,
	.modes = {STARPU_W},
	.model = &perf_model
};

void feed(void) {
	struct my_measure *measure;
	struct starpu_task task;
	starpu_task_init(&task);

	task.cl = &cl;

	for (measure = &measures[0]; measure < &measures[last]; measure++) {
		starpu_data_handle_t handle;
		/* Register a data of the measured size, without allocating an
		 * actual buffer (home node -1), just to describe the input. */
		starpu_vector_data_register(&handle, -1, 0, measure->size, sizeof(float));
		task.handles[0] = handle;
		/* Feed the measured duration into the history-based model. */
		starpu_perfmodel_update_history(&perf_model, &task,
				STARPU_CUDA_DEFAULT + measure->cudadev, 0,
				measure->implementation, measure->time);
		starpu_task_clean(&task);
		starpu_data_unregister(handle);
	}
}
\endcode

Measurement has to be provided in milliseconds for the completion time models,
and in Joules for the energy consumption models.

\section TaskDistributionVsDataTransfer Task Distribution Vs Data Transfer

Distributing tasks to balance the load induces a data transfer penalty. StarPU
thus needs to find a balance between both. The target function that the
scheduler <c>dmda</c> of StarPU tries to minimize is <c>alpha * T_execution +
beta * T_data_transfer</c>, where <c>T_execution</c> is the estimated
execution time of the codelet (usually accurate), and <c>T_data_transfer</c>
is the estimated data transfer time. The latter is estimated based on bus
calibration before execution start, i.e. with an idle machine, thus without
contention. You can force bus re-calibration by running the tool
<c>starpu_calibrate_bus</c>. The <c>beta</c> parameter defaults to <c>1</c>,
but it can be worth trying to tweak it by using <c>export
STARPU_SCHED_BETA=2</c> for instance, since during real application execution,
contention makes transfer times bigger. This is of course imprecise, but in
practice, a rough estimation already gives results as good as a precise
estimation would.

\section DataPrefetch Data Prefetch

The scheduling policies <c>heft</c>, <c>dmda</c> and <c>pheft</c> perform data
prefetch (see \ref STARPU_PREFETCH): as soon as a scheduling decision is taken
for a task, requests are issued to transfer its required data to the target
processing unit, if needed, so that when the processing unit actually starts
the task, its data will hopefully already be available and it will not have to
wait for the transfer to finish.

The application may want to perform some manual prefetching, for several
reasons such as excluding initial data transfers from performance
measurements, or setting up an initial statically-computed data distribution
on the machine before submitting tasks, which will thus guide StarPU toward an
initial task distribution (since StarPU will try to avoid further transfers).

This can be achieved by giving the function starpu_data_prefetch_on_node() the
handle and the desired target memory node.

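A sketch, prefetching a handle to the memory node of a given worker
(<c>handle</c> is illustrative):

\code{.c}
/* Asynchronously transfer the data to the memory node of worker 0,
 * typically before submitting the tasks which will run there. */
unsigned node = starpu_worker_get_memory_node(0);
starpu_data_prefetch_on_node(handle, node, 1);
\endcode
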
\section Power-basedScheduling Power-based Scheduling

If the application can provide some power performance model (through the field
starpu_codelet::power_model), StarPU will take it into account when
distributing tasks. The target function that the scheduler <c>dmda</c>
minimizes then becomes <c>alpha * T_execution + beta * T_data_transfer + gamma
* Consumption</c>, where <c>Consumption</c> is the estimated task consumption
in Joules. To tune this parameter, use <c>export STARPU_SCHED_GAMMA=3000</c>
for instance, to express that each Joule (i.e. kW during 1000 us) is worth
3000 us of execution time penalty. Setting <c>alpha</c> and <c>beta</c> to
zero permits taking only power consumption into account.

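A sketch of attaching such a model to a codelet, next to the usual execution
time model (the symbols and <c>time_model</c> are illustrative):

\code{.c}
static struct starpu_perfmodel power_model = {
	.type = STARPU_HISTORY_BASED,
	/* Measurements for this model are expressed in Joules. */
	.symbol = "my_codelet_power",
};

struct starpu_codelet cl = {
	.cpu_funcs = { cpu_func, NULL },
	.nbuffers = 1,
	.modes = { STARPU_RW },
	.model = &time_model,
	.power_model = &power_model,
};
\endcode
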
This is however not sufficient to correctly optimize power: the scheduler
would simply tend to run all computations on the most energy-conservative
processing unit. To account for the consumption of the whole machine
(including idle processing units), the idle power of the machine should be
given by setting <c>export STARPU_IDLE_POWER=200</c> for 200W, for instance.
This value can often be obtained from the machine power supplier.

The power actually consumed by the total execution can be displayed by setting
<c>export STARPU_PROFILING=1 STARPU_WORKER_STATS=1</c>.

On-line task consumption measurement is currently only supported through the
<c>CL_PROFILING_POWER_CONSUMED</c> OpenCL extension, implemented in the
MoviSim simulator. Applications can however provide explicit measurements by
using the function starpu_perfmodel_update_history() (exemplified in \ref
PerformanceModelExample with the <c>power_model</c> performance model).
Fine-grain measurement is often not feasible with the feedback provided by the
hardware, so the user can for instance run a given task a thousand times,
measure the global consumption for that series of tasks, divide it by a
thousand, repeat for varying kinds of tasks and task sizes, and eventually
feed StarPU with these manual measurements through
starpu_perfmodel_update_history().

\section StaticScheduling Static Scheduling

In some cases, one may want to force some scheduling, for instance force a
given set of tasks to GPU0, another set to GPU1, etc. while letting some other
tasks be scheduled on any other device. This can indeed be useful to guide
StarPU into some work distribution, while still keeping some degree of
dynamism. For instance, to force execution of a task on CUDA0:

\code{.c}
task->execute_on_a_specific_worker = 1;
task->workerid = starpu_worker_get_by_type(STARPU_CUDA_WORKER, 0);
\endcode

Note however that using scheduling contexts while statically scheduling tasks
on workers could be tricky. Be careful to schedule the tasks exactly on the
workers of the corresponding contexts, otherwise the workers' corresponding
scheduling structures may not be allocated, or the execution of the
application may deadlock. Moreover, the hypervisor should not be used when
statically scheduling tasks.

\section Profiling Profiling

A quick view of how many tasks each worker has executed can be obtained by
setting <c>export STARPU_WORKER_STATS=1</c>. This is a convenient way to check
that execution did happen on accelerators, without penalizing performance with
the profiling overhead.

A quick view of how many data transfers have been issued can be obtained by
setting <c>export STARPU_BUS_STATS=1</c>.

More detailed profiling information can be enabled by using <c>export
STARPU_PROFILING=1</c> or by calling starpu_profiling_status_set() from the
source code. Statistics on the execution can then be obtained by using
<c>export STARPU_BUS_STATS=1</c> and <c>export STARPU_WORKER_STATS=1</c>.

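A sketch of enabling profiling programmatically:

\code{.c}
/* Enable profiling before submitting the tasks... */
starpu_profiling_status_set(STARPU_PROFILING_ENABLE);

/* ... submit and wait for the tasks ... */

/* ... then disable it again. */
starpu_profiling_status_set(STARPU_PROFILING_DISABLE);
\endcode
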
More details on performance feedback are provided in the next chapter.

\section DetectionStuckConditions Detecting Stuck Conditions

It may happen that, for some reason, StarPU does not make progress for a long
period of time. The cause is sometimes contention inside StarPU, but sometimes
it is external, such as a stuck MPI or CUDA driver, etc.

<c>export STARPU_WATCHDOG_TIMEOUT=10000</c>

makes StarPU print an error message whenever it has not completed any task for
10ms (the timeout value is expressed in us). In addition to that,

<c>export STARPU_WATCHDOG_CRASH=1</c>

triggers a crash in that condition, thus making it possible to catch the
situation in gdb, etc.

\section CUDA-specificOptimizations CUDA-specific Optimizations

Due to CUDA limitations, StarPU will have a hard time overlapping its own
communications and the codelet computations if the application does not use a
dedicated CUDA stream for its computations instead of the default stream,
which synchronizes all operations of the GPU. StarPU provides one through
starpu_cuda_get_local_stream(), which should be used by all CUDA codelet
operations to avoid this issue. For instance:

\code{.c}
func <<<grid,block,0,starpu_cuda_get_local_stream()>>> (foo, bar);
cudaStreamSynchronize(starpu_cuda_get_local_stream());
\endcode

StarPU already makes the appropriate calls for the CUBLAS library.

Unfortunately, some CUDA libraries do not have stream variants of kernels.
That will lower the potential for overlapping.

\section PerformanceDebugging Performance Debugging

To get an idea of what is happening, a lot of performance feedback is
available, as detailed in the next chapter. The following points should be
checked:

<ul>
<li>
What does the Gantt diagram look like? (see \ref CreatingAGanttDiagram)
<ul>
  <li> If it is mostly green (tasks running in the initial context), or a
  context-specific color is prevailing, then the machine is properly
  utilized, and perhaps the codelets are just slow. Check their performance,
  see \ref PerformanceOfCodelets.
  </li>
  <li> If it is mostly purple (FetchingInput), tasks keep waiting for data
  transfers; do you perhaps have far more communication than computation? Did
  you properly use CUDA streams to make sure communication can be
  overlapped? Did you use data-locality aware schedulers to avoid transfers as
  much as possible?
  </li>
  <li> If it is mostly red (Blocked), tasks keep waiting for dependencies;
  do you have enough parallelism? It might be a good idea to check what the
  DAG looks like (see \ref CreatingADAGWithGraphviz).
  </li>
  <li> If only some workers are completely red (Blocked), for some reason the
  scheduler did not assign tasks to them. Perhaps the performance model is
  bogus; check it (see \ref PerformanceOfCodelets). Do all your codelets have
  a performance model? When some of them do not, the scheduler switches to a
  greedy algorithm, which thus performs badly.
  </li>
</ul>
</li>
</ul>

You can also use the Temanejo task debugger (see \ref
UsingTheTemanejoTaskDebugger) to visualize the task graph more easily.

\section SimulatedPerformance Simulated Performance

StarPU can use Simgrid in order to simulate execution on an arbitrary
platform.

\subsection Calibration Calibration

The idea is to first compile StarPU normally, and run the application, so as
to automatically benchmark the bus and the codelets.

\verbatim
$ ./configure && make
$ STARPU_SCHED=dmda ./examples/matvecmult/matvecmult
[starpu][_starpu_load_history_based_model] Warning: model matvecmult
   is not calibrated, forcing calibration for this run. Use the
   STARPU_CALIBRATE environment variable to control this.
$ ...
$ STARPU_SCHED=dmda ./examples/matvecmult/matvecmult
TEST PASSED
\endverbatim

Note that we force the use of the scheduler <c>dmda</c> to generate
performance models for the application. The application may need to be run
several times before the model is calibrated.

\subsection Simulation Simulation

Then, recompile StarPU, passing \ref enable-simgrid "--enable-simgrid" to
<c>./configure</c>, and re-run the application:

\verbatim
$ ./configure --enable-simgrid && make
$ STARPU_SCHED=dmda ./examples/matvecmult/matvecmult
TEST FAILED !!!
\endverbatim

It is normal that the test fails: since the computations are not actually
performed (that is the whole point of Simgrid), the result is of course wrong.

If the performance model is not calibrated enough, the following error message
will be displayed:

\verbatim
$ STARPU_SCHED=dmda ./examples/matvecmult/matvecmult
[starpu][_starpu_load_history_based_model] Warning: model matvecmult
   is not calibrated, forcing calibration for this run. Use the
   STARPU_CALIBRATE environment variable to control this.
[starpu][_starpu_simgrid_execute_job][assert failure] Codelet
   matvecmult does not have a perfmodel, or is not calibrated enough
\endverbatim

The number of devices can be chosen as usual with \ref STARPU_NCPU, \ref
STARPU_NCUDA, and \ref STARPU_NOPENCL. For now, only the number of CPUs can be
chosen arbitrarily; the number of CUDA and OpenCL devices has to be lower than
the real number on the current machine.

The amount of simulated GPU memory is for now unbounded by default, but it can
be chosen by hand through the \ref STARPU_LIMIT_CUDA_MEM, \ref
STARPU_LIMIT_CUDA_devid_MEM, \ref STARPU_LIMIT_OPENCL_MEM, and \ref
STARPU_LIMIT_OPENCL_devid_MEM environment variables.

The Simgrid default stack size is small; to increase it use the parameter
<c>--cfg=contexts/stack_size</c>, for example:

\verbatim
$ ./example --cfg=contexts/stack_size:8192
TEST FAILED !!!
\endverbatim

Note: of course, if the application uses <c>gettimeofday</c> to make its
performance measurements, the real time will be used, which will be bogus. To
get the simulated time, it has to use starpu_timing_now(), which returns the
virtual timestamp in us.

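A sketch of timing a phase with the virtual clock:

\code{.c}
double start, end;

start = starpu_timing_now();
/* ... submit tasks ... */
starpu_task_wait_for_all();
end = starpu_timing_now();

/* Under simgrid, this is simulated time, not wall-clock time. */
fprintf(stderr, "elapsed: %f us\n", end - start);
\endcode
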
\subsection SimulationOnAnotherMachine Simulation On Another Machine

The simgrid support even permits performing simulations on another machine,
typically your desktop. To achieve this, one still needs to perform the
Calibration step on the actual machine to be simulated, then copy the
resulting performance models (the <c>$STARPU_HOME/.starpu</c> directory) to
your desktop machine. One can then perform the Simulation step on the desktop
machine, by setting the environment variable \ref STARPU_HOSTNAME to the name
of the actual machine, to make StarPU use the performance models of the
simulated machine even on the desktop machine.

If the desktop machine does not have CUDA or OpenCL, StarPU is still able to
use simgrid to simulate execution with CUDA/OpenCL devices, but the
application source code will probably disable the CUDA and OpenCL codelets in
that case. Since during simgrid execution the functions of the codelet are not
actually called, one can use dummy functions such as the following to still
permit CUDA or OpenCL execution:

\snippet simgrid.c To be included. You should update doxygen if you see this text.

*/