
doc: modify chapters outline; we are trying here to divide the whole documentation into distinct self-readable parts

Nathalie Furmento, 11 years ago
commit 4839a2cfce
28 changed files with 2230 additions and 2080 deletions
  1. doc/doxygen/Makefile.am (+24 -18)
  2. doc/doxygen/chapters/00introduction.doxy (+75 -12)
  3. doc/doxygen/chapters/01building.doxy (+1 -1)
  4. doc/doxygen/chapters/03advanced_examples.doxy (+2 -1234)
  5. doc/doxygen/chapters/04optimize_performance.doxy (+0 -552)
  6. doc/doxygen/chapters/05check_list_performance.doxy (+204 -0)
  7. doc/doxygen/chapters/06tasks.doxy (+443 -0)
  8. doc/doxygen/chapters/07data_management.doxy (+508 -0)
  9. doc/doxygen/chapters/08scheduling.doxy (+151 -0)
  10. doc/doxygen/chapters/09scheduling_contexts.doxy (+0 -0)
  11. doc/doxygen/chapters/10scheduling_context_hypervisor.doxy (+0 -0)
  12. doc/doxygen/chapters/11debugging_tools.doxy (+42 -0)
  13. doc/doxygen/chapters/12online_performance_tools.doxy (+432 -0)
  14. doc/doxygen/chapters/05performance_feedback.doxy (+80 -212)
  15. doc/doxygen/chapters/06tips_and_tricks.doxy (+100 -21)
  16. doc/doxygen/chapters/15out_of_core.doxy (+0 -0)
  17. doc/doxygen/chapters/16mpi_support.doxy (+0 -0)
  18. doc/doxygen/chapters/17fft_support.doxy (+0 -0)
  19. doc/doxygen/chapters/18mic_scc_support.doxy (+0 -0)
  20. doc/doxygen/chapters/19c_extensions.doxy (+0 -0)
  21. doc/doxygen/chapters/20socl_opencl_extensions.doxy (+0 -0)
  22. doc/doxygen/chapters/21simgrid.doxy (+104 -0)
  23. doc/doxygen/chapters/40environment_variables.doxy (+0 -0)
  24. doc/doxygen/chapters/41configure_options.doxy (+0 -0)
  25. doc/doxygen/chapters/45files.doxy (+0 -0)
  26. doc/doxygen/chapters/50scaling-vector-example.doxy (+0 -0)
  27. doc/doxygen/chapters/51fdl-1.3.doxy (+0 -0)
  28. doc/doxygen/refman.tex (+64 -30)

doc/doxygen/Makefile.am (+24 -18)

@@ -28,22 +28,28 @@ chapters =	\
 	chapters/01building.doxy \
 	chapters/02basic_examples.doxy \
 	chapters/03advanced_examples.doxy \
-	chapters/04optimize_performance.doxy \
-	chapters/05performance_feedback.doxy \
-	chapters/06tips_and_tricks.doxy \
-	chapters/07out_of_core.doxy \
-	chapters/08mpi_support.doxy \
-	chapters/09fft_support.doxy \
-	chapters/10mic_scc_support.doxy \
-	chapters/11c_extensions.doxy \
-	chapters/12socl_opencl_extensions.doxy \
-	chapters/13scheduling_contexts.doxy \
-	chapters/14scheduling_context_hypervisor.doxy \
-	chapters/15environment_variables.doxy \
-	chapters/16configure_options.doxy \
-	chapters/17files.doxy \
-	chapters/18scaling-vector-example.doxy \
-	chapters/19fdl-1.3.doxy \
+	chapters/05check_list_performance.doxy \
+	chapters/06tasks.doxy \
+	chapters/07data_management.doxy \
+	chapters/08scheduling.doxy \
+	chapters/09scheduling_contexts.doxy \
+	chapters/10scheduling_context_hypervisor.doxy \
+	chapters/11debugging_tools.doxy \
+	chapters/12online_performance_tools.doxy \
+	chapters/13offline_performance_tools.doxy \
+	chapters/14faq.doxy \
+	chapters/15out_of_core.doxy \
+	chapters/16mpi_support.doxy \
+	chapters/17fft_support.doxy \
+	chapters/18mic_scc_support.doxy \
+	chapters/19c_extensions.doxy \
+	chapters/20socl_opencl_extensions.doxy \
+	chapters/21simgrid.doxy \
+	chapters/40environment_variables.doxy \
+	chapters/41configure_options.doxy \
+	chapters/45files.doxy \
+	chapters/50scaling-vector-example.doxy \
+	chapters/51fdl-1.3.doxy \
 	chapters/code/hello_pragma2.c \
 	chapters/code/hello_pragma.c \
 	chapters/code/scal_pragma.cu \
@@ -218,8 +224,8 @@ $(DOX_TAG): $(dox_inputs)
 	$(DOXYGEN) $(DOX_CONFIG)
 	sed -i 's/ModuleDocumentation <\/li>/<a class="el" href="modules.html">Modules<\/a>/' html/index.html
 	sed -i 's/FileDocumentation <\/li>/<a class="el" href="files.html">Files<\/a>/' html/index.html
-        # comment for the line above: what we really want to do is to remove the line, but dy doing so, it avoids opening the interactive menu when browsing files
-	if test -f html/navtree.js ; then sed -i 's/\[ "Files", "Files.html", null \]/\[ "", "Files.html", null \]/' html/navtree.js ; fi
+        # comment for the line below: what we really want to do is to remove the line, but by doing so, it avoids opening the interactive menu when browsing files
+#	if test -f html/navtree.js ; then sed -i 's/\[ "Files", "Files.html", null \]/\[ "", "Files.html", null \]/' html/navtree.js ; fi
 	sed -i 's/.*"Files.html".*//' html/pages.html
 	if test -f latex/main.tex ; then mv latex/main.tex latex/index.tex ; fi
 

doc/doxygen/chapters/00introduction.doxy (+75 -12)

@@ -1,7 +1,7 @@
 /*
  * This file is part of the StarPU Handbook.
  * Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
  * Copyright (C) 2011, 2012 Institut National de Recherche en Informatique et Automatique
  * See the file version.doxy for copying conditions.
 */
@@ -184,30 +184,94 @@ http://runtime.bordeaux.inria.fr/Publis/Keyword/STARPU.html.
 A good overview is available in the research report at
 http://hal.archives-ouvertes.fr/inria-00467677.
 
+\section StarPUApplications StarPU Applications
+
+You can first have a look at the chapters \ref BasicExamples and \ref AdvancedExamples.
+A tutorial is also installed in the directory <c>share/doc/starpu/tutorial/</c>.
+
+Many examples are also available in the StarPU sources in the directory
+<c>examples/</c>. Simple examples include:
+
+<dl>
+<dt> <c>incrementer/</c> </dt>
+<dd> Trivial incrementation test. </dd>
+<dt> <c>basic_examples/</c> </dt>
+<dd>
+        Simple documented Hello world and vector/scalar product (as
+        shown in \ref BasicExamples), matrix
+        product examples (as shown in \ref PerformanceModelExample), an example using the blocked matrix data
+        interface, an example using the variable data interface, and an example
+        using different formats on CPUs and GPUs.
+</dd>
+<dt> <c>matvecmult/</c></dt>
+<dd>
+    OpenCL example from NVidia, adapted to StarPU.
+</dd>
+<dt> <c>axpy/</c></dt>
+<dd>
+    AXPY CUBLAS operation adapted to StarPU.
+</dd>
+<dt> <c>fortran/</c> </dt>
+<dd>
+    Example of Fortran bindings.
+</dd>
+</dl>
+
+More advanced examples include:
+
+<dl>
+<dt><c>filters/</c></dt>
+<dd>
+    Examples using filters, as shown in \ref PartitioningData.
+</dd>
+<dt><c>lu/</c></dt>
+<dd>
+    LU matrix factorization, see for instance <c>xlu_implicit.c</c>.
+</dd>
+<dt><c>cholesky/</c></dt>
+<dd>
+    Cholesky matrix factorization, see for instance <c>cholesky_implicit.c</c>.
+</dd>
+</dl>
+
 \section FurtherReading Further Reading
 
 The documentation chapters include
 
-<ol>
-<li> Part: Using StarPU
+<ul>
+<li> Part 1: StarPU Basics
 <ul>
 <li> \ref BuildingAndInstallingStarPU
 <li> \ref BasicExamples
+</ul>
+<li> Part 2: StarPU Quick Programming Guide
+<ul>
 <li> \ref AdvancedExamples
-<li> \ref HowToOptimizePerformanceWithStarPU
-<li> \ref PerformanceFeedback
-<li> \ref TipsAndTricksToKnowAbout
+<li> \ref CheckListWhenPerformanceAreNotThere
+</ul>
+<li> Part 3: StarPU Inside
+<ul>
+<li> \ref TasksInStarPU
+<li> \ref DataManagement
+<li> \ref Scheduling
+<li> \ref SchedulingContexts
+<li> \ref SchedulingContextHypervisor
+<li> \ref DebuggingTools
+<li> \ref OnlinePerformanceTools
+<li> \ref OfflinePerformanceTools
+<li> \ref FrequentlyAskedQuestions
+</ul>
+<li> Part 4: StarPU Extensions
+<ul>
 <li> \ref OutOfCore
 <li> \ref MPISupport
 <li> \ref FFTSupport
 <li> \ref MICSCCSupport
 <li> \ref cExtensions
 <li> \ref SOCLOpenclExtensions
-<li> \ref SchedulingContexts
-<li> \ref SchedulingContextHypervisor
+<li> \ref SimGridSupport
 </ul>
-</li>
-<li> Part: Inside StarPU
+<li> Part 5: StarPU Reference API
 <ul>
 <li> \ref ExecutionConfigurationThroughEnvironmentVariables
 <li> \ref CompilationConfiguration
@@ -220,8 +284,7 @@ The documentation chapters include
 <li> \ref FullSourceCodeVectorScal
 <li> \ref GNUFreeDocumentationLicense
 </ul>
-</ol>
-
+</ul>
 
 Make sure to have had a look at those too!
 

doc/doxygen/chapters/01building.doxy (+1 -1)

@@ -1,7 +1,7 @@
 /*
  * This file is part of the StarPU Handbook.
  * Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
  * Copyright (C) 2011, 2012 Institut National de Recherche en Informatique et Automatique
  * See the file version.doxy for copying conditions.
  */

doc/doxygen/chapters/03advanced_examples.doxy (+2 -1234)

File diff suppressed because it is too large


doc/doxygen/chapters/04optimize_performance.doxy (+0 -552)

@@ -1,552 +0,0 @@
-/*
- * This file is part of the StarPU Handbook.
- * Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
- * Copyright (C) 2011, 2012 Institut National de Recherche en Informatique et Automatique
- * See the file version.doxy for copying conditions.
- */
-
-/*! \page HowToOptimizePerformanceWithStarPU How To Optimize Performance With StarPU
-
-TODO: improve!
-
-Simply encapsulating application kernels into tasks already permits to
-seamlessly support CPU and GPUs at the same time. To achieve good performance, a
-few additional changes are needed.
-
-\section DataManagement Data Management
-
-When the application allocates data, whenever possible it should use
-the function starpu_malloc(), which will ask CUDA or OpenCL to make
-the allocation itself and pin the corresponding allocated memory. This
-is needed to permit asynchronous data transfer, i.e. permit data
-transfer to overlap with computations. Otherwise, the trace will show
-that the <c>DriverCopyAsync</c> state takes a lot of time, this is
-because CUDA or OpenCL then reverts to synchronous transfers.
-
-By default, StarPU leaves replicates of data wherever they were used, in case they
-will be re-used by other tasks, thus saving the data transfer time. When some
-task modifies some data, all the other replicates are invalidated, and only the
-processing unit which ran that task will have a valid replicate of the data. If the application knows
-that this data will not be re-used by further tasks, it should advise StarPU to
-immediately replicate it to a desired list of memory nodes (given through a
-bitmask). This can be understood like the write-through mode of CPU caches.
-
-\code{.c}
-starpu_data_set_wt_mask(img_handle, 1<<0);
-\endcode
-
-will for instance request to always automatically transfer a replicate into the
-main memory (node <c>0</c>), as bit <c>0</c> of the write-through bitmask is being set.
-
-\code{.c}
-starpu_data_set_wt_mask(img_handle, ~0U);
-\endcode
-
-will request to always automatically broadcast the updated data to all memory
-nodes.
-
-Setting the write-through mask to <c>~0U</c> can also be useful to make sure all
-memory nodes always have a copy of the data, so that it is never evicted when
-memory gets scarse.
-
-Implicit data dependency computation can become expensive if a lot
-of tasks access the same piece of data. If no dependency is required
-on some piece of data (e.g. because it is only accessed in read-only
-mode, or because write accesses are actually commutative), use the
-function starpu_data_set_sequential_consistency_flag() to disable
-implicit dependencies on that data.
-
-In the same vein, accumulation of results in the same data can become a
-bottleneck. The use of the mode ::STARPU_REDUX permits to optimize such
-accumulation (see \ref DataReduction). To a lesser extent, the use of
-the flag ::STARPU_COMMUTE keeps the bottleneck, but at least permits
-the accumulation to happen in any order.
-
-Applications often need a data just for temporary results.  In such a case,
-registration can be made without an initial value, for instance this produces a vector data:
-
-\code{.c}
-starpu_vector_data_register(&handle, -1, 0, n, sizeof(float));
-\endcode
-
-StarPU will then allocate the actual buffer only when it is actually needed,
-e.g. directly on the GPU without allocating in main memory.
-
-In the same vein, once the temporary results are not useful any more, the
-data should be thrown away. If the handle is not to be reused, it can be
-unregistered:
-
-\code{.c}
-starpu_unregister_submit(handle);
-\endcode
-
-actual unregistration will be done after all tasks working on the handle
-terminate.
-
-If the handle is to be reused, instead of unregistering it, it can simply be invalidated:
-
-\code{.c}
-starpu_invalidate_submit(handle);
-\endcode
-
-the buffers containing the current value will then be freed, and reallocated
-only when another task writes some value to the handle.
-
-\section TaskGranularity Task Granularity
-
-Like any other runtime, StarPU has some overhead to manage tasks. Since
-it does smart scheduling and data management, that overhead is not always
-neglectable. The order of magnitude of the overhead is typically a couple of
-microseconds, which is actually quite smaller than the CUDA overhead itself. The
-amount of work that a task should do should thus be somewhat
-bigger, to make sure that the overhead becomes neglectible. The offline
-performance feedback can provide a measure of task length, which should thus be
-checked if bad performance are observed. To get a grasp at the scalability
-possibility according to task size, one can run
-<c>tests/microbenchs/tasks_size_overhead.sh</c> which draws curves of the
-speedup of independent tasks of very small sizes.
-
-The choice of scheduler also has impact over the overhead: for instance, the
- scheduler <c>dmda</c> takes time to make a decision, while <c>eager</c> does
-not. <c>tasks_size_overhead.sh</c> can again be used to get a grasp at how much
-impact that has on the target machine.
-
-\section TaskSubmission Task Submission
-
-To let StarPU make online optimizations, tasks should be submitted
-asynchronously as much as possible. Ideally, all the tasks should be
-submitted, and mere calls to starpu_task_wait_for_all() or
-starpu_data_unregister() be done to wait for
-termination. StarPU will then be able to rework the whole schedule, overlap
-computation with communication, manage accelerator local memory usage, etc.
-
-\section TaskPriorities Task Priorities
-
-By default, StarPU will consider the tasks in the order they are submitted by
-the application. If the application programmer knows that some tasks should
-be performed in priority (for instance because their output is needed by many
-other tasks and may thus be a bottleneck if not executed early
-enough), the field starpu_task::priority should be set to transmit the
-priority information to StarPU.
-
-\section TaskSchedulingPolicy Task Scheduling Policy
-
-By default, StarPU uses the simple greedy scheduler <c>eager</c>. This is
-because it provides correct load balance even if the application codelets do not
-have performance models. If your application codelets have performance models
-(\ref PerformanceModelExample), you should change the scheduler thanks
-to the environment variable \ref STARPU_SCHED. For instance <c>export
-STARPU_SCHED=dmda</c> . Use <c>help</c> to get the list of available schedulers.
-
-The <b>eager</b> scheduler uses a central task queue, from which workers draw tasks
-to work on. This however does not permit to prefetch data since the scheduling
-decision is taken late. If a task has a non-0 priority, it is put at the front of the queue.
-
-The <b>prio</b> scheduler also uses a central task queue, but sorts tasks by
-priority (between -5 and 5).
-
-The <b>random</b> scheduler distributes tasks randomly according to assumed worker
-overall performance.
-
-The <b>ws</b> (work stealing) scheduler schedules tasks on the local worker by
-default. When a worker becomes idle, it steals a task from the most loaded
-worker.
-
-The <b>dm</b> (deque model) scheduler uses task execution performance models into account to
-perform an HEFT-similar scheduling strategy: it schedules tasks where their
-termination time will be minimal.
-
-The <b>dmda</b> (deque model data aware) scheduler is similar to dm, it also takes
-into account data transfer time.
-
-The <b>dmdar</b> (deque model data aware ready) scheduler is similar to dmda,
-it also sorts tasks on per-worker queues by number of already-available data
-buffers.
-
-The <b>dmdas</b> (deque model data aware sorted) scheduler is similar to dmda, it
-also supports arbitrary priority values.
-
-The <b>heft</b> (heterogeneous earliest finish time) scheduler is deprecated. It
-is now just an alias for <b>dmda</b>.
-
-The <b>pheft</b> (parallel HEFT) scheduler is similar to heft, it also supports
-parallel tasks (still experimental). Should not be used when several contexts using
-it are being executed simultaneously.
-
-The <b>peager</b> (parallel eager) scheduler is similar to eager, it also
-supports parallel tasks (still experimental). Should not be used when several 
-contexts using it are being executed simultaneously.
-
-
-\section PerformanceModelCalibration Performance Model Calibration
-
-Most schedulers are based on an estimation of codelet duration on each kind
-of processing unit. For this to be possible, the application programmer needs
-to configure a performance model for the codelets of the application (see
-\ref PerformanceModelExample for instance). History-based performance models
-use on-line calibration.  StarPU will automatically calibrate codelets
-which have never been calibrated yet, and save the result in
-<c>$STARPU_HOME/.starpu/sampling/codelets</c>.
-The models are indexed by machine name. To share the models between
-machines (e.g. for a homogeneous cluster), use <c>export
-STARPU_HOSTNAME=some_global_name</c>. To force continuing calibration,
-use <c>export STARPU_CALIBRATE=1</c> . This may be necessary if your application
-has not-so-stable performance. StarPU will force calibration (and thus ignore
-the current result) until 10 (<c>_STARPU_CALIBRATION_MINIMUM</c>) measurements have been
-made on each architecture, to avoid badly scheduling tasks just because the
-first measurements were not so good. Details on the current performance model status
-can be obtained from the command <c>starpu_perfmodel_display</c>: the <c>-l</c>
-option lists the available performance models, and the <c>-s</c> option permits
-to choose the performance model to be displayed. The result looks like:
-
-\verbatim
-$ starpu_perfmodel_display -s starpu_slu_lu_model_11
-performance model for cpu_impl_0
-# hash    size     flops         mean          dev           n
-914f3bef  1048576  0.000000e+00  2.503577e+04  1.982465e+02  8
-3e921964  65536    0.000000e+00  5.527003e+02  1.848114e+01  7
-e5a07e31  4096     0.000000e+00  1.717457e+01  5.190038e+00  14
-...
-\endverbatim
-
-Which shows that for the LU 11 kernel with a 1MiB matrix, the average
-execution time on CPUs was about 25ms, with a 0.2ms standard deviation, over
-8 samples. It is a good idea to check this before doing actual performance
-measurements.
-
-A graph can be drawn by using the tool <c>starpu_perfmodel_plot</c>:
-
-\verbatim
-$ starpu_perfmodel_plot -s starpu_slu_lu_model_11
-4096 16384 65536 262144 1048576 4194304 
-$ gnuplot starpu_starpu_slu_lu_model_11.gp
-$ gv starpu_starpu_slu_lu_model_11.eps
-\endverbatim
-
-\image html starpu_starpu_slu_lu_model_11.png
-\image latex starpu_starpu_slu_lu_model_11.eps "" width=\textwidth
-
-If a kernel source code was modified (e.g. performance improvement), the
-calibration information is stale and should be dropped, to re-calibrate from
-start. This can be done by using <c>export STARPU_CALIBRATE=2</c>.
-
-Note: due to CUDA limitations, to be able to measure kernel duration,
-calibration mode needs to disable asynchronous data transfers. Calibration thus
-disables data transfer / computation overlapping, and should thus not be used
-for eventual benchmarks. Note 2: history-based performance models get calibrated
-only if a performance-model-based scheduler is chosen.
-
-The history-based performance models can also be explicitly filled by the
-application without execution, if e.g. the application already has a series of
-measurements. This can be done by using starpu_perfmodel_update_history(),
-for instance:
-
-\code{.c}
-static struct starpu_perfmodel perf_model = {
-    .type = STARPU_HISTORY_BASED,
-    .symbol = "my_perfmodel",
-};
-
-struct starpu_codelet cl = {
-    .where = STARPU_CUDA,
-    .cuda_funcs = { cuda_func1, cuda_func2, NULL },
-    .nbuffers = 1,
-    .modes = {STARPU_W},
-    .model = &perf_model
-};
-
-void feed(void) {
-    struct my_measure *measure;
-    struct starpu_task task;
-    starpu_task_init(&task);
-
-    task.cl = &cl;
-
-    for (measure = &measures[0]; measure < measures[last]; measure++) {
-        starpu_data_handle_t handle;
-	starpu_vector_data_register(&handle, -1, 0, measure->size, sizeof(float));
-	task.handles[0] = handle;
-	starpu_perfmodel_update_history(&perf_model, &task,
-	                                STARPU_CUDA_DEFAULT + measure->cudadev, 0,
-	                                measure->implementation, measure->time);
-	starpu_task_clean(&task);
-	starpu_data_unregister(handle);
-    }
-}
-\endcode
-
-Measurement has to be provided in milliseconds for the completion time models,
-and in Joules for the energy consumption models.
-
-\section TaskDistributionVsDataTransfer Task Distribution Vs Data Transfer
-
-Distributing tasks to balance the load induces data transfer penalty. StarPU
-thus needs to find a balance between both. The target function that the
-scheduler <c>dmda</c> of StarPU
-tries to minimize is <c>alpha * T_execution + beta * T_data_transfer</c>, where
-<c>T_execution</c> is the estimated execution time of the codelet (usually
-accurate), and <c>T_data_transfer</c> is the estimated data transfer time. The
-latter is estimated based on bus calibration before execution start,
-i.e. with an idle machine, thus without contention. You can force bus
-re-calibration by running the tool <c>starpu_calibrate_bus</c>. The
-beta parameter defaults to <c>1</c>, but it can be worth trying to tweak it
-by using <c>export STARPU_SCHED_BETA=2</c> for instance, since during
-real application execution, contention makes transfer times bigger.
-This is of course imprecise, but in practice, a rough estimation
-already gives the good results that a precise estimation would give.
-
-\section DataPrefetch Data Prefetch
-
-The scheduling policies <c>heft</c>, <c>dmda</c> and <c>pheft</c>
-perform data prefetch (see \ref STARPU_PREFETCH):
-as soon as a scheduling decision is taken for a task, requests are issued to
-transfer its required data to the target processing unit, if needed, so that
-when the processing unit actually starts the task, its data will hopefully be
-already available and it will not have to wait for the transfer to finish.
-
-The application may want to perform some manual prefetching, for several reasons
-such as excluding initial data transfers from performance measurements, or
-setting up an initial statically-computed data distribution on the machine
-before submitting tasks, which will thus guide StarPU toward an initial task
-distribution (since StarPU will try to avoid further transfers).
-
-This can be achieved by giving the function starpu_data_prefetch_on_node()
-the handle and the desired target memory node.
-
-\section Power-basedScheduling Power-based Scheduling
-
-If the application can provide some power performance model (through
-the field starpu_codelet::power_model), StarPU will
-take it into account when distributing tasks. The target function that
-the scheduler <c>dmda</c> minimizes becomes <c>alpha * T_execution +
-beta * T_data_transfer + gamma * Consumption</c> , where <c>Consumption</c>
-is the estimated task consumption in Joules. To tune this parameter, use
-<c>export STARPU_SCHED_GAMMA=3000</c> for instance, to express that each Joule
-(i.e kW during 1000us) is worth 3000us execution time penalty. Setting
-<c>alpha</c> and <c>beta</c> to zero permits to only take into account power consumption.
-
-This is however not sufficient to correctly optimize power: the scheduler would
-simply tend to run all computations on the most energy-conservative processing
-unit. To account for the consumption of the whole machine (including idle
-processing units), the idle power of the machine should be given by setting
-<c>export STARPU_IDLE_POWER=200</c> for 200W, for instance. This value can often
-be obtained from the machine power supplier.
-
-The power actually consumed by the total execution can be displayed by setting
-<c>export STARPU_PROFILING=1 STARPU_WORKER_STATS=1</c> .
-
-On-line task consumption measurement is currently only supported through the
-<c>CL_PROFILING_POWER_CONSUMED</c> OpenCL extension, implemented in the MoviSim
-simulator. Applications can however provide explicit measurements by
-using the function starpu_perfmodel_update_history() (examplified in \ref PerformanceModelExample
-with the <c>power_model</c> performance model). Fine-grain
-measurement is often not feasible with the feedback provided by the hardware, so
-the user can for instance run a given task a thousand times, measure the global
-consumption for that series of tasks, divide it by a thousand, repeat for
-varying kinds of tasks and task sizes, and eventually feed StarPU
-with these manual measurements through starpu_perfmodel_update_history().
-
-\section StaticScheduling Static Scheduling
-
-In some cases, one may want to force some scheduling, for instance force a given
-set of tasks to GPU0, another set to GPU1, etc. while letting some other tasks
-be scheduled on any other device. This can indeed be useful to guide StarPU into
-some work distribution, while still letting some degree of dynamism. For
-instance, to force execution of a task on CUDA0:
-
-\code{.c}
-task->execute_on_a_specific_worker = 1;
-task->worker = starpu_worker_get_by_type(STARPU_CUDA_WORKER, 0);
-\endcode
-
-Note however that using scheduling contexts while statically scheduling tasks on workers
-could be tricky. Be careful to schedule the tasks exactly on the workers of the corresponding
-contexts, otherwise the workers' corresponding scheduling structures may not be allocated or
-the execution of the application may deadlock. Moreover, the hypervisor should not be used when
-statically scheduling tasks.
-
-\section Profiling Profiling
-
-A quick view of how many tasks each worker has executed can be obtained by setting
-<c>export STARPU_WORKER_STATS=1</c> This is a convenient way to check that
-execution did happen on accelerators without penalizing performance with
-the profiling overhead.
-
-A quick view of how much data transfers have been issued can be obtained by setting
-<c>export STARPU_BUS_STATS=1</c> .
-
-More detailed profiling information can be enabled by using <c>export STARPU_PROFILING=1</c> or by
-calling starpu_profiling_status_set() from the source code.
-Statistics on the execution can then be obtained by using <c>export
-STARPU_BUS_STATS=1</c> and <c>export STARPU_WORKER_STATS=1</c> .
- More details on performance feedback are provided by the next chapter.
-
-\section DetectionStuckConditions Detection Stuck Conditions
-
-It may happen that for some reason, StarPU does not make progress for a long
-period of time.  Reason are sometimes due to contention inside StarPU, but
-sometimes this is due to external reasons, such as stuck MPI driver, or CUDA
-driver, etc.
-
-<c>export STARPU_WATCHDOG_TIMEOUT=10000</c>
-
-allows to make StarPU print an error message whenever StarPU does not terminate
-any task for 10ms. In addition to that,
-
-<c>export STARPU_WATCHDOG_CRASH=1</c>
-
-triggers a crash in that condition, thus allowing to catch the situation in gdb
-etc.
-
-\section CUDA-specificOptimizations CUDA-specific Optimizations
-
-Due to CUDA limitations, StarPU will have a hard time overlapping its own
-communications and the codelet computations if the application does not use a
-dedicated CUDA stream for its computations instead of the default stream,
-which synchronizes all operations of the GPU. StarPU provides one by the use
-of starpu_cuda_get_local_stream() which can be used by all CUDA codelet
-operations to avoid this issue. For instance:
-
-\code{.c}
-func <<<grid,block,0,starpu_cuda_get_local_stream()>>> (foo, bar);
-cudaStreamSynchronize(starpu_cuda_get_local_stream());
-\endcode
-
-StarPU already does appropriate calls for the CUBLAS library.
-
-Unfortunately, some CUDA libraries do not have stream variants of
-kernels. That will lower the potential for overlapping.
-
-\section PerformanceDebugging Performance Debugging
-
-To get an idea of what is happening, a lot of performance feedback is available,
-detailed in the next chapter. The various informations should be checked for.
-
-<ul>
-<li>
-What does the Gantt diagram look like? (see \ref CreatingAGanttDiagram)
-<ul>
-  <li> If it's mostly green (tasks running in the initial context) or context specific
-  color prevailing, then the machine is properly
-  utilized, and perhaps the codelets are just slow. Check their performance, see
-  \ref PerformanceOfCodelets.
-  </li>
-  <li> If it's mostly purple (FetchingInput), tasks keep waiting for data
-  transfers, do you perhaps have far more communication than computation? Did
-  you properly use CUDA streams to make sure communication can be
-  overlapped? Did you use data-locality aware schedulers to avoid transfers as
-  much as possible?
-  </li>
-  <li> If it's mostly red (Blocked), tasks keep waiting for dependencies,
-  do you have enough parallelism? It might be a good idea to check what the DAG
-  looks like (see \ref CreatingADAGWithGraphviz).
-  </li>
-  <li> If only some workers are completely red (Blocked), for some reason the
-  scheduler didn't assign tasks to them. Perhaps the performance model is bogus,
-  check it (see \ref PerformanceOfCodelets). Do all your codelets have a
-  performance model?  When some of them don't, the schedulers switches to a
-  greedy algorithm which thus performs badly.
-  </li>
-</ul>
-</li>
-</ul>
-
-You can also use the Temanejo task debugger (see \ref UsingTheTemanejoTaskDebugger) to
-visualize the task graph more easily.
-
-\section SimulatedPerformance Simulated Performance
-
-StarPU can use Simgrid in order to simulate execution on an arbitrary
-platform.
-
-\subsection Calibration Calibration
-
-The idea is to first compile StarPU normally, and run the application,
-so as to automatically benchmark the bus and the codelets.
-
-\verbatim
-$ ./configure && make
-$ STARPU_SCHED=dmda ./examples/matvecmult/matvecmult
-[starpu][_starpu_load_history_based_model] Warning: model matvecmult
-   is not calibrated, forcing calibration for this run. Use the
-   STARPU_CALIBRATE environment variable to control this.
-$ ...
-$ STARPU_SCHED=dmda ./examples/matvecmult/matvecmult
-TEST PASSED
-\endverbatim
-
-Note that we force to use the scheduler <c>dmda</c> to generate
-performance models for the application. The application may need to be
-run several times before the model is calibrated.
-
-\subsection Simulation Simulation
-
-Then, recompile StarPU, passing \ref enable-simgrid "--enable-simgrid"
-to <c>./configure</c>, and re-run the application:
-
-\verbatim
-$ ./configure --enable-simgrid && make
-$ STARPU_SCHED=dmda ./examples/matvecmult/matvecmult
-TEST FAILED !!!
-\endverbatim
-
-It is normal that the test fails: since the computation are not actually done
-(that is the whole point of simgrid), the result is wrong, of course.
-
-If the performance model is not calibrated enough, the following error
-message will be displayed
-
-\verbatim
-$ STARPU_SCHED=dmda ./examples/matvecmult/matvecmult
-[starpu][_starpu_load_history_based_model] Warning: model matvecmult
-    is not calibrated, forcing calibration for this run. Use the
-    STARPU_CALIBRATE environment variable to control this.
-[starpu][_starpu_simgrid_execute_job][assert failure] Codelet
-    matvecmult does not have a perfmodel, or is not calibrated enough
-\endverbatim
-
-The number of devices can be chosen as usual with \ref STARPU_NCPU,
-\ref STARPU_NCUDA, and \ref STARPU_NOPENCL.  For now, only the number of
-cpus can be arbitrarily chosen. The number of CUDA and OpenCL devices have to be
-lower than the real number on the current machine.
-
-The amount of simulated GPU memory is for now unbound by default, but
-it can be chosen by hand through the \ref STARPU_LIMIT_CUDA_MEM,
-\ref STARPU_LIMIT_CUDA_devid_MEM, \ref STARPU_LIMIT_OPENCL_MEM, and
-\ref STARPU_LIMIT_OPENCL_devid_MEM environment variables.
-
-The Simgrid default stack size is small; to increase it use the
-parameter <c>--cfg=contexts/stack_size</c>, for example:
-
-\verbatim
-$ ./example --cfg=contexts/stack_size:8192
-TEST FAILED !!!
-\endverbatim
-
-Note: of course, if the application uses <c>gettimeofday</c> to make its
-performance measurements, the real time will be used, which will be bogus. To
-get the simulated time, it has to use starpu_timing_now() which returns the
-virtual timestamp in ms.
-
-\subsection SimulationOnAnotherMachine Simulation On Another Machine
-
-The simgrid support even permits to perform simulations on another machine, your
-desktop, typically. To achieve this, one still needs to perform the Calibration
-step on the actual machine to be simulated, then copy them to your desktop
-machine (the <c>$STARPU_HOME/.starpu</c> directory). One can then perform the
-Simulation step on the desktop machine, by setting the environment
-variable \ref STARPU_HOSTNAME to the name of the actual machine, to
-make StarPU use the performance models of the simulated machine even
-on the desktop machine.
-
-If the desktop machine does not have CUDA or OpenCL, StarPU is still able to
-use simgrid to simulate execution with CUDA/OpenCL devices, but the application
-source code will probably disable the CUDA and OpenCL codelets in thatcd sc
-case. Since during simgrid execution, the functions of the codelet are actually
-not called, one can use dummy functions such as the following to still permit
-CUDA or OpenCL execution:
-
-\snippet simgrid.c To be included. You should update doxygen if you see this text.
-
-*/

doc/doxygen/chapters/05check_list_performance.doxy (+204 -0)

@@ -0,0 +1,204 @@
+/*
+ * This file is part of the StarPU Handbook.
+ * Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011, 2012 Institut National de Recherche en Informatique et Automatique
+ * See the file version.doxy for copying conditions.
+ */
+
+/*! \page CheckListWhenPerformanceAreNotThere Check List When Performance Are Not There
+
+TODO: improve!
+
+Simply encapsulating application kernels into tasks already makes it
+possible to support CPUs and GPUs seamlessly at the same time. To achieve good
+performance, we give below a list of features which should be checked.
+
+\section DataRelatedFeaturesToImprovePerformance Data Related Features That May Improve Performance
+
+link to \ref DataManagement
+
+link to \ref DataPrefetch
+
+\section TaskRelatedFeaturesToImprovePerformance Task Related Features That May Improve Performance
+
+link to \ref TaskGranularity
+
+link to \ref TaskSubmission
+
+link to \ref TaskPriorities
+
+\section SchedulingRelatedFeaturesToImprovePerformance Scheduling Related Features That May Improve Performance
+
+link to \ref TaskSchedulingPolicy
+
+link to \ref TaskDistributionVsDataTransfer
+
+link to \ref Power-basedScheduling
+
+link to \ref StaticScheduling
+
+\section CUDA-specificOptimizations CUDA-specific Optimizations
+
+Due to CUDA limitations, StarPU will have a hard time overlapping its own
+communications and the codelet computations if the application does not use a
+dedicated CUDA stream for its computations instead of the default stream,
+which synchronizes all operations of the GPU. StarPU provides one through
+starpu_cuda_get_local_stream(), which can be used by all CUDA codelet
+operations to avoid this issue. For instance:
+
+\code{.c}
+func <<<grid,block,0,starpu_cuda_get_local_stream()>>> (foo, bar);
+cudaStreamSynchronize(starpu_cuda_get_local_stream());
+\endcode
+
+StarPU already makes the appropriate calls for the CUBLAS library.
+
+Unfortunately, some CUDA libraries do not have stream variants of
+kernels. That will lower the potential for overlapping.
+
+\section DetectionStuckConditions Detecting Stuck Conditions
+
+It may happen that for some reason, StarPU does not make progress for a long
+period of time. The cause is sometimes contention inside StarPU, but
+sometimes it is external, such as a stuck MPI or CUDA driver.
+
+<c>export STARPU_WATCHDOG_TIMEOUT=10000</c>
+
+makes StarPU print an error message whenever it has not terminated
+any task for 10ms (the timeout is expressed in microseconds). In addition to that,
+
+<c>export STARPU_WATCHDOG_CRASH=1</c>
+
+triggers a crash in that condition, thus allowing to catch the situation in
+gdb, etc.
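+
+For instance, one possible way to catch such a hang under <c>gdb</c>
+(<c>./my_application</c> is a hypothetical binary name):
+
+\verbatim
+$ export STARPU_WATCHDOG_TIMEOUT=10000
+$ export STARPU_WATCHDOG_CRASH=1
+$ gdb --args ./my_application
+\endverbatim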
+
+\section HowToLimitMemoryPerNode How to limit memory per node
+
+TODO
+
+Talk about
+\ref STARPU_LIMIT_CUDA_devid_MEM, \ref STARPU_LIMIT_CUDA_MEM,
+\ref STARPU_LIMIT_OPENCL_devid_MEM, \ref STARPU_LIMIT_OPENCL_MEM
+and \ref STARPU_LIMIT_CPU_MEM
+
+starpu_memory_get_available()
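+
+As a small illustrative sketch while this section is being written (the memory
+node number and the limit value are made-up examples, and the error convention
+is an assumption):
+
+\code{.c}
+/* With e.g. STARPU_LIMIT_CUDA_MEM=1024 (in MB) set in the environment,
+ * ask StarPU how much memory it still considers available on a node. */
+unsigned node = 1; /* hypothetical memory node number */
+starpu_ssize_t available = starpu_memory_get_available(node);
+if (available >= 0) /* assumed: a negative value means no limit information */
+    printf("%ld bytes available on node %u\n", (long) available, node);
+\endcode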
+
+\section PerformanceModelCalibration Performance Model Calibration
+
+Most schedulers are based on an estimation of codelet duration on each kind
+of processing unit. For this to be possible, the application programmer needs
+to configure a performance model for the codelets of the application (see
+\ref PerformanceModelExample for instance). History-based performance models
+use on-line calibration.  StarPU will automatically calibrate codelets
+which have never been calibrated yet, and save the result in
+<c>$STARPU_HOME/.starpu/sampling/codelets</c>.
+The models are indexed by machine name. To share the models between
+machines (e.g. for a homogeneous cluster), use <c>export
+STARPU_HOSTNAME=some_global_name</c>. To force continuing calibration,
+use <c>export STARPU_CALIBRATE=1</c> . This may be necessary if your application
+has not-so-stable performance. StarPU will force calibration (and thus ignore
+the current result) until 10 (<c>_STARPU_CALIBRATION_MINIMUM</c>) measurements have been
+made on each architecture, to avoid badly scheduling tasks just because the
+first measurements were not so good. Details on the current performance model status
+can be obtained from the command <c>starpu_perfmodel_display</c>: the <c>-l</c>
+option lists the available performance models, and the <c>-s</c> option permits
+to choose the performance model to be displayed. The result looks like:
+
+\verbatim
+$ starpu_perfmodel_display -s starpu_slu_lu_model_11
+performance model for cpu_impl_0
+# hash    size     flops         mean          dev           n
+914f3bef  1048576  0.000000e+00  2.503577e+04  1.982465e+02  8
+3e921964  65536    0.000000e+00  5.527003e+02  1.848114e+01  7
+e5a07e31  4096     0.000000e+00  1.717457e+01  5.190038e+00  14
+...
+\endverbatim
+
+This shows that for the LU 11 kernel with a 1MiB matrix, the average
+execution time on CPUs was about 25ms, with a 0.2ms standard deviation, over
+8 samples. It is a good idea to check this before doing actual performance
+measurements.
+
+A graph can be drawn by using the tool <c>starpu_perfmodel_plot</c>:
+
+\verbatim
+$ starpu_perfmodel_plot -s starpu_slu_lu_model_11
+4096 16384 65536 262144 1048576 4194304 
+$ gnuplot starpu_starpu_slu_lu_model_11.gp
+$ gv starpu_starpu_slu_lu_model_11.eps
+\endverbatim
+
+\image html starpu_starpu_slu_lu_model_11.png
+\image latex starpu_starpu_slu_lu_model_11.eps "" width=\textwidth
+
+If a kernel source code was modified (e.g. performance improvement), the
+calibration information is stale and should be dropped, to re-calibrate from
+scratch. This can be done by using <c>export STARPU_CALIBRATE=2</c>.
+
+Note: due to CUDA limitations, to be able to measure kernel duration,
+calibration mode needs to disable asynchronous data transfers. Calibration thus
+disables data transfer / computation overlapping, and should thus not be used
+for actual benchmarks. Note 2: history-based performance models get calibrated
+only if a performance-model-based scheduler is chosen.
+
+The history-based performance models can also be explicitly filled by the
+application without execution, if e.g. the application already has a series of
+measurements. This can be done by using starpu_perfmodel_update_history(),
+for instance:
+
+\code{.c}
+static struct starpu_perfmodel perf_model = {
+    .type = STARPU_HISTORY_BASED,
+    .symbol = "my_perfmodel",
+};
+
+struct starpu_codelet cl = {
+    .where = STARPU_CUDA,
+    .cuda_funcs = { cuda_func1, cuda_func2, NULL },
+    .nbuffers = 1,
+    .modes = {STARPU_W},
+    .model = &perf_model
+};
+
+void feed(void) {
+    struct my_measure *measure;
+    struct starpu_task task;
+    starpu_task_init(&task);
+
+    task.cl = &cl;
+
+    for (measure = &measures[0]; measure < &measures[last]; measure++) {
+        starpu_data_handle_t handle;
+	starpu_vector_data_register(&handle, -1, 0, measure->size, sizeof(float));
+	task.handles[0] = handle;
+	starpu_perfmodel_update_history(&perf_model, &task,
+	                                STARPU_CUDA_DEFAULT + measure->cudadev, 0,
+	                                measure->implementation, measure->time);
+	starpu_task_clean(&task);
+	starpu_data_unregister(handle);
+    }
+}
+\endcode
+
+Measurements have to be provided in milliseconds for the completion time models,
+and in Joules for the energy consumption models.
+
+\section Profiling Profiling
+
+A quick view of how many tasks each worker has executed can be obtained by setting
+<c>export STARPU_WORKER_STATS=1</c>. This is a convenient way to check that
+execution did happen on accelerators, without penalizing performance with
+the profiling overhead.
+
+A quick view of how much data transfers have been issued can be obtained by setting
+<c>export STARPU_BUS_STATS=1</c> .
+
+More detailed profiling information can be enabled by using <c>export STARPU_PROFILING=1</c> or by
+calling starpu_profiling_status_set() from the source code.
+Statistics on the execution can then be obtained by using <c>export
+STARPU_BUS_STATS=1</c> and <c>export STARPU_WORKER_STATS=1</c> .
+More details on performance feedback are provided in the next chapter.
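+
+A minimal sketch of enabling profiling programmatically rather than through
+the environment:
+
+\code{.c}
+/* Equivalent to setting STARPU_PROFILING=1 in the environment. */
+starpu_profiling_status_set(STARPU_PROFILING_ENABLE);
+
+/* ... submit and execute tasks ... */
+
+starpu_profiling_status_set(STARPU_PROFILING_DISABLE);
+\endcode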
+
+*/

doc/doxygen/chapters/06tasks.doxy (+443 -0)

@@ -0,0 +1,443 @@
+/*
+ * This file is part of the StarPU Handbook.
+ * Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011, 2012 Institut National de Recherche en Informatique et Automatique
+ * See the file version.doxy for copying conditions.
+ */
+
+/*! \page TasksInStarPU Tasks In StarPU
+
+\section TaskGranularity Task Granularity
+
+Like any other runtime, StarPU has some overhead to manage tasks. Since
+it does smart scheduling and data management, that overhead is not always
+negligible. The order of magnitude of the overhead is typically a couple of
+microseconds, which is actually much smaller than the CUDA overhead itself. The
+amount of work that a task should do should thus be somewhat
+bigger, to make sure that the overhead becomes negligible. The offline
+performance feedback can provide a measure of task length, which should thus be
+checked if poor performance is observed. To get a grasp of the scalability
+achievable according to task size, one can run
+<c>tests/microbenchs/tasks_size_overhead.sh</c>, which draws curves of the
+speedup of independent tasks of very small sizes.
+
+The choice of scheduler also has an impact on the overhead: for instance, the
+scheduler <c>dmda</c> takes time to make a decision, while <c>eager</c> does
+not. <c>tasks_size_overhead.sh</c> can again be used to get a grasp of how much
+impact that has on the target machine.
+
+\section TaskSubmission Task Submission
+
+To let StarPU make online optimizations, tasks should be submitted
+asynchronously as much as possible. Ideally, all the tasks should be
+submitted first, and only then should starpu_task_wait_for_all() or
+starpu_data_unregister() be called to wait for
+termination. StarPU will then be able to rework the whole schedule, overlap
+computation with communication, manage accelerator local memory usage, etc.
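+
+A minimal sketch of this pattern (<c>cl</c> and <c>ntasks</c> are assumed to
+be defined by the application):
+
+\code{.c}
+unsigned i;
+for (i = 0; i < ntasks; i++)
+{
+	struct starpu_task *task = starpu_task_create();
+	task->cl = &cl;
+	/* Asynchronous by default: this returns before the task executes. */
+	starpu_task_submit(task);
+}
+/* Wait only once everything has been submitted. */
+starpu_task_wait_for_all();
+\endcode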
+
+\section TaskPriorities Task Priorities
+
+By default, StarPU will consider the tasks in the order they are submitted by
+the application. If the application programmer knows that some tasks should
+be performed first (for instance because their output is needed by many
+other tasks and may thus be a bottleneck if not executed early
+enough), the field starpu_task::priority should be set to transmit the
+priority information to StarPU.
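+
+For instance, a sketch giving a task the highest priority supported by the
+current scheduler (<c>cl</c> is assumed to be an existing codelet):
+
+\code{.c}
+struct starpu_task *task = starpu_task_create();
+task->cl = &cl;
+/* Hint the scheduler to execute this task as early as possible. */
+task->priority = STARPU_MAX_PRIO;
+starpu_task_submit(task);
+\endcode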
+
+\section SettingTheDataHandlesForATask Setting The Data Handles For A Task
+
+The number of data a task can manage is fixed by the environment variable
+\ref STARPU_NMAXBUFS, which has a default value that can be changed
+through the configure option \ref enable-maxbuffers "--enable-maxbuffers".
+
+However, it is possible to define tasks managing more data by using
+the field starpu_task::dyn_handles when defining a task and the field
+starpu_codelet::dyn_modes when defining the corresponding codelet.
+
+\code{.c}
+enum starpu_data_access_mode modes[STARPU_NMAXBUFS+1] = {
+	STARPU_R, STARPU_R, ...
+};
+
+struct starpu_codelet dummy_big_cl =
+{
+	.cuda_funcs = { dummy_big_kernel, NULL },
+	.opencl_funcs = { dummy_big_kernel, NULL },
+	.cpu_funcs = { dummy_big_kernel, NULL },
+	.cpu_funcs_name = { "dummy_big_kernel", NULL },
+	.nbuffers = STARPU_NMAXBUFS+1,
+	.dyn_modes = modes
+};
+
+task = starpu_task_create();
+task->cl = &dummy_big_cl;
+task->dyn_handles = malloc(task->cl->nbuffers * sizeof(starpu_data_handle_t));
+for(i=0 ; i<task->cl->nbuffers ; i++)
+{
+	task->dyn_handles[i] = handle;
+}
+starpu_task_submit(task);
+\endcode
+
+\code{.c}
+starpu_data_handle_t *handles = malloc(dummy_big_cl.nbuffers * sizeof(starpu_data_handle_t));
+for(i=0 ; i<dummy_big_cl.nbuffers ; i++)
+{
+	handles[i] = handle;
+}
+starpu_task_insert(&dummy_big_cl,
+        	 STARPU_VALUE, &dummy_big_cl.nbuffers, sizeof(dummy_big_cl.nbuffers),
+		 STARPU_DATA_ARRAY, handles, dummy_big_cl.nbuffers,
+		 0);
+\endcode
+
+The whole code for this example is available in the
+file <c>examples/basic_examples/dynamic_handles.c</c>.
+
+\section UsingMultipleImplementationsOfACodelet Using Multiple Implementations Of A Codelet
+
+One may want to write multiple implementations of a codelet for a single type of
+device and let StarPU choose which one to run. As an example, we will show how
+to use SSE to scale a vector. The codelet can be written as follows:
+
+\code{.c}
+#include <xmmintrin.h>
+
+void scal_sse_func(void *buffers[], void *cl_arg)
+{
+    float *vector = (float *) STARPU_VECTOR_GET_PTR(buffers[0]);
+    unsigned int n = STARPU_VECTOR_GET_NX(buffers[0]);
+    unsigned int n_iterations = n/4;
+    if (n % 4 != 0)
+        n_iterations++;
+
+    __m128 *VECTOR = (__m128*) vector;
+    __m128 factor __attribute__((aligned(16)));
+    factor = _mm_set1_ps(*(float *) cl_arg);
+
+    unsigned int i;
+    for (i = 0; i < n_iterations; i++)
+        VECTOR[i] = _mm_mul_ps(factor, VECTOR[i]);
+}
+\endcode
+
+\code{.c}
+struct starpu_codelet cl = {
+    .where = STARPU_CPU,
+    .cpu_funcs = { scal_cpu_func, scal_sse_func, NULL },
+    .cpu_funcs_name = { "scal_cpu_func", "scal_sse_func", NULL },
+    .nbuffers = 1,
+    .modes = { STARPU_RW }
+};
+\endcode
+
+Schedulers which are multi-implementation aware (only <c>dmda</c> and
+<c>pheft</c> for now) will use the performance models of all the
+implementations they were given, and pick the one that seems to be the fastest.
+
+\section EnablingImplementationAccordingToCapabilities Enabling Implementation According To Capabilities
+
+Some implementations may not run on some devices. For instance, some CUDA
+devices do not support double floating point precision, and thus the kernel
+execution would just fail; or the device may not have enough shared memory for
+the implementation being used. The field starpu_codelet::can_execute
+can be used to express this. For instance:
+
+\code{.c}
+static int can_execute(unsigned workerid, struct starpu_task *task, unsigned nimpl)
+{
+  const struct cudaDeviceProp *props;
+  if (starpu_worker_get_type(workerid) == STARPU_CPU_WORKER)
+    return 1;
+  /* Cuda device */
+  props = starpu_cuda_get_device_properties(workerid);
+  if (props->major > 1 || (props->major == 1 && props->minor >= 3))
+    /* At least compute capability 1.3, supports doubles */
+    return 1;
+  /* Old card, does not support doubles */
+  return 0;
+}
+
+struct starpu_codelet cl = {
+    .where = STARPU_CPU|STARPU_CUDA,
+    .can_execute = can_execute,
+    .cpu_funcs = { cpu_func, NULL },
+    .cpu_funcs_name = { "cpu_func", NULL },
+    .cuda_funcs = { gpu_func, NULL },
+    .nbuffers = 1,
+    .modes = { STARPU_RW }
+};
+\endcode
+
+This can be essential e.g. when running on a machine which mixes various models
+of CUDA devices, to benefit from the new models without crashing on old ones.
+
+Note: the function starpu_codelet::can_execute is called by the
+scheduler each time it tries to match a task with a worker, and should
+thus be very fast. The function starpu_cuda_get_device_properties()
+provides a quick access to CUDA properties of CUDA devices to achieve
+such efficiency.
+
+Another example is to compile CUDA code for various compute capabilities,
+resulting with two CUDA functions, e.g. <c>scal_gpu_13</c> for compute capability
+1.3, and <c>scal_gpu_20</c> for compute capability 2.0. Both functions can be
+provided to StarPU by using starpu_codelet::cuda_funcs, and
+starpu_codelet::can_execute can then be used to rule out the
+<c>scal_gpu_20</c> variant on a CUDA device which will not be able to execute it:
+
+\code{.c}
+static int can_execute(unsigned workerid, struct starpu_task *task, unsigned nimpl)
+{
+  const struct cudaDeviceProp *props;
+  if (starpu_worker_get_type(workerid) == STARPU_CPU_WORKER)
+    return 1;
+  /* Cuda device */
+  if (nimpl == 0)
+    /* Trying to execute the 1.3 capability variant, we assume it is ok in all cases.  */
+    return 1;
+  /* Trying to execute the 2.0 capability variant, check that the card can do it.  */
+  props = starpu_cuda_get_device_properties(workerid);
+  if (props->major >= 2)
+    /* At least compute capability 2.0, can run it */
+    return 1;
+  /* Old card, does not support 2.0, will not be able to execute the 2.0 variant.  */
+  return 0;
+}
+
+struct starpu_codelet cl = {
+    .where = STARPU_CPU|STARPU_CUDA,
+    .can_execute = can_execute,
+    .cpu_funcs = { cpu_func, NULL },
+    .cpu_funcs_name = { "cpu_func", NULL },
+    .cuda_funcs = { scal_gpu_13, scal_gpu_20, NULL },
+    .nbuffers = 1,
+    .modes = { STARPU_RW }
+};
+\endcode
+
+Note: the most generic variant should be provided first, as some schedulers are
+not able to try the different variants.
+
+\section InsertTaskUtility Insert Task Utility
+
+StarPU provides the wrapper function starpu_task_insert() to ease
+the creation and submission of tasks.
+
+Here is the implementation of the codelet:
+
+\code{.c}
+void func_cpu(void *descr[], void *_args)
+{
+        int *x0 = (int *)STARPU_VARIABLE_GET_PTR(descr[0]);
+        float *x1 = (float *)STARPU_VARIABLE_GET_PTR(descr[1]);
+        int ifactor;
+        float ffactor;
+
+        starpu_codelet_unpack_args(_args, &ifactor, &ffactor);
+        *x0 = *x0 * ifactor;
+        *x1 = *x1 * ffactor;
+}
+
+struct starpu_codelet mycodelet = {
+        .where = STARPU_CPU,
+        .cpu_funcs = { func_cpu, NULL },
+        .cpu_funcs_name = { "func_cpu", NULL },
+        .nbuffers = 2,
+        .modes = { STARPU_RW, STARPU_RW }
+};
+\endcode
+
+And the call to the function starpu_task_insert():
+
+\code{.c}
+starpu_task_insert(&mycodelet,
+                   STARPU_VALUE, &ifactor, sizeof(ifactor),
+                   STARPU_VALUE, &ffactor, sizeof(ffactor),
+                   STARPU_RW, data_handles[0], STARPU_RW, data_handles[1],
+                   0);
+\endcode
+
+The call to starpu_task_insert() is equivalent to the following
+code:
+
+\code{.c}
+struct starpu_task *task = starpu_task_create();
+task->cl = &mycodelet;
+task->handles[0] = data_handles[0];
+task->handles[1] = data_handles[1];
+char *arg_buffer;
+size_t arg_buffer_size;
+starpu_codelet_pack_args(&arg_buffer, &arg_buffer_size,
+                    STARPU_VALUE, &ifactor, sizeof(ifactor),
+                    STARPU_VALUE, &ffactor, sizeof(ffactor),
+                    0);
+task->cl_arg = arg_buffer;
+task->cl_arg_size = arg_buffer_size;
+int ret = starpu_task_submit(task);
+\endcode
+
+Here is a similar call using ::STARPU_DATA_ARRAY.
+
+\code{.c}
+starpu_task_insert(&mycodelet,
+                   STARPU_DATA_ARRAY, data_handles, 2,
+                   STARPU_VALUE, &ifactor, sizeof(ifactor),
+                   STARPU_VALUE, &ffactor, sizeof(ffactor),
+                   0);
+\endcode
+
+If some part of the task insertion depends on the value of some computation,
+the macro ::STARPU_DATA_ACQUIRE_CB can be very convenient. For
+instance, assuming that the index variable <c>i</c> was registered as handle
+<c>A_handle[i]</c>:
+
+\code{.c}
+/* Compute which portion we will work on, e.g. pivot */
+starpu_task_insert(&which_index, STARPU_W, i_handle, 0);
+
+/* And submit the corresponding task */
+STARPU_DATA_ACQUIRE_CB(i_handle, STARPU_R,
+                       starpu_task_insert(&work, STARPU_RW, A_handle[i], 0));
+\endcode
+
+The macro ::STARPU_DATA_ACQUIRE_CB submits an asynchronous request for
+acquiring data <c>i</c> for the main application, and will execute the code
+given as third parameter when it is acquired. In other words, as soon as the
+value of <c>i</c> computed by the codelet <c>which_index</c> can be read, the
+portion of code passed as third parameter of ::STARPU_DATA_ACQUIRE_CB will
+be executed, and is allowed to read from <c>i</c> to use it e.g. as an
+index. Note that this macro is only available when compiling StarPU with
+the compiler <c>gcc</c>.
+
+\section ParallelTasks Parallel Tasks
+
+StarPU can leverage existing parallel computation libraries by the means of
+parallel tasks. A parallel task is a task which gets worked on by a set of CPUs
+(called a parallel or combined worker) at the same time, by using an existing
+parallel CPU implementation of the computation to be achieved. This can also be
+useful to improve the load balance between slow CPUs and fast GPUs: since CPUs
+work collectively on a single task, the completion time of tasks on CPUs becomes
+comparable to the completion time on GPUs, thus alleviating granularity
+discrepancy concerns. <c>hwloc</c> support needs to be enabled to get
+good performance, otherwise StarPU will not know how to better group
+cores.
+
+Two modes of execution exist to accommodate existing usages.
+
+\subsection Fork-modeParallelTasks Fork-mode Parallel Tasks
+
+In the Fork mode, StarPU will call the codelet function on one
+of the CPUs of the combined worker. The codelet function can use
+starpu_combined_worker_get_size() to get the number of threads it is
+allowed to start to achieve the computation. The CPU binding mask for the whole
+set of CPUs is already enforced, so that threads created by the function will
+inherit the mask, and thus execute where StarPU expected, the OS being in charge
+of choosing how to schedule threads on the corresponding CPUs. The application
+can also choose to bind threads by hand, using e.g. sched_getaffinity to know
+the CPU binding mask that StarPU chose.
+
+For instance, using OpenMP (full source is available in
+<c>examples/openmp/vector_scal.c</c>):
+
+\snippet forkmode.c To be included. You should update doxygen if you see this text.
+
+Other examples include for instance calling a BLAS parallel CPU implementation
+(see <c>examples/mult/xgemm.c</c>).
+
+\subsection SPMD-modeParallelTasks SPMD-mode Parallel Tasks
+
+In the SPMD mode, StarPU will call the codelet function on
+each CPU of the combined worker. The codelet function can use
+starpu_combined_worker_get_size() to get the total number of CPUs
+involved in the combined worker, and thus the number of calls that are made in
+parallel to the function, and starpu_combined_worker_get_rank() to get
+the rank of the current CPU within the combined worker. For instance:
+
+\code{.c}
+static void func(void *buffers[], void *_args)
+{
+    unsigned i;
+    float *factor = _args;
+    struct starpu_vector_interface *vector = buffers[0];
+    unsigned n = STARPU_VECTOR_GET_NX(vector);
+    float *val = (float *)STARPU_VECTOR_GET_PTR(vector);
+
+    /* Compute slice to compute */
+    unsigned m = starpu_combined_worker_get_size();
+    unsigned j = starpu_combined_worker_get_rank();
+    unsigned slice = (n+m-1)/m;
+
+    for (i = j * slice; i < (j+1) * slice && i < n; i++)
+        val[i] *= *factor;
+}
+
+static struct starpu_codelet cl =
+{
+    .modes = { STARPU_RW },
+    .where = STARPU_CPU,
+    .type = STARPU_SPMD,
+    .max_parallelism = INT_MAX,
+    .cpu_funcs = { func, NULL },
+    .cpu_funcs_name = { "func", NULL },
+    .nbuffers = 1,
+};
+\endcode
+
+Of course, this trivial example will not really benefit from parallel task
+execution, and was only meant to be simple to understand.  The benefit comes
+when the computation to be done is such that threads have to e.g. exchange
+intermediate results, or write to the data in a complex but safe way within the
+same buffer.
+
+\subsection ParallelTasksPerformance Parallel Tasks Performance
+
+To benefit from parallel tasks, a parallel-task-aware StarPU scheduler has to
+be used. When exposed to codelets with a flag ::STARPU_FORKJOIN or
+::STARPU_SPMD, the schedulers <c>pheft</c> (parallel-heft) and <c>peager</c>
+(parallel eager) will indeed also try to execute tasks with
+several CPUs. They will automatically try the various available combined
+worker sizes (making several measurements for each worker size) and
+will thus be able to avoid choosing a large combined worker if the codelet
+does not actually scale so well.
+
+\subsection CombinedWorkers Combined Workers
+
+By default, StarPU creates combined workers according to the architecture
+structure as detected by <c>hwloc</c>. It means that for each object of the <c>hwloc</c>
+topology (NUMA node, socket, cache, ...) a combined worker will be created. If
+some nodes of the hierarchy have a big arity (e.g. many cores in a socket
+without a hierarchy of shared caches), StarPU will create combined workers of
+intermediate sizes. The variable \ref
+STARPU_SYNTHESIZE_ARITY_COMBINED_WORKER permits tuning the maximum
+arity between levels of combined workers.
+
+The combined workers actually produced can be seen in the output of the
+tool <c>starpu_machine_display</c> (the environment variable \ref
+STARPU_SCHED has to be set to a combined worker-aware scheduler such
+as <c>pheft</c> or <c>peager</c>).
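+
+For instance:
+
+\verbatim
+$ STARPU_SCHED=pheft starpu_machine_display
+\endverbatim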
+
+\subsection ConcurrentParallelTasks Concurrent Parallel Tasks
+
+Unfortunately, many environments and libraries do not support concurrent
+calls.
+
+For instance, most OpenMP implementations (including the main ones) do not
+support concurrent <c>pragma omp parallel</c> statements without nesting them in
+another <c>pragma omp parallel</c> statement, but StarPU does not yet support
+creating its CPU workers by using such a pragma.
+
+Other parallel libraries are also not safe when being invoked concurrently
+from different threads, due to the use of global variables in their sequential
+sections for instance.
+
+The solution is then to use only one combined worker at a time.  This can be
+done by setting the field starpu_conf::single_combined_worker to <c>1</c>, or
+setting the environment variable \ref STARPU_SINGLE_COMBINED_WORKER
+to <c>1</c>. StarPU will then run only one parallel task at a time (but other
+CPU and GPU tasks are not affected and can be run concurrently). The parallel
+task scheduler will however still try varying combined worker
+sizes to look for the most efficient ones.
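+
+A minimal sketch of the programmatic variant:
+
+\code{.c}
+struct starpu_conf conf;
+starpu_conf_init(&conf);
+/* run only one parallel task at a time */
+conf.single_combined_worker = 1;
+starpu_init(&conf);
+\endcode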
+
+
+*/

+ 508 - 0
doc/doxygen/chapters/07data_management.doxy

@@ -0,0 +1,508 @@
+/*
+ * This file is part of the StarPU Handbook.
+ * Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011, 2012 Institut National de Recherche en Informatique et Automatique
+ * See the file version.doxy for copying conditions.
+ */
+
+/*! \page DataManagement Data Management
+
+This chapter presents how StarPU manages data, and notably how coherency is
+maintained between the different replicates of a piece of data.
+
+\section DataManagementBasics Data Management
+
+When the application allocates data, whenever possible it should use
+the function starpu_malloc(), which will ask CUDA or OpenCL to make
+the allocation itself and pin the corresponding allocated memory. This
+is needed to permit asynchronous data transfer, i.e. permit data
+transfer to overlap with computations. Otherwise, the trace will show
+that the <c>DriverCopyAsync</c> state takes a lot of time: this is
+because CUDA or OpenCL then reverts to synchronous transfers.
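+
+For instance, a minimal sketch (<c>NX</c> being a hypothetical element count):
+
+\code{.c}
+float *vector;
+/* pinned allocation, suitable for asynchronous transfers */
+starpu_malloc((void **)&vector, NX * sizeof(vector[0]));
+/* ... register the buffer, submit tasks, unregister ... */
+starpu_free(vector);
+\endcode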
+
+By default, StarPU leaves replicates of data wherever they were used, in case they
+will be re-used by other tasks, thus saving the data transfer time. When some
+task modifies some data, all the other replicates are invalidated, and only the
+processing unit which ran that task will have a valid replicate of the data. If the application knows
+that this data will not be re-used by further tasks, it should advise StarPU to
+immediately replicate it to a desired list of memory nodes (given through a
+bitmask). This can be understood like the write-through mode of CPU caches.
+
+\code{.c}
+starpu_data_set_wt_mask(img_handle, 1<<0);
+\endcode
+
+will for instance request to always automatically transfer a replicate into the
+main memory (node <c>0</c>), as bit <c>0</c> of the write-through bitmask is being set.
+
+\code{.c}
+starpu_data_set_wt_mask(img_handle, ~0U);
+\endcode
+
+will request to always automatically broadcast the updated data to all memory
+nodes.
+
+Setting the write-through mask to <c>~0U</c> can also be useful to make sure all
+memory nodes always have a copy of the data, so that it is never evicted when
+memory gets scarce.
+
+Implicit data dependency computation can become expensive if a lot
+of tasks access the same piece of data. If no dependency is required
+on some piece of data (e.g. because it is only accessed in read-only
+mode, or because write accesses are actually commutative), use the
+function starpu_data_set_sequential_consistency_flag() to disable
+implicit dependencies on that data.
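+
+For instance, a sketch for a hypothetical read-only lookup table:
+
+\code{.c}
+/* table_handle is only ever accessed in STARPU_R mode, so implicit
+ * dependencies can safely be disabled on it */
+starpu_data_set_sequential_consistency_flag(table_handle, 0);
+\endcode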
+
+In the same vein, accumulation of results in the same data can become a
+bottleneck. The use of the mode ::STARPU_REDUX permits optimizing such
+accumulation (see \ref DataReduction). To a lesser extent, the use of
+the flag ::STARPU_COMMUTE keeps the bottleneck, but at least permits
+the accumulation to happen in any order.
+
+Applications often need a piece of data just for temporary results.  In such a
+case, registration can be made without an initial value; for instance this
+produces a vector data:
+
+\code{.c}
+starpu_vector_data_register(&handle, -1, 0, n, sizeof(float));
+\endcode
+
+StarPU will then allocate the actual buffer only when it is actually needed,
+e.g. directly on the GPU without allocating in main memory.
+
+In the same vein, once the temporary results are not useful any more, the
+data should be thrown away. If the handle is not to be reused, it can be
+unregistered:
+
+\code{.c}
+starpu_data_unregister_submit(handle);
+\endcode
+
+actual unregistration will be done after all tasks working on the handle
+terminate.
+
+If the handle is to be reused, instead of unregistering it, it can simply be invalidated:
+
+\code{.c}
+starpu_data_invalidate_submit(handle);
+\endcode
+
+the buffers containing the current value will then be freed, and reallocated
+only when another task writes some value to the handle.
+
+\section DataPrefetch Data Prefetch
+
+The scheduling policies <c>heft</c>, <c>dmda</c> and <c>pheft</c>
+perform data prefetch (see \ref STARPU_PREFETCH):
+as soon as a scheduling decision is taken for a task, requests are issued to
+transfer its required data to the target processing unit, if needed, so that
+when the processing unit actually starts the task, its data will hopefully be
+already available and it will not have to wait for the transfer to finish.
+
+The application may want to perform some manual prefetching, for several reasons
+such as excluding initial data transfers from performance measurements, or
+setting up an initial statically-computed data distribution on the machine
+before submitting tasks, which will thus guide StarPU toward an initial task
+distribution (since StarPU will try to avoid further transfers).
+
+This can be achieved by giving the function starpu_data_prefetch_on_node()
+the handle and the desired target memory node.
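+
+For instance, a sketch which prefetches a handle to the memory node of the
+first CUDA worker (handle name hypothetical):
+
+\code{.c}
+int workerid = starpu_worker_get_by_type(STARPU_CUDA_WORKER, 0);
+unsigned node = starpu_worker_get_memory_node(workerid);
+starpu_data_prefetch_on_node(handle, node, 0);
+\endcode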
+
+\section PartitioningData Partitioning Data
+
+An existing piece of data can be partitioned in sub parts to be used by different tasks, for instance:
+
+\code{.c}
+int vector[NX];
+starpu_data_handle_t handle;
+
+/* Declare data to StarPU */
+starpu_vector_data_register(&handle, STARPU_MAIN_RAM, (uintptr_t)vector,
+                            NX, sizeof(vector[0]));
+
+/* Partition the vector in PARTS sub-vectors */
+starpu_data_filter f =
+{
+    .filter_func = starpu_vector_filter_block,
+    .nchildren = PARTS
+};
+starpu_data_partition(handle, &f);
+\endcode
+
+The task submission then uses the function starpu_data_get_sub_data()
+to retrieve the sub-handles to be passed as tasks parameters.
+
+\code{.c}
+/* Submit a task on each sub-vector */
+for (i=0; i<starpu_data_get_nb_children(handle); i++) {
+    /* Get subdata number i (there is only 1 dimension) */
+    starpu_data_handle_t sub_handle = starpu_data_get_sub_data(handle, 1, i);
+    struct starpu_task *task = starpu_task_create();
+
+    task->handles[0] = sub_handle;
+    task->cl = &cl;
+    task->synchronous = 1;
+    task->cl_arg = &factor;
+    task->cl_arg_size = sizeof(factor);
+
+    starpu_task_submit(task);
+}
+\endcode
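+
+Once the tasks working on the sub-vectors have completed, the pieces can be
+gathered back into the initial buffer:
+
+\code{.c}
+/* collect the sub-vectors back to main memory */
+starpu_data_unpartition(handle, STARPU_MAIN_RAM);
+\endcode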
+
+Partitioning can be applied several times, see
+<c>examples/basic_examples/mult.c</c> and <c>examples/filters/</c>.
+
+Wherever the whole piece of data is already available, the partitioning will
+be done in-place, i.e. without allocating new buffers but just using pointers
+inside the existing copy. This is particularly important to be aware of when
+using OpenCL, where the kernel parameters are not pointers, but handles. The
+kernel thus needs to be also passed the offset within the OpenCL buffer:
+
+\code{.c}
+void opencl_func(void *buffers[], void *cl_arg)
+{
+    cl_mem vector = (cl_mem) STARPU_VECTOR_GET_DEV_HANDLE(buffers[0]);
+    unsigned offset = STARPU_VECTOR_GET_OFFSET(buffers[0]);
+
+    ...
+    clSetKernelArg(kernel, 0, sizeof(vector), &vector);
+    clSetKernelArg(kernel, 1, sizeof(offset), &offset);
+    ...
+}
+\endcode
+
+And the kernel has to shift from the pointer passed by the OpenCL driver:
+
+\code{.c}
+__kernel void opencl_kernel(__global int *vector, unsigned offset)
+{
+    vector = (__global int *)((__global char *)vector + offset);
+    ...
+}
+\endcode
+
+StarPU provides various interfaces and filters for matrices, vectors, etc.,
+but applications can also write their own data interfaces and filters, see
+<c>examples/interface</c> and <c>examples/filters/custom_mf</c> for an example.
+
+\section DataReduction Data Reduction
+
+In various cases, some piece of data is used to accumulate intermediate
+results. For instance, the dot product of a vector, maximum/minimum finding,
+the histogram of a photograph, etc. When these results are produced over the
+whole machine, it would not be efficient to accumulate them in only one place,
+incurring data transmission each time and access concurrency.
+
+StarPU provides a mode ::STARPU_REDUX, which permits optimizing
+that case: it will allocate a buffer on each memory node, and accumulate
+intermediate results there. When the data is eventually accessed in the normal
+mode ::STARPU_R, StarPU will collect the intermediate results in just one
+buffer.
+
+For this to work, the user has to use the function
+starpu_data_set_reduction_methods() to declare how to initialize these
+buffers, and how to assemble partial results.
+
+For instance, <c>cg</c> uses that to optimize its dot product: it first defines
+the codelets for initialization and reduction:
+
+\code{.c}
+struct starpu_codelet bzero_variable_cl =
+{
+        .cpu_funcs = { bzero_variable_cpu, NULL },
+        .cpu_funcs_name = { "bzero_variable_cpu", NULL },
+        .cuda_funcs = { bzero_variable_cuda, NULL },
+        .nbuffers = 1,
+};
+
+static void accumulate_variable_cpu(void *descr[], void *cl_arg)
+{
+        double *v_dst = (double *)STARPU_VARIABLE_GET_PTR(descr[0]);
+        double *v_src = (double *)STARPU_VARIABLE_GET_PTR(descr[1]);
+        *v_dst = *v_dst + *v_src;
+}
+
+static void accumulate_variable_cuda(void *descr[], void *cl_arg)
+{
+        double *v_dst = (double *)STARPU_VARIABLE_GET_PTR(descr[0]);
+        double *v_src = (double *)STARPU_VARIABLE_GET_PTR(descr[1]);
+        cublasaxpy(1, (double)1.0, v_src, 1, v_dst, 1);
+        cudaStreamSynchronize(starpu_cuda_get_local_stream());
+}
+
+struct starpu_codelet accumulate_variable_cl =
+{
+        .cpu_funcs = { accumulate_variable_cpu, NULL },
+        .cpu_funcs_name = { "accumulate_variable_cpu", NULL },
+        .cuda_funcs = { accumulate_variable_cuda, NULL },
+        .nbuffers = 2,
+};
+\endcode
+
+and attaches them as reduction methods for its handle <c>dtq</c>:
+
+\code{.c}
+starpu_variable_data_register(&dtq_handle, -1, NULL, sizeof(type));
+starpu_data_set_reduction_methods(dtq_handle,
+        &accumulate_variable_cl, &bzero_variable_cl);
+\endcode
+
+and <c>dtq_handle</c> can now be used in mode ::STARPU_REDUX for the
+dot products with partitioned vectors:
+
+\code{.c}
+for (b = 0; b < nblocks; b++)
+    starpu_task_insert(&dot_kernel_cl,
+        STARPU_REDUX, dtq_handle,
+        STARPU_R, starpu_data_get_sub_data(v1, 1, b),
+        STARPU_R, starpu_data_get_sub_data(v2, 1, b),
+        0);
+\endcode
+
+During registration, we have here provided <c>NULL</c>, i.e. there is
+no initial value to be taken into account during reduction. StarPU
+will thus only take into account the contributions from the tasks
+<c>dot_kernel_cl</c>. Also, it will not allocate any memory for
+<c>dtq_handle</c> before tasks <c>dot_kernel_cl</c> are ready to run.
+
+If another dot product has to be performed, one could unregister
+<c>dtq_handle</c>, and re-register it. But one can also call
+starpu_data_invalidate_submit() with the parameter <c>dtq_handle</c>,
+which will clear all data from the handle, thus resetting it back to
+the initial status <c>register(NULL)</c>.
+
+The example <c>cg</c> also uses reduction for the blocked gemv kernel,
+leading to yet more relaxed dependencies and more parallelism.
+
+::STARPU_REDUX can also be passed to starpu_mpi_task_insert() in the MPI
+case. That will however not produce any MPI communication, but just pass
+::STARPU_REDUX to the underlying starpu_task_insert(). It is up to the
+application to call starpu_mpi_redux_data(), which posts tasks that will
+reduce the partial results among MPI nodes into the MPI node which owns the
+data. For instance, some hypothetical application which collects partial results
+into data <c>res</c>, then uses it for other computation, before looping again
+with a new reduction:
+
+\code{.c}
+for (i = 0; i < 100; i++) {
+    starpu_mpi_task_insert(MPI_COMM_WORLD, &init_res, STARPU_W, res, 0);
+    starpu_mpi_task_insert(MPI_COMM_WORLD, &work, STARPU_RW, A,
+               STARPU_R, B, STARPU_REDUX, res, 0);
+    starpu_mpi_redux_data(MPI_COMM_WORLD, res);
+    starpu_mpi_task_insert(MPI_COMM_WORLD, &work2, STARPU_RW, B, STARPU_R, res, 0);
+}
+\endcode
+
+\section TemporaryBuffers Temporary Buffers
+
+There are two kinds of temporary buffers: temporary data which just pass results
+from a task to another, and scratch data which are needed only internally by
+tasks.
+
+\subsection TemporaryData Temporary Data
+
+Data can sometimes be entirely produced by a task, and entirely consumed by
+another task, without the need for other parts of the application to access
+it. In such case, registration can be done without prior allocation, by using
+the special memory node number <c>-1</c>, and passing a zero pointer. StarPU will
+actually allocate memory only when the task creating the content gets scheduled,
+and destroy it on unregistration.
+
+In addition to that, it can be tedious for the application to have to unregister
+the data, since it will not use its content anyway. The unregistration can be
+done lazily by using the function starpu_data_unregister_submit(),
+which will record that no more tasks accessing the handle will be submitted, so
+that it can be freed as soon as the last task accessing it is over.
+
+The following code exemplifies both points: it registers the temporary
+data, submits three tasks accessing it, and records the data for automatic
+unregistration.
+
+\code{.c}
+starpu_vector_data_register(&handle, -1, 0, n, sizeof(float));
+starpu_task_insert(&produce_data, STARPU_W, handle, 0);
+starpu_task_insert(&compute_data, STARPU_RW, handle, 0);
+starpu_task_insert(&summarize_data, STARPU_R, handle, STARPU_W, result_handle, 0);
+starpu_data_unregister_submit(handle);
+\endcode
+
+\subsection ScratchData Scratch Data
+
+Some kernels sometimes need temporary data to achieve the computations, i.e. a
+workspace. The application could allocate it at the start of the codelet
+function, and free it at the end, but that would be costly. It could also
+allocate one buffer per worker (similarly to \ref
+HowToInitializeAComputationLibraryOnceForEachWorker), but that would
+make them systematic and permanent. A more optimized way is to use
+the data access mode ::STARPU_SCRATCH, as exemplified below, which
+provides per-worker buffers without content consistency.
+
+\code{.c}
+/* workspace_size: number of floats needed by the kernel (hypothetical) */
+starpu_vector_data_register(&workspace, -1, 0, workspace_size, sizeof(float));
+for (i = 0; i < N; i++)
+    starpu_task_insert(&compute, STARPU_R, input[i],
+                       STARPU_SCRATCH, workspace, STARPU_W, output[i], 0);
+\endcode
+
+StarPU will make sure that the buffer is allocated before executing the task,
+and make this allocation per-worker: for CPU workers, notably, each worker has
+its own buffer. This means that each task submitted above will have its
+own workspace, which will actually be the same for all tasks running one after
+the other on the same worker. Also, if for instance GPU memory becomes scarce,
+StarPU will notice that it can free such buffers easily, since the content does
+not matter.
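+
+Within the codelet function, the scratch buffer is retrieved like any other
+buffer, in submission order; a minimal sketch matching the insertion loop above:
+
+\code{.c}
+static void compute_func(void *buffers[], void *cl_arg)
+{
+    float *input  = (float *)STARPU_VECTOR_GET_PTR(buffers[0]);
+    float *tmp    = (float *)STARPU_VECTOR_GET_PTR(buffers[1]); /* scratch */
+    float *output = (float *)STARPU_VECTOR_GET_PTR(buffers[2]);
+    ...
+}
+\endcode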
+
+The example <c>examples/pi</c> uses a scratch buffer for some temporary data.
+
+\section TheMultiformatInterface The Multiformat Interface
+
+It may be interesting to represent the same piece of data using two different
+data structures: one that would only be used on CPUs, and one that would only
+be used on GPUs. This can be done by using the multiformat interface. StarPU
+will be able to convert data from one data structure to the other when needed.
+Note that the scheduler <c>dmda</c> is the only one optimized for this
+interface. The user must provide StarPU with conversion codelets:
+
+\snippet multiformat.c To be included. You should update doxygen if you see this text.
+
+Kernels can be written almost as for any other interface. Note that
+::STARPU_MULTIFORMAT_GET_CPU_PTR shall only be used for CPU kernels. CUDA kernels
+must use ::STARPU_MULTIFORMAT_GET_CUDA_PTR, and OpenCL kernels must use
+::STARPU_MULTIFORMAT_GET_OPENCL_PTR. ::STARPU_MULTIFORMAT_GET_NX may
+be used in any kind of kernel.
+
+\code{.c}
+static void
+multiformat_scal_cpu_func(void *buffers[], void *args)
+{
+    struct point *aos;
+    unsigned int n;
+
+    aos = STARPU_MULTIFORMAT_GET_CPU_PTR(buffers[0]);
+    n = STARPU_MULTIFORMAT_GET_NX(buffers[0]);
+    ...
+}
+
+extern "C" void multiformat_scal_cuda_func(void *buffers[], void *_args)
+{
+    unsigned int n;
+    struct struct_of_arrays *soa;
+
+    soa = (struct struct_of_arrays *) STARPU_MULTIFORMAT_GET_CUDA_PTR(buffers[0]);
+    n = STARPU_MULTIFORMAT_GET_NX(buffers[0]);
+
+    ...
+}
+\endcode
+
+A full example may be found in <c>examples/basic_examples/multiformat.c</c>.
+
+\section DefiningANewDataInterface Defining A New Data Interface
+
+Let's define a new data interface to manage complex numbers.
+
+\code{.c}
+/* interface for complex numbers */
+struct starpu_complex_interface
+{
+        double *real;
+        double *imaginary;
+        int nx;
+};
+\endcode
+
+Registering such data to StarPU is easily done using the function
+starpu_data_register(). The last
+parameter of the function, <c>interface_complex_ops</c>, will be
+described below.
+
+\code{.c}
+void starpu_complex_data_register(starpu_data_handle_t *handle,
+     unsigned home_node, double *real, double *imaginary, int nx)
+{
+        struct starpu_complex_interface complex =
+        {
+                .real = real,
+                .imaginary = imaginary,
+                .nx = nx
+        };
+
+        if (interface_complex_ops.interfaceid == STARPU_UNKNOWN_INTERFACE_ID)
+        {
+                interface_complex_ops.interfaceid = starpu_data_interface_get_next_id();
+        }
+
+        starpu_data_register(handle, home_node, &complex, &interface_complex_ops);
+}
+\endcode
+
+Different operations need to be defined for a data interface through
+the type starpu_data_interface_ops. We only define here the basic
+operations needed to run simple applications. The source code for the
+different functions can be found in the file
+<c>examples/interface/complex_interface.c</c>.
+
+\code{.c}
+static struct starpu_data_interface_ops interface_complex_ops =
+{
+        .register_data_handle = complex_register_data_handle,
+        .allocate_data_on_node = complex_allocate_data_on_node,
+        .copy_methods = &complex_copy_methods,
+        .get_size = complex_get_size,
+        .footprint = complex_footprint,
+        .interfaceid = STARPU_UNKNOWN_INTERFACE_ID,
+        .interface_size = sizeof(struct starpu_complex_interface),
+};
+\endcode
+
+Functions need to be defined to access the different fields of the
+complex interface from a StarPU data handle.
+
+\code{.c}
+double *starpu_complex_get_real(starpu_data_handle_t handle)
+{
+        struct starpu_complex_interface *complex_interface =
+          (struct starpu_complex_interface *) starpu_data_get_interface_on_node(handle, STARPU_MAIN_RAM);
+        return complex_interface->real;
+}
+
+double *starpu_complex_get_imaginary(starpu_data_handle_t handle);
+int starpu_complex_get_nx(starpu_data_handle_t handle);
+\endcode
+
+Similar functions need to be defined to access the different fields of the
+complex interface from a <c>void *</c> pointer to be used within codelet
+implementations.
+
+\snippet complex.c To be included. You should update doxygen if you see this text.
+
+Complex data interfaces can then be registered to StarPU.
+
+\code{.c}
+double real = 45.0;
+double imaginary = 12.0;
+starpu_complex_data_register(&handle1, STARPU_MAIN_RAM, &real, &imaginary, 1);
+starpu_task_insert(&cl_display, STARPU_R, handle1, 0);
+\endcode
+
+and used by codelets.
+
+\code{.c}
+void display_complex_codelet(void *descr[], __attribute__ ((unused)) void *_args)
+{
+        int nx = STARPU_COMPLEX_GET_NX(descr[0]);
+        double *real = STARPU_COMPLEX_GET_REAL(descr[0]);
+        double *imaginary = STARPU_COMPLEX_GET_IMAGINARY(descr[0]);
+        int i;
+
+        for(i=0 ; i<nx ; i++)
+        {
+                fprintf(stderr, "Complex[%d] = %3.2f + %3.2f i\n", i, real[i], imaginary[i]);
+        }
+}
+\endcode
+
+The whole code for this complex data interface is available in the
+directory <c>examples/interface/</c>.
+
+
+
+*/

+ 151 - 0
doc/doxygen/chapters/08scheduling.doxy

@@ -0,0 +1,151 @@
+/*
+ * This file is part of the StarPU Handbook.
+ * Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011, 2012 Institut National de Recherche en Informatique et Automatique
+ * See the file version.doxy for copying conditions.
+ */
+
+/*! \page Scheduling Scheduling
+
+\section TaskSchedulingPolicy Task Scheduling Policy
+
+By default, StarPU uses the simple greedy scheduler <c>eager</c>. This is
+because it provides correct load balance even if the application codelets do not
+have performance models. If your application codelets have performance models
+(\ref PerformanceModelExample), you should change the scheduler thanks
+to the environment variable \ref STARPU_SCHED. For instance <c>export
+STARPU_SCHED=dmda</c>. Use <c>export STARPU_SCHED=help</c> to get the list of available schedulers.
+
+The <b>eager</b> scheduler uses a central task queue, from which workers draw tasks
+to work on. This however does not permit prefetching data, since the scheduling
+decision is taken late. If a task has a non-zero priority, it is put at the front of the queue.
+
+The <b>prio</b> scheduler also uses a central task queue, but sorts tasks by
+priority (between -5 and 5).
+
+The <b>random</b> scheduler distributes tasks randomly according to assumed worker
+overall performance.
+
+The <b>ws</b> (work stealing) scheduler schedules tasks on the local worker by
+default. When a worker becomes idle, it steals a task from the most loaded
+worker.
+
+The <b>dm</b> (deque model) scheduler takes task execution performance models into account to
+perform an HEFT-similar scheduling strategy: it schedules tasks where their
+termination time will be minimal.
+
+The <b>dmda</b> (deque model data aware) scheduler is similar to <b>dm</b>; it also takes
+data transfer time into account.
+
+The <b>dmdar</b> (deque model data aware ready) scheduler is similar to <b>dmda</b>;
+it also sorts tasks on per-worker queues by number of already-available data
+buffers.
+
+The <b>dmdas</b> (deque model data aware sorted) scheduler is similar to <b>dmda</b>; it
+also supports arbitrary priority values.
+
+The <b>heft</b> (heterogeneous earliest finish time) scheduler is deprecated. It
+is now just an alias for <b>dmda</b>.
+
+The <b>pheft</b> (parallel HEFT) scheduler is similar to <b>heft</b>; it also supports
+parallel tasks (still experimental). It should not be used when several contexts
+using it are being executed simultaneously.
+
+The <b>peager</b> (parallel eager) scheduler is similar to <b>eager</b>; it also
+supports parallel tasks (still experimental). It should not be used when several
+contexts using it are being executed simultaneously.
+
+\section TaskDistributionVsDataTransfer Task Distribution Vs Data Transfer
+
+Distributing tasks to balance the load induces data transfer penalty. StarPU
+thus needs to find a balance between both. The target function that the
+scheduler <c>dmda</c> of StarPU
+tries to minimize is <c>alpha * T_execution + beta * T_data_transfer</c>, where
+<c>T_execution</c> is the estimated execution time of the codelet (usually
+accurate), and <c>T_data_transfer</c> is the estimated data transfer time. The
+latter is estimated based on bus calibration before execution start,
+i.e. with an idle machine, thus without contention. You can force bus
+re-calibration by running the tool <c>starpu_calibrate_bus</c>. The
+beta parameter defaults to <c>1</c>, but it can be worth trying to tweak it
+by using <c>export STARPU_SCHED_BETA=2</c> for instance, since during
+real application execution, contention makes transfer times bigger.
+This is of course imprecise, but in practice a rough estimation
+already gives results close to those a precise estimation would give.
+
+\section Power-basedScheduling Power-based Scheduling
+
+If the application can provide some power performance model (through
+the field starpu_codelet::power_model), StarPU will
+take it into account when distributing tasks. The target function that
+the scheduler <c>dmda</c> minimizes becomes <c>alpha * T_execution +
+beta * T_data_transfer + gamma * Consumption</c> , where <c>Consumption</c>
+is the estimated task consumption in Joules. To tune this parameter, use
+<c>export STARPU_SCHED_GAMMA=3000</c> for instance, to express that each Joule
+(i.e. 1 kW during 1000 us) is worth 3000 us of execution time penalty. Setting
+<c>alpha</c> and <c>beta</c> to zero permits taking only power consumption into account.
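+
+Declaring a power model is done like for execution time models; a minimal
+sketch (symbol names hypothetical):
+
+\code{.c}
+static struct starpu_perfmodel power_model =
+{
+    .type = STARPU_HISTORY_BASED,
+    /* must differ from the execution time model's symbol */
+    .symbol = "my_codelet_power"
+};
+
+/* in the codelet: .model = &time_model, .power_model = &power_model */
+\endcode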
+
+This is however not sufficient to correctly optimize power: the scheduler would
+simply tend to run all computations on the most energy-conservative processing
+unit. To account for the consumption of the whole machine (including idle
+processing units), the idle power of the machine should be given by setting
+<c>export STARPU_IDLE_POWER=200</c> for 200W, for instance. This value can often
+be obtained from the machine power supplier.
+
+The power actually consumed by the total execution can be displayed by setting
+<c>export STARPU_PROFILING=1 STARPU_WORKER_STATS=1</c> .
+
+On-line task consumption measurement is currently only supported through the
+<c>CL_PROFILING_POWER_CONSUMED</c> OpenCL extension, implemented in the MoviSim
+simulator. Applications can however provide explicit measurements by
+using the function starpu_perfmodel_update_history() (exemplified in \ref PerformanceModelExample
+with the <c>power_model</c> performance model). Fine-grain
+measurement is often not feasible with the feedback provided by the hardware, so
+the user can for instance run a given task a thousand times, measure the global
+consumption for that series of tasks, divide it by a thousand, repeat for
+varying kinds of tasks and task sizes, and eventually feed StarPU
+with these manual measurements through starpu_perfmodel_update_history().
+
+\section StaticScheduling Static Scheduling
+
+In some cases, one may want to force some scheduling, for instance force a given
+set of tasks to GPU0, another set to GPU1, etc. while letting some other tasks
+be scheduled on any other device. This can indeed be useful to guide StarPU into
+some work distribution, while still letting some degree of dynamism. For
+instance, to force execution of a task on CUDA0:
+
+\code{.c}
+task->execute_on_a_specific_worker = 1;
+task->worker = starpu_worker_get_by_type(STARPU_CUDA_WORKER, 0);
+\endcode
+
+Note however that using scheduling contexts while statically scheduling tasks on workers
+could be tricky. Be careful to schedule the tasks exactly on the workers of the corresponding
+contexts, otherwise the workers' corresponding scheduling structures may not be allocated or
+the execution of the application may deadlock. Moreover, the hypervisor should not be used when
+statically scheduling tasks.
+
+\section DefiningANewSchedulingPolicy Defining A New Scheduling Policy
+
+A full example showing how to define a new scheduling policy is available in
+the StarPU sources in the directory <c>examples/scheduler/</c>.
+
+See \ref API_Scheduling_Policy
+
+\code{.c}
+static struct starpu_sched_policy dummy_sched_policy = {
+    .init_sched = init_dummy_sched,
+    .deinit_sched = deinit_dummy_sched,
+    .add_workers = dummy_sched_add_workers,
+    .remove_workers = dummy_sched_remove_workers,
+    .push_task = push_task_dummy,
+    .push_prio_task = NULL,
+    .pop_task = pop_task_dummy,
+    .post_exec_hook = NULL,
+    .pop_every_task = NULL,
+    .policy_name = "dummy",
+    .policy_description = "dummy scheduling strategy"
+};
+\endcode
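+
+A sketch of a matching push/pop pair (a single shared list; the actual example
+additionally handles mutexes, conditions and worker wake-up):
+
+\code{.c}
+static struct starpu_task_list sched_list;
+
+static int push_task_dummy(struct starpu_task *task)
+{
+    /* queue the task for any worker to pick up */
+    starpu_task_list_push_front(&sched_list, task);
+    return 0;
+}
+
+static struct starpu_task *pop_task_dummy(unsigned sched_ctx_id)
+{
+    /* called by each idle worker to fetch some work */
+    return starpu_task_list_pop_back(&sched_list);
+}
+\endcode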
+
+*/

doc/doxygen/chapters/13scheduling_contexts.doxy → doc/doxygen/chapters/09scheduling_contexts.doxy


doc/doxygen/chapters/14scheduling_context_hypervisor.doxy → doc/doxygen/chapters/10scheduling_context_hypervisor.doxy


+ 42 - 0
doc/doxygen/chapters/11debugging_tools.doxy

@@ -0,0 +1,42 @@
+/*
+ * This file is part of the StarPU Handbook.
+ * Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011, 2012 Institut National de Recherche en Informatique et Automatique
+ * See the file version.doxy for copying conditions.
+ */
+
+/*! \page DebuggingTools Debugging Tools
+
+StarPU provides several tools to help debugging applications. Execution traces
+can be generated and displayed graphically, see \ref
+GeneratingTracesWithFxT. Some gdb helpers are also provided to show
+the whole StarPU state:
+
+\verbatim
+(gdb) source tools/gdbinit
+(gdb) help starpu
+\endverbatim
+
+The Temanejo task debugger can also be used, see \ref UsingTheTemanejoTaskDebugger.
+
+\section UsingTheTemanejoTaskDebugger Using The Temanejo Task Debugger
+
+StarPU can connect to Temanejo >= 1.0rc2 (see
+http://www.hlrs.de/temanejo), to permit
+nice visual task debugging. To do so, build Temanejo's <c>libayudame.so</c>,
+install <c>Ayudame.h</c> to e.g. <c>/usr/local/include</c>, apply the
+patch <c>tools/patch-ayudame</c> to it to fix the C build, re-run <c>./configure</c>, make
+sure that it found it, and rebuild StarPU.  Run the Temanejo GUI, give it the path
+to your application, any options you want to pass it, and the path to <c>libayudame.so</c>.
+
+Make sure to specify at least the same number of CPUs in the dialog box as your
+machine has, otherwise an error will happen during execution. Future versions
+of Temanejo should be able to tell StarPU the number of CPUs to use.
+
+Tag numbers have to be below <c>4000000000000000000ULL</c> to be usable for
+Temanejo (so as to distinguish them from tasks).
+
+
+
+*/

+ 432 - 0
doc/doxygen/chapters/12online_performance_tools.doxy

@@ -0,0 +1,432 @@
+/*
+ * This file is part of the StarPU Handbook.
+ * Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011, 2012 Institut National de Recherche en Informatique et Automatique
+ * See the file version.doxy for copying conditions.
+ */
+
+/*! \page OnlinePerformanceTools Online Performance Tools
+
+\section On-linePerformanceFeedback On-line Performance Feedback
+
+\subsection EnablingOn-linePerformanceMonitoring Enabling On-line Performance Monitoring
+
+In order to enable online performance monitoring, the application can
+call starpu_profiling_status_set() with the parameter
+::STARPU_PROFILING_ENABLE. It is possible to detect whether monitoring
+is already enabled or not by calling starpu_profiling_status_get().
+Enabling monitoring also reinitializes all previously collected
+feedback. The environment variable \ref STARPU_PROFILING can also be
+set to <c>1</c> to achieve the same effect. The function
+starpu_profiling_init() can also be called during the execution to
+reinitialize performance counters and to start the profiling if the
+environment variable \ref STARPU_PROFILING is set to <c>1</c>.
+
+Likewise, performance monitoring is stopped by calling
+starpu_profiling_status_set() with the parameter
+::STARPU_PROFILING_DISABLE. Note that this does not reset the
+performance counters so that the application may consult them later
+on.
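+
+For instance:
+
+\code{.c}
+starpu_profiling_status_set(STARPU_PROFILING_ENABLE);
+/* ... submit tasks and run ... */
+starpu_profiling_status_set(STARPU_PROFILING_DISABLE);
+/* the collected counters remain readable here */
+\endcode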
+
+More details about the performance monitoring API are available in \ref API_Profiling.
+
+\subsection Per-taskFeedback Per-task Feedback
+
+If profiling is enabled, a pointer to a structure
+starpu_profiling_task_info is put in the field
+starpu_task::profiling_info when a task terminates. This structure is
+automatically destroyed when the task structure is destroyed, either
+automatically or by calling starpu_task_destroy().
+
+The structure starpu_profiling_task_info indicates the date when the
+task was submitted (starpu_profiling_task_info::submit_time), started
+(starpu_profiling_task_info::start_time), and terminated
+(starpu_profiling_task_info::end_time), relative to the initialization
+of StarPU with starpu_init(). It also specifies the identifier of the worker
+that has executed the task (starpu_profiling_task_info::workerid).
+These dates are stored as <c>timespec</c> structures which the user may convert
+into micro-seconds using the helper function
+starpu_timing_timespec_to_us().
+
+It is worth noting that the application may directly access this structure from
+the callback executed at the end of the task. The structure starpu_task
+associated to the callback currently being executed is indeed accessible with
+the function starpu_task_get_current().
+
+\subsection Per-codeletFeedback Per-codelet Feedback
+
+The field starpu_codelet::per_worker_stats is
+an array of counters. The i-th entry of the array is incremented every time a
+task implementing the codelet is executed on the i-th worker.
+This array is not reinitialized when profiling is enabled or disabled.
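+
+For instance, a sketch printing these counters for a hypothetical codelet <c>cl</c>:
+
+\code{.c}
+unsigned worker;
+for (worker = 0; worker < starpu_worker_get_count(); worker++)
+    fprintf(stderr, "worker %u executed the codelet %lu times\n",
+            worker, (unsigned long) cl.per_worker_stats[worker]);
+\endcode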
+
+\subsection Per-workerFeedback Per-worker Feedback
+
+The second argument returned by the function
+starpu_profiling_worker_get_info() is a structure
+starpu_profiling_worker_info that gives statistics about the specified
+worker. This structure specifies when StarPU started collecting
+profiling information for that worker
+(starpu_profiling_worker_info::start_time), the
+duration of the profiling measurement interval
+(starpu_profiling_worker_info::total_time), the time spent executing
+kernels (starpu_profiling_worker_info::executing_time), the time
+spent sleeping because there is no task to execute at all
+(starpu_profiling_worker_info::sleeping_time), and the number of tasks that were executed
+while profiling was enabled. These values give an estimation of the
+proportion of time spent doing real work, and the time spent either
+sleeping because there are not enough executable tasks or simply
+wasted in pure StarPU overhead.
+
+Calling starpu_profiling_worker_get_info() resets the profiling
+information associated to a worker.
+
+When an FxT trace is generated (see \ref GeneratingTracesWithFxT), it is also
+possible to use the tool <c>starpu_workers_activity</c> (see \ref
+MonitoringActivity) to generate a graphic showing the evolution of
+these values over time, for the different workers.
+
+\subsection Bus-relatedFeedback Bus-related Feedback
+
+TODO: add \ref STARPU_BUS_STATS
+
+// how to enable/disable performance monitoring
+// what kind of information do we get ?
+
+The bus speed measured by StarPU can be displayed by using the tool
+<c>starpu_machine_display</c>, for instance:
+
+\verbatim
+StarPU has found:
+        3 CUDA devices
+                CUDA 0 (Tesla C2050 02:00.0)
+                CUDA 1 (Tesla C2050 03:00.0)
+                CUDA 2 (Tesla C2050 84:00.0)
+from    to RAM          to CUDA 0       to CUDA 1       to CUDA 2
+RAM     0.000000        5176.530428     5176.492994     5191.710722
+CUDA 0  4523.732446     0.000000        2414.074751     2417.379201
+CUDA 1  4523.718152     2414.078822     0.000000        2417.375119
+CUDA 2  4534.229519     2417.069025     2417.060863     0.000000
+\endverbatim
+
+\subsection StarPU-TopInterface StarPU-Top Interface
+
+StarPU-Top is an interface which remotely displays the on-line state of a StarPU
+application and permits the user to change parameters on the fly.
+
+Variables to be monitored can be registered by calling the functions
+starpu_top_add_data_boolean(), starpu_top_add_data_integer(),
+starpu_top_add_data_float(), e.g.:
+
+\code{.c}
+starpu_top_data *data = starpu_top_add_data_integer("mynum", 0, 100, 1);
+\endcode
+
+The application should then call starpu_top_init_and_wait() to give its name
+and wait for StarPU-Top to get a start request from the user. The name is used
+by StarPU-Top to quickly reload a previously-saved layout of parameter display.
+
+\code{.c}
+starpu_top_init_and_wait("the application");
+\endcode
+
+The new values can then be provided thanks to
+starpu_top_update_data_boolean(), starpu_top_update_data_integer(),
+starpu_top_update_data_float(), e.g.:
+
+\code{.c}
+starpu_top_update_data_integer(data, mynum);
+\endcode
+
+Updateable parameters can be registered thanks to starpu_top_register_parameter_boolean(), starpu_top_register_parameter_integer(), starpu_top_register_parameter_float(), e.g.:
+
+\code{.c}
+float alpha;
+starpu_top_register_parameter_float("alpha", &alpha, 0, 10, modif_hook);
+\endcode
+
+<c>modif_hook</c> is a function which will be called when the parameter is being modified; it can for instance print the new value:
+
+\code{.c}
+void modif_hook(struct starpu_top_param *d) {
+    fprintf(stderr,"%s has been modified: %f\n", d->name, alpha);
+}
+\endcode
+
+Task schedulers should notify StarPU-Top when they have decided when a task will be
+scheduled, so that it can be shown in the Gantt chart, for instance:
+
+\code{.c}
+starpu_top_task_prevision(task, workerid, begin, end);
+\endcode
+
+Starting StarPU-Top (via the binary <c>starpu_top</c>) and the application
+can be done in two ways:
+
+<ul>
+<li> The application is started by hand on some machine (and thus already
+waiting for the start event). In the Preference dialog of StarPU-Top, the SSH
+checkbox should be unchecked, and the hostname and port (default is 2011) on
+which the application is already running should be specified. Clicking on the
+connection button will thus connect to the already-running application.
+</li>
+<li> StarPU-Top is started first, and clicking on the connection button will
+start the application itself (possibly on a remote machine). The SSH checkbox
+should be checked, and a command line provided, e.g.:
+
+\verbatim
+$ ssh myserver STARPU_SCHED=dmda ./application
+\endverbatim
+
+If port 2011 of the remote machine cannot be accessed directly, an SSH port forwarding should be added:
+
+\verbatim
+$ ssh -L 2011:localhost:2011 myserver STARPU_SCHED=dmda ./application
+\endverbatim
+
+and "localhost" should be used as IP Address to connect to.
+</li>
+</ul>
+
+\section TaskAndWorkerProfiling Task And Worker Profiling
+
+A full example showing how to use the profiling API is available in
+the StarPU sources in the directory <c>examples/profiling/</c>.
+
+\code{.c}
+struct starpu_task *task = starpu_task_create();
+task->cl = &cl;
+task->synchronous = 1;
+/* We will destroy the task structure by hand so that we can
+ * query the profiling info before the task is destroyed. */
+task->destroy = 0;
+
+/* Submit and wait for completion (since synchronous was set to 1) */
+starpu_task_submit(task);
+
+/* The task is finished, get profiling information */
+struct starpu_profiling_task_info *info = task->profiling_info;
+
+/* How much time did it take before the task started ? */
+double delay = starpu_timing_timespec_delay_us(&info->submit_time, &info->start_time);
+
+/* How long was the task execution ? */
+double length = starpu_timing_timespec_delay_us(&info->start_time, &info->end_time);
+
+/* We don't need the task structure anymore */
+starpu_task_destroy(task);
+\endcode
+
+\code{.c}
+/* Display the occupancy of all workers during the test */
+int worker;
+for (worker = 0; worker < starpu_worker_get_count(); worker++)
+{
+        struct starpu_profiling_worker_info worker_info;
+        int ret = starpu_profiling_worker_get_info(worker, &worker_info);
+        STARPU_ASSERT(!ret);
+
+        double total_time = starpu_timing_timespec_to_us(&worker_info.total_time);
+        double executing_time = starpu_timing_timespec_to_us(&worker_info.executing_time);
+        double sleeping_time = starpu_timing_timespec_to_us(&worker_info.sleeping_time);
+        double overhead_time = total_time - executing_time - sleeping_time;
+
+        float executing_ratio = 100.0*executing_time/total_time;
+        float sleeping_ratio = 100.0*sleeping_time/total_time;
+        float overhead_ratio = 100.0 - executing_ratio - sleeping_ratio;
+
+        char workername[128];
+        starpu_worker_get_name(worker, workername, 128);
+        fprintf(stderr, "Worker %s:\n", workername);
+        fprintf(stderr, "\ttotal time: %.2lf ms\n", total_time*1e-3);
+        fprintf(stderr, "\texec time: %.2lf ms (%.2f %%)\n",
+                executing_time*1e-3, executing_ratio);
+        fprintf(stderr, "\tblocked time: %.2lf ms (%.2f %%)\n",
+                sleeping_time*1e-3, sleeping_ratio);
+        fprintf(stderr, "\toverhead time: %.2lf ms (%.2f %%)\n",
+                overhead_time*1e-3, overhead_ratio);
+}
+\endcode
+
+\section PerformanceModelExample Performance Model Example
+
+To achieve good scheduling, StarPU scheduling policies need to be able to
+estimate in advance the duration of a task. This is done by giving to codelets
+a performance model, by defining a structure starpu_perfmodel and
+providing its address in the field starpu_codelet::model. The fields
+starpu_perfmodel::symbol and starpu_perfmodel::type are mandatory, to
+give a name to the model, and the type of the model, since there are
+several kinds of performance models. For compatibility, make sure to
+initialize the whole structure to zero, either by using explicit
+memset(), or by letting the compiler implicitly do it as exemplified
+below.
+
+<ul>
+<li>
+Measured at runtime (model type ::STARPU_HISTORY_BASED). This assumes that for a
+given set of data input/output sizes, the performance will always be about the
+same. This is very true for regular kernels on GPUs for instance (<0.1% error),
+and just a bit less true on CPUs (~=1% error). This also assumes that there are
+few different sets of data input/output sizes. StarPU will then keep record of
+the average time of previous executions on the various processing units, and use
+it as an estimation. History is done per task size, by using a hash of the input
+and output sizes as an index.
+It will also save it in <c>$STARPU_HOME/.starpu/sampling/codelets</c>
+for further executions, and can be observed by using the tool
+<c>starpu_perfmodel_display</c>, or drawn by using
+the tool <c>starpu_perfmodel_plot</c> (\ref PerformanceModelCalibration).  The
+models are indexed by machine name. To
+share the models between machines (e.g. for a homogeneous cluster), use
+<c>export STARPU_HOSTNAME=some_global_name</c>. Measurements are only done
+when using a task scheduler which makes use of it, such as
+<c>dmda</c>. Measurements can also be provided explicitly by the application, by
+using the function starpu_perfmodel_update_history().
+
+The following is a small code example.
+
+If e.g. the code is recompiled with other compilation options, or several
+variants of the code are used, the symbol string should be changed to reflect
+that, in order to recalibrate a new model from zero. The symbol string can even
+be constructed dynamically at execution time, as long as this is done before
+submitting any task using it.
+
+\code{.c}
+static struct starpu_perfmodel mult_perf_model = {
+    .type = STARPU_HISTORY_BASED,
+    .symbol = "mult_perf_model"
+};
+
+struct starpu_codelet cl = {
+    .where = STARPU_CPU,
+    .cpu_funcs = { cpu_mult, NULL },
+    .cpu_funcs_name = { "cpu_mult", NULL },
+    .nbuffers = 3,
+    .modes = { STARPU_R, STARPU_R, STARPU_W },
+    /* for the scheduling policy to be able to use performance models */
+    .model = &mult_perf_model
+};
+\endcode
+
+</li>
+<li>
+Measured at runtime and refined by regression (model types
+::STARPU_REGRESSION_BASED and ::STARPU_NL_REGRESSION_BASED). This
+still assumes performance regularity, but works 
+with various data input sizes, by applying regression over observed
+execution times. ::STARPU_REGRESSION_BASED uses an a*n^b regression
+form, ::STARPU_NL_REGRESSION_BASED uses an a*n^b+c (more precise than
+::STARPU_REGRESSION_BASED, but costs a lot more to compute).
+
+For instance,
+<c>tests/perfmodels/regression_based.c</c> uses a regression-based performance
+model for the function memset() (a declaration sketch is given after this list).
+
+Of course, the application has to issue
+tasks with varying size so that the regression can be computed. StarPU will not
+trust the regression unless there is at least 10% difference between the minimum
+and maximum observed input size. It can be useful to set the
+environment variable \ref STARPU_CALIBRATE to <c>1</c> and run the application
+on varying input sizes with \ref STARPU_SCHED set to <c>dmda</c> scheduler,
+so as to feed the performance model for a variety of
+inputs. The application can also provide the measurements explicitly by
+using the function starpu_perfmodel_update_history(). The tools
+<c>starpu_perfmodel_display</c> and <c>starpu_perfmodel_plot</c> can
+be used to observe how much the performance model is calibrated (\ref
+PerformanceModelCalibration); when their output looks good,
+\ref STARPU_CALIBRATE can be reset to <c>0</c> to let
+StarPU use the resulting performance model without recording new measures, and
+\ref STARPU_SCHED can be set to <c>dmda</c> to benefit from the performance models. If
+the data input sizes vary a lot, it is really important to set
+\ref STARPU_CALIBRATE to <c>0</c>, otherwise StarPU will continue adding the
+measures, and end up with a very big performance model, which will take a lot
+of time to load and save.
+
+For non-linear regression, since computing it
+is quite expensive, it is only done at termination of the application. This
+means that the first execution of the application will use only history-based
+performance model to perform scheduling, without using regression.
+</li>
+
+<li>
+Provided as an estimation from the application itself (model type
+::STARPU_COMMON and field starpu_perfmodel::cost_function),
+see for instance
+<c>examples/common/blas_model.h</c> and <c>examples/common/blas_model.c</c>.
+</li>
+
+<li>
+Provided explicitly by the application (model type ::STARPU_PER_ARCH):
+the fields <c>.per_arch[arch][nimpl].cost_function</c> have to be
+filled with pointers to functions which return the expected duration
+of the task in micro-seconds, one per architecture.
+</li>
+</ul>
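+
+For the regression-based types above, the declaration only differs by the
+field starpu_perfmodel::type; a minimal sketch:
+
+\code{.c}
+static struct starpu_perfmodel memset_model =
+{
+    .type = STARPU_NL_REGRESSION_BASED,
+    .symbol = "memset_model"
+};
+\endcode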
+
+For ::STARPU_HISTORY_BASED, ::STARPU_REGRESSION_BASED, and
+::STARPU_NL_REGRESSION_BASED, the total size of task data (both input
+and output) is used as an index by default. The field
+starpu_perfmodel::size_base however permits the application to
+override that, when for instance some of the data do not matter for
+task cost (e.g. mere reference table), or when using sparse
+structures (in which case it is the number of non-zeros which matter), or when
+there is some hidden parameter such as the number of iterations, or when the application
+actually has a very good idea of the complexity of the algorithm, just not of
+the speed of the processor, etc.
+The example in the directory <c>examples/pi</c> uses this to include
+the number of iterations in the base.
+
+StarPU will automatically determine when the performance model is calibrated,
+or rather, it will assume the performance model is calibrated until the
+application submits a task for which the performance can not be predicted. For
+::STARPU_HISTORY_BASED, StarPU will require 10 (_STARPU_CALIBRATION_MINIMUM)
+measurements for a given size before estimating that an average can be taken as
+estimation for further executions with the same size. For
+::STARPU_REGRESSION_BASED and ::STARPU_NL_REGRESSION_BASED, StarPU will require
+10 (_STARPU_CALIBRATION_MINIMUM) measurements, and that the minimum measured
+data size is smaller than 90% of the maximum measured data size (i.e. the
+measurement interval is large enough for a regression to have a meaning).
+Calibration can also be forced by setting the \ref STARPU_CALIBRATE environment
+variable to <c>1</c>, or even reset by setting it to <c>2</c>.
+
+How to use schedulers which can benefit from such performance model is explained
+in \ref TaskSchedulingPolicy.
+
+The same can be done for task power consumption estimation, by setting
+the field starpu_codelet::power_model the same way as the field
+starpu_codelet::model. Note: for now, the application has to give the
+power consumption performance model a name which is different from that of
+the execution time performance model.
+
+The application can request time estimations from the StarPU performance
+models by filling a task structure as usual without actually submitting
+it. The data handles can be created by calling any of the functions
+<c>starpu_*_data_register</c> with a <c>NULL</c> pointer and <c>-1</c>
+node and the desired data sizes, and need to be unregistered as usual.
+The functions starpu_task_expected_length() and
+starpu_task_expected_power() can then be called to get an estimation
+of the task cost on a given arch. starpu_task_footprint() can also be
+used to get the footprint used for indexing history-based performance
+models. starpu_task_destroy() needs to be called to destroy the dummy
+task afterwards. See <c>tests/perfmodels/regression_based.c</c> for an example.
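+
+A sketch of such a dry run (handle and worker names are hypothetical, and the
+arch descriptor is assumed to be obtained with starpu_worker_get_perf_archtype()):
+
+\code{.c}
+/* dummy handle: no pointer, node -1, but the real size */
+starpu_vector_data_register(&dummy_handle, -1, 0, n, sizeof(float));
+
+struct starpu_task *task = starpu_task_create();
+task->cl = &cl;
+task->handles[0] = dummy_handle;
+task->destroy = 0;
+
+struct starpu_perfmodel_arch *arch = starpu_worker_get_perf_archtype(workerid);
+double expected_us = starpu_task_expected_length(task, arch, 0);
+
+starpu_task_destroy(task);
+starpu_data_unregister(dummy_handle);
+\endcode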
+
+\section DataTrace Data trace and tasks length
+It is possible to get statistics about task lengths and data size by using:
+\verbatim
+$ starpu_fxt_data_trace filename [codelet1 codelet2 ... codeletn]
+\endverbatim
+where <c>filename</c> is the FxT trace file and <c>codeletX</c> the names of the codelets you
+want to profile (if no names are specified, <c>starpu_fxt_data_trace</c> will profile them all).
+This will create a file <c>data_trace.gp</c>, which
+can be executed to get a <c>.eps</c> image of these results. On the image, each point represents a
+task, and each color corresponds to a codelet.
+
+\image html data_trace.png
+\image latex data_trace.eps "" width=\textwidth
+
+// TODO: data transfer stats are similar to the ones displayed when
+// setting STARPU_BUS_STATS
+
+
+
+*/

+ 80 - 212
doc/doxygen/chapters/05performance_feedback.doxy

@@ -1,211 +1,47 @@
 /*
  * This file is part of the StarPU Handbook.
  * Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
  * Copyright (C) 2011, 2012 Institut National de Recherche en Informatique et Automatique
  * See the file version.doxy for copying conditions.
  */
 
-/*! \page PerformanceFeedback Performance Feedback
-
-\section UsingTheTemanejoTaskDebugger Using The Temanejo Task Debugger
-
-StarPU can connect to Temanejo >= 1.0rc2 (see
-http://www.hlrs.de/temanejo), to permit
-nice visual task debugging. To do so, build Temanejo's <c>libayudame.so</c>,
-install <c>Ayudame.h</c> to e.g. <c>/usr/local/include</c>, apply the
-<c>tools/patch-ayudame</c> to it to fix C build, re-<c>./configure</c>, make
-sure that it found it, rebuild StarPU.  Run the Temanejo GUI, give it the path
-to your application, any options you want to pass it, the path to <c>libayudame.so</c>.
-
-Make sure to specify at least the same number of CPUs in the dialog box as your
-machine has, otherwise an error will happen during execution. Future versions
-of Temanejo should be able to tell StarPU the number of CPUs to use.
-
-Tag numbers have to be below <c>4000000000000000000ULL</c> to be usable for
-Temanejo (so as to distinguish them from tasks).
-
-\section On-linePerformanceFeedback On-line Performance Feedback
-
-\subsection EnablingOn-linePerformanceMonitoring Enabling On-line Performance Monitoring
-
-In order to enable online performance monitoring, the application can
-call starpu_profiling_status_set() with the parameter
-::STARPU_PROFILING_ENABLE. It is possible to detect whether monitoring
-is already enabled or not by calling starpu_profiling_status_get().
-Enabling monitoring also reinitialize all previously collected
-feedback. The environment variable \ref STARPU_PROFILING can also be
-set to <c>1</c> to achieve the same effect. The function
-starpu_profiling_init() can also be called during the execution to
-reinitialize performance counters and to start the profiling if the
-environment variable \ref STARPU_PROFILING is set to <c>1</c>.
-
-Likewise, performance monitoring is stopped by calling
-starpu_profiling_status_set() with the parameter
-::STARPU_PROFILING_DISABLE. Note that this does not reset the
-performance counters so that the application may consult them later
-on.
-
-More details about the performance monitoring API are available in \ref API_Profiling.
-
-\subsection Per-taskFeedback Per-task Feedback
-
-If profiling is enabled, a pointer to a structure
-starpu_profiling_task_info is put in the field
-starpu_task::profiling_info when a task terminates. This structure is
-automatically destroyed when the task structure is destroyed, either
-automatically or by calling starpu_task_destroy().
-
-The structure starpu_profiling_task_info indicates the date when the
-task was submitted (starpu_profiling_task_info::submit_time), started
-(starpu_profiling_task_info::start_time), and terminated
-(starpu_profiling_task_info::end_time), relative to the initialization
-of StarPU with starpu_init(). It also specifies the identifier of the worker
-that has executed the task (starpu_profiling_task_info::workerid).
-These date are stored as <c>timespec</c> structures which the user may convert
-into micro-seconds using the helper function
-starpu_timing_timespec_to_us().
-
-It it worth noting that the application may directly access this structure from
-the callback executed at the end of the task. The structure starpu_task
-associated to the callback currently being executed is indeed accessible with
-the function starpu_task_get_current().
-
-\subsection Per-codeletFeedback Per-codelet Feedback
-
-The field starpu_codelet::per_worker_stats is
-an array of counters. The i-th entry of the array is incremented every time a
-task implementing the codelet is executed on the i-th worker.
-This array is not reinitialized when profiling is enabled or disabled.
-
-\subsection Per-workerFeedback Per-worker Feedback
-
-The second argument returned by the function
-starpu_profiling_worker_get_info() is a structure
-starpu_profiling_worker_info that gives statistics about the specified
-worker. This structure specifies when StarPU started collecting
-profiling information for that worker
-(starpu_profiling_worker_info::start_time), the
-duration of the profiling measurement interval
-(starpu_profiling_worker_info::total_time), the time spent executing
-kernels (starpu_profiling_worker_info::executing_time), the time
-spent sleeping because there is no task to execute at all
-(starpu_profiling_worker_info::sleeping_time), and the number of tasks that were executed
-while profiling was enabled. These values give an estimation of the
-proportion of time spent do real work, and the time spent either
-sleeping because there are not enough executable tasks or simply
-wasted in pure StarPU overhead.
-
-Calling starpu_profiling_worker_get_info() resets the profiling
-information associated to a worker.
-
-When an FxT trace is generated (see \ref GeneratingTracesWithFxT), it is also
-possible to use the tool <c>starpu_workers_activity</c> (see \ref
-MonitoringActivity) to generate a graphic showing the evolution of
-these values during the time, for the different workers.
-
-\subsection Bus-relatedFeedback Bus-related Feedback
-
-TODO: ajouter \ref STARPU_BUS_STATS
-
-// how to enable/disable performance monitoring
-// what kind of information do we get ?
-
-The bus speed measured by StarPU can be displayed by using the tool
-<c>starpu_machine_display</c>, for instance:
+/*! \page OfflinePerformanceTools Offline Performance Tools
 
-\verbatim
-StarPU has found:
-        3 CUDA devices
-                CUDA 0 (Tesla C2050 02:00.0)
-                CUDA 1 (Tesla C2050 03:00.0)
-                CUDA 2 (Tesla C2050 84:00.0)
-from    to RAM          to CUDA 0       to CUDA 1       to CUDA 2
-RAM     0.000000        5176.530428     5176.492994     5191.710722
-CUDA 0  4523.732446     0.000000        2414.074751     2417.379201
-CUDA 1  4523.718152     2414.078822     0.000000        2417.375119
-CUDA 2  4534.229519     2417.069025     2417.060863     0.000000
-\endverbatim
-
-\subsection StarPU-TopInterface StarPU-Top Interface
-
-StarPU-Top is an interface which remotely displays the on-line state of a StarPU
-application and permits the user to change parameters on the fly.
-
-Variables to be monitored can be registered by calling the functions
-starpu_top_add_data_boolean(), starpu_top_add_data_integer(),
-starpu_top_add_data_float(), e.g.:
-
-\code{.c}
-starpu_top_data *data = starpu_top_add_data_integer("mynum", 0, 100, 1);
-\endcode
-
-The application should then call starpu_top_init_and_wait() to give its name
-and wait for StarPU-Top to get a start request from the user. The name is used
-by StarPU-Top to quickly reload a previously-saved layout of parameter display.
-
-\code{.c}
-starpu_top_init_and_wait("the application");
-\endcode
-
-The new values can then be provided thanks to
-starpu_top_update_data_boolean(), starpu_top_update_data_integer(),
-starpu_top_update_data_float(), e.g.:
-
-\code{.c}
-starpu_top_update_data_integer(data, mynum);
-\endcode
-
-Updateable parameters can be registered thanks to starpu_top_register_parameter_boolean(), starpu_top_register_parameter_integer(), starpu_top_register_parameter_float(), e.g.:
-
-\code{.c}
-float alpha;
-starpu_top_register_parameter_float("alpha", &alpha, 0, 10, modif_hook);
-\endcode
-
-<c>modif_hook</c> is a function which will be called when the parameter is being modified, it can for instance print the new value:
-
-\code{.c}
-void modif_hook(struct starpu_top_param *d) {
-    fprintf(stderr,"%s has been modified: %f\n", d->name, alpha);
-}
-\endcode
-
-Task schedulers should notify StarPU-Top when it has decided when a task will be
-scheduled, so that it can show it in its Gantt chart, for instance:
-
-\code{.c}
-starpu_top_task_prevision(task, workerid, begin, end);
-\endcode
-
-Starting StarPU-Top (StarPU-Top is started via the binary
-<c>starpu_top</c>.) and the application can be done two ways:
+To get an idea of what is happening, a lot of performance feedback is available,
+detailed in this chapter. The various pieces of information below should be checked.
 
 <ul>
-<li> The application is started by hand on some machine (and thus already
-waiting for the start event). In the Preference dialog of StarPU-Top, the SSH
-checkbox should be unchecked, and the hostname and port (default is 2011) on
-which the application is already running should be specified. Clicking on the
-connection button will thus connect to the already-running application.
-</li>
-<li> StarPU-Top is started first, and clicking on the connection button will
-start the application itself (possibly on a remote machine). The SSH checkbox
-should be checked, and a command line provided, e.g.:
-
-\verbatim
-$ ssh myserver STARPU_SCHED=dmda ./application
-\endverbatim
-
-If port 2011 of the remote machine can not be accessed directly, an ssh port bridge should be added:
-
-\verbatim
-$ ssh -L 2011:localhost:2011 myserver STARPU_SCHED=dmda ./application
-\endverbatim
-
-and "localhost" should be used as IP Address to connect to.
+<li>
+What does the Gantt diagram look like? (see \ref CreatingAGanttDiagram)
+<ul>
+  <li> If it's mostly green (tasks running in the initial context), or if a
+  context-specific color prevails, then the machine is properly
+  utilized, and perhaps the codelets are just slow. Check their performance (see
+  \ref PerformanceOfCodelets).
+  </li>
+  <li> If it's mostly purple (FetchingInput), tasks keep waiting for data
+  transfers; do you perhaps have far more communication than computation? Did
+  you properly use CUDA streams to make sure communication can be
+  overlapped? Did you use data-locality aware schedulers to avoid transfers as
+  much as possible?
+  </li>
+  <li> If it's mostly red (Blocked), tasks keep waiting for dependencies;
+  do you have enough parallelism? It might be a good idea to check what the DAG
+  looks like (see \ref CreatingADAGWithGraphviz).
+  </li>
+  <li> If only some workers are completely red (Blocked), for some reason the
+  scheduler didn't assign tasks to them. Perhaps the performance model is bogus;
+  check it (see \ref PerformanceOfCodelets). Do all your codelets have a
+  performance model?  When some of them don't, the scheduler switches to a
+  greedy algorithm, which thus performs badly.
+  </li>
+</ul>
 </li>
 </ul>
 
+You can also use the Temanejo task debugger (see \ref UsingTheTemanejoTaskDebugger) to
+visualize the task graph more easily.
+
 \section Off-linePerformanceFeedback Off-line Performance Feedback
 
 \subsection GeneratingTracesWithFxT Generating Traces With FxT
@@ -492,6 +328,55 @@ execution time.
 \ref TheoreticalLowerBoundOnExecutionTimeExample provides an example on how to
 use this.
 
+\section TheoreticalLowerBoundOnExecutionTimeExample Theoretical Lower Bound On Execution Time Example
+
+For kernels with history-based performance models (and provided that
+they are completely calibrated), StarPU can very easily provide a
+theoretical lower bound for the execution time of a whole set of
+tasks. See for instance <c>examples/lu/lu_example.c</c>: before
+submitting tasks, call the function starpu_bound_start(), and after
+complete execution, call starpu_bound_stop().
+starpu_bound_print_lp() or starpu_bound_print_mps() can then be used
+to output a Linear Programming problem corresponding to the schedule
+of your tasks. Run it through <c>lp_solve</c> or any other linear
+programming solver, and that will give you a lower bound for the total
+execution time of your tasks. If StarPU was compiled with the library
+<c>glpk</c> installed, starpu_bound_compute() can be used to solve it
+immediately and get the optimized minimum, in ms. Its parameter
+<c>integer</c> allows one to decide whether integer resolution should be
+computed and returned.
+
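+A minimal sketch of the whole sequence is shown below (a sketch only:
+<c>ntasks</c> and the helper <c>submit_task()</c> are hypothetical
+placeholders for your own submission loop):
+
+\code{.c}
+int i;
+starpu_bound_start(deps, prio);
+for (i = 0; i < ntasks; i++)
+    submit_task(i); /* hypothetical helper: submit one task as usual */
+starpu_task_wait_for_all();
+starpu_bound_stop();
+
+/* Either dump a Linear Programming problem for lp_solve/glpsol... */
+FILE *f = fopen("test.lp", "w");
+starpu_bound_print_lp(f);
+fclose(f);
+
+/* ...or solve it directly if StarPU was built with glpk. */
+double min_ms;
+starpu_bound_compute(&min_ms, NULL, 0);
+fprintf(stderr, "theoretical lower bound: %f ms\n", min_ms);
+\endcode
+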
+The <c>deps</c> parameter tells StarPU whether to take tasks, implicit
+data, and tag dependencies into account. Tags released in a callback
+or similar are not taken into account, only tags associated with a task are.
+It must be understood that the size of the linear programming
+problem is quadratic in the number of tasks, and thus the time to solve it
+can be very long; it can take minutes for just a few dozen tasks. You should
+probably use <c>lp_solve -timeout 1 test.lp -wmps test.mps</c> to convert the
+problem to MPS format and then use a better solver, <c>glpsol</c> might be
+better than <c>lp_solve</c> for instance (the <c>--pcost</c> option may be
+useful), but sometimes doesn't manage to converge. <c>cbc</c> might look
+slower, but it is parallel. For <c>lp_solve</c>, be sure to try at least all the
+<c>-B</c> options. For instance, we often just use <c>lp_solve -cc -B1 -Bb
+-Bg -Bp -Bf -Br -BG -Bd -Bs -BB -Bo -Bc -Bi</c> , and the <c>-gr</c> option can
+also be quite useful. The resulting schedule can be observed by using
+the tool <c>starpu_lp2paje</c>, which converts it into the Paje
+format.
+
+Data transfer time can only be taken into account when <c>deps</c> is set. Only
+data transfers inferred from implicit data dependencies between tasks are taken
+into account. Other data transfers are assumed to be completely overlapped.
+
+Setting <c>deps</c> to 0 will only take into account the actual computations
+on processing units. It however still properly takes into account the varying
+performances of kernels and processing units, which is much more accurate than
+just comparing StarPU performances with the fastest of the kernels being used.
+
+The <c>prio</c> parameter tells StarPU whether to simulate taking the
+priorities into account as the StarPU scheduler would, i.e. schedule prioritized
+tasks before less prioritized tasks, to check to what extent this results
+in a less optimal solution. This increases the computation time even more.
+
 \section MemoryFeedback Memory Feedback
 
 It is possible to enable memory statistics. To do so, you need to pass
@@ -592,21 +477,4 @@ Computation took (in ms)
 Synthetic GFlops : 44.21
 \endverbatim
 
-// TODO: data transfer stats are similar to the ones displayed when
-// setting STARPU_BUS_STATS
-
-\section DataTrace Data trace and tasks length
-It is possible to get statistics about tasks length and data size by using :
-\verbatim
-$starpu_fxt_data_trace filename [codelet1 codelet2 ... codeletn]
-\endverbatim
-Where filename is the FxT trace file and codeletX the names of the codelets you 
-want to profile (if no names are specified, starpu_fxt_data_trace will use them all). 
-This will create a file, <c>data_trace.gp</c> which
-can be plotted to get a .eps image of these results. On the image, each point represents a 
-task, and each color corresponds to a codelet.
-
-\image html data_trace.png
-\image latex data_trace.eps "" width=\textwidth
-
 */

+ 100 - 21
doc/doxygen/chapters/06tips_and_tricks.doxy

@@ -1,12 +1,12 @@
 /*
  * This file is part of the StarPU Handbook.
  * Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
  * Copyright (C) 2011, 2012 Institut National de Recherche en Informatique et Automatique
  * See the file version.doxy for copying conditions.
  */
 
-/*! \page TipsAndTricksToKnowAbout Tips and Tricks To Know About
+/*! \page FrequentlyAskedQuestions Frequently Asked Questions
 
 \section HowToInitializeAComputationLibraryOnceForEachWorker How To Initialize A Computation Library Once For Each Worker?
 
@@ -69,33 +69,95 @@ void starpufft_plan(void)
 }
 \endcode
 
-\section HowToLimitMemoryPerNode How to limit memory per node
+\section UsingTheDriverAPI Using The Driver API
 
-TODO
+\ref API_Running_Drivers
 
-Talk about
-\ref STARPU_LIMIT_CUDA_devid_MEM, \ref STARPU_LIMIT_CUDA_MEM,
-\ref STARPU_LIMIT_OPENCL_devid_MEM, \ref STARPU_LIMIT_OPENCL_MEM
-and \ref STARPU_LIMIT_CPU_MEM
+\code{.c}
+int ret;
+struct starpu_driver d = {
+    .type = STARPU_CUDA_WORKER,
+    .id.cuda_id = 0
+};
+ret = starpu_driver_init(&d);
+if (ret != 0)
+    error();
+while (some_condition) {
+    ret = starpu_driver_run_once(&d);
+    if (ret != 0)
+        error();
+}
+ret = starpu_driver_deinit(&d);
+if (ret != 0)
+    error();
+\endcode
 
-starpu_memory_get_available()
+To add a new kind of device to the structure starpu_driver, one needs to:
+<ol>
+<li> Add a member to the union starpu_driver::id
+</li>
+<li> Modify the internal function <c>_starpu_launch_drivers()</c> to
+make sure the driver is not always launched.
+</li>
+<li> Modify the function starpu_driver_run() so that it can handle
+another kind of architecture.
+</li>
+<li> Write the new function <c>_starpu_run_foobar()</c> in the
+corresponding driver.
+</li>
+</ol>
+
+\section On-GPURendering On-GPU Rendering
+
+Graphics-oriented applications need to draw the result of their computations,
+typically on the very GPU where these were produced. Technologies such as OpenGL/CUDA
+interoperability allow CUDA to work directly on the OpenGL buffers, making
+them thus immediately ready for drawing, by mapping OpenGL buffers, textures or
+renderbuffer objects into CUDA.  CUDA however imposes some technical
+constraints: peer memcpy has to be disabled, and the thread that runs OpenGL has
+to be the one that runs CUDA computations for that GPU.
+
+To achieve this with StarPU, pass the option
+\ref disable-cuda-memcpy-peer "--disable-cuda-memcpy-peer"
+to <c>./configure</c> (TODO: make it dynamic). OpenGL/GLUT has to be initialized
+first, and the interoperability mode has to
+be enabled by using the field
+starpu_conf::cuda_opengl_interoperability. The driver loop has to
+be run by the application, by using the field
+starpu_conf::not_launched_drivers to prevent StarPU from running it in
+a separate thread, and by using starpu_driver_run() to run the loop.
+The examples <c>gl_interop</c> and <c>gl_interop_idle</c> show how this
+articulates in a simple case, where rendering is done in task
+callbacks. The former uses <c>glutMainLoopEvent</c> to make GLUT
+progress from the StarPU driver loop, while the latter uses
+<c>glutIdleFunc</c> to make StarPU progress from the GLUT main loop.
+
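+A possible initialization sketch is shown below (the starpu_conf fields are
+those mentioned above; using CUDA device 0 for everything is an assumption
+made for illustration, and error checking is omitted for brevity):
+
+\code{.c}
+unsigned interop_devices[] = { 0 }; /* CUDA devices with OpenGL interoperability */
+struct starpu_driver drivers[] = {
+    { .type = STARPU_CUDA_WORKER, .id.cuda_id = 0 }
+};
+struct starpu_conf conf;
+
+/* OpenGL/GLUT must already be initialized at this point. */
+starpu_conf_init(&conf);
+conf.cuda_opengl_interoperability = interop_devices;
+conf.n_cuda_opengl_interoperability = 1;
+conf.not_launched_drivers = drivers;
+conf.n_not_launched_drivers = 1;
+starpu_init(&conf);
+
+/* Submit tasks, then run the CUDA driver loop from this very thread. */
+starpu_driver_run(&drivers[0]);
+\endcode
+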
+Then, to use an OpenGL buffer as CUDA data, StarPU simply needs to be given
+the CUDA pointer at registration, for instance:
 
-\section ThreadBindingOnNetBSD Thread Binding on NetBSD
+\code{.c}
+/* Get the CUDA worker id */
+for (workerid = 0; workerid < starpu_worker_get_count(); workerid++)
+        if (starpu_worker_get_type(workerid) == STARPU_CUDA_WORKER)
+                break;
 
-When using StarPU on a NetBSD machine, if the topology
-discovery library <c>hwloc</c> is used, thread binding will fail. To
-prevent the problem, you should at least use the version 1.7 of
-<c>hwloc</c>, and also issue the following call:
+/* Build a CUDA pointer pointing at the OpenGL buffer */
+cudaGraphicsResourceGetMappedPointer((void**)&output, &num_bytes, resource);
 
-\verbatim
-$ sysctl -w security.models.extensions.user_set_cpu_affinity=1
-\endverbatim
+/* And register it to StarPU */
+starpu_vector_data_register(&handle, starpu_worker_get_memory_node(workerid),
+                            output, num_bytes / sizeof(float4), sizeof(float4));
 
-Or add the following line in the file <c>/etc/sysctl.conf</c>
+/* The handle can now be used as usual */
+starpu_task_insert(&cl, STARPU_RW, handle, 0);
 
-\verbatim
-security.models.extensions.user_set_cpu_affinity=1
-\endverbatim
+/* ... */
+
+/* This gets back data into the OpenGL buffer */
+starpu_data_unregister(handle);
+\endcode
+
+The buffer can then be displayed, e.g. in the callback function.
 
 \section UsingStarPUWithMKL Using StarPU With MKL 11 (Intel Composer XE 2013)
 
@@ -111,4 +173,21 @@ Using this configuration, StarPU uses only 1 core, no matter the value of
 The solution is to set the environment variable KMP_AFFINITY to <c>disabled</c>
 (http://software.intel.com/sites/products/documentation/studio/composer/en-us/2011Update/compiler_c/optaps/common/optaps_openmp_thread_affinity.htm).
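+
+For instance, in a POSIX shell (<c>./application</c> standing for your own
+program):
+
+\verbatim
+$ export KMP_AFFINITY=disabled
+$ ./application
+\endverbatim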
 
+\section ThreadBindingOnNetBSD Thread Binding on NetBSD
+
+When using StarPU on a NetBSD machine, if the topology
+discovery library <c>hwloc</c> is used, thread binding will fail. To
+prevent the problem, you should use at least version 1.7 of
+<c>hwloc</c>, and also issue the following call:
+
+\verbatim
+$ sysctl -w security.models.extensions.user_set_cpu_affinity=1
+\endverbatim
+
+Or add the following line in the file <c>/etc/sysctl.conf</c>
+
+\verbatim
+security.models.extensions.user_set_cpu_affinity=1
+\endverbatim
+
 */

doc/doxygen/chapters/07out_of_core.doxy → doc/doxygen/chapters/15out_of_core.doxy


doc/doxygen/chapters/08mpi_support.doxy → doc/doxygen/chapters/16mpi_support.doxy


doc/doxygen/chapters/09fft_support.doxy → doc/doxygen/chapters/17fft_support.doxy


doc/doxygen/chapters/10mic_scc_support.doxy → doc/doxygen/chapters/18mic_scc_support.doxy


doc/doxygen/chapters/11c_extensions.doxy → doc/doxygen/chapters/19c_extensions.doxy


doc/doxygen/chapters/12socl_opencl_extensions.doxy → doc/doxygen/chapters/20socl_opencl_extensions.doxy


+ 104 - 0
doc/doxygen/chapters/21simgrid.doxy

@@ -0,0 +1,104 @@
+/*
+ * This file is part of the StarPU Handbook.
+ * Copyright (C) 2009--2011  Universit@'e de Bordeaux 1
+ * Copyright (C) 2010, 2011, 2012, 2013, 2014  Centre National de la Recherche Scientifique
+ * Copyright (C) 2011, 2012 Institut National de Recherche en Informatique et Automatique
+ * See the file version.doxy for copying conditions.
+ */
+
+/*! \page SimGridSupport SimGrid Support
+
+StarPU can use SimGrid to simulate execution on an arbitrary
+platform.
+
+\section Calibration Calibration
+
+The idea is to first compile StarPU normally, and run the application,
+so as to automatically benchmark the bus and the codelets.
+
+\verbatim
+$ ./configure && make
+$ STARPU_SCHED=dmda ./examples/matvecmult/matvecmult
+[starpu][_starpu_load_history_based_model] Warning: model matvecmult
+   is not calibrated, forcing calibration for this run. Use the
+   STARPU_CALIBRATE environment variable to control this.
+$ ...
+$ STARPU_SCHED=dmda ./examples/matvecmult/matvecmult
+TEST PASSED
+\endverbatim
+
+Note that we force the use of the scheduler <c>dmda</c> to generate
+performance models for the application. The application may need to be
+run several times before the model is calibrated.
+
+\section Simulation Simulation
+
+Then, recompile StarPU, passing \ref enable-simgrid "--enable-simgrid"
+to <c>./configure</c>, and re-run the application:
+
+\verbatim
+$ ./configure --enable-simgrid && make
+$ STARPU_SCHED=dmda ./examples/matvecmult/matvecmult
+TEST FAILED !!!
+\endverbatim
+
+It is normal that the test fails: since the computations are not actually done
+(that is the whole point of SimGrid), the result is of course wrong.
+
+If the performance model is not calibrated enough, the following error
+message will be displayed:
+
+\verbatim
+$ STARPU_SCHED=dmda ./examples/matvecmult/matvecmult
+[starpu][_starpu_load_history_based_model] Warning: model matvecmult
+    is not calibrated, forcing calibration for this run. Use the
+    STARPU_CALIBRATE environment variable to control this.
+[starpu][_starpu_simgrid_execute_job][assert failure] Codelet
+    matvecmult does not have a perfmodel, or is not calibrated enough
+\endverbatim
+
+The number of devices can be chosen as usual with \ref STARPU_NCPU,
+\ref STARPU_NCUDA, and \ref STARPU_NOPENCL.  For now, only the number of
+CPUs can be chosen arbitrarily. The number of CUDA and OpenCL devices has to be
+lower than the real number on the current machine.
+
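+For instance, to simulate a machine with more CPU cores than the current one
+(the values below are purely illustrative):
+
+\verbatim
+$ STARPU_NCPU=32 STARPU_SCHED=dmda ./examples/matvecmult/matvecmult
+TEST FAILED !!!
+\endverbatim
+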
+The amount of simulated GPU memory is for now unbounded by default, but
+it can be set by hand through the \ref STARPU_LIMIT_CUDA_MEM,
+\ref STARPU_LIMIT_CUDA_devid_MEM, \ref STARPU_LIMIT_OPENCL_MEM, and
+\ref STARPU_LIMIT_OPENCL_devid_MEM environment variables.
+
+The SimGrid default stack size is small; to increase it, use the
+parameter <c>--cfg=contexts/stack_size</c>, for example:
+
+\verbatim
+$ ./example --cfg=contexts/stack_size:8192
+TEST FAILED !!!
+\endverbatim
+
+Note: of course, if the application uses <c>gettimeofday</c> to make its
+performance measurements, the real time will be used, which will be bogus. To
+get the simulated time, it has to use starpu_timing_now() which returns the
+virtual timestamp in us.
+
+\section SimulationOnAnotherMachine Simulation On Another Machine
+
+The SimGrid support even makes it possible to perform simulations on another
+machine, typically your desktop. To achieve this, one still needs to perform the Calibration
+step on the actual machine to be simulated, then copy the resulting performance
+models to your desktop
+machine (into the <c>$STARPU_HOME/.starpu</c> directory). One can then perform the
+Simulation step on the desktop machine, by setting the environment
+variable \ref STARPU_HOSTNAME to the name of the actual machine, to
+make StarPU use the performance models of the simulated machine even
+on the desktop machine.
+
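+For instance, assuming the calibrated machine is named <c>themachine</c>
+(a hypothetical hostname) and its performance models have been copied to the
+desktop's <c>$STARPU_HOME/.starpu</c> directory:
+
+\verbatim
+$ STARPU_HOSTNAME=themachine ./examples/matvecmult/matvecmult
+\endverbatim
+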
+If the desktop machine does not have CUDA or OpenCL, StarPU is still able to
+use SimGrid to simulate execution with CUDA/OpenCL devices, but the application
+source code will probably disable the CUDA and OpenCL codelets in that
+case. Since during SimGrid execution the functions of the codelet are actually
+not called, one can use dummy functions such as the following to still permit
+CUDA or OpenCL execution:
+
+\snippet simgrid.c To be included. You should update doxygen if you see this text.
+
+
+*/

doc/doxygen/chapters/15environment_variables.doxy → doc/doxygen/chapters/40environment_variables.doxy


doc/doxygen/chapters/16configure_options.doxy → doc/doxygen/chapters/41configure_options.doxy


doc/doxygen/chapters/17files.doxy → doc/doxygen/chapters/45files.doxy


doc/doxygen/chapters/18scaling-vector-example.doxy → doc/doxygen/chapters/50scaling-vector-example.doxy


doc/doxygen/chapters/19fdl-1.3.doxy → doc/doxygen/chapters/51fdl-1.3.doxy


+ 64 - 30
doc/doxygen/refman.tex

@@ -68,7 +68,7 @@ was last updated on \STARPUUPDATED.\\
 
 Copyright © 2009–2013 Université de Bordeaux 1\\
 
-Copyright © 2010-2013 Centre National de la Recherche Scientifique\\
+Copyright © 2010-2014 Centre National de la Recherche Scientifique\\
 
 Copyright © 2011, 2012 Institut National de Recherche en Informatique et Automatique\\
 
@@ -94,7 +94,7 @@ Documentation License”.
 \hypertarget{index}{}
 \input{index}
 
-\part{Using StarPU}
+\part{StarPU Basics}
 
 \chapter{Building and Installing StarPU}
 \label{BuildingAndInstallingStarPU}
@@ -106,33 +106,72 @@ Documentation License”.
 \hypertarget{BasicExamples}{}
 \input{BasicExamples}
 
+\part{StarPU Quick Programming Guide}
+
 \chapter{Advanced Examples}
 \label{AdvancedExamples}
 \hypertarget{AdvancedExamples}{}
 \input{AdvancedExamples}
 
-\chapter{How To Optimize Performance With StarPU}
-\label{HowToOptimizePerformanceWithStarPU}
-\hypertarget{HowToOptimizePerformanceWithStarPU}{}
-\input{HowToOptimizePerformanceWithStarPU}
+\chapter{Check List When Performance Are Not There}
+\label{CheckListWhenPerformanceAreNotThere}
+\hypertarget{CheckListWhenPerformanceAreNotThere}{}
+\input{CheckListWhenPerformanceAreNotThere}
+
+\part{StarPU Inside}
+
+\chapter{Tasks In StarPU}
+\label{TasksInStarPU}
+\hypertarget{TasksInStarPU}{}
+\input{TasksInStarPU}
+
+\chapter{Data Management}
+\label{DataManagement}
+\hypertarget{DataManagement}{}
+\input{DataManagement}
+
+\chapter{Scheduling}
+\label{Scheduling}
+\hypertarget{Scheduling}{}
+\input{Scheduling}
+
+\chapter{Scheduling Contexts}
+\label{SchedulingContexts}
+\hypertarget{SchedulingContexts}{}
+\input{SchedulingContexts}
+
+\chapter{Scheduling Context Hypervisor}
+\label{SchedulingContextHypervisor}
+\hypertarget{SchedulingContextHypervisor}{}
+\input{SchedulingContextHypervisor}
+
+\chapter{Debugging Tools}
+\label{DebuggingTools}
+\hypertarget{DebuggingTools}{}
+\input{DebuggingTools}
+
+\chapter{Online Performance Tools}
+\label{OnlinePerformanceTools}
+\hypertarget{OnlinePerformanceTools}{}
+\input{OnlinePerformanceTools}
+
+\chapter{Offline Performance Tools}
+\label{OfflinePerformanceTools}
+\hypertarget{OfflinePerformanceTools}{}
+\input{OfflinePerformanceTools}
 
-\chapter{Performance Feedback}
-\label{PerformanceFeedback}
-\hypertarget{PerformanceFeedback}{}
-\input{PerformanceFeedback}
+\chapter{Frequently Asked Questions}
+\label{FrequentlyAskedQuestions}
+\hypertarget{FrequentlyAskedQuestions}{}
+\input{FrequentlyAskedQuestions}
 
-\chapter{Tips and Tricks To Know About}
-\label{TipsAndTricksToKnowAbout}
-\hypertarget{TipsAndTricksToKnowAbout}{}
-\input{TipsAndTricksToKnowAbout}
+\part{StarPU Extensions}
 
 \chapter{Out Of Core}
 \label{OutOfCore}
 \hypertarget{OutOfCore}{}
 \input{OutOfCore}
 
-
-
 \chapter{MPI Support}
 \label{MPISupport}
 \hypertarget{MPISupport}{}
@@ -158,17 +197,12 @@ Documentation License”.
 \hypertarget{SOCLOpenclExtensions}{}
 \input{SOCLOpenclExtensions}
 
-\chapter{Scheduling Contexts}
-\label{SchedulingContexts}
-\hypertarget{SchedulingContexts}{}
-\input{SchedulingContexts}
-
-\chapter{Scheduling Context Hypervisor}
-\label{SchedulingContextHypervisor}
-\hypertarget{SchedulingContextHypervisor}{}
-\input{SchedulingContextHypervisor}
+\chapter{SimGrid Support}
+\label{SimGridSupport}
+\hypertarget{SimGridSupport}{}
+\input{SimGridSupport}
 
-\part{Inside StarPU}
+\part{StarPU Reference API}
 
 \chapter{Execution Configuration Through Environment Variables}
 \label{ExecutionConfigurationThroughEnvironmentVariables}
@@ -277,10 +311,6 @@ Documentation License”.
 \hypertarget{deprecated}{}
 \input{deprecated}
 
-
-\addcontentsline{toc}{chapter}{Index}
-\printindex
-
 \part{Appendix}
 
 \chapter{Full Source Code for the ’Scaling a Vector’ Example}
@@ -293,4 +323,8 @@ Documentation License”.
 \hypertarget{GNUFreeDocumentationLicense}{}
 \input{GNUFreeDocumentationLicense}
 
+\part{Index}
+\addcontentsline{toc}{chapter}{Index}
+\printindex
+
 \end{document}